diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..b79aa2ff06 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,21 @@ +# Setting up CODEOWNERS for UST related codebase +# Documentation for open sourced models relevant to UST +examples/speech_to_text @kahne @sravyapopuri388 @jmp84 +examples/speech_to_speech @an918tw @sravyapopuri388 @jmp84 +examples/speech_synthesis @kahne @jmp84 +examples/simultaneous_translation @kahne @jmp84 +examples/speech_text_joint_to_text @yuntang @jmp84 + +# Speech related models relevant to UST +fairseq/models/speech_to_speech @sravyapopuri388 @jmp84 +fairseq/models/speech_to_text @kahne @sravyapopuri388 @jmp84 +fairseq/models/text_to_speech @kahne @jmp84 + +# CONFORMER IMPLEMENTATION +fairseq/modules/conformer_layer.py @sravyapopuri388 @jmp84 +fairseq/modules/espnet_multihead_attention.py @sravyapopuri388 @jmp84 +fairseq/modules/rotary_positional_embedding.py @sravyapopuri388 @jmp84 +fairseq/modules/positional_encoding.py @sravyapopuri388 @jmp84 + +# Machine Translation/NLLB +fairseq/tasks/translation.py @gwenzek diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index a7f4f0a902..aa15123d8e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -19,7 +19,7 @@ Steps to reproduce the behavior (**always include the command you ran**): #### Code sample - ### Expected behavior @@ -28,7 +28,7 @@ Minimal means having the shortest code but still preserving the bug. --> ### Environment - - fairseq Version (e.g., 1.0 or master): + - fairseq Version (e.g., 1.0 or main): - PyTorch Version (e.g., 1.0) - OS (e.g., Linux): - How you installed fairseq (`pip`, source): diff --git a/.github/ISSUE_TEMPLATE/how-to-question.md b/.github/ISSUE_TEMPLATE/how-to-question.md index 4beb180dbf..04f3f15d3e 100644 --- a/.github/ISSUE_TEMPLATE/how-to-question.md +++ b/.github/ISSUE_TEMPLATE/how-to-question.md @@ -6,9 +6,9 @@ labels: 'question, needs triage' ## ❓ Questions and Help -### Before asking: -1. search the issues. -2. search the docs. +### Before asking: +1. search the issues. +2. search the docs. @@ -16,13 +16,13 @@ labels: 'question, needs triage' #### Code - + #### What have you tried? #### What's your environment? - - fairseq Version (e.g., 1.0 or master): + - fairseq Version (e.g., 1.0 or main): - PyTorch Version (e.g., 1.0) - OS (e.g., Linux): - How you installed fairseq (`pip`, source): diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index b28ff98e7b..d005e2df4f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,15 +1,15 @@ # Before submitting - [ ] Was this discussed/approved via a Github issue? (no need for typos, doc improvements) -- [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/master/CONTRIBUTING.md)? -- [ ] Did you make sure to update the docs? -- [ ] Did you write any new necessary tests? +- [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/main/CONTRIBUTING.md)? +- [ ] Did you make sure to update the docs? +- [ ] Did you write any new necessary tests? ## What does this PR do? Fixes # (issue). -## PR review -Anyone in the community is free to review the PR once the tests have passed. +## PR review +Anyone in the community is free to review the PR once the tests have passed. If we didn't discuss your PR in Github issues there's a high chance it will not be merged. ## Did you have fun? 
diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 0000000000..b12867dab0 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,30 @@ +# Configuration for probot-stale - https://github.com/probot/stale +# Mostly copied from github.com/facebook/react/blob/master/.github/stale.yml +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 90 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - bug +# Label to use when marking an issue as stale +staleLabel: stale +issues: + # Comment to post when marking an issue as stale. + markComment: > + This issue has been automatically marked as stale. + **If this issue is still affecting you, please leave any comment** (for example, "bump"), and we'll keep it open. + We are sorry that we haven't been able to prioritize it yet. If you have any new additional information, please include it with your comment! + # Comment to post when closing a stale issue. + closeComment: > + Closing this issue after a prolonged period of inactivity. If this issue is still present in the latest release, please create a new issue with up-to-date information. Thank you! +pulls: + # Comment to post when marking a pull request as stale. + markComment: > + This pull request has been automatically marked as stale. + **If this pull request is still relevant, please leave any comment** (for example, "bump"), and we'll keep it open. + We are sorry that we haven't been able to prioritize reviewing it yet. Your contribution is very much appreciated. + # Comment to post when closing a stale pull request. + closeComment: > + Closing this pull request after a prolonged period of inactivity. If this issue is still present in the latest release, please ask for this pull request to be reopened. Thank you! + diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6ae8093a8a..036233d8cf 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,10 +1,10 @@ name: build on: - # Trigger the workflow on push to master or any pull request + # Trigger the workflow on push to main or any pull request push: branches: - - master + - main pull_request: jobs: @@ -14,31 +14,68 @@ jobs: max-parallel: 4 matrix: platform: [ubuntu-latest, macos-latest] - python-version: [3.6, 3.7] + python-version: [3.8, 3.9] runs-on: ${{ matrix.platform }} steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Conditionally install pytorch if: matrix.platform == 'windows-latest' run: pip3 install torch -f https://download.pytorch.org/whl/torch_stable.html + - name: Install locally run: | python -m pip install --upgrade pip - python setup.py build_ext --inplace - python -m pip install --editable . + git submodule update --init --recursive + python -m pip install . 
+ + - name: Check installation + working-directory: /tmp + run: python $GITHUB_WORKSPACE/scripts/check_installation.py + + - name: Install optional test requirements + run: | + python -m pip install '.[dev,docs]' + python -m pip install iopath transformers pyarrow + python -m pip install git+https://github.com/facebookresearch/fairscale.git@main + python -m pip install pygit2 pgzip + + - name: Install xformers for Macos + if: matrix.platform == 'macos-latest' + run: | + brew install llvm libomp + CC=/usr/local/opt/llvm/bin/clang CXX=clang++ pip install git+https://github.com/facebookresearch/xformers.git@main + + - name: Install xformers for non-MacOS + if: matrix.platform != 'macos-latest' + run: | + python -m pip install --progress-bar off git+https://github.com/facebookresearch/xformers.git@main + + - name: Lint with black + run: black --check --diff . + - name: Lint with flake8 run: | - pip install flake8 # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Build doc + run: make singlehtml + working-directory: docs/ + - name: Run tests - run: | - python setup.py test + # When installing in non-editable mode, the .so files will be generated in 'site-packages/fairseq'. + # But by default, pytest import machinery will load local fairseq, and won't see the .so. + # Use --import-mode=append to favorize the 'site-packages/fairseq'. + # https://docs.pytest.org/en/7.1.x/explanation/pythonpath.html + run: pytest --import-mode=append -vvv tests/ + diff --git a/.github/workflows/depreview.yml b/.github/workflows/depreview.yml new file mode 100644 index 0000000000..032eddef5f --- /dev/null +++ b/.github/workflows/depreview.yml @@ -0,0 +1,14 @@ +name: 'Dependency Review' +on: [pull_request] + +permissions: + contents: read + +jobs: + dependency-review: + runs-on: ubuntu-latest + steps: + - name: 'Checkout Repository' + uses: actions/checkout@v4 + - name: Dependency Review + uses: actions/dependency-review-action@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..241b74b32d --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,161 @@ +name: Fairseq Release + +on: + workflow_dispatch: + inputs: + name: + description: 'Release Type' + default: 'patch' + required: true + +jobs: + + get_next_version: + runs-on: ubuntu-latest + steps: + - name: checkout-repo-content + uses: actions/checkout@v2 + + - name: setup-python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: get next version and tag + id: get-next-version-and-tag + run: | + output=$(python3 release_utils.py --release-type ${{ github.event.inputs.name }}) + echo $output + new_version=$(echo $output | awk '{print $1}') + new_tag=$(echo $output | awk '{print $2}') + echo "new version is $new_version" + echo "new tag is $new_tag" + echo ::set-output name=version::$new_version + echo ::set-output name=tag::$new_tag + echo ::set-output name=branch_name::$new_version-release + echo "NEW_TAG=$new_tag" >> $GITHUB_ENV + echo "NEW_BRANCH=$new_version-release" >> $GITHUB_ENV + + + # update the version number in version.txt + - name: update version + id: update-version + run : | + echo "current folder = $PWD" + echo "current branch = $(git branch --show-current)" + output=$(python3 release_utils.py 
--release-type ${{ github.event.inputs.name }} --update-version) + + - name: add and commit + uses: EndBug/add-and-commit@v9 + with: + author_name: ${{ secrets.AUTHOR_NAME }} + author_email: ${{ secrets.AUTHOR_EMAIL }} + + # TODO: change this to main once shipit is disabled. + new_branch: '${{ env.NEW_BRANCH }}' + default_author: github_actor + message: '${{ env.NEW_TAG }} release' + pathspec_error_handling: exitAtEnd + + # Arguments for the git pull command. Use NO-PULL to avoid the action pulling at all. + # pull: 'NO-PULL' + tag: '${{ env.NEW_TAG }}' + + outputs: + new_version: ${{ steps.get-next-version-and-tag.outputs.version }} + new_tag: ${{ steps.get-next-version-and-tag.outputs.tag }} + branch_name: ${{ steps.get-next-version-and-tag.outputs.branch_name }} + + create_sdist: + runs-on: ubuntu-latest + name: Create Source Distribution + needs: get_next_version + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ needs.get_next_version.outputs.branch_name }} + + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Upgrade pip + run: | + python3 -m pip install --upgrade pip + + - name: Create Source Distribution + run: | + python3 -m pip install setuptools wheel twine torch + python3 setup.py sdist + + - uses: actions/upload-artifact@v2 + with: + path: dist/*.tar.gz + + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + needs: get_next_version + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ needs.get_next_version.outputs.branch_name }} + + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Upgrade pip + run: | + python3 -m pip install --upgrade pip + + - name: Install cibuildwheel + run: | + python3 -m pip install cibuildwheel + + - name: Build wheels for CPython + run: | + python3 -m cibuildwheel --output-dir dist + env: + CIBW_BUILD: "cp38-*64" + CIBW_MANYLINUX_X86_64_IMAGE: manylinux1 + CIBW_BEFORE_BUILD: git submodule update --init --recursive && pip install . 
+ # Install system library + CIBW_BEFORE_BUILD_LINUX: (yum install -y libffi-devel || apt-get install -y libffi-devel || apk add --update --no-cache libffi-devel || true) && (yum install -y libc6 || apt-get install -y libc6 || apk add --update --no-cache libc6 || true) + CIBW_ENVIRONMENT: "PIP_ONLY_BINARY=numpy" + CIBW_SKIP: "*musllinux*" + + - uses: actions/upload-artifact@v2 + with: + path: dist + + upload: + name: Upload to PyPi and create release + runs-on: ubuntu-latest + needs: [build_wheels, create_sdist, get_next_version] + steps: + - uses: actions/download-artifact@v2 + with: + name: artifact + path: dist + + # build the PyPI package and upload it + - name: upload + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + pip install setuptools wheel twine + python3 -m twine upload --repository pypi dist/* + + # create the release on github + - name: create release on github + uses: ncipollo/release-action@v1 + with: + tag: '${{ needs.get_next_version.outputs.new_tag }}' diff --git a/.gitignore b/.gitignore index 9546cffd90..4be13638de 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,11 @@ data-bin/ # Experimental Folder experimental/* + +# Weights and Biases logs +wandb/ + +# Hydra artifacts +nohup.out +multirun +outputs diff --git a/.gitmodules b/.gitmodules index df0d3d3071..07a55d45d4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "fairseq/models/huggingface/transformers"] - path = fairseq/models/huggingface/transformers - url = https://github.com/myleott/transformers.git - branch = fairseq [submodule "fairseq/model_parallel/megatron"] path = fairseq/model_parallel/megatron url = https://github.com/ngoyal2707/Megatron-LM diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..6b1d6aed8c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +exclude: 'build|stubs' + +default_language_version: + python: python3 + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: check-ast + - id: check-merge-conflict + - id: no-commit-to-branch + args: ['--branch=master'] + - id: check-added-large-files + args: ['--maxkb=500'] + - id: end-of-file-fixer + +- repo: https://github.com/ambv/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3.8 + +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.9.2 + hooks: + - id: flake8 + args: [ + # only error for syntax errors and undefined names + "--select=E9,F63,F7,F82", + ] + +- repo: https://github.com/pycqa/isort + rev: 5.10.1 + hooks: + - id: isort + exclude: README.md + additional_dependencies: [toml] + args: ["--profile", "black"] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d7ca6a98e..60e9025887 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,7 +5,7 @@ possible. ## Pull Requests We actively welcome your pull requests. -1. Fork the repo and create your branch from `master`. +1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. @@ -26,3 +26,57 @@ clear and has sufficient instructions to be able to reproduce the issue. By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq), you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. 
+ +## Pre-commit hooks +In order to ensure your code lints, there are pre-commit hooks configured in the repository which you can install. +After installation, they will automatically run each time you commit. +An abbreviated guide is given below; for more information, refer to [the official pre-commit documentation](https://pre-commit.com/). + +### Installation +``` +pip install pre-commit +pre-commit install +``` + +### Usage +Just commit your changes: +``` +git commit -m "My informative commit message" +``` + +If there is a failure, you will get feedback: +``` +[INFO] Initializing environment for https://github.com/PyCQA/flake8. +[INFO] Installing environment for https://github.com/pre-commit/pre-commit-hooks. +[INFO] Once installed this environment will be reused. +[INFO] This may take a few minutes... +[INFO] Installing environment for https://github.com/PyCQA/flake8. +[INFO] Once installed this environment will be reused. +[INFO] This may take a few minutes... +Trim Trailing Whitespace.................................................Failed +- hook id: trailing-whitespace +- exit code: 1 +- files were modified by this hook +Fixing examples/nllb/modeling/wmt15_benchmark/eval_langs2.sh +Fix End of Files.........................................................Failed +- hook id: end-of-file-fixer +- exit code: 1 +- files were modified by this hook +Fixing examples/few_shot/scripts/schedule_jobs_few_shot.py +flake8...................................................................Passed +``` + +Certain hooks modify your files to comply. +To include these modifications, you will need to add them (i.e. `git add ...`) and commit again. + +If all is well, you should see something like: +``` +Trim Trailing Whitespace.................................................Passed +Fix End of Files.........................................................Passed +flake8...................................................................Passed +[gshard-fix-ci 8698644e1] Fix lint, add pre-commit hooks + 10 files changed, 148 insertions(+), 110 deletions(-) + create mode 100644 .flake8 + create mode 100644 .pre-commit-config.yaml + rename examples/nllb/modeling/wmt15_benchmark/{eval_langs2.py => eval_langs2.sh} (99%) + ``` diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000..4f719da85c --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include fairseq/version.txt diff --git a/README.md b/README.md index 56ec16cdab..1150c66cbe 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,12 @@

- MIT License + Support Ukraine + MIT License Latest Release Build Status Documentation Status + CircleCI Status

-------------------------------------------------------------------------------- @@ -13,129 +15,169 @@ Fairseq(-py) is a sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling and other text generation tasks. + We provide reference implementations of various sequence modeling papers:
List of implemented papers

-- **Convolutional Neural Networks (CNN)** - - [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md) - - [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) - - [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) - - [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) - - [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) -- **LightConv and DynamicConv models** - - [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) -- **Long Short-Term Memory (LSTM) networks** - - Effective Approaches to Attention-based Neural Machine Translation (Luong et al., 2015) -- **Transformer (self-attention) networks** - - Attention Is All You Need (Vaswani et al., 2017) - - [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) - - [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) - - [Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)](examples/language_model/transformer_lm/README.md) - - [Lexically constrained decoding with dynamic beam allocation (Post & Vilar, 2018)](examples/constrained_decoding/README.md) - - [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) - - [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) - - [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) - - [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md ) - - [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et at., 2020)](examples/mbart/README.md) - - [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md) - - [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md) - - [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md) - - [Generating Medical Reports from Patient-Doctor Conversations Using Sequence-to-Sequence Models (Enarvi et al., 2020)](examples/pointer_generator/README.md) - - [Linformer: Self-Attention with Linear Complexity (Wang et al., 2020)](examples/linformer/README.md) - - [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md) - - [Deep Transformers with Latent Depth (Li et al., 2020)](examples/latent_depth/README.md) -- **Non-autoregressive Transformers** - - Non-Autoregressive Neural Machine Translation (Gu et al., 2017) - - Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al. 2018) - - Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al. 
2019) - - Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019) - - [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) -- **Finetuning** - - [Better Fine-Tuning by Reducing Representational Collapse (Aghajanyan et al. 2020)](examples/rxf/README.md) +* **Convolutional Neural Networks (CNN)** + + [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md) + + [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) + + [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) + + [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) + + [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) +* **LightConv and DynamicConv models** + + [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) +* **Long Short-Term Memory (LSTM) networks** + + Effective Approaches to Attention-based Neural Machine Translation (Luong et al., 2015) +* **Transformer (self-attention) networks** + + Attention Is All You Need (Vaswani et al., 2017) + + [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) + + [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) + + [Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)](examples/language_model/README.adaptive_inputs.md) + + [Lexically constrained decoding with dynamic beam allocation (Post & Vilar, 2018)](examples/constrained_decoding/README.md) + + [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context (Dai et al., 2019)](examples/truncated_bptt/README.md) + + [Adaptive Attention Span in Transformers (Sukhbaatar et al., 2019)](examples/adaptive_span/README.md) + + [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) + + [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) + + [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) + + [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md ) + + [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et at., 2020)](examples/mbart/README.md) + + [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md) + + [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md) + + [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md) + + [Generating Medical Reports from Patient-Doctor Conversations Using Sequence-to-Sequence Models (Enarvi et al., 2020)](examples/pointer_generator/README.md) + + [Linformer: Self-Attention with Linear Complexity (Wang et al., 2020)](examples/linformer/README.md) + + [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md) + + [Deep Transformers with Latent Depth (Li et al., 
2020)](examples/latent_depth/README.md) + + [Unsupervised Cross-lingual Representation Learning for Speech Recognition (Conneau et al., 2020)](https://arxiv.org/abs/2006.13979) + + [Self-training and Pre-training are Complementary for Speech Recognition (Xu et al., 2020)](https://arxiv.org/abs/2010.11430) + + [Robust wav2vec 2.0: Analyzing Domain Shift in Self-Supervised Pre-Training (Hsu, et al., 2021)](https://arxiv.org/abs/2104.01027) + + [Unsupervised Speech Recognition (Baevski, et al., 2021)](https://arxiv.org/abs/2105.11084) + + [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., 2021)](https://arxiv.org/abs/2109.11680) + + [VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding (Xu et. al., 2021)](https://arxiv.org/pdf/2109.14084.pdf) + + [VLM: Task-agnostic Video-Language Model Pre-training for Video Understanding (Xu et. al., 2021)](https://aclanthology.org/2021.findings-acl.370.pdf) + + [NormFormer: Improved Transformer Pretraining with Extra Normalization (Shleifer et. al, 2021)](examples/normformer/README.md) +* **Non-autoregressive Transformers** + + Non-Autoregressive Neural Machine Translation (Gu et al., 2017) + + Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al. 2018) + + Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al. 2019) + + Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019) + + [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) +* **Finetuning** + + [Better Fine-Tuning by Reducing Representational Collapse (Aghajanyan et al. 2020)](examples/rxf/README.md)

### What's New: +* May 2023 [Released models for Scaling Speech Technology to 1,000+ Languages (Pratap, et al., 2023)](examples/mms/README.md) +* June 2022 [Released code for wav2vec-U 2.0 from Towards End-to-end Unsupervised Speech Recognition (Liu, et al., 2022)](examples/wav2vec/unsupervised/README.md) +* May 2022 [Integration with xFormers](https://github.com/facebookresearch/xformers) +* December 2021 [Released Direct speech-to-speech translation code](examples/speech_to_speech/README.md) +* October 2021 [Released VideoCLIP and VLM models](examples/MMPT/README.md) +* October 2021 [Released multilingual finetuned XLSR-53 model](examples/wav2vec/README.md) +* September 2021 [`master` branch renamed to `main`](https://github.com/github/renaming). +* July 2021 [Released DrNMT code](examples/discriminative_reranking_nmt/README.md) +* July 2021 [Released Robust wav2vec 2.0 model](examples/wav2vec/README.md) +* June 2021 [Released XLMR-XL and XLMR-XXL models](examples/xlmr/README.md) +* May 2021 [Released Unsupervised Speech Recognition code](examples/wav2vec/unsupervised/README.md) +* March 2021 [Added full parameter and optimizer state sharding + CPU offloading](examples/fully_sharded_data_parallel/README.md) +* February 2021 [Added LASER training code](examples/laser/README.md) +* December 2020: [Added Adaptive Attention Span code](examples/adaptive_span/README.md) +* December 2020: [GottBERT model and code released](examples/gottbert/README.md) +* November 2020: Adopted the [Hydra](https://github.com/facebookresearch/hydra) configuration framework + * [see documentation explaining how to use it for new and existing projects](docs/hydra_integration.md) +* November 2020: [fairseq 0.10.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.10.0) +* October 2020: [Added R3F/R4F (Better Fine-Tuning) code](examples/rxf/README.md) +* October 2020: [Deep Transformer with Latent Depth code released](examples/latent_depth/README.md) +* October 2020: [Added CRISS models and code](examples/criss/README.md) -- October 2020: [Added R3F/R4F (Better Fine-Tuning) code](examples/rxf/README.md) -- October 2020: [Deep Transformer with Latent Depth code released](examples/latent_depth/README.md) -- October 2020: [Added CRISS models and code](examples/criss/README.md) -- September 2020: [Added Linformer code](examples/linformer/README.md) -- September 2020: [Added pointer-generator networks](examples/pointer_generator/README.md) -- August 2020: [Added lexically constrained decoding](examples/constrained_decoding/README.md) -- August 2020: [wav2vec2 models and code released](examples/wav2vec/README.md) -- July 2020: [Unsupervised Quality Estimation code released](examples/unsupervised_quality_estimation/README.md) -- May 2020: [Follow fairseq on Twitter](https://twitter.com/fairseq) -- April 2020: [Monotonic Multihead Attention code released](examples/simultaneous_translation/README.md) -- April 2020: [Quant-Noise code released](examples/quant_noise/README.md) -- April 2020: [Initial model parallel support and 11B parameters unidirectional LM released](examples/megatron_11b/README.md)
Previous updates

-- March 2020: [Byte-level BPE code released](examples/byte_level_bpe/README.md) -- February 2020: [mBART model and code released](examples/mbart/README.md) -- February 2020: [Added tutorial for back-translation](https://github.com/pytorch/fairseq/tree/master/examples/backtranslation#training-your-own-model-wmt18-english-german) -- December 2019: [fairseq 0.9.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.9.0) -- November 2019: [VizSeq released (a visual analysis toolkit for evaluating fairseq models)](https://facebookresearch.github.io/vizseq/docs/getting_started/fairseq_example) -- November 2019: [CamemBERT model and code released](examples/camembert/README.md) -- November 2019: [BART model and code released](examples/bart/README.md) -- November 2019: [XLM-R models and code released](examples/xlmr/README.md) -- September 2019: [Nonautoregressive translation code released](examples/nonautoregressive_translation/README.md) -- August 2019: [WMT'19 models released](examples/wmt19/README.md) -- July 2019: fairseq relicensed under MIT license -- July 2019: [RoBERTa models and code released](examples/roberta/README.md) -- June 2019: [wav2vec models and code released](examples/wav2vec/README.md) +* September 2020: [Added Linformer code](examples/linformer/README.md) +* September 2020: [Added pointer-generator networks](examples/pointer_generator/README.md) +* August 2020: [Added lexically constrained decoding](examples/constrained_decoding/README.md) +* August 2020: [wav2vec2 models and code released](examples/wav2vec/README.md) +* July 2020: [Unsupervised Quality Estimation code released](examples/unsupervised_quality_estimation/README.md) +* May 2020: [Follow fairseq on Twitter](https://twitter.com/fairseq) +* April 2020: [Monotonic Multihead Attention code released](examples/simultaneous_translation/README.md) +* April 2020: [Quant-Noise code released](examples/quant_noise/README.md) +* April 2020: [Initial model parallel support and 11B parameters unidirectional LM released](examples/megatron_11b/README.md) +* March 2020: [Byte-level BPE code released](examples/byte_level_bpe/README.md) +* February 2020: [mBART model and code released](examples/mbart/README.md) +* February 2020: [Added tutorial for back-translation](https://github.com/pytorch/fairseq/tree/main/examples/backtranslation#training-your-own-model-wmt18-english-german) +* December 2019: [fairseq 0.9.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.9.0) +* November 2019: [VizSeq released (a visual analysis toolkit for evaluating fairseq models)](https://facebookresearch.github.io/vizseq/docs/getting_started/fairseq_example) +* November 2019: [CamemBERT model and code released](examples/camembert/README.md) +* November 2019: [BART model and code released](examples/bart/README.md) +* November 2019: [XLM-R models and code released](examples/xlmr/README.md) +* September 2019: [Nonautoregressive translation code released](examples/nonautoregressive_translation/README.md) +* August 2019: [WMT'19 models released](examples/wmt19/README.md) +* July 2019: fairseq relicensed under MIT license +* July 2019: [RoBERTa models and code released](examples/roberta/README.md) +* June 2019: [wav2vec models and code released](examples/wav2vec/README.md)

### Features: -- multi-GPU training on one machine or across multiple machines (data and model parallel) -- fast generation on both CPU and GPU with multiple search algorithms implemented: - - beam search - - Diverse Beam Search ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)) - - sampling (unconstrained, top-k and top-p/nucleus) - - lexically constrained decoding ([Post & Vilar, 2018](examples/constrained_decoding/README.md)) -- large mini-batch training even on a single GPU via delayed updates -- mixed precision training (trains faster with less GPU memory on [NVIDIA tensor cores](https://developer.nvidia.com/tensor-cores)) -- extensible: easily register new models, criterions, tasks, optimizers and learning rate schedulers +* multi-GPU training on one machine or across multiple machines (data and model parallel) +* fast generation on both CPU and GPU with multiple search algorithms implemented: + + beam search + + Diverse Beam Search ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)) + + sampling (unconstrained, top-k and top-p/nucleus) + + [lexically constrained decoding](examples/constrained_decoding/README.md) (Post & Vilar, 2018) +* [gradient accumulation](https://fairseq.readthedocs.io/en/latest/getting_started.html#large-mini-batch-training-with-delayed-updates) enables training with large mini-batches even on a single GPU +* [mixed precision training](https://fairseq.readthedocs.io/en/latest/getting_started.html#training-with-half-precision-floating-point-fp16) (trains faster with less GPU memory on [NVIDIA tensor cores](https://developer.nvidia.com/tensor-cores)) +* [extensible](https://fairseq.readthedocs.io/en/latest/overview.html): easily register new models, criterions, tasks, optimizers and learning rate schedulers +* [flexible configuration](docs/hydra_integration.md) based on [Hydra](https://github.com/facebookresearch/hydra) allowing a combination of code, command-line and file based configuration +* [full parameter and optimizer state sharding](examples/fully_sharded_data_parallel/README.md) +* [offloading parameters to CPU](examples/fully_sharded_data_parallel/README.md) We also provide [pre-trained models for translation and language modeling](#pre-trained-models-and-examples) with a convenient `torch.hub` interface: -```python + +``` python en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model') en2de.translate('Hello world', beam=5) # 'Hallo Welt' ``` + See the PyTorch Hub tutorials for [translation](https://pytorch.org/hub/pytorch_fairseq_translation/) and [RoBERTa](https://pytorch.org/hub/pytorch_fairseq_roberta/) for more examples. 
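The same hub interface exposes the pre-trained RoBERTa models as well. The snippet below is an illustrative sketch (it assumes the published `roberta.large` hub entry and downloads the weights on first use):

``` python
import torch

# Load a pre-trained RoBERTa model through torch.hub
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()  # disable dropout for deterministic feature extraction

# Apply RoBERTa's BPE to a sentence and extract the final-layer features
tokens = roberta.encode('Hello world!')
features = roberta.extract_features(tokens)  # (1, num_tokens, 1024) for roberta.large
```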
# Requirements and Installation -* [PyTorch](http://pytorch.org/) version >= 1.4.0 -* Python version >= 3.6 +* [PyTorch](http://pytorch.org/) version >= 1.10.0 +* Python version >= 3.8 * For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl) * **To install fairseq** and develop locally: -```bash + +``` bash git clone https://github.com/pytorch/fairseq cd fairseq pip install --editable ./ # on MacOS: # CFLAGS="-stdlib=libc++" pip install --editable ./ + +# to install the latest stable release (0.10.x) +# pip install fairseq ``` + * **For faster training** install NVIDIA's [apex](https://github.com/NVIDIA/apex) library: -```bash + +``` bash git clone https://github.com/NVIDIA/apex cd apex pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \ --global-option="--deprecated_fused_adam" --global-option="--xentropy" \ --global-option="--fast_multihead_attn" ./ ``` -* **For large datasets** install [PyArrow](https://arrow.apache.org/docs/python/install.html#using-pip): `pip install pyarrow` -* If you use Docker make sure to increase the shared memory size either with -`--ipc=host` or `--shm-size` as command line options to `nvidia-docker run`. +* **For large datasets** install [PyArrow](https://arrow.apache.org/docs/python/install.html#using-pip): `pip install pyarrow` +* If you use Docker make sure to increase the shared memory size either with `--ipc=host` or `--shm-size` + as command line options to `nvidia-docker run` . # Getting Started @@ -148,30 +190,32 @@ types and tasks. We provide pre-trained models and pre-processed, binarized test sets for several tasks listed below, as well as example training and evaluation commands. -- [Translation](examples/translation/README.md): convolutional and transformer models are available -- [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available +* [Translation](examples/translation/README.md): convolutional and transformer models are available +* [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available We also have more detailed READMEs to reproduce results from specific papers: -- [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md) -- [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md) -- [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md) -- [Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)](examples/quant_noise/README.md) -- [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md) -- [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et at., 2020)](examples/mbart/README.md) -- [Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019)](examples/layerdrop/README.md) -- [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md) -- [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) -- [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) -- [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) -- 
[wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) -- [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) -- [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) -- [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) -- [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) -- [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) -- [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) -- [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) -- [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md) + +* [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale (Babu et al., 2021)](examples/wav2vec/xlsr/README.md) +* [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md) +* [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md) +* [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md) +* [Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)](examples/quant_noise/README.md) +* [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md) +* [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et at., 2020)](examples/mbart/README.md) +* [Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019)](examples/layerdrop/README.md) +* [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md) +* [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) +* [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) +* [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) +* [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) +* [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) +* [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) +* [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) +* [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) +* [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) +* [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) +* [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) +* [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/README.conv.md) # Join the fairseq 
community @@ -188,7 +232,7 @@ The license applies to the pre-trained models as well. Please cite as: -```bibtex +``` bibtex @inproceedings{ott2019fairseq, title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000000..79480a11c5 --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,13 @@ +# Creating a New Release + +In order to create a new release: + +1. Navigate to the [Fairseq Workflows](https://github.com/facebookresearch/fairseq/actions) and find the one named _Fairseq Release_. + +2. Under _Run Workflow_ choose the branch `main` and for _Release Type_ enter either `major`, `minor`, or `patch`. + +3. A branch named `$new_version-release` will be created where the `version.txt` file is updated. Merge those changes into `main`. + +4. Make sure that a [new PYPI package](https://pypi.org/project/fairseq/) has been uploaded. + +5. Make sure that a [new github release](https://github.com/facebookresearch/fairseq/releases) has been created. diff --git a/config/config.yaml b/config/config.yaml deleted file mode 100644 index b9ee6c74ac..0000000000 --- a/config/config.yaml +++ /dev/null @@ -1,111 +0,0 @@ -# @package _group_ -common: - no_progress_bar: false - log_interval: 100 - log_format: null - tensorboard_logdir: null - seed: 1 - cpu: false - tpu: false - bf16: false - fp16: false - memory_efficient_fp16: false - memory_efficient_bf16: false - fp16_no_flatten_grads: false - fp16_init_scale: 128 - fp16_scale_window: null - fp16_scale_tolerance: 0.0 - min_loss_scale: 1.0e-4 - threshold_loss_scale: null - user_dir: null - empty_cache_freq: 0 - all_gather_list_size: 16384 - model_parallel_size: 1 - quantization_config_path: null - profile: false -distributed_training: - distributed_rank: 0 - distributed_backend: "nccl" - distributed_init_method: null - distributed_port: -1 - device_id: 0 - local_rank: 0 - distributed_no_spawn: false - ddp_backend: "c10d" - bucket_cap_mb: 25 - fix_batches_to_gpus: false - find_unused_parameters: false - fast_stat_sync: false - broadcast_buffers: false - distributed_wrapper: "DDP" - slowmo_momentum: null - slowmo_algorithm: "LocalSGD" - localsgd_frequency: 3 -dataset: - num_workers: 1 - skip_invalid_size_inputs_valid_test: false - max_tokens: null - batch_size: null - required_batch_size_multiple: 8 - dataset_impl: null - data_buffer_size: 10 - train_subset: "train" - valid_subset: "valid" - validate_interval: 1 - fixed_validation_seed: null - disable_validation: false - curriculum: 0 - gen_subset: "test" - num_shards: 1 - shard_id: 0 - max_tokens_valid: ${dataset.max_tokens} - batch_size_valid: ${dataset.batch_size} -optimization: - max_epoch: 0 - max_update: 0 - clip_norm: 25.0 - sentence_avg: false - update_freq: [ 1 ] - lr: [ 0.25 ] - min_lr: -1.0 - use_bmuf: false -checkpoint: - save_dir: "checkpoints" - restore_file: "checkpoint_last.pt" - reset_dataloader: false - reset_lr_scheduler: false - reset_meters: false - reset_optimizer: false - optimizer_overrides: "{}" - save_interval: 1 - save_interval_updates: 0 - keep_interval_updates: -1 - keep_last_epochs: -1 - keep_best_checkpoints: -1 - no_save: false - no_epoch_checkpoints: false - no_last_checkpoints: false - no_save_optimizer_state: false - best_checkpoint_metric: "loss" - maximize_best_checkpoint_metric: false - patience: -1 - checkpoint_suffix: "" -bmuf: - block_lr: 1 - block_momentum: 0.875 - 
global_sync_iter: 50 - warmup_iterations: 500 - use_nbm: false - average_sync: false -defaults: - - task: language_modeling - - model: null - - criterion: null - - optimizer: null - - lr_scheduler: null - - bpe: null - - tokenizer: null - - scoring: null - - generation: null - - common_eval: null - - eval_lm: null diff --git a/config/criterion/adaptive_loss.yaml b/config/criterion/adaptive_loss.yaml deleted file mode 100644 index 7997b0766e..0000000000 --- a/config/criterion/adaptive_loss.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -sentence_avg: ${optimization.sentence_avg} -ddp_backend: ${distributed_training.ddp_backend} diff --git a/config/criterion/cross_entropy.yaml b/config/criterion/cross_entropy.yaml deleted file mode 100644 index ad3d4148c2..0000000000 --- a/config/criterion/cross_entropy.yaml +++ /dev/null @@ -1,2 +0,0 @@ -# @package _group_ -sentence_avg: ${optimization.sentence_avg} diff --git a/config/lr_scheduler/cosine.yaml b/config/lr_scheduler/cosine.yaml deleted file mode 100644 index 0f91e0d240..0000000000 --- a/config/lr_scheduler/cosine.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# @package _group_ -warmup_updates: 0 -warmup_init_lr: -1 -max_lr: 1.0 -t_mult: 1.0 -lr_period_updates: -1 -lr_shrink: 0.1 diff --git a/config/lr_scheduler/inverse_sqrt.yaml b/config/lr_scheduler/inverse_sqrt.yaml deleted file mode 100644 index 0eac7d88eb..0000000000 --- a/config/lr_scheduler/inverse_sqrt.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -warmup_updates: 4000 -warmup_init_lr: -1 diff --git a/config/model/transformer_lm.yaml b/config/model/transformer_lm.yaml deleted file mode 100644 index 3837ea54e1..0000000000 --- a/config/model/transformer_lm.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.1 -attention_dropout: 0.0 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 512 -decoder_output_dim: 512 -decoder_input_dim: 512 -decoder_ffn_embed_dim: 2048 -decoder_layers: 6 -decoder_attention_heads: 8 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/config/optimizer/adam.yaml b/config/optimizer/adam.yaml deleted file mode 100644 index e5264f895e..0000000000 --- a/config/optimizer/adam.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# @package _group_ -adam_betas: "(0.9, 0.999)" -adam_eps: 1.0e-8 -weight_decay: 0 -use_old_adam: false diff --git a/config/optimizer/nag.yaml b/config/optimizer/nag.yaml deleted file mode 100644 index 4ab2745686..0000000000 --- a/config/optimizer/nag.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -momentum: 0.99 -weight_decay: 0.0 diff --git a/config/task/language_modeling.yaml b/config/task/language_modeling.yaml deleted file mode 100644 index 58a2ad1358..0000000000 --- a/config/task/language_modeling.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# @package _group_ -data: ??? 
-sample_break_mode: "none" -tokens_per_sample: 1024 -output_dictionary_size: -1 -self_target: false -future_target: false -past_target: false -add_bos_token: false -max_target_positions: null diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css deleted file mode 100644 index 2a07641936..0000000000 --- a/docs/_static/theme_overrides.css +++ /dev/null @@ -1,9 +0,0 @@ -.wy-table-responsive table td kbd { - white-space: nowrap; -} -.wy-table-responsive table td { - white-space: normal !important; -} -.wy-table-responsive { - overflow: visible !important; -} diff --git a/docs/conf.py b/docs/conf.py index 440784bfae..0bc049f802 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,7 +55,7 @@ copyright = "Facebook AI Research (FAIR)" author = "Facebook AI Research (FAIR)" -github_doc_root = "https://github.com/pytorch/fairseq/tree/master/docs/" +github_doc_root = "https://github.com/pytorch/fairseq/tree/main/docs/" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -88,43 +88,7 @@ # -- Options for HTML output ---------------------------------------------- -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -html_context = { - "css_files": [ - "_static/theme_overrides.css", # override wide tables in RTD theme - ], -} - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -# html_sidebars = { -# '**': [ -# 'about.html', -# 'navigation.html', -# 'relations.html', # needs 'show_related': True theme option to display -# 'searchbox.html', -# 'donate.html', -# ] -# } - +html_theme = "classic" # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { diff --git a/docs/getting_started.rst b/docs/getting_started.rst index fa5971dd31..745ad7763c 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -90,7 +90,7 @@ well for the IWSLT 2014 dataset: > mkdir -p checkpoints/fconv > CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ - --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ + --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ --arch fconv_iwslt_de_en --save-dir checkpoints/fconv By default, :ref:`fairseq-train` will use all available GPUs on your machine. Use the @@ -170,21 +170,31 @@ The easiest way to launch jobs is with the `torch.distributed.launch For example, to train a large English-German Transformer model on 2 nodes each with 8 GPUs (in total 16 GPUs), run the following command on each node, -replacing ``node_rank=0`` with ``node_rank=1`` on the second node: +replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making +sure to update ``--master_addr`` to the IP address of the first node: .. 
code-block:: console > python -m torch.distributed.launch --nproc_per_node=8 \ --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \ - --master_port=1234 \ + --master_port=12345 \ $(which fairseq-train) data-bin/wmt16_en_de_bpe32k \ --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ - --lr 0.0005 --min-lr 1e-09 \ + --lr 0.0005 \ --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --max-tokens 3584 \ - --fp16 --distributed-no-spawn + --max-epoch 70 \ + --fp16 + +On SLURM clusters, fairseq will automatically detect the number of nodes and +GPUs, but a port number must be provided: + +.. code-block:: console + + > salloc --gpus=16 --nodes 2 (...) + > srun fairseq-train --distributed-port 12345 (...). Sharding very large datasets ---------------------------- diff --git a/docs/hydra_integration.md b/docs/hydra_integration.md index 0973cd279e..6a15298382 100644 --- a/docs/hydra_integration.md +++ b/docs/hydra_integration.md @@ -1,111 +1,284 @@ +## Hydra +[Hydra](https://github.com/facebookresearch/hydra) is an open-source Python +framework that simplifies the development of research and other complex +applications. The key feature is the ability to dynamically create a +hierarchical configuration by composition and override it through config files +and the command line. The name Hydra comes from its ability to run multiple +similar jobs - much like a Hydra with multiple heads. -## Hydra +## Motivation + +Until recently, all components in fairseq were configured through a shared +`args` namespace that was created at application startup. Components declared +their own `add_args` method to update the argparse parser, hoping that the names +would not clash with arguments from other components. While this model works for +smaller applications, as fairseq grew and became integrated into other +applications, this became problematic. In order to determine how to configure +each component, one needed to a) examine what args were added by this component, +and b) read the code to figure out what shared arguments it is using that were +added in other places. Reproducing models involved sharing commands that often +contained dozens of command line switches. + +The model described above is still supported by fairseq for backward +compatibility, but will be deprecated some time in the future. + +New components in fairseq should now create a dataclass that encapsulates all +parameters required to configure this component. The dataclass is registered +along with the component, and fairseq takes care of constructing and providing +this configuration object to the component's constructor. Note that sharing +parameters can optionally still work, but one has to explicitly point to the +"source of truth" (see inheritance example below). These changes make components +in fairseq more independent and re-usable by other applications: all that is +needed to create a component is to initialize its dataclass and overwrite some +of the defaults. + +While configuring fairseq through command line (using either the legacy argparse +based or the new Hydra based entry points) is still fully supported, you can now +take advantage of configuring fairseq completely or piece-by-piece through +hierarchical YAML configuration files. These files can also be shipped as +examples that others can use to run an identically configured job. 
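For a sense of what such a file can look like, here is a minimal, hypothetical sketch of a language-modeling configuration; the file name, data path, and hyperparameter values are placeholders, and the available keys follow the dataclasses described in the sections below:

```yaml
# @package _group_
# Hypothetical override file, e.g. config/lm_baseline.yaml, which could be selected with
# `fairseq-hydra-train --config-dir config --config-name lm_baseline`
common:
  fp16: true
  log_format: json
task:
  _name: language_modeling
  data: /path/to/data-bin   # placeholder path to a binarized dataset
  tokens_per_sample: 512
model:
  _name: transformer_lm
optimizer:
  _name: adam
lr_scheduler:
  _name: inverse_sqrt
  warmup_updates: 4000
optimization:
  max_update: 50000
  lr: [0.0005]
```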
+
+Additionally, Hydra has a rich and growing [library of
+plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that
+provide functionality such as hyperparameter sweeping (including using bayesian
+optimization through the [Ax](https://github.com/facebook/Ax) library), job
+launching across various platforms, and more.
+
+## Creating or migrating components
-Hydra is an open-source Python framework that simplifies the development of research and other complex applications. The key feature is the ability to dynamically create a hierarchical configuration by composition and override it through config files and the command line. The name Hydra comes from its ability to run multiple similar jobs - much like a Hydra with multiple heads.
+In general, each new (or updated) component should provide a companion
+[dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclasses are
+typically located in the same file as the component and are passed as arguments
+to the `register_*()` functions. Top-level configs that should be present in
+every fairseq application are placed in the
+[global](fairseq/dataclass/configs.py) config file and added to the
+`FairseqConfig` object.
-## Train models with hydra interface
+Each dataclass is a plain-old-data object, similar to a `NamedTuple`. These
+classes are decorated with a `@dataclass` decorator, and typically inherit from
+`FairseqDataclass` (which adds some functionality for backward compatibility).
+Each field must have a type, and generally has metadata (such as a help string)
+and a default value. Only primitive types or other config objects are allowed as
+data types for each field.
-#### Provide parameters in `.yaml` files
-For example, if we'd like to train a language model with transformer, we could provide parameters in yaml files. Note that the modules used (task, model, criterion, optimizer, lr scheduler) in training must be migrated with hydra interface already (See session below).
+#### Example:
-- Provide top level choices on which generic parameter file, and which modules to use: `config/config.yaml`, this will look like for example:
+```python
+from dataclasses import dataclass, field
+from fairseq.dataclass import FairseqDataclass
+@dataclass
+class InteractiveConfig(FairseqDataclass):
+    buffer_size: int = field(
+        default=0,
+        metadata={
+            "help": "read this many sentences into a buffer before processing them"
+        },
+    )
+    input: str = field(
+        default="-",
+        metadata={"help": "file to read from; use - for stdin"},
+    )
```
-defaults:
-  - task: language_modeling
-  - model: transformer_lm
-  - criterion: cross_entropy
-  - optimizer: adam
-  - lr_scheduler: inverse_sqrt
+
+### Inheriting values
+
+Some components require sharing a value. For example, a learning rate scheduler
+and an optimizer may both need to know the initial learning rate value. One can
+declare a field that, by default, will inherit its value from another config
+node in the same hierarchy:
+
+```python
+@dataclass
+class FairseqAdamConfig(FairseqDataclass):
+    ...
+    lr: List[float] = II("optimization.lr")
+    ...
``` -- Provide generic parameters common across different jobs: `config.yaml` -- Provide task parameters: `config/task/language_modeling.yaml` -- Provide model parameters: `config/model/transformer_lm.yaml` -- Provide criterion parameters: `config/criterion/cross_entropy.yaml` -- Provide optimizer parameters: `config/optimizer/adam.yaml` -- Provide lr_scheduler parameters `config/lr_scheduler/inverse_sqrt.yaml` +`II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"`, which is +the value one can use in a YAML config file or through command line to achieve +the same effect. Note that this assumes that there is an "optimization" config +object in the root config and it has a field called "lr". + +### Tasks and Models -#### Command line overriding -`train_hydra.py` is the main entry point for training with hydra interface. If we specify all parameters we want in `.yaml` files, then we could simply use command: +Creating Tasks and Models works same as before, except that legacy +implementations now inherit from `LegacyFairseq*` base classes, while new +components inherit from `FairseqTask` and `FairseqModel` and provide a dataclass +to the `register_*()` functions. +#### Task example: + +```python +@dataclass +class LanguageModelingConfig(FairseqDataclass): + data: Optional[str] = field( + default=None, metadata={"help": "path to data directory"} + ) + ... + +@register_task("language_modeling", dataclass=LanguageModelingConfig) +class LanguageModelingTask(FairseqTask): + ... + @classmethod + def setup_task(cls, cfg: LanguageModelingConfig): + ... ``` -# task.data is requested field marked by `???` in yaml -python fairseq_cli/train_hydra.py \ -task.data=/private/home/abaevski/data/wiki103 \ + +#### Model example: + +```python +@dataclass +class TransformerLanguageModelConfig(FairseqDataclass): + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="relu", metadata={"help": "activation function to use"} + ) + dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) + ... + +@register_model("transformer_lm", dataclass=TransformerLanguageModelConfig) +class TransformerLanguageModel(FairseqLanguageModel): + ... + @classmethod + def build_model(cls, cfg: TransformerLanguageModelConfig, task: FairseqTask): + ... ``` -Alternatively, if we need to override certain params from the command line, we could do so as below (note the structure of where each parameter sits) +### Other components + +Other components work as before, but they now take their configuration dataclass +as the only constructor argument: + +```python +@dataclass +class MosesTokenizerConfig(FairseqDataclass): + source_lang: str = field(default="en", metadata={"help": "source language"}) + ... +@register_tokenizer("moses", dataclass=MosesTokenizerConfig) +class MosesTokenizer(object): + def __init__(self, cfg: MosesTokenizerConfig): + ... 
``` -python fairseq_cli/train_hydra.py -task=language_modeling \ -task.data=/private/home/abaevski/data/wiki103 \ -task.tokens_per_sample=512 \ -task.sample_break_mode=none \ -model=transformer_lm \ -model.share_decoder_input_output_embed=true \ -model.dropout=0.1 \ -optimizer=adam \ -optimizer.adam_betas="'(0.9, 0.98)'" \ -optimizer.weight_decay=0.01 \ -lr_scheduler=inverse_sqrt \ -lr_scheduler.warmup_updates=4000 \ -lr_scheduler.warmup_init_lr=1e-07 \ -criterion=cross_entropy \ -common.fp16=true \ -common.log_format=json \ -common.log_interval=1 \ -dataset.max_tokens=1024 \ -dataset.num_workers=4 \ -optimization.update_freq=[16] \ -optimization.max_update=50000 \ -optimization.clip_norm=0.0 \ -optimization.lr=[0.0005] \ -checkpoint.save_dir=/checkpoint/mtian/transformer_wikitext-103-hydra-args-cli \ -checkpoint.save_interval_updates=10 + +Note that if you are adding a new registry for a new set of components, you need +to add it to the `FairseqConfig` object in `fairseq/dataclass/configs.py`: + +```python +@dataclass +class FairseqConfig(object): + ... + my_new_registry: Any = None ``` -## Migrate existing/Creating new modules to hydra interface +## Training with `fairseq-hydra-train` + +To fully take advantage of configuration flexibility offered by Hydra, you may +want to train new models using the `fairseq-hydra-train` entry point. Legacy CLI +tools such as `fairseq-train` will remain supported for the foreseeable future +but will be deprecated eventually. -In each of the modules we want to migrated/create with hydra interface, fundamentally we need to +On startup, Hydra will create a configuration object that contains a hierarchy +of all the necessary dataclasses populated with their default values in the +code. The default values are overwritten by values found in YAML files in +`fairseq/config` directory (which currently sets minimal defaults) and then +further overwritten by values provided through command line arguments. -- Provide a dataclass that layouts the parameters used in the module. +Some of the most common use cases are shown below: -- Modify the builder and/or constructor that previously takes `argparse.Namespace` argument `args`, into taking `omegaconf.DictConfig` config objects. At this moment we allow `Union[omegaconf.DictConfig, argparse.Namespace]` to support compatibility. +### 1. Override default values through command line: -- For `add_args()`, we need to extract argument from the dataclass defined in the same file, and append them into `parser`. This is also to support compatibility. This is simply supported with `gen_parser_from_dataclass` API, see examples files below. +```shell script +$ fairseq-hydra-train \ + distributed_training.distributed_world_size=1 \ + dataset.batch_size=2 \ + task.data=data-bin \ + model=transformer_lm/transformer_lm_gpt \ + task=language_modeling \ + optimization.max_update=5000 +``` + +Note that along with explicitly providing values for parameters such as +`dataset.batch_size`, this also tells Hydra to overlay configuration found in +`fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` over the default +values in the dataclass. If you want to train a model without specifying a +particular architecture you can simply specify `model=transformer_lm`. This only +works for migrated tasks and models. -#### Migrated examples: +### 2. 
Replace bundled configs with an external config: -- Task: `fairseq/tasks/language_modeling.py` +```shell script +$ fairseq-hydra-train \ + --config-dir /path/to/external/configs \ + --config-name wiki103 +``` -- Model: `fairseq/models/transformer_lm.py` +where `/path/to/external/configs/wiki103.yaml` contains: -- Criterion: `fairseq/criterions/adaptive_loss.py` and `fairseq/criterions/cross_entropy.py` +```yaml +# @package _group_ -- Optimizer: `fairseq/optim/adam.py` and `fairseq/optim/nag.py` +model: + _name: transformer_lm +distributed_training: + distributed_world_size: 1 +dataset: + batch_size: 2 +task: + _name: language_modeling + data: /path/to/data + add_bos_token: false + max_target_positions: 1024 +optimization: + max_update: 50000 + lr: [ 0.25 ] +criterion: cross_entropy +optimizer: adam +lr_scheduler: + _name: cosine +``` -- LR scheduler: `fairseq/optim/lr_scheduler/cosine_lr_scheduler.py` and `fairseq/optim/lr_scheduler/inverse_square_root_schedule.py` +Note that here bundled configs from `fairseq/config` directory are not used, +however the defaults from each dataclass will still be used (unless overwritten +by your external config). +Additionally you can choose to break up your configs by creating a directory +structure in the same location as your main config file, with the names of the +top-level fields (such as "model", "dataset", etc), and placing config files +with meaningful names that would populate that specific section of your +top-level config file (for example, you might have +`model/small_transformer_lm.yaml`, `model/big_transformer_lm.yaml`, etc). You +can then specify the correct configuration via command line, defaults in the +main config, or even launch all of them as a sweep (see Hydra documentation on +how to do this). -## Interpolate parameters across different places +### 3. Add an external config directory to Hydra search path: -## Support of legacy interface -If you still like to pass legacy style arguments in command line, `fairseq_cli/train.py` can support this. Internally it coverted `args` into hydra config objects whenever there are migrated modules aligned. +This allows combining default configuration (including using any bundled config +files), while specifying your own config files for some parts of the +configuration. +```shell script +$ fairseq-hydra-train \ + distributed_training.distributed_world_size=1 \ + dataset.batch_size=2 \ + task.data=/path/to/data/ \ + model=transformer_lm/2_layers \ + task=language_modeling \ + optimization.max_update=5000 \ + --config-dir /path/to/external/configs ``` -python fairseq_cli/train.py --task language_modeling \ -/private/home/abaevski/data/wiki103 \ ---save-dir /checkpoint/mtian/transformer_wikitext-103-hydra-args-cli \ ---arch transformer_lm --share-decoder-input-output-embed \ ---dropout 0.1 \ ---optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \ ---lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ ---tokens-per-sample 512 --sample-break-mode none \ ---max-tokens 1024 --update-freq 16 \ ---fp16 \ ---max-update 50000 --log-format json --log-interval 1 --num-workers 4 \ ---save-interval-updates 10 + +where `/path/to/external/configs` has the following structure: +``` +. ++-- model +| +-- transformer_lm +| | +-- 2_layers.yaml ``` + +and `2_layers.yaml` contains a copy of `transformer_lm_gpt.yaml` but with +`decoder_layers` set to 2. You can add other configs to configure other +components as well. 
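+
+For concreteness, here is a minimal sketch of what `2_layers.yaml` could look
+like. The authoritative field set is whatever the bundled
+`transformer_lm_gpt.yaml` contains; the values below are illustrative only,
+with `decoder_layers` changed to 2:
+
+```yaml
+# @package _group_
+activation_fn: gelu
+dropout: 0.1
+attention_dropout: 0.1
+decoder_embed_dim: 768
+decoder_ffn_embed_dim: 3072
+decoder_layers: 2
+decoder_attention_heads: 12
+```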
diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index c734a1f04f..0000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -sphinx<2.0 -sphinx-argparse diff --git a/docs/tutorial_classifying_names.rst b/docs/tutorial_classifying_names.rst index b02fec0489..de099f08f5 100644 --- a/docs/tutorial_classifying_names.rst +++ b/docs/tutorial_classifying_names.rst @@ -208,7 +208,7 @@ following contents:: import torch from fairseq.data import Dictionary, LanguagePairDataset - from fairseq.tasks import FairseqTask, register_task + from fairseq.tasks import LegacyFairseqTask, register_task @register_task('simple_classification') diff --git a/examples/MMPT/.gitignore b/examples/MMPT/.gitignore new file mode 100644 index 0000000000..70a255dc91 --- /dev/null +++ b/examples/MMPT/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +runs +data +pretrained_models +projects/mmfusion_* +log_test +third-party +python_log +slurm_snapshot_code +lightning_logs +demos diff --git a/examples/MMPT/CONFIG.md b/examples/MMPT/CONFIG.md new file mode 100644 index 0000000000..bbd1403dfa --- /dev/null +++ b/examples/MMPT/CONFIG.md @@ -0,0 +1,41 @@ +### Config Files Explained + +Taking `projects/mfmmlm.yaml` for example, which run pretraining using masked frame model (MFM) and masked language model (MLM) on a single BERT: + +```yaml +project_dir: mfmmlm # specify the project dir for this baseline. +run_task: + - how2.yaml # run pretraining on how2 when launching `projects/taskmfmmlm.yaml` + - [vtt.yaml, vttcap.yaml, vttqa.yaml, youcook.yaml, youcookcap.yaml, crosstask.yaml, coin.yaml] # run fine-tuning tasks. 
+base_dir: task # a global template folder to specify each training task.
+task_group:
+  pretrain: # section for pretraining. Most baselines differ in this section.
+    task_list:
+      - how2.yaml # reconfigure `projects/task/how2.yaml`
+    dataset:
+      aligner: MFMMLMAligner # overwrite the aligner for the MFMMLM training task.
+    model:
+      model_cls: MMFusionMFMMLM # overwrite the model, which constructs negative examples for MFM on-the-fly.
+    loss:
+      loss_cls: MFMMLM # overwrite the loss as MFMMLM, which combines MFM and MLM together.
+    fairseq: # all fairseq args can be specified under this name.
+      dataset:
+        batch_size: 128
+  finetune: # section for fine-tuning tasks; we mostly don't need to change anything here since we want to see how pretraining contributes to finetuning.
+    task_list: # specify the list of downstream tasks, e.g., copy `projects/task/vtt.yaml` to `projects/mfmmlm`.
+      - vtt.yaml
+      - vttqa.yaml
+      - youcook.yaml
+      - youcookcap.yaml
+      - crosstask.yaml
+      - coin.yaml
+  test: # section for testing.
+    task_list:
+      - test_vtt.yaml
+      - test_vttqa.yaml
+      - test_youcook.yaml
+      - test_youcookcap.yaml
+      - test_crosstask.yaml
+      - test_crosstask_zs.yaml
+      - test_coin.yaml
+```
diff --git a/examples/MMPT/DATASET.md b/examples/MMPT/DATASET.md
new file mode 100644
index 0000000000..930403eb36
--- /dev/null
+++ b/examples/MMPT/DATASET.md
@@ -0,0 +1,34 @@
+# Dataset
+
+We understand video data are challenging to download and process. For videos, we provide our preprocessing scripts under `scripts/video_feature_extractor` (deeply adapted from `https://github.com/antoine77340/video_feature_extractor`); for text, we provide pre-tokenizing scripts under `scripts/text_token_extractor`.
+
+### S3D Feature Extraction
+We use pre-trained [S3D](https://github.com/antoine77340/S3D_HowTo100M) for video feature extraction. Please place the models as `pretrained_models/s3d_dict.npy` and `pretrained_models/s3d_howto100m.pth`.
+
+We implement a `PathBuilder` to automatically track video ids and map source video paths to their feature locations (you may need `conda install -c anaconda pandas`). Decoding may need `pip install ffmpeg-python`.
+
+### Howto100M
+[Howto100M](https://www.di.ens.fr/willow/research/howto100m/) is a large-scale video pre-training dataset. You may download the videos yourself and run our preprocessing scripts.
+
+Several key differences of our preprocessing from existing papers: (1) we use `raw_caption.json` instead of `caption.json` to have pure self-supervision on text (`caption.json` has manual removal of stop words); (2) we remove partially duplicated texts that are originally designed for real-time readability (see `mmpt/processors/dedupprocessor.py`); (3) we then shard video/text features using `ShardedTensor` in `mmpt/utils/shardedtensor.py` for fast loading during training (faster than `h5py`).
+
+#### Steps
+##### video
+To extract video features: edit and run `bash scripts/video_feature_extractor/how2/s3d.sh` (consider running this on multiple machines; by default, we store features in fp16 to save space and also for faster training).
+
+Split the available video ids into `data/how2/how2_s3d_train.lst` and `data/how2/how2_s3d_val.lst`.
+
+Lastly, pack video features into `ShardedTensor` using `python scripts/video_feature_extractor/shard_feature.py`.
+
+##### text
+Clean captions using `python -m mmpt.processors.dedupprocessor`.
+
+Tokenize the deduplicated captions `data/how2/raw_caption_dedup.pkl` into sharded numpy arrays:
+```
+python scripts/text_token_extractor/pretokenization.py scripts/text_token_extractor/configs/bert-base-uncased.yaml
+```
+
+### Youcook, MSRVTT etc.
+We use the versions of Youcook and MSRVTT that come with Howto100M and MILNCE. Please download the data to `data/youcook` and `data/msrvtt` accordingly; see `projects/task/youcook.yaml`, `projects/task/vtt.yaml`, etc. for details.
+We extract features for Youcook and MSRVTT similarly to the first step of Howto100M, but we read text from the metadata directly and perform on-the-fly tokenization.
+
diff --git a/examples/MMPT/README.md b/examples/MMPT/README.md
new file mode 100644
index 0000000000..4a84819d9d
--- /dev/null
+++ b/examples/MMPT/README.md
@@ -0,0 +1,166 @@
+# VideoCLIP and VLM
+
+You have just found the toolkit for multimodal video understanding! It contains implementations of two recent multi-modal video understanding papers, [VideoCLIP](https://arxiv.org/pdf/2109.14084.pdf) (EMNLP, 2021) and [VLM](https://aclanthology.org/2021.findings-acl.370.pdf) (ACL Findings, 2021), along with high-performance toolkits that are typically lacking in existing codebases. The toolkit is designed to contain generic performance-tuned components that can potentially be adapted to other frameworks (we initially use fairseq).
+
+VideoCLIP is a contrastive learning model for zero-shot transfer to retrieval/classification/sequence labeling style tasks.
+
+
+
+VLM is a masked language model style pre-training using only one encoder with masked modality model (MMM) for retrieval/generation/sequence labeling style tasks.
+
+
+
+### News
+[Oct. 2021] Initial release of implementation for the following papers:
+[VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding](https://arxiv.org/pdf/2109.14084.pdf) (Xu et al., EMNLP 2021)
+[VLM: Task-agnostic Video-Language Model Pre-training for Video Understanding](https://aclanthology.org/2021.findings-acl.370.pdf) (Xu et al., ACL Findings 2021)
+
+
+### Installation
+We aim to minimize the dependency of this repo on other packages.
+We use fairseq as the main trainer (there is no models/datasets dependency on fairseq; we will support other trainers in the future):
+```
+git clone https://github.com/pytorch/fairseq
+cd fairseq
+pip install -e . # also optionally follow fairseq README for apex installation for fp16 training.
+export MKL_THREADING_LAYER=GNU # fairseq may need this for numpy.
+```
+
+Then install this toolkit:
+```
+cd examples/MMPT # MMPT can be in any folder, not necessarily under fairseq/examples.
+pip install -e .
+```
+
+The code was developed under Python=3.8.8, PyTorch=1.8, CUDA=11.0 with fairseq=1.0.0a0+af0389f, and tested under Python=3.8.8, PyTorch=1.9, CUDA=11.0, fairseq=1.0.0a0+8e7bc73 during code release.
+Most models require `transformers==3.4` for API compatibility (`pip install transformers==3.4`).
+In addition, some downstream tasks may need `conda install pandas`.
+
+
+### Usage
+#### Download Checkpoints
+We use pre-trained [S3D](https://github.com/antoine77340/S3D_HowTo100M) for video feature extraction. Please place the models as `pretrained_models/s3d_dict.npy` and `pretrained_models/s3d_howto100m.pth`.
+
+Download the VideoCLIP checkpoint `https://dl.fbaipublicfiles.com/MMPT/retri/videoclip/checkpoint_best.pt` to `runs/retri/videoclip`, or the VLM checkpoint `https://dl.fbaipublicfiles.com/MMPT/mtm/vlm/checkpoint_best.pt` to `runs/mtm/vlm`.
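+
+For example, the checkpoints can be fetched into the expected locations as
+follows (assuming `wget` is available; any download tool works):
+
+```
+mkdir -p runs/retri/videoclip runs/mtm/vlm
+wget -P runs/retri/videoclip https://dl.fbaipublicfiles.com/MMPT/retri/videoclip/checkpoint_best.pt
+wget -P runs/mtm/vlm https://dl.fbaipublicfiles.com/MMPT/mtm/vlm/checkpoint_best.pt
+```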
+
+#### Demo of Inference
+Run `python locallaunch.py projects/retri/videoclip.yaml --dryrun` to generate all the `.yaml`s for VideoCLIP.
+
+```python
+import torch
+
+from mmpt.models import MMPTModel
+
+
+model, tokenizer, aligner = MMPTModel.from_pretrained(
+    "projects/retri/videoclip/how2.yaml")
+
+model.eval()
+
+
+# B, T, FPS, H, W, C (VideoCLIP is trained on 30 fps of s3d)
+video_frames = torch.randn(1, 2, 30, 224, 224, 3)
+caps, cmasks = aligner._build_text_seq(
+    tokenizer("some text", add_special_tokens=False)["input_ids"]
+)
+
+caps, cmasks = caps[None, :], cmasks[None, :]  # bsz=1
+
+with torch.no_grad():
+    output = model(video_frames, caps, cmasks, return_score=True)
+print(output["score"])  # dot-product
+```
+
+#### Data Preparation
+See [dataset](DATASET.md) for each dataset.
+
+#### Global Config for Training Pipeline
+We organize a global config file for a training/testing pipeline under projects (see a detailed [explanation](CONFIG.md)). For example, VideoCLIP is in `projects/retri/videoclip.yaml` and VLM is in `projects/mtm/vlm.yaml`.
+
+We wrap all commands into `locallaunch.py` and `mmpt_cli/localjob.py`. You can check the concrete commands with `--dryrun` and then drop it for an actual run.
+
+First, running `python locallaunch.py projects/retri/videoclip.yaml --dryrun` will generate the configs for pre-training, zero-shot evaluation, fine-tuning and testing of VideoCLIP under `projects/retri/videoclip`.
+
+Then each (either training or evaluation) process will be configured by a concrete config file (we save all complex arguments into the concrete config file for reproducibility, including fairseq args). For example, to run zero-shot evaluation on Youcook:
+```
+python locallaunch.py projects/retri/videoclip/test_youcook_zs.yaml --jobtype local_predict # zero-shot evaluation.
+python locallaunch.py projects/retri/videoclip/youcook_videoclip.yaml --jobtype local_single --dryrun # fine-tuning: use --dryrun to check cmds and drop it to make an actual run; local_small will run on two gpus (as in the paper).
+python locallaunch.py projects/retri/videoclip/test_youcook_videoclip.yaml --jobtype local_predict # testing on the fine-tuned model.
+```
+
+Pretraining can be run as:
+```
+python locallaunch.py projects/retri/videoclip/how2.yaml --jobtype local_single --dryrun # check the cmds then drop --dryrun; the paper's runs used local_big with 8 gpus.
+```
+You may need to change `--jobtype`; check/extend `LocalJob` in `mmpt_cli/localjob.py` for multi-gpu/multi-node pre-training.
+
+Detailed instructions for pretraining and fine-tuning can be found in the [pretraining instruction](pretraining.md) and the [finetuning instruction](endtask.md).
+
+
+### Development
+Several components of this toolkit can be re-used for future research (and also our ongoing research).
+
+#### Framework Wrapper
+We currently only support fairseq, but most components can easily be fit into other frameworks like huggingface. This repo is a `--user-dir` of fairseq with a fairseq wrapper. For example, `mmpt/tasks` includes a `FairseqMMTTask`, which manages `mmpt/datasets` with `FairseqDataset`, `mmpt/models` with `FairseqModel`, and `mmpt/losses` with `FairseqCriterion`.
+
+#### Processors
+**Multi**modal research introduces complexity in modality alignment, from different input sources all the way to the losses. Inspired by [MMF](https://github.com/facebookresearch/mmf), this toolkit leverages `mmpt/processors` to handle various needs of data preprocessing and loading, **alleviating** the need for multiple `torch.utils.data.Dataset` subclasses (which can be tricky for ablation studies).
+Processors can also be decoupled from `torch.data.utils.Dataset` for offline preprocessing instead of on-the-fly data preprocessing. + +We decouple a `mmpt.MMDataset` as 3 types of processors: `MetaProcessor`, `VideoProcessor`, `TextProcessor` and `Aligner`. They can be configed in `dataset` field of a config file (e.g., see `projects/task/how2.yaml`). +`MetaProcessor` is used to load the meta data about a dataset, aka, all video_ids of how2 dataset. +`VideoProcessor` is used to load the video features about a dataset. For example, S3D features for each second of a video. +`TextProcessor` is used to load the text (feature). For example, BERT pre-tokenized text clips for how2 dataset (with `start`s, `end`s of timestamps and `cap` for `token_ids`). +`Aligner` is the core class for different baselines that prepares the training data. For example, sampling a clip, masking tokens for MLM, etc. + +#### Performance-tuned Components +To speed up pre-training, this toolkit uses sharded features stored in mmaped numpy, backed by `ShardedTensor` in `mmpt/utils/shardedtensor.py` (adopted from MARGE paper). This reduces the loads of IO for multi-GPU training without loading all features for a video into the memory each time and `ShardedTensor` ensure features are stored in continuous disk space for near random access. This is used for both How2 video features and texts in `mmpt/processors/how2processor.py`. + + +### Citation +If this codebase is useful for your work, please cite the following papers: + +```BibTeX +@inproceedings{xu-etal-2021-videoclip, + title = "{VideoCLIP}: Contrastive Pre-training for\\Zero-shot Video-Text Understanding", + author = "Xu, Hu and + Ghosh, Gargi and + Huang, Po-Yao and + Okhonko, Dmytro and + Aghajanyan, Armen and + Metze, Florian and + Zettlemoyer, Luke and + Feichtenhofer, Christoph", + booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP)", + month = nov, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", +} + +@inproceedings{xu-etal-2021-vlm, + title = "{VLM}: Task-agnostic Video-Language Model Pre-training for Video Understanding", + author = "Xu, Hu and + Ghosh, Gargi and + Huang, Po-Yao and + Arora, Prahal and + Aminzadeh, Masoumeh and + Feichtenhofer, Christoph and + Metze, Florian and + Zettlemoyer, Luke", + booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.findings-acl.370", + doi = "10.18653/v1/2021.findings-acl.370", + pages = "4227--4239", +} +``` + +### Bug Reports +This repo is in its initial stage, welcome bug reports to huxu@fb.com + +### Copyright +The majority of Multimodal Pre-training (MMPT) is licensed under CC-BY-NC, however portions of the project are available under separate license terms: Evaluation Codes/Models: Howto100M and HuggingFace Transformers are licensed under the Apache2.0 license; COIN and NLG-eval are licensed under the MIT license; CrossTask is licensed under the BSD-3; DiDeMo is licensed under the BSD-2 license. diff --git a/examples/MMPT/endtask.md b/examples/MMPT/endtask.md new file mode 100644 index 0000000000..7690955327 --- /dev/null +++ b/examples/MMPT/endtask.md @@ -0,0 +1,41 @@ +# Zero-shot Transfer and Finetuning + +(If you are new to the ideas of `mmpt.processors`, see [README](README.md) first.) 
+All finetuning datasets (specifically `processors`) are defined in `mmpt.processors.dsprocessor`.
+Given the complexity of different types of finetuning tasks, each task may have its own meta/video/text/aligner processors and `mmpt/evaluators/{Predictor,Metric}`.
+
+### Tasks
+
+Currently, we support 5 end datasets: `MSRVTT`, `Youcook`, `COIN`, `Crosstask` and `DiDeMo`, with the following tasks:
+text-video retrieval: `MSRVTT`, `Youcook`, `DiDeMo`;
+video captioning: `Youcook`;
+video question answering: `MSRVTT-QA`.
+
+To add your own dataset, you can specify the corresponding processors and configure them in the `dataset` field of a config file, such as `projects/task/vtt.yaml`.
+
+### Zero-shot Transfer (no Training)
+Zero-shot transfer runs the pre-trained model (e.g., VideoCLIP) directly on testing data. Configs matching the pattern `projects/task/*_zs_*.yaml` are dedicated to zero-shot transfer.
+
+### Fine-tuning
+
+The training of a downstream task is similar to pretraining, except that you may need to specify the `restore_file` in `fairseq.checkpoint` and reset the optimizers; see `projects/task/ft.yaml`, which is included by `projects/task/vtt.yaml`.
+
+We typically do finetuning on 2 gpus (`local_small`).
+
+### Testing
+For each finetuning dataset, you may need to specify a testing config, similar to `projects/task/test_vtt.yaml`.
+
+We define `mmpt.evaluators.Predictor` for different types of prediction. For example, `MSRVTT` and `Youcook` are video-retrieval tasks and are expected to use `RetrievalPredictor`. You may need to define your own type of predictor and specify it in the `predictor` field of a testing config.
+
+Each task may also have its own metric for evaluation. This can be created in `mmpt.evaluators.Metric` and specified in the `metric` field of a testing config.
+
+Launching a test is as simple as launching training: specify the path of a testing config:
+```python locallaunch.py projects/mfmmlm/test_vtt.yaml```
+Testing will be launched locally by default since prediction is computationally less expensive.
+
+### Third-party Libraries
+We list the following finetuning tasks that require third-party libraries.
+
+Youcook captioning: `https://github.com/Maluuba/nlg-eval`
+
+CrossTask: `https://github.com/DmZhukov/CrossTask`'s `dp` under `third-party/CrossTask` (`python setup.py build_ext --inplace`)
diff --git a/examples/MMPT/locallaunch.py b/examples/MMPT/locallaunch.py
new file mode 100644
index 0000000000..e20fd816fa
--- /dev/null
+++ b/examples/MMPT/locallaunch.py
@@ -0,0 +1,148 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse +import os + +from omegaconf import OmegaConf + +from mmpt.utils import recursive_config, overwrite_dir +from mmpt_cli.localjob import LocalJob + + +class JobLauncher(object): + JOB_CONFIG = { + "local": LocalJob, + } + + def __init__(self, yaml_file): + self.yaml_file = yaml_file + job_key = "local" + + if yaml_file.endswith(".yaml"): + config = recursive_config(yaml_file) + if config.task_type is not None: + job_key = config.task_type.split("_")[0] + else: + raise ValueError("unknown extension of job file:", yaml_file) + self.job_key = job_key + + def __call__(self, job_type=None, dryrun=False): + if job_type is not None: + self.job_key = job_type.split("_")[0] + print("[JobLauncher] job_key", self.job_key) + job = JobLauncher.JOB_CONFIG[self.job_key]( + self.yaml_file, job_type=job_type, dryrun=dryrun) + return job.submit() + + +class Pipeline(object): + """a job that loads yaml config.""" + + def __init__(self, fn): + """ + load a yaml config of a job and save generated configs as yaml for each task. + return: a list of files to run as specified by `run_task`. + """ + if fn.endswith(".py"): + # a python command. + self.backend = "python" + self.run_yamls = [fn] + return + + job_config = recursive_config(fn) + if job_config.base_dir is None: # single file job config. + self.run_yamls = [fn] + return + + self.project_dir = os.path.join("projects", job_config.project_dir) + self.run_dir = os.path.join("runs", job_config.project_dir) + + if job_config.run_task is not None: + run_yamls = [] + for stage in job_config.run_task: + # each stage can have multiple tasks running in parallel. + if OmegaConf.is_list(stage): + stage_yamls = [] + for task_file in stage: + stage_yamls.append( + os.path.join(self.project_dir, task_file)) + run_yamls.append(stage_yamls) + else: + run_yamls.append(os.path.join(self.project_dir, stage)) + self.run_yamls = run_yamls + configs_to_save = self._overwrite_task(job_config) + self._save_configs(configs_to_save) + + def __getitem__(self, idx): + yaml_files = self.run_yamls[idx] + if isinstance(yaml_files, list): + return [JobLauncher(yaml_file) for yaml_file in yaml_files] + return [JobLauncher(yaml_files)] + + def __len__(self): + return len(self.run_yamls) + + def _save_configs(self, configs_to_save: dict): + # save + os.makedirs(self.project_dir, exist_ok=True) + for config_file in configs_to_save: + config = configs_to_save[config_file] + print("saving", config_file) + OmegaConf.save(config=config, f=config_file) + + def _overwrite_task(self, job_config): + configs_to_save = {} + self.base_project_dir = os.path.join("projects", job_config.base_dir) + self.base_run_dir = os.path.join("runs", job_config.base_dir) + + for config_sets in job_config.task_group: + overwrite_config = job_config.task_group[config_sets] + if ( + overwrite_config.task_list is None + or len(overwrite_config.task_list) == 0 + ): + print( + "[warning]", + job_config.task_group, + "has no task_list specified.") + # we don't want this added to a final config. + task_list = overwrite_config.pop("task_list", None) + for config_file in task_list: + config_file_path = os.path.join( + self.base_project_dir, config_file) + config = recursive_config(config_file_path) + # overwrite it. 
+ if overwrite_config: + config = OmegaConf.merge(config, overwrite_config) + overwrite_dir(config, self.run_dir, basedir=self.base_run_dir) + save_file_path = os.path.join(self.project_dir, config_file) + configs_to_save[save_file_path] = config + return configs_to_save + + +def main(args): + job_type = args.jobtype if args.jobtype else None + # parse multiple pipelines. + pipelines = [Pipeline(fn) for fn in args.yamls.split(",")] + + for pipe_id, pipeline in enumerate(pipelines): + if not hasattr(pipeline, "project_dir"): + for job in pipeline[0]: + job(job_type=job_type, dryrun=args.dryrun) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("yamls", type=str) + parser.add_argument( + "--dryrun", + action="store_true", + help="run config and prepare to submit without launch the job.", + ) + parser.add_argument( + "--jobtype", type=str, default="", + help="force to run jobs as specified.") + args = parser.parse_args() + main(args) diff --git a/examples/MMPT/mmpt/__init__.py b/examples/MMPT/mmpt/__init__.py new file mode 100644 index 0000000000..6ff86ddd5c --- /dev/null +++ b/examples/MMPT/mmpt/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +try: + # fairseq user dir + from .datasets import FairseqMMDataset + from .losses import FairseqCriterion + from .models import FairseqMMModel + from .tasks import FairseqMMTask +except ImportError: + pass diff --git a/examples/MMPT/mmpt/datasets/__init__.py b/examples/MMPT/mmpt/datasets/__init__.py new file mode 100644 index 0000000000..2578235e17 --- /dev/null +++ b/examples/MMPT/mmpt/datasets/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .mmdataset import * + +try: + from .fairseqmmdataset import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/datasets/fairseqmmdataset.py b/examples/MMPT/mmpt/datasets/fairseqmmdataset.py new file mode 100644 index 0000000000..02c49141db --- /dev/null +++ b/examples/MMPT/mmpt/datasets/fairseqmmdataset.py @@ -0,0 +1,57 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +TODO (huxu): fairseq wrapper class for all dataset you defined: mostly MMDataset. +""" + +from collections import OrderedDict + +from torch.utils.data import Dataset +from torch.utils.data.dataloader import default_collate +from fairseq.data import FairseqDataset, data_utils + + +class FairseqMMDataset(FairseqDataset): + """ + A wrapper class for MMDataset for fairseq. 
+ """ + + def __init__(self, mmdataset): + if not isinstance(mmdataset, Dataset): + raise TypeError("mmdataset must be of type `torch.utils.data.dataset`.") + self.mmdataset = mmdataset + + def set_epoch(self, epoch, **unused): + super().set_epoch(epoch) + self.epoch = epoch + + def __getitem__(self, idx): + with data_utils.numpy_seed(43211, self.epoch, idx): + return self.mmdataset[idx] + + def __len__(self): + return len(self.mmdataset) + + def collater(self, samples): + if hasattr(self.mmdataset, "collator"): + return self.mmdataset.collator(samples) + if len(samples) == 0: + return {} + if isinstance(samples[0], dict): + batch = OrderedDict() + for key in samples[0]: + if samples[0][key] is not None: + batch[key] = default_collate([sample[key] for sample in samples]) + return batch + else: + return default_collate(samples) + + def size(self, index): + """dummy implementation: we don't use --max-tokens""" + return 1 + + def num_tokens(self, index): + """dummy implementation: we don't use --max-tokens""" + return 1 diff --git a/examples/MMPT/mmpt/datasets/mmdataset.py b/examples/MMPT/mmpt/datasets/mmdataset.py new file mode 100644 index 0000000000..3d07283f91 --- /dev/null +++ b/examples/MMPT/mmpt/datasets/mmdataset.py @@ -0,0 +1,111 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from collections import OrderedDict + +from torch.utils.data import Dataset +from torch.utils.data.dataloader import default_collate + +from ..utils import set_seed + + +class MMDataset(Dataset): + """ + A generic multi-modal dataset. + Args: + `meta_processor`: a meta processor, + handling loading meta data and return video_id and text_id. + `video_processor`: a video processor, + handling e.g., decoding, loading .np files. + `text_processor`: a text processor, + handling e.g., tokenization. + `aligner`: combine the video and text feature + as one training example. + """ + + def __init__( + self, + meta_processor, + video_processor, + text_processor, + align_processor, + ): + self.split = meta_processor.split + self.meta_processor = meta_processor + self.video_processor = video_processor + self.text_processor = text_processor + self.align_processor = align_processor + + def __len__(self): + return len(self.meta_processor) + + def __getitem__(self, idx): + if self.split == "test": + set_seed(idx) + video_id, text_id = self.meta_processor[idx] + video_feature = self.video_processor(video_id) + text_feature = self.text_processor(text_id) + output = self.align_processor(video_id, video_feature, text_feature) + # TODO (huxu): the following is for debug purpose. + output.update({"idx": idx}) + return output + + def collater(self, samples): + """This collator is deprecated. + set self.collator = MMDataset.collater. + see collator in FairseqMMDataset. 
+ """ + + if len(samples) == 0: + return {} + if isinstance(samples[0], dict): + batch = OrderedDict() + for key in samples[0]: + if samples[0][key] is not None: + batch[key] = default_collate( + [sample[key] for sample in samples]) + # if torch.is_tensor(batch[key]): + # print(key, batch[key].size()) + # else: + # print(key, len(batch[key])) + return batch + else: + return default_collate(samples) + + def print_example(self, output): + print("[one example]", output["video_id"]) + if ( + hasattr(self.align_processor, "subsampling") + and self.align_processor.subsampling is not None + and self.align_processor.subsampling > 1 + ): + for key in output: + if torch.is_tensor(output[key]): + output[key] = output[key][0] + + # search tokenizer to translate ids back. + tokenizer = None + if hasattr(self.text_processor, "tokenizer"): + tokenizer = self.text_processor.tokenizer + elif hasattr(self.align_processor, "tokenizer"): + tokenizer = self.align_processor.tokenizer + if tokenizer is not None: + caps = output["caps"].tolist() + if isinstance(caps[0], list): + caps = caps[0] + print("caps", tokenizer.decode(caps)) + print("caps", tokenizer.convert_ids_to_tokens(caps)) + + for key, value in output.items(): + if torch.is_tensor(value): + if len(value.size()) >= 3: # attention_mask. + print(key, value.size()) + print(key, "first", value[0, :, :]) + print(key, "last", value[-1, :, :]) + else: + print(key, value) + print("[end of one example]") diff --git a/examples/MMPT/mmpt/evaluators/__init__.py b/examples/MMPT/mmpt/evaluators/__init__.py new file mode 100644 index 0000000000..2d06b9d797 --- /dev/null +++ b/examples/MMPT/mmpt/evaluators/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .metric import * +from .evaluator import * + + +# experimental. +try: + from .expmetric import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/evaluators/evaluator.py b/examples/MMPT/mmpt/evaluators/evaluator.py new file mode 100644 index 0000000000..94d9c5ec9a --- /dev/null +++ b/examples/MMPT/mmpt/evaluators/evaluator.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os +import glob +import numpy as np + +from . import metric as metric_path +from . import predictor as predictor_path + + +class Evaluator(object): + """ + perform evaluation on a single (downstream) task. + make this both offline and online. + TODO(huxu) saving evaluation results. 
+ """ + + def __init__(self, config, eval_dataloader=None): + if config.metric is None: + raise ValueError("config.metric is", config.metric) + metric_cls = getattr(metric_path, config.metric) + self.metric = metric_cls(config) + if config.predictor is None: + raise ValueError("config.predictor is", config.predictor) + predictor_cls = getattr(predictor_path, config.predictor) + self.predictor = predictor_cls(config) + self.eval_dataloader = eval_dataloader + + def __call__(self): + try: + print(self.predictor.pred_dir) + for pred_file in glob.glob( + self.predictor.pred_dir + "/*_merged.npy"): + outputs = np.load(pred_file) + results = self.metric.compute_metrics(outputs) + self.metric.print_computed_metrics(results) + + outputs = np.load(os.path.join( + self.predictor.pred_dir, "merged.npy")) + results = self.metric.compute_metrics(outputs) + return {"results": results, "metric": self.metric} + except FileNotFoundError: + print("\n[missing]", self.predictor.pred_dir) + return {} + + def evaluate(self, model, eval_dataloader=None, output_file="merged"): + if eval_dataloader is None: + eval_dataloader = self.eval_dataloader + outputs = self.predictor.predict_loop( + model, eval_dataloader, output_file) + results = self.metric.compute_metrics(**outputs) + return results diff --git a/examples/MMPT/mmpt/evaluators/metric.py b/examples/MMPT/mmpt/evaluators/metric.py new file mode 100644 index 0000000000..163724bb25 --- /dev/null +++ b/examples/MMPT/mmpt/evaluators/metric.py @@ -0,0 +1,313 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import json + + +class Metric(object): + def __init__(self, config, metric_names): + self.metric_names = metric_names + + def best_metric(self, metric): + return metric[self.metric_names[0]] + + def save_metrics(self, fn, metrics): + with open(fn, "w") as fw: + json.dump(fw, metrics) + + def print_computed_metrics(self, metrics): + raise NotImplementedError + + +class RetrievalMetric(Metric): + """ + this is modified from `howto100m/metrics.py`. + History of changes: + refactor as a class. + add metric_key in __init__ + """ + + def __init__(self, config, metric_names=["R1", "R5", "R10", "MR"]): + super().__init__(config, metric_names) + self.error = False # TODO(huxu): add to config to print error. + + def compute_metrics(self, outputs, texts, **kwargs): + x = outputs + sx = np.sort(-x, axis=1) + d = np.diag(-x) + d = d[:, np.newaxis] + ind = sx - d + ind = np.where(ind == 0) + ind = ind[1] + metrics = {} + metrics["R1"] = float(np.sum(ind == 0)) / len(ind) + metrics["R5"] = float(np.sum(ind < 5)) / len(ind) + metrics["R10"] = float(np.sum(ind < 10)) / len(ind) + metrics["MR"] = np.median(ind) + 1 + + max_idx = np.argmax(outputs, axis=1) + if self.error: + # print top-20 errors. + error = [] + for ex_idx in range(20): + error.append((texts[ex_idx], texts[max_idx[ex_idx]])) + metrics["error"] = error + return metrics + + def print_computed_metrics(self, metrics): + r1 = metrics["R1"] + r5 = metrics["R5"] + r10 = metrics["R10"] + mr = metrics["MR"] + print( + "R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}".format( + r1, r5, r10, mr + ) + ) + if "error" in metrics: + print(metrics["error"]) + + +class DiDeMoMetric(Metric): + """ + History of changes: + python 2.x to python 3.x. + merge utils.py into eval to save one file. 
+ reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + Code to evaluate your results on the DiDeMo dataset. + """ + def __init__(self, config, metric_names=["rank1", "rank5", "miou"]): + super().__init__(config, metric_names) + + def compute_metrics(self, outputs, targets, **kwargs): + assert len(outputs) == len(targets) + rank1, rank5, miou = self._eval_predictions(outputs, targets) + metrics = { + "rank1": rank1, + "rank5": rank5, + "miou": miou + } + return metrics + + def print_computed_metrics(self, metrics): + rank1 = metrics["rank1"] + rank5 = metrics["rank5"] + miou = metrics["miou"] + # print("Average rank@1: %f" % rank1) + # print("Average rank@5: %f" % rank5) + # print("Average iou: %f" % miou) + + print( + "Average rank@1: {:.4f} Average rank@5: {:.4f} Average iou: {:.4f}".format( + rank1, rank5, miou + ) + ) + + def _iou(self, pred, gt): + intersection = max(0, min(pred[1], gt[1]) + 1 - max(pred[0], gt[0])) + union = max(pred[1], gt[1]) + 1 - min(pred[0], gt[0]) + return float(intersection)/union + + def _rank(self, pred, gt): + return pred.index(tuple(gt)) + 1 + + def _eval_predictions(self, segments, data): + ''' + Inputs: + segments: For each item in the ground truth data, rank possible video segments given the description and video. + In DiDeMo, there are 21 posible moments extracted for each video so the list of video segments will be of length 21. + The first video segment should be the video segment that best corresponds to the text query. + There are 4180 sentence in the validation data, so when evaluating a model on the val dataset, + segments should be a list of lenght 4180, and each item in segments should be a list of length 21. + data: ground truth data + ''' + average_ranks = [] + average_iou = [] + for s, d in zip(segments, data): + pred = s[0] + ious = [self._iou(pred, t) for t in d['times']] + average_iou.append(np.mean(np.sort(ious)[-3:])) + ranks = [self._rank(s, t) for t in d['times'] if tuple(t) in s] # if t in s] is added for s, e not in prediction. 
+ average_ranks.append(np.mean(np.sort(ranks)[:3])) + rank1 = np.sum(np.array(average_ranks) <= 1)/float(len(average_ranks)) + rank5 = np.sum(np.array(average_ranks) <= 5)/float(len(average_ranks)) + miou = np.mean(average_iou) + + # print("Average rank@1: %f" % rank1) + # print("Average rank@5: %f" % rank5) + # print("Average iou: %f" % miou) + return rank1, rank5, miou + + +class NLGMetric(Metric): + def __init__( + self, + config, + metric_names=[ + "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", + "METEOR", "ROUGE_L", "CIDEr" + ] + ): + super().__init__(config, metric_names) + # please install NLGEval from `https://github.com/Maluuba/nlg-eval` + from nlgeval import NLGEval + self.nlg = NLGEval() + + def compute_metrics(self, outputs, targets, **kwargs): + return self.nlg.compute_metrics( + hyp_list=outputs, ref_list=targets) + + def print_computed_metrics(self, metrics): + Bleu_1 = metrics["Bleu_1"] + Bleu_2 = metrics["Bleu_2"] + Bleu_3 = metrics["Bleu_3"] + Bleu_4 = metrics["Bleu_4"] + METEOR = metrics["METEOR"] + ROUGE_L = metrics["ROUGE_L"] + CIDEr = metrics["CIDEr"] + + print( + "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f} - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}".format( + Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr + ) + ) + + +class QAMetric(Metric): + def __init__( + self, + config, + metric_names=["acc"] + ): + super().__init__(config, metric_names) + + def compute_metrics(self, outputs, targets, **kwargs): + from sklearn.metrics import accuracy_score + return {"acc": accuracy_score(targets, outputs)} + + def print_computed_metrics(self, metrics): + print("acc: {:.4f}".format(metrics["acc"])) + + +class COINActionSegmentationMetric(Metric): + """ + COIN dataset listed 3 repos for Action Segmentation. + Action Sets, NeuralNetwork-Viterbi, TCFPN-ISBA. + The first and second are the same. + https://github.com/alexanderrichard/action-sets/blob/master/eval.py + + Future reference for the third: + `https://github.com/Zephyr-D/TCFPN-ISBA/blob/master/utils/metrics.py` + """ + def __init__(self, config, metric_name=["frame_acc"]): + super().__init__(config, metric_name) + + def compute_metrics(self, outputs, targets): + n_frames = 0 + n_errors = 0 + n_errors = sum(outputs != targets) + n_frames = len(targets) + return {"frame_acc": 1.0 - float(n_errors) / n_frames} + + def print_computed_metrics(self, metrics): + fa = metrics["frame_acc"] + print("frame accuracy:", fa) + + +class CrossTaskMetric(Metric): + def __init__(self, config, metric_names=["recall"]): + super().__init__(config, metric_names) + + def compute_metrics(self, outputs, targets, **kwargs): + """refactored from line 166: + https://github.com/DmZhukov/CrossTask/blob/master/train.py""" + + recalls = self._get_recalls(Y_true=targets, Y_pred=outputs) + results = {} + for task, rec in recalls.items(): + results[str(task)] = rec + + avg_recall = np.mean(list(recalls.values())) + results["recall"] = avg_recall + return results + + def print_computed_metrics(self, metrics): + print('Recall: {0:0.3f}'.format(metrics["recall"])) + for task in metrics: + if task != "recall": + print('Task {0}. 
Recall = {1:0.3f}'.format( + task, metrics[task])) + + def _get_recalls(self, Y_true, Y_pred): + """refactored from + https://github.com/DmZhukov/CrossTask/blob/master/train.py""" + + step_match = {task: 0 for task in Y_true.keys()} + step_total = {task: 0 for task in Y_true.keys()} + for task, ys_true in Y_true.items(): + ys_pred = Y_pred[task] + for vid in set(ys_pred.keys()).intersection(set(ys_true.keys())): + y_true = ys_true[vid] + y_pred = ys_pred[vid] + step_total[task] += (y_true.sum(axis=0) > 0).sum() + step_match[task] += (y_true*y_pred).sum() + recalls = { + task: step_match[task] / n for task, n in step_total.items()} + return recalls + + +class ActionRecognitionMetric(Metric): + def __init__( + self, + config, + metric_names=["acc", "acc_splits", "r1_splits", "r5_splits", "r10_splits"] + ): + super().__init__(config, metric_names) + + def compute_metrics(self, outputs, targets, splits, **kwargs): + all_video_embd = outputs + labels = targets + split1, split2, split3 = splits + accs = [] + r1s = [] + r5s = [] + r10s = [] + for split in range(3): + if split == 0: + s = split1 + elif split == 1: + s = split2 + else: + s = split3 + + X_pred = all_video_embd[np.where(s == 2)[0]] + label_test = labels[np.where(s == 2)[0]] + logits = X_pred + X_pred = np.argmax(X_pred, axis=1) + acc = np.sum(X_pred == label_test) / float(len(X_pred)) + accs.append(acc) + # compute recall. + sorted_pred = (-logits).argsort(axis=-1) + label_test_sp = label_test.reshape(-1, 1) + + r1 = np.mean((sorted_pred[:, :1] == label_test_sp).sum(axis=1), axis=0) + r5 = np.mean((sorted_pred[:, :5] == label_test_sp).sum(axis=1), axis=0) + r10 = np.mean((sorted_pred[:, :10] == label_test_sp).sum(axis=1), axis=0) + r1s.append(r1) + r5s.append(r5) + r10s.append(r10) + + return {"acc": accs[0], "acc_splits": accs, "r1_splits": r1s, "r5_splits": r5s, "r10_splits": r10s} + + def print_computed_metrics(self, metrics): + for split, acc in enumerate(metrics["acc_splits"]): + print("Top 1 accuracy on split {}: {}; r1 {}; r5 {}; r10 {}".format( + split + 1, acc, + metrics["r1_splits"][split], + metrics["r5_splits"][split], + metrics["r10_splits"][split], + ) + ) diff --git a/examples/MMPT/mmpt/evaluators/predictor.py b/examples/MMPT/mmpt/evaluators/predictor.py new file mode 100644 index 0000000000..2ffef6ab47 --- /dev/null +++ b/examples/MMPT/mmpt/evaluators/predictor.py @@ -0,0 +1,595 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os +import random +import json +import numpy as np +import torch +import pickle +import math + +from tqdm import tqdm + + +class Predictor(object): + """this base class is used to save predictions to disk + (and being called by a evaluator later). + Predictor has minimum support of single gpu prediction. + """ + def __init__(self, config): + self.pred_dir = None # on-the-fly eval does not save the results. 
+ if hasattr(config, "eval") and config.eval is not None: + self.pred_dir = config.eval.save_path + os.makedirs(self.pred_dir, exist_ok=True) + + def __call__(self, outputs): + """extract the prediction and save it.""" + raise NotImplementedError + + def predict_loop(self, model, eval_dataloader, output_file=None): + """on-the-fly prediction on a single gpu.""" + self.full_scores = [] + model.eval() + model = model.to(0) + with torch.no_grad(): + for data in eval_dataloader: + data = self.to_ctx(data) + outputs = model(**data) + outputs.update(data) + self(outputs) + return self.finalize(output_file) + + def finalize(self, output_file): + pass + + def to_ctx(self, data, ctx=0, dtype=None): + if isinstance(data, dict): + for key in data: + if torch.is_tensor(data[key]): + if dtype is not None and data[key].dtype == torch.float32: + data[key] = data[key].to(dtype) + data[key] = data[key].to(ctx) + return data + else: + raise ValueError("non-dict type of batch is not supported yet.") + + +class NLGPredictor(Predictor): + """Predicting Text from MMFusion models.""" + """TODO: make a context.""" + def __init__(self, config): + super().__init__(config) + from transformers import AutoTokenizer + + self.tokenizer = AutoTokenizer.from_pretrained( + config.dataset.bert_name, + bos_token="[CLS]", eos_token="[SEP]") + self.bos_token_id = self.tokenizer.bos_token_id + self.eos_token_id = self.tokenizer.eos_token_id + + def predict_loop(self, model, eval_dataloader, output_file=None): + """TODO: refactor base classes.""" + ctx = 0 + outputs = {"outputs": [], "targets": [[]]} + model.eval() + model = model.to(ctx) + with torch.no_grad(): + for data in tqdm(eval_dataloader): + data = self.to_ctx(data, ctx) + self(data, model, outputs) + return self.finalize(outputs, output_file) + + def __call__(self, data, model, outputs): + data.update({ + "bos_token_id": self.bos_token_id, + "eos_token_id": self.eos_token_id + }) + + output = model.generate(**data) + assert len(output) == len(data["ref"]) + for idx, _output in enumerate(output): + generated_text = self.tokenizer.decode( + _output, skip_special_tokens=True) + if generated_text == "": + generated_text = "none" + outputs["outputs"].append(generated_text) + outputs["targets"][0].append(data["ref"][idx]) + if random.random() < 0.001: + print("_output", _output) + print("generated_text", generated_text) + print("ref", data["ref"][idx]) + + def finalize(self, outputs, output_file=None): + if output_file is not None: + with open(os.path.join( + self.pred_dir, output_file + ".json"), "w") as fw: + json.dump(outputs, fw, indent=4) + return outputs + + +class RetrievalPredictor(Predictor): + """generated `pooled_video` and `pooled_text`.""" + def __init__(self, config): + super().__init__(config) + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + config.dataset.bert_name) + + def predict_loop( + self, + model, + eval_dataloader, + output_file="retrieval.npy" + ): + """on-the-fly prediction on a single gpu.""" + full_scores = [] + texts = [] + model.eval() + model = model.cuda() + with torch.no_grad(): + for data in eval_dataloader: + # convert to dict. 
+ if not isinstance(data, dict): + data = { + "caps": data[0], + "cmasks": data[1], + "vfeats": data[2], + "vmasks": data[3], + "video_id": data[4] + } + data = self.to_ctx(data) + outputs = model(**data) + outputs.update(data) + self(outputs, full_scores) + for _cap in data["caps"]: + texts.append( + self.tokenizer.decode(_cap, skip_special_tokens=True) + ) + + return self.finalize(full_scores, texts, output_file) + + def __call__(self, sample, full_scores): + scores = self._get_pooled_outputs(sample) + self._append_scores(scores, full_scores) + + def finalize(self, full_scores, texts, output_file=None): + outputs = self._aggregate_scores(full_scores) + if output_file is not None: + np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs) + return {"outputs": outputs, "texts": texts} + + def _get_pooled_outputs(self, outputs): + if "pooled_video" in outputs: + return outputs["pooled_video"], outputs["pooled_text"] + else: + raise ValueError("unknown format of outputs.") + + def _append_scores(self, scores, full_scores): + assert len(scores) == 2 + if len(full_scores) == 0: + full_scores.append([]) + full_scores.append([]) + full_scores[0].append(scores[0].cpu().detach().numpy()) + full_scores[1].append(scores[1].cpu().detach().numpy()) + + def _aggregate_scores(self, scores): + assert len(scores) == 2 + video_hidden = np.concatenate(scores[0], axis=0) + text_hidden = np.concatenate(scores[1], axis=0) + # clear up. + self.full_scores = [] + return np.matmul(text_hidden, video_hidden.T) + + +class QAPredictor(Predictor): + """generated `pooled_video` and `pooled_text`.""" + def __init__(self, config): + super().__init__(config) + """predictor maintains scores and aggregate them.""" + + def predict_loop(self, model, eval_dataloader, output_file="qa.npy"): + """on-the-fly prediction on a single gpu.""" + self.full_scores = [] + model.eval() + model = model.cuda() + with torch.no_grad(): + for data in eval_dataloader: + # reshape ans and dup video 5 times. + v_len = data["vfeats"].size(1) + hidden_size = data["vfeats"].size(2) + data["vfeats"] = data["vfeats"].unsqueeze(1).repeat(1, 5, 1, 1).view(-1, v_len, hidden_size) + data["vmasks"] = data["vmasks"].unsqueeze(1).repeat(1, 5, 1).view(-1, v_len) + + t_len = data["caps"].size(-1) + data["caps"] = data["caps"].view(-1, t_len) + data["cmasks"] = data["cmasks"].view(-1, t_len) + + data = self.to_ctx(data) + outputs = model(**data) + outputs.update(data) + self(outputs) + return self.finalize(output_file) + + def __call__(self, sample): + hidden_size = sample["pooled_video"].size(-1) + pooled_video = sample["pooled_video"].view(-1, 5, hidden_size) + pooled_text = sample["pooled_text"].view(-1, 5, hidden_size) + scores = torch.bmm(pooled_video, pooled_text.transpose(2, 1)) + scores = scores.argmax(-1) + self._append_scores(scores[:, 0], sample["answers"], self.full_scores) + + def finalize(self, output_file=None): + outputs, targets = self._aggregate_scores(self.full_scores) + if output_file is not None: + np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs) + return {"outputs": outputs, "targets": targets} + + def _append_scores(self, scores, answers, full_scores): + if len(full_scores) == 0: + full_scores.append([]) + full_scores.append([]) + full_scores[0].append(scores.cpu().detach().numpy()) + full_scores[1].append(answers.cpu().detach().numpy()) + + def _aggregate_scores(self, scores): + assert len(scores) == 2 + outputs = np.concatenate(scores[0], axis=0) + targets = np.concatenate(scores[1], axis=0) + # clear up. 
+ self.full_scores = [] + return outputs, targets + + +class CrossTaskPredictor(Predictor): + """ + CrossTaskPredictor needs to compute the average of logits + for overlapped sliding-window. + """ + def __init__(self, config): + super().__init__(config) + self.lsm = torch.nn.LogSoftmax(dim=1) + self.max_video_len = config.dataset.max_video_len + self.sliding_window = config.dataset.sliding_window + self.sliding_window_size = config.dataset.sliding_window_size + self.annotation_path = config.dataset.annotation_path + + def predict_loop(self, model, eval_dataloader, output_file="result.pkl"): + """refactored from line 144: + https://github.com/DmZhukov/CrossTask/blob/master/train.py + """ + ctx = 0 + model.eval() + model = model.to(ctx) + # this is not a loss but just compute neg_log_prob. + Y_pred = {} + Y_true = {} + with torch.no_grad(): + for batch in eval_dataloader: + self(batch, model, Y_pred, Y_true) + return self.finalize(Y_pred, Y_true, output_file) + + def __call__(self, sample, model, Y_pred, Y_true): + # please install dp from `https://github.com/DmZhukov/CrossTask` + from dp import dp + vid, task = sample['video_id'][0], sample['task'][0] + sample = self.to_ctx(sample) + # compute the average logits over sliding windows. + output = model(**sample) + batch_logits = output["logits"].cpu() + + video_len = sample["video_len"][0] + + # the following version is slow. + logits = torch.zeros((video_len, batch_logits.size(1))) + logits_counts = torch.zeros((video_len, 1), dtype=torch.long) + # use the same loop as aligner to recover. + batch_logit_idx = 0 + for window_start in range(0, video_len, self.sliding_window): + video_end = min(video_len - window_start, self.sliding_window_size) + logits[window_start: window_start + video_end] += batch_logits[ + batch_logit_idx: batch_logit_idx + video_end] + batch_logit_idx += video_end + logits_counts[window_start: window_start + video_end] += torch.ones((video_end, 1), dtype=torch.long) + + if (video_len - window_start) <= self.sliding_window_size: + break + + logits /= logits_counts + assert logits.size() == (video_len, batch_logits.size(1)), "{}, {}".format(logits.size(), video_len) + + O = self.lsm(logits) + y = np.zeros(O.size(), dtype=np.float32) + dp(y, -O.detach().cpu().numpy()) + if task not in Y_pred: + Y_pred[task] = {} + Y_pred[task][vid] = y + annot_path = os.path.join( + self.annotation_path, task+'_'+vid+'.csv') + if os.path.exists(annot_path): + if task not in Y_true: + Y_true[task] = {} + Y_true[task][vid] = self._read_assignment( + *y.shape, annot_path) + + def finalize(self, Y_pred, Y_true, output_file=None): + if output_file is not None: + with open( + os.path.join(self.pred_dir, output_file + ".pkl"), + "wb") as fw: + pickle.dump( + {"Y_pred": Y_pred, "Y_true": Y_true}, fw, + protocol=pickle.HIGHEST_PROTOCOL) + return {"outputs": Y_pred, "targets": Y_true} + + def _read_assignment(self, T, K, path): + """ + refactored from https://github.com/DmZhukov/CrossTask/blob/master/data.py + Howto interpret contraints on loss that is going to be minimized: + lambd is a big number; + self.lambd * C is a big number for all valid position (csv stores invalids) + + def forward(self, O, Y, C): + return (Y*(self.lambd * C - self.lsm(O))).mean(dim=0).sum() + + This will load the csv file and fill-in the step col from start to end rows. 
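+        Example row (illustrative values): "3,12.5,17.0" is parsed as step 3
+        spanning rows floor(12.5)=12 to ceil(17.0)=17, i.e. Y[12:17, 2] = 1
+        (the csv step id is 1-indexed, columns here are 0-indexed).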
+ """ + + Y = np.zeros([T, K], dtype=np.uint8) + with open(path, 'r') as f: + for line in f: + step, start, end = line.strip().split(',') + start = int(math.floor(float(start))) + end = int(math.ceil(float(end))) + step = int(step) - 1 + Y[start:end, step] = 1 + return Y + + +class COINPredictor(Predictor): + """ + COINPredictor is similar to CrossTask on sliding windows. + """ + def __init__(self, config): + super().__init__(config) + self.max_video_len = config.dataset.max_video_len + self.sliding_window = config.dataset.sliding_window + self.sliding_window_size = config.dataset.sliding_window_size + + def predict_loop(self, model, eval_dataloader, output_file="result.pkl"): + """refactored from line 144: + https://github.com/DmZhukov/CrossTask/blob/master/train.py + """ + ctx = 0 + model.eval() + model = model.to(ctx) + # this is not a loss but just compute neg_log_prob. + Y_pred = [] + Y_true = [] + with torch.no_grad(): + for batch in eval_dataloader: + self(batch, model, Y_pred, Y_true) + return self.finalize(Y_pred, Y_true, output_file) + + def __call__(self, sample, model, Y_pred, Y_true): + sample = self.to_ctx(sample) + # compute the average logits over sliding windows. + output = model(**sample) + logits = self._merge_windows(sample, output) + Y_pred.append(logits.argmax(dim=1)) + Y_true.append(sample["video_targets"].squeeze(0).cpu()) + + def _merge_windows(self, sample, output): + targets = sample["targets"].reshape(-1).cpu() + valid_mask = targets != -100 + targets = targets[valid_mask] + batch_logits = output["logits"].cpu() + batch_logits = batch_logits.reshape(-1, batch_logits.size(-1)) + batch_logits = batch_logits[valid_mask] + + video_len = sample["video_len"][0] + + # the following version is slow. + logits = torch.zeros((video_len, batch_logits.size(1))) + logits_counts = torch.zeros((video_len, 1), dtype=torch.long) + # use the same loop as aligner to recover. + batch_logit_idx = 0 + for window_start in range(0, video_len, self.sliding_window): + video_end = min(video_len - window_start, self.sliding_window_size) + logits[window_start: window_start + video_end] += batch_logits[ + batch_logit_idx: batch_logit_idx + video_end] + batch_logit_idx += video_end + logits_counts[window_start: window_start + video_end] += torch.ones((video_end, 1), dtype=torch.long) + if (video_len - window_start) <= self.sliding_window_size: + break + logits /= logits_counts + assert logits.size() == (video_len, batch_logits.size(1)), "{}, {}".format(logits.size(), video_len) + return logits + + def finalize(self, Y_pred, Y_true, output_file=None): + Y_pred = torch.cat(Y_pred, dim=0).numpy() + Y_true = torch.cat(Y_true, dim=0).numpy() + assert len(Y_pred) == len(Y_true) + + error_mask = Y_pred != Y_true + print("sample error", Y_pred[error_mask][:10], Y_true[error_mask][:10]) + print("sample error", Y_pred[error_mask][10:20], Y_true[error_mask][10:20]) + + if output_file is not None: + with open( + os.path.join(self.pred_dir, output_file + ".pkl"), + "wb") as fw: + pickle.dump( + {"Y_pred": Y_pred, "Y_true": Y_true}, fw, + protocol=pickle.HIGHEST_PROTOCOL) + return {"outputs": Y_pred, "targets": Y_true} + + +class COINZSPredictor(COINPredictor): + """ + COINZSPredictor for COIN zero-shot prediction. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.dataset_config = config.dataset + + def predict_loop(self, model, eval_dataloader, output_file="result.pkl"): + """refactored from line 144: + https://github.com/DmZhukov/CrossTask/blob/master/train.py + """ + ctx = 0 + model.eval() + model = model.to(ctx) + + with torch.no_grad(): + outputs = eval_dataloader.dataset.meta_processor.meta_text_labels( + self.dataset_config) + outputs = self.to_ctx(outputs, ctx) + label_hidden_states = model.forward_text(**outputs).cpu() + label_sim = label_hidden_states @ label_hidden_states.t() + num_labels = label_sim.size(0) + eye_mask = ~torch.eye(num_labels, dtype=torch.bool) + label_sim = label_sim.masked_select(eye_mask).view(num_labels, num_labels - 1) + lbd = label_sim.max() + + # this is not a loss but just compute neg_log_prob. + Y_pred = [] + Y_true = [] + with torch.no_grad(): + for batch in eval_dataloader: + self(batch, label_hidden_states, model, lbd, Y_pred, Y_true) + return self.finalize(Y_pred, Y_true, output_file) + + def reshape_subsample(self, sample): + for key in sample: + if torch.is_tensor(sample[key]): + sample[key] = self.flat_subsample(sample[key]) + return sample + + def flat_subsample(self, tensor): + if len(tensor.size()) > 1 and tensor.size(0) == 1: + tensor = tensor.squeeze(0) + return tensor + + def __call__(self, sample, label_hidden_states, model, lbd, Y_pred, Y_true): + sample = self.reshape_subsample(sample) + sample = self.to_ctx(sample) + # compute the average logits over sliding windows. + sample["output_hidden_states"] = True + video_outputs = model.forward_video(**sample).cpu() + output = {"logits": video_outputs[:, 1:sample["vmasks"].size(1)+1] @ label_hidden_states.t()} + logits = self._merge_windows(sample, output) + # logic of zero-shot for sequence labeling. + logits_argmax = logits.argmax(dim=1) + 1 # 0 is "O" label. + logits_max = logits.max(dim=1)[0] + + pred = torch.zeros_like(logits_argmax) + label_select = logits_max > lbd # 73 or 74 + pred[label_select] = logits_argmax[label_select] + + Y_pred.append(pred) + Y_true.append(sample["video_targets"].squeeze(0).cpu()) + + def finalize(self, Y_pred, Y_true, output_file=None): + Y_pred = torch.cat(Y_pred, dim=0).numpy() + Y_true = torch.cat(Y_true, dim=0).numpy() + assert len(Y_pred) == len(Y_true) + + error_mask = Y_pred != Y_true + print("sample error", Y_pred[error_mask][:10], Y_true[error_mask][:10]) + print("sample error", Y_pred[error_mask][10:20], Y_true[error_mask][10:20]) + + if output_file is not None: + with open( + os.path.join(self.pred_dir, output_file + ".pkl"), + "wb") as fw: + pickle.dump( + {"Y_pred": Y_pred, "Y_true": Y_true}, fw, + protocol=pickle.HIGHEST_PROTOCOL) + return {"outputs": Y_pred, "targets": Y_true} + + +class DiDeMoPredictor(Predictor): + """reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/data_processing.py + """ + def __init__(self, config): + super().__init__(config) + # load targets. + with open(config.dataset.test_path) as data_file: + self.test_data = json.load(data_file) + + def predict_loop(self, model, eval_dataloader, output_file="didemo.npy"): + """ + TODO: two solutions here. + """ + import itertools + # 21 chunks. + self.possible_segments = [(0,0), (1,1), (2,2), (3,3), (4,4), (5,5)] + for i in itertools.combinations(range(6), 2): + self.possible_segments.append(i) + # pick segments from a video. 
+ + """on-the-fly prediction on a single gpu.""" + self.full_scores = [] + model.eval() + model = model.cuda() + with torch.no_grad(): + for data in eval_dataloader: + # TODO special forwarding logic here. + data = self.to_ctx(data) + data["output_hidden_states"] = True + hidden_video = model.forward_video(**data) + data["output_hidden_states"] = False + pooled_text = model.forward_text(**data) + outputs = { + "hidden_video": hidden_video, + "pooled_text": pooled_text + } + outputs.update(data) + self(outputs) + return self.finalize(output_file) + + def __call__(self, sample): + # TODO: make an index select from self.possible_segments. + hidden_video = sample["hidden_video"] + pooled_text = sample["pooled_text"] + vmasks = sample["vmasks"] + # probably maintain valid results here. + + hidden_video = hidden_video[:, 1:-1, :] + # probably maintain valid results here. + pooled_video = [] + for s, e in self.possible_segments: + pooled_video.append( + torch.mean( + hidden_video[:, int(s*5):int((e+1)*5), :], + dim=1, keepdim=True) + ) + pooled_video = torch.cat(pooled_video, dim=1) + scores = torch.bmm( + pooled_video, pooled_text.unsqueeze(-1)).squeeze(-1).cpu() + + ranks = scores.argsort(dim=-1, descending=True) + + for batch_idx, rank in enumerate(ranks): + rank_of_moment = [] + for m_idx, moment in enumerate(rank): + s, e = self.possible_segments[moment.item()] + if torch.any( + vmasks[batch_idx, int(s*5):int((e+1)*5)] + ): + rank_of_moment.append((s, e)) + self.full_scores.append(rank_of_moment) + + def finalize(self, output_file=None): + outputs = self._aggregate_scores(self.full_scores) + if output_file is not None: + np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs) + return {"outputs": outputs, "targets": self.test_data} + + def _aggregate_scores(self, scores): + self.full_scores = [] + return scores diff --git a/examples/MMPT/mmpt/losses/__init__.py b/examples/MMPT/mmpt/losses/__init__.py new file mode 100644 index 0000000000..8dc32c96d2 --- /dev/null +++ b/examples/MMPT/mmpt/losses/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .loss import * +from .nce import * + +try: + from .fairseqmmloss import * +except ImportError: + pass + +try: + from .expnce import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/losses/fairseqmmloss.py b/examples/MMPT/mmpt/losses/fairseqmmloss.py new file mode 100644 index 0000000000..a95e5ecf45 --- /dev/null +++ b/examples/MMPT/mmpt/losses/fairseqmmloss.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +TODO (huxu): a general fairseq criterion for all your pre-defined losses. +""" + +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.logging import metrics + + +@register_criterion("mmloss") +class MMCriterion(FairseqCriterion): + def __init__(self, task): + super().__init__(task) + # TODO (huxu): wrap forward call of loss_fn and eval_fn into task. + self.mmtask = task.mmtask + + def forward(self, model, sample): + """Compute the loss for the given sample. 
+ Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + outputs = self.mmtask(model, sample) + + loss, loss_scalar, max_len, batch_size, sample_size = ( + outputs["loss"], + outputs["loss_scalar"], + outputs["max_len"], + outputs["batch_size"], + outputs["sample_size"], + ) + + logging_output = { + "loss": loss_scalar, + "ntokens": max_len * batch_size, # dummy report. + "nsentences": batch_size, # dummy report. + "sample_size": sample_size, + } + + return loss, 1, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + """since we use NCE, our actual batch_size is 1 per GPU. + Then we take the mean of each worker.""" + loss_sum = sum(log.get("loss", 0.0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + metrics.log_scalar("loss", loss_sum / sample_size, round=3) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/examples/MMPT/mmpt/losses/loss.py b/examples/MMPT/mmpt/losses/loss.py new file mode 100644 index 0000000000..99c05d067e --- /dev/null +++ b/examples/MMPT/mmpt/losses/loss.py @@ -0,0 +1,87 @@ +# Copyright (c) Facebook, Inc. All Rights Reserved + +import torch + +from torch import nn + + +class Loss(object): + def __call__(self, *args, **kwargs): + raise NotImplementedError + + +# Dummy Loss for testing. +class DummyLoss(Loss): + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__(self, logits, targets, **kwargs): + return self.loss(logits, targets) + + +class DummyK400Loss(Loss): + """dummy k400 loss for MViT.""" + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__(self, logits, targets, **kwargs): + return self.loss( + logits, torch.randint(0, 400, (logits.size(0),), device=logits.device)) + + +class CrossEntropy(Loss): + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__(self, logits, targets, **kwargs): + return self.loss(logits.reshape(-1, logits.size(-1)), targets.reshape(-1)) + + +class ArgmaxCrossEntropy(Loss): + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__(self, logits, targets, **kwargs): + return self.loss(logits, targets.argmax(dim=1)) + + +class BCE(Loss): + def __init__(self): + self.loss = nn.BCEWithLogitsLoss() + + def __call__(self, logits, targets, **kwargs): + targets = targets.squeeze(0) + return self.loss(logits, targets) + + +class NLGLoss(Loss): + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__(self, logits, text_label, **kwargs): + targets = text_label[text_label != -100] + return self.loss(logits, targets) + + +class MSE(Loss): + def __init__(self): + self.loss = nn.MSELoss() + + def __call__(self, logits, targets, **kwargs): + return self.loss(logits, targets) + + +class L1(Loss): + def __init__(self): + self.loss = nn.L1Loss() + + def __call__(self, logits, targets, **kwargs): + return self.loss(logits, targets) + + +class SmoothL1(Loss): + def __init__(self): + self.loss = nn.SmoothL1Loss() + + def __call__(self, logits, targets, **kwargs): + return self.loss(logits, targets) diff --git a/examples/MMPT/mmpt/losses/nce.py 
b/examples/MMPT/mmpt/losses/nce.py new file mode 100644 index 0000000000..ed7be8d372 --- /dev/null +++ b/examples/MMPT/mmpt/losses/nce.py @@ -0,0 +1,156 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +softmax-based NCE loss, used by this project. +""" + +import torch + +from torch import nn + +from .loss import Loss + + +class NCE(Loss): + def __init__(self): + # TODO (huxu): define temperature. + self.loss = nn.CrossEntropyLoss() + + def __call__(self, align_scores, **kargs): + # note: we reuse the same shape as cls head in BERT (batch_size, 2) + # but NCE only needs one logits. + # (so we drop all weights in the second neg logits.) + align_scores = align_scores[:, :1] + # duplicate negative examples + batch_size = align_scores.size(0) // 2 + pos_scores = align_scores[:batch_size] + neg_scores = align_scores[batch_size:].view(1, batch_size).repeat( + batch_size, 1) + scores = torch.cat([pos_scores, neg_scores], dim=1) + return self.loss( + scores, + torch.zeros( + (batch_size,), + dtype=torch.long, + device=align_scores.device), + ) + + +class T2VContraLoss(Loss): + """NCE for MM joint space, on softmax text2video matrix. + """ + def __init__(self): + # TODO (huxu): define temperature. + self.loss = nn.CrossEntropyLoss() + + def __call__(self, pooled_video, pooled_text, **kargs): + batch_size = pooled_video.size(0) + logits = torch.mm(pooled_text, pooled_video.transpose(1, 0)) + targets = torch.arange( + batch_size, + dtype=torch.long, + device=pooled_video.device) + return self.loss(logits, targets) + + +class V2TContraLoss(Loss): + """NCE for MM joint space, with softmax on video2text matrix.""" + + def __init__(self): + # TODO (huxu): define temperature. + self.loss = nn.CrossEntropyLoss() + + def __call__(self, pooled_video, pooled_text, **kargs): + batch_size = pooled_video.size(0) + logits = torch.mm(pooled_video, pooled_text.transpose(1, 0)) + targets = torch.arange( + batch_size, + dtype=torch.long, + device=pooled_video.device) + return self.loss(logits, targets) + + +class MMContraLoss(Loss): + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__(self, pooled_video, pooled_text, **kwargs): + logits_per_video = pooled_video @ pooled_text.t() + logits_per_text = pooled_text @ pooled_video.t() + + targets = torch.arange( + pooled_video.size(0), + dtype=torch.long, + device=pooled_video.device) + loss_video = self.loss(logits_per_video, targets) + loss_text = self.loss(logits_per_text, targets) + return loss_video + loss_text + + +class MTM(Loss): + """Combination of MFM and MLM.""" + + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__( + self, + video_logits, + text_logits, + video_label, + text_label, + **kwargs + ): + text_logits = torch.cat([ + text_logits, + torch.zeros( + (text_logits.size(0), 1), device=text_logits.device) + ], dim=1) + vt_logits = torch.cat([video_logits, text_logits], dim=0) + # loss for video. + video_label = torch.zeros( + (video_logits.size(0),), + dtype=torch.long, + device=video_logits.device + ) + + # loss for text. 
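+        # -100 is the ignore index: tokens that were not masked keep label
+        # -100 and are filtered out before computing the LM part of the loss.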
+ text_label = text_label.reshape(-1) + labels_mask = text_label != -100 + selected_text_label = text_label[labels_mask] + + vt_label = torch.cat([video_label, selected_text_label], dim=0) + return self.loss(vt_logits, vt_label) + + +class MFMMLM(Loss): + """Combination of MFM and MLM.""" + + def __init__(self): + self.loss = nn.CrossEntropyLoss() + + def __call__( + self, + video_logits, + text_logits, + video_label, + text_label, + **kwargs + ): + # loss for video. + video_label = torch.zeros( + (video_logits.size(0),), + dtype=torch.long, + device=video_logits.device + ) + masked_frame_loss = self.loss(video_logits, video_label) + + # loss for text. + text_label = text_label.reshape(-1) + labels_mask = text_label != -100 + selected_text_label = text_label[labels_mask] + masked_lm_loss = self.loss(text_logits, selected_text_label) + return masked_frame_loss + masked_lm_loss diff --git a/examples/MMPT/mmpt/models/__init__.py b/examples/MMPT/mmpt/models/__init__.py new file mode 100644 index 0000000000..825250cd00 --- /dev/null +++ b/examples/MMPT/mmpt/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .mmfusion import * +from .transformermodel import * +from .mmfusionnlg import * + +try: + from .fairseqmmmodel import * +except ImportError: + pass + +try: + from .expmmfusion import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/models/fairseqmmmodel.py b/examples/MMPT/mmpt/models/fairseqmmmodel.py new file mode 100644 index 0000000000..b7dd643693 --- /dev/null +++ b/examples/MMPT/mmpt/models/fairseqmmmodel.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.models import ( + BaseFairseqModel, + register_model, + register_model_architecture +) + + +@register_model("mmmodel") +class FairseqMMModel(BaseFairseqModel): + """a fairseq wrapper of model built by `task`.""" + + @classmethod + def build_model(cls, args, task): + return FairseqMMModel(task.mmtask.model) + + def __init__(self, mmmodel): + super().__init__() + self.mmmodel = mmmodel + + def forward(self, *args, **kwargs): + return self.mmmodel(*args, **kwargs) + + def upgrade_state_dict_named(self, state_dict, name): + + super().upgrade_state_dict_named(state_dict, name) + + keys_to_delete = [] + + for key in state_dict: + if key not in self.state_dict(): + keys_to_delete.append(key) + for key in keys_to_delete: + print("[INFO]", key, "not used anymore.") + del state_dict[key] + + # copy any newly defined parameters. + for key in self.state_dict(): + if key not in state_dict: + print("[INFO] adding", key) + state_dict[key] = self.state_dict()[key] + + +# a dummy arch, we config the model. +@register_model_architecture("mmmodel", "mmarch") +def mmarch(args): + pass diff --git a/examples/MMPT/mmpt/models/mmfusion.py b/examples/MMPT/mmpt/models/mmfusion.py new file mode 100644 index 0000000000..2509e26b67 --- /dev/null +++ b/examples/MMPT/mmpt/models/mmfusion.py @@ -0,0 +1,926 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright (c) Facebook, Inc. All Rights Reserved + + +import torch + +from torch import nn + +try: + from transformers import AutoConfig, AutoTokenizer +except ImportError: + pass + +from . import transformermodel + + +class MMPTModel(nn.Module): + """An e2e wrapper of inference model. + """ + @classmethod + def from_pretrained(cls, config, checkpoint="checkpoint_best.pt"): + import os + from ..utils import recursive_config + from ..tasks import Task + config = recursive_config(config) + mmtask = Task.config_task(config) + checkpoint_path = os.path.join(config.eval.save_path, checkpoint) + mmtask.build_model(checkpoint=checkpoint_path) + # TODO(huxu): make the video encoder configurable. + from ..processors.models.s3dg import S3D + video_encoder = S3D('pretrained_models/s3d_dict.npy', 512) + video_encoder.load_state_dict( + torch.load('pretrained_models/s3d_howto100m.pth')) + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + config.dataset.bert_name, use_fast=config.dataset.use_fast + ) + from ..processors import Aligner + aligner = Aligner(config.dataset) + return ( + MMPTModel(config, mmtask.model, video_encoder), + tokenizer, + aligner + ) + + def __init__(self, config, model, video_encoder, **kwargs): + super().__init__() + self.max_video_len = config.dataset.max_video_len + self.video_encoder = video_encoder + self.model = model + + def forward(self, video_frames, caps, cmasks, return_score=False): + bsz = video_frames.size(0) + assert bsz == 1, "only bsz=1 is supported now." + seq_len = video_frames.size(1) + video_frames = video_frames.view(-1, *video_frames.size()[2:]) + vfeats = self.video_encoder(video_frames.permute(0, 4, 1, 2, 3)) + vfeats = vfeats['video_embedding'] + vfeats = vfeats.view(bsz, seq_len, vfeats.size(-1)) + padding = torch.zeros( + bsz, self.max_video_len - seq_len, vfeats.size(-1)) + vfeats = torch.cat([vfeats, padding], dim=1) + vmasks = torch.cat([ + torch.ones((bsz, seq_len), dtype=torch.bool), + torch.zeros((bsz, self.max_video_len - seq_len), dtype=torch.bool) + ], + dim=1 + ) + output = self.model(caps, cmasks, vfeats, vmasks) + if return_score: + output = {"score": torch.bmm( + output["pooled_video"][:, None, :], + output["pooled_text"][:, :, None] + ).squeeze(-1).squeeze(-1)} + return output + + +class MMFusion(nn.Module): + """a MMPT wrapper class for MMBert style models. + TODO: move isolated mask to a subclass. + """ + def __init__(self, config, **kwargs): + super().__init__() + transformer_config = AutoConfig.from_pretrained( + config.dataset.bert_name) + self.hidden_size = transformer_config.hidden_size + self.is_train = False + if config.dataset.train_path is not None: + self.is_train = True + # 0 means no iso; 1-12 means iso up to that layer. 
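+        # e.g. (illustrative): num_iso_layer=6 on a 12-layer encoder keeps
+        # layers 0-5 modality-isolated (text attends only to text, video only
+        # to video) and layers 6-11 fully joint; see _mm_attention_mask().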
+ self.num_hidden_layers = transformer_config.num_hidden_layers + self.last_iso_layer = 0 + if config.dataset.num_iso_layer is not None: + self.last_iso_layer = config.dataset.num_iso_layer - 1 + 1 + + if config.model.mm_encoder_cls is not None: + mm_encoder_cls = getattr(transformermodel, config.model.mm_encoder_cls) + model_config = AutoConfig.from_pretrained(config.dataset.bert_name) + model_config.max_video_len = config.dataset.max_video_len + # TODO: a general way to add parameter for a model. + model_config.use_seg_emb = config.model.use_seg_emb + self.mm_encoder = mm_encoder_cls.from_pretrained( + config.dataset.bert_name, config=model_config) + elif config.model.video_encoder_cls is not None\ + and config.model.text_encoder_cls is not None: + video_encoder_cls = getattr(transformermodel, config.model.video_encoder_cls) + model_config = AutoConfig.from_pretrained(config.dataset.bert_name) + model_config.max_video_len = config.dataset.max_video_len + # TODO: make each model a set of config class. + if hasattr(model_config, "num_layers"): + model_config.num_layers = config.model.num_hidden_video_layers + else: + model_config.num_hidden_layers = config.model.num_hidden_video_layers + self.video_encoder = video_encoder_cls.from_pretrained( + config.dataset.bert_name, config=model_config) + # exact same NLP model from Huggingface. + text_encoder_cls = getattr(transformermodel, config.model.text_encoder_cls) + self.text_encoder = text_encoder_cls.from_pretrained( + config.dataset.bert_name) + else: + raise ValueError("the encoder must be either MM or two backbones.") + + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + **kwargs + ): + raise NotImplementedError( + "Please derive MMFusion module." + ) + + def _mm_on_the_fly( + self, + cmasks, + vmasks, + attention_mask + ): + """helper function for mask, seg_ids and token_type_ids.""" + if attention_mask is None: + attention_mask = self._mm_attention_mask(cmasks, vmasks) + + """ + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + """ + token_type_ids = torch.cat( + [ + torch.zeros( + (vmasks.size(0), vmasks.size(1) + 2), + dtype=torch.long, + device=vmasks.device, + ), + torch.ones( + (cmasks.size(0), cmasks.size(1) - 2), + dtype=torch.long, + device=cmasks.device, + ), + ], + dim=1, + ) + return attention_mask, token_type_ids + + def _mm_attention_mask(self, cmasks, vmasks): + assert cmasks.size(0) == vmasks.size(0), "{}, {}, {}, {}".format( + str(cmasks.size()), + str(vmasks.size()), + str(cmasks.size(0)), + str(vmasks.size(0)), + ) + + mm_mask = torch.cat([cmasks[:, :1], vmasks, cmasks[:, 1:]], dim=1) + if self.last_iso_layer == 0: + # hard attention mask. + return mm_mask + else: + # a gpu iso mask; 0 : num_iso_layer is isolated; + # num_iso_layer: are MM-fused. + # make an iso layer + batch_size = cmasks.size(0) + iso_mask = self._make_iso_mask(batch_size, cmasks, vmasks) + mm_mask = mm_mask[:, None, :].repeat(1, mm_mask.size(-1), 1) + iso_mm_masks = [] + # hard attention mask. 
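+            # stack one (seq, seq) mask per transformer layer: the first
+            # `last_iso_layer` slices carry the modality-isolated mask, the
+            # remaining layers the fully joint mask, giving a
+            # (batch, num_hidden_layers, seq, seq) tensor.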
+ iso_mask = iso_mask[:, None, :, :].repeat( + 1, self.last_iso_layer, 1, 1) + iso_mm_masks.append(iso_mask) + if self.last_iso_layer < self.num_hidden_layers: + mm_mask = mm_mask[:, None, :, :].repeat( + 1, self.num_hidden_layers - self.last_iso_layer, 1, 1 + ) + iso_mm_masks.append(mm_mask) + iso_mm_masks = torch.cat(iso_mm_masks, dim=1) + return iso_mm_masks + + def _make_iso_mask(self, batch_size, cmasks, vmasks): + cls_self_mask = torch.cat( + [ + torch.ones( + (batch_size, 1), dtype=torch.bool, device=cmasks.device), + torch.zeros( + (batch_size, cmasks.size(1) + vmasks.size(1) - 1), + dtype=torch.bool, device=cmasks.device) + ], dim=1) + + iso_video_mask = torch.cat( + [ + # [CLS] is not used. + torch.zeros( + (batch_size, 1), dtype=torch.bool, device=cmasks.device + ), + vmasks, + # assume to be 1. + cmasks[:, 1:2], + # 2 means [CLS] + [SEP] + torch.zeros( + (batch_size, cmasks.size(1) - 2), + dtype=torch.bool, + device=cmasks.device, + ), + ], + dim=1, + ) + iso_text_mask = torch.cat( + [ + torch.zeros( + (batch_size, 2 + vmasks.size(1)), + dtype=torch.bool, + device=cmasks.device, + ), # [CLS] is not used. + cmasks[:, 2:], # assume to be 1. + ], + dim=1, + ) + cls_self_mask = cls_self_mask[:, None, :] + iso_video_mask = iso_video_mask[:, None, :].repeat( + 1, vmasks.size(1) + 1, 1) + iso_text_mask = iso_text_mask[:, None, :].repeat( + 1, cmasks.size(1) - 2, 1) + return torch.cat([cls_self_mask, iso_video_mask, iso_text_mask], dim=1) + + def _pooling_vt_layer( + self, + layered_sequence_output, + cmasks, + vmasks + ): + layer_idx = self.last_iso_layer \ + if self.last_iso_layer > 0 else self.num_hidden_layers + hidden_state = layered_sequence_output[layer_idx] + # also output pooled_video and pooled_text. + batch_size = cmasks.size(0) + # pool the modality. + text_offset = vmasks.size(1) + 2 # [CLS] + [SEP] + # video tokens + [SEP] + video_outputs = hidden_state[:, 1:text_offset] + video_attention_mask = torch.cat( + [ + vmasks, + torch.ones( + (batch_size, 1), dtype=torch.bool, device=vmasks.device), + ], + dim=1, + ) + assert video_outputs.size(1) == video_attention_mask.size(1) + pooled_video = torch.sum( + video_outputs * video_attention_mask.unsqueeze(-1), dim=1 + ) / video_attention_mask.sum(1, keepdim=True) + # pooled_video = torch.mean(video_outputs[0], dim=1) + + # text tokens + [SEP] + text_attention_mask = cmasks[:, 2:] + text_outputs = hidden_state[:, text_offset:] + assert text_outputs.size(1) == text_attention_mask.size(1) + pooled_text = torch.sum( + text_outputs * text_attention_mask.unsqueeze(-1), dim=1 + ) / text_attention_mask.sum(1, keepdim=True) + return pooled_video, pooled_text + + +class MMFusionMFMMLM(MMFusion): + """forward function for MFM and MLM.""" + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask=None, + video_label=None, + text_label=None, + **kwargs + ): + output_hidden_states = False if self.is_train else True + + target_vfeats, non_masked_frame_mask = None, None + if video_label is not None: + target_vfeats = vfeats.masked_select( + video_label.unsqueeze(-1)).view( + -1, vfeats.size(-1) + ) + # mask video token. 
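+            # masked positions are zeroed in the input while their original
+            # features are kept in target_vfeats as the MFM targets;
+            # non_masked_frame_mask records which frames stayed visible.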
+ vfeats[video_label] = 0.0 + non_masked_frame_mask = vmasks.clone() + non_masked_frame_mask[video_label] = False + + attention_mask, token_type_ids = self._mm_on_the_fly( + cmasks, vmasks, attention_mask) + + outputs = self.mm_encoder( + input_ids=caps, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + masked_frame_labels=video_label, + target_video_hidden_states=target_vfeats, + non_masked_frame_mask=non_masked_frame_mask, + masked_lm_labels=text_label, + output_hidden_states=output_hidden_states, + ) + + video_logits, text_logits = outputs[0], outputs[1] + + if self.is_train: # return earlier for training. + return { + "video_logits": video_logits, + "text_logits": text_logits, + } + + pooled_video, pooled_text = self._pooling_vt_layer( + outputs[2], cmasks, vmasks) + return {"pooled_video": pooled_video, "pooled_text": pooled_text} + + +class MMFusionMTM(MMFusionMFMMLM): + def __init__(self, config, **kwargs): + super().__init__(config) + """ + For reproducibility: + self.mm_encoder will be initialized then discarded. + """ + from .transformermodel import MMBertForMTM + model_config = AutoConfig.from_pretrained(config.dataset.bert_name) + model_config.max_video_len = config.dataset.max_video_len + model_config.use_seg_emb = config.model.use_seg_emb + self.mm_encoder = MMBertForMTM.from_pretrained( + config.dataset.bert_name, config=model_config) + + +class MMFusionShare(MMFusion): + """A retrival wrapper using mm_encoder as both video/text backbone. + TODO: move formally. + """ + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask=None, + video_label=None, + text_label=None, + output_hidden_states=False, + **kwargs + ): + pooled_video = self.forward_video( + vfeats, + vmasks, + caps, + cmasks, + output_hidden_states + ) + + pooled_text = self.forward_text( + caps, + cmasks, + output_hidden_states + ) + + return {"pooled_video": pooled_video, "pooled_text": pooled_text} + + def forward_video( + self, + vfeats, + vmasks, + caps, + cmasks, + output_hidden_states=False, + **kwargs + ): + input_ids = caps[:, :2] + + attention_mask = torch.cat([ + cmasks[:, :1], + vmasks, + cmasks[:, 1:2] + ], dim=1) + + token_type_ids = torch.zeros( + (vmasks.size(0), vmasks.size(1) + 2), + dtype=torch.long, + device=vmasks.device) + + outputs = self.mm_encoder( + input_ids=input_ids, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True + ) + video_outputs = outputs[0] + + if output_hidden_states: + return video_outputs + + batch_size = cmasks.size(0) + + video_attention_mask = torch.cat( + [ + torch.zeros( + (batch_size, 1), dtype=torch.bool, device=vmasks.device), + vmasks, + torch.ones( + (batch_size, 1), dtype=torch.bool, device=vmasks.device), + ], + dim=1, + ) + assert video_outputs.size(1) == video_attention_mask.size(1) + + video_attention_mask = video_attention_mask.type(video_outputs.dtype) \ + / video_attention_mask.sum(1, keepdim=True) + + pooled_video = torch.bmm( + video_outputs.transpose(2, 1), + video_attention_mask.unsqueeze(2) + ).squeeze(-1) + return pooled_video # video_outputs + + def forward_text( + self, + caps, + cmasks, + output_hidden_states=False, + **kwargs + ): + input_ids = torch.cat([ + caps[:, :1], caps[:, 2:], + ], dim=1) + + attention_mask = torch.cat([ + cmasks[:, :1], + cmasks[:, 2:] + ], dim=1) + + token_type_ids = torch.cat([ + torch.zeros( + (cmasks.size(0), 1), + dtype=torch.long, + device=cmasks.device), + torch.ones( + 
(cmasks.size(0), cmasks.size(1) - 2), + dtype=torch.long, + device=cmasks.device) + ], dim=1) + + outputs = self.mm_encoder( + input_ids=input_ids, + input_video_embeds=None, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True + ) + text_outputs = outputs[0] + + if output_hidden_states: + return text_outputs + + batch_size = caps.size(0) + # text tokens + [SEP] + text_attention_mask = torch.cat([ + torch.zeros( + (batch_size, 1), dtype=torch.bool, device=cmasks.device), + cmasks[:, 2:] + ], dim=1) + + assert text_outputs.size(1) == text_attention_mask.size(1) + + text_attention_mask = text_attention_mask.type(text_outputs.dtype) \ + / text_attention_mask.sum(1, keepdim=True) + + pooled_text = torch.bmm( + text_outputs.transpose(2, 1), + text_attention_mask.unsqueeze(2) + ).squeeze(-1) + return pooled_text # text_outputs + + +class MMFusionSeparate(MMFusionShare): + def forward_video( + self, + vfeats, + vmasks, + caps, + cmasks, + output_hidden_states=False, + **kwargs + ): + input_ids = caps[:, :2] + + attention_mask = torch.cat([ + cmasks[:, :1], + vmasks, + cmasks[:, 1:2] + ], dim=1) + + token_type_ids = torch.zeros( + (vmasks.size(0), vmasks.size(1) + 2), + dtype=torch.long, + device=vmasks.device) + + outputs = self.video_encoder( + input_ids=input_ids, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True + ) + video_outputs = outputs[0] + + if output_hidden_states: + return video_outputs + + batch_size = cmasks.size(0) + + video_attention_mask = torch.cat( + [ + torch.zeros( + (batch_size, 1), dtype=torch.bool, device=vmasks.device), + vmasks, + torch.ones( + (batch_size, 1), dtype=torch.bool, device=vmasks.device), + ], + dim=1, + ) + assert video_outputs.size(1) == video_attention_mask.size(1) + + video_attention_mask = video_attention_mask.type(video_outputs.dtype) \ + / video_attention_mask.sum(1, keepdim=True) + + pooled_video = torch.bmm( + video_outputs.transpose(2, 1), + video_attention_mask.unsqueeze(2) + ).squeeze(-1) + return pooled_video # video_outputs + + def forward_text( + self, + caps, + cmasks, + output_hidden_states=False, + **kwargs + ): + input_ids = torch.cat([ + caps[:, :1], caps[:, 2:], + ], dim=1) + + attention_mask = torch.cat([ + cmasks[:, :1], + cmasks[:, 2:] + ], dim=1) + # different from sharing, we use all-0 type. 
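+        # the separate text encoder is a stock Huggingface BERT fed a single
+        # text segment, so every position gets segment id 0; the shared
+        # encoder instead marks the text span as segment 1 to tell it apart
+        # from the video segment.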
+ token_type_ids = torch.zeros( + (cmasks.size(0), cmasks.size(1) - 1), + dtype=torch.long, + device=cmasks.device) + + outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=True + ) + text_outputs = outputs[0] + + if output_hidden_states: + return text_outputs + + batch_size = caps.size(0) + # text tokens + [SEP] + text_attention_mask = torch.cat([ + torch.zeros( + (batch_size, 1), dtype=torch.bool, device=cmasks.device), + cmasks[:, 2:] + ], dim=1) + + assert text_outputs.size(1) == text_attention_mask.size(1) + + text_attention_mask = text_attention_mask.type(text_outputs.dtype) \ + / text_attention_mask.sum(1, keepdim=True) + + pooled_text = torch.bmm( + text_outputs.transpose(2, 1), + text_attention_mask.unsqueeze(2) + ).squeeze(-1) + return pooled_text # text_outputs + + +class MMFusionJoint(MMFusion): + """fine-tuning wrapper for retrival task.""" + + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask=None, + video_label=None, + text_label=None, + **kwargs + ): + # TODO (huxu): other ways to do negative examples; move the following + # into your criterion forward. + output_hidden_states = True + + attention_mask, token_type_ids = self._mm_on_the_fly( + cmasks, vmasks, attention_mask) + + separate_forward_split = ( + None if self.is_train else vmasks.size(1) + 2 + ) # [CLS] + [SEP] + + outputs = self.mm_encoder( + input_ids=caps, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=output_hidden_states, + separate_forward_split=separate_forward_split, + ) + + pooled_video, pooled_text = self._pooling_vt_layer( + outputs[2], cmasks, vmasks) + return {"pooled_video": pooled_video, "pooled_text": pooled_text} + + +class MMFusionActionSegmentation(MMFusion): + """Fine-tuning wrapper for action segmentation. + TODO: rename this for VLM. + """ + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask=None, + **kwargs + ): + # ActionLocalization assume of batch_size=1, squeeze it. + caps = caps.view(-1, caps.size(-1)) + cmasks = cmasks.view(-1, cmasks.size(-1)) + vfeats = vfeats.view(-1, vfeats.size(2), vfeats.size(3)) + vmasks = vmasks.view(-1, vmasks.size(-1)) + + # this may not cover all shapes of attention_mask. + attention_mask = attention_mask.view( + -1, attention_mask.size(2), attention_mask.size(3)) \ + if attention_mask is not None else None + + # TODO (huxu): other ways to do negative examples; move the following + # into your criterion forward. + output_hidden_states = True + + # video forwarding, text is dummy; never use attention_mask. 
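+        # each row of the flattened batch is one sliding window over the
+        # video; the per-frame logits returned below are re-assembled into
+        # per-video predictions by the predictors (e.g. _merge_windows in
+        # COINPredictor).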
+ attention_mask, token_type_ids = self._mm_on_the_fly( + cmasks, vmasks, attention_mask) + + logits = self.mm_encoder( + input_ids=caps, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=output_hidden_states, + ) + return {"logits": logits[0][:, 1:vmasks.size(1)+1]} + + +class MMFusionActionLocalization(MMFusion): + """fine-tuning model for retrival task.""" + + def __init__(self, config, **kwargs): + super().__init__(config) + tokenizer = AutoTokenizer.from_pretrained( + config.dataset.bert_name) + self.cls_token_id = tokenizer.cls_token_id + self.sep_token_id = tokenizer.sep_token_id + self.pad_token_id = tokenizer.pad_token_id + + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask=None, + **kwargs + ): + # ActionLocalization assume of batch_size=1, squeeze it. + caps = caps.squeeze(0) + cmasks = cmasks.squeeze(0) + vfeats = vfeats.squeeze(0) + vmasks = vmasks.squeeze(0) + attention_mask = attention_mask.squeeze(0) if attention_mask is not None else None + + # TODO (huxu): other ways to do negative examples; move the following + # into your criterion forward. + output_hidden_states = True + + # a len1 dummy video token. + dummy_vfeats = torch.zeros( + (caps.size(0), 1, vfeats.size(-1)), device=vfeats.device, dtype=vfeats.dtype) + dummy_vmasks = torch.ones( + (caps.size(0), 1), dtype=torch.bool, + device=vfeats.device) + + dummy_caps = torch.LongTensor( + [[self.cls_token_id, self.sep_token_id, + self.pad_token_id, self.sep_token_id]], + ).to(caps.device).repeat(vfeats.size(0), 1) + dummy_cmasks = torch.BoolTensor( + [[0, 1, 0, 1]] # pad are valid for attention. + ).to(caps.device).repeat(vfeats.size(0), 1) + + # video forwarding, text is dummy; never use attention_mask. + attention_mask, token_type_ids = self._mm_on_the_fly( + dummy_cmasks, vmasks, None) + + outputs = self.mm_encoder( + input_ids=dummy_caps, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=output_hidden_states, + ) + + layer_idx = self.last_iso_layer \ + if self.last_iso_layer > 0 else self.num_hidden_layers + + video_seq = outputs[2][layer_idx][:, 1:vmasks.size(1)+1].masked_select( + vmasks.unsqueeze(-1) + ).view(-1, self.hidden_size) + + # text forwarding, video is dummy + attention_mask, token_type_ids = self._mm_on_the_fly( + cmasks, dummy_vmasks, None) + + outputs = self.mm_encoder( + input_ids=caps, + input_video_embeds=dummy_vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_hidden_states=output_hidden_states, + ) + + _, pooled_text = self._pooling_vt_layer( + outputs[2], cmasks, dummy_vmasks) + # this line is not right. + logits = torch.mm(video_seq, pooled_text.transpose(1, 0)) + return {"logits": logits} + + +# --------------- MMFusionSeparate for end tasks --------------- + +class MMFusionSeparateActionSegmentation(MMFusionSeparate): + """Fine-tuning wrapper for action segmentation.""" + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask=None, + **kwargs + ): + # ActionLocalization assume of batch_size=1, squeeze it. 
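+        # (here the leading batch dim of 1 is folded into the sliding-window
+        # dim via view, mirroring MMFusionActionSegmentation above.)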
+ caps = caps.view(-1, caps.size(-1)) + cmasks = cmasks.view(-1, cmasks.size(-1)) + vfeats = vfeats.view(-1, vfeats.size(2), vfeats.size(3)) + vmasks = vmasks.view(-1, vmasks.size(-1)) + logits = self.forward_video( + vfeats, + vmasks, + caps, + cmasks, + output_hidden_states=True + ) + return {"logits": logits[:, 1:vmasks.size(1)+1]} + + +class MMFusionSeparateActionLocalization(MMFusionSeparate): + def __init__(self, config, **kwargs): + super().__init__(config) + tokenizer = AutoTokenizer.from_pretrained( + config.dataset.bert_name) + self.cls_token_id = tokenizer.cls_token_id + self.sep_token_id = tokenizer.sep_token_id + self.pad_token_id = tokenizer.pad_token_id + + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + **kwargs + ): + # ActionLocalization assume of batch_size=1, squeeze it. + caps = caps.squeeze(0) + cmasks = cmasks.squeeze(0) + vfeats = vfeats.squeeze(0) + vmasks = vmasks.squeeze(0) + + # TODO (huxu): other ways to do negative examples; move the following + # into your criterion forward. + dummy_caps = torch.LongTensor( + [[self.cls_token_id, self.sep_token_id, + self.pad_token_id, self.sep_token_id]], + ).to(caps.device).repeat(vfeats.size(0), 1) + dummy_cmasks = torch.BoolTensor( + [[0, 1, 0, 1]] # pad are valid for attention. + ).to(caps.device).repeat(vfeats.size(0), 1) + + outputs = self.forward_video( + vfeats, + vmasks, + dummy_caps, + dummy_cmasks, + output_hidden_states=True + ) + + video_seq = outputs[:, 1:vmasks.size(1)+1].masked_select( + vmasks.unsqueeze(-1) + ).view(-1, self.hidden_size) + + pooled_text = self.forward_text( + caps, + cmasks, + output_hidden_states=False + ) + + # this line is not right. + logits = torch.mm(video_seq, pooled_text.transpose(1, 0)) + return {"logits": logits} + + +class MMFusionShareActionLocalization(MMFusionShare): + def __init__(self, config, **kwargs): + super().__init__(config) + tokenizer = AutoTokenizer.from_pretrained( + config.dataset.bert_name) + self.cls_token_id = tokenizer.cls_token_id + self.sep_token_id = tokenizer.sep_token_id + self.pad_token_id = tokenizer.pad_token_id + + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + **kwargs + ): + # ActionLocalization assume of batch_size=1, squeeze it. + caps = caps.squeeze(0) + cmasks = cmasks.squeeze(0) + vfeats = vfeats.squeeze(0) + vmasks = vmasks.squeeze(0) + + # TODO (huxu): other ways to do negative examples; move the following + # into your criterion forward. + dummy_caps = torch.LongTensor( + [[self.cls_token_id, self.sep_token_id, + self.pad_token_id, self.sep_token_id]], + ).to(caps.device).repeat(vfeats.size(0), 1) + dummy_cmasks = torch.BoolTensor( + [[0, 1, 0, 1]] # pad are valid for attention. + ).to(caps.device).repeat(vfeats.size(0), 1) + + outputs = self.forward_video( + vfeats, + vmasks, + dummy_caps, + dummy_cmasks, + output_hidden_states=True + ) + + video_seq = outputs[:, 1:vmasks.size(1)+1].masked_select( + vmasks.unsqueeze(-1) + ).view(-1, self.hidden_size) + + pooled_text = self.forward_text( + caps, + cmasks, + output_hidden_states=False + ) + + # this line is not right. + logits = torch.mm(video_seq, pooled_text.transpose(1, 0)) + return {"logits": logits} diff --git a/examples/MMPT/mmpt/models/mmfusionnlg.py b/examples/MMPT/mmpt/models/mmfusionnlg.py new file mode 100644 index 0000000000..9207e77dab --- /dev/null +++ b/examples/MMPT/mmpt/models/mmfusionnlg.py @@ -0,0 +1,999 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright (c) Facebook, Inc. All Rights Reserved + + +import torch + +from torch.nn import functional as F + +from typing import Optional, Iterable + +try: + from transformers import BertPreTrainedModel + from transformers.modeling_bert import BertOnlyMLMHead + + from transformers.file_utils import ModelOutput + from transformers.modeling_outputs import CausalLMOutput + from transformers.generation_utils import ( + BeamHypotheses, + top_k_top_p_filtering + ) +except ImportError: + pass + +from .mmfusion import MMFusion +from .transformermodel import MMBertModel +from ..modules import VideoTokenMLP + + +class MMFusionNLG(MMFusion): + def __init__(self, config, **kwargs): + super().__init__(config) + if config.model.max_decode_length is not None: + self.max_length = min( + config.model.max_decode_length, + config.dataset.max_len - config.dataset.max_video_len - 3 + ) + else: + self.max_length = \ + config.dataset.max_len - config.dataset.max_video_len - 3 + self.gen_param = config.gen_param if config.gen_param is not None \ + else {} + + def forward( + self, + caps, + cmasks, + vfeats, + vmasks, + attention_mask, + video_label=None, + text_label=None, + **kwargs + ): + """use pre-trained LM header for generation.""" + attention_mask, token_type_ids = self._mm_on_the_fly( + cmasks, vmasks, attention_mask) + + outputs = self.mm_encoder( + input_ids=caps, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + masked_lm_labels=text_label, + ) + return {"logits": outputs[0]} + + @torch.no_grad() + def generate( + self, + caps, cmasks, vfeats, vmasks, + attention_mask=None, + bos_token_id=None, + eos_token_id=None, + **kwargs + ): + # a simplified interface from + # https://huggingface.co/transformers/v3.4.0/_modules/transformers/generation_utils.html#GenerationMixin.generate + + # caps now only have + # [CLS], [SEP] (for video) and [CLS] (as bos_token) + assert caps.size(1) == 3 + + attention_mask, token_type_ids = self._mm_on_the_fly( + cmasks, vmasks, attention_mask) + + output = self.mm_encoder.generate( + input_ids=caps, + input_video_embeds=vfeats, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + max_length=self.max_length, + **self.gen_param + ) + return output + + +class MMBertForNLG(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.bert = MMBertModel(config) + self.videomlp = VideoTokenMLP(config) + # we do not use `BertGenerationOnlyLMHead` + # because we can reuse pretraining. 
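+        # i.e. the MLM head keeps the same parameter names as during
+        # pre-training, so its decoder/vocab projection weights can be loaded
+        # directly from the pre-trained checkpoint.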
+ self.cls = BertOnlyMLMHead(config) + self.hidden_size = config.hidden_size + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def forward( + self, + input_ids=None, + input_video_embeds=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # similar to MMBertForMFMMLM without MFM. + video_tokens = self.videomlp(input_video_embeds) + outputs = self.bert( + input_ids, + video_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + prediction_scores = None + if masked_lm_labels is not None: + text_offset = input_video_embeds.size(1) + 1 # [CLS] + # recover caps format: [CLS] [SEP] text [SEP] + text_sequence_output = torch.cat( + [sequence_output[:, :1], sequence_output[:, text_offset:]], + dim=1 + ) + + # only compute select tokens to training to speed up. + hidden_size = text_sequence_output.size(-1) + # masked_lm_labels = masked_lm_labels.reshape(-1) + labels_mask = masked_lm_labels != -100 + + selected_text_output = text_sequence_output.masked_select( + labels_mask.unsqueeze(-1) + ).view(-1, hidden_size) + prediction_scores = self.cls(selected_text_output) + + if not return_dict: + output = ( + prediction_scores, + ) + outputs[2:] + return output + + # for generation. + text_offset = input_video_embeds.size(1) + 2 # [CLS] + text_sequence_output = sequence_output[:, text_offset:] + prediction_scores = self.cls(text_sequence_output) + return CausalLMOutput( + loss=None, + logits=prediction_scores, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + input_video_embeds, + attention_mask=None, + token_type_ids=None, + **model_kwargs + ): + # must return a dictionary. 
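+        # called once per decoding step; the precomputed attention_mask /
+        # token_type_ids (2-D, 3-D, or a 4-D per-layer mask) are cropped to
+        # the current video + text length before each forward pass.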
+ seq_len = input_ids.size(1) + input_video_embeds.size(1) + if attention_mask is not None: + if len(attention_mask.size()) == 4: + attention_mask = attention_mask[:, :, :seq_len, :seq_len] + elif len(attention_mask.size()) == 3: + attention_mask = attention_mask[:, :seq_len, :seq_len] + else: + attention_mask = attention_mask[:, :seq_len] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, :seq_len] + + return { + "input_ids": input_ids, + "input_video_embeds": input_video_embeds, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + max_length: Optional[int] = None, + min_length: Optional[int] = None, + do_sample: Optional[bool] = None, + early_stopping: Optional[bool] = None, + num_beams: Optional[int] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + repetition_penalty: Optional[float] = None, + bad_words_ids: Optional[Iterable[int]] = None, + bos_token_id: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + length_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + num_return_sequences: Optional[int] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_start_token_id: Optional[int] = None, + use_cache: Optional[bool] = None, + **model_kwargs + ) -> torch.LongTensor: + r""" + Generates sequences for models with a language modeling head. The method currently supports greedy decoding, + beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. + Adapted in part from `Facebook's XLM beam search code + `__. + Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the + attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values + indicated are the default values of those config. + Most of these parameters are explained in more detail in `this blog post + `__. + Parameters: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + The sequence used as a prompt for the generation. If :obj:`None` the method initializes + it as an empty :obj:`torch.LongTensor` of shape :obj:`(1,)`. + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + initial input_ids for the decoder of encoder-decoder type models. If :obj:`None` then only + decoder_start_token_id is passed as the first token to the decoder. + max_length (:obj:`int`, `optional`, defaults to 20): + The maximum length of the sequence to be generated. + min_length (:obj:`int`, `optional`, defaults to 10): + The minimum length of the sequence to be generated. + do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use sampling ; use greedy decoding otherwise. + early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beams (:obj:`int`, `optional`, defaults to 1): + Number of beams for beam search. 1 means no beam search. + temperature (:obj:`float`, `optional`, defaults tp 1.0): + The value used to module the next token probabilities. 
+ top_k (:obj:`int`, `optional`, defaults to 50): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (:obj:`float`, `optional`, defaults to 1.0): + If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or + higher are kept for generation. + repetition_penalty (:obj:`float`, `optional`, defaults to 1.0): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + bos_token_id (:obj:`int`, `optional`): + The id of the `beginning-of-sequence` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. + Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in + order to encourage the model to produce longer sequences. + no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): + If set to int > 0, all ngrams of that size can only occur once. + bad_words_ids(:obj:`List[int]`, `optional`): + List of token ids that are not allowed to be generated. In order to get the tokens of the words that + should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`. + num_return_sequences(:obj:`int`, `optional`, defaults to 1): + The number of independently computed returned sequences for each element in the batch. + attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for + tokens that are not masked, and 0 for masked tokens. + If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token. + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_start_token_id (:obj:`int`, `optional`): + If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token. + use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should use the past last key/values attentions (if applicable to the model) to + speed up decoding. + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. + Return: + :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + Examples:: + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. 
+            input_context = 'The dog'
+            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
+            outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
+            for i in range(3):  # 3 output sequences were generated
+                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
+            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')  # Initialize tokenizer
+            model = AutoModelWithLMHead.from_pretrained('distilgpt2')  # Download model and configuration from S3 and cache.
+            input_context = 'The dog'
+            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
+            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 candidates using sampling
+            for i in range(3):  # 3 output sequences were generated
+                print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
+            tokenizer = AutoTokenizer.from_pretrained('ctrl')  # Initialize tokenizer
+            model = AutoModelWithLMHead.from_pretrained('ctrl')  # Download model and configuration from S3 and cache.
+            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
+            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
+            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
+            print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
+            tokenizer = AutoTokenizer.from_pretrained('gpt2')  # Initialize tokenizer
+            model = AutoModelWithLMHead.from_pretrained('gpt2')  # Download model and configuration from S3 and cache.
+            input_context = 'My cute dog'
+            bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
+            input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
+            outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences without allowing bad_words to be generated
+        """
+
+        # We cannot generate if the model does not have a LM head
+        if self.get_output_embeddings() is None:
+            raise AttributeError(
+                "You tried to generate sequences with a model that does not have a LM Head. "
+                "Please use another model class (e.g. 
`OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + use_cache = use_cache if use_cache is not None else self.config.use_cache + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = input_ids.shape[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." + assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." + assert isinstance(use_cache, bool), "`use_cache` should be a boolean." + assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." + assert temperature > 0, "`temperature` should be strictly positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictly positive." + assert ( + isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 + ), "`no_repeat_ngram_size` should be a positive integer." 
+ assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictly positive integer." + assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = torch.full( + (batch_size, 1), + bos_token_id, + dtype=torch.long, + device=next(self.parameters()).device, + ) + else: + assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): + attention_mask = input_ids.ne(pad_token_id).long() + elif attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + # set pad_token_id to eos_token_id if not set. 
Important that this is done after + # attention_mask is created + if pad_token_id is None and eos_token_id is not None: + print( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) + ) + pad_token_id = eos_token_id + + # vocab size + if hasattr(self.config, "vocab_size"): + vocab_size = self.config.vocab_size + elif ( + self.config.is_encoder_decoder + and hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "vocab_size") + ): + vocab_size = self.config.decoder.vocab_size + else: + raise ValueError("either self.config.vocab_size or self.config.decoder.vocab_size needs to be defined") + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + # see if BOS token can be used for decoder_start_token_id + if bos_token_id is not None: + decoder_start_token_id = bos_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "bos_token_id") + and self.config.decoder.bos_token_id is not None + ): + decoder_start_token_id = self.config.decoder.bos_token_id + else: + raise ValueError( + "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + ) + + assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) + assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) + + # get encoder and store encoder outputs + encoder = self.get_encoder() + encoder_outputs: ModelOutput = encoder(input_ids, attention_mask=attention_mask, return_dict=True) + + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if num_return_sequences > 1 or num_beams > 1: + # TODO: make this a call-back function. + # input_ids=caps, + # input_video_embeds=vfeats, + # attention_mask=attention_mask, + # token_type_ids=token_type_ids, + input_video_embeds = model_kwargs.pop("input_video_embeds", None) + token_type_ids = model_kwargs.pop("token_type_ids", None) + + input_ids_len = input_ids.shape[-1] + input_ids = input_ids.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, input_ids_len) + + input_video_embeds_len, input_video_embeds_hidden = input_video_embeds.size(1), input_video_embeds.size(2) + input_video_embeds = input_video_embeds.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, input_video_embeds_len, input_video_embeds_hidden) + + attention_mask_from_len, attention_mask_to_len = attention_mask.size(1), attention_mask.size(2) + attention_mask = attention_mask.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, attention_mask_from_len, attention_mask_to_len + ) + + token_type_ids_len = token_type_ids.size(1) + token_type_ids = token_type_ids.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, token_type_ids_len + ) + + # contiguous ... 
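+            # flatten the (batch, beams) expansion into one leading dimension of
+            # size effective_batch_size * num_beams, so each beam is treated as
+            # an independent example by the encoder.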
+ input_ids = input_ids.contiguous().view( + effective_batch_size * num_beams, input_ids_len + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + input_video_embeds = input_video_embeds.contiguous().view( + effective_batch_size * num_beams, input_video_embeds_len, input_video_embeds_hidden) + + attention_mask = attention_mask.contiguous().view( + effective_batch_size * num_beams, attention_mask_from_len, attention_mask_to_len + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + token_type_ids = token_type_ids.contiguous().view( + effective_batch_size * num_beams, token_type_ids_len + ) + + model_kwargs["input_video_embeds"] = input_video_embeds + model_kwargs["token_type_ids"] = token_type_ids + + if self.config.is_encoder_decoder: + device = next(self.parameters()).device + if decoder_input_ids is not None: + # give initial decoder input ids + input_ids = decoder_input_ids.repeat(effective_batch_size * num_beams, 1).to(device) + else: + # create empty decoder input_ids + input_ids = torch.full( + (effective_batch_size * num_beams, 1), + decoder_start_token_id, + dtype=torch.long, + device=device, + ) + cur_len = input_ids.shape[-1] + + assert ( + batch_size == encoder_outputs.last_hidden_state.shape[0] + ), f"expected encoder_outputs.last_hidden_state to have 1st dimension bs={batch_size}, got {encoder_outputs.last_hidden_state.shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = ( + torch.arange(batch_size) + .view(-1, 1) + .repeat(1, num_beams * effective_batch_mult) + .view(-1) + .to(input_ids.device) + ) + + # expand encoder_outputs + encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select( + 0, expanded_batch_idxs + ) + + # save encoder_outputs in `model_kwargs` + model_kwargs["encoder_outputs"] = encoder_outputs + + else: + cur_len = input_ids.shape[-1] + + assert ( + cur_len < max_length + ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + attention_mask=attention_mask, + use_cache=use_cache, + model_kwargs=model_kwargs, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + attention_mask=attention_mask, + use_cache=use_cache, + model_kwargs=model_kwargs, + ) + + return output + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + pad_token_id, + eos_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + attention_mask, + use_cache, + model_kwargs, + ): + """Generate sequences for each example with beam search.""" + + # generated hypotheses + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) + for _ in range(batch_size) + ] + + # scores for each sentence in the beam + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) + + # cache compute states + past = None + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs + ) + outputs = self(**model_inputs, return_dict=True) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if "past_key_values" in outputs: + past = outputs.past_key_values + elif "mems" in outputs: + past = outputs.mems + + if self.config.is_encoder_decoder and do_sample is False: + # TODO (PVP) still a bit hacky here - there might be a better solution + next_token_logits = self.adjust_logits_during_generation( + next_token_logits, cur_len=cur_len, max_length=max_length + ) + + scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) + + scores = self.postprocess_next_token_scores( + scores=scores, + input_ids=input_ids, + no_repeat_ngram_size=no_repeat_ngram_size, + 
bad_words_ids=bad_words_ids, + cur_len=cur_len, + min_length=min_length, + max_length=max_length, + eos_token_id=eos_token_id, + repetition_penalty=repetition_penalty, + batch_size=batch_size, + num_beams=num_beams, + ) + + assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( + scores.shape, (batch_size * num_beams, vocab_size) + ) + + if do_sample: + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + # Temperature + if temperature != 1.0: + _scores = _scores / temperature + # Top-p/top-k filtering + _scores = top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # re-organize to group the beam together to sample from all beam_idxs + _scores = _scores.contiguous().view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + probs = F.softmax(_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) + # Compute next scores + next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) + + else: + next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = next_scores.view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) + + assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence, add a pad token + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format(num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch + continue + + # next sentence beam content, this will get added to next_batch_beam + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (token_id.item() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + input_ids[effective_beam_id].clone(), + beam_token_score.item(), + ) + else: + # add next predicted token since it is not 
eos_token + next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) + + # once the beam for next step is full, don't add more tokens to it. + if len(next_sent_beam) == num_beams: + break + + # Check if we are done so that we can save a pad step if all(done) + done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( + next_scores[batch_idx].max().item(), cur_len + ) + + # update next beam content + assert len(next_sent_beam) == num_beams, "Beam should always be full" + next_batch_beam.extend(next_sent_beam) + assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step" + + # stop when we are done with each sentence + if all(done): + break + + # sanity check / prepare next batch + assert len(next_batch_beam) == batch_size * num_beams + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) + beam_idx = input_ids.new([x[2] for x in next_batch_beam]) + + # re-order batch and update current length + input_ids = input_ids[beam_idx, :] + input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) + cur_len = cur_len + 1 + + # re-order internal states + if past is not None: + past = self._reorder_cache(past, beam_idx) + + # extend attention_mask for new generated input if only decoder + # (huxu): move out since we trim attention_mask by ourselves. + # if self.config.is_encoder_decoder is False: + # attention_mask = torch.cat( + # [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + # ) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx in range(batch_size): + if done[batch_idx]: + continue + + # test that beam scores match previously calculated scores if not eos and batch_idx not done + if eos_token_id is not None and all( + (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx] + ): + assert torch.all( + next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] + ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( + next_scores[:, :num_beams][batch_idx], + beam_scores.view(batch_size, num_beams)[batch_idx], + ) + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(num_beams): + effective_beam_id = batch_idx * num_beams + beam_id + final_score = beam_scores[effective_beam_id].item() + final_tokens = input_ids[effective_beam_id] + generated_hyps[batch_idx].add(final_tokens, final_score) + + # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch + output_batch_size = batch_size if do_sample else batch_size * num_return_sequences + output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences + + # select the best hypotheses + sent_lengths = input_ids.new(output_batch_size) + best = [] + + # retrieve best hypotheses + for i, hypotheses in enumerate(generated_hyps): + sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) + for j in range(output_num_return_sequences_per_batch): + effective_batch_idx = output_num_return_sequences_per_batch * i + j + best_hyp = sorted_hyps.pop()[1] + sent_lengths[effective_batch_idx] = len(best_hyp) + best.append(best_hyp) + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item() + 1, max_length) + decoded = input_ids.new(output_batch_size, sent_max_len) + # shorter batches are padded if 
needed
+        if sent_lengths.min().item() != sent_lengths.max().item():
+            assert pad_token_id is not None, "`pad_token_id` has to be defined"
+            decoded.fill_(pad_token_id)
+
+        # fill with hypotheses and eos_token_id if the latter fits in
+        for i, hypo in enumerate(best):
+            decoded[i, : sent_lengths[i]] = hypo
+            if sent_lengths[i] < max_length:
+                decoded[i, sent_lengths[i]] = eos_token_id
+
+        return decoded
+
+    def _generate_no_beam_search(
+        self,
+        input_ids,
+        cur_len,
+        max_length,
+        min_length,
+        do_sample,
+        temperature,
+        top_k,
+        top_p,
+        repetition_penalty,
+        no_repeat_ngram_size,
+        bad_words_ids,
+        pad_token_id,
+        eos_token_id,
+        batch_size,
+        attention_mask,
+        use_cache,
+        model_kwargs,
+    ):
+        """Generate sequences for each example without beam search (num_beams == 1).
+        All returned sequences are generated independently.
+        """
+        # length of generated sentences / unfinished sentences
+        unfinished_sents = input_ids.new(batch_size).fill_(1)
+        sent_lengths = input_ids.new(batch_size).fill_(max_length)
+
+        past = None
+        while cur_len < max_length:
+            model_inputs = self.prepare_inputs_for_generation(
+                input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs
+            )
+
+            outputs = self(**model_inputs, return_dict=True)
+            next_token_logits = outputs.logits[:, -1, :]
+            scores = self.postprocess_next_token_scores(
+                scores=next_token_logits,
+                input_ids=input_ids,
+                no_repeat_ngram_size=no_repeat_ngram_size,
+                bad_words_ids=bad_words_ids,
+                cur_len=cur_len,
+                min_length=min_length,
+                max_length=max_length,
+                eos_token_id=eos_token_id,
+                repetition_penalty=repetition_penalty,
+                batch_size=batch_size,
+                num_beams=1,
+            )
+
+            # if model has past, then set the past variable to speed up decoding
+            if "past_key_values" in outputs:
+                past = outputs.past_key_values
+            elif "mems" in outputs:
+                past = outputs.mems
+
+            if do_sample:
+                # Temperature (higher temperature => more likely to sample low probability tokens)
+                if temperature != 1.0:
+                    scores = scores / temperature
+                # Top-p/top-k filtering
+                next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p)
+                # Sample
+                probs = F.softmax(next_token_logscores, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                # Greedy decoding
+                next_token = torch.argmax(next_token_logits, dim=-1)
+
+            # print(next_token_logits[0,next_token[0]], next_token_logits[0,eos_token_id])
+
+            # update generations and finished sentences
+            if eos_token_id is not None:
+                # pad finished sentences if eos_token_id exists
+                tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
+            else:
+                tokens_to_add = next_token
+
+            # add token and increase length by one
+            input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
+            cur_len = cur_len + 1
+
+            if eos_token_id is not None:
+                eos_in_sents = tokens_to_add == eos_token_id
+                # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
+                is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool()
+                sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len)
+                # unfinished_sents is set to zero if eos in sentence
+                unfinished_sents.mul_((~eos_in_sents).long())
+
+            # stop when there is an eos_token in each sentence, or if we exceed the maximum length
+            if unfinished_sents.max() == 0:
+                break
+
+
+            # extend attention_mask for new generated input if only decoder
+            # if self.config.is_encoder_decoder is False:
+            #     attention_mask = torch.cat(
+            #         
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + # ) + + return input_ids diff --git a/examples/MMPT/mmpt/models/transformermodel.py b/examples/MMPT/mmpt/models/transformermodel.py new file mode 100644 index 0000000000..6acc419f09 --- /dev/null +++ b/examples/MMPT/mmpt/models/transformermodel.py @@ -0,0 +1,734 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright (c) Facebook, Inc. All Rights Reserved + +import torch + +from torch import nn + +try: + from transformers.modeling_bert import ( + BertPreTrainedModel, + BertModel, + BertEncoder, + BertPredictionHeadTransform, + ) +except ImportError: + pass + +from ..modules import VideoTokenMLP, MMBertEmbeddings + + +# --------------- fine-tuning models --------------- +class MMBertForJoint(BertPreTrainedModel): + """A BertModel with isolated attention mask to separate modality.""" + + def __init__(self, config): + super().__init__(config) + self.videomlp = VideoTokenMLP(config) + self.bert = MMBertModel(config) + self.init_weights() + + def forward( + self, + input_ids=None, + input_video_embeds=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + separate_forward_split=None, + ): + return_dict = ( + return_dict if return_dict is not None + else self.config.use_return_dict + ) + video_tokens = self.videomlp(input_video_embeds) + + outputs = self.bert( + input_ids, + video_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + separate_forward_split=separate_forward_split, + ) + + return outputs + + +class MMBertForTokenClassification(BertPreTrainedModel): + """A BertModel similar to MMJointUni, with extra wrapper layer + to be fine-tuned from other pretrained MMFusion model.""" + + def __init__(self, config): + super().__init__(config) + self.videomlp = VideoTokenMLP(config) + self.bert = MMBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # TODO(huxu): 779 is the number of classes for COIN: move to config? 
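+        # per-token classification head over the 779 COIN step classes; it is
+        # applied to every position of the encoder output in `forward`.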
+ self.classifier = nn.Linear(config.hidden_size, 779) + self.init_weights() + + def forward( + self, + input_ids=None, + input_video_embeds=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + separate_forward_split=None, + ): + return_dict = ( + return_dict if return_dict is not None + else self.config.use_return_dict + ) + + video_tokens = self.videomlp(input_video_embeds) + outputs = self.bert( + input_ids, + video_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + separate_forward_split=separate_forward_split, + ) + + return (self.classifier(outputs[0]),) + + +# ------------ pre-training models ---------------- + +class MMBertForEncoder(BertPreTrainedModel): + """A BertModel for Contrastive Learning.""" + def __init__(self, config): + super().__init__(config) + self.videomlp = VideoTokenMLP(config) + self.bert = MMBertModel(config) + self.init_weights() + + def forward( + self, + input_ids=None, + input_video_embeds=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None + else self.config.use_return_dict + ) + if input_video_embeds is not None: + video_tokens = self.videomlp(input_video_embeds) + else: + video_tokens = None + + outputs = self.bert( + input_ids, + video_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs + + +class MMBertForMFMMLM(BertPreTrainedModel): + """A BertModel with shared prediction head on MFM-MLM.""" + def __init__(self, config): + super().__init__(config) + self.videomlp = VideoTokenMLP(config) + self.bert = MMBertModel(config) + self.cls = MFMMLMHead(config) + self.hidden_size = config.hidden_size + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def forward( + self, + input_ids=None, + input_video_embeds=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_frame_labels=None, + target_video_hidden_states=None, + non_masked_frame_mask=None, + masked_lm_labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = ( + return_dict if return_dict is not None + else self.config.use_return_dict + ) + if input_video_embeds is not None: + video_tokens = self.videomlp(input_video_embeds) + else: + video_tokens = None + + if target_video_hidden_states is not None: + target_video_hidden_states = self.videomlp( + target_video_hidden_states) + + non_masked_frame_hidden_states = video_tokens.masked_select( + non_masked_frame_mask.unsqueeze(-1) + ).view(-1, self.hidden_size) + + outputs = self.bert( + input_ids, + video_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + mfm_scores, prediction_scores = None, None + if masked_frame_labels is not None and masked_lm_labels is not None: + # split the sequence. + text_offset = masked_frame_labels.size(1) + 1 # [CLS] + video_sequence_output = sequence_output[ + :, 1:text_offset + ] # remove [SEP] as not in video_label. + text_sequence_output = torch.cat( + [sequence_output[:, :1], sequence_output[:, text_offset:]], + dim=1 + ) + + hidden_size = video_sequence_output.size(-1) + selected_video_output = video_sequence_output.masked_select( + masked_frame_labels.unsqueeze(-1) + ).view(-1, hidden_size) + + # only compute select tokens to training to speed up. + hidden_size = text_sequence_output.size(-1) + # masked_lm_labels = masked_lm_labels.reshape(-1) + labels_mask = masked_lm_labels != -100 + + selected_text_output = text_sequence_output.masked_select( + labels_mask.unsqueeze(-1) + ).view(-1, hidden_size) + mfm_scores, prediction_scores = self.cls( + selected_video_output, + target_video_hidden_states, + non_masked_frame_hidden_states, + selected_text_output, + ) + + output = ( + mfm_scores, + prediction_scores, + ) + outputs + return output + + +class BertMFMMLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly + # resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward( + self, + video_hidden_states=None, + target_video_hidden_states=None, + non_masked_frame_hidden_states=None, + text_hidden_states=None, + ): + video_logits, text_logits = None, None + if video_hidden_states is not None: + video_hidden_states = self.transform(video_hidden_states) + non_masked_frame_logits = torch.mm( + video_hidden_states, + non_masked_frame_hidden_states.transpose(1, 0) + ) + masked_frame_logits = torch.bmm( + video_hidden_states.unsqueeze(1), + target_video_hidden_states.unsqueeze(-1), + ).squeeze(-1) + video_logits = torch.cat( + [masked_frame_logits, non_masked_frame_logits], dim=1 + ) + + if text_hidden_states is not None: + text_hidden_states = self.transform(text_hidden_states) + text_logits = self.decoder(text_hidden_states) + return video_logits, text_logits + + +class MFMMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertMFMMLMPredictionHead(config) + + def forward( + self, + video_hidden_states=None, + target_video_hidden_states=None, + non_masked_frame_hidden_states=None, + text_hidden_states=None, + ): + video_logits, text_logits = self.predictions( + video_hidden_states, + target_video_hidden_states, + non_masked_frame_hidden_states, + text_hidden_states, + ) + return video_logits, text_logits + + +class MMBertForMTM(MMBertForMFMMLM): + def __init__(self, config): + BertPreTrainedModel.__init__(self, config) + self.videomlp = VideoTokenMLP(config) + self.bert = MMBertModel(config) + self.cls = MTMHead(config) + self.hidden_size = config.hidden_size + self.init_weights() + + +class BertMTMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = 
BertPredictionHeadTransform(config)
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+    def forward(
+        self,
+        video_hidden_states=None,
+        target_video_hidden_states=None,
+        non_masked_frame_hidden_states=None,
+        text_hidden_states=None,
+    ):
+        non_masked_frame_hidden_states = non_masked_frame_hidden_states.transpose(1, 0)
+        video_logits, text_logits = None, None
+        if video_hidden_states is not None:
+            video_hidden_states = self.transform(video_hidden_states)
+
+            masked_frame_logits = torch.bmm(
+                video_hidden_states.unsqueeze(1),
+                target_video_hidden_states.unsqueeze(-1),
+            ).squeeze(-1)
+
+            non_masked_frame_logits = torch.mm(
+                video_hidden_states,
+                non_masked_frame_hidden_states
+            )
+            video_on_vocab_logits = self.decoder(video_hidden_states)
+            video_logits = torch.cat([
+                masked_frame_logits,
+                non_masked_frame_logits,
+                video_on_vocab_logits], dim=1)
+
+        if text_hidden_states is not None:
+            text_hidden_states = self.transform(text_hidden_states)
+            # text first so label does not need to be shifted.
+            text_on_vocab_logits = self.decoder(text_hidden_states)
+            text_on_video_logits = torch.mm(
+                text_hidden_states,
+                non_masked_frame_hidden_states
+            )
+            text_logits = torch.cat([
+                text_on_vocab_logits,
+                text_on_video_logits
+            ], dim=1)
+
+        return video_logits, text_logits
+
+
+class MTMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertMTMPredictionHead(config)
+
+    def forward(
+        self,
+        video_hidden_states=None,
+        target_video_hidden_states=None,
+        non_masked_frame_hidden_states=None,
+        text_hidden_states=None,
+    ):
+        video_logits, text_logits = self.predictions(
+            video_hidden_states,
+            target_video_hidden_states,
+            non_masked_frame_hidden_states,
+            text_hidden_states,
+        )
+        return video_logits, text_logits
+
+
+class MMBertModel(BertModel):
+    """MMBertModel has MMBertEmbedding to support video tokens."""
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        # overwrite embedding
+        self.embeddings = MMBertEmbeddings(config)
+        self.encoder = MultiLayerAttentionMaskBertEncoder(config)
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        input_video_embeds=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        separate_forward_split=None,
+    ):
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None
+            else self.config.use_return_dict
+        )
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids "
+                "and inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            if input_video_embeds is not None:
+                input_shape = (
+                    input_ids.size(0),
+                    input_ids.size(1) + input_video_embeds.size(1),
+                )
+            else:
+                input_shape = (
+                    input_ids.size(0),
+                    input_ids.size(1),
+                )
+        elif inputs_embeds is not None:
+            if input_video_embeds is not None:
+                input_shape = (
+                    inputs_embeds.size(0),
+                    inputs_embeds.size(1) + input_video_embeds.size(1),
+                )
+            else:
+                input_shape = (
+                    inputs_embeds.size(0),
+                    inputs_embeds.size(1),
+                )
+        else:
+            raise ValueError(
+                "You have to specify either input_ids 
or inputs_embeds") + + device = input_ids.device if input_ids is not None \ + else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions + # [batch_size, from_seq_length, to_seq_length] + # ourselves in which case + # we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = \ + self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to + # [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = ( + encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask + ) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or + # [num_hidden_layers x num_heads] + # and head_mask is converted to shape + # [num_hidden_layers x batch x num_heads x seq_length x seq_length] + + head_mask = self.get_head_mask( + head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids, + input_video_embeds, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + + if separate_forward_split is not None: + split_embedding_output = \ + embedding_output[:, :separate_forward_split] + split_extended_attention_mask = extended_attention_mask[ + :, :, :, :separate_forward_split, :separate_forward_split + ] + split_encoder_outputs = self.encoder( + split_embedding_output, + attention_mask=split_extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + assert ( + len(split_encoder_outputs) <= 2 + ), "we do not support merge on attention for now." + encoder_outputs = [] + encoder_outputs.append([split_encoder_outputs[0]]) + if len(split_encoder_outputs) == 2: + encoder_outputs.append([]) + for _all_hidden_states in split_encoder_outputs[1]: + encoder_outputs[-1].append([_all_hidden_states]) + + split_embedding_output = \ + embedding_output[:, separate_forward_split:] + split_extended_attention_mask = extended_attention_mask[ + :, :, :, separate_forward_split:, separate_forward_split: + ] + + split_encoder_outputs = self.encoder( + split_embedding_output, + attention_mask=split_extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + assert ( + len(split_encoder_outputs) <= 2 + ), "we do not support merge on attention for now." 
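+            # merge the two partial forward passes: hidden states (and per-layer
+            # hidden states, if returned) are concatenated back along the
+            # sequence dimension.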
+            encoder_outputs[0].append(split_encoder_outputs[0])
+            encoder_outputs[0] = torch.cat(encoder_outputs[0], dim=1)
+            if len(split_encoder_outputs) == 2:
+                for layer_idx, _all_hidden_states in enumerate(
+                    split_encoder_outputs[1]
+                ):
+                    encoder_outputs[1][layer_idx].append(_all_hidden_states)
+                    encoder_outputs[1][layer_idx] = torch.cat(
+                        encoder_outputs[1][layer_idx], dim=1
+                    )
+            encoder_outputs = tuple(encoder_outputs)
+        else:
+            encoder_outputs = self.encoder(
+                embedding_output,
+                attention_mask=extended_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_extended_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = (
+            self.pooler(sequence_output) if self.pooler is not None else None
+        )
+
+        return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+    def get_extended_attention_mask(self, attention_mask, input_shape, device):
+        """This is borrowed from `modeling_utils.py`, with added support for
+        multi-layer attention masks.
+        The second dim is expected to be the number of layers.
+        See `MMAttentionMaskProcessor`.
+        Makes broadcastable attention and causal masks so that future
+        and masked tokens are ignored.
+
+        Arguments:
+            attention_mask (:obj:`torch.Tensor`):
+                Mask with ones indicating tokens to attend to,
+                zeros for tokens to ignore.
+            input_shape (:obj:`Tuple[int]`):
+                The shape of the input to the model.
+            device: (:obj:`torch.device`):
+                The device of the input to the model.
+
+        Returns:
+            :obj:`torch.Tensor` The extended attention mask, \
+                with the same dtype as :obj:`attention_mask.dtype`.
+        """
+        # We can provide a self-attention mask of dimensions
+        # [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable
+        # to all heads.
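+        # A 4-D mask is [batch_size, num_layers, from_seq_length, to_seq_length]:
+        # insert a singleton head dimension and convert 1/0 entries into additive
+        # 0 / -10000.0 biases, keeping one mask per layer for
+        # MultiLayerAttentionMaskBertEncoder.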
+ if attention_mask.dim() == 4: + extended_attention_mask = attention_mask[:, :, None, :, :] + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) \ + * -10000.0 + return extended_attention_mask + else: + return super().get_extended_attention_mask( + attention_mask, input_shape, device + ) + + +class MultiLayerAttentionMaskBertEncoder(BertEncoder): + """extend BertEncoder with the capability of + multiple layers of attention mask.""" + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_attention_mask = ( + attention_mask[:, i, :, :, :] + if attention_mask.dim() == 5 + else attention_mask + ) + + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + layer_attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + layer_attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return tuple( + v + for v in [hidden_states, all_hidden_states, all_attentions] + if v is not None + ) diff --git a/examples/MMPT/mmpt/modules/__init__.py b/examples/MMPT/mmpt/modules/__init__.py new file mode 100644 index 0000000000..4c78594c21 --- /dev/null +++ b/examples/MMPT/mmpt/modules/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .mm import * + +try: + from .expmm import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/modules/mm.py b/examples/MMPT/mmpt/modules/mm.py new file mode 100644 index 0000000000..5d9777371a --- /dev/null +++ b/examples/MMPT/mmpt/modules/mm.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright (c) Facebook, Inc. 
All Rights Reserved + + +import torch + +from torch import nn + +try: + from transformers.modeling_bert import ( + BertEmbeddings, + ACT2FN, + ) +except ImportError: + pass + + +class VideoTokenMLP(nn.Module): + def __init__(self, config): + super().__init__() + input_dim = config.input_dim if hasattr(config, "input_dim") else 512 + self.linear1 = nn.Linear(input_dim, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.linear2 = nn.Linear(config.hidden_size, config.hidden_size) + + def forward(self, hidden_states): + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +class MMBertEmbeddings(BertEmbeddings): + def __init__(self, config): + super().__init__(config) + self.max_video_len = config.max_video_len + if hasattr(config, "use_seg_emb") and config.use_seg_emb: + """the original VLM paper uses seg_embeddings for temporal space. + although not used it changed the randomness of initialization. + we keep it for reproducibility. + """ + self.seg_embeddings = nn.Embedding(256, config.hidden_size) + + def forward( + self, + input_ids, + input_video_embeds, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + ): + input_tensor = input_ids if input_ids is not None else inputs_embeds + if input_video_embeds is not None: + input_shape = ( + input_tensor.size(0), + input_tensor.size(1) + input_video_embeds.size(1), + ) + else: + input_shape = (input_tensor.size(0), input_tensor.size(1)) + + if position_ids is None: + """ + Auto skip position embeddings for text only case. + use cases: + (1) action localization and segmentation: + feed in len-1 dummy video token needs text part to + skip input_video_embeds.size(1) for the right + position_ids for video [SEP] and rest text tokens. + (2) MMFusionShare for two forward passings: + in `forward_text`: input_video_embeds is None. + need to skip video [SEP] token. + + # video_len + 1: [CLS] + video_embed + # self.max_video_len + 1: [SEP] for video. + # self.max_video_len + 2: [SEP] for video. + # self.max_video_len + input_ids.size(1): rest for text. + """ + if input_video_embeds is not None: + video_len = input_video_embeds.size(1) + starting_offset = self.max_video_len + 1 # video [SEP] + ending_offset = self.max_video_len + input_ids.size(1) + else: + video_len = 0 + starting_offset = self.max_video_len + 2 # first text token. + ending_offset = self.max_video_len + input_ids.size(1) + 1 + position_ids = torch.cat([ + self.position_ids[:, :video_len + 1], + self.position_ids[:, starting_offset:ending_offset] + ], dim=1) + + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device + ) + + """ + the format of input_ids is [CLS] [SEP] caption [SEP] padding. + the goal is to build [CLS] video tokens [SEP] caption [SEP] . + """ + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if input_video_embeds is not None: + inputs_mm_embeds = torch.cat([ + inputs_embeds[:, :1], input_video_embeds, inputs_embeds[:, 1:] + ], dim=1) + else: + # text only for `MMFusionShare`. 
+ inputs_mm_embeds = inputs_embeds + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = inputs_mm_embeds + position_embeddings + embeddings += token_type_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class AlignHead(nn.Module): + """this will load pre-trained weights for NSP, which is desirable.""" + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, dropout_pooled_output): + logits = self.seq_relationship(dropout_pooled_output) + return logits diff --git a/examples/MMPT/mmpt/modules/retri.py b/examples/MMPT/mmpt/modules/retri.py new file mode 100644 index 0000000000..d1b288f8e5 --- /dev/null +++ b/examples/MMPT/mmpt/modules/retri.py @@ -0,0 +1,429 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os +import numpy as np +import pickle +import time + +try: + import faiss +except ImportError: + pass + +from collections import defaultdict + +from ..utils import get_local_rank, print_on_rank0 + + +class VectorRetriever(object): + """ + How2 Video Retriver. + Reference usage of FAISS: + https://github.com/fairinternal/fairseq-py/blob/paraphrase_pretraining/fairseq/data/multilingual_faiss_dataset.py + """ + + def __init__(self, hidden_size, cent, db_type, examples_per_cent_to_train): + if db_type == "flatl2": + quantizer = faiss.IndexFlatL2(hidden_size) # the other index + self.db = faiss.IndexIVFFlat( + quantizer, hidden_size, cent, faiss.METRIC_L2) + elif db_type == "pq": + self.db = faiss.index_factory( + hidden_size, f"IVF{cent}_HNSW32,PQ32" + ) + else: + raise ValueError("unknown type of db", db_type) + self.train_thres = cent * examples_per_cent_to_train + self.train_cache = [] + self.train_len = 0 + self.videoid_to_vectoridx = {} + self.vectoridx_to_videoid = None + self.make_direct_maps_done = False + + def make_direct_maps(self): + faiss.downcast_index(self.db).make_direct_map() + + def __len__(self): + return self.db.ntotal + + def save(self, out_dir): + faiss.write_index( + self.db, + os.path.join(out_dir, "faiss_idx") + ) + with open( + os.path.join( + out_dir, "videoid_to_vectoridx.pkl"), + "wb") as fw: + pickle.dump( + self.videoid_to_vectoridx, fw, + protocol=pickle.HIGHEST_PROTOCOL + ) + + def load(self, out_dir): + fn = os.path.join(out_dir, "faiss_idx") + self.db = faiss.read_index(fn) + with open( + os.path.join(out_dir, "videoid_to_vectoridx.pkl"), "rb") as fr: + self.videoid_to_vectoridx = pickle.load(fr) + + def add(self, hidden_states, video_ids, last=False): + assert len(hidden_states) == len(video_ids), "{}, {}".format( + str(len(hidden_states)), str(len(video_ids))) + assert len(hidden_states.shape) == 2 + assert hidden_states.dtype == np.float32 + + valid_idx = [] + for idx, video_id in enumerate(video_ids): + if video_id not in self.videoid_to_vectoridx: + valid_idx.append(idx) + self.videoid_to_vectoridx[video_id] = \ + len(self.videoid_to_vectoridx) + + hidden_states = hidden_states[valid_idx] + if not self.db.is_trained: + self.train_cache.append(hidden_states) + self.train_len += hidden_states.shape[0] + if self.train_len < self.train_thres: + return + self.finalize_training() + else: + self.db.add(hidden_states) + + def finalize_training(self): + hidden_states = 
np.concatenate(self.train_cache, axis=0) + del self.train_cache + local_rank = get_local_rank() + if local_rank == 0: + start = time.time() + print("training db on", self.train_thres, "/", self.train_len) + self.db.train(hidden_states[:self.train_thres]) + if local_rank == 0: + print("training db for", time.time() - start) + self.db.add(hidden_states) + + def search( + self, + query_hidden_states, + orig_dist, + ): + if len(self.videoid_to_vectoridx) != self.db.ntotal: + raise ValueError( + "cannot search: size mismatch in-between index and db", + len(self.videoid_to_vectoridx), + self.db.ntotal + ) + + if self.vectoridx_to_videoid is None: + self.vectoridx_to_videoid = { + self.videoid_to_vectoridx[videoid]: videoid + for videoid in self.videoid_to_vectoridx + } + assert len(self.vectoridx_to_videoid) \ + == len(self.videoid_to_vectoridx) + + # MultilingualFaissDataset uses the following; not sure the purpose. + # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10) + queried_dist, index = self.db.search(query_hidden_states, 1) + queried_dist, index = queried_dist[:, 0], index[:, 0] + + outputs = np.array( + [self.vectoridx_to_videoid[_index] + if _index != -1 else (-1, -1, -1) for _index in index], + dtype=np.int32) + outputs[queried_dist <= orig_dist] = -1 + return outputs + + def search_by_video_ids( + self, + video_ids, + retri_factor + ): + if len(self.videoid_to_vectoridx) != self.db.ntotal: + raise ValueError( + len(self.videoid_to_vectoridx), + self.db.ntotal + ) + + if not self.make_direct_maps_done: + self.make_direct_maps() + + if self.vectoridx_to_videoid is None: + self.vectoridx_to_videoid = { + self.videoid_to_vectoridx[videoid]: videoid + for videoid in self.videoid_to_vectoridx + } + assert len(self.vectoridx_to_videoid) \ + == len(self.videoid_to_vectoridx) + + query_hidden_states = [] + vector_ids = [] + for video_id in video_ids: + vector_id = self.videoid_to_vectoridx[video_id] + vector_ids.append(vector_id) + query_hidden_state = self.db.reconstruct(vector_id) + query_hidden_states.append(query_hidden_state) + query_hidden_states = np.stack(query_hidden_states) + + # MultilingualFaissDataset uses the following; not sure the reason. + # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10) + _, index = self.db.search(query_hidden_states, retri_factor) + outputs = [] + for sample_idx, sample in enumerate(index): + # the first video_id is always the video itself. + cands = [video_ids[sample_idx]] + for vector_idx in sample: + if vector_idx >= 0 \ + and vector_ids[sample_idx] != vector_idx: + cands.append( + self.vectoridx_to_videoid[vector_idx] + ) + outputs.append(cands) + return outputs + + +class VectorRetrieverDM(VectorRetriever): + """ + with direct map. + How2 Video Retriver. 
+ Reference usage of FAISS: + https://github.com/fairinternal/fairseq-py/blob/paraphrase_pretraining/fairseq/data/multilingual_faiss_dataset.py + """ + + def __init__( + self, + hidden_size, + cent, + db_type, + examples_per_cent_to_train + ): + super().__init__( + hidden_size, cent, db_type, examples_per_cent_to_train) + self.make_direct_maps_done = False + + def make_direct_maps(self): + faiss.downcast_index(self.db).make_direct_map() + self.make_direct_maps_done = True + + def search( + self, + query_hidden_states, + orig_dist, + ): + if len(self.videoid_to_vectoridx) != self.db.ntotal: + raise ValueError( + len(self.videoid_to_vectoridx), + self.db.ntotal + ) + + if not self.make_direct_maps_done: + self.make_direct_maps() + if self.vectoridx_to_videoid is None: + self.vectoridx_to_videoid = { + self.videoid_to_vectoridx[videoid]: videoid + for videoid in self.videoid_to_vectoridx + } + assert len(self.vectoridx_to_videoid) \ + == len(self.videoid_to_vectoridx) + + # MultilingualFaissDataset uses the following; not sure the reason. + # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10) + queried_dist, index = self.db.search(query_hidden_states, 1) + outputs = [] + for sample_idx, sample in enumerate(index): + # and queried_dist[sample_idx] < thres \ + if sample >= 0 \ + and queried_dist[sample_idx] < orig_dist[sample_idx]: + outputs.append(self.vectoridx_to_videoid[sample]) + else: + outputs.append(None) + return outputs + + def search_by_video_ids( + self, + video_ids, + retri_factor=8 + ): + if len(self.videoid_to_vectoridx) != self.db.ntotal: + raise ValueError( + len(self.videoid_to_vectoridx), + self.db.ntotal + ) + + if not self.make_direct_maps_done: + self.make_direct_maps() + if self.vectoridx_to_videoid is None: + self.vectoridx_to_videoid = { + self.videoid_to_vectoridx[videoid]: videoid + for videoid in self.videoid_to_vectoridx + } + assert len(self.vectoridx_to_videoid) \ + == len(self.videoid_to_vectoridx) + + query_hidden_states = [] + vector_ids = [] + for video_id in video_ids: + vector_id = self.videoid_to_vectoridx[video_id] + vector_ids.append(vector_id) + query_hidden_state = self.db.reconstruct(vector_id) + query_hidden_states.append(query_hidden_state) + query_hidden_states = np.stack(query_hidden_states) + + # MultilingualFaissDataset uses the following; not sure the reason. + # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10) + _, index = self.db.search(query_hidden_states, retri_factor) + outputs = [] + for sample_idx, sample in enumerate(index): + # the first video_id is always the video itself. + cands = [video_ids[sample_idx]] + for vector_idx in sample: + if vector_idx >= 0 \ + and vector_ids[sample_idx] != vector_idx: + cands.append( + self.vectoridx_to_videoid[vector_idx] + ) + outputs.append(cands) + return outputs + + +class MMVectorRetriever(VectorRetrieverDM): + """ + multimodal vector retriver: + text retrieve video or video retrieve text. 
+ """ + + def __init__(self, hidden_size, cent, db_type, examples_per_cent_to_train): + super().__init__( + hidden_size, cent, db_type, examples_per_cent_to_train) + video_db = self.db + super().__init__( + hidden_size, cent, db_type, examples_per_cent_to_train) + text_db = self.db + self.db = {"video": video_db, "text": text_db} + self.video_to_videoid = defaultdict(list) + + def __len__(self): + assert self.db["video"].ntotal == self.db["text"].ntotal + return self.db["video"].ntotal + + def make_direct_maps(self): + faiss.downcast_index(self.db["video"]).make_direct_map() + faiss.downcast_index(self.db["text"]).make_direct_map() + + def save(self, out_dir): + faiss.write_index( + self.db["video"], + os.path.join(out_dir, "video_faiss_idx") + ) + faiss.write_index( + self.db["text"], + os.path.join(out_dir, "text_faiss_idx") + ) + + with open( + os.path.join( + out_dir, "videoid_to_vectoridx.pkl"), + "wb") as fw: + pickle.dump( + self.videoid_to_vectoridx, fw, + protocol=pickle.HIGHEST_PROTOCOL + ) + + def load(self, out_dir): + fn = os.path.join(out_dir, "video_faiss_idx") + video_db = faiss.read_index(fn) + fn = os.path.join(out_dir, "text_faiss_idx") + text_db = faiss.read_index(fn) + self.db = {"video": video_db, "text": text_db} + with open( + os.path.join(out_dir, "videoid_to_vectoridx.pkl"), "rb") as fr: + self.videoid_to_vectoridx = pickle.load(fr) + self.video_to_videoid = defaultdict(list) + + def add(self, hidden_states, video_ids): + """hidden_states is a pair `(video, text)`""" + assert len(hidden_states) == len(video_ids), "{}, {}".format( + str(len(hidden_states)), str(len(video_ids))) + assert len(hidden_states.shape) == 3 + assert len(self.video_to_videoid) == 0 + + valid_idx = [] + for idx, video_id in enumerate(video_ids): + if video_id not in self.videoid_to_vectoridx: + valid_idx.append(idx) + self.videoid_to_vectoridx[video_id] = \ + len(self.videoid_to_vectoridx) + + batch_size = hidden_states.shape[0] + hidden_states = hidden_states[valid_idx] + + hidden_states = np.transpose(hidden_states, (1, 0, 2)).copy() + if not self.db["video"].is_trained: + self.train_cache.append(hidden_states) + train_len = batch_size * len(self.train_cache) + if train_len < self.train_thres: + return + + hidden_states = np.concatenate(self.train_cache, axis=1) + del self.train_cache + self.db["video"].train(hidden_states[0, :self.train_thres]) + self.db["text"].train(hidden_states[1, :self.train_thres]) + self.db["video"].add(hidden_states[0]) + self.db["text"].add(hidden_states[1]) + + def get_clips_by_video_id(self, video_id): + if not self.video_to_videoid: + for video_id, video_clip, text_clip in self.videoid_to_vectoridx: + self.video_to_videoid[video_id].append( + (video_id, video_clip, text_clip)) + return self.video_to_videoid[video_id] + + def search( + self, + video_ids, + target_modality, + retri_factor=8 + ): + if len(self.videoid_to_vectoridx) != len(self): + raise ValueError( + len(self.videoid_to_vectoridx), + len(self) + ) + + if not self.make_direct_maps_done: + self.make_direct_maps() + if self.vectoridx_to_videoid is None: + self.vectoridx_to_videoid = { + self.videoid_to_vectoridx[videoid]: videoid + for videoid in self.videoid_to_vectoridx + } + assert len(self.vectoridx_to_videoid) \ + == len(self.videoid_to_vectoridx) + + src_modality = "text" if target_modality == "video" else "video" + + query_hidden_states = [] + vector_ids = [] + for video_id in video_ids: + vector_id = self.videoid_to_vectoridx[video_id] + vector_ids.append(vector_id) + query_hidden_state = 
self.db[src_modality].reconstruct(vector_id) + query_hidden_states.append(query_hidden_state) + query_hidden_states = np.stack(query_hidden_states) + + # MultilingualFaissDataset uses the following; not sure the reason. + # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10) + _, index = self.db[target_modality].search( + query_hidden_states, retri_factor) + outputs = [] + for sample_idx, sample in enumerate(index): + cands = [] + for vector_idx in sample: + if vector_idx >= 0: + cands.append( + self.vectoridx_to_videoid[vector_idx] + ) + outputs.append(cands) + return outputs diff --git a/examples/MMPT/mmpt/modules/vectorpool.py b/examples/MMPT/mmpt/modules/vectorpool.py new file mode 100644 index 0000000000..d2b23d2da8 --- /dev/null +++ b/examples/MMPT/mmpt/modules/vectorpool.py @@ -0,0 +1,246 @@ +# Copyright (c) Facebook, Inc. All Rights Reserved + +import torch +import os +import numpy as np +import pickle + +from . import retri +from ..utils import get_local_rank + + +class VectorPool(object): + """ + Base class of retrieval space. + """ + + def __init__(self, config): + from transformers import AutoConfig + self.hidden_size = AutoConfig.from_pretrained( + config.dataset.bert_name).hidden_size + self.retriever_cls = getattr(retri, config.retriever_cls) + + def __call__(self, sample, **kwargs): + raise NotImplementedError + + def build_retriver( + self, + retriever_cls=None, + hidden_size=None, + centroids=512, + db_type="flatl2", + examples_per_cent_to_train=48 + ): + + """merge results from multiple gpus and return a retriver..""" + self.retriver = retriever_cls( + hidden_size, centroids, db_type, examples_per_cent_to_train) + return self.retriver + + def __repr__(self): + if hasattr(self, "retriver"): + retriver_name = str(len(self.retriver)) + else: + retriver_name = "no retriver field yet" + return self.__class__.__name__ \ + + "(" + retriver_name + ")" + + +class VideoVectorPool(VectorPool): + """ + average clips of a video as video representation. + """ + def __init__(self, config): + super().__init__(config) + self.build_retriver(self.retriever_cls, self.hidden_size) + + def __call__(self, sample, subsampling, **kwargs): + hidden_states = ( + sample["pooled_video"] + sample["pooled_text"]) / 2. + hidden_states = hidden_states.view( + -1, subsampling, + hidden_states.size(-1)) + hidden_states = torch.mean(hidden_states, dim=1) + hidden_states = hidden_states.cpu().detach().numpy() + video_ids = [] + for offset_idx, video_id in enumerate(sample["video_id"]): + if isinstance(video_id, tuple) and len(video_id) == 3: + # a sharded video_id. + video_id = video_id[0] + video_ids.append(video_id) + assert len(video_ids) == len(hidden_states) + self.retriver.add( + hidden_states.astype("float32"), + video_ids + ) + + +class DistributedVectorPool(VectorPool): + """ + support sync of multiple gpus/nodes. + """ + def __init__(self, config): + super().__init__(config) + self.out_dir = os.path.join( + config.fairseq.checkpoint.save_dir, + "retri") + os.makedirs(self.out_dir, exist_ok=True) + self.hidden_states = [] + self.video_ids = [] + + def build_retriver( + self, + retriever_cls=None, + hidden_size=None, + centroids=4096, + db_type="flatl2", + examples_per_cent_to_train=48 + ): + if retriever_cls is None: + retriever_cls = self.retriever_cls + if hidden_size is None: + hidden_size = self.hidden_size + """merge results from multiple gpus and return a retriver..""" + if torch.distributed.is_initialized(): + self.save() + # sync saving. 
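+            # every rank has already dumped its hidden states / video ids via
+            # self.save(); the barrier below makes sure all shards are on disk
+            # before each rank reads them all back into a single retriever.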
+ torch.distributed.barrier() + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + self.retriver = retriever_cls( + hidden_size, centroids, db_type, examples_per_cent_to_train) + # each gpu process has its own retriever. + for local_rank in range(world_size): + if get_local_rank() == 0: + print("load local_rank", local_rank) + hidden_states, video_ids = self.load(local_rank) + hidden_states = hidden_states.astype("float32") + self.retriver.add(hidden_states, video_ids) + return self.retriver + + def load(self, local_rank): + hidden_states = np.load( + os.path.join( + self.out_dir, + "hidden_state" + str(local_rank) + ".npy" + ) + ) + + with open( + os.path.join( + self.out_dir, "video_id" + str(local_rank) + ".pkl"), + "rb") as fr: + video_ids = pickle.load(fr) + return hidden_states, video_ids + + def save(self): + hidden_states = np.vstack(self.hidden_states) + assert len(hidden_states) == len(self.video_ids), "{}, {}".format( + len(hidden_states), + len(self.video_ids) + ) + local_rank = torch.distributed.get_rank() \ + if torch.distributed.is_initialized() else 0 + + np.save( + os.path.join( + self.out_dir, + "hidden_state" + str(local_rank) + ".npy"), + hidden_states) + + with open( + os.path.join( + self.out_dir, + "video_id" + str(local_rank) + ".pkl"), + "wb") as fw: + pickle.dump( + self.video_ids, + fw, + protocol=pickle.HIGHEST_PROTOCOL + ) + + +class DistributedVideoVectorPool(DistributedVectorPool): + """ + average clips of a video as video representation. + """ + def __call__(self, sample, subsampling, **kwargs): + hidden_states = ( + sample["pooled_video"] + sample["pooled_text"]) / 2. + hidden_states = hidden_states.view( + -1, subsampling, + hidden_states.size(-1)) + hidden_states = torch.mean(hidden_states, dim=1) + hidden_states = hidden_states.cpu().detach().numpy() + video_ids = [] + for offset_idx, video_id in enumerate(sample["video_id"]): + if isinstance(video_id, tuple) and len(video_id) == 3: + # a sharded video_id. + video_id = video_id[0] + video_ids.append(video_id) + assert len(video_ids) == len(hidden_states) + self.hidden_states.append(hidden_states) + self.video_ids.extend(video_ids) + + +# ------------ the following are deprecated -------------- + +class TextClipVectorPool(VectorPool): + def __init__(self, config): + from transformers import AutoConfig + hidden_size = AutoConfig.from_pretrained( + config.dataset.bert_name).hidden_size + retriever_cls = getattr(retri, config.retriever_cls) + self.build_retriver(retriever_cls, hidden_size) + + def __call__(self, sample, **kwargs): + clip_meta = sample["clip_meta"].cpu() + assert torch.all(torch.le(clip_meta[:, 4], clip_meta[:, 5])) + text_meta = [tuple(item.tolist()) for item in clip_meta[:, 3:]] + + if hasattr(self, "retriver"): + # build_retriver is called. + self.retriver.add( + sample["pooled_text"].cpu().numpy().astype("float32"), + text_meta + ) + else: + raise NotImplementedError + + +class MMClipVectorPool(VectorPool): + """ + Multimodal Clip-level vector pool. 
+ """ + def __init__(self, out_dir): + """use hidden_states to store `(video, text)`.""" + """use video_ids to store `(video_id, start, end)`.""" + super().__init__(out_dir) + + def __call__(self, sample, **kwargs): + pooled_video = sample["pooled_video"].cpu().unsqueeze(1).numpy() + pooled_text = sample["pooled_text"].cpu().unsqueeze(1).numpy() + + self.hidden_states.append( + np.concatenate([pooled_video, pooled_text], axis=1) + ) + + video_starts = sample["video_start"].cpu() + video_ends = sample["video_end"].cpu() + assert torch.all(torch.le(video_starts, video_ends)) + + text_starts = sample["text_start"].cpu() + text_ends = sample["text_end"].cpu() + assert torch.all(torch.le(text_starts, text_ends)) + subsample_size = sample["pooled_video"].size(0) // len(sample["video_id"]) + video_ids = [video_id for video_id in sample["video_id"] + for _ in range(subsample_size) + ] + for video_id, video_start, video_end, text_start, text_end in zip( + video_ids, video_starts, video_ends, text_starts, text_ends): + self.video_ids.append(( + video_id, + (int(video_start), int(video_end)), + (int(text_start), int(text_end)) + )) diff --git a/examples/MMPT/mmpt/processors/__init__.py b/examples/MMPT/mmpt/processors/__init__.py new file mode 100644 index 0000000000..434d1d92b9 --- /dev/null +++ b/examples/MMPT/mmpt/processors/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .processor import * + +from .how2processor import * +from .how2retriprocessor import * + +from .dsprocessor import * + +try: + from .rawvideoprocessor import * + from .codecprocessor import * + from .webvidprocessor import * + from .expprocessor import * + from .exphow2processor import * + from .exphow2retriprocessor import * + from .expcodecprocessor import * + from .expfeatureencoder import * + from .expdsprocessor import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/processors/dedupprocessor.py b/examples/MMPT/mmpt/processors/dedupprocessor.py new file mode 100644 index 0000000000..8a1ad402cd --- /dev/null +++ b/examples/MMPT/mmpt/processors/dedupprocessor.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import random +import json +import pickle +from tqdm import tqdm +import os +import numpy as np + + +class CaptionDedupProcessor(object): + """remove overlapping of caption sentences(clip). 
+ Some statistics: + caption: + {'t_clip_len': 246.6448431320854, + 'video_len': 281.09174795676245, + 'clip_tps': 0.8841283727427481, + 'video_tps': 0.7821156477732097, + 'min_clip_len': 0.0, + 'max_clip_len': 398.3, + 'mean_clip_len': 3.196580003006861, + 'num_clip': 77.15897706301081} + + raw_caption: + {'t_clip_len': 238.95908778424115, + 'video_len': 267.5914859862507, + 'clip_tps': 2.4941363624267963, + 'video_tps': 2.258989769647173, + 'min_clip_len': 0.0, + 'max_clip_len': 398.3, + 'mean_clip_len': 3.0537954186814265, + 'num_clip': 78.24986779481756} + """ + + def __init__(self, pkl_file): + with open(pkl_file, "rb") as fd: + self.data = pickle.load(fd) + self.stat = { + "t_clip_len": [], + "video_len": [], + "clip_tps": [], + "video_tps": [], + "clip_len": [], + } + + def __call__(self): + for idx, video_id in enumerate(tqdm(self.data)): + caption = json.loads(self.data[video_id]) + caption = self._dedup(caption) + if idx < 4096: # for the first 4096 examples, compute the statistics. + self.save_stat(video_id, caption) + self.data[video_id] = json.dumps(caption) + self.print_stat() + + def single(self, video_id): + caption = json.loads(self.data[video_id]) + for clip_idx, (start, end, text) in enumerate( + zip(caption["start"], caption["end"], caption["text"]) + ): + print(start, end, text) + print("@" * 100) + caption = self._dedup(caption) + for clip_idx, (start, end, text) in enumerate( + zip(caption["start"], caption["end"], caption["text"]) + ): + print(start, end, text) + print("#" * 100) + self.save_stat(video_id, caption) + self.print_stat() + + def finalize(self, tgt_fn): + with open(tgt_fn, "wb") as fw: + pickle.dump(self.data, fw, pickle.HIGHEST_PROTOCOL) + + def save_stat(self, video_id, caption): + video_fn = os.path.join( + "data/feat/feat_how2_s3d", video_id + ".npy" + ) + if os.path.isfile(video_fn): + with open(video_fn, "rb", 1) as fr: # 24 is the buffer size. buffered + version = np.lib.format.read_magic(fr) + shape, fortran, dtype = np.lib.format._read_array_header(fr, version) + video_len = shape[0] + + t_clip_len = 0.0 + t_tokens = 0 + for idx, (start, end, text) in enumerate( + zip(caption["start"], caption["end"], caption["text"]) + ): + clip_len = ( + (end - max(caption["end"][idx - 1], start)) + if idx > 0 + else end - start + ) + t_clip_len += clip_len + t_tokens += len(text.split(" ")) + self.stat["clip_len"].append(clip_len) + self.stat["t_clip_len"].append(t_clip_len) + self.stat["video_len"].append(video_len) + self.stat["clip_tps"].append(t_tokens / t_clip_len) + self.stat["video_tps"].append(t_tokens / video_len) + + def print_stat(self): + result = { + "t_clip_len": np.mean(self.stat["t_clip_len"]), + "video_len": np.mean(self.stat["video_len"]), + "clip_tps": np.mean(self.stat["clip_tps"]), + "video_tps": np.mean(self.stat["video_tps"]), + "min_clip_len": min(self.stat["clip_len"]), + "max_clip_len": max(self.stat["clip_len"]), + "mean_clip_len": np.mean(self.stat["clip_len"]), + "num_clip": len(self.stat["clip_len"]) / len(self.stat["video_tps"]), + } + print(result) + + def _dedup(self, caption): + def random_merge(end_idx, start, end, text, starts, ends, texts): + if random.random() > 0.5: + # print(clip_idx, "[PARTIAL INTO PREV]", end_idx) + # overlapped part goes to the end of previous. + ends[-1] = max(ends[-1], start) # ? + rest_text = text[end_idx:].strip() + if rest_text: + starts.append(max(ends[-1], start)) + ends.append(max(end, starts[-1])) + texts.append(rest_text) + else: # goes to the beginning of the current. 
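+                # otherwise hand the overlap to the current clip: drop the
+                # duplicated suffix from the previous caption, or drop the
+                # previous clip entirely if nothing would be left of it.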
+ # strip the previous. + left_text = texts[-1][:-end_idx].strip() + if left_text: + # print(clip_idx, "[PREV PARTIAL INTO CUR]", end_idx) + ends[-1] = min(ends[-1], start) + texts[-1] = left_text + else: + # print(clip_idx, "[PREV LEFT NOTHING ALL INTO CUR]", end_idx) + starts.pop(-1) + ends.pop(-1) + texts.pop(-1) + starts.append(start) + ends.append(end) + texts.append(text) + + starts, ends, texts = [], [], [] + for clip_idx, (start, end, text) in enumerate( + zip(caption["start"], caption["end"], caption["text"]) + ): + if not isinstance(text, str): + continue + text = text.replace("\n", " ").strip() + if len(text) == 0: + continue + starts.append(start) + ends.append(end) + texts.append(text) + break + + for clip_idx, (start, end, text) in enumerate( + zip( + caption["start"][clip_idx + 1:], + caption["end"][clip_idx + 1:], + caption["text"][clip_idx + 1:], + ) + ): + if not isinstance(text, str): + continue + text = text.replace("\n", " ").strip() + if len(text) == 0: + continue + + # print(clip_idx, texts[-5:]) + # print(clip_idx, start, end, text) + if texts[-1].endswith(text): # subset of prev caption -> merge + # print(clip_idx, "[MERGE INTO PREV]") + ends[-1] = max(ends[-1], end) + elif text.startswith(texts[-1]): # superset of prev caption -> merge + # print(clip_idx, "[PREV MERGE INTO CUR]") + texts[-1] = text + starts[-1] = min(starts[-1], start) + ends[-1] = max(ends[-1], end) + else: # overlapping or non-overlapping. + for end_idx in range(1, len(text) + 1): + if texts[-1].endswith(text[:end_idx]): + random_merge(end_idx, start, end, text, starts, ends, texts) + break + else: + starts.append(start) + ends.append(end) + texts.append(text) + + assert (ends[-1] + 0.001) >= starts[-1] and len( + texts[-1] + ) > 0, "{} {} {} <- {} {} {}, {} {} {}".format( + str(starts[-1]), + str(ends[-1]), + texts[-1], + caption["start"][clip_idx - 1], + caption["end"][clip_idx - 1], + caption["text"][clip_idx - 1], + str(start), + str(end), + text, + ) + + return {"start": starts, "end": ends, "text": texts} + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="dedup how2 caption") + parser.add_argument('--how2dir', default="data/how2") + args = parser.parse_args() + + raw_caption_json = os.path.join(args.how2dir, "raw_caption.json") + raw_caption_pickle = os.path.join(args.how2dir, "raw_caption.pkl") + raw_caption_dedup_pickle = os.path.join(args.how2dir, "raw_caption_dedup.pkl") + + def convert_to_pickle(src_fn, tgt_fn): + with open(src_fn) as fd: + captions = json.load(fd) + + for video_id in captions: + captions[video_id] = json.dumps(captions[video_id]) + + with open(tgt_fn, "wb") as fw: + pickle.dump(captions, fw, pickle.HIGHEST_PROTOCOL) + + if not os.path.isfile(raw_caption_pickle): + convert_to_pickle(raw_caption_json, raw_caption_pickle) + + deduper = CaptionDedupProcessor(raw_caption_pickle) + deduper() + deduper.finalize(raw_caption_dedup_pickle) + + """ + # demo + deduper = CaptionDedupProcessor("data/how2/raw_caption.pkl") + deduper.single("HfIeQ9pzL5U") + """ diff --git a/examples/MMPT/mmpt/processors/dsprocessor.py b/examples/MMPT/mmpt/processors/dsprocessor.py new file mode 100644 index 0000000000..ecebf0eea5 --- /dev/null +++ b/examples/MMPT/mmpt/processors/dsprocessor.py @@ -0,0 +1,848 @@ +# Copyright (c) Facebook, Inc. All Rights Reserved + +""" +Processors for all downstream (ds) tasks. 
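+
+Each downstream dataset (MSRVTT, Youcook, CrossTask, COIN, DiDeMo) defines its
+own MetaProcessor / TextProcessor / Aligner variants (plus VideoProcessors
+where the features need extra handling). A rough sketch of how an aligner is
+driven (names and shapes are illustrative only):
+
+    aligner = DSAligner(config)
+    batch = aligner(video_id, video_feature, text_feature)
+    # -> {"caps", "cmasks", "vfeats", "vmasks", "video_id"}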
+""" + +import json +import os +import pickle +import random +import math +import numpy as np +import torch + +from collections import defaultdict + +from .processor import ( + MetaProcessor, + VideoProcessor, + TextProcessor, + Aligner, + MMAttentionMask2DProcessor, +) + +from .how2processor import TextGenerationProcessor + + +# ------------- A General Aligner for all downstream tasks----------------- + + +class DSAligner(Aligner): + """ + Downstream (DS) aligner shared by all datasets. + """ + + def __call__(self, video_id, video_feature, text_feature, wps=0.7): + # random sample a starting sec for video. + video_start = 0 + video_end = min(len(video_feature), self.max_video_len) + # the whole sequence is a single clip. + video_clips = {"start": [video_start], "end": [video_end]} + + text_feature = { + "cap": [text_feature], + "start": [video_start], + "end": [len(text_feature) / wps], + } + text_clip_indexs = [0] + + vfeats, vmasks = self._build_video_seq( + video_feature, video_clips + ) + caps, cmasks = self._build_text_seq( + text_feature, text_clip_indexs + ) + + return { + "caps": caps, + "cmasks": cmasks, + "vfeats": vfeats, + "vmasks": vmasks, + "video_id": video_id, + } + + +class NLGTextProcessor(TextProcessor): + """ + Also return the original text as ref. + """ + def __call__(self, text_id): + return super().__call__(text_id), text_id + + +class DSNLGAligner(DSAligner): + """extend with the capability of 2d mask for generation.""" + def __init__(self, config): + super().__init__(config) + self.attnmasker = MMAttentionMask2DProcessor() + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + self.bert_name, use_fast=self.use_fast, + bos_token="[CLS]", eos_token="[SEP]" + ) + self.tokenizer = tokenizer + self.bos_token_id = tokenizer.bos_token_id + self.eos_token_id = tokenizer.eos_token_id + self.textgen = TextGenerationProcessor(tokenizer) + + def __call__(self, video_id, video_feature, text_feature): + output = super().__call__(video_id, video_feature, text_feature[0]) + if self.split == "test": + # output.update({"ref": text_feature[1]}) + output.update({"ref": self.tokenizer.decode( + output["caps"], skip_special_tokens=True)}) + text_label = output["caps"] + cmasks = torch.BoolTensor([1] * text_label.size(0)) + caps = torch.LongTensor([ + self.cls_token_id, + self.sep_token_id, + self.bos_token_id]) + else: + caps, text_label = self.textgen(output["caps"]) + cmasks = output["cmasks"] + + attention_mask = self.attnmasker( + output["vmasks"], cmasks, "textgen") + + output.update({ + "caps": caps, + "cmasks": cmasks, + "text_label": text_label, + "attention_mask": attention_mask, + }) + return output + + +# -------------------- MSRVTT ------------------------ + + +class MSRVTTMetaProcessor(MetaProcessor): + """MSRVTT dataset. + reference: `howto100m/msrvtt_dataloader.py` + """ + + def __init__(self, config): + super().__init__(config) + import pandas as pd + data = pd.read_csv(self._get_split_path(config)) + # TODO: add a text1ka flag. + if config.split == "train" \ + and config.full_test_path is not None \ + and config.jsfusion_path is not None: + # add testing videos from full_test_path not used by jfusion. 
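+            # full_test_path lists every test video; only the ones absent from
+            # the JSFusion test split are appended below as extra training rows.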
+ additional_data = pd.read_csv(config.full_test_path) + jsfusion_data = pd.read_csv(config.jsfusion_path) + + for video_id in additional_data["video_id"]: + if video_id not in jsfusion_data["video_id"].values: + data = data.append( + {"video_id": video_id}, ignore_index=True) + + if config.dup is not None and config.split == "train": + data = data.append([data] * (config.dup - 1), ignore_index=True) + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + """slightly modify with if condition to combine train/test.""" + vid, sentence = None, None + vid = self.data["video_id"].values[idx] + if "sentence" in self.data: # for testing. + sentence = self.data["sentence"].values[idx] + else: # for training. + sentence = vid + return vid, sentence + + +class MSRVTTTextProcessor(TextProcessor): + """MSRVTT dataset. + reference: `msrvtt_dataloader.py` `MSRVTT_TrainDataLoader`. + TODO (huxu): add max_words. + """ + + def __init__(self, config): + super().__init__(config) + self.sentences = None + if config.json_path is not None and config.split == "train": + with open(config.json_path) as fd: + self.data = json.load(fd) + self.sentences = defaultdict(list) + for s in self.data["sentences"]: + self.sentences[s["video_id"]].append(s["caption"]) + + def __call__(self, text_id): + if self.sentences is not None: + rind = random.randint(0, len(self.sentences[text_id]) - 1) + sentence = self.sentences[text_id][rind] + else: + sentence = text_id + caption = self.tokenizer(sentence, add_special_tokens=False) + return caption["input_ids"] + + +class MSRVTTNLGTextProcessor(MSRVTTTextProcessor): + """TODO: change dsaligner and merge to avoid any NLG text processor.""" + def __call__(self, text_id): + if self.sentences is not None: + rind = random.randint(0, len(self.sentences[text_id]) - 1) + sentence = self.sentences[text_id][rind] + else: + sentence = text_id + caption = self.tokenizer(sentence, add_special_tokens=False) + return caption["input_ids"], sentence + + +class MSRVTTQAMetaProcessor(MetaProcessor): + """MSRVTT-QA: retrieval-based multi-choice QA from JSFusion dataset. + For simplicity, we use the train retrieval model. + reference: `https://github.com/yj-yu/lsmdc` + """ + + def __init__(self, config): + super().__init__(config) + import pandas as pd + csv_data = pd.read_csv(self._get_split_path(config), sep="\t") + data = [] + for video_id, a1, a2, a3, a4, a5, answer in zip( + csv_data["vid_key"].values, + csv_data["a1"].values, + csv_data["a2"].values, + csv_data["a3"].values, + csv_data["a4"].values, + csv_data["a5"].values, + csv_data["answer"].values): + video_id = video_id.replace("msr", "video") + data.append((video_id, (answer, [a1, a2, a3, a4, a5]))) + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + +class MSRVTTQATextProcessor(TextProcessor): + """MSRVTT-QA dataset. + text_ans is of format `(answer, [a1, a2, a3, a4, a5])`. + """ + + def __call__(self, text_ans): + for ans_idx, ans in enumerate(text_ans[1]): + if isinstance(ans, str): + text_ans[1][ans_idx] = self.tokenizer(ans, add_special_tokens=False)["input_ids"] + return text_ans + + +class MSRVTTQAAligner(DSAligner): + """MSRVTT dataset. + similar to sample in how2. + we call __call__ multiple times. 
+ """ + + def __call__(self, video_id, video_feature, text_feature, wps=0.7): + caps = [] + cmasks = [] + answer = text_feature[0] + for ans_idx, _text_feature in enumerate(text_feature[1]): + output = super().__call__( + video_id, video_feature, _text_feature, wps) + caps.append(output["caps"]) + cmasks.append(output["cmasks"]) + output.update({ + "caps": torch.stack(caps), + "cmasks": torch.stack(cmasks), + "answers": torch.LongTensor([answer]), + }) + return output + + +# -------------------- Youcook ----------------------- + + +class YoucookMetaProcessor(MetaProcessor): + """Youcook dataset. + reference: `howto100m/youcook_dataloader.py` + note that the data can be different as the + (1) some videos already in Howto100m are removed. + (2) stop words are removed from caption + TODO (huxu): make a flag to load the original caption. + (see youcookii_annotations_trainval.json). + + The max_video_len can be 264 and text can be 64 tokens. + In reality we may not need that long. see projects/task/youcook.yaml + """ + + def __init__(self, config): + super().__init__(config) + vfeat_dir = config.vfeat_dir + print(self._get_split_path(config)) + with open(self._get_split_path(config), "rb") as fd: + data = pickle.load(fd) + all_valid_video_ids = set( + [os.path.splitext(fn)[0] for fn in os.listdir(vfeat_dir)] + ) + recs = [] + video_ids = set() + valid_video_ids = set() + for rec in data: # filter videos not available. + udl_idx = rec["id"].rindex("_") + video_id = rec["id"][:udl_idx] + video_ids.add(video_id) + if video_id in all_valid_video_ids: + valid_video_ids.add(video_id) + recs.append(rec) + print("total video_ids in .pkl", len(video_ids)) + print("valid video_ids in .pkl", len(valid_video_ids)) + print("please verify {train,val}_list.txt") + data = recs + self.data = data + + with open(config.trainval_annotation) as fd: + self.youcook_annotation = json.load(fd)["database"] + if config.use_annotation_text is True: + print("using text in annotation.") + self.use_annotation_caption = True + else: + self.use_annotation_caption = False + + def __getitem__(self, idx): + def _get_video_and_caption(rec): + vid = rec["id"] + udl_idx = vid.rindex("_") + video_id, clip_id = vid[:udl_idx], int(vid[udl_idx + 1:]) + clip = self.youcook_annotation[video_id]["annotations"][clip_id] + start, end = clip["segment"] + if self.use_annotation_caption: + caption = clip["sentence"] + else: + caption = rec["caption"] + return (video_id, start, end), caption + + rec = self.data[idx] + video_info, text_info = _get_video_and_caption(rec) + return video_info, text_info + + +class YoucookVideoProcessor(VideoProcessor): + """video_fn is a tuple of (video_id, start, end) now.""" + + def __call__(self, video_fn): + video_id, start, end = video_fn + feat = np.load(os.path.join(self.vfeat_dir, video_id + ".npy")) + return feat[start:end] + + +class YoucookNLGMetaProcessor(MetaProcessor): + """NLG uses the original split: + `train_list.txt` and `val_list.txt` + """ + + def __init__(self, config): + super().__init__(config) + vfeat_dir = config.vfeat_dir + print(self._get_split_path(config)) + with open(self._get_split_path(config)) as fd: + video_ids = [ + line.strip().split("/")[1] for line in fd.readlines()] + print("total video_ids in train/val_list.txt", len(video_ids)) + + all_valid_video_ids = set( + [os.path.splitext(fn)[0] for fn in os.listdir(vfeat_dir)] + ) + video_ids = [ + video_id for video_id in video_ids + if video_id in all_valid_video_ids] + + print("valid video_ids in train/val_list.txt", len(video_ids)) 
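+        # config.trainval_annotation stores per-video segments together with
+        # their sentences; they are flattened below into
+        # ((video_id, start, end), caption) pairs.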
+ with open(config.trainval_annotation) as fd: + self.youcook_annotation = json.load(fd)["database"] + + data = [] + for video_id in video_ids: + for clip in self.youcook_annotation[video_id]["annotations"]: + start, end = clip["segment"] + caption = clip["sentence"] + data.append(((video_id, start, end), caption)) + self.data = data + + def __getitem__(self, idx): + return self.data[idx] + + +# --------------------- CrossTask ------------------------- + +class CrossTaskMetaProcessor(MetaProcessor): + def __init__(self, config): + super().__init__(config) + np.random.seed(0) # deterministic random split. + task_vids = self._get_vids( + config.train_csv_path, + config.vfeat_dir, + config.annotation_path) + + val_vids = self._get_vids( + config.val_csv_path, + config.vfeat_dir, + config.annotation_path) + + # filter out those task and vids appear in val_vids. + task_vids = { + task: [ + vid for vid in vids + if task not in val_vids or vid not in val_vids[task]] + for task, vids in task_vids.items()} + + primary_info = self._read_task_info(config.primary_path) + test_tasks = set(primary_info['steps'].keys()) + + # if args.use_related: + related_info = self._read_task_info(config.related_path) + task_steps = {**primary_info['steps'], **related_info['steps']} + n_steps = {**primary_info['n_steps'], **related_info['n_steps']} + # else: + # task_steps = primary_info['steps'] + # n_steps = primary_info['n_steps'] + all_tasks = set(n_steps.keys()) + # filter and keep task in primary or related. + task_vids = { + task: vids for task, vids in task_vids.items() + if task in all_tasks} + # vocab-by-step matrix (A) and vocab (M) + # (huxu): we do not use BoW. + # A, M = self._get_A(task_steps, share="words") + + train_vids, test_vids = self._random_split( + task_vids, test_tasks, config.n_train) + print("train_num_videos", sum(len(vids) for vids in train_vids.values())) + print("test_num_videos", sum(len(vids) for vids in test_vids.values())) + # added by huxu to automatically determine the split. + split_map = { + "train": train_vids, + "valid": test_vids, + "test": test_vids + } + task_vids = split_map[config.split] + + self.vids = [] + for task, vids in task_vids.items(): + self.vids.extend([(task, vid) for vid in vids]) + self.task_steps = task_steps + self.n_steps = n_steps + + def __getitem__(self, idx): + task, vid = self.vids[idx] + n_steps = self.n_steps[task] + steps = self.task_steps[task] + assert len(steps) == n_steps + return (task, vid, steps, n_steps), (task, vid, steps, n_steps) + + def __len__(self): + return len(self.vids) + + def _random_split(self, task_vids, test_tasks, n_train): + train_vids = {} + test_vids = {} + for task, vids in task_vids.items(): + if task in test_tasks and len(vids) > n_train: + train_vids[task] = np.random.choice( + vids, n_train, replace=False).tolist() + test_vids[task] = [ + vid for vid in vids if vid not in train_vids[task]] + else: + train_vids[task] = vids + return train_vids, test_vids + + def _get_vids(self, path, vfeat_dir, annotation_path): + """refactored from + https://github.com/DmZhukov/CrossTask/blob/master/data.py + changes: add `vfeat_dir` to check if the video is available. + add `annotation_path` to check if the video is available. + """ + + task_vids = {} + with open(path, 'r') as f: + for line in f: + task, vid, url = line.strip().split(',') + # double check the video is available. + if not os.path.exists( + os.path.join(vfeat_dir, vid + ".npy")): + continue + # double check the annotation is available. 
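+                # step assignments are stored as `<task>_<vid>.csv` under
+                # annotation_path; skip videos without one.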
+ if not os.path.exists(os.path.join( + annotation_path, + task + "_" + vid + ".csv")): + continue + if task not in task_vids: + task_vids[task] = [] + task_vids[task].append(vid) + return task_vids + + def _read_task_info(self, path): + titles = {} + urls = {} + n_steps = {} + steps = {} + with open(path, 'r') as f: + idx = f.readline() + while idx != '': + idx = idx.strip() + titles[idx] = f.readline().strip() + urls[idx] = f.readline().strip() + n_steps[idx] = int(f.readline().strip()) + steps[idx] = f.readline().strip().split(',') + next(f) + idx = f.readline() + return { + 'title': titles, + 'url': urls, + 'n_steps': n_steps, + 'steps': steps + } + + def _get_A(self, task_steps, share="words"): + raise ValueError("running get_A is not allowed for BERT.") + """Step-to-component matrices.""" + if share == 'words': + # share words + task_step_comps = { + task: [step.split(' ') for step in steps] + for task, steps in task_steps.items()} + elif share == 'task_words': + # share words within same task + task_step_comps = { + task: [[task+'_'+tok for tok in step.split(' ')] for step in steps] + for task, steps in task_steps.items()} + elif share == 'steps': + # share whole step descriptions + task_step_comps = { + task: [[step] for step in steps] for task, steps in task_steps.items()} + else: + # no sharing + task_step_comps = { + task: [[task+'_'+step] for step in steps] + for task, steps in task_steps.items()} + # BERT tokenizer here? + vocab = [] + for task, steps in task_step_comps.items(): + for step in steps: + vocab.extend(step) + vocab = {comp: m for m, comp in enumerate(set(vocab))} + M = len(vocab) + A = {} + for task, steps in task_step_comps.items(): + K = len(steps) + a = torch.zeros(M, K) + for k, step in enumerate(steps): + a[[vocab[comp] for comp in step], k] = 1 + a /= a.sum(dim=0) + A[task] = a + return A, M + + +class CrossTaskVideoProcessor(VideoProcessor): + def __call__(self, video_fn): + task, vid, steps, n_steps = video_fn + video_fn = os.path.join(self.vfeat_dir, vid + ".npy") + feat = np.load(video_fn) + return feat + + +class CrossTaskTextProcessor(TextProcessor): + def __call__(self, text_id): + task, vid, steps, n_steps = text_id + step_ids = [] + for step_str in steps: + step_ids.append( + self.tokenizer(step_str, add_special_tokens=False)["input_ids"] + ) + return step_ids + + +class CrossTaskAligner(Aligner): + """ + TODO: it's not clear yet the formulation of the task; finish this later. + """ + def __init__(self, config): + super().__init__(config) + self.annotation_path = config.annotation_path + self.sliding_window = config.sliding_window + self.sliding_window_size = config.sliding_window_size + + def __call__(self, video_id, video_feature, text_feature): + task, vid, steps, n_steps = video_id + annot_path = os.path.join( + self.annotation_path, task + '_' + vid + '.csv') + video_len = len(video_feature) + + labels = torch.from_numpy(self._read_assignment( + video_len, n_steps, annot_path)).float() + + vfeats, vmasks, targets = [], [], [] + # sliding window on video features and targets. 
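+        # each window covers at most `sliding_window_size` feature frames and
+        # advances by `sliding_window` frames; the resulting (possibly
+        # overlapping) clips are stacked into a batch further below.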
+ for window_start in range(0, video_len, self.sliding_window): + video_start = 0 + video_end = min(video_len - window_start, self.sliding_window_size) + video_clip = {"start": [video_start], "end": [video_end]} + + vfeat, vmask = self._build_video_seq( + video_feature[window_start: window_start + video_end], + video_clip + ) + + target = labels[window_start: window_start + video_end] + assert len(vfeat) >= len(target), "{},{}".format(len(vfeat), len(target)) + # TODO: randomly drop all zero targets for training ? + # if self.split == "train" and target.sum() == 0: + # continue + vfeats.append(vfeat) + vmasks.append(vmask) + targets.append(target) + + if (video_len - window_start) <= self.sliding_window_size: + break + + vfeats = torch.stack(vfeats) + vmasks = torch.stack(vmasks) + targets = torch.cat(targets, dim=0) + + caps, cmasks = [], [] + for step in text_feature: + step_text_feature = {"start": [0], "end": [1], "cap": [step]} + step_text_clip_index = [0] + cap, cmask = self._build_text_seq( + step_text_feature, step_text_clip_index + ) + caps.append(cap) + cmasks.append(cmask) + caps = torch.stack(caps) + cmasks = torch.stack(cmasks) + + return { + "caps": caps, + "cmasks": cmasks, + "vfeats": vfeats, # X for original code. + "vmasks": vmasks, + "targets": targets, + "video_id": vid, + "task": task, + "video_len": video_len # for later checking. + } + + def _read_assignment(self, T, K, path): + """ + refactored from https://github.com/DmZhukov/CrossTask/blob/master/data.py + Howto interpret contraints on loss that is going to be minimized: + lambd is a big number; + self.lambd * C is a big number for all valid position (csv stores invalids) + + def forward(self, O, Y, C): + return (Y*(self.lambd * C - self.lsm(O))).mean(dim=0).sum() + + This will load the csv file and fill-in the step col from start to end rows. + """ + + Y = np.zeros([T, K], dtype=np.uint8) + with open(path, 'r') as f: + for line in f: + step, start, end = line.strip().split(',') + start = int(math.floor(float(start))) + end = int(math.ceil(float(end))) + step = int(step) - 1 + Y[start:end, step] = 1 + return Y + + +# --------------------- COIN ------------------------- + +class MetaTextBinarizer(Aligner): + def __call__(self, text_feature): + text_feature = { + "cap": [text_feature], + "start": [0.], + "end": [100.], + } + text_clip_indexs = [0] + + caps, cmasks = self._build_text_seq( + text_feature, text_clip_indexs + ) + return {"caps": caps, "cmasks": cmasks} + + +class COINActionSegmentationMetaProcessor(MetaProcessor): + split_map = { + "train": "training", + "valid": "testing", + "test": "testing", + } + + def __init__(self, config): + super().__init__(config) + with open(self._get_split_path(config)) as fr: + database = json.load(fr)["database"] + id2label = {} + data = [] + # filter the data by split. 
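+        # two passes: the first builds the label vocabulary from the testing
+        # subset, the second (further below) collects the segments of the
+        # active split.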
+ for video_id, rec in database.items(): + # always use testing to determine label_set + if rec["subset"] == "testing": + for segment in rec["annotation"]: + id2label[int(segment["id"])] = segment["label"] + # text_labels is used for ZS setting + self.text_labels = ["none"] * len(id2label) + for label_id in id2label: + self.text_labels[label_id-1] = id2label[label_id] + + id2label[0] = "O" + print("num of labels", len(id2label)) + + for video_id, rec in database.items(): + if not os.path.isfile(os.path.join(config.vfeat_dir, video_id + ".npy")): + continue + if rec["subset"] == COINActionSegmentationMetaProcessor.split_map[self.split]: + starts, ends, labels = [], [], [] + for segment in rec["annotation"]: + start, end = segment["segment"] + label = int(segment["id"]) + starts.append(start) + ends.append(end) + labels.append(label) + data.append( + (video_id, {"start": starts, "end": ends, "label": labels})) + self.data = data + + def meta_text_labels(self, config): + from transformers import default_data_collator + from ..utils import get_local_rank + + text_processor = TextProcessor(config) + binarizer = MetaTextBinarizer(config) + # TODO: add prompts to .yaml. + text_labels = [label for label in self.text_labels] + + if get_local_rank() == 0: + print(text_labels) + + outputs = [] + for text_label in text_labels: + text_feature = text_processor(text_label) + outputs.append(binarizer(text_feature)) + return default_data_collator(outputs) + + def __getitem__(self, idx): + return self.data[idx] + + +class COINActionSegmentationTextProcessor(TextProcessor): + def __call__(self, text_label): + return text_label + + +class COINActionSegmentationAligner(Aligner): + def __init__(self, config): + super().__init__(config) + self.sliding_window = config.sliding_window + self.sliding_window_size = config.sliding_window_size + + def __call__(self, video_id, video_feature, text_feature): + starts, ends, label_ids = text_feature["start"], text_feature["end"], text_feature["label"] + # sliding window. + video_len = len(video_feature) + + vfeats, vmasks, targets = [], [], [] + # sliding window on video features and targets. + for window_start in range(0, video_len, self.sliding_window): + video_start = 0 + video_end = min(video_len - window_start, self.sliding_window_size) + video_clip = {"start": [video_start], "end": [video_end]} + vfeat, vmask = self._build_video_seq( + video_feature[window_start: window_start + video_end], + video_clip + ) + # covers video length only. 
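+            # -100 marks frames outside the valid mask so that losses using the
+            # default cross-entropy ignore_index skip them; valid frames start
+            # as background (0) and are overwritten with segment labels below.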
+ target = torch.full_like(vmask, -100, dtype=torch.long) + target[vmask] = 0 + for start, end, label_id in zip(starts, ends, label_ids): + if (window_start < end) and (start < (window_start + video_end)): + start_offset = max(0, math.floor(start) - window_start) + end_offset = min(video_end, math.ceil(end) - window_start) + target[start_offset:end_offset] = label_id + vfeats.append(vfeat) + vmasks.append(vmask) + targets.append(target) + if (video_len - window_start) <= self.sliding_window_size: + break + + vfeats = torch.stack(vfeats) + vmasks = torch.stack(vmasks) + targets = torch.stack(targets) + video_targets = torch.full((video_len,), 0) + for start, end, label_id in zip(starts, ends, label_ids): + start_offset = max(0, math.floor(start)) + end_offset = min(video_len, math.ceil(end)) + video_targets[start_offset:end_offset] = label_id + + caps = torch.LongTensor( + [[self.cls_token_id, self.sep_token_id, + self.pad_token_id, self.sep_token_id]], + ).repeat(vfeats.size(0), 1) + cmasks = torch.BoolTensor( + [[0, 1, 0, 1]] # pad are valid for attention. + ).repeat(vfeats.size(0), 1) + return { + "caps": caps, + "cmasks": cmasks, + "vfeats": vfeats, # X for original code. + "vmasks": vmasks, + "targets": targets, + "video_id": video_id, + "video_len": video_len, # for later checking. + "video_targets": video_targets + } + + +class DiDeMoMetaProcessor(MetaProcessor): + """reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/data_processing.py + """ + def __init__(self, config): + super().__init__(config) + + assert "test" in self._get_split_path(config), "DiDeMo only supports zero-shot testing for now." + + with open(self._get_split_path(config)) as data_file: + json_data = json.load(data_file) + + data = [] + for record in json_data: + data.append((record["video"], record["description"])) + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + +class DiDeMoTextProcessor(TextProcessor): + """reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py + https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/data_processing.py + """ + + def __call__(self, text): + return self.tokenizer(text, add_special_tokens=False)["input_ids"] + + +class DiDeMoAligner(DSAligner): + """ + check video length. + """ + + def __call__(self, video_id, video_feature, text_feature): + # print(video_feature.shape[0]) + return super().__call__(video_id, video_feature, text_feature) diff --git a/examples/MMPT/mmpt/processors/how2processor.py b/examples/MMPT/mmpt/processors/how2processor.py new file mode 100644 index 0000000000..bed2168b1d --- /dev/null +++ b/examples/MMPT/mmpt/processors/how2processor.py @@ -0,0 +1,887 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright (c) Facebook, Inc. All Rights Reserved + + +import torch +import math +import pickle +import random +import os +import numpy as np + +from collections import deque +from typing import Optional, Tuple, List +from .processor import ( + Processor, + MetaProcessor, + TextProcessor, + Aligner, + MMAttentionMask2DProcessor +) + +from ..utils import ShardedTensor + + +class How2MetaProcessor(MetaProcessor): + def __init__(self, config): + super().__init__(config) + path = self._get_split_path(config) + with open(path) as fd: + self.data = [line.strip() for line in fd] + + def __getitem__(self, idx): + video_id = self.data[idx] + return video_id, video_id + + +class ShardedHow2MetaProcessor(How2MetaProcessor): + def __init__(self, config): + super().__init__(config) + self.split = str(config.split) + self.vfeat_dir = config.vfeat_dir + self._init_shard() + + def _init_shard(self): + if self.split == "train": + meta_fn = os.path.join(self.vfeat_dir, "train" + "_meta.pkl") + with open(meta_fn, "rb") as fr: + meta = pickle.load(fr) + elif self.split == "valid": + meta_fn = os.path.join(self.vfeat_dir, "val" + "_meta.pkl") + with open(meta_fn, "rb") as fr: + meta = pickle.load(fr) + elif self.split == "test": + print("use how2 val as test.") + meta_fn = os.path.join(self.vfeat_dir, "val" + "_meta.pkl") + with open(meta_fn, "rb") as fr: + meta = pickle.load(fr) + else: + raise ValueError("unsupported for MetaProcessor:", self.split) + video_id_to_shard = {} + for shard_id in meta: + for video_idx, video_id in enumerate(meta[shard_id]): + video_id_to_shard[video_id] = (shard_id, video_idx) + self.video_id_to_shard = video_id_to_shard + + def __getitem__(self, idx): + video_id, video_id = super().__getitem__(idx) + shard_id, shard_idx = self.video_id_to_shard[video_id] + meta = (video_id, idx, shard_id, shard_idx) + return meta, meta + + +class ShardedVideoProcessor(Processor): + """ + mmaped shards of numpy video features. 
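+
+    A sketch of the expected call convention (ids below are made up):
+
+        processor = ShardedVideoProcessor(config)
+        # video_id comes from ShardedHow2MetaProcessor as
+        # (video_id, dataset_idx, shard_id, index_within_shard)
+        feat = processor(("vid001", 0, 3, 17))  # -> the video's feature array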
+ """ + + def __init__(self, config): + self.split = str(config.split) + self.vfeat_dir = config.vfeat_dir + + def __call__(self, video_id): + _, _, shard_id, video_idx = video_id + if self.split == "train": + shard = ShardedTensor.load( + os.path.join(self.vfeat_dir, "train" + "_" + str(shard_id)), + "r" + ) + elif self.split == "valid": + shard = ShardedTensor.load( + os.path.join(self.vfeat_dir, "val" + "_" + str(shard_id)), + "r" + ) + elif self.split == "test": + shard = ShardedTensor.load( + os.path.join(self.vfeat_dir, "val" + "_" + str(shard_id)), + "r" + ) + else: + raise ValueError("unknown split", self.split) + feat = shard[video_idx] + return feat + + +class ShardedTextProcessor(Processor): + def __init__(self, config): + self.tfeat_dir = str(config.tfeat_dir) + self.split = str(config.split) + + def __call__(self, video_id): + _, _, shard_id, shard_idx = video_id + if self.split == "train": + target_path = self.tfeat_dir + "train" + "_" + str(shard_id) + elif self.split == "valid": + target_path = self.tfeat_dir + "val" + "_" + str(shard_id) + elif self.split == "test": + target_path = self.tfeat_dir + "val" + "_" + str(shard_id) + else: + raise ValueError("unknown split", self.split) + + startend = ShardedTensor.load( + target_path + ".startends", "r")[shard_idx] + cap_ids = ShardedTensor.load( + target_path + ".caps_ids", "r")[shard_idx] + cap = [] + for clip_idx in range(len(cap_ids)): + clip = cap_ids[clip_idx] + cap.append(clip[clip != -1].tolist()) + start, end = startend[:, 0].tolist(), startend[:, 1].tolist() + return {"start": start, "end": end, "cap": cap} + + +class FixedLenAligner(Aligner): + """ + In the model we assume text is on the left (closer to BERT formulation) + and video is on the right. + We fix the total length of text + video. + max_video_len is in number of secs. + max_text_len is in number of tokens. + + special tokens formats: + we use the format [CLS] [SEP] text tokens [SEP] [PAD] ... + [CLS] will be splitted out into: + [CLS] video tokens [SEP] text tokens [SEP] [PAD] ... + token_type_ids will be generated by the model (for now). + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + so each sequence owns a [SEP] token for no-ops. + """ + + def __init__(self, config): + super().__init__(config) + self.text_clip_sampler = TextClipSamplingProcessor( + self.max_len - self.max_video_len - 3 + ) + """ + decide subsampling: + `config.subsampling` will change batch_size in trainer. + `config.clip_per_video` (used by RetriTask) doesn't + change batch_size in trainer. 
+ """ + subsampling = config.subsampling \ + if config.subsampling is not None else None + if config.clip_per_video is not None: + subsampling = config.clip_per_video + self.subsampling = subsampling + + def _get_text_maxlen(self): + # use max text len + return self.text_clip_sampler.max_text_len + + def __call__(self, video_id, video_feature, text_feature): + from transformers import default_data_collator + video_idx = video_id[1] + if self.subsampling is not None and self.subsampling >= 1: + batch = [] + for _ in range(self.subsampling): + centerclip_idx = random.randint( + 0, len(text_feature["start"]) - 1) + batch.append( + self.sampling( + video_idx, + video_feature, + text_feature, + centerclip_idx, + self._get_text_maxlen() + )) + batch = self.batch_post_processing(batch, video_feature) + batch = default_data_collator(batch) + else: + raise ValueError( + "dataset.subsampling must be >= 1 for efficient video loading.") + batch = self.sampling(video_idx, video_feature, text_feature) + batch = self.batch_post_processing(batch, video_feature) + + batch["video_id"] = video_id if isinstance(video_id, str) \ + else video_id[0] + # e2e: make sure frame ids is into tensor. + assert torch.is_tensor(batch["vfeats"]) + return batch + + def sampling( + self, + video_idx, + video_feature, + text_feature, + centerclip_idx=None, + sampled_max_text_len=None, + ): + text_clip_indexs = self.text_clip_sampler( + text_feature, centerclip_idx, + sampled_max_text_len + ) + if isinstance(video_feature, np.ndarray): + video_len = len(video_feature) + else: + video_len = math.ceil(text_feature["end"][-1]) + + video_end = min( + math.ceil(text_feature["end"][text_clip_indexs[-1]]), + video_len + ) + video_start = max( + min( + math.floor(text_feature["start"][text_clip_indexs[0]]), + video_end), + 0 + ) + + video_clips = {"start": [video_start], "end": [video_end]} + + # tensorize. 
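+        # Build fixed-size tensors: `vfeats`/`vmasks` pad or trim the selected
+        # window to `max_video_len`; `caps`/`cmasks` hold the caption token ids
+        # for the chosen clips wrapped with [CLS]/[SEP]
+        # (see `Aligner._build_video_seq` / `Aligner._build_text_seq`).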
+ vfeats, vmasks = self._build_video_seq( + video_feature, video_clips + ) + caps, cmasks = self._build_text_seq( + text_feature, text_clip_indexs + ) + + text_start = text_clip_indexs[0] + text_end = text_clip_indexs[-1] + 1 + + return { + "caps": caps, + "cmasks": cmasks, + "vfeats": vfeats, + "vmasks": vmasks, + "video_start": video_start, + "video_end": video_end, + "text_start": text_start, + "text_end": text_end, + } + + +class VariedLenAligner(FixedLenAligner): + def __init__(self, config): + super().__init__(config) + self.sampled_min_len = config.sampled_min_len + self.sampled_max_len = config.sampled_max_len + + def _get_text_maxlen(self): + return random.randint(self.sampled_min_len, self.sampled_max_len) + + +class StartClipAligner(VariedLenAligner): + def sampling( + self, + video_idx, + video_feature, + text_feature, + centerclip_idx=None, + sampled_max_text_len=None, + ): + return super().sampling( + video_idx, video_feature, text_feature, 0) + + +class OverlappedAligner(VariedLenAligner): + """video clip and text clip has overlappings + but may not be the same start/end.""" + def __init__(self, config): + super().__init__(config) + self.sampled_video_min_len = config.sampled_video_min_len + self.sampled_video_max_len = config.sampled_video_max_len + + self.video_clip_sampler = VideoClipSamplingProcessor() + + def _get_video_maxlen(self): + return random.randint( + self.sampled_video_min_len, self.sampled_video_max_len) + + def sampling( + self, + video_idx, + video_feature, + text_feature, + centerclip_idx=None, + sampled_max_text_len=None, + ): + text_clip_indexs = self.text_clip_sampler( + text_feature, centerclip_idx, + sampled_max_text_len + ) + if isinstance(video_feature, np.ndarray): + video_len = len(video_feature) + else: + video_len = math.ceil(text_feature["end"][-1]) + low = math.floor(text_feature["start"][text_clip_indexs[0]]) + high = math.ceil(text_feature["end"][text_clip_indexs[-1]]) + if low < high: + center = random.randint(low, high) + else: + center = int((low + high) // 2) + center = max(0, min(video_feature.shape[0] - 1, center)) + + assert 0 <= center < video_feature.shape[0] + + video_clips = self.video_clip_sampler( + video_len, self._get_video_maxlen(), center + ) + video_start = video_clips["start"][0] + video_end = video_clips["end"][0] + + # tensorize. + vfeats, vmasks = self._build_video_seq( + video_feature, video_clips + ) + caps, cmasks = self._build_text_seq( + text_feature, text_clip_indexs + ) + + text_start = text_clip_indexs[0] + text_end = text_clip_indexs[-1] + 1 + + return { + "caps": caps, + "cmasks": cmasks, + "vfeats": vfeats, + "vmasks": vmasks, + "video_start": video_start, + "video_end": video_end, + "text_start": text_start, + "text_end": text_end, + } + + +class MFMMLMAligner(FixedLenAligner): + """ + `FixedLenAligner` with Masked Language Model and Masked Frame Model. 
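+    With probability `mm_prob`, one modality is masked as a whole ("full",
+    or `mm_type` for text) while the other is left untouched ("no");
+    otherwise both fall back to the usual token-level masking driven by
+    `mlm_probability` and `mfm_probability`.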
+ """ + + def __init__(self, config): + super().__init__(config) + keep_prob = config.keep_prob if config.keep_prob is not None else 1.0 + self.text_clip_sampler = TextClipSamplingProcessor( + self.max_len - self.max_video_len - 3, keep_prob + ) + self.sampled_min_len = config.sampled_min_len + self.sampled_max_len = config.sampled_max_len + self.masked_token_sampler = TextMaskingProcessor(config) + self.mm_type = config.mm_type \ + if config.mm_type is not None else "full" + self.attnmasker = MMAttentionMask2DProcessor() \ + if self.mm_type == "textgen" else None + self.masked_frame_sampler = FrameMaskingProcessor(config) + self.lazy_vfeat_mask = ( + False if config.lazy_vfeat_mask is None else config.lazy_vfeat_mask + ) + self.mm_prob = config.mm_prob if config.mm_prob is not None else 0. + + def __call__(self, video_id, video_feature, text_feature): + from transformers import default_data_collator + if self.subsampling is not None and self.subsampling > 1: + batch = [] + for _ in range(self.subsampling): + centerclip_idx = random.randint( + 0, len(text_feature["start"]) - 1) + sampled_max_text_len = random.randint( + self.sampled_min_len, self.sampled_max_len + ) + batch.append( + self.sampling( + video_id, + video_feature, + text_feature, + centerclip_idx, + sampled_max_text_len, + ) + ) + batch = self.batch_post_processing(batch, video_feature) + batch = default_data_collator(batch) + else: + batch = self.sampling(video_id, video_feature, text_feature) + batch = self.batch_post_processing(batch, video_feature) + batch["video_id"] = video_id if isinstance(video_id, str) \ + else video_id[0] + return batch + + def sampling( + self, + video_id, + video_feature, + text_feature, + centerclip_idx=None, + sampled_max_text_len=None, + ): + output = FixedLenAligner.sampling(self, + video_id, video_feature, text_feature, + centerclip_idx, sampled_max_text_len) + + masking_text, masking_video = None, None + if random.random() < self.mm_prob: + if random.random() > 0.5: + masking_text, masking_video = self.mm_type, "no" + else: + masking_text, masking_video = "no", "full" + video_feats = output["vfeats"] if not self.lazy_vfeat_mask else None + video_label = self.masked_frame_sampler( + output["vmasks"], masking_video, vfeats=video_feats) + caps, text_label = self.masked_token_sampler( + output["caps"], masking_text) + + output.update({ + "caps": caps, + "video_label": video_label, + "text_label": text_label, + }) + + if self.attnmasker is not None: + attention_mask = self.attnmasker( + output["vmasks"], output["cmasks"], masking_text) + output.update({ + "attention_mask": attention_mask + }) + return output + + +class FrameMaskingProcessor(Processor): + def __init__(self, config): + self.mfm_probability = 0.15 + if config.mfm_probability is not None: + self.mfm_probability = config.mfm_probability + + def __call__(self, vmasks, modality_masking=None, vfeats=None): + """ + We perform lazy masking to save data transfer time. + It only generates video_labels by default and MFM model + will do actualy masking. + Return: `video_label` is a binary mask. + """ + video_label = vmasks.clone() + if modality_masking is not None: + if modality_masking == "full": + probability_matrix = torch.full(video_label.shape, 1.) + elif modality_masking == "no": + probability_matrix = torch.full(video_label.shape, 0.) + elif modality_masking == "inverse": + probability_matrix = torch.full( + video_label.shape, 1. 
- self.mfm_probability) + else: + raise ValueError("unknown modality masking.", modality_masking) + else: + probability_matrix = torch.full( + video_label.shape, self.mfm_probability) + masked_indices = torch.bernoulli(probability_matrix).bool() + # We only compute loss on masked tokens + video_label[~masked_indices] = 0 + if vfeats is not None: + vfeats[video_label, :] = 0.0 + return video_label + + +class TextGenerationProcessor(Processor): + def __init__(self, tokenizer): + self.bos_token_id = tokenizer.bos_token_id + self.pad_token_id = tokenizer.pad_token_id + + def __call__(self, inputs): + labels = inputs.clone() + # [CLS] [SEP] for video + labels[:2] = -100 + # keep [SEP] for text. + pad_mask = labels == self.pad_token_id + labels[pad_mask] = -100 + inputs[2:] = torch.cat([ + torch.LongTensor([self.bos_token_id]), + inputs[2:-1]]) + inputs[pad_mask] = self.pad_token_id + assert len(inputs) == len(labels) + return inputs, labels + + +class TextMaskingProcessor(Processor): + def __init__(self, config): + """this function is borrowed from + `transformers/data/data_collator.DataCollatorForLanguageModeling`""" + self.mlm_probability = 0.15 + if config.mlm_probability is not None: + self.mlm_probability = config.mlm_probability + self.bert_name = config.bert_name + # [CLS] is used as bos_token and [SEP] is used as eos_token. + # https://huggingface.co/transformers/master/model_doc/bertgeneration.html + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + self.bert_name, bos_token="[CLS]", eos_token="[SEP]") + self.textgen = TextGenerationProcessor(self.tokenizer) + + def __call__( + self, inputs: torch.Tensor, + modality_masking=None, + special_tokens_mask: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + expand modality_masking into + None: traditional bert masking. + "no": no masking. + "full": all [MASK] token for generation. + "gen": autoregressive generation. + """ + """ + Prepare masked tokens inputs/labels for masked language modeling: + 80% MASK, 10% random, 10% original. + """ + labels = inputs.clone() + # We sample a few tokens in each sequence for MLM training + # (with probability `self.mlm_probability`) + if modality_masking is not None: + if modality_masking == "full": + probability_matrix = torch.full(labels.shape, 1.) + elif modality_masking == "no": + probability_matrix = torch.full(labels.shape, 0.) + elif modality_masking.startswith("textgen"): + # [CLS] [SEP] ... + inputs, labels = self.textgen(inputs) + if "mask" not in modality_masking: + return inputs, labels + inputs = self.mask_input(inputs, special_tokens_mask) + return inputs, labels + elif modality_masking == "mask": + inputs = self.mask_input(inputs, special_tokens_mask) + labels = torch.full(inputs.shape, -100) + return inputs, labels + elif modality_masking == "inverse": + probability_matrix = torch.full(labels.shape, 1. 
- self.mlm_probability) + else: + raise ValueError("unknown modality masking.", modality_masking) + else: + probability_matrix = torch.full(labels.shape, self.mlm_probability) + + if special_tokens_mask is None: + special_tokens_mask = self.get_special_tokens_mask( + labels.tolist(), already_has_special_tokens=True + ) + special_tokens_mask = torch.tensor( + special_tokens_mask, dtype=torch.bool) + else: + special_tokens_mask = special_tokens_mask.bool() + + probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, + # we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = ( + torch.bernoulli( + torch.full(labels.shape, 0.8)).bool() & masked_indices + ) + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.mask_token + ) + + # 10% of the time, we replace masked input tokens with random word + indices_random = ( + torch.bernoulli(torch.full(labels.shape, 0.5)).bool() + & masked_indices + & ~indices_replaced + ) + random_words = torch.randint( + len(self.tokenizer), labels.shape, dtype=torch.long + ) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input + # tokens unchanged + return inputs, labels + + def mask_input(self, inputs, special_tokens_mask=None): + # the following is new with masked autoregressive. + probability_matrix = torch.full( + inputs.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = self.get_special_tokens_mask( + inputs.tolist(), already_has_special_tokens=True + ) + special_tokens_mask = torch.tensor( + special_tokens_mask, dtype=torch.bool) + else: + special_tokens_mask = special_tokens_mask.bool() + probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + indices_replaced = ( + torch.bernoulli( + torch.full(inputs.shape, 0.8)).bool() & masked_indices + ) + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids( + self.tokenizer.mask_token + ) + + # 10% of the time, we replace masked input tokens with random word + indices_random = ( + torch.bernoulli(torch.full(inputs.shape, 0.5)).bool() + & masked_indices + & ~indices_replaced + ) + random_words = torch.randint( + len(self.tokenizer), inputs.shape, dtype=torch.long + ) + inputs[indices_random] = random_words[indices_random] + return inputs + + def get_special_tokens_mask( + self, token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False + ) -> List[int]: + """ + Note: the version from transformers do not consider pad + as special tokens. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if" + "the provided sequence of " + "ids is already formated with special tokens " + "for the model." + ) + return list(map(lambda x: 1 if x in [ + self.tokenizer.sep_token_id, + self.tokenizer.cls_token_id, + self.tokenizer.pad_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + +class TextClipSamplingProcessor(Processor): + def __init__(self, max_text_len, keep_prob=1.0): + self.max_text_len = max_text_len + self.max_video_len = 256 # always hold. 
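+        # `keep_prob` controls contiguity when the sampling window grows in
+        # `__call__`: with probability (1 - keep_prob) the next/previous
+        # caption clip is skipped, so the sampled clips may be non-contiguous
+        # (keep_prob=1.0 keeps them strictly adjacent).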
+ self.keep_prob = keep_prob + + def __call__( + self, + text_feature, + centerclip_idx=None, + sampled_max_text_len=None, + sampled_max_video_len=None, + ): + # Let's use all caps for now and see if 256 can cover all of them. + if sampled_max_text_len is not None: + max_text_len = sampled_max_text_len + else: + max_text_len = self.max_text_len + if sampled_max_video_len is not None: + max_video_len = sampled_max_video_len + else: + max_video_len = self.max_video_len + + t_num_clips = len(text_feature["start"]) + + if centerclip_idx is None: + centerclip_idx = random.randint(0, t_num_clips - 1) + + start_idx, end_idx = centerclip_idx, centerclip_idx + 1 + text_clip_indexs = deque() + text_clip_indexs.append(start_idx) + text_len = len(text_feature["cap"][start_idx]) + + video_len = max( + 0, + text_feature["end"][start_idx] + - text_feature["start"][start_idx], + ) + + while ( + (start_idx > 0 or end_idx < t_num_clips) + and text_len < max_text_len + and video_len < max_video_len + ): + if random.random() > 0.5 and end_idx < t_num_clips: + # skip the next one? + if random.random() > self.keep_prob and (end_idx + 1) < t_num_clips: + end_idx = end_idx + 1 + text_clip_indexs.append(end_idx) + text_len += len(text_feature["cap"][end_idx]) + end_idx += 1 + elif start_idx > 0: + if random.random() > self.keep_prob and (start_idx - 1) > 0: + start_idx = start_idx - 1 + start_idx -= 1 + text_clip_indexs.insert(0, start_idx) + text_len += len(text_feature["cap"][start_idx]) + else: + if end_idx < t_num_clips: + if random.random() > self.keep_prob and (end_idx + 1) < t_num_clips: + end_idx = end_idx + 1 + text_clip_indexs.append(end_idx) + text_len += len(text_feature["cap"][end_idx]) + end_idx += 1 + else: + return text_clip_indexs + video_len = max( + 0, + text_feature["end"][text_clip_indexs[-1]] + - text_feature["start"][text_clip_indexs[0]], + ) + return text_clip_indexs + + +class VideoClipSamplingProcessor(Processor): + def __call__(self, video_len, max_video_len, center): + """ + `video_len`: length of the video. + `max_video_len`: maximum video tokens allowd in a sequence. + `center`: initial starting index. + """ + assert center >= 0 and center < video_len + t_clip_len = 0 + start, end = center, center + while (start > 0 or end < video_len) and t_clip_len < max_video_len: + # decide the direction to grow. + if start <= 0: + end += 1 + elif end >= video_len: + start -= 1 + elif random.random() > 0.5: + end += 1 + else: + start -= 1 + t_clip_len += 1 + return {"start": [start], "end": [end]} + + +class How2MILNCEAligner(FixedLenAligner): + """reference: `antoine77340/MIL-NCE_HowTo100M/video_loader.py`""" + + def __init__(self, config): + super().__init__(config) + self.num_candidates = 4 + self.min_time = 5.0 + self.num_sec = 3.2 + # self.num_sec = self.num_frames / float(self.fps) num_frames=16 / fps = 5 + # self.num_frames = 16 + + def sampling( + self, + video_id, + video_feature, + text_feature, + centerclip_idx=None, # will be ignored. + sampled_max_text_len=None # will be ignored. 
+ ): + text, start, end = self._get_text(text_feature) + video = self._get_video(video_feature, start, end) + + vfeats = torch.zeros((self.max_video_len, video_feature.shape[1])) + vmasks = torch.zeros((self.max_video_len,), dtype=torch.bool) + vfeats[: video.shape[0]] = torch.from_numpy(np.array(video)) + vmasks[: video.shape[0]] = 1 + + caps, cmasks = [], [] + for words in text: + cap, cmask = self._build_text_seq(text_feature, words) + caps.append(cap) + cmasks.append(cmask) + caps = torch.stack(caps) + cmasks = torch.stack(cmasks) + # video of shape: (video_len) + # text of shape (num_candidates, max_text_len) + + return { + "caps": caps, + "cmasks": cmasks, + "vfeats": vfeats, + "vmasks": vmasks, + # "video_id": video_id, + } + + def _get_video(self, video_feature, start, end): + start_seek = random.randint(start, int(max(start, end - self.num_sec))) + # duration = self.num_sec + 0.1 + return video_feature[start_seek : int(start_seek + self.num_sec)] + + def _get_text(self, cap): + ind = random.randint(0, len(cap["start"]) - 1) + if self.num_candidates == 1: + words = [ind] + else: + words = [] + cap_start = self._find_nearest_candidates(cap, ind) + for i in range(self.num_candidates): + words.append([max(0, min(len(cap["cap"]) - 1, cap_start + i))]) + + start, end = cap["start"][ind], cap["end"][ind] + # TODO: May need to be improved for edge cases. + # expand the min time. + if end - start < self.min_time: + diff = self.min_time - end + start + start = max(0, start - diff / 2) + end = start + self.min_time + return words, int(start), int(end) + + def _find_nearest_candidates(self, caption, ind): + """find the range of the clips.""" + start, end = ind, ind + #diff = caption["end"][end] - caption["start"][start] + n_candidate = 1 + while n_candidate < self.num_candidates: + # the first clip + if start == 0: + return 0 + # we add () in the following condition to fix the bug. + elif end == (len(caption["start"]) - 1): + return start - (self.num_candidates - n_candidate) + elif (caption["end"][end] - caption["start"][start - 1]) < ( + caption["end"][end + 1] - caption["start"][start] + ): + start -= 1 + else: + end += 1 + n_candidate += 1 + return start + + +class PKLJSONStrTextProcessor(TextProcessor): + """`caption.json` from howto100m are preprocessed as a + dict `[video_id, json_str]`. + Json parsing tokenization are conducted on-the-fly and cached into dict. + """ + + def __init__(self, config, max_clip_text_len=96): + print("[Warning] PKLJSONStrTextProcessor is slow for num_workers > 0.") + self.caption_pkl_path = str(config.caption_pkl_path) + with open(self.caption_pkl_path, "rb") as fd: + self.data = pickle.load(fd) + self.max_clip_text_len = max_clip_text_len + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + str(config.bert_name), use_fast=config.use_fast + ) + + def __call__(self, video_id): + caption = self.data[video_id] + if isinstance(caption, str): + import json + caption = json.loads(caption) + cap = [] + for clip_idx, text_clip in enumerate(caption["text"]): + clip_ids = [] + if isinstance(text_clip, str): + clip_ids = self.tokenizer( + text_clip[: self.max_clip_text_len], + add_special_tokens=False + )["input_ids"] + cap.append(clip_ids) + caption["cap"] = cap + caption.pop("text") # save space. 
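+        # Cache the tokenized caption back into `self.data` so later lookups
+        # of the same video skip JSON parsing and tokenization; the cache
+        # lives per process, hence the num_workers warning above.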
+ self.data[video_id] = caption + return caption diff --git a/examples/MMPT/mmpt/processors/how2retriprocessor.py b/examples/MMPT/mmpt/processors/how2retriprocessor.py new file mode 100644 index 0000000000..b5a7730ec0 --- /dev/null +++ b/examples/MMPT/mmpt/processors/how2retriprocessor.py @@ -0,0 +1,100 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .how2processor import ( + ShardedHow2MetaProcessor, + ShardedVideoProcessor, + ShardedTextProcessor, + VariedLenAligner, + OverlappedAligner +) + + +class ShardedHow2VideoRetriMetaProcessor(ShardedHow2MetaProcessor): + def __init__(self, config): + super().__init__(config) + self.num_video_per_batch = config.num_video_per_batch + self.cands = [ + self.data[batch_offset:batch_offset + self.num_video_per_batch] + for batch_offset in + range(0, (len(self.data) // (8 * self.num_video_per_batch)) * 8 * self.num_video_per_batch, self.num_video_per_batch)] + + def __len__(self): + return len(self.cands) + + def set_candidates(self, cands): + # no changes on num of batches. + print(len(self.cands), "->", len(cands)) + # assert len(self.cands) == len(cands) + self.cands = cands + + def __getitem__(self, idx): + video_ids = self.cands[idx] + assert isinstance(video_ids, list) + sharded_video_idxs = [] + for video_id in video_ids: + shard_id, video_idx = self.video_id_to_shard[video_id] + sharded_video_idxs.append((video_id, -1, shard_id, video_idx)) + return sharded_video_idxs, sharded_video_idxs + + +class ShardedVideoRetriVideoProcessor(ShardedVideoProcessor): + """In retrival case the video_id + is a list of tuples: `(shard_id, video_idx)` .""" + + def __call__(self, sharded_video_idxs): + assert isinstance(sharded_video_idxs, list) + cand_feats = [] + for shared_video_idx in sharded_video_idxs: + feat = super().__call__(shared_video_idx) + cand_feats.append(feat) + return cand_feats + + +class ShardedVideoRetriTextProcessor(ShardedTextProcessor): + """In retrival case the video_id + is a list of tuples: `(shard_id, video_idx)` .""" + + def __call__(self, sharded_video_idxs): + assert isinstance(sharded_video_idxs, list) + cand_caps = [] + for shared_video_idx in sharded_video_idxs: + caps = super().__call__(shared_video_idx) + cand_caps.append(caps) + return cand_caps + + +class VideoRetriAligner(VariedLenAligner): + # Retritask will trim dim-0. + def __call__(self, sharded_video_idxs, video_features, text_features): + from transformers import default_data_collator + batch, video_ids = [], [] + for video_id, video_feature, text_feature in \ + zip(sharded_video_idxs, video_features, text_features): + sub_batch = super().__call__(video_id, video_feature, text_feature) + batch.append(sub_batch) + if isinstance(video_id, tuple): + video_id = video_id[0] + video_ids.append(video_id) + batch = default_data_collator(batch) + batch["video_id"] = video_ids + return batch + + +class VideoRetriOverlappedAligner(OverlappedAligner): + # Retritask will trim dim-0. 
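+    # Same collation as `VideoRetriAligner`: one item is a list of candidate
+    # videos, each aligned by the parent class and stacked with
+    # `default_data_collator`; RetriTask later squeezes the extra leading
+    # dimension (hence "trim dim-0" above).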
+ def __call__(self, sharded_video_idxs, video_features, text_features): + from transformers import default_data_collator + batch, video_ids = [], [] + for video_id, video_feature, text_feature in \ + zip(sharded_video_idxs, video_features, text_features): + sub_batch = super().__call__(video_id, video_feature, text_feature) + batch.append(sub_batch) + if isinstance(video_id, tuple): + video_id = video_id[0] + video_ids.append(video_id) + batch = default_data_collator(batch) + batch["video_id"] = video_ids + return batch diff --git a/examples/MMPT/mmpt/processors/models/s3dg.py b/examples/MMPT/mmpt/processors/models/s3dg.py new file mode 100644 index 0000000000..6c7a691e33 --- /dev/null +++ b/examples/MMPT/mmpt/processors/models/s3dg.py @@ -0,0 +1,336 @@ +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +"""Contains a PyTorch definition for Gated Separable 3D network (S3D-G) +with a text module for computing joint text-video embedding from raw text +and video input. The following code will enable you to load the HowTo100M +pretrained S3D Text-Video model from: + A. Miech, J.-B. Alayrac, L. Smaira, I. Laptev, J. Sivic and A. Zisserman, + End-to-End Learning of Visual Representations from Uncurated Instructional Videos. + https://arxiv.org/abs/1912.06430. + +S3D-G was proposed by: + S. Xie, C. Sun, J. Huang, Z. Tu and K. Murphy, + Rethinking Spatiotemporal Feature Learning For Video Understanding. + https://arxiv.org/abs/1712.04851. + Tensorflow code: https://github.com/tensorflow/models/blob/master/research/slim/nets/s3dg.py + +The S3D architecture was slightly modified with a space to depth trick for TPU +optimization. +""" + +import torch as th +import torch.nn.functional as F +import torch.nn as nn +import os +import numpy as np +import re + + +class InceptionBlock(nn.Module): + def __init__( + self, + input_dim, + num_outputs_0_0a, + num_outputs_1_0a, + num_outputs_1_0b, + num_outputs_2_0a, + num_outputs_2_0b, + num_outputs_3_0b, + gating=True, + ): + super(InceptionBlock, self).__init__() + self.conv_b0 = STConv3D(input_dim, num_outputs_0_0a, [1, 1, 1]) + self.conv_b1_a = STConv3D(input_dim, num_outputs_1_0a, [1, 1, 1]) + self.conv_b1_b = STConv3D( + num_outputs_1_0a, num_outputs_1_0b, [3, 3, 3], padding=1, separable=True + ) + self.conv_b2_a = STConv3D(input_dim, num_outputs_2_0a, [1, 1, 1]) + self.conv_b2_b = STConv3D( + num_outputs_2_0a, num_outputs_2_0b, [3, 3, 3], padding=1, separable=True + ) + self.maxpool_b3 = th.nn.MaxPool3d((3, 3, 3), stride=1, padding=1) + self.conv_b3_b = STConv3D(input_dim, num_outputs_3_0b, [1, 1, 1]) + self.gating = gating + self.output_dim = ( + num_outputs_0_0a + num_outputs_1_0b + num_outputs_2_0b + num_outputs_3_0b + ) + if gating: + self.gating_b0 = SelfGating(num_outputs_0_0a) + self.gating_b1 = SelfGating(num_outputs_1_0b) + self.gating_b2 = SelfGating(num_outputs_2_0b) + self.gating_b3 = SelfGating(num_outputs_3_0b) + + def forward(self, input): + """Inception block + """ + b0 = self.conv_b0(input) + b1 = self.conv_b1_a(input) + b1 = self.conv_b1_b(b1) + b2 = self.conv_b2_a(input) + b2 = self.conv_b2_b(b2) + b3 = self.maxpool_b3(input) + b3 = self.conv_b3_b(b3) + if self.gating: + b0 = self.gating_b0(b0) + b1 = self.gating_b1(b1) + b2 = self.gating_b2(b2) + b3 = self.gating_b3(b3) + return th.cat((b0, b1, b2, b3), dim=1) + + +class SelfGating(nn.Module): + def __init__(self, input_dim): + super(SelfGating, self).__init__() + self.fc = nn.Linear(input_dim, input_dim) + + 
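+    # S3D-G feature gating: a per-channel sigmoid weight, computed from the
+    # spatio-temporally averaged features, rescales the input tensor.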
def forward(self, input_tensor): + """Feature gating as used in S3D-G. + """ + spatiotemporal_average = th.mean(input_tensor, dim=[2, 3, 4]) + weights = self.fc(spatiotemporal_average) + weights = th.sigmoid(weights) + return weights[:, :, None, None, None] * input_tensor + + +class STConv3D(nn.Module): + def __init__( + self, input_dim, output_dim, kernel_size, stride=1, padding=0, separable=False + ): + super(STConv3D, self).__init__() + self.separable = separable + self.relu = nn.ReLU(inplace=True) + assert len(kernel_size) == 3 + if separable and kernel_size[0] != 1: + spatial_kernel_size = [1, kernel_size[1], kernel_size[2]] + temporal_kernel_size = [kernel_size[0], 1, 1] + if isinstance(stride, list) and len(stride) == 3: + spatial_stride = [1, stride[1], stride[2]] + temporal_stride = [stride[0], 1, 1] + else: + spatial_stride = [1, stride, stride] + temporal_stride = [stride, 1, 1] + if isinstance(padding, list) and len(padding) == 3: + spatial_padding = [0, padding[1], padding[2]] + temporal_padding = [padding[0], 0, 0] + else: + spatial_padding = [0, padding, padding] + temporal_padding = [padding, 0, 0] + if separable: + self.conv1 = nn.Conv3d( + input_dim, + output_dim, + kernel_size=spatial_kernel_size, + stride=spatial_stride, + padding=spatial_padding, + bias=False, + ) + self.bn1 = nn.BatchNorm3d(output_dim) + self.conv2 = nn.Conv3d( + output_dim, + output_dim, + kernel_size=temporal_kernel_size, + stride=temporal_stride, + padding=temporal_padding, + bias=False, + ) + self.bn2 = nn.BatchNorm3d(output_dim) + else: + self.conv1 = nn.Conv3d( + input_dim, + output_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False, + ) + self.bn1 = nn.BatchNorm3d(output_dim) + + def forward(self, input): + out = self.relu(self.bn1(self.conv1(input))) + if self.separable: + out = self.relu(self.bn2(self.conv2(out))) + return out + + +class MaxPool3dTFPadding(th.nn.Module): + def __init__(self, kernel_size, stride=None, padding="SAME"): + super(MaxPool3dTFPadding, self).__init__() + if padding == "SAME": + padding_shape = self._get_padding_shape(kernel_size, stride) + self.padding_shape = padding_shape + self.pad = th.nn.ConstantPad3d(padding_shape, 0) + self.pool = th.nn.MaxPool3d(kernel_size, stride, ceil_mode=True) + + def _get_padding_shape(self, filter_shape, stride): + def _pad_top_bottom(filter_dim, stride_val): + pad_along = max(filter_dim - stride_val, 0) + pad_top = pad_along // 2 + pad_bottom = pad_along - pad_top + return pad_top, pad_bottom + + padding_shape = [] + for filter_dim, stride_val in zip(filter_shape, stride): + pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val) + padding_shape.append(pad_top) + padding_shape.append(pad_bottom) + depth_top = padding_shape.pop(0) + depth_bottom = padding_shape.pop(0) + padding_shape.append(depth_top) + padding_shape.append(depth_bottom) + return tuple(padding_shape) + + def forward(self, inp): + inp = self.pad(inp) + out = self.pool(inp) + return out + + +class Sentence_Embedding(nn.Module): + def __init__( + self, + embd_dim, + num_embeddings=66250, + word_embedding_dim=300, + token_to_word_path="dict.npy", + max_words=16, + output_dim=2048, + ): + super(Sentence_Embedding, self).__init__() + self.word_embd = nn.Embedding(num_embeddings, word_embedding_dim) + self.fc1 = nn.Linear(word_embedding_dim, output_dim) + self.fc2 = nn.Linear(output_dim, embd_dim) + self.word_to_token = {} + self.max_words = max_words + token_to_word = np.load(token_to_word_path) + for i, t in enumerate(token_to_word): + 
self.word_to_token[t] = i + 1 + + def _zero_pad_tensor_token(self, tensor, size): + if len(tensor) >= size: + return tensor[:size] + else: + zero = th.zeros(size - len(tensor)).long() + return th.cat((tensor, zero), dim=0) + + def _split_text(self, sentence): + w = re.findall(r"[\w']+", str(sentence)) + return w + + def _words_to_token(self, words): + words = [ + self.word_to_token[word] for word in words if word in self.word_to_token + ] + if words: + we = self._zero_pad_tensor_token(th.LongTensor(words), self.max_words) + return we + else: + return th.zeros(self.max_words).long() + + def _words_to_ids(self, x): + split_x = [self._words_to_token(self._split_text(sent.lower())) for sent in x] + return th.stack(split_x, dim=0) + + def forward(self, x): + x = self._words_to_ids(x) + x = self.word_embd(x) + x = F.relu(self.fc1(x)) + x = th.max(x, dim=1)[0] + x = self.fc2(x) + return {'text_embedding': x} + + +class S3D(nn.Module): + def __init__(self, dict_path, num_classes=512, gating=True, space_to_depth=True): + super(S3D, self).__init__() + self.num_classes = num_classes + self.gating = gating + self.space_to_depth = space_to_depth + if space_to_depth: + self.conv1 = STConv3D( + 24, 64, [2, 4, 4], stride=1, padding=(1, 2, 2), separable=False + ) + else: + self.conv1 = STConv3D( + 3, 64, [3, 7, 7], stride=2, padding=(1, 3, 3), separable=False + ) + self.conv_2b = STConv3D(64, 64, [1, 1, 1], separable=False) + self.conv_2c = STConv3D(64, 192, [3, 3, 3], padding=1, separable=True) + self.gating = SelfGating(192) + self.maxpool_2a = MaxPool3dTFPadding( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding="SAME" + ) + self.maxpool_3a = MaxPool3dTFPadding( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding="SAME" + ) + self.mixed_3b = InceptionBlock(192, 64, 96, 128, 16, 32, 32) + self.mixed_3c = InceptionBlock( + self.mixed_3b.output_dim, 128, 128, 192, 32, 96, 64 + ) + self.maxpool_4a = MaxPool3dTFPadding( + kernel_size=(3, 3, 3), stride=(2, 2, 2), padding="SAME" + ) + self.mixed_4b = InceptionBlock( + self.mixed_3c.output_dim, 192, 96, 208, 16, 48, 64 + ) + self.mixed_4c = InceptionBlock( + self.mixed_4b.output_dim, 160, 112, 224, 24, 64, 64 + ) + self.mixed_4d = InceptionBlock( + self.mixed_4c.output_dim, 128, 128, 256, 24, 64, 64 + ) + self.mixed_4e = InceptionBlock( + self.mixed_4d.output_dim, 112, 144, 288, 32, 64, 64 + ) + self.mixed_4f = InceptionBlock( + self.mixed_4e.output_dim, 256, 160, 320, 32, 128, 128 + ) + self.maxpool_5a = self.maxPool3d_5a_2x2 = MaxPool3dTFPadding( + kernel_size=(2, 2, 2), stride=(2, 2, 2), padding="SAME" + ) + self.mixed_5b = InceptionBlock( + self.mixed_4f.output_dim, 256, 160, 320, 32, 128, 128 + ) + self.mixed_5c = InceptionBlock( + self.mixed_5b.output_dim, 384, 192, 384, 48, 128, 128 + ) + self.fc = nn.Linear(self.mixed_5c.output_dim, num_classes) + self.text_module = Sentence_Embedding(num_classes, + token_to_word_path=dict_path) + + def _space_to_depth(self, input): + """3D space to depth trick for TPU optimization. 
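+        Folds each 2x2x2 spatio-temporal block into channels, reshaping
+        (B, C, T, H, W) -> (B, 8 * C, T // 2, H // 2, W // 2); this is why
+        `conv1` takes 24 = 3 * 8 input channels when `space_to_depth=True`.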
+ """ + B, C, T, H, W = input.shape + input = input.view(B, C, T // 2, 2, H // 2, 2, W // 2, 2) + input = input.permute(0, 3, 5, 7, 1, 2, 4, 6) + input = input.contiguous().view(B, 8 * C, T // 2, H // 2, W // 2) + return input + + def forward(self, inputs): + """Defines the S3DG base architecture.""" + if self.space_to_depth: + inputs = self._space_to_depth(inputs) + net = self.conv1(inputs) + if self.space_to_depth: + # we need to replicate 'SAME' tensorflow padding + net = net[:, :, 1:, 1:, 1:] + net = self.maxpool_2a(net) + net = self.conv_2b(net) + net = self.conv_2c(net) + if self.gating: + net = self.gating(net) + net = self.maxpool_3a(net) + net = self.mixed_3b(net) + net = self.mixed_3c(net) + net = self.maxpool_4a(net) + net = self.mixed_4b(net) + net = self.mixed_4c(net) + net = self.mixed_4d(net) + net = self.mixed_4e(net) + net = self.mixed_4f(net) + net = self.maxpool_5a(net) + net = self.mixed_5b(net) + net = self.mixed_5c(net) + net = th.mean(net, dim=[2, 3, 4]) + return {'video_embedding': self.fc(net), 'mixed_5c': net} diff --git a/examples/MMPT/mmpt/processors/processor.py b/examples/MMPT/mmpt/processors/processor.py new file mode 100644 index 0000000000..98edb051f1 --- /dev/null +++ b/examples/MMPT/mmpt/processors/processor.py @@ -0,0 +1,274 @@ +# Copyright (c) Facebook, Inc. All Rights Reserved + +import numpy as np +import os +import torch + + +class Processor(object): + """ + A generic processor for video (codec, feature etc.) and text. + """ + + def __call__(self, **kwargs): + raise NotImplementedError + + +class MetaProcessor(Processor): + """ + A meta processor is expected to load the metadata of a dataset: + (e.g., video_ids, or captions). + You must implement the `__getitem__` (meta datasets are rather diverse.). + """ + + def __init__(self, config): + self.split = config.split + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + raise NotImplementedError + + def _get_split_path(self, config): + splits = { + "train": config.train_path, + "valid": config.val_path, + "test": config.test_path, + } + if config.split is not None: + return splits[config.split] + return config.train_path + + +class TextProcessor(Processor): + """ + A generic Text processor: rename this as `withTokenizer`. + tokenize a string of text on-the-fly. + Warning: mostly used for end tasks. + (on-the-fly tokenization is slow for how2.) + TODO(huxu): move this class as a subclass. + """ + + def __init__(self, config): + self.bert_name = str(config.bert_name) + self.use_fast = config.use_fast + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + self.bert_name, use_fast=self.use_fast + ) + + def __call__(self, text_id): + caption = self.tokenizer(text_id, add_special_tokens=False) + return caption["input_ids"] + + +class VideoProcessor(Processor): + """ + A generic video processor: load a numpy video tokens by default. + """ + + def __init__(self, config): + self.vfeat_dir = config.vfeat_dir + + def __call__(self, video_fn): + if isinstance(video_fn, tuple): + video_fn = video_fn[0] + assert isinstance(video_fn, str) + video_fn = os.path.join(self.vfeat_dir, video_fn + ".npy") + feat = np.load(video_fn) + return feat + + +class Aligner(object): + """ + An alignprocessor align video and text and output a dict of tensors (for a model). 
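+    Subclasses implement `__call__`; the `_build_video_seq` and
+    `_build_text_seq` helpers below produce fixed-size feature/token tensors
+    and boolean masks (`vfeats`/`vmasks`, `caps`/`cmasks`) padded to the
+    configured maximum lengths.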
+ """ + def __init__(self, config): + """__init__ needs to be light weight for more workers/threads.""" + self.split = config.split + self.max_video_len = config.max_video_len + self.max_len = config.max_len + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + str(config.bert_name), use_fast=config.use_fast + ) + self.cls_token_id = tokenizer.cls_token_id + self.sep_token_id = tokenizer.sep_token_id + self.pad_token_id = tokenizer.pad_token_id + self.mask_token_id = tokenizer.mask_token_id + + def __call__(self, video_id, video_feature, text_feature): + raise NotImplementedError + + def _build_video_seq(self, video_feature, video_clips=None): + """ + `video_feature`: available video tokens. + `video_clips`: video clip sequence to build. + """ + if not isinstance(video_feature, np.ndarray): + raise ValueError( + "unsupported type of video_feature", type(video_feature) + ) + + if video_clips is None: + # this is borrowed from DSAligner + video_start = 0 + video_end = min(len(video_feature), self.max_video_len) + # the whole sequence is a single clip. + video_clips = {"start": [video_start], "end": [video_end]} + + vfeats = np.zeros( + (self.max_video_len, video_feature.shape[1]), dtype=np.float32 + ) + vmasks = torch.zeros((self.max_video_len,), dtype=torch.bool) + video_len = 0 + for start, end in zip(video_clips["start"], video_clips["end"]): + clip_len = min(self.max_video_len - video_len, (end - start)) + if clip_len > 0: + vfeats[video_len: video_len + clip_len] = video_feature[ + start: start + clip_len + ] + vmasks[video_len: video_len + clip_len] = 1 + video_len += clip_len + vfeats = torch.from_numpy(vfeats) + + return vfeats, vmasks + + def _build_text_seq(self, text_feature, text_clip_indexs=None): + """ + `text_feature`: all available clips. + `text_clip_indexes`: clip sequence to build. + """ + if text_clip_indexs is None: + text_clip_indexs = [0] + + full_caps = [] + if isinstance(text_feature, dict): + for clip_idx in text_clip_indexs: + full_caps.extend(text_feature["cap"][clip_idx]) + else: + full_caps = text_feature + max_text_len = self.max_len - self.max_video_len - 3 + full_caps = full_caps[:max_text_len] + full_caps = ( + [self.cls_token_id, self.sep_token_id] + full_caps + [self.sep_token_id] + ) + text_pad_len = self.max_len - len(full_caps) - self.max_video_len + padded_full_caps = full_caps + [self.pad_token_id] * text_pad_len + caps = torch.LongTensor(padded_full_caps) + cmasks = torch.zeros((len(padded_full_caps),), dtype=torch.bool) + cmasks[: len(full_caps)] = 1 + + return caps, cmasks + + def batch_post_processing(self, batch, video_feature): + return batch + + +class MMAttentionMask2DProcessor(Processor): + """text generation requires 2d mask + that is harder to generate by GPU at this stage.""" + + def __call__(self, vmask, cmask, mtype): + if mtype == "textgen": + return self._build_textgeneration_mask(vmask, cmask) + elif mtype == "videogen": + return self._build_videogeneration_mask(vmask, cmask) + else: + return self._build_mm_mask(vmask, cmask) + + def _build_mm_mask(self, vmask, cmask): + mask_1d = torch.cat([cmask[:1], vmask, cmask[1:]], dim=0) + return mask_1d[None, :].repeat(mask_1d.size(0), 1) + + def _build_videogeneration_mask(self, vmask, cmask): + # cls_mask is only about text otherwise it will leak generation. + cls_text_mask = torch.cat([ + # [CLS] + torch.ones( + (1,), dtype=torch.bool, device=cmask.device), + # video tokens and [SEP] for video. 
+ torch.zeros( + (vmask.size(0) + 1,), dtype=torch.bool, device=cmask.device), + cmask[2:] + ], dim=0) + + # concat horizontially. + video_len = int(vmask.sum()) + video_masks = torch.cat([ + # [CLS] + torch.ones( + (video_len, 1), dtype=torch.bool, device=cmask.device + ), + torch.tril( + torch.ones( + (video_len, video_len), + dtype=torch.bool, device=cmask.device)), + # video_padding + torch.zeros( + (video_len, vmask.size(0) - video_len), + dtype=torch.bool, device=cmask.device + ), + # [SEP] for video (unused). + torch.zeros( + (video_len, 1), dtype=torch.bool, device=cmask.device + ), + cmask[2:].unsqueeze(0).repeat(video_len, 1) + ], dim=1) + + text_masks = cls_text_mask[None, :].repeat( + cmask.size(0) - 2, 1) + video_padding_masks = cls_text_mask[None, :].repeat( + vmask.size(0) - video_len, 1) + + return torch.cat([ + cls_text_mask[None, :], + video_masks, + video_padding_masks, + torch.cat([cmask[:1], vmask, cmask[1:]], dim=0)[None,:], + text_masks + ], dim=0) + + def _build_textgeneration_mask(self, vmask, cmask): + # cls_mask is only about video otherwise it will leak generation. + cls_video_mask = torch.cat([ + # [CLS] + torch.ones( + (1,), dtype=torch.bool, device=cmask.device), + vmask, + # [SEP] + torch.ones((1,), dtype=torch.bool, device=cmask.device), + torch.zeros( + (cmask.size(0)-2,), dtype=torch.bool, device=cmask.device) + ], dim=0) + + # concat horizontially. + text_len = int(cmask[2:].sum()) + text_masks = torch.cat([ + # [CLS] + torch.ones( + (text_len, 1), dtype=torch.bool, device=cmask.device + ), + vmask.unsqueeze(0).repeat(text_len, 1), + # [SEP] for video. + torch.ones( + (text_len, 1), dtype=torch.bool, device=cmask.device + ), + torch.tril( + torch.ones( + (text_len, text_len), + dtype=torch.bool, device=cmask.device)), + # padding. + torch.zeros( + (text_len, cmask.size(0) - text_len - 2), + dtype=torch.bool, device=cmask.device + ) + ], dim=1) + + cls_video_masks = cls_video_mask[None, :].repeat( + vmask.size(0) + 2, 1) + text_padding_masks = cls_video_mask[None, :].repeat( + cmask.size(0) - text_len - 2, 1) + return torch.cat([ + cls_video_masks, text_masks, text_padding_masks], dim=0) diff --git a/examples/MMPT/mmpt/tasks/__init__.py b/examples/MMPT/mmpt/tasks/__init__.py new file mode 100644 index 0000000000..e2e9323a53 --- /dev/null +++ b/examples/MMPT/mmpt/tasks/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +from .task import * +from .vlmtask import * +from .retritask import * + +try: + from .fairseqmmtask import * +except ImportError: + pass + +try: + from .milncetask import * +except ImportError: + pass + +try: + from .expretritask import * +except ImportError: + pass diff --git a/examples/MMPT/mmpt/tasks/fairseqmmtask.py b/examples/MMPT/mmpt/tasks/fairseqmmtask.py new file mode 100644 index 0000000000..f6b6115a39 --- /dev/null +++ b/examples/MMPT/mmpt/tasks/fairseqmmtask.py @@ -0,0 +1,104 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +make a general fairseq task for MM pretraining. +""" + +import random + +from fairseq.tasks import LegacyFairseqTask, register_task + +from .task import Task +from .retritask import RetriTask +from ..datasets import FairseqMMDataset +from .. 
import utils + + +@register_task("mmtask") +class FairseqMMTask(LegacyFairseqTask): + @staticmethod + def add_args(parser): + # Add some command-line arguments for specifying where the data is + # located and the maximum supported input length. + parser.add_argument( + "taskconfig", + metavar="FILE", + help=("taskconfig to load all configurations" "outside fairseq parser."), + ) + + @classmethod + def setup_task(cls, args, **kwargs): + return FairseqMMTask(args) + + def __init__(self, args): + super().__init__(args) + config = utils.load_config(args) + self.mmtask = Task.config_task(config) + self.mmtask.build_dataset() + self.mmtask.build_model() + self.mmtask.build_loss() + + def load_dataset(self, split, **kwargs): + split_map = { + "train": self.mmtask.train_data, + "valid": self.mmtask.val_data, + "test": self.mmtask.test_data, + } + if split not in split_map: + raise ValueError("unknown split type.") + if split_map[split] is not None: + self.datasets[split] = FairseqMMDataset(split_map[split]) + + def get_batch_iterator( + self, + dataset, + max_tokens=None, + max_sentences=None, + max_positions=None, + ignore_invalid_inputs=False, + required_batch_size_multiple=1, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=1, + data_buffer_size=0, + disable_iterator_cache=False, + skip_remainder_batch=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, + ): + random.seed(epoch) + if dataset.mmdataset.split == "train" and isinstance(self.mmtask, RetriTask): + if epoch >= self.mmtask.config.retri_epoch: + if not hasattr(self.mmtask, "retri_dataloader"): + self.mmtask.build_dataloader() + self.mmtask.retrive_candidates(epoch) + + return super().get_batch_iterator( + dataset, + max_tokens, + max_sentences, + max_positions, + ignore_invalid_inputs, + required_batch_size_multiple, + seed, + num_shards, + shard_id, + num_workers, + epoch, + data_buffer_size, + disable_iterator_cache, + grouped_shuffling, + update_epoch_batch_itr, + ) + + @property + def source_dictionary(self): + return None + + @property + def target_dictionary(self): + return None diff --git a/examples/MMPT/mmpt/tasks/milncetask.py b/examples/MMPT/mmpt/tasks/milncetask.py new file mode 100644 index 0000000000..61b6ab0597 --- /dev/null +++ b/examples/MMPT/mmpt/tasks/milncetask.py @@ -0,0 +1,27 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from .task import Task + + +class MILNCETask(Task): + def reshape_subsample(self, sample): + if ( + hasattr(self.config.dataset, "subsampling") + and self.config.dataset.subsampling is not None + and self.config.dataset.subsampling > 1 + ): + for key in sample: + if torch.is_tensor(sample[key]): + tensor = self.flat_subsample(sample[key]) + if key in ["caps", "cmasks"]: + size = tensor.size() + batch_size = size[0] * size[1] + expanded_size = (batch_size,) + size[2:] + tensor = tensor.view(expanded_size) + sample[key] = tensor + return sample diff --git a/examples/MMPT/mmpt/tasks/retritask.py b/examples/MMPT/mmpt/tasks/retritask.py new file mode 100644 index 0000000000..b43f20fddb --- /dev/null +++ b/examples/MMPT/mmpt/tasks/retritask.py @@ -0,0 +1,253 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
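+
+# RetriTask flow (summary): from `retri_epoch` onwards, the fairseq task
+# re-embeds all training clips with the current model, retrieves
+# nearest-neighbor videos for sampled anchors, and rewrites the meta
+# processor's candidate batches so each batch groups retrieved videos,
+# roughly:
+#     task.build_dataloader()
+#     task.retrive_candidates(epoch)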
+import os +import torch +import pickle +import random + +from tqdm import tqdm +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from ..processors import ( + ShardedHow2MetaProcessor, + ShardedVideoProcessor, + ShardedTextProcessor, + VariedLenAligner, +) + +from ..datasets import MMDataset +from .task import Task +from ..modules import vectorpool +from ..evaluators.predictor import Predictor +from ..utils import set_seed, get_local_rank, get_world_size + + +class RetriTask(Task): + """abstract class for task with retrival.""" + + def reshape_subsample(self, sample): + for key in sample: + if torch.is_tensor(sample[key]): + sample[key] = self.flat_subsample(sample[key]) + return sample + + def flat_subsample(self, tensor): + if tensor.size(0) == 1: + tensor = tensor.squeeze(0) + return tensor + + def build_dataloader(self): + """called by `get_batch_iterator` in fairseqmmtask. """ + # TODO: hard-code dataloader for retri for now and configurable in .yaml. + # reuse the `train.lst`. + self.config.dataset.split = "train" + meta_processor = ShardedHow2MetaProcessor(self.config.dataset) + video_processor = ShardedVideoProcessor(self.config.dataset) + text_processor = ShardedTextProcessor(self.config.dataset) + + aligner = VariedLenAligner(self.config.dataset) + aligner.subsampling = self.config.dataset.clip_per_video + + self.retri_data = MMDataset( + meta_processor, video_processor, text_processor, aligner + ) + + retri_sampler = DistributedSampler(self.retri_data) + infer_scale = 16 + batch_size = self.config.dataset.num_video_per_batch \ + * infer_scale + + self.retri_dataloader = DataLoader( + self.retri_data, + collate_fn=self.retri_data.collater, + batch_size=batch_size, + shuffle=False, + sampler=retri_sampler, + num_workers=self.config.fairseq.dataset.num_workers + ) + return self.retri_dataloader + + def retrive_candidates(self, epoch, dataloader=None): + if get_local_rank() == 0: + print("running retrieval model.") + out_dir = os.path.join( + self.config.fairseq.checkpoint.save_dir, "retri") + os.makedirs(out_dir, exist_ok=True) + + if not os.path.isfile( + os.path.join( + out_dir, "batched_e" + str(epoch) + "_videos0.pkl") + ): + if dataloader is None: + dataloader = self.retri_dataloader + + self.model.eval() + self.model.is_train = False + + assert self.retri_data.meta_processor.data == \ + self.train_data.meta_processor.data # video_ids not mutated. + + self._retri_predict(epoch, dataloader) + + self.model.train() + self.model.is_train = True + + torch.distributed.barrier() + output = self._retri_sync(epoch, out_dir) + torch.distributed.barrier() + self.train_data.meta_processor.set_candidates(output) + return output + + +class VideoRetriTask(RetriTask): + """RetriTask on video level.""" + + def reshape_subsample(self, sample): + if ( + hasattr(self.config.dataset, "clip_per_video") + and self.config.dataset.clip_per_video is not None + and self.config.dataset.clip_per_video > 1 + ): + for key in sample: + if torch.is_tensor(sample[key]): + sample[key] = self.flat_subsample(sample[key]) + return sample + + def flat_subsample(self, tensor): + if tensor.size(0) == 1: + tensor = tensor.squeeze(0) + return Task.flat_subsample(self, tensor) + + def _retri_predict(self, epoch, dataloader): + set_seed(epoch) + # save for retrival. + predictor = VideoPredictor(self.config) + predictor.predict_loop( + self.model, dataloader) + set_seed(epoch) # get the same text clips. + # retrival. 
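+        # Query the index built above: sample anchor video ids, retrieve
+        # `num_cands` neighbors per anchor, and dump per-rank candidate
+        # batches to pickle files that `_retri_sync` merges afterwards.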
+ retri_predictor = VideoRetriPredictor( + self.config) + retri_predictor.predict_loop( + self.model, predictor.vecpool.retriver, epoch) + del predictor + del retri_predictor + + def _retri_sync(self, epoch, out_dir): + # gpu do the same merge. + batched_videos = [] + for local_rank in range(get_world_size()): + fn = os.path.join( + out_dir, + "batched_e" + str(epoch) + "_videos" + str(local_rank) + ".pkl") + with open(fn, "rb") as fr: + batched_videos.extend(pickle.load(fr)) + print( + "[INFO] batched_videos", + len(batched_videos), len(batched_videos[0])) + return batched_videos + + +class VideoPredictor(Predictor): + def __init__(self, config): + vectorpool_cls = getattr(vectorpool, config.vectorpool_cls) + self.vecpool = vectorpool_cls(config) + + def predict_loop( + self, + model, + dataloader, + early_stop=-1, + ): + with torch.no_grad(): + if get_local_rank() == 0: + dataloader = tqdm(dataloader) + for batch_idx, batch in enumerate(dataloader): + if batch_idx == early_stop: + break + self(batch, model) + return self.finalize() + + def __call__(self, sample, model, **kwargs): + param = next(model.parameters()) + dtype = param.dtype + device = param.device + subsample = sample["vfeats"].size(1) + sample = self.to_ctx(sample, device, dtype) + for key in sample: + if torch.is_tensor(sample[key]): + size = sample[key].size() + if len(size) >= 2: + batch_size = size[0] * size[1] + expanded_size = ( + (batch_size,) + size[2:] if len(size) > 2 + else (batch_size,) + ) + sample[key] = sample[key].view(expanded_size) + + outputs = model(**sample) + sample.update(outputs) + self.vecpool(sample, subsample) + + def finalize(self): + print("[INFO]", self.vecpool) + if not self.vecpool.retriver.db.is_trained: + self.vecpool.retriver.finalize_training() + return self.vecpool.retriver + + +class VideoRetriPredictor(Predictor): + """ + Online Retrieval Predictor for Clips (used by RetriTask). + TODO: merge this with VisPredictor? + """ + + def __init__(self, config): + self.pred_dir = os.path.join( + config.fairseq.checkpoint.save_dir, + "retri") + self.num_cands = config.num_cands + self.num_video_per_batch = config.dataset.num_video_per_batch + + def predict_loop( + self, + model, + retriver, + epoch, + early_stop=-1 + ): + # a fake loop that only try to recover video vector + # from video_id. + batched_videos = [] + # obtain available video_ids. + video_ids = list(retriver.videoid_to_vectoridx.keys()) + + dataloader = random.sample( + video_ids, + len(video_ids) // self.num_video_per_batch + ) + + if get_local_rank() == 0: + dataloader = tqdm(dataloader) + for batch_idx, batch in enumerate(dataloader): + # batch is one video id. + if batch_idx == early_stop: + break + video_ids = retriver.search_by_video_ids( + [batch], self.num_cands)[0] + if len(video_ids) > self.num_video_per_batch: + # we moved the center to make cluster robust. + video_ids = random.sample(video_ids, self.num_video_per_batch) + batched_videos.append(video_ids) + return self.finalize(batched_videos, epoch) + + def finalize(self, batched_videos, epoch): + fn = os.path.join( + self.pred_dir, + "batched_e" + str(epoch) + "_videos" + str(get_local_rank()) + ".pkl") + with open(fn, "wb") as fw: + pickle.dump(batched_videos, fw, pickle.HIGHEST_PROTOCOL) + return batched_videos diff --git a/examples/MMPT/mmpt/tasks/task.py b/examples/MMPT/mmpt/tasks/task.py new file mode 100644 index 0000000000..8bb50f24df --- /dev/null +++ b/examples/MMPT/mmpt/tasks/task.py @@ -0,0 +1,184 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from .. import tasks +from .. import models +from .. import losses +from ..datasets import MMDataset +from .. import processors + + +class Task(object): + """ + A task refers to one generic training task (e.g., training one model). + """ + + @classmethod + def config_task(cls, config): + """ + determine whether to load a hard-coded task or config from a generic one. + via if a task string is available in config. + """ + if config.task is not None: + # TODO (huxu): expand the search scope. + task_cls = getattr(tasks, config.task) + return task_cls(config) + else: + return Task(config) + + def __init__(self, config): + self.config = config + self.train_data = None + self.val_data = None + self.test_data = None + + self.model = None + self.loss_fn = None + self.eval_fn = None + + def build_dataset(self): + """TODO (huxu): move processor breakdown to MMDataset.""" + """fill-in `self.train_data`, `self.val_data` and `self.test_data`.""" + + meta_processor_cls = getattr( + processors, self.config.dataset.meta_processor) + video_processor_cls = getattr( + processors, self.config.dataset.video_processor) + text_processor_cls = getattr( + processors, self.config.dataset.text_processor) + aligner_cls = getattr( + processors, self.config.dataset.aligner) + + if self.config.dataset.train_path is not None: + self.config.dataset.split = "train" + # may be used by meta processor. + # meta_processor controls different dataset. + meta_processor = meta_processor_cls(self.config.dataset) + video_processor = video_processor_cls(self.config.dataset) + text_processor = text_processor_cls(self.config.dataset) + aligner = aligner_cls(self.config.dataset) + self.train_data = MMDataset( + meta_processor, video_processor, text_processor, aligner + ) + print("train_len", len(self.train_data)) + output = self.train_data[0] + self.train_data.print_example(output) + if self.config.dataset.val_path is not None: + self.config.dataset.split = "valid" + # may be used by meta processor. + meta_processor = meta_processor_cls(self.config.dataset) + video_processor = video_processor_cls(self.config.dataset) + text_processor = text_processor_cls(self.config.dataset) + aligner = aligner_cls(self.config.dataset) + self.val_data = MMDataset( + meta_processor, video_processor, text_processor, aligner + ) + print("val_len", len(self.val_data)) + output = self.val_data[0] + self.val_data.print_example(output) + + if self.config.dataset.split == "test": + # the following is run via lauching fairseq-validate. 
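+            # Note: only the meta/video/text processors are rebuilt here; the
+            # `aligner` constructed in a branch above is reused for the test
+            # data.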
+ meta_processor = meta_processor_cls(self.config.dataset) + video_processor = video_processor_cls(self.config.dataset) + text_processor = text_processor_cls(self.config.dataset) + + self.test_data = MMDataset( + meta_processor, video_processor, text_processor, aligner + ) + print("test_len", len(self.test_data)) + output = self.test_data[0] + self.test_data.print_example(output) + + def build_model(self, checkpoint=None): + if self.model is None: + model_cls = getattr(models, self.config.model.model_cls) + self.model = model_cls(self.config) + if checkpoint is not None: + self.load_checkpoint(checkpoint) + return self.model + + def load_checkpoint(self, checkpoint): + if self.model is None: + raise ValueError("model is not initialized.") + state_dict = torch.load(checkpoint) + state_dict = self._trim_state_dict(state_dict) + self.model.load_state_dict(state_dict, strict=False) + # if it's a fp16 model, turn it back. + if next(self.model.parameters()).dtype == torch.float16: + self.model = self.model.float() + return self.model + + def _trim_state_dict(self, state_dict): + from collections import OrderedDict + + if "state_dict" in state_dict: + state_dict = state_dict["state_dict"] + if "model" in state_dict: # fairseq checkpoint format. + state_dict = state_dict["model"] + ret_state_dict = OrderedDict() + for ( + key, + value, + ) in state_dict.items(): + # remove fairseq wrapper since this is a task. + if key.startswith("mmmodel"): + key = key[len("mmmodel."):] + ret_state_dict[key] = value + return ret_state_dict + + def build_loss(self): + if self.loss_fn is None and self.config.loss is not None: + loss_cls = getattr(losses, self.config.loss.loss_cls) + self.loss_fn = loss_cls() + return self.loss_fn + + def flat_subsample(self, tensor): + size = tensor.size() + if len(size) >= 2: + batch_size = size[0] * size[1] + expanded_size = ( + (batch_size,) + size[2:] if len(size) > 2 + else (batch_size,) + ) + tensor = tensor.view(expanded_size) + return tensor + + def reshape_subsample(self, sample): + if ( + hasattr(self.config.dataset, "subsampling") + and self.config.dataset.subsampling is not None + and self.config.dataset.subsampling > 1 + ): + for key in sample: + if torch.is_tensor(sample[key]): + sample[key] = self.flat_subsample(sample[key]) + return sample + + def __call__(self, model, sample): + loss = None + loss_scalar = float("inf") + + sample = self.reshape_subsample(sample) + outputs = self.model(**sample) + sample.update(outputs) + if self.loss_fn is not None: + loss = self.loss_fn(**sample) + loss_scalar = loss.item() + + batch_size = sample["caps"].size(0) + sample_size = 1 + return { + "loss": loss, + "loss_scalar": loss_scalar, + "max_len": self.config.dataset.max_len, + "batch_size": batch_size, + "sample_size": sample_size, + } + + def build_dataloader(self): + """only used for trainer that lacks building loaders.""" + raise NotImplementedError diff --git a/examples/MMPT/mmpt/tasks/vlmtask.py b/examples/MMPT/mmpt/tasks/vlmtask.py new file mode 100644 index 0000000000..57dc4c9170 --- /dev/null +++ b/examples/MMPT/mmpt/tasks/vlmtask.py @@ -0,0 +1,27 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from .task import Task + + +class VLMTask(Task): + """A VLM task for reproducibility. + the collator split subsamples into two sub-batches. + This has should have no logic changes. + but changed the randomness in frame masking. 
+ """ + + def flat_subsample(self, tensor): + size = tensor.size() + if len(size) >= 2: + batch_size = size[0] * (size[1] // 2) + expanded_size = ( + (batch_size, 2) + size[2:] if len(size) > 2 + else (batch_size, 2) + ) + tensor = tensor.view(expanded_size) + tensor = torch.cat([tensor[:, 0], tensor[:, 1]], dim=0) + return tensor diff --git a/examples/MMPT/mmpt/utils/__init__.py b/examples/MMPT/mmpt/utils/__init__.py new file mode 100644 index 0000000000..2429ee3757 --- /dev/null +++ b/examples/MMPT/mmpt/utils/__init__.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import random +import numpy as np +import torch + +from .shardedtensor import * +from .load_config import * + + +def set_seed(seed=43211): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if torch.backends.cudnn.enabled: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +def get_world_size(): + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + return world_size + + +def get_local_rank(): + return torch.distributed.get_rank() \ + if torch.distributed.is_initialized() else 0 + + +def print_on_rank0(func): + local_rank = get_local_rank() + if local_rank == 0: + print("[INFO]", func) + + +class RetriMeter(object): + """ + Statistics on whether retrieval yields a better pair. + """ + def __init__(self, freq=1024): + self.freq = freq + self.total = 0 + self.replace = 0 + self.updates = 0 + + def __call__(self, data): + if isinstance(data, np.ndarray): + self.replace += data.shape[0] - int((data[:, 0] == -1).sum()) + self.total += data.shape[0] + elif torch.is_tensor(data): + self.replace += int(data.sum()) + self.total += data.size(0) + else: + raise ValueError("unsupported RetriMeter data type.", type(data)) + + self.updates += 1 + if get_local_rank() == 0 and self.updates % self.freq == 0: + print("[INFO]", self) + + def __repr__(self): + return "RetriMeter (" + str(self.replace / self.total) \ + + "/" + str(self.replace) + "/" + str(self.total) + ")" diff --git a/examples/MMPT/mmpt/utils/load_config.py b/examples/MMPT/mmpt/utils/load_config.py new file mode 100644 index 0000000000..ede4f94117 --- /dev/null +++ b/examples/MMPT/mmpt/utils/load_config.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
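+
+# Helpers for loading the project `.yaml` configs: `recursive_config` follows
+# the `includes:` field so configs can be stacked on top of each other (e.g.,
+# a task config including `projects/mfmmlm.yaml`), while `load_config` also
+# divides the fairseq batch size by `dataset.subsampling` and prepares the
+# save and tensorboard directories for non-test runs.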
+import os +import omegaconf +from omegaconf import OmegaConf + + +def load_config(args=None, config_file=None, overwrite_fairseq=False): + """TODO (huxu): move fairseq overwrite to another function.""" + if args is not None: + config_file = args.taskconfig + config = recursive_config(config_file) + + if config.dataset.subsampling is not None: + batch_size = config.fairseq.dataset.batch_size // config.dataset.subsampling + print( + "adjusting batch_size to {} due to subsampling {}.".format( + batch_size, config.dataset.subsampling + ) + ) + config.fairseq.dataset.batch_size = batch_size + + is_test = config.dataset.split is not None and config.dataset.split == "test" + if not is_test: + if ( + config.fairseq.checkpoint is None + or config.fairseq.checkpoint.save_dir is None + ): + raise ValueError("fairseq save_dir or save_path must be specified.") + + save_dir = config.fairseq.checkpoint.save_dir + os.makedirs(save_dir, exist_ok=True) + if config.fairseq.common.tensorboard_logdir is not None: + tb_run_dir = suffix_rundir( + save_dir, config.fairseq.common.tensorboard_logdir + ) + config.fairseq.common.tensorboard_logdir = tb_run_dir + print( + "update tensorboard_logdir as", config.fairseq.common.tensorboard_logdir + ) + os.makedirs(save_dir, exist_ok=True) + OmegaConf.save(config=config, f=os.path.join(save_dir, "config.yaml")) + + if overwrite_fairseq and config.fairseq is not None and args is not None: + # flatten fields. + for group in config.fairseq: + for field in config.fairseq[group]: + print("overwrite args." + field, "as", config.fairseq[group][field]) + setattr(args, field, config.fairseq[group][field]) + return config + + +def recursive_config(config_path): + """allows for stacking of configs in any depth.""" + config = OmegaConf.load(config_path) + if config.includes is not None: + includes = config.includes + config.pop("includes") + base_config = recursive_config(includes) + config = OmegaConf.merge(base_config, config) + return config + + +def suffix_rundir(save_dir, run_dir): + max_id = -1 + for search_dir in os.listdir(save_dir): + if search_dir.startswith(run_dir): + splits = search_dir.split("_") + cur_id = int(splits[1]) if len(splits) > 1 else 0 + max_id = max(max_id, cur_id) + return os.path.join(save_dir, run_dir + "_" + str(max_id + 1)) + + +def overwrite_dir(config, replace, basedir): + for key in config: + if isinstance(config[key], str) and config[key].startswith(basedir): + config[key] = config[key].replace(basedir, replace) + if isinstance(config[key], omegaconf.dictconfig.DictConfig): + overwrite_dir(config[key], replace, basedir) diff --git a/examples/MMPT/mmpt/utils/shardedtensor.py b/examples/MMPT/mmpt/utils/shardedtensor.py new file mode 100644 index 0000000000..2424f360ef --- /dev/null +++ b/examples/MMPT/mmpt/utils/shardedtensor.py @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
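+
+# A ShardedTensor packs a list of variable-length numpy arrays into a single
+# flat `data` array plus a `starts` offset array, where shard i is
+# data[starts[i]:starts[i + 1]]. `save`/`load` write and read two .npy files,
+# so packed features can be memory-mapped (`mmap_mode`) instead of reading
+# millions of small files.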
+import os +import pickle +import numpy as np + + +class ShardedTensor(object): + def __init__(self, data, starts): + self.data = data + self.starts = starts + assert self.starts[0] == 0 + assert self.starts[-1] == len(self.data) + assert (self.starts[1:] >= self.starts[:-1]).all() + assert (self.starts > -1).all() + + @staticmethod + def from_list(xs): + starts = np.full((len(xs) + 1,), -1, dtype=np.long) + data = np.concatenate(xs, axis=0) + starts[0] = 0 + for i, x in enumerate(xs): + starts[i + 1] = starts[i] + x.shape[0] + assert (starts > -1).all() + return ShardedTensor(data, starts) + + def __getitem__(self, i): + return self.data[self.starts[i] : self.starts[i + 1]] + + def __len__(self): + return len(self.starts) - 1 + + def lengths(self): + return self.starts[1:] - self.starts[:-1] + + def save(self, path): + np.save(path + "_starts", self.starts) + np.save(path + "_data", self.data) + + @staticmethod + def load(path, mmap_mode=None): + starts = np.load(path + "_starts.npy", mmap_mode) + data = np.load(path + "_data.npy", mmap_mode) + return ShardedTensor(data, starts) diff --git a/examples/MMPT/mmpt_cli/localjob.py b/examples/MMPT/mmpt_cli/localjob.py new file mode 100644 index 0000000000..2675d3511a --- /dev/null +++ b/examples/MMPT/mmpt_cli/localjob.py @@ -0,0 +1,117 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os + +from mmpt.utils import recursive_config + + +class BaseJob(object): + def __init__(self, yaml_file, dryrun=False): + self.yaml_file = yaml_file + self.config = recursive_config(yaml_file) + self.dryrun = dryrun + + def submit(self, **kwargs): + raise NotImplementedError + + def _normalize_cmd(self, cmd_list): + cmd_list = list(cmd_list) + yaml_index = cmd_list.index("[yaml]") + cmd_list[yaml_index] = self.yaml_file + return cmd_list + + +class LocalJob(BaseJob): + + CMD_CONFIG = { + "local_single": [ + "fairseq-train", "[yaml]", "--user-dir", "mmpt", + "--task", "mmtask", "--arch", "mmarch", + "--criterion", "mmloss", + ], + "local_small": [ + "fairseq-train", "[yaml]", "--user-dir", "mmpt", + "--task", "mmtask", "--arch", "mmarch", + "--criterion", "mmloss", + "--distributed-world-size", "2" + ], + "local_big": [ + "fairseq-train", "[yaml]", "--user-dir", "mmpt", + "--task", "mmtask", "--arch", "mmarch", + "--criterion", "mmloss", + "--distributed-world-size", "8" + ], + "local_predict": ["python", "mmpt_cli/predict.py", "[yaml]"], + } + + def __init__(self, yaml_file, job_type=None, dryrun=False): + super().__init__(yaml_file, dryrun) + if job_type is None: + self.job_type = "local_single" + if self.config.task_type is not None: + self.job_type = self.config.task_type + else: + self.job_type = job_type + if self.job_type in ["local_single", "local_small"]: + if self.config.fairseq.dataset.batch_size > 32: + print("decreasing batch_size to 32 for local testing?") + + def submit(self): + cmd_list = self._normalize_cmd(LocalJob.CMD_CONFIG[self.job_type]) + if "predict" not in self.job_type: + # append fairseq args. + from mmpt.utils import load_config + + config = load_config(config_file=self.yaml_file) + for field in config.fairseq: + for key in config.fairseq[field]: + if key in ["fp16", "reset_optimizer", "reset_dataloader", "reset_meters"]: # a list of binary flag. 
+ param = ["--" + key.replace("_", "-")] + else: + if key == "lr": + value = str(config.fairseq[field][key][0]) + elif key == "adam_betas": + value = "'"+str(config.fairseq[field][key])+"'" + else: + value = str(config.fairseq[field][key]) + param = [ + "--" + key.replace("_", "-"), + value + ] + cmd_list.extend(param) + + print("launching", " ".join(cmd_list)) + if not self.dryrun: + os.system(" ".join(cmd_list)) + return JobStatus("12345678") + + +class JobStatus(object): + def __init__(self, job_id): + self.job_id = job_id + + def __repr__(self): + return self.job_id + + def __str__(self): + return self.job_id + + def done(self): + return False + + def running(self): + return False + + def result(self): + if self.done(): + return "{} is done.".format(self.job_id) + else: + return "{} is running.".format(self.job_id) + + def stderr(self): + return self.result() + + def stdout(self): + return self.result() diff --git a/examples/MMPT/mmpt_cli/predict.py b/examples/MMPT/mmpt_cli/predict.py new file mode 100644 index 0000000000..4071e196d2 --- /dev/null +++ b/examples/MMPT/mmpt_cli/predict.py @@ -0,0 +1,113 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os +import glob +import argparse +import pprint +import omegaconf + +from omegaconf import OmegaConf +from torch.utils.data import DataLoader + +from mmpt.utils import load_config, set_seed +from mmpt.evaluators import Evaluator +from mmpt.evaluators import predictor as predictor_path +from mmpt.tasks import Task +from mmpt import processors +from mmpt.datasets import MMDataset + + +def get_dataloader(config): + meta_processor_cls = getattr(processors, config.dataset.meta_processor) + video_processor_cls = getattr(processors, config.dataset.video_processor) + text_processor_cls = getattr(processors, config.dataset.text_processor) + aligner_cls = getattr(processors, config.dataset.aligner) + + meta_processor = meta_processor_cls(config.dataset) + video_processor = video_processor_cls(config.dataset) + text_processor = text_processor_cls(config.dataset) + aligner = aligner_cls(config.dataset) + + test_data = MMDataset( + meta_processor, + video_processor, + text_processor, + aligner, + ) + print("test_len", len(test_data)) + output = test_data[0] + test_data.print_example(output) + + test_dataloader = DataLoader( + test_data, + batch_size=config.fairseq.dataset.batch_size, + shuffle=False, + num_workers=6, + collate_fn=test_data.collater, + ) + return test_dataloader + + +def main(args): + config = load_config(args) + + if isinstance(config, omegaconf.dictconfig.DictConfig): + print(OmegaConf.to_yaml(config)) + else: + pp = pprint.PrettyPrinter(indent=4) + pp.print(config) + + mmtask = Task.config_task(config) + mmtask.build_model() + + test_dataloader = get_dataloader(config) + checkpoint_search_path = os.path.dirname(config.eval.save_path) + results = [] + + prefix = os.path.basename(args.taskconfig) + if prefix.startswith("test"): + # loop all checkpoint for datasets without validation set. 
+            if "best" not in config.fairseq.common_eval.path:
+                print("eval each epoch.")
+                for checkpoint in glob.glob(checkpoint_search_path + "/checkpoint*"):
+                    model = mmtask.load_checkpoint(checkpoint)
+                    ckpt = os.path.basename(checkpoint)
+                    evaluator = Evaluator(config)
+                    output = evaluator.evaluate(
+                        model, test_dataloader, ckpt + "_merged")
+                    results.append((checkpoint, output))
+        # use the one specified by the config lastly.
+        model = mmtask.load_checkpoint(config.fairseq.common_eval.path)
+        evaluator = Evaluator(config)
+        output = evaluator.evaluate(model, test_dataloader)
+        results.append((config.fairseq.common_eval.path, output))
+
+        best_result = None
+        best_metric = 0.
+        for checkpoint, result in results:
+            print(checkpoint)
+            evaluator.metric.print_computed_metrics(result)
+            best_score = evaluator.metric.best_metric(result)
+            if best_score > best_metric:
+                best_result = (checkpoint, result)
+                best_metric = best_score
+        print("best results:")
+        print(best_result[0])
+        evaluator.metric.print_computed_metrics(best_result[1])
+
+    elif prefix.startswith("vis"):
+        model = mmtask.load_checkpoint(config.fairseq.common_eval.path)
+        predictor_cls = getattr(predictor_path, config.predictor)
+        predictor = predictor_cls(config)
+        predictor.predict_loop(model, test_dataloader, mmtask, None)
+    else:
+        raise ValueError("unknown prefix of the config file", args.taskconfig)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("taskconfig", type=str)
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/MMPT/pretraining.md b/examples/MMPT/pretraining.md
new file mode 100644
index 0000000000..8f8e6d0fac
--- /dev/null
+++ b/examples/MMPT/pretraining.md
@@ -0,0 +1,29 @@
+# Pretraining
+
+(If you are new to the ideas of `mmpt.processors`, see [README](README.md) first.)
+We mostly use the [howto100M](https://github.com/antoine77340/howto100m) dataset for pretraining (other datasets are coming), so you are less likely to write a new `MetaProcessor`, `VideoProcessor` or `TextProcessor` and will mostly work on a new `Aligner`, a new model and a new loss.
+
+### Data Sharding
+Pretraining on Howto100M is heavy on IO since we have millions of videos and captions on disk that cannot fit into memory.
+It is desirable to have an optimized preprocessing step before the actual dataloading.
+
+We support data sharding to pack multiple videos into shards of training data for both videos and captions (see [dataset](DATASET.md) for preprocessing).
+These shards are mapped into memory to reduce the frequency of IO access on millions of files (see the processors starting with `Sharded*`).
+This is the default config for the how2 dataset, `projects/task/how2.yaml`.
+
+Many thanks to Dmytro Okhonko for sharing the code from the MARGE project.
+
+### Training
+Pretraining on Howto100M is expected to run on one or multiple nodes, where each node has 8 GPUs with 32 GB of memory.
+Launching a pretraining run on MFM+MLM can be done via:
+```python locallaunch.py projects/mfmmlm/how2.yaml```
+
+### Pre-training with a Retrieval Model (VideoCLIP)
+This project now supports alternating between running a retrieval model and pre-training.
+We implement a basic retrieval model built on faiss and the hidden states of each video.
+
+You may need to install faiss via `conda install faiss-cpu -c pytorch`.
+
+Right now, the hidden state of a video is computed as the average of the pooled visual/text hidden states of its 8 clips.
+See `mmpt/tasks/retritask.py` for more details.
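+
+Below is a minimal, illustrative sketch of this retrieval step (the array shapes, variable names and the exact faiss index type are assumptions made for illustration; the actual vector pool and retriever are the `VideoVectorPool` and `VectorRetriever` classes selected by the pre-training `.yaml`):
+
+```python
+import numpy as np
+import faiss  # installed via `conda install faiss-cpu -c pytorch`
+
+# Toy shapes: 1000 videos, 8 clips per video, 768-d pooled hidden states.
+clip_hidden = np.random.rand(1000, 8, 768).astype("float32")
+video_vecs = clip_hidden.mean(axis=1)           # one vector per video
+faiss.normalize_L2(video_vecs)                  # cosine similarity via inner product
+
+index = faiss.IndexFlatIP(video_vecs.shape[1])  # exact inner-product index
+index.add(video_vecs)
+
+# Retrieve candidate videos for each video (cf. `num_cands: 64` in the config);
+# in the actual task the retrieved video ids are what the retrieval predictor
+# batches together for the next round of pre-training.
+scores, cand_ids = index.search(video_vecs, 64)
+```
+
+The knobs for this step (`retri_epoch`, `vectorpool_cls`, `retriever_cls`, `num_cands`) are set in the pre-training config.
+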
+The `.yaml` config for running pre-training with a retrieval model can be found at `projects/retri/videoretri.yaml`. diff --git a/examples/MMPT/projects/mfmmlm.yaml b/examples/MMPT/projects/mfmmlm.yaml new file mode 100644 index 0000000000..0f3450a1e0 --- /dev/null +++ b/examples/MMPT/projects/mfmmlm.yaml @@ -0,0 +1,59 @@ +project_dir: mfmmlm +run_task: + - how2.yaml + - [vtt.yaml, vttcap.yaml, vttqa.yaml, youcook.yaml, youcookcap.yaml, crosstask.yaml, coin.yaml] +base_dir: task +task_group: + pretrain: + task_list: + - how2.yaml + dataset: + subsampling: 32 + sampled_min_len: 10 + sampled_max_len: 64 + max_video_len: 32 + max_len: 96 + aligner: MFMMLMAligner + lazy_vfeat_mask: True + mfm_probability: 0.15 + mlm_probability: 0.15 + mm_prob: 0.5 + model: + model_cls: MMFusionMFMMLM + mm_encoder_cls: MMFusionForMFMMLM + loss: + loss_cls: MFMMLM + fairseq: + common: + fp16: true + dataset: + batch_size: 256 + optimization: + max_epoch: 15 + finetune: + task_list: + - vtt.yaml + - vttqa.yaml + - youcook.yaml + - youcookcap.yaml + - crosstask.yaml + - coin.yaml + dataset: + max_video_len: 32 + max_len: 96 + fairseq: + common: + fp16: true + # do not write any model or loss here (they are expected to be fixed in mmfusion). + test: + task_list: + - test_vtt.yaml + - test_vttqa.yaml + - test_youcook.yaml + - test_youcookcap.yaml + - test_crosstask.yaml + - test_crosstask_zs.yaml + - test_coin.yaml + dataset: + max_video_len: 32 + max_len: 96 diff --git a/examples/MMPT/projects/mtm/mmfusionmtm.yaml b/examples/MMPT/projects/mtm/mmfusionmtm.yaml new file mode 100644 index 0000000000..337d66a2aa --- /dev/null +++ b/examples/MMPT/projects/mtm/mmfusionmtm.yaml @@ -0,0 +1,19 @@ +includes: projects/mfmmlm.yaml +project_dir: mtm/mmfusionmtm +task_group: + pretrain: + task: VLMTask # reproducible + dataset: + aligner: MFMMLMAligner + model: + use_seg_emb: True # reproducible + model_cls: MMFusionMTM + mm_encoder_cls: MMBertForMFMMLM + loss: + loss_cls: MTM + finetune: + model: + use_seg_emb: True # reproducible + test: + model: + use_seg_emb: True # reproducible diff --git a/examples/MMPT/projects/mtm/vlm.yaml b/examples/MMPT/projects/mtm/vlm.yaml new file mode 100644 index 0000000000..022a2623c5 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm.yaml @@ -0,0 +1,8 @@ +includes: projects/mtm/mmfusionmtm.yaml +project_dir: mtm/vlm +task_group: + pretrain: + dataset: + sampled_min_len: 8 + loss: + loss_cls: MTM diff --git a/examples/MMPT/projects/mtm/vlm/coin.yaml b/examples/MMPT/projects/mtm/vlm/coin.yaml new file mode 100644 index 0000000000..48fd64a5f4 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/coin.yaml @@ -0,0 +1,47 @@ +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased + meta_processor: COINActionSegmentationMetaProcessor + train_path: data/coin/COIN.json + val_path: data/coin/COIN.json + vfeat_dir: data/feat/feat_coin_s3d + text_processor: COINActionSegmentationTextProcessor + aligner: COINActionSegmentationAligner + num_iso_layer: 12 + sliding_window: 8 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 1 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 8 + checkpoint: + restore_file: runs/mtm/vlm/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: 
true + reset_meters: true + save_dir: runs/mtm/vlm/coin +task_type: sweep_big +model: + model_cls: MMFusionActionSegmentation + mm_encoder_cls: MMBertForTokenClassification + use_seg_emb: true +loss: + loss_cls: CrossEntropy diff --git a/examples/MMPT/projects/mtm/vlm/crosstask.yaml b/examples/MMPT/projects/mtm/vlm/crosstask.yaml new file mode 100644 index 0000000000..4e706b549e --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/crosstask.yaml @@ -0,0 +1,53 @@ +dataset: + video_processor: CrossTaskVideoProcessor + bert_name: bert-base-uncased + meta_processor: CrossTaskMetaProcessor + train_path: data/crosstask/crosstask_release/videos.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + text_processor: CrossTaskTextProcessor + aligner: CrossTaskAligner + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 1 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 5 + checkpoint: + restore_file: runs/mtm/vlm/checkpoint11.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/mtm/vlm/crosstask +task_type: sweep_small +model: + model_cls: MMFusionActionLocalization + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +loss: + loss_cls: BCE diff --git a/examples/MMPT/projects/mtm/vlm/how2.yaml b/examples/MMPT/projects/mtm/vlm/how2.yaml new file mode 100644 index 0000000000..7ca40ad815 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/how2.yaml @@ -0,0 +1,55 @@ +dataset: + video_processor: ShardedVideoProcessor + bert_name: bert-base-uncased + meta_processor: ShardedHow2MetaProcessor + train_path: data/how2/how2_s3d_train.lst + val_path: data/how2/how2_s3d_val.lst + vfeat_dir: data/feat/feat_how2_s3d_shard_small + text_processor: ShardedTextProcessor + tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased. 
+ aligner: MFMMLMAligner + subsampling: 32 + sampled_min_len: 8 + sampled_max_len: 64 + max_video_len: 32 + max_len: 96 + lazy_vfeat_mask: true + mfm_probability: 0.15 + mlm_probability: 0.15 + mm_prob: 0.5 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 256 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 1000 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 15 + checkpoint: + save_dir: runs/mtm/vlm + save_interval_updates: 1024 + keep_interval_updates: 2 + keep_last_epochs: 30 +task_type: sweep_big +slurm_config: big +eval: + save_path: runs/mtm/vlm +model: + model_cls: MMFusionMTM + mm_encoder_cls: MMBertForMFMMLM + use_seg_emb: true +loss: + loss_cls: MTM +task: VLMTask diff --git a/examples/MMPT/projects/mtm/vlm/test_coin.yaml b/examples/MMPT/projects/mtm/vlm/test_coin.yaml new file mode 100644 index 0000000000..8df2e66ad1 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_coin.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: COINActionSegmentationAligner + bert_name: bert-base-uncased + test_path: data/coin/COIN.json + meta_processor: COINActionSegmentationMetaProcessor + vfeat_dir: data/feat/feat_coin_s3d + text_processor: COINActionSegmentationTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/coin/checkpoint_best.pt +model: + model_cls: MMFusionActionSegmentation + mm_encoder_cls: MMBertForTokenClassification + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/coin/eval +metric: COINActionSegmentationMetric +predictor: COINPredictor diff --git a/examples/MMPT/projects/mtm/vlm/test_crosstask.yaml b/examples/MMPT/projects/mtm/vlm/test_crosstask.yaml new file mode 100644 index 0000000000..d159847875 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_crosstask.yaml @@ -0,0 +1,38 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: CrossTaskVideoProcessor + aligner: CrossTaskAligner + bert_name: bert-base-uncased + meta_processor: CrossTaskMetaProcessor + test_path: data/crosstask/crosstask_release/videos_val.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + text_processor: CrossTaskTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/crosstask/checkpoint_best.pt +model: + model_cls: MMFusionActionLocalization + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/crosstask/eval +metric: CrossTaskMetric +predictor: CrossTaskPredictor diff --git a/examples/MMPT/projects/mtm/vlm/test_crosstask_zs.yaml b/examples/MMPT/projects/mtm/vlm/test_crosstask_zs.yaml new file mode 100644 index 
0000000000..59833c5540 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_crosstask_zs.yaml @@ -0,0 +1,38 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: CrossTaskVideoProcessor + aligner: CrossTaskAligner + bert_name: bert-base-uncased + meta_processor: CrossTaskMetaProcessor + test_path: data/crosstask/crosstask_release/videos_val.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + text_processor: CrossTaskTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/checkpoint_best.pt +model: + model_cls: MMFusionActionLocalization + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/crosstask_zs/eval +metric: CrossTaskMetric +predictor: CrossTaskPredictor diff --git a/examples/MMPT/projects/mtm/vlm/test_vtt.yaml b/examples/MMPT/projects/mtm/vlm/test_vtt.yaml new file mode 100644 index 0000000000..a41557df6a --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_vtt.yaml @@ -0,0 +1,29 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: DSAligner + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/vtt/checkpoint_last.pt +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/vtt/eval +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/mtm/vlm/test_vttqa.yaml b/examples/MMPT/projects/mtm/vlm/test_vttqa.yaml new file mode 100644 index 0000000000..abf3309f70 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_vttqa.yaml @@ -0,0 +1,29 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: MSRVTTQAAligner + bert_name: bert-base-uncased + meta_processor: MSRVTTQAMetaProcessor + test_path: data/msrvtt-qa/MSR_MC_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTQATextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/vttqa/checkpoint_last.pt +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/vttqa/eval +metric: QAMetric +predictor: QAPredictor diff --git a/examples/MMPT/projects/mtm/vlm/test_youcook.yaml b/examples/MMPT/projects/mtm/vlm/test_youcook.yaml new file mode 100644 index 0000000000..3a57d25c24 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_youcook.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: YoucookVideoProcessor + aligner: DSAligner + 
bert_name: bert-base-uncased + meta_processor: YoucookMetaProcessor + test_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: true + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: TextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/youcook/checkpoint_last.pt +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/youcook/eval +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/mtm/vlm/test_youcookcap.yaml b/examples/MMPT/projects/mtm/vlm/test_youcookcap.yaml new file mode 100644 index 0000000000..b2595d7c3c --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/test_youcookcap.yaml @@ -0,0 +1,32 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: YoucookVideoProcessor + aligner: DSNLGAligner + bert_name: bert-base-uncased + meta_processor: YoucookNLGMetaProcessor + test_path: data/youcook/val_list.txt + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: NLGTextProcessor + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/mtm/vlm/youcookcap/checkpoint_best.pt +model: + model_cls: MMFusionNLG + mm_encoder_cls: MMBertForNLG + max_decode_length: 24 + use_seg_emb: true +eval: + save_path: runs/mtm/vlm/youcookcap/eval +metric: NLGMetric +predictor: NLGPredictor +gen_param: + num_beams: 5 diff --git a/examples/MMPT/projects/mtm/vlm/vtt.yaml b/examples/MMPT/projects/mtm/vlm/vtt.yaml new file mode 100644 index 0000000000..c6c5b1ab40 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/vtt.yaml @@ -0,0 +1,49 @@ +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + train_path: data/msrvtt/MSRVTT_train.csv + jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + full_test_path: data/msrvtt/MSRVTT_FULL_test.csv + dup: 20 + val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + json_path: data/msrvtt/MSRVTT_data.json + aligner: DSAligner + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 256 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 10 + checkpoint: + restore_file: runs/mtm/vlm/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/mtm/vlm/vtt +task_type: sweep_small +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +loss: + loss_cls: T2VContraLoss diff --git a/examples/MMPT/projects/mtm/vlm/vttqa.yaml b/examples/MMPT/projects/mtm/vlm/vttqa.yaml new file mode 100644 index 0000000000..0a440c7dd2 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/vttqa.yaml @@ -0,0 +1,47 @@ +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + train_path: data/msrvtt/MSRVTT_train.csv + dup: 20 + val_path: 
data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + json_path: data/msrvtt/MSRVTT_data.json + aligner: DSAligner + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 128 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 5 + checkpoint: + restore_file: runs/mtm/vlm/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/mtm/vlm/vttqa +task_type: sweep_small +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +loss: + loss_cls: V2TContraLoss diff --git a/examples/MMPT/projects/mtm/vlm/youcook.yaml b/examples/MMPT/projects/mtm/vlm/youcook.yaml new file mode 100644 index 0000000000..9ee82b81b8 --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/youcook.yaml @@ -0,0 +1,47 @@ +dataset: + video_processor: YoucookVideoProcessor + bert_name: bert-base-uncased + meta_processor: YoucookMetaProcessor + train_path: data/youcook/youcook_train.pkl + val_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: true + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: TextProcessor + aligner: DSAligner + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 128 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 10 + checkpoint: + restore_file: runs/mtm/vlm/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/mtm/vlm/youcook +task_type: sweep_small +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint + use_seg_emb: true +loss: + loss_cls: T2VContraLoss diff --git a/examples/MMPT/projects/mtm/vlm/youcookcap.yaml b/examples/MMPT/projects/mtm/vlm/youcookcap.yaml new file mode 100644 index 0000000000..d29dfad5cd --- /dev/null +++ b/examples/MMPT/projects/mtm/vlm/youcookcap.yaml @@ -0,0 +1,45 @@ +dataset: + video_processor: YoucookVideoProcessor + bert_name: bert-base-uncased + meta_processor: YoucookNLGMetaProcessor + train_path: data/youcook/train_list.txt + val_path: data/youcook/val_list.txt + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: NLGTextProcessor + aligner: DSNLGAligner + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 128 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 10 + checkpoint: + restore_file: runs/mtm/vlm/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/mtm/vlm/youcookcap +task_type: sweep_small +model: + model_cls: MMFusionNLG + mm_encoder_cls: MMBertForNLG + use_seg_emb: true 
+loss: + loss_cls: NLGLoss diff --git a/examples/MMPT/projects/retri/videoclip.yaml b/examples/MMPT/projects/retri/videoclip.yaml new file mode 100644 index 0000000000..afd040ab05 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip.yaml @@ -0,0 +1,10 @@ +includes: projects/retri/videoretri.yaml +project_dir: retri/videoclip +task_group: + pretrain: + model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 diff --git a/examples/MMPT/projects/retri/videoclip/coin_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/coin_videoclip.yaml new file mode 100644 index 0000000000..aaed5e47f6 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/coin_videoclip.yaml @@ -0,0 +1,49 @@ +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased + meta_processor: COINActionSegmentationMetaProcessor + train_path: data/coin/COIN.json + val_path: data/coin/COIN.json + vfeat_dir: data/feat/feat_coin_s3d + text_processor: COINActionSegmentationTextProcessor + aligner: COINActionSegmentationAligner + num_iso_layer: 12 + sliding_window: 8 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 1 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 8 + checkpoint: + restore_file: runs/retri/videoclip/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/retri/videoclip/coin +task_type: sweep_big +model: + model_cls: MMFusionSeparateActionSegmentation + mm_encoder_cls: null + video_encoder_cls: MMBertForTokenClassification + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +loss: + loss_cls: CrossEntropy diff --git a/examples/MMPT/projects/retri/videoclip/crosstask_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/crosstask_videoclip.yaml new file mode 100644 index 0000000000..758601e359 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/crosstask_videoclip.yaml @@ -0,0 +1,55 @@ +dataset: + video_processor: CrossTaskVideoProcessor + bert_name: bert-base-uncased + meta_processor: CrossTaskMetaProcessor + train_path: data/crosstask/crosstask_release/videos.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + text_processor: CrossTaskTextProcessor + aligner: CrossTaskAligner + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 1 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 5 + checkpoint: + restore_file: runs/retri/videoclip/checkpoint_best.pt + reset_optimizer: true + 
reset_dataloader: true + reset_meters: true + save_dir: runs/retri/videoclip/crosstask +task_type: sweep_small +model: + model_cls: MMFusionSeparateActionLocalization + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +loss: + loss_cls: BCE diff --git a/examples/MMPT/projects/retri/videoclip/how2.yaml b/examples/MMPT/projects/retri/videoclip/how2.yaml new file mode 100644 index 0000000000..b49581e878 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/how2.yaml @@ -0,0 +1,65 @@ +dataset: + video_processor: ShardedVideoRetriVideoProcessor + bert_name: bert-base-uncased + meta_processor: ShardedHow2VideoRetriMetaProcessor + train_path: data/how2/how2_s3d_train.lst + val_path: data/how2/how2_s3d_val.lst + vfeat_dir: data/feat/feat_how2_s3d_shard_small + text_processor: ShardedVideoRetriTextProcessor + tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased. + aligner: VideoRetriOverlappedAligner + subsampling: 1 + sampled_min_len: 8 + sampled_max_len: 64 + max_video_len: 32 + max_len: 96 + lazy_vfeat_mask: true + mfm_probability: 0.15 + mlm_probability: 0.15 + mm_prob: 0.5 + sampled_video_min_len: 3 + sampled_video_max_len: 32 + num_video_per_batch: 32 + clip_per_video: 16 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 1 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 1000 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 25 + checkpoint: + save_dir: runs/retri/videoclip + save_interval_updates: 1024 + keep_interval_updates: 2 + keep_last_epochs: 30 +task_type: sweep_big +slurm_config: big +eval: + save_path: runs/retri/videoclip +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +loss: + loss_cls: MMContraLoss +task: VideoRetriTask +retri_epoch: 1 +vectorpool_cls: VideoVectorPool +retriever_cls: VectorRetriever +num_cands: 64 diff --git a/examples/MMPT/projects/retri/videoclip/test_coin_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/test_coin_videoclip.yaml new file mode 100644 index 0000000000..409906203c --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_coin_videoclip.yaml @@ -0,0 +1,33 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: COINActionSegmentationAligner + bert_name: bert-base-uncased + test_path: data/coin/COIN.json + meta_processor: COINActionSegmentationMetaProcessor + vfeat_dir: data/feat/feat_coin_s3d + text_processor: COINActionSegmentationTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/coin/checkpoint_best.pt +model: + model_cls: MMFusionSeparateActionSegmentation + mm_encoder_cls: null + video_encoder_cls: MMBertForTokenClassification + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/coin/eval +metric: COINActionSegmentationMetric +predictor: COINPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_coin_zs.yaml b/examples/MMPT/projects/retri/videoclip/test_coin_zs.yaml new file mode 100644 index 0000000000..b33739c7b6 --- 
/dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_coin_zs.yaml @@ -0,0 +1,33 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: COINActionSegmentationAligner + bert_name: bert-base-uncased + test_path: data/coin/COIN.json + meta_processor: COINActionSegmentationMetaProcessor + vfeat_dir: data/feat/feat_coin_s3d + text_processor: COINActionSegmentationTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/checkpoint_best.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/coin_zs/eval +metric: COINActionSegmentationMetric +predictor: COINZSPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_crosstask_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/test_crosstask_videoclip.yaml new file mode 100644 index 0000000000..e82f54fbe5 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_crosstask_videoclip.yaml @@ -0,0 +1,40 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: CrossTaskVideoProcessor + aligner: CrossTaskAligner + bert_name: bert-base-uncased + meta_processor: CrossTaskMetaProcessor + test_path: data/crosstask/crosstask_release/videos_val.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + text_processor: CrossTaskTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/crosstask/checkpoint_best.pt +model: + model_cls: MMFusionSeparateActionLocalization + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/crosstask/eval +metric: CrossTaskMetric +predictor: CrossTaskPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_crosstask_zs_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/test_crosstask_zs_videoclip.yaml new file mode 100644 index 0000000000..6fc357cc1f --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_crosstask_zs_videoclip.yaml @@ -0,0 +1,40 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: CrossTaskVideoProcessor + aligner: CrossTaskAligner + bert_name: bert-base-uncased + meta_processor: CrossTaskMetaProcessor + test_path: data/crosstask/crosstask_release/videos_val.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: 
data/crosstask/crosstask_release/annotations + n_train: 30 + text_processor: CrossTaskTextProcessor + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 1 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/checkpoint_best.pt +model: + model_cls: MMFusionSeparateActionLocalization + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/crosstask_zs/eval +metric: CrossTaskMetric +predictor: CrossTaskPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_didemo_zs.yaml b/examples/MMPT/projects/retri/videoclip/test_didemo_zs.yaml new file mode 100644 index 0000000000..8dc716815d --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_didemo_zs.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: DiDeMoAligner + bert_name: bert-base-uncased + meta_processor: DiDeMoMetaProcessor + test_path: data/didemo/test_data.json + vfeat_dir: data/feat/feat_didemo_s3d + text_processor: DiDeMoTextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/checkpoint_best.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/didemo_zs/eval +metric: DiDeMoMetric +predictor: DiDeMoPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_vtt_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/test_vtt_videoclip.yaml new file mode 100644 index 0000000000..19321ad5f4 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_vtt_videoclip.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: DSAligner + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/vtt/checkpoint_last.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/vtt/eval +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_vtt_zs.yaml b/examples/MMPT/projects/retri/videoclip/test_vtt_zs.yaml new file mode 100644 index 0000000000..d149fa3960 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_vtt_zs.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: DSAligner + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/checkpoint_best.pt +model: + model_cls: 
MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/vtt_zs/eval +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_vttqa_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/test_vttqa_videoclip.yaml new file mode 100644 index 0000000000..295aeedbb0 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_vttqa_videoclip.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: MSRVTTQAAligner + bert_name: bert-base-uncased + meta_processor: MSRVTTQAMetaProcessor + test_path: data/msrvtt-qa/MSR_MC_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTQATextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/vttqa/checkpoint_last.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/vttqa/eval +metric: QAMetric +predictor: QAPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_vttqa_zs.yaml b/examples/MMPT/projects/retri/videoclip/test_vttqa_zs.yaml new file mode 100644 index 0000000000..7a876c822a --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_vttqa_zs.yaml @@ -0,0 +1,31 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: MSRVTTQAAligner + bert_name: bert-base-uncased + meta_processor: MSRVTTQAMetaProcessor + test_path: data/msrvtt-qa/MSR_MC_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTQATextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/checkpoint_best.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/vttqa_zs/eval +metric: QAMetric +predictor: QAPredictor diff --git a/examples/MMPT/projects/retri/videoclip/test_youcook_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/test_youcook_videoclip.yaml new file mode 100644 index 0000000000..86a4ab203e --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_youcook_videoclip.yaml @@ -0,0 +1,33 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: YoucookVideoProcessor + aligner: DSAligner + bert_name: bert-base-uncased + meta_processor: YoucookMetaProcessor + test_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: true + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: TextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/youcook/checkpoint_last.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/youcook/eval +metric: RetrievalMetric +predictor: RetrievalPredictor diff 
--git a/examples/MMPT/projects/retri/videoclip/test_youcook_zs.yaml b/examples/MMPT/projects/retri/videoclip/test_youcook_zs.yaml new file mode 100644 index 0000000000..fd2941708b --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/test_youcook_zs.yaml @@ -0,0 +1,33 @@ +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: YoucookVideoProcessor + aligner: DSAligner + bert_name: bert-base-uncased + meta_processor: YoucookMetaProcessor + test_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: true + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: TextProcessor + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 + common_eval: + path: runs/retri/videoclip/checkpoint_best.pt +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/retri/videoclip/youcook_zs/eval +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/retri/videoclip/vtt_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/vtt_videoclip.yaml new file mode 100644 index 0000000000..d8b4079ac2 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/vtt_videoclip.yaml @@ -0,0 +1,51 @@ +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + train_path: data/msrvtt/MSRVTT_train.csv + jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + full_test_path: data/msrvtt/MSRVTT_FULL_test.csv + dup: 20 + val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + json_path: data/msrvtt/MSRVTT_data.json + aligner: DSAligner + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 224 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 10 + checkpoint: + restore_file: runs/retri/videoclip/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/retri/videoclip/vtt +task_type: sweep_small +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +loss: + loss_cls: T2VContraLoss diff --git a/examples/MMPT/projects/retri/videoclip/vttqa_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/vttqa_videoclip.yaml new file mode 100644 index 0000000000..f0566d784a --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/vttqa_videoclip.yaml @@ -0,0 +1,49 @@ +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased + meta_processor: MSRVTTMetaProcessor + train_path: data/msrvtt/MSRVTT_train.csv + dup: 20 + val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + json_path: data/msrvtt/MSRVTT_data.json + aligner: DSAligner + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 128 + optimization: + lr: + - 5.0e-05 + 
clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 5 + checkpoint: + restore_file: runs/retri/videoclip/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/retri/videoclip/vttqa +task_type: sweep_small +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +loss: + loss_cls: V2TContraLoss diff --git a/examples/MMPT/projects/retri/videoclip/youcook_videoclip.yaml b/examples/MMPT/projects/retri/videoclip/youcook_videoclip.yaml new file mode 100644 index 0000000000..c2b13e5519 --- /dev/null +++ b/examples/MMPT/projects/retri/videoclip/youcook_videoclip.yaml @@ -0,0 +1,49 @@ +dataset: + video_processor: YoucookVideoProcessor + bert_name: bert-base-uncased + meta_processor: YoucookMetaProcessor + train_path: data/youcook/youcook_train.pkl + val_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: true + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: TextProcessor + aligner: DSAligner + num_iso_layer: 12 + max_video_len: 32 + max_len: 96 +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + fp16: true + dataset: + num_workers: 4 + batch_size: 128 + optimization: + lr: + - 5.0e-05 + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 + warmup_updates: 122 + weight_decay: 0.0 + ddp_backend: no_c10d + max_epoch: 10 + checkpoint: + restore_file: runs/retri/videoclip/checkpoint_best.pt + reset_optimizer: true + reset_dataloader: true + reset_meters: true + save_dir: runs/retri/videoclip/youcook +task_type: sweep_small +model: + model_cls: MMFusionSeparate + mm_encoder_cls: null + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +loss: + loss_cls: T2VContraLoss diff --git a/examples/MMPT/projects/retri/videoretri.yaml b/examples/MMPT/projects/retri/videoretri.yaml new file mode 100644 index 0000000000..969e1fb279 --- /dev/null +++ b/examples/MMPT/projects/retri/videoretri.yaml @@ -0,0 +1,51 @@ +includes: projects/mfmmlm.yaml +project_dir: retri/videoretri +run_task: + - how2.yaml +task_group: + pretrain: + task: VideoRetriTask + retri_epoch: 1 + vectorpool_cls: VideoVectorPool + retriever_cls: VectorRetriever + num_cands: 64 + dataset: + train_path: data/how2/how2_s3d_train.lst + meta_processor: ShardedHow2VideoRetriMetaProcessor + video_processor: ShardedVideoRetriVideoProcessor + text_processor: ShardedVideoRetriTextProcessor + aligner: VideoRetriOverlappedAligner + sampled_video_min_len: 3 + sampled_video_max_len: 32 + sampled_min_len: 8 + sampled_max_len: 64 + num_video_per_batch: 32 + # do not use subsampling as it changes fairseq batch_size. 
+ subsampling: 1 # disable subsampling + clip_per_video: 16 + fairseq: + dataset: + batch_size: 1 + optimization: + max_epoch: 25 + model: + model_cls: MMFusionShare + mm_encoder_cls: MMBertForEncoder + loss: + loss_cls: MMContraLoss + finetune: + task_list: [vtt_videoclip.yaml, youcook_videoclip.yaml, vttqa_videoclip.yaml, crosstask_videoclip.yaml, coin_videoclip.yaml] + test: + task_list: + - test_youcook_zs.yaml + - test_vtt_zs.yaml + - test_vttqa_zs.yaml + - test_crosstask_zs_videoclip.yaml + - test_coin_zs.yaml + - test_didemo_zs.yaml + - test_youcook_videoclip.yaml + - test_vtt_videoclip.yaml + - test_vttqa_videoclip.yaml + - test_crosstask_videoclip.yaml + - test_coin_videoclip.yaml + diff --git a/examples/MMPT/projects/task/coin.yaml b/examples/MMPT/projects/task/coin.yaml new file mode 100644 index 0000000000..e7772486e1 --- /dev/null +++ b/examples/MMPT/projects/task/coin.yaml @@ -0,0 +1,25 @@ +includes: projects/task/ft.yaml +task_type: sweep_big +dataset: + meta_processor: COINActionSegmentationMetaProcessor + train_path: data/coin/COIN.json + val_path: data/coin/COIN.json + vfeat_dir: data/feat/feat_coin_s3d + video_processor: VideoProcessor + text_processor: COINActionSegmentationTextProcessor + aligner: COINActionSegmentationAligner + num_iso_layer: 12 + sliding_window: 8 + sliding_window_size: 32 +model: + model_cls: MMFusionActionSegmentation + mm_encoder_cls: MMBertForTokenClassification +loss: + loss_cls: CrossEntropy +fairseq: + dataset: + batch_size: 1 + optimization: + max_epoch: 8 + checkpoint: + save_dir: runs/task/coin diff --git a/examples/MMPT/projects/task/coin_videoclip.yaml b/examples/MMPT/projects/task/coin_videoclip.yaml new file mode 100644 index 0000000000..69988bc18a --- /dev/null +++ b/examples/MMPT/projects/task/coin_videoclip.yaml @@ -0,0 +1,7 @@ +includes: projects/task/coin.yaml +model: + model_cls: MMFusionSeparateActionSegmentation + mm_encoder_cls: + video_encoder_cls: MMBertForTokenClassification + text_encoder_cls: BertModel # dummy, not used. 
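The `videoretri.yaml` block above configures retrieval-augmented pretraining: a `VectorRetriever` over a `VideoVectorPool` picks `num_cands: 64` similar videos to build each batch. As a rough sketch of the idea only (the helper names `build_pool`/`retrieve` are illustrative, not MMPT's actual classes), nearest-neighbour retrieval over mean-pooled, L2-normalized clip features might look like this:

```python
# Hypothetical sketch of dense video retrieval over pooled clip features
# (illustration only; MMPT's VectorRetriever/VideoVectorPool differ).
import numpy as np

def build_pool(video_feats):
    """L2-normalize one pooled vector per video -> (num_videos, dim)."""
    pool = np.stack([f.mean(axis=0) for f in video_feats])  # mean-pool clips
    return pool / np.linalg.norm(pool, axis=1, keepdims=True)

def retrieve(pool, query_idx, num_cands=64):
    """Return indices of the most similar videos (excluding the query)."""
    sims = pool @ pool[query_idx]            # cosine similarity on unit vectors
    order = np.argsort(-sims)
    return [i for i in order if i != query_idx][:num_cands]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    feats = [rng.normal(size=(16, 512)).astype("float32") for _ in range(256)]
    pool = build_pool(feats)
    print(retrieve(pool, query_idx=0, num_cands=8))
```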
+ num_hidden_video_layers: 6 diff --git a/examples/MMPT/projects/task/crosstask.yaml b/examples/MMPT/projects/task/crosstask.yaml new file mode 100644 index 0000000000..cb4dbb0cb4 --- /dev/null +++ b/examples/MMPT/projects/task/crosstask.yaml @@ -0,0 +1,31 @@ +includes: projects/task/ft.yaml +dataset: + meta_processor: CrossTaskMetaProcessor + train_path: data/crosstask/crosstask_release/videos.csv # dummy + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv # dummy + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + video_processor: CrossTaskVideoProcessor + text_processor: CrossTaskTextProcessor + aligner: CrossTaskAligner + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 +model: + model_cls: MMFusionActionLocalization + mm_encoder_cls: MMBertForJoint +loss: + loss_cls: BCE +fairseq: + dataset: + batch_size: 1 + optimization: + max_epoch: 5 + checkpoint: + save_dir: runs/task/crosstask + restore_file: runs/task/checkpoint11.pt # for VLM diff --git a/examples/MMPT/projects/task/crosstask_videoclip.yaml b/examples/MMPT/projects/task/crosstask_videoclip.yaml new file mode 100644 index 0000000000..6ec613c07f --- /dev/null +++ b/examples/MMPT/projects/task/crosstask_videoclip.yaml @@ -0,0 +1,10 @@ +includes: projects/task/crosstask.yaml +model: + model_cls: MMFusionSeparateActionLocalization + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel # dummy, not used. + num_hidden_video_layers: 6 +fairseq: + checkpoint: + restore_file: runs/task/checkpoint_best.pt # overwrite the default of VLM. diff --git a/examples/MMPT/projects/task/default.yaml b/examples/MMPT/projects/task/default.yaml new file mode 100644 index 0000000000..087fef71a4 --- /dev/null +++ b/examples/MMPT/projects/task/default.yaml @@ -0,0 +1,20 @@ +# this yaml cannot be run alone. you must use `how2.yaml`, `vtt.yaml` etc for training. +dataset: + video_processor: VideoProcessor + bert_name: bert-base-uncased +fairseq: + common: + tensorboard_logdir: run + log_interval: 1000 + dataset: + num_workers: 4 + optimization: + lr: [ 0.00005 ] + clip_norm: 2.0 + optimizer: adam + adam_betas: (0.9, 0.98) + lr_scheduler: polynomial_decay + total_num_update: 1000000 # backward compatible on fairseq 1.0.0a0+af0389f for reproducibility. + warmup_updates: 1000 + weight_decay: 0.0 + ddp_backend: no_c10d diff --git a/examples/MMPT/projects/task/ft.yaml b/examples/MMPT/projects/task/ft.yaml new file mode 100644 index 0000000000..c93b8a73ea --- /dev/null +++ b/examples/MMPT/projects/task/ft.yaml @@ -0,0 +1,13 @@ +includes: projects/task/default.yaml +# all derived config will be run by fairseq-train. +task_type: sweep_small +fairseq: + optimization: + warmup_updates: 122 # copied from roberta glue: https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md + checkpoint: + # save_interval_updates: 512 + # borrowed from Roberta script. 
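`default.yaml` and `ft.yaml` above are composed through `includes:` chains, with derived configs overriding keys of their parents. The resolver below is a minimal sketch of that pattern under assumed semantics (merging is recursive, child keys win); it is not MMPT's actual `recursive_config` loader and it assumes PyYAML is installed:

```python
# Hypothetical resolver for `includes:` chains (illustration only).
import yaml  # assumption: PyYAML is available

def deep_merge(base, override):
    """Recursively merge dicts; keys in `override` win."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def load_config(path):
    with open(path) as fh:
        config = yaml.safe_load(fh) or {}
    parent_path = config.pop("includes", None)
    if parent_path is not None:
        config = deep_merge(load_config(parent_path), config)
    return config

# e.g. load_config("projects/task/vtt.yaml") would fold in ft.yaml and default.yaml.
```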
+ restore_file: runs/task/checkpoint_best.pt + reset_optimizer: True + reset_dataloader: True + reset_meters: True diff --git a/examples/MMPT/projects/task/how2.yaml b/examples/MMPT/projects/task/how2.yaml new file mode 100644 index 0000000000..094dd04bfc --- /dev/null +++ b/examples/MMPT/projects/task/how2.yaml @@ -0,0 +1,22 @@ +includes: projects/task/default.yaml +task_type: sweep_big +slurm_config: big +dataset: + meta_processor: ShardedHow2MetaProcessor + train_path: data/how2/how2_s3d_train.lst + val_path: data/how2/how2_s3d_val.lst + video_processor: ShardedVideoProcessor + vfeat_dir: data/feat/feat_how2_s3d_shard_small + text_processor: ShardedTextProcessor + tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased. + aligner: FixedLenAligner +# disable direct running of this yaml +eval: + save_path: runs/task +fairseq: + checkpoint: + save_dir: runs/task + save_interval_updates: 1024 + keep_interval_updates: 2 + keep_last_epochs: 30 + diff --git a/examples/MMPT/projects/task/test.yaml b/examples/MMPT/projects/task/test.yaml new file mode 100644 index 0000000000..0a98445241 --- /dev/null +++ b/examples/MMPT/projects/task/test.yaml @@ -0,0 +1,13 @@ +# this yaml cannot be run alone: implement a test_${dataset}.yaml +slurm_config: big +task_type: local_predict +dataset: + split: test + video_processor: VideoProcessor + aligner: DSAligner + bert_name: bert-base-uncased +fairseq: + dataset: + batch_size: 256 + valid_subset: test + num_workers: 2 diff --git a/examples/MMPT/projects/task/test_coin.yaml b/examples/MMPT/projects/task/test_coin.yaml new file mode 100644 index 0000000000..6d919df7c2 --- /dev/null +++ b/examples/MMPT/projects/task/test_coin.yaml @@ -0,0 +1,24 @@ +includes: projects/task/test.yaml +dataset: + split: test + test_path: data/coin/COIN.json + meta_processor: COINActionSegmentationMetaProcessor + vfeat_dir: data/feat/feat_coin_s3d + video_processor: VideoProcessor + text_processor: COINActionSegmentationTextProcessor + aligner: COINActionSegmentationAligner + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 +model: + model_cls: MMFusionActionSegmentation + mm_encoder_cls: MMBertForTokenClassification +eval: + save_path: runs/task/coin/eval +fairseq: + dataset: + batch_size: 1 + common_eval: + path: runs/task/coin/checkpoint_best.pt +metric: COINActionSegmentationMetric +predictor: COINPredictor diff --git a/examples/MMPT/projects/task/test_coin_videoclip.yaml b/examples/MMPT/projects/task/test_coin_videoclip.yaml new file mode 100644 index 0000000000..b41f5bc489 --- /dev/null +++ b/examples/MMPT/projects/task/test_coin_videoclip.yaml @@ -0,0 +1,7 @@ +includes: projects/task/test_coin.yaml +model: + model_cls: MMFusionSeparateActionSegmentation + mm_encoder_cls: + video_encoder_cls: MMBertForTokenClassification + text_encoder_cls: BertModel # dummy, not used. 
+ num_hidden_video_layers: 6 diff --git a/examples/MMPT/projects/task/test_coin_zs.yaml b/examples/MMPT/projects/task/test_coin_zs.yaml new file mode 100644 index 0000000000..5d19b09f1d --- /dev/null +++ b/examples/MMPT/projects/task/test_coin_zs.yaml @@ -0,0 +1,13 @@ +includes: projects/task/test_coin.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/task/coin_zs/eval +fairseq: + common_eval: + path: runs/task/checkpoint_best.pt +predictor: COINZSPredictor diff --git a/examples/MMPT/projects/task/test_crosstask.yaml b/examples/MMPT/projects/task/test_crosstask.yaml new file mode 100644 index 0000000000..6dd778e30b --- /dev/null +++ b/examples/MMPT/projects/task/test_crosstask.yaml @@ -0,0 +1,32 @@ +includes: projects/task/test.yaml +dataset: + split: test + meta_processor: CrossTaskMetaProcessor + test_path: data/crosstask/crosstask_release/videos_val.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv # dummy + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + video_processor: CrossTaskVideoProcessor + text_processor: CrossTaskTextProcessor + aligner: CrossTaskAligner + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 +model: + model_cls: MMFusionActionLocalization + mm_encoder_cls: MMBertForJoint +eval: + save_path: runs/task/crosstask/eval +fairseq: + # read code and find what is the checkpoint arg. + dataset: + batch_size: 1 + common_eval: + path: runs/task/crosstask/checkpoint_best.pt +metric: CrossTaskMetric +predictor: CrossTaskPredictor diff --git a/examples/MMPT/projects/task/test_crosstask_videoclip.yaml b/examples/MMPT/projects/task/test_crosstask_videoclip.yaml new file mode 100644 index 0000000000..df12535d23 --- /dev/null +++ b/examples/MMPT/projects/task/test_crosstask_videoclip.yaml @@ -0,0 +1,7 @@ +includes: projects/task/test_crosstask.yaml +model: + model_cls: MMFusionSeparateActionLocalization + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel # dummy, not used. 
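The COIN and CrossTask configs above evaluate with `sliding_window: 16` and `sliding_window_size: 32`, i.e. 32-frame windows advanced 16 frames at a time, so most frames are scored by two windows. A hedged sketch (not the MMPT predictor) of merging such overlapping per-frame predictions by averaging:

```python
# Hypothetical aggregation of overlapping sliding-window predictions.
import numpy as np

def aggregate_windows(window_scores, num_frames, window_size=32, stride=16):
    """window_scores: list of (window_size, num_classes) arrays, where
    window w is assumed to start at frame w * stride."""
    num_classes = window_scores[0].shape[1]
    summed = np.zeros((num_frames, num_classes))
    counts = np.zeros((num_frames, 1))
    for w, scores in enumerate(window_scores):
        start = w * stride
        end = min(start + window_size, num_frames)
        summed[start:end] += scores[: end - start]
        counts[start:end] += 1
    return summed / np.maximum(counts, 1)    # mean over overlapping windows
```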
+ num_hidden_video_layers: 6 diff --git a/examples/MMPT/projects/task/test_crosstask_zs.yaml b/examples/MMPT/projects/task/test_crosstask_zs.yaml new file mode 100644 index 0000000000..19386e495b --- /dev/null +++ b/examples/MMPT/projects/task/test_crosstask_zs.yaml @@ -0,0 +1,32 @@ +includes: projects/task/test.yaml +dataset: + split: test + meta_processor: CrossTaskMetaProcessor + test_path: data/crosstask/crosstask_release/videos_val.csv + train_csv_path: data/crosstask/crosstask_release/videos.csv + val_path: data/crosstask/crosstask_release/videos_val.csv # dummy + val_csv_path: data/crosstask/crosstask_release/videos_val.csv + primary_path: data/crosstask/crosstask_release/tasks_primary.txt + related_path: data/crosstask/crosstask_release/tasks_related.txt + vfeat_dir: data/feat/feat_crosstask_s3d + annotation_path: data/crosstask/crosstask_release/annotations + n_train: 30 + video_processor: CrossTaskVideoProcessor + text_processor: CrossTaskTextProcessor + aligner: CrossTaskAligner + num_iso_layer: 12 + sliding_window: 16 + sliding_window_size: 32 +model: + model_cls: MMFusionActionLocalization + mm_encoder_cls: MMBertForJoint +eval: + save_path: runs/task/crosstask_zs/eval +fairseq: + # read code and find what is the checkpoint arg. + dataset: + batch_size: 1 + common_eval: + path: runs/task/checkpoint_best.pt # load the best from how2 on ACL submission: runs/task/checkpoint11.pt +metric: CrossTaskMetric +predictor: CrossTaskPredictor diff --git a/examples/MMPT/projects/task/test_crosstask_zs_videoclip.yaml b/examples/MMPT/projects/task/test_crosstask_zs_videoclip.yaml new file mode 100644 index 0000000000..7f0198276f --- /dev/null +++ b/examples/MMPT/projects/task/test_crosstask_zs_videoclip.yaml @@ -0,0 +1,7 @@ +includes: projects/task/test_crosstask_zs.yaml +model: + model_cls: MMFusionSeparateActionLocalization + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel # dummy, not used. + num_hidden_video_layers: 6 diff --git a/examples/MMPT/projects/task/test_didemo_zs.yaml b/examples/MMPT/projects/task/test_didemo_zs.yaml new file mode 100644 index 0000000000..4b53dca71e --- /dev/null +++ b/examples/MMPT/projects/task/test_didemo_zs.yaml @@ -0,0 +1,23 @@ +includes: projects/task/test.yaml +dataset: + meta_processor: DiDeMoMetaProcessor + test_path: data/didemo/test_data.json + video_processor: VideoProcessor + vfeat_dir: data/feat/feat_didemo_s3d + text_processor: DiDeMoTextProcessor + aligner: DiDeMoAligner + num_iso_layer: 12 +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/task/didemo_zs/eval +fairseq: + # read code and find what is the checkpoint arg. + common_eval: + path: runs/task/checkpoint_best.pt +metric: DiDeMoMetric +predictor: DiDeMoPredictor diff --git a/examples/MMPT/projects/task/test_vtt.yaml b/examples/MMPT/projects/task/test_vtt.yaml new file mode 100644 index 0000000000..2f809b306d --- /dev/null +++ b/examples/MMPT/projects/task/test_vtt.yaml @@ -0,0 +1,19 @@ +includes: projects/task/test.yaml +dataset: + meta_processor: MSRVTTMetaProcessor + test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + video_processor: VideoProcessor + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + num_iso_layer: 12 +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint +eval: + save_path: runs/task/vtt/eval +fairseq: + # read code and find what is the checkpoint arg. 
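The `# read code and find what is the checkpoint arg.` notes in these test configs refer to `fairseq.common_eval.path`, the checkpoint file the predictor loads. If in doubt, the file can be inspected directly; the snippet below is generic PyTorch, and the exact keys present are an assumption about typical fairseq checkpoints:

```python
# Peek inside a fairseq checkpoint to see what will be restored.
import torch

state = torch.load("runs/task/checkpoint_best.pt", map_location="cpu")
print(sorted(state.keys()))      # usually includes 'model' plus 'args' or 'cfg'
if "model" in state:
    for name in list(state["model"])[:10]:
        print(name, tuple(state["model"][name].shape))
```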
+ common_eval: + path: runs/task/vtt/checkpoint_last.pt +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/task/test_vtt_videoclip.yaml b/examples/MMPT/projects/task/test_vtt_videoclip.yaml new file mode 100644 index 0000000000..cb6564394c --- /dev/null +++ b/examples/MMPT/projects/task/test_vtt_videoclip.yaml @@ -0,0 +1,8 @@ +includes: projects/task/test_vtt.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 + diff --git a/examples/MMPT/projects/task/test_vtt_zs.yaml b/examples/MMPT/projects/task/test_vtt_zs.yaml new file mode 100644 index 0000000000..57340924b4 --- /dev/null +++ b/examples/MMPT/projects/task/test_vtt_zs.yaml @@ -0,0 +1,13 @@ +includes: projects/task/test_vtt.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/task/vtt_zs/eval +fairseq: + # read code and find what is the checkpoint arg. + common_eval: + path: runs/task/checkpoint_best.pt diff --git a/examples/MMPT/projects/task/test_vttqa.yaml b/examples/MMPT/projects/task/test_vttqa.yaml new file mode 100644 index 0000000000..ddf813c535 --- /dev/null +++ b/examples/MMPT/projects/task/test_vttqa.yaml @@ -0,0 +1,20 @@ +includes: projects/task/test.yaml +dataset: + meta_processor: MSRVTTQAMetaProcessor + test_path: data/msrvtt-qa/MSR_MC_test.csv + video_processor: VideoProcessor + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTQATextProcessor + aligner: MSRVTTQAAligner + num_iso_layer: 12 +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint +eval: + save_path: runs/task/vttqa/eval +fairseq: + # read code and find what is the checkpoint arg. + common_eval: + path: runs/task/vttqa/checkpoint_last.pt +metric: QAMetric +predictor: QAPredictor diff --git a/examples/MMPT/projects/task/test_vttqa_videoclip.yaml b/examples/MMPT/projects/task/test_vttqa_videoclip.yaml new file mode 100644 index 0000000000..32a41e861c --- /dev/null +++ b/examples/MMPT/projects/task/test_vttqa_videoclip.yaml @@ -0,0 +1,8 @@ +includes: projects/task/test_vttqa.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 + diff --git a/examples/MMPT/projects/task/test_vttqa_zs.yaml b/examples/MMPT/projects/task/test_vttqa_zs.yaml new file mode 100644 index 0000000000..5e0e29d207 --- /dev/null +++ b/examples/MMPT/projects/task/test_vttqa_zs.yaml @@ -0,0 +1,13 @@ +includes: projects/task/test_vttqa.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/task/vttqa_zs/eval +fairseq: + # read code and find what is the checkpoint arg. 
+ common_eval: + path: runs/task/checkpoint_best.pt diff --git a/examples/MMPT/projects/task/test_youcook.yaml b/examples/MMPT/projects/task/test_youcook.yaml new file mode 100644 index 0000000000..092b680fa6 --- /dev/null +++ b/examples/MMPT/projects/task/test_youcook.yaml @@ -0,0 +1,22 @@ +includes: projects/task/test.yaml +dataset: + meta_processor: YoucookMetaProcessor + test_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: True + video_processor: YoucookVideoProcessor + vfeat_dir: data/feat/feat_youcook_s3d # /checkpoint/huxu/feat/youcook_vmz # /checkpoint/prarora/berniehuang/feat_youcook_vmz + text_processor: TextProcessor + aligner: DSAligner + num_iso_layer: 12 +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint +eval: + save_path: runs/task/youcook/eval +fairseq: + # read code and find what is the checkpoint arg. + common_eval: + path: runs/task/youcook/checkpoint_last.pt +metric: RetrievalMetric +predictor: RetrievalPredictor diff --git a/examples/MMPT/projects/task/test_youcook_videoclip.yaml b/examples/MMPT/projects/task/test_youcook_videoclip.yaml new file mode 100644 index 0000000000..b85ea43474 --- /dev/null +++ b/examples/MMPT/projects/task/test_youcook_videoclip.yaml @@ -0,0 +1,8 @@ +includes: projects/task/test_youcook.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 + diff --git a/examples/MMPT/projects/task/test_youcook_zs.yaml b/examples/MMPT/projects/task/test_youcook_zs.yaml new file mode 100644 index 0000000000..0a5875bea4 --- /dev/null +++ b/examples/MMPT/projects/task/test_youcook_zs.yaml @@ -0,0 +1,13 @@ +includes: projects/task/test_youcook.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +eval: + save_path: runs/task/youcook_zs/eval +fairseq: + # read code and find what is the checkpoint arg. + common_eval: + path: runs/task/checkpoint_best.pt diff --git a/examples/MMPT/projects/task/test_youcookcap.yaml b/examples/MMPT/projects/task/test_youcookcap.yaml new file mode 100644 index 0000000000..24f6518b7b --- /dev/null +++ b/examples/MMPT/projects/task/test_youcookcap.yaml @@ -0,0 +1,23 @@ +includes: projects/task/test.yaml +dataset: + meta_processor: YoucookNLGMetaProcessor + test_path: data/youcook/val_list.txt + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + video_processor: YoucookVideoProcessor + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: NLGTextProcessor + aligner: DSNLGAligner +model: + model_cls: MMFusionNLG + mm_encoder_cls: MMBertForNLG + max_decode_length: 24 +eval: + save_path: runs/task/youcookcap/eval +fairseq: + # read code and find what is the checkpoint arg. 
+ common_eval: + path: runs/task/youcookcap/checkpoint_best.pt +metric: NLGMetric +predictor: NLGPredictor +gen_param: + num_beams: 5 diff --git a/examples/MMPT/projects/task/vtt.yaml b/examples/MMPT/projects/task/vtt.yaml new file mode 100644 index 0000000000..395e2ee6fe --- /dev/null +++ b/examples/MMPT/projects/task/vtt.yaml @@ -0,0 +1,25 @@ +includes: projects/task/ft.yaml +dataset: + meta_processor: MSRVTTMetaProcessor + train_path: data/msrvtt/MSRVTT_train.csv + jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + full_test_path: data/msrvtt/MSRVTT_FULL_test.csv + dup: 20 + val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + json_path: data/msrvtt/MSRVTT_data.json + aligner: DSAligner + num_iso_layer: 12 +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint +loss: + loss_cls: T2VContraLoss +fairseq: + dataset: + batch_size: 256 + optimization: + max_epoch: 10 + checkpoint: + save_dir: runs/task/vtt diff --git a/examples/MMPT/projects/task/vtt_videoclip.yaml b/examples/MMPT/projects/task/vtt_videoclip.yaml new file mode 100644 index 0000000000..a9892cab01 --- /dev/null +++ b/examples/MMPT/projects/task/vtt_videoclip.yaml @@ -0,0 +1,12 @@ +includes: projects/task/vtt.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 +fairseq: + dataset: + batch_size: 224 +# model_cls: MMFusionShare +# mm_encoder_cls: MMBertForEncoder diff --git a/examples/MMPT/projects/task/vttqa.yaml b/examples/MMPT/projects/task/vttqa.yaml new file mode 100644 index 0000000000..56d578eff0 --- /dev/null +++ b/examples/MMPT/projects/task/vttqa.yaml @@ -0,0 +1,23 @@ +includes: projects/task/ft.yaml +dataset: + meta_processor: MSRVTTMetaProcessor + train_path: data/msrvtt/MSRVTT_train.csv + dup: 20 + val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv + vfeat_dir: data/feat/feat_vtt_s3d + text_processor: MSRVTTTextProcessor + json_path: data/msrvtt/MSRVTT_data.json + aligner: DSAligner + num_iso_layer: 12 +model: + model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint +loss: + loss_cls: V2TContraLoss +fairseq: + dataset: + batch_size: 128 + optimization: + max_epoch: 5 + checkpoint: + save_dir: runs/task/vttqa diff --git a/examples/MMPT/projects/task/vttqa_videoclip.yaml b/examples/MMPT/projects/task/vttqa_videoclip.yaml new file mode 100644 index 0000000000..2d484ca8a5 --- /dev/null +++ b/examples/MMPT/projects/task/vttqa_videoclip.yaml @@ -0,0 +1,10 @@ +includes: projects/task/vttqa.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 + +# model_cls: MMFusionShare +# mm_encoder_cls: MMBertForEncoder diff --git a/examples/MMPT/projects/task/youcook.yaml b/examples/MMPT/projects/task/youcook.yaml new file mode 100644 index 0000000000..e0cd841747 --- /dev/null +++ b/examples/MMPT/projects/task/youcook.yaml @@ -0,0 +1,25 @@ +includes: projects/task/ft.yaml +dataset: + meta_processor: YoucookMetaProcessor + train_path: data/youcook/youcook_train.pkl + val_path: data/youcook/youcook_val.pkl + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + use_annotation_text: True + video_processor: YoucookVideoProcessor + vfeat_dir: data/feat/feat_youcook_s3d # /checkpoint/huxu/feat/youcook_vmz # /checkpoint/prarora/berniehuang/feat_youcook_vmz + text_processor: TextProcessor + aligner: DSAligner + num_iso_layer: 12 +model: 
+ model_cls: MMFusionJoint + mm_encoder_cls: MMBertForJoint +loss: + loss_cls: T2VContraLoss +fairseq: + dataset: + batch_size: 128 + optimization: + max_epoch: 10 + checkpoint: + save_dir: runs/task/youcook + diff --git a/examples/MMPT/projects/task/youcook_videoclip.yaml b/examples/MMPT/projects/task/youcook_videoclip.yaml new file mode 100644 index 0000000000..e3e901c30c --- /dev/null +++ b/examples/MMPT/projects/task/youcook_videoclip.yaml @@ -0,0 +1,9 @@ +includes: projects/task/youcook.yaml +model: + model_cls: MMFusionSeparate + mm_encoder_cls: + video_encoder_cls: MMBertForEncoder + text_encoder_cls: BertModel + num_hidden_video_layers: 6 + # model_cls: MMFusionShare + # mm_encoder_cls: MMBertForEncoder diff --git a/examples/MMPT/projects/task/youcookcap.yaml b/examples/MMPT/projects/task/youcookcap.yaml new file mode 100644 index 0000000000..047735f217 --- /dev/null +++ b/examples/MMPT/projects/task/youcookcap.yaml @@ -0,0 +1,23 @@ +# finetuning for youcook captioning. +includes: projects/task/ft.yaml +dataset: + meta_processor: YoucookNLGMetaProcessor + train_path: data/youcook/train_list.txt + val_path: data/youcook/val_list.txt + trainval_annotation: data/youcook/youcookii_annotations_trainval.json + video_processor: YoucookVideoProcessor + vfeat_dir: data/feat/feat_youcook_s3d + text_processor: NLGTextProcessor + aligner: DSNLGAligner +model: + model_cls: MMFusionNLG + mm_encoder_cls: MMBertForNLG +loss: + loss_cls: NLGLoss +fairseq: + dataset: + batch_size: 128 + optimization: + max_epoch: 10 + checkpoint: + save_dir: runs/task/youcookcap diff --git a/examples/MMPT/scripts/text_token_extractor/configs/bert-base-uncased.yaml b/examples/MMPT/scripts/text_token_extractor/configs/bert-base-uncased.yaml new file mode 100644 index 0000000000..473dd9b45b --- /dev/null +++ b/examples/MMPT/scripts/text_token_extractor/configs/bert-base-uncased.yaml @@ -0,0 +1,5 @@ +dataset: + bert_name: bert-base-uncased + caption_pkl_path: data/how2/raw_caption_dedup.pkl + use_fast: true + target_dir: data/feat/feat_how2_s3d_shard_small diff --git a/examples/MMPT/scripts/text_token_extractor/pretokenization.py b/examples/MMPT/scripts/text_token_extractor/pretokenization.py new file mode 100644 index 0000000000..29ae5dc151 --- /dev/null +++ b/examples/MMPT/scripts/text_token_extractor/pretokenization.py @@ -0,0 +1,106 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
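The pretokenization script that follows tokenizes `caption_pkl_path` and then shards the result; its `numpify` step reads, per video id, aligned `start`/`end`/`cap` lists, with `cap` holding BERT token ids. A tiny hypothetical example of that tokenized pickle layout (the field names come from the script, all values are made up):

```python
# Hypothetical example of the tokenized caption pickle consumed below.
import pickle

captions = {
    "video_0001": {
        "start": [0.0, 4.2],
        "end": [4.2, 9.7],
        "cap": [[101, 2023, 102], [101, 2003, 1037, 102]],  # BERT token ids
    },
}
with open("raw_caption_dedup.bert-base-uncased.pkl", "wb") as fw:
    pickle.dump(captions, fw, pickle.HIGHEST_PROTOCOL)
```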
+ +import pickle +import os +import argparse +import numpy as np + +from torch.utils.data import Dataset, DataLoader +from mmpt.processors import PKLJSONStrTextProcessor +from mmpt.utils import ShardedTensor, recursive_config + + +class TokenizerDataset(Dataset): + def __init__(self, config): + self.text_processor = PKLJSONStrTextProcessor(config) + self.video_ids = list(self.text_processor.data.keys()) + + def __getitem__(self, idx): + video_id = self.video_ids[idx] + return video_id, self.text_processor(video_id) + + def __len__(self): + return len(self.video_ids) + + +def numpify(shard_idx, video_ids, captions, target_dir, split, prefix, max_cap_len=32): + startends = [] + caps_ids = [] + for video_id in video_ids: + caption = captions[video_id] + startend = [] + cap_ids = [] + for start, end, cap in zip( + caption["start"], caption["end"], caption["cap"]): + startend.append(np.array([start, end]).astype("float32")) + cap_id = np.full((max_cap_len,), -1, dtype=np.int32) + cap = cap[:max_cap_len] + cap_id[:len(cap)] = cap + cap_ids.append(cap_id) + startends.append(np.stack(startend)) + caps_ids.append(np.stack(cap_ids)) + + startends = ShardedTensor.from_list(startends) + target_path = os.path.join( + target_dir, + prefix + split + "_" + str(shard_idx) + ) + print("save to", target_path) + startends.save(target_path + ".startends") + caps_ids = ShardedTensor.from_list(caps_ids) + caps_ids.save(target_path + ".caps_ids") + + +def sharding(config, out_file): + with open(out_file, "rb") as fr: + captions = pickle.load(fr) + target_dir = config.target_dir + prefix = os.path.basename( + os.path.splitext(config.caption_pkl_path)[0] + ) + "." + config.bert_name + "." + for split in ["train", "val"]: + target_path = os.path.join(target_dir, split + "_meta") + with open(target_path + ".pkl", "rb") as fr: + meta = pickle.load(fr) + print("load meta", target_path, len(meta)) + for shard_id in meta: + numpify( + shard_id, meta[shard_id], captions, + target_dir, split, prefix + ) + + +def tokenize(config, out_file): + def collator(samples): + return samples + dataset = TokenizerDataset(config) + data = {} + for idx, batch in enumerate( + DataLoader(dataset, collate_fn=collator, num_workers=16)): + for video_id, caption in batch: + data[video_id] = caption + if idx % 5000 == 0: + print(idx) + with open(out_file, "wb") as fw: + pickle.dump(data, fw, pickle.HIGHEST_PROTOCOL) + + +def main(args): + config = recursive_config(args.config).dataset + + out_file = os.path.splitext(config.caption_pkl_path)[0] \ + + "." + config.bert_name + ".pkl" + if not os.path.isfile(out_file): + tokenize(config, out_file) + sharding(config, out_file) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="pretokenize (raw_)caption.json into pkl.") + parser.add_argument('config', type=str) + args = parser.parse_args() + main(args) diff --git a/examples/MMPT/scripts/video_feature_extractor/extract.py b/examples/MMPT/scripts/video_feature_extractor/extract.py new file mode 100755 index 0000000000..b5ee7b7788 --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/extract.py @@ -0,0 +1,157 @@ +# Copyright Howto100M authors. +# Copyright (c) Facebook, Inc. 
All Rights Reserved + +import torch as th +import torch.nn.functional as F +import math +import numpy as np +import argparse + +from torch.utils.data import DataLoader +from model import get_model +from preprocessing import Preprocessing +from random_sequence_shuffler import RandomSequenceSampler + +from tqdm import tqdm +from pathbuilder import PathBuilder +from videoreader import VideoLoader + + +parser = argparse.ArgumentParser(description='Easy video feature extractor') + +parser.add_argument('--vdir', type=str) +parser.add_argument('--fdir', type=str) +parser.add_argument('--hflip', type=int, default=0) + +parser.add_argument('--batch_size', type=int, default=64, + help='batch size') +parser.add_argument('--type', type=str, default='2d', + help='CNN type') +parser.add_argument('--half_precision', type=int, default=0, + help='output half precision float') +parser.add_argument('--num_decoding_thread', type=int, default=4, + help='Num parallel thread for video decoding') +parser.add_argument('--l2_normalize', type=int, default=1, + help='l2 normalize feature') +parser.add_argument('--resnext101_model_path', type=str, default='model/resnext101.pth', + help='Resnext model path') +parser.add_argument('--vmz_model_path', type=str, default='model/r2plus1d_34_clip8_ig65m_from_scratch-9bae36ae.pth', + help='vmz model path') + +args = parser.parse_args() + + +# TODO: refactor all args into config. (current code is from different people.) +CONFIGS = { + "2d": { + "fps": 1, + "size": 224, + "centercrop": False, + "shards": 0, + }, + "3d": { + "fps": 24, + "size": 112, + "centercrop": True, + "shards": 0, + }, + "s3d": { + "fps": 30, + "size": 224, + "centercrop": True, + "shards": 0, + }, + "vmz": { + "fps": 24, + "size": 112, + "centercrop": True, + "shards": 0, + }, + "vae": { + "fps": 2, + "size": 256, + "centercrop": True, + "shards": 100, + } +} + +config = CONFIGS[args.type] + + +video_dirs = args.vdir +feature_dir = args.fdir + +video_dict = PathBuilder.build(video_dirs, feature_dir, ".npy", config["shards"]) + +dataset = VideoLoader( + video_dict=video_dict, + framerate=config["fps"], + size=config["size"], + centercrop=config["centercrop"], + hflip=args.hflip +) +n_dataset = len(dataset) +sampler = RandomSequenceSampler(n_dataset, 10) +loader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + num_workers=args.num_decoding_thread, + sampler=sampler if n_dataset > 10 else None, +) +preprocess = Preprocessing(args.type) +model = get_model(args) + +with th.no_grad(): + for k, data in tqdm(enumerate(loader), total=loader.__len__(), ascii=True): + input_file = data['input'][0] + output_file = data['output'][0] + if len(data['video'].shape) > 3: + video = data['video'].squeeze() + if len(video.shape) == 4: + video = preprocess(video) + n_chunk = len(video) + if args.type == 'vmz': + n_chunk = math.ceil(n_chunk/float(3)) + features = th.cuda.FloatTensor(n_chunk, 512).fill_(0) + elif args.type == 's3d': + features = th.cuda.FloatTensor(n_chunk, 512).fill_(0) + elif args.type == "vae": + features = th.cuda.LongTensor(n_chunk, 1024).fill_(0) + else: + features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0) + n_iter = int(math.ceil(n_chunk / float(args.batch_size))) + for i in range(n_iter): + factor = 1 + if args.type == 'vmz': + factor = 3 + min_ind = factor * i * args.batch_size + max_ind = factor * (i + 1) * args.batch_size + video_batch = video[min_ind:max_ind:factor].cuda() + if args.type == '2d': + batch_features = model(video_batch) # (51, 487), (51, 512) + elif args.type == 's3d': + 
batch_features = model(video_batch) + batch_features = batch_features['video_embedding'] + elif args.type == "vae": + # image_code. + batch_features = model(video_batch) + else: + batch_pred, batch_features = model(video_batch) # (51, 487), (51, 512) + if args.l2_normalize: + batch_features = F.normalize(batch_features, dim=1) + features[i*args.batch_size:(i+1)*args.batch_size] = batch_features + features = features.cpu().numpy() + if args.half_precision: + if args.type == "vae": + features = features.astype(np.int16) + else: + features = features.astype('float16') + else: + if args.type == "vae": + features = features.astype(np.int32) + else: + features = features.astype('float32') + np.save(output_file, features) + else: + print('Video {} error.'.format(input_file)) diff --git a/examples/MMPT/scripts/video_feature_extractor/how2/s3d.sh b/examples/MMPT/scripts/video_feature_extractor/how2/s3d.sh new file mode 100644 index 0000000000..90102c89fb --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/how2/s3d.sh @@ -0,0 +1,8 @@ +#!/bin/bash + + +python scripts/video_feature_extractor/extract.py \ + --vdir \ + --fdir data/feat/feat_how2_s3d \ + --type=s3d --num_decoding_thread=4 \ + --batch_size 32 --half_precision 1 diff --git a/examples/MMPT/scripts/video_feature_extractor/model.py b/examples/MMPT/scripts/video_feature_extractor/model.py new file mode 100755 index 0000000000..ac266e844c --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/model.py @@ -0,0 +1,58 @@ +# Copyright (c) Howto100M authors and Facebook, Inc. All Rights Reserved + +import torch as th + +from torch import nn + + +class GlobalAvgPool(nn.Module): + def __init__(self): + super(GlobalAvgPool, self).__init__() + + def forward(self, x): + return th.mean(x, dim=[-2, -1]) + + +def get_model(args): + assert args.type in ['2d', '3d', 'vmz', 's3d', 'vae'] + if args.type == '2d': + print('Loading 2D-ResNet-152 ...') + import torchvision.models as models + model = models.resnet152(pretrained=True) + model = nn.Sequential(*list(model.children())[:-2], GlobalAvgPool()) + model = model.cuda() + elif args.type == 'vmz': + print('Loading VMZ ...') + from vmz34 import r2plus1d_34 + model = r2plus1d_34(pretrained_path=args.vmz_model_path, pretrained_num_classes=487) + model = model.cuda() + elif args.type == 's3d': + # we use one copy of s3d instead of dup another one for feature extraction. + from mmpt.processors.models.s3dg import S3D + model = S3D('pretrained_models/s3d_dict.npy', 512) + model.load_state_dict(th.load('pretrained_models/s3d_howto100m.pth')) + model = model.cuda() + + elif args.type == '3d': + print('Loading 3D-ResneXt-101 ...') + from videocnn.models import resnext + model = resnext.resnet101( + num_classes=400, + shortcut_type='B', + cardinality=32, + sample_size=112, + sample_duration=16, + last_fc=False) + model = model.cuda() + model_data = th.load(args.resnext101_model_path) + model.load_state_dict(model_data) + elif args.type == 'vae': + from openaivae import OpenAIParallelDiscreteVAE + model = OpenAIParallelDiscreteVAE() + model = model.cuda() + else: + raise ValueError("model not supported yet.") + + model.eval() + print('loaded') + return model diff --git a/examples/MMPT/scripts/video_feature_extractor/pathbuilder.py b/examples/MMPT/scripts/video_feature_extractor/pathbuilder.py new file mode 100644 index 0000000000..2392d6d63b --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/pathbuilder.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import os +import urllib.parse +import json +import pandas as pd + +from tqdm import tqdm + + +# TODO: extending to other datasets. +supported_formats = {} + + +class PathBuilder(object): + @classmethod + def build(cls, video_dirs, feature_dir, ext, shards=0, split=None): + meta_fn = os.path.join(feature_dir, "meta_plan.json") + os.makedirs(feature_dir, exist_ok=True) + if os.path.isfile(meta_fn): + with open(meta_fn) as fr: + meta = json.load(fr) + return meta + print("searching videos...") + + video_id_to_path = {} + for video_dir in video_dirs.split(","): + # TODO: add supports of recursive listdir. + if video_dir in supported_formats: + supported_formats[video_dir].load(video_dir, video_id_to_path) + else: + for idx, fn in enumerate(tqdm(os.listdir(video_dir))): + video_fn = os.path.join(video_dir, fn) + if os.path.isfile(video_fn): + video_id = os.path.splitext(fn)[0] + video_id_to_path[video_id] = video_fn + elif os.path.isdir(video_fn): + # shards of folders. + shard_dir = video_fn + for idx, fn in enumerate(os.listdir(shard_dir)): + video_fn = os.path.join(shard_dir, fn) + if os.path.isfile(video_fn): + video_id = os.path.splitext(fn)[0] + video_id_to_path[video_id] = video_fn + + video_path, feature_path = [], [] + valid_ext = set() + for idx, video_id in enumerate(video_id_to_path): + video_path.append(video_id_to_path[video_id]) + if ext is None: + # use original file ext for format compatibility. + video_id_to_path[video_id] + path = urllib.parse.urlparse(video_id_to_path[video_id]).path + ext = os.path.splitext(path)[1] + if ext not in valid_ext: + valid_ext.add(ext) + print("adding", ext) + if shards: + shard_id = str(idx % shards) + feature_fn = os.path.join( + feature_dir, shard_id, video_id + ext) + else: + feature_fn = os.path.join( + feature_dir, video_id + ext) + feature_path.append(feature_fn) + + print("targeting", len(feature_path), "videos") + meta = { + "video_path": video_path, "feature_path": feature_path} + with open(meta_fn, "w") as fw: + json.dump(meta, fw) + + if split is not None: + splits = split.split("/") + assert len(splits) == 2 + cur, total = int(splits[0]), int(splits[1]) + assert cur < total + import math + chunk = math.ceil(len(meta["video_path"]) / total) + start = cur * chunk + end = (cur + 1) * chunk + meta = { + "video_path": meta["video_path"][start:end], + "feature_path": meta["feature_path"][start:end] + } + + return meta diff --git a/examples/MMPT/scripts/video_feature_extractor/preprocessing.py b/examples/MMPT/scripts/video_feature_extractor/preprocessing.py new file mode 100755 index 0000000000..fa0cec3a76 --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/preprocessing.py @@ -0,0 +1,57 @@ +# Copyright Howto100m authors. +# Copyright (c) Facebook, Inc. 
All Rights Reserved + +import torch as th + +class Normalize(object): + + def __init__(self, mean, std): + self.mean = th.FloatTensor(mean).view(1, 3, 1, 1) + self.std = th.FloatTensor(std).view(1, 3, 1, 1) + + def __call__(self, tensor): + tensor = (tensor - self.mean) / (self.std + 1e-8) + return tensor + +class Preprocessing(object): + + def __init__(self, type): + self.type = type + if type == '2d': + self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + elif type == '3d': + self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0]) + elif type == 'vmz': + self.norm = Normalize(mean=[110.201, 100.64, 95.997], std=[58.1489, 56.4701, 55.3324]) + + def _zero_pad(self, tensor, size): + n = size - len(tensor) % size + if n == size: + return tensor + else: + z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3]) + return th.cat((tensor, z), 0) + + def __call__(self, tensor): + if self.type == '2d': + tensor = tensor / 255.0 + tensor = self.norm(tensor) + elif self.type == 'vmz': + #tensor = self._zero_pad(tensor, 8) + tensor = self._zero_pad(tensor, 10) + tensor = self.norm(tensor) + #tensor = tensor.view(-1, 8, 3, 112, 112) + tensor = tensor.view(-1, 10, 3, 112, 112) + tensor = tensor.transpose(1, 2) + elif self.type == '3d': + tensor = self._zero_pad(tensor, 16) + tensor = self.norm(tensor) + tensor = tensor.view(-1, 16, 3, 112, 112) + tensor = tensor.transpose(1, 2) + elif self.type == 's3d': + tensor = tensor / 255.0 + tensor = self._zero_pad(tensor, 30) + tensor = tensor.view(-1, 30, 3, 224, 224) # N x 30 x 3 x H x W + tensor = tensor.transpose(1, 2) # N x 3 x 30 x H x W + # for vae do nothing + return tensor diff --git a/examples/MMPT/scripts/video_feature_extractor/random_sequence_shuffler.py b/examples/MMPT/scripts/video_feature_extractor/random_sequence_shuffler.py new file mode 100755 index 0000000000..1f3e4aceaa --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/random_sequence_shuffler.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. All Rights Reserved + +import numpy as np + +from torch.utils.data.sampler import Sampler + + +class RandomSequenceSampler(Sampler): + + def __init__(self, n_sample, seq_len): + self.n_sample = n_sample + self.seq_len = seq_len + + def _pad_ind(self, ind): + zeros = np.zeros(self.seq_len - self.n_sample % self.seq_len) + ind = np.concatenate((ind, zeros)) + return ind + + def __iter__(self): + idx = np.arange(self.n_sample) + if self.n_sample % self.seq_len != 0: + idx = self._pad_ind(idx) + idx = np.reshape(idx, (-1, self.seq_len)) + np.random.shuffle(idx) + idx = np.reshape(idx, (-1)) + return iter(idx.astype(int)) + + def __len__(self): + return self.n_sample + (self.seq_len - self.n_sample % self.seq_len) diff --git a/examples/MMPT/scripts/video_feature_extractor/shard_feature.py b/examples/MMPT/scripts/video_feature_extractor/shard_feature.py new file mode 100644 index 0000000000..f75e1dfae5 --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/shard_feature.py @@ -0,0 +1,64 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
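`Preprocessing` above zero-pads the decoded frames to a multiple of the clip length and reshapes them into fixed-size clips (30 frames per clip in the s3d setting). A quick sanity check of that shape arithmetic, assuming 75 decoded frames:

```python
# Shape sanity check for the clip packing done by Preprocessing('s3d').
import torch as th

frames = th.rand(75, 3, 224, 224)                    # 75 decoded RGB frames
pad = (30 - len(frames) % 30) % 30                   # pad up to a multiple of 30
frames = th.cat([frames, th.zeros(pad, 3, 224, 224)], dim=0)
clips = frames.view(-1, 30, 3, 224, 224).transpose(1, 2)
print(clips.shape)   # torch.Size([3, 3, 30, 224, 224]) -> N x C x T x H x W
```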
+import numpy as np +import os +import pickle + +from mmpt.utils import ShardedTensor + + +class Shard(object): + def __init__( + self, + vfeat_dir, + tfeat_dir, + target_dir, + file_paths, + shard_size=4096 + ): + self.vfeat_dir = vfeat_dir + self.tfeat_dir = tfeat_dir + self.target_dir = target_dir + self.video_ids = {} + for split, file_path in zip(["train", "val"], file_paths): + with open(file_path) as fr: + self.video_ids[split] = [ + line.strip() for line in fr.readlines()] + self.shard_size = shard_size + + def __call__(self, split="train"): + for split in ["train", "val"]: + meta = {} + for shard_idx, shard_offset in enumerate( + range(0, len(self.video_ids[split]), self.shard_size) + ): + print(shard_idx) + meta_shard = [] + video_shard = [] + for video_id in self.video_ids[split][shard_offset:shard_offset+self.shard_size]: + meta_shard.append(video_id) + npy_file = os.path.join(self.vfeat_dir, video_id + ".npy") + video_shard.append(np.load(npy_file)) + + meta[shard_idx] = meta_shard + video_shard = ShardedTensor.from_list(video_shard) + target_path = os.path.join( + self.target_dir, split + "_" + str(shard_idx)) + video_shard.save(target_path) + + target_path = os.path.join(self.target_dir, split + "_meta") + with open(target_path + ".pkl", "wb") as fw: + pickle.dump(meta, fw, pickle.HIGHEST_PROTOCOL) + + +if __name__ == "__main__": + shard = Shard( + "data/feat/feat_how2_s3d", + "data/how2/raw_caption_dedup.bert-base-uncased", + "data/feat/feat_how2_s3d_shard_small", + ["data/how2/how2_s3d_train.lst", "data/how2/how2_s3d_val.lst"] + ) + + shard() diff --git a/examples/MMPT/scripts/video_feature_extractor/videoreader.py b/examples/MMPT/scripts/video_feature_extractor/videoreader.py new file mode 100644 index 0000000000..429e05f8bc --- /dev/null +++ b/examples/MMPT/scripts/video_feature_extractor/videoreader.py @@ -0,0 +1,242 @@ +# Copyright Howto100M authors. +# Copyright (c) Facebook, Inc. 
All Rights Reserved + +import torch as th +import pandas as pd +import os +import numpy as np +import ffmpeg +import random + +from torch.utils.data import Dataset + + +class VideoLoader(Dataset): + """modified from how2's video_feature_extractor.""" + def __init__( + self, + csv=None, + video_dict=None, + framerate=1, + size=112, + centercrop=False, + hflip=False, + **kwargs + ): + if csv is None and video_dict is None: + raise ValueError("csv and video_dict cannot be both None.") + if csv is not None: + self.csv = pd.read_csv(csv) + if video_dict is not None: + self.csv = pd.DataFrame.from_dict(video_dict) + + self.centercrop = centercrop + self.size = size + self.framerate = framerate + self.hflip = hflip + + def __len__(self): + return len(self.csv) + + def _get_video_dim(self, video_path): + probe = ffmpeg.probe(video_path) + video_stream = next((stream for stream in probe['streams'] + if stream['codec_type'] == 'video'), None) + width = int(video_stream['width']) + height = int(video_stream['height']) + return height, width + + def _get_video_info(self, video_path): + probe = ffmpeg.probe(video_path) + video_stream = next((stream for stream in probe['streams'] + if stream['codec_type'] == 'video'), None) + return video_stream + + def _get_output_dim(self, h, w): + if isinstance(self.size, tuple) and len(self.size) == 2: + return self.size + elif h >= w: + return int(h * self.size / w), self.size + else: + return self.size, int(w * self.size / h) + + def __getitem__(self, idx): + video_path = self.csv['video_path'].values[idx] + output_file = self.csv['feature_path'].values[idx] + return self._decode(output_file, video_path) + + def _decode(self, output_file, video_path): + if not(os.path.isfile(output_file)) and os.path.isfile(video_path): + try: + h, w = self._get_video_dim(video_path) + except Exception: + print('ffprobe failed at: {}'.format(video_path)) + return {'video': th.zeros(1), 'input': video_path, + 'output': output_file} + try: + os.makedirs(os.path.dirname(output_file), exist_ok=True) + height, width = self._get_output_dim(h, w) + + cmd = ( + ffmpeg + .input(video_path) + .filter('fps', fps=self.framerate) + .filter('scale', width, height) + ) + if self.hflip: + cmd = cmd.filter('hflip') + + if self.centercrop: + x = int((width - self.size) / 2.0) + y = int((height - self.size) / 2.0) + cmd = cmd.crop(x, y, self.size, self.size) + video = self._run(cmd, output_file) + except Exception: + video = th.zeros(1) + else: + video = th.zeros(1) + + return {'video': video, 'input': video_path, 'output': output_file} + + def _run(self, cmd, output_file): + out, _ = ( + cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') + .run(capture_stdout=True, quiet=True) + ) + if self.centercrop and isinstance(self.size, int): + height, width = self.size, self.size + video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3]) + video = th.from_numpy(video.astype('float32')) + return video.permute(0, 3, 1, 2) + + +class VideoVerifier(VideoLoader): + def __getitem__(self, idx): + video_path = self.csv['video_path'].values[idx] + try: + return self._get_video_info(video_path) + except Exception: + # print('ffprobe failed at: {}'.format(video_path)) + return None + + +class VideoCompressor(VideoLoader): + def __init__( + self, + csv=None, + video_dict=None, + framerate=1, + size=112, + centercrop=False, + hflip=False, + crf=32, + **kwargs + ): + super().__init__( + csv, + video_dict, + framerate, + size, + centercrop, + hflip + ) + self.crf = crf + + def _run(self, cmd, 
output_file): + out, _ = ( + cmd.output(filename=output_file, crf=self.crf) + .run(quiet=True) + ) + video = None + return video + + +class VideoDownloader(VideoCompressor): + """download""" + def __getitem__(self, idx): + video_path = self.csv['video_path'].values[idx] + output_file = self.csv['feature_path'].values[idx] + if not(os.path.isfile(output_file)): + os.makedirs(os.path.dirname(output_file), exist_ok=True) + cmd = "wget -O" + output_file + " " + video_path + # import subprocess + # subprocess.check_output( + # cmd, + # stderr=subprocess.STDOUT, shell=True) + os.system(cmd) + return {'video': None, 'input': video_path, 'output': output_file} + + +class AvKeyframeVideoCompressor(VideoLoader): + """extract keyframes from a video and save it as jpg. + TODO: consider to merge with `CodecProcessor`. + """ + def __init__( + self, + csv=None, + video_dict=None, + framerate=1, + size=112, + centercrop=False, + max_num_frames=5, + **kwargs + ): + super().__init__(csv, video_dict, framerate, size, centercrop) + self.max_num_frames = max_num_frames + + def _get_video_dim(self, video_fn): + """decord cannot probe the size of a video, we use pyav instead.""" + import av + with av.open(video_fn) as container: + height = container.streams.video[0].codec_context.height + width = container.streams.video[0].codec_context.width + return height, width + + def _get_output_dim(self, height, width): + """ + keep the shorter side be `self.size`, strech the other. + """ + if height >= width: + return int(height * self.size / width), self.size + else: + return self.size, int(width * self.size / height) + + def __getitem__(self, idx): + import av + video_path = self.csv['video_path'].values[idx] + output_file = self.csv['feature_path'].values[idx] + if not(os.path.isdir(output_file)) and os.path.isfile(video_path): + try: + h, w = self._get_video_dim(video_path) + except Exception: + print('probe failed at: {}'.format(video_path)) + return {'video': th.zeros(1), 'input': video_path, + 'output': output_file} + + try: + height, width = self._get_output_dim(h, w) + + # new for av. + with av.open(video_path) as container: + container.streams.video[0].thread_type = "AUTO" + container.streams.video[0].codec_context.height = height + container.streams.video[0].codec_context.width = width + if self.framerate == 0: # keyframe. 
+ container.streams.video[0].codec_context.skip_frame = 'NONKEY' + frames = [] + for frame in container.decode(video=0): + frames.append(frame) + frames = random.sample(frames, self.max_num_frames) + + os.makedirs(output_file, exist_ok=True) + for frame in frames: + frame.to_image().save( + os.path.join( + output_file, + "%04d.jpg" % frame.index)) + except Exception: + print('extract failed at: {}'.format(video_path)) + return {'video': th.zeros(1), 'input': video_path, + 'output': output_file} + video = th.zeros(1) + return {'video': video, 'input': video_path, 'output': output_file} diff --git a/examples/MMPT/setup.py b/examples/MMPT/setup.py new file mode 100644 index 0000000000..a9a82296ea --- /dev/null +++ b/examples/MMPT/setup.py @@ -0,0 +1,24 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="mmpt", + version="0.0.1", + author="Hu Xu, Po-yao Huang", + author_email="huxu@fb.com", + description="A package for multimodal pretraining.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/pytorch/fairseq/examples/MMPT", + packages=setuptools.find_packages(), + install_requires=[ + ], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: CC-BY-NC", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/examples/MMPT/videoclip.png b/examples/MMPT/videoclip.png new file mode 100644 index 0000000000..50dd0abfe4 Binary files /dev/null and b/examples/MMPT/videoclip.png differ diff --git a/examples/MMPT/vlm.png b/examples/MMPT/vlm.png new file mode 100644 index 0000000000..55c97dbc9f Binary files /dev/null and b/examples/MMPT/vlm.png differ diff --git a/examples/__init__.py b/examples/__init__.py index 80d95f5fe7..44bb24ae61 100644 --- a/examples/__init__.py +++ b/examples/__init__.py @@ -3,4 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from fairseq.version import __version__ # noqa +try: + from fairseq.version import __version__ # noqa +except ImportError: + pass diff --git a/examples/adaptive_span/README.md b/examples/adaptive_span/README.md new file mode 100644 index 0000000000..d5224fb289 --- /dev/null +++ b/examples/adaptive_span/README.md @@ -0,0 +1,90 @@ +# Adaptive Span + +Adaptive Span is a novel self-attention mechanism that can learn its optimal +attention span. This allows us to significantly extend the maximum context size +used in Transformers, while maintaining control over their memory footprint +and computational time. It uses the Truncated BPTT technique for training, +as in [transformerXL](https://github.com/pytorch/fairseq/blob/main/examples/truncated_bptt/README.md). + +Adaptive Span was introduced in the paper +[Adaptive Attention Span in Transformers](https://arxiv.org/abs/1905.07799), +which achieved state-of-the-art language modeling results at the time of publication. + +We managed to reproduce their results in fairseq and kept most of the +[original implementation](https://github.com/facebookresearch/adaptive-span) untouched. +You can also refer to their sweep file if any hyperparameter combination is unclear. + +##### 0. Setup + +First you need to process the Enwik8 dataset; we use the pre-tokenized dataset +from the [adaptive span paper](https://github.com/facebookresearch/adaptive-span/blob/master/get_data.sh).
+You can download the dataset, and then run:
+```bash
+fairseq-preprocess --only-source --trainpref ~/data/enwik8/train.txt \
+    --validpref ~/data/enwik8/valid.txt --testpref ~/data/enwik8/test.txt \
+    --destdir ~/data/enwik8/data-bin/ --joined-dictionary --workers 20
+```
+
+##### 1. Train an Adaptive Span model on Enwik8
+
+We will train a 12-layer Adaptive Span model following the [hyperparameters
+used in the original
+paper](https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8.sh).
+
+The following command assumes 4 GPUs, so that the total batch size is 64
+sequences (4 x 16). Training should take 2-3 days on 4 V100 GPUs:
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \
+    --user-dir examples/adaptive_span \
+    --data ~/data/enwik8/data-bin/ \
+    --fp16 --fp16-no-flatten-grads --max-update 600000 \
+    --task truncated_bptt_lm --tokens-per-sample 512 --arch adaptive_span \
+    --n-layer 12 --d-model 512 --n-head 8 --d-inner 2048 --dropout 0.3 \
+    --attn-span 8192 --optimizer adagrad_with_grad_clip --adagrad-clip 0.03 \
+    --validate-interval-updates 1000 \
+    --lr-scheduler fixed --warmup-updates 32000 --batch-size-valid 32 \
+    --lr 0.07 --criterion adaptive_span_loss --batch-size 16 --update-freq 1 \
+    --seed 2 --log-format json --log-interval 25 --aux-loss-scaler 5e-07
+```
+This should land around 1.05 on validation, 1.03 on test. You can lower the
+--aux-loss-scaler for better performance (longer span). It gives a ~0.03 bpc
+improvement over the transformerXL baseline here.
+If training on a single GPU, set `--update-freq=4` to accumulate 4x gradients
+and simulate training on 4 GPUs.
+You can also reproduce the transformerXL result on enwik8 using this code base.
+It should land around 1.06 on test, matching the [original paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_enwik8_base.sh).
+You can try it with:
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \
+    --user-dir examples/truncated_bptt \
+    ~/data/enwik8/data-bin/ \
+    --task truncated_bptt_lm --fp16 --max-update 400000 \
+    --tokens-per-sample 512 --arch transformer_xl --n-layer 12 \
+    --d-model 512 --n-head 8 --d-head 64 --d-inner 2048 --dropout 0.1 \
+    --dropatt 0.0 --mem-len 512 --optimizer adam --clip-norm 0.25 \
+    --lr-scheduler cosine --warmup-updates 0 \
+    --lr 0.0 --lr 0.00025 --batch-size 15 \
+    --update-freq 1 --seed 2 --log-format json --log-interval 25 \
+    --fp16
+```
+
+##### 2. Evaluate
+For Adaptive Span:
+```bash
+fairseq-eval-lm ~/data/enwik8/data-bin/ --path model/checkpoint_best.pt \
+    --user-dir examples/adaptive_span \
+    --task truncated_bptt_lm --batch-size 8 --tokens-per-sample 512 --gen-subset test
+```
+For Transformer-XL evaluation:
+```bash
+fairseq-eval-lm ~/data/enwik8/data-bin/ --path model/checkpoint_best.pt \
+    --user-dir examples/truncated_bptt/ --task truncated_bptt_lm --batch-size 8 \
+    --tokens-per-sample 80 \
+    --model-overrides '{"mem_len":2100,"clamp_len":820,"same_length":True}' \
+    --gen-subset valid
+```
+
+*Note:* During training the model saw 512 tokens of context
+(``--tokens-per-sample=512``), with batch size 8. These settings match the evaluation
+settings from [the original
+paper](https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8.sh).
diff --git a/examples/adaptive_span/__init__.py b/examples/adaptive_span/__init__.py
new file mode 100644
index 0000000000..e0a142a769
--- /dev/null
+++ b/examples/adaptive_span/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +import os + +# automatically import any Python files in the current directory +cur_dir = os.path.dirname(__file__) +for file in os.listdir(cur_dir): + path = os.path.join(cur_dir, file) + if ( + not file.startswith("_") + and not file.startswith(".") + and (file.endswith(".py") or os.path.isdir(path)) + ): + mod_name = file[: file.find(".py")] if file.endswith(".py") else file + module = importlib.import_module(__name__ + "." + mod_name) diff --git a/examples/adaptive_span/adagrad_with_grad_clip.py b/examples/adaptive_span/adagrad_with_grad_clip.py new file mode 100644 index 0000000000..585ce184ab --- /dev/null +++ b/examples/adaptive_span/adagrad_with_grad_clip.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from torch.optim import Adagrad + +from fairseq.optim import LegacyFairseqOptimizer, register_optimizer + + +@register_optimizer("adagrad_with_grad_clip") +class FairseqAdagradWithGradClip(LegacyFairseqOptimizer): + def __init__(self, args, params): + super().__init__(args) + self._optimizer = AdagradWithGradClip(params, **self.optimizer_config) + + @staticmethod + def add_args(parser): + """Add optimizer-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', + help='weight decay') + parser.add_argument('--adagrad-clip', default=0.0, type=float, metavar='D', + help='internal grad clip') + # fmt: on + + @property + def optimizer_config(self): + """ + Return a kwarg dictionary that will be used to override optimizer + args stored in checkpoints. This allows us to load a checkpoint and + resume training using a different set of optimizer args, e.g., with a + different learning rate. 
+ """ + return { + "lr": self.args.lr[0], + "weight_decay": self.args.weight_decay, + "grad_clip": self.args.adagrad_clip, + } + + @property + def supports_flat_params(self): + return False + + +def _clip_grad(clr, grad, group_grad_clip): + if group_grad_clip > 0: + norm = grad.norm(2).item() + if norm > group_grad_clip: + clr *= group_grad_clip / (norm + 1e-10) + return clr + + +class AdagradWithGradClip(Adagrad): + """Adagrad algorithm with custom gradient clipping""" + + def __init__( + self, + params, + lr=1e-2, + lr_decay=0, + weight_decay=0, + initial_accumulator_value=0, + grad_clip=0, + ): + Adagrad.__init__( + self, + params, + lr=lr, + lr_decay=lr_decay, + weight_decay=weight_decay, + initial_accumulator_value=initial_accumulator_value, + ) + self.defaults["grad_clip"] = grad_clip + self.param_groups[0].setdefault("grad_clip", grad_clip) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: + continue + + grad = p.grad.data + state = self.state[p] + + state["step"] += 1 + + if group["weight_decay"] != 0: + if p.grad.data.is_sparse: + raise RuntimeError( + "weight_decay option is " + "not compatible with sparse " + "gradients" + ) + grad = grad.add(group["weight_decay"], p.data) + + clr = group["lr"] / (1 + (state["step"] - 1) * group["lr_decay"]) + + # clip + clr = _clip_grad(clr=clr, grad=grad, group_grad_clip=group["grad_clip"]) + + if grad.is_sparse: + # the update is non-linear so indices must be unique + grad = grad.coalesce() + grad_indices = grad._indices() + grad_values = grad._values() + size = grad.size() + + def make_sparse(values): + constructor = grad.new + if grad_indices.dim() == 0 or values.dim() == 0: + return constructor().resize_as_(grad) + return constructor(grad_indices, values, size) + + state["sum"].add_(make_sparse(grad_values.pow(2))) + std = state["sum"]._sparse_mask(grad) + std_values = std._values().sqrt_().add_(1e-10) + p.data.add_(-clr, make_sparse(grad_values / std_values)) + else: + state["sum"].addcmul_(1, grad, grad) + std = state["sum"].sqrt().add_(1e-10) + p.data.addcdiv_(-clr, grad, std) + + return loss diff --git a/examples/adaptive_span/adaptive_span_attention.py b/examples/adaptive_span/adaptive_span_attention.py new file mode 100644 index 0000000000..07f757bb8e --- /dev/null +++ b/examples/adaptive_span/adaptive_span_attention.py @@ -0,0 +1,160 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class AdaptiveMask(nn.Module): + """Soft masking function for adaptive size. + It masks out the last K values of an input. The masking value + goes from 1 to 0 gradually, so K can be learned with + back-propagation. + Args: + max_size: maximum size (i.e. 
input dimension) + ramp_size: size of the ramp going from 0 to 1 + init_val: initial size proportion not to be masked out + shape: learn multiple sizes independent of each other + """ + + def __init__(self, max_size, ramp_size, init_val=0, shape=(1,)): + nn.Module.__init__(self) + self._max_size = max_size + self._ramp_size = ramp_size + self.current_val = nn.Parameter(torch.zeros(*shape) + init_val) + mask_template = torch.linspace(1 - max_size, 0, steps=max_size) + self.register_buffer("mask_template", mask_template) + + def forward(self, x): + mask = self.mask_template.float() + self.current_val.float() * self._max_size + mask = mask / self._ramp_size + 1 + mask = mask.clamp(0, 1) + if x.size(-1) < self._max_size: + # the input could have been trimmed beforehand to save computation + mask = mask.narrow(-1, self._max_size - x.size(-1), x.size(-1)) + x = (x * mask).type_as(x) + return x + + def get_current_max_size(self, include_ramp=True): + current_size = math.ceil(self.current_val.max().item() * self._max_size) + if include_ramp: + current_size += self._ramp_size + current_size = max(0, min(self._max_size, current_size)) + return current_size + + def get_current_avg_size(self, include_ramp=True): + current_size = math.ceil( + self.current_val.float().mean().item() * self._max_size + ) + if include_ramp: + current_size += self._ramp_size + current_size = max(0, min(self._max_size, current_size)) + return current_size + + def clamp_param(self): + """this need to be called after each update""" + self.current_val.data.clamp_(0, 1) + + +class AdaptiveSpan(nn.Module): + """Adaptive attention span for Transformerself. + This module learns an attention span length from data for each + self-attention head. + Args: + attn_span: maximum attention span + adapt_span_loss: loss coefficient for the span length + adapt_span_ramp: length of the masking ramp + adapt_span_init: initial size ratio + adapt_span_cache: adapt cache size to reduce memory usage + """ + + def __init__( + self, + attn_span, + adapt_span_ramp, + adapt_span_init, + n_head, + adapt_span_layer, + **kargs + ): + nn.Module.__init__(self) + self._max_span = attn_span + self._n_head = n_head + self._adapt_span_layer = adapt_span_layer + if self._adapt_span_layer: + self._mask = AdaptiveMask( + max_size=self._max_span, + ramp_size=adapt_span_ramp, + init_val=adapt_span_init, + ) + else: + self._mask = AdaptiveMask( + max_size=self._max_span, + ramp_size=adapt_span_ramp, + init_val=adapt_span_init, + shape=(n_head, 1, 1), + ) + + def forward(self, attn, normalize=True): + """mask attention with the right span""" + # batch and head dimensions are merged together, so separate them first + self.clamp_param() + if self._adapt_span_layer: + attn = self._mask(attn) + else: + B = attn.size(0) # batch size + M = attn.size(1) # block size + attn = attn.reshape(B // self._n_head, self._n_head, M, -1) + attn = self._mask(attn) + attn = attn.view(B, M, -1) + return attn + + def get_trim_len(self): + """how much of memory can be trimmed to reduce computation""" + L = self._max_span + trim_len = min(L - 1, L - self._mask.get_current_max_size()) + # too fine granularity might be bad for the memory management + trim_len = math.floor(trim_len / 64) * 64 + return trim_len + + def trim_memory(self, query, key, value, key_pe): + """trim out unnecessary memory beforehand to reduce computation""" + trim_len = self.get_trim_len() + cache_size = key.size(1) - query.size(1) + trim_len_cache = trim_len - (self._max_span - cache_size) + if trim_len_cache > 0: + key 
= key[:, trim_len_cache:, :] + value = value[:, trim_len_cache:, :] + elif trim_len_cache < 0: + # cache is too short! this happens when validation resumes + # after a lot of updates. + key = F.pad(key, [0, 0, -trim_len_cache, 0]) + value = F.pad(value, [0, 0, -trim_len_cache, 0]) + if trim_len > 0: + if key_pe is not None: + key_pe = key_pe[:, :, trim_len:] + return key, value, key_pe + + def get_cache_size(self): + """determine how long the cache should be""" + trim_len = self.get_trim_len() + # give a buffer of 64 steps since a span might increase + # in future updates + return min(self._max_span, self._max_span - trim_len + 64) + + def get_loss(self): + """a loss term for regularizing the span length""" + return self._max_span * self._mask.current_val.float().mean() + + def get_current_max_span(self): + return self._mask.get_current_max_size() + + def get_current_avg_span(self): + return self._mask.get_current_avg_size() + + def clamp_param(self): + self._mask.clamp_param() diff --git a/examples/adaptive_span/adaptive_span_loss.py b/examples/adaptive_span/adaptive_span_loss.py new file mode 100644 index 0000000000..fe95b0d949 --- /dev/null +++ b/examples/adaptive_span/adaptive_span_loss.py @@ -0,0 +1,107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass + +import torch.nn.functional as F +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import register_criterion +from fairseq.criterions.cross_entropy import CrossEntropyCriterion +from fairseq.dataclass import FairseqDataclass +from omegaconf import II + + +@dataclass +class AdaptiveSpanCriterionConfig(FairseqDataclass): + sentence_avg: bool = II("optimization.sentence_avg") + + +@register_criterion("adaptive_span_loss", dataclass=AdaptiveSpanCriterionConfig) +class AdaptiveSpanCriterion(CrossEntropyCriterion): + def __init__(self, task, sentence_avg): + super().__init__(task, sentence_avg) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss here is summed, different from the adaptive span code + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + net_output = model(**sample["net_input"]) + loss, aux_loss, avg_span, max_span = self.compute_loss( + model, net_output, sample, reduce=reduce + ) + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + loss /= sample_size + total_loss = loss + aux_loss + sample_size = 1 + + logging_output = { + "loss": loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + "total_loss": total_loss.data, + "avg_span": avg_span * sample_size, + "max_span": max_span * sample_size, + } + return total_loss, sample_size, logging_output + + def compute_loss(self, model, net_output, sample, reduce=True): + loss, _ = super().compute_loss(model, net_output, sample, reduce) + aux_loss = model.get_aux_loss() + avg_span = model.get_current_avg_span() + max_span = model.get_current_max_span() + return loss, aux_loss, avg_span, max_span + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + total_loss_sum = sum(log.get("total_loss", 0) for log in logging_outputs) + avg_span_sum = sum(log.get("avg_span", 0) for log in logging_outputs) + max_span_sum = sum(log.get("max_span", 0) for log in logging_outputs) + + # we divide by log(2) to convert the loss from base e to base 2 + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + metrics.log_scalar("avg_span", avg_span_sum / sample_size, sample_size, round=3) + metrics.log_scalar("max_span", max_span_sum / sample_size, sample_size, round=3) + # total loss contains the L1 norm on adaptive-span + metrics.log_scalar( + "total_loss", + total_loss_sum / sample_size / math.log(2), + sample_size, + round=3, + ) + if sample_size != ntokens: + metrics.log_scalar( + "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) + ) + else: + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/examples/adaptive_span/adaptive_span_model.py b/examples/adaptive_span/adaptive_span_model.py new file mode 100644 index 0000000000..d96c95b85d --- /dev/null +++ b/examples/adaptive_span/adaptive_span_model.py @@ -0,0 +1,263 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
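The soft ramp in `AdaptiveMask` from `adaptive_span_attention.py` above is easier to see on a toy tensor. Below is a minimal standalone sketch of the same formula; the sizes and the learned value are chosen purely for illustration and this does not import the fairseq module:

```python
import torch

# Toy reimplementation of the AdaptiveMask ramp (illustrative values only).
max_size, ramp_size = 16, 4
current_val = torch.tensor(0.5)  # learned span proportion in [0, 1]

template = torch.linspace(1 - max_size, 0, steps=max_size)   # [-15, ..., 0]
mask = (template + current_val * max_size) / ramp_size + 1
mask = mask.clamp(0, 1)

print(mask)
# Old positions get 0, recent positions get 1, with a linear ramp of length
# `ramp_size` in between; gradients reach current_val through the un-clamped
# ramp region, which is how the span length is learned.
```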
+ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq.modules.layer_norm import LayerNorm + +from .adaptive_span_attention import AdaptiveSpan + +# Size notations: +# B = batch_size, H = d_model, M = block_size, L = attn_span + + +def _skew(X, pad_value): + """shift every row 1 step to right""" + # X = B x M x L + B, M, L = X.size() + X = F.pad(X, (0, M + 1), value=pad_value) # B x M x (L+M+1) + X = X.view(B, -1) # B x ML+MM+M + X = X[:, :-M] # B x ML+MM + X = X.view(B, M, M + L) # B x M x L+M + return X + + +def _unskew(X): + """reverse _skew operation""" + # X = B x M x L+M + B, M, L = X.size() + L -= M + X = X.view(B, -1) # B x ML+MM + X = F.pad(X, (0, M)) # B x ML+MM+M + X = X.view(B, M, M + L + 1) # B x M x L+M+1 + X = X[:, :, :L] # B x M x L + return X + + +class SeqAttention(nn.Module): + """Sequential self-attention layer. + Each token will attend to its previous fixed number of steps. + Note that attention doesn't include the current step itself. + """ + + def __init__(self, d_model, n_head, attn_span, dropout, adapt_span_layer, **kargs): + nn.Module.__init__(self) + self.dropout = nn.Dropout(dropout) + self.d_model = d_model # size of a single head + self.attn_span = attn_span + self.adaptive_span = AdaptiveSpan( + attn_span=attn_span, + n_head=n_head, + adapt_span_layer=adapt_span_layer, + **kargs + ) + + def forward(self, query, key, value, key_pe): + # query size = B x M x H + # key, value sizes = B x (M+L) x H + + key, value, key_pe = self.adaptive_span.trim_memory(query, key, value, key_pe) + + # compute attention from context + # B x M (dest) x (M+L) (src) + attn_cont = torch.matmul(query, key.transpose(-1, -2)) + attn_cont = _unskew(attn_cont) # B x M x L + + # compute the effect of position embedding + attn_pos = torch.matmul(query, key_pe) # B x M x L_pos + attn = attn_cont + attn_pos + + attn = attn / math.sqrt(self.d_model) # B x M X L_pos + + attn = F.softmax(attn.float(), dim=-1).type_as(attn) + + # trim attention lengths according to the learned span + attn = self.adaptive_span(attn) + + attn = self.dropout(attn) # B x M X L_pos + + attn_cont = _skew(attn, 0) # B x M X (L+M) + out = torch.matmul(attn_cont, value) # B x M x H + return out + + def get_cache_size(self): + return self.adaptive_span.get_cache_size() + + +class MultiHeadSeqAttention(nn.Module): + def __init__(self, d_model, n_head, **kargs): + nn.Module.__init__(self) + assert d_model % n_head == 0 + self.n_head = n_head + self.head_dim = d_model // n_head + self.attn = SeqAttention(d_model=self.head_dim, n_head=n_head, **kargs) + self.proj_query = nn.Linear(d_model, d_model, bias=False) + nn.init.xavier_normal_(self.proj_query.weight) + self.proj_out = nn.Linear(d_model, d_model, bias=False) + nn.init.xavier_normal_(self.proj_out.weight) + self.proj_val = nn.Linear(d_model, d_model, bias=False) + nn.init.xavier_normal_(self.proj_val.weight) + self.proj_key = nn.Linear(d_model, d_model, bias=False) + nn.init.xavier_normal_(self.proj_key.weight) + + def head_reshape(self, x): + K = self.n_head + D = self.head_dim + x = x.view(x.size()[:-1] + (K, D)) # B x (M+L) x K x D + x = x.transpose(1, 2).contiguous() # B x K x (M+L) x D + x = x.view(-1, x.size(-2), x.size(-1)) # B_K x (M+L) x D + return x + + def forward(self, query, key, value, key_pe): + B = query.size(0) + K = self.n_head + D = self.head_dim + M = query.size(1) + + query = self.proj_query(query) + query = self.head_reshape(query) + value = self.proj_val(value) + value = self.head_reshape(value) + key 
= self.proj_key(key) + key = self.head_reshape(key) + + out = self.attn(query, key, value, key_pe) # B_K x M x D + out = out.view(B, K, M, D) # B x K x M x D + out = out.transpose(1, 2).contiguous() # B x M x K x D + out = out.view(B, M, -1) # B x M x K_D + out = self.proj_out(out) + return out + + +class FeedForwardLayer(nn.Module): + def __init__(self, d_model, d_inner, dropout, **kargs): + nn.Module.__init__(self) + self.fc1 = nn.Linear(d_model, d_inner) + self.fc2 = nn.Linear(d_inner, d_model) + nn.init.xavier_uniform_(self.fc1.weight) + nn.init.xavier_uniform_(self.fc2.weight) + self.dropout = nn.Dropout(dropout) + + def forward(self, h): + h1 = F.relu(self.fc1(h)) + h1 = self.dropout(h1) + h2 = self.fc2(h1) + return h2 + + +class TransformerSeqLayer(nn.Module): + def __init__(self, d_model, **kargs): + nn.Module.__init__(self) + self.attn = MultiHeadSeqAttention(d_model=d_model, **kargs) + self.norm1 = LayerNorm(d_model) + self.ff = FeedForwardLayer(d_model=d_model, **kargs) + self.norm2 = LayerNorm(d_model) + + def forward(self, h, h_cache, key_pe): + # h = B x M x H + # h_cache = B x L x H + h_all = torch.cat([h_cache, h], dim=1) # B x (M+L) x H + attn_out = self.attn(h, h_all, h_all, key_pe) + h = self.norm1(h + attn_out) # B x M x H + if self.ff is not None: + ff_out = self.ff(h) + out = self.norm2(h + ff_out) # B x M x H + else: + out = h + return out + + def get_cache_size(self): + return self.attn.attn.get_cache_size() + + +class TransformerSeq(nn.Module): + def __init__( + self, + vocab_size, + d_model, + n_head, + n_layer, + attn_span, + emb_dropout, + aux_loss_scaler, + adapt_span_layer, + **kargs + ): + nn.Module.__init__(self) + # token embeddings + self.in_emb = nn.Embedding(vocab_size, d_model) + nn.init.normal_(self.in_emb.weight, mean=0, std=d_model ** -0.5) + self.out_emb = nn.Linear(d_model, vocab_size) + self.aux_loss_scaler = aux_loss_scaler + if emb_dropout > 0: + self.emb_dropout = nn.Dropout(emb_dropout) + else: + self.emb_dropout = None + # position embeddings + self.key_pe = nn.Parameter(torch.randn(1, d_model // n_head, attn_span)) + + self.layers = nn.ModuleList() + self.layers.extend( + TransformerSeqLayer( + d_model=d_model, + n_head=n_head, + attn_span=attn_span, + adapt_span_layer=adapt_span_layer, + **kargs + ) + for _ in range(n_layer) + ) + + def forward(self, x, h_cache, target=None): + # x size = B x M + block_size = x.size(1) + h = self.in_emb(x) # B x M x H + if self.emb_dropout is not None: + h = self.emb_dropout(h) + + h_cache_next = [] + for l, layer in enumerate(self.layers): + cache_size = layer.attn.attn.get_cache_size() + if cache_size > block_size: + h_cache_next_l = torch.cat( + [h_cache[l][:, -cache_size + block_size :, :], h], dim=1 + ).detach() + else: + h_cache_next_l = h[:, -cache_size:, :].detach() + h_cache_next.append(h_cache_next_l) + h = layer(h, h_cache[l], self.key_pe) # B x M x H + + if self.emb_dropout is not None: + h = self.emb_dropout(h) + + out = F.log_softmax(self.out_emb(h).float(), dim=-1).type_as(h) + dummy_loss = None + + return out, h_cache_next, dummy_loss + + def get_aux_loss(self): + loss = 0.0 + for layer in self.layers: + loss += layer.attn.attn.adaptive_span.get_loss() + return self.aux_loss_scaler * loss + + def get_current_max_span(self): + max_span = 0.0 + for layer in self.layers: + max_span = max( + max_span, layer.attn.attn.adaptive_span.get_current_max_span() + ) + return max_span + + def get_current_avg_span(self): + avg_span = 0.0 + for layer in self.layers: + avg_span += 
layer.attn.attn.adaptive_span.get_current_avg_span() + return avg_span / len(self.layers) diff --git a/examples/adaptive_span/adaptive_span_model_wrapper.py b/examples/adaptive_span/adaptive_span_model_wrapper.py new file mode 100644 index 0000000000..5b147fe11f --- /dev/null +++ b/examples/adaptive_span/adaptive_span_model_wrapper.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass +from typing import Dict, List, Optional + +import torch +from fairseq.dataclass import FairseqDataclass +from fairseq.models import ( + FairseqIncrementalDecoder, + FairseqLanguageModel, + register_model, +) +from .adaptive_span_model import TransformerSeq as AdaptiveSpanTransformerModel + + +logger = logging.getLogger(__name__) + + +@dataclass +class AdaptiveSpanSmallConfig(FairseqDataclass): + # defaults come from https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8_small.sh + vocab_size: int = 50 + d_model: int = 256 + n_head: int = 4 + d_inner: int = 1024 + n_layer: int = 8 + attn_span: int = 1024 + dropout: float = 0.0 + emb_dropout: float = 0.0 + adapt_span_ramp: int = 32 + adapt_span_init: float = 0.0 + aux_loss_scaler: float = 0.000002 + adapt_span_layer: bool = False + + +@register_model("adaptive_span", dataclass=AdaptiveSpanSmallConfig) +class AdaptiveSpanTransformer(FairseqLanguageModel): + @classmethod + def build_model(cls, cfg: AdaptiveSpanSmallConfig, task): + return cls(AdaptiveSpanDecoder(cfg, task)) + + def get_aux_loss(self): + return self.decoder.get_aux_loss() + + def get_current_max_span(self): + return self.decoder.get_current_max_span() + + def get_current_avg_span(self): + return self.decoder.get_current_avg_span() + + +class AdaptiveSpanDecoder(FairseqIncrementalDecoder): + def __init__(self, cfg, task): + + super().__init__(task.target_dictionary) + + self.config = cfg + config = AdaptiveSpanSmallConfig( + vocab_size=len(task.target_dictionary), + d_model=cfg.d_model, + n_head=cfg.n_head, + d_inner=cfg.d_inner, + n_layer=cfg.n_layer, + attn_span=cfg.attn_span, + dropout=cfg.dropout, + emb_dropout=cfg.emb_dropout, + adapt_span_ramp=cfg.adapt_span_ramp, + adapt_span_init=cfg.adapt_span_init, + aux_loss_scaler=cfg.aux_loss_scaler, + adapt_span_layer=cfg.adapt_span_layer, + ) + logger.info(config) + self.model = AdaptiveSpanTransformerModel(**config.__dict__) + + self._mems = None + + def forward( + self, + src_tokens, + incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, + encoder_out=None, + ): + bsz = src_tokens.size(0) + if incremental_state is not None: # used during inference + mems = self.get_incremental_state("mems") + src_tokens = src_tokens[:, -1:] # only keep the most recent token + else: + mems = self._mems + + if mems is None: + # first time init + mems = self.init_hid_cache(bsz) + output = self.model(x=src_tokens, h_cache=mems,) + if incremental_state is not None: + self.set_incremental_state(incremental_state, "mems", output[1]) + else: + self._mems = output[1] + return (output[0],) + + def max_positions(self): + return self.config.attn_span + + def init_hid_cache(self, batch_sz): + hid = [] + for layer in self.model.layers: + param = next(self.model.parameters()) + h = torch.zeros( + batch_sz, + layer.get_cache_size(), + self.config.d_model, + dtype=param.dtype, + device=param.device, + ) + hid.append(h) + return hid + + def 
get_aux_loss(self): + return self.model.get_aux_loss() + + def get_current_max_span(self): + return self.model.get_current_max_span() + + def get_current_avg_span(self): + return self.model.get_current_avg_span() + + def reorder_incremental_state( + self, + incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]], + new_order: torch.Tensor, + ): + """Reorder incremental state. + + This will be called when the order of the input has changed from the + previous time step. A typical use case is beam search, where the input + order changes between time steps based on the selection of beams. + """ + raise NotImplementedError("This is required for generation/beam search") + # mems = self.get_incremental_state(incremental_state, "mems") + # if mems is not None: + # new_mems = [mems_i.index_select(1, new_order) for mems_i in mems] + # self.set_incremental_state(incremental_state, "mems", new_mems) diff --git a/examples/adaptive_span/truncated_bptt_lm_task.py b/examples/adaptive_span/truncated_bptt_lm_task.py new file mode 120000 index 0000000000..a92da3a298 --- /dev/null +++ b/examples/adaptive_span/truncated_bptt_lm_task.py @@ -0,0 +1 @@ +../truncated_bptt/truncated_bptt_lm_task.py \ No newline at end of file diff --git a/examples/attention_head_selection/README.md b/examples/attention_head_selection/README.md new file mode 100644 index 0000000000..2434f1fb21 --- /dev/null +++ b/examples/attention_head_selection/README.md @@ -0,0 +1,161 @@ +# Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling (Gong et al., 2021) + +[https://arxiv.org/pdf/2106.10840.pdf](https://arxiv.org/pdf/2106.10840.pdf) + +## Introduction + +We present attention head selection strategies in multilingual and multi-domain sequence modeling including text translation, speech recognition and speech translation tasks. + +Below is an example of training multilingual/multi-domain speech recognition models. + +## Data Preparation +Prepare mTEDx data as in [mTEDx example](https://github.com/fairinternal/fairseq-py/blob/0d9c5851e6fac40f9e366b3633ccd615c2901788/examples/speech_to_text/docs/mtedx_example.md) and CoVoST data as in [CoVoST example](https://github.com/fairinternal/fairseq-py/blob/0d9c5851e6fac40f9e366b3633ccd615c2901788/examples/speech_to_text/docs/covost_example.md). Similarly prepare EuroParl data. 
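The subset names used below follow a `{split}_{src}_{tgt}_{domain}` convention that the dataset creator parses to attach language and domain IDs to each sample (see `SpeechToTextDatasetCreatorWithDomain._load_samples_from_tsv` later in this patch). A minimal sketch of that mapping, with made-up ID tables and a single map shared by source and target languages for brevity:

```python
# Sketch only: the real code builds separate src/tgt language maps and a
# domain map from the configured splits; these tables are invented.
lang_map = {"es": 0, "fr": 1, "de": 2}
domain_map = {"tedx": 0, "cv": 1, "ep": 2}

def parse_split(split: str):
    # e.g. "train_es_es_tedx" -> (src_lang_id, tgt_lang_id, domain_id)
    _, src_lang, tgt_lang, domain = split.split("_")
    return lang_map[src_lang], lang_map[tgt_lang], domain_map[domain]

print(parse_split("train_es_es_tedx"))  # (0, 0, 0)
print(parse_split("dev_fr_fr_cv"))      # (1, 1, 1)
```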
+
+
+## Training a multilingual ASR model with attention head selection
+
+```bash
+data_dir=
+train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx"
+valid_subset="valid_ar_ar_tedx,valid_de_de_tedx,valid_el_el_tedx,valid_es_es_tedx,valid_fr_fr_tedx,valid_it_it_tedx,valid_pt_pt_tedx,valid_ru_ru_tedx"
+strategy=
+
+fairseq-train ${data_dir} \
+  --user-dir examples/attention_head_selection/src \
+  --train-subset "${train_subset}" \
+  --valid-subset "${valid_subset}" \
+  --config-yaml 'config_asr.yaml' \
+  --arch 'head_selection_s2t_transformer_s' \
+  --task 'speech_to_text_head_selection' \
+  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+  --lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \
+  --lr 5e-4 \
+  --clip-norm 10.0 \
+  --seed 1 \
+  --max-epoch 400 \
+  --max-tokens 32000 \
+  --ignore-prefix-size 1 \
+  --dropout 0.3 \
+  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
+  --skip-invalid-size-inputs-valid-test \
+  --encoder-attn-head-select \
+  --total-encoder-attention-heads 8 \
+  --decoder-self-attn-head-select \
+  --total-decoder-attention-heads 8 \
+  --attn-head-select-strategy ${strategy} \
+  --task-type lang
+```
+
+## Training a multi-domain ASR model with attention head selection
+
+```bash
+data_dir=
+train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep"
+valid_subset="dev_es_es_tedx,dev_fr_fr_tedx,dev_pt_pt_tedx,dev_it_it_tedx,dev_ru_ru_tedx,dev_el_el_tedx,dev_ar_ar_tedx,dev_de_de_tedx,dev_ar_ar_cv,dev_de_de_cv,dev_es_es_cv,dev_fr_fr_cv,dev_it_it_cv,dev_pt_pt_cv,dev_ru_ru_cv,dev_de_de_ep,dev_es_es_ep,dev_fr_fr_ep,dev_it_it_ep,dev_pt_pt_ep"
+strategy=
+
+fairseq-train ${data_dir} \
+  --user-dir examples/attention_head_selection/src \
+  --train-subset "${train_subset}" \
+  --valid-subset "${valid_subset}" \
+  --config-yaml 'config_asr.yaml' \
+  --arch head_selection_s2t_transformer_s \
+  --task speech_to_text_head_selection \
+  --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+  --lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \
+  --lr 5e-4 \
+  --clip-norm 10.0 \
+  --seed 1 \
+  --max-epoch 400 \
+  --max-tokens 32000 \
+  --ignore-prefix-size 1 \
+  --dropout 0.3 \
+  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
+  --skip-invalid-size-inputs-valid-test \
+  --encoder-attn-head-select \
+  --total-encoder-attention-heads 8 \
+  --decoder-self-attn-head-select \
+  --total-decoder-attention-heads 8 \
+  --attn-head-select-strategy ${strategy} \
+  --task-type domain
+```
+
+## Inference in multilingual setting
+
+```bash
+MODEL_DIR=
+data_dir=
+gen_subset=
+train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx"
+last_n=10
+CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt"
+CHECKPOINT="_avg"
+RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}"
+if [ ! -d $RESULTS ]; then
+    mkdir -p $RESULTS
+fi;
+
+python scripts/average_checkpoints.py \
+  --inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \
+  --output "${MODEL_DIR}/${CHECKPOINT_FILENAME}"
+
+fairseq-generate ${data_dir} \
+  --user-dir examples/attention_head_selection/src \
+  --arch 'head_selection_s2t_transformer_s' \
+  --task 'speech_to_text_head_selection' \
+  --train-subset ${train_subset} \
+  --gen-subset ${gen_subset} \
+  --path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \
+  --config-yaml 'config_asr.yaml' \
+  --prefix-size 1 \
+  --max-tokens 40000 --beam 5 \
+  --skip-invalid-size-inputs-valid-test \
+  --results-path ${RESULTS} \
+  --scoring wer --wer-tokenizer 13a \
+  --wer-lowercase --wer-remove-punct --remove-bpe
+```
+
+## Inference in multi-domain setting
+
+```bash
+MODEL_DIR=
+data_dir=
+gen_subset=
+train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep"
+last_n=10
+CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt"
+CHECKPOINT="_avg"
+RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}"
+if [ ! -d $RESULTS ]; then
+    mkdir -p $RESULTS
+fi;
+
+python scripts/average_checkpoints.py \
+  --inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \
+  --output "${MODEL_DIR}/${CHECKPOINT_FILENAME}"
+
+fairseq-generate ${data_dir} \
+  --user-dir examples/attention_head_selection/src \
+  --arch 'head_selection_s2t_transformer_s' \
+  --task 'speech_to_text_head_selection' \
+  --train-subset ${train_subset} \
+  --gen-subset ${gen_subset} \
+  --path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \
+  --config-yaml 'config_asr.yaml' \
+  --prefix-size 1 \
+  --max-tokens 40000 --beam 5 \
+  --skip-invalid-size-inputs-valid-test \
+  --results-path ${RESULTS} \
+  --scoring wer --wer-tokenizer 13a \
+  --wer-lowercase --wer-remove-punct --remove-bpe
+```
+
+## Citation
+```bibtex
+@article{gong2021pay,
+  title={Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling},
+  author={Gong, Hongyu and Tang, Yun and Pino, Juan and Li, Xian},
+  journal={arXiv preprint arXiv:2106.10840},
+  year={2021}
+}
+```
diff --git a/examples/latent_depth/src/loss/__init__.py b/examples/attention_head_selection/src/__init__.py
similarity index 100%
rename from examples/latent_depth/src/loss/__init__.py
rename to examples/attention_head_selection/src/__init__.py
diff --git a/examples/latent_depth/src/models/__init__.py b/examples/attention_head_selection/src/data/__init__.py
similarity index 100%
rename from examples/latent_depth/src/models/__init__.py
rename to examples/attention_head_selection/src/data/__init__.py
diff --git a/examples/attention_head_selection/src/data/speech_to_text_dataset_with_domain.py b/examples/attention_head_selection/src/data/speech_to_text_dataset_with_domain.py
new file mode 100644
index 0000000000..1f1823a7ac
--- /dev/null
+++ b/examples/attention_head_selection/src/data/speech_to_text_dataset_with_domain.py
@@ -0,0 +1,242 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
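The inference recipes above average the last N epoch checkpoints before decoding. As a rough illustration of what checkpoint averaging does, here is a simplified sketch with hypothetical file names; it is not the actual `scripts/average_checkpoints.py` implementation, and it assumes the checkpoint dict stores its parameters under a `"model"` key as fairseq checkpoints do:

```python
import torch

# Hypothetical checkpoint paths; the real script discovers the last N
# epoch checkpoints inside ${MODEL_DIR} itself.
paths = ["checkpoint38.pt", "checkpoint39.pt", "checkpoint40.pt"]

avg_state = None
for p in paths:
    state = torch.load(p, map_location="cpu")["model"]
    if avg_state is None:
        avg_state = {k: v.clone().float() for k, v in state.items()}
    else:
        for k, v in state.items():
            avg_state[k] += v.float()

# Element-wise mean of every parameter tensor across the checkpoints.
avg_state = {k: v / len(paths) for k, v in avg_state.items()}
# The averaged state dict is then written back into a checkpoint file and
# decoded with fairseq-generate as shown above.
```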
+ +import logging +from pathlib import Path +from typing import Dict, List, Optional +from dataclasses import dataclass + +import torch +from fairseq.data import ( + ConcatDataset, + Dictionary, + FairseqDataset, + ResamplingDataset +) +from fairseq.data.audio.data_cfg import S2TDataConfig +from fairseq.data.audio.speech_to_text_dataset import ( + SpeechToTextDatasetItem, + SpeechToTextDataset, + SpeechToTextDatasetCreator +) + +logger = logging.getLogger(__name__) + + +@dataclass +class SpeechToTextDatasetItemWithDomain(SpeechToTextDatasetItem): + src_lang_id: Optional[torch.Tensor] = None + tgt_lang_id: Optional[torch.Tensor] = None + domain_id: Optional[torch.Tensor] = None + + +class SpeechToTextDatasetWithDomain(SpeechToTextDataset): + + def __init__( + self, + split: str, + is_train_split: bool, + cfg: S2TDataConfig, + audio_paths: List[str], + n_frames: List[int], + src_texts: Optional[List[str]] = None, + tgt_texts: Optional[List[str]] = None, + speakers: Optional[List[str]] = None, + src_langs: Optional[List[str]] = None, + tgt_langs: Optional[List[str]] = None, + ids: Optional[List[str]] = None, + tgt_dict: Optional[Dictionary] = None, + pre_tokenizer=None, + bpe_tokenizer=None, + n_frames_per_step=1, + speaker_to_id=None, + src_lang_ids: Optional[List[int]] = None, + tgt_lang_ids: Optional[List[int]] = None, + domain_ids: Optional[List[int]] = None + ): + super().__init__( + split, is_train_split, cfg, audio_paths, n_frames, + src_texts, tgt_texts, speakers, src_langs, tgt_langs, + ids, tgt_dict, pre_tokenizer, bpe_tokenizer, + n_frames_per_step, speaker_to_id + ) + assert src_lang_ids is None or len(src_lang_ids) == self.n_samples + assert tgt_lang_ids is None or len(tgt_lang_ids) == self.n_samples + assert domain_ids is None or len(domain_ids) == self.n_samples + + self.src_lang_ids = src_lang_ids + self.tgt_lang_ids = tgt_lang_ids + self.domain_ids = domain_ids + + def __getitem__(self, index: int) -> SpeechToTextDatasetItemWithDomain: + item = super().__getitem__(index) + src_lang_id = self.src_lang_ids[index] + tgt_lang_id = self.tgt_lang_ids[index] + domain_id = self.domain_ids[index] + return SpeechToTextDatasetItemWithDomain( + index=item.index, source=item.source, + target=item.target, speaker_id=item.speaker_id, + src_lang_id=src_lang_id, + tgt_lang_id=tgt_lang_id, + domain_id=domain_id + ) + + def collater( + self, samples: List[SpeechToTextDatasetItem], return_order: bool = False + ) -> Dict: + if len(samples) == 0: + return {} + out = super().collater(samples, return_order=True) + order = out["order"] + src_lang_ids = torch.tensor([x.src_lang_id for x in samples], dtype=torch.long).index_select(0, order) + tgt_lang_ids = torch.tensor([x.tgt_lang_id for x in samples], dtype=torch.long).index_select(0, order) + domain_ids = torch.tensor([x.domain_id for x in samples], dtype=torch.long).index_select(0, order) + + out["src_lang_ids"] = src_lang_ids + out["tgt_lang_ids"] = tgt_lang_ids + out["domain_ids"] = domain_ids + if not return_order: + del out["order"] + return out + + +class SpeechToTextDatasetCreatorWithDomain(SpeechToTextDatasetCreator): + KEY_SRC_LANG_ID, KEY_TGT_LANG_ID = "src_lang_id", "tgt_lang_id" + KEY_DOMAIN_ID = "domain_id" + # default values + DEFAULT_SRC_LANG_ID, DEFAULT_TGT_LANG_ID, DEFAULT_DOMAIN_ID = 0, 0, 0 + + @classmethod + def _from_list( + cls, + split_name: str, + is_train_split, + samples: List[Dict], + cfg: S2TDataConfig, + tgt_dict, + pre_tokenizer, + bpe_tokenizer, + n_frames_per_step, + speaker_to_id + ) -> SpeechToTextDatasetWithDomain: 
+ audio_root = Path(cfg.audio_root) + ids = [s[cls.KEY_ID] for s in samples] + audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples] + n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples] + tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples] + src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples] + speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples] + src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples] + tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples] + src_lang_ids = [s.get(cls.KEY_SRC_LANG_ID, cls.DEFAULT_SRC_LANG_ID) for s in samples] + tgt_lang_ids = [s.get(cls.KEY_TGT_LANG_ID, cls.DEFAULT_TGT_LANG_ID) for s in samples] + domain_ids = [s.get(cls.KEY_DOMAIN_ID, cls.DEFAULT_DOMAIN_ID) for s in samples] + return SpeechToTextDatasetWithDomain( + split_name, + is_train_split, + cfg, + audio_paths, + n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + n_frames_per_step=n_frames_per_step, + speaker_to_id=speaker_to_id, + src_lang_ids=src_lang_ids, + tgt_lang_ids=tgt_lang_ids, + domain_ids=domain_ids + ) + + @classmethod + def _load_samples_from_tsv( + cls, + root: str, + split: str, + src_lang_map, + tgt_lang_map, + domain_map + ): + # metadata from split + _, src_lang, tgt_lang, domain = split.split("_") + src_lang_id = src_lang_map[src_lang] + tgt_lang_id = tgt_lang_map[tgt_lang] + domain_id = domain_map[domain] + + samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split) + for s in samples: + s.update({ + cls.KEY_SRC_LANG_ID: src_lang_id, + cls.KEY_TGT_LANG_ID: tgt_lang_id, + cls.KEY_DOMAIN_ID: domain_id + }) + return samples + + @classmethod + def _from_tsv( + cls, + root: str, + cfg: S2TDataConfig, + split: str, + tgt_dict, + is_train_split: bool, + pre_tokenizer, + bpe_tokenizer, + n_frames_per_step, + speaker_to_id, + src_lang_map: Dict[str, int], + tgt_lang_map: Dict[str, int], + domain_map: Dict[str, int] + ) -> SpeechToTextDatasetItemWithDomain: + samples = cls._load_samples_from_tsv( + root, split, src_lang_map, + tgt_lang_map, domain_map + ) + return cls._from_list( + split, is_train_split, samples, cfg, tgt_dict, pre_tokenizer, + bpe_tokenizer, n_frames_per_step, speaker_to_id + ) + + @classmethod + def from_tsv( + cls, + root: str, + cfg: S2TDataConfig, + splits: str, + tgt_dict, + pre_tokenizer, + bpe_tokenizer, + is_train_split: bool, + epoch: int, + seed: int, + src_lang_map: Dict[str, int], + tgt_lang_map: Dict[str, int], + domain_map: Dict[str, int], + n_frames_per_step: int = 1, + speaker_to_id=None + ) -> SpeechToTextDatasetWithDomain: + datasets = [ + cls._from_tsv( + root, cfg, split, tgt_dict, is_train_split, pre_tokenizer, bpe_tokenizer, n_frames_per_step, speaker_to_id, src_lang_map, tgt_lang_map, domain_map + ) + for split in splits.split(",") + ] + + if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0: + # temperature-based sampling + size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha) + datasets = [ + ResamplingDataset( + d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0) + ) + for r, d in zip(size_ratios, datasets) + ] + + return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0] diff --git a/examples/latent_depth/src/modules/__init__.py b/examples/attention_head_selection/src/loss/__init__.py similarity index 100% 
rename from examples/latent_depth/src/modules/__init__.py
rename to examples/attention_head_selection/src/loss/__init__.py
diff --git a/examples/attention_head_selection/src/loss/attention_head_selection.py b/examples/attention_head_selection/src/loss/attention_head_selection.py
new file mode 100644
index 0000000000..4ba33954d0
--- /dev/null
+++ b/examples/attention_head_selection/src/loss/attention_head_selection.py
@@ -0,0 +1,27 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+from torch.nn.modules.loss import _Loss
+
+
+class HeadSelectionLoss(_Loss):
+
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.kl_weight = getattr(args, "kl_weight", 0.0)
+
+    def forward(self, head_samples, sample_sizes, prior=0.5, eps=1e-7):
+        """
+        head_samples: (num_tasks, num_layers, num_heads)
+        sample_sizes: (num_tasks, )
+        """
+        kl_loss = (head_samples * (torch.log(head_samples + eps) - math.log(prior))).sum(-1).sum(-1)
+        kl_loss /= (torch.numel(head_samples) / head_samples.size(0))
+        kl_loss = self.kl_weight * torch.matmul(kl_loss, sample_sizes)
+        return kl_loss
diff --git a/examples/linformer/src/models/__init__.py b/examples/attention_head_selection/src/models/__init__.py
similarity index 100%
rename from examples/linformer/src/models/__init__.py
rename to examples/attention_head_selection/src/models/__init__.py
diff --git a/examples/attention_head_selection/src/models/head_selection_s2t_transformer.py b/examples/attention_head_selection/src/models/head_selection_s2t_transformer.py
new file mode 100644
index 0000000000..2c7ed89e89
--- /dev/null
+++ b/examples/attention_head_selection/src/models/head_selection_s2t_transformer.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
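Since `HeadSelectionLoss` above is only a few tensor operations, a standalone numerical sketch may help; the tensor sizes, `kl_weight`, and sample sizes below are made up for illustration only:

```python
import math
import torch

# Illustration of the regularizer in HeadSelectionLoss above:
# 2 tasks, 3 layers, 8 candidate heads per layer (invented sizes).
num_tasks, num_layers, num_heads = 2, 3, 8
head_samples = torch.rand(num_tasks, num_layers, num_heads)  # sampled head probabilities
sample_sizes = torch.tensor([100.0, 40.0])                   # e.g. tokens per task in the batch
prior, eps, kl_weight = 0.5, 1e-7, 0.01                      # kl_weight is hypothetical

# KL of each task's head distribution against the uniform prior,
# summed over heads and layers ...
kl = (head_samples * (torch.log(head_samples + eps) - math.log(prior))).sum(-1).sum(-1)
# ... averaged per task over layers * heads ...
kl = kl / (head_samples.numel() / head_samples.size(0))
# ... and finally weighted by how much data each task contributed.
loss = kl_weight * torch.matmul(kl, sample_sizes)
print(loss)
```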
+ +import logging +from typing import Dict, List, Optional +from pathlib import Path +import torch.nn as nn +from torch import Tensor +from fairseq import checkpoint_utils + +from fairseq.models import register_model, register_model_architecture +from fairseq.utils import safe_hasattr +from fairseq.models.speech_to_text.s2t_transformer import ( + S2TTransformerModel, + S2TTransformerEncoder, + TransformerDecoderScriptable +) +from fairseq.models.speech_to_text.s2t_transformer import base_architecture as s2t_base_architecture + +from ..modules.attn_head_selector import AttnHeadSelector +from ..modules.head_selection_transformer_layer import HeadSelectionTransformerEncoderLayer +from .head_selection_transformer import HeadSelectionTransformerDecoder + + +logger = logging.getLogger(__name__) + + +@register_model("head_selection_s2t_transformer") +class HeadSelectionS2TTransformerModel(S2TTransformerModel): + """ + Head selection implemented in S2TTransformer + """ + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + S2TTransformerModel.add_args(parser) + # encoder head selection + parser.add_argument( + "--encoder-attn-head-select", + action="store_true", + default=False, + help="encoder head selection" + ) + parser.add_argument( + "--total-encoder-attention-heads", + type=int, + help="total number of encoder attention heads" + ) + # decoder self attention selection + parser.add_argument( + "--decoder-self-attn-head-select", + action="store_true", + default=False, + help="decoder self-attention head selection" + ) + # decoder-encoder attention selection + parser.add_argument( + "--dec-enc-attn-head-select", + action="store_true", + default=False, + help="decoder-encoder attention head selection" + ) + parser.add_argument( + "--total-decoder-attention-heads", + type=int, + help="total number of decoder attention heads" + ) + # selection strategy + parser.add_argument( + "--attn-head-select-strategy", + type=str, + help="attention head selection strategy, subset or group" + ) + + @classmethod + def build_encoder(cls, args): + if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select: + encoder = HeadSelectionS2TTransformerEncoder(args) + else: + encoder = S2TTransformerEncoder(args) + pretraining_path = getattr(args, "load_pretrained_encoder_from", None) + if pretraining_path is not None: + if not Path(pretraining_path).exists(): + logger.warning( + f"skipped pretraining because {pretraining_path} does not exist" + ) + else: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=pretraining_path + ) + logger.info(f"loaded pretrained encoder from: {pretraining_path}") + return encoder + + @classmethod + def build_decoder(cls, args, task, embed_tokens): + if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select): + return HeadSelectionTransformerDecoderScriptable(args, task.target_dictionary, embed_tokens) + else: + return TransformerDecoderScriptable(args, task.target_dictionary, embed_tokens) + + +class HeadSelectionS2TTransformerEncoder(S2TTransformerEncoder): + + def __init__(self, args): + super().__init__(args) + self.attn_head_selector = AttnHeadSelector( + args.encoder_tasks, + args.encoder_layers, + args.total_encoder_attention_heads, + args.encoder_attention_heads, + args.attn_head_select_strategy, + ) + self.task_ids = None + 
self.transformer_layers = nn.ModuleList([ + HeadSelectionTransformerEncoderLayer(args, layer_idx, attn_head_selector=self.attn_head_selector) for layer_idx in range(args.encoder_layers) + ]) + + def set_task_ids(self, task_ids): + self.task_ids = task_ids + + def _forward(self, src_tokens, src_lengths, return_all_hiddens=False): + self.attn_head_selector.head_select(self.task_ids) + return super()._forward(src_tokens, src_lengths, return_all_hiddens) + + +class HeadSelectionTransformerDecoderScriptable(HeadSelectionTransformerDecoder): + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + # call scriptable method from parent class + x, _ = self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + return x, None + + +@register_model_architecture(model_name="head_selection_s2t_transformer", arch_name="head_selection_s2t_transformer") +def base_architecture(args): + s2t_base_architecture(args) + args.encoder_attn_head_select = getattr(args, "encoder_attn_head_select", False) + args.decoder_self_attn_head_select = getattr(args, "decoder_self_attn_head_select", False) + args.dec_enc_attn_head_select = getattr(args, "dec_enc_attn_head_select", False) + args.total_encoder_attention_heads = getattr(args, "total_encoder_attention_heads", 8) + args.total_decoder_attention_heads = getattr(args, "total_decoder_attention_heads", 8) + args.attn_head_select_strategy = getattr(args, "attn_head_select_strategy", "group") + + +@register_model_architecture("head_selection_s2t_transformer", "head_selection_s2t_transformer_s") +def head_selection_s2t_transformer_s(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + base_architecture(args) diff --git a/examples/attention_head_selection/src/models/head_selection_transformer.py b/examples/attention_head_selection/src/models/head_selection_transformer.py new file mode 100644 index 0000000000..b9d595699d --- /dev/null +++ b/examples/attention_head_selection/src/models/head_selection_transformer.py @@ -0,0 +1,215 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
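The S2T encoder above shares a single `AttnHeadSelector` across all layers, and `_forward` re-samples the head assignment once per forward pass after `set_task_ids` has been called for the batch. A toy sketch of that control flow is shown below; the names, shapes, and the plain sigmoid stand in for the real fairseq classes and the Gumbel-based sampling, so treat it as an illustration only:

```python
import torch
import torch.nn as nn

class ToySelector(nn.Module):
    """Stand-in for AttnHeadSelector: one table of logits shared by all layers."""
    def __init__(self, num_tasks, num_layers, total_heads):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(num_tasks, num_layers, total_heads))

    def head_select(self, task_ids):
        # The real selector draws a Gumbel-sigmoid sample and keeps the top-k
        # heads; a plain sigmoid is enough to show the data flow here.
        self.batch_weights = torch.sigmoid(self.logits)[task_ids]  # (B, layers, heads)

    def forward(self, layer_idx):
        # Each layer asks for its own slice of the per-batch selection.
        return self.batch_weights[:, layer_idx, :]

selector = ToySelector(num_tasks=4, num_layers=2, total_heads=8)
task_ids = torch.tensor([0, 2, 2])   # e.g. language id of each utterance in the batch
selector.head_select(task_ids)       # done once per forward, as in _forward() above
per_layer = [selector(i) for i in range(2)]
print([w.shape for w in per_layer])  # [torch.Size([3, 8]), torch.Size([3, 8])]
```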
+ +from typing import Any, List, Dict, Optional +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq.utils import safe_hasattr +from fairseq.models.transformer import ( + TransformerModel, + TransformerEncoder, + TransformerDecoder +) + +from ..modules.attn_head_selector import AttnHeadSelector +from ..modules.head_selection_transformer_layer import ( + HeadSelectionTransformerEncoderLayer, + HeadSelectionTransformerDecoderLayer +) + + +class HeadSelectionTransformerModel(TransformerModel): + def __init__(self, args, encoder, decoder): + super().__init__(args, encoder, decoder) + + @staticmethod + def add_args(parser): + TransformerModel.add_args(parser) + # encoder head selection + parser.add_argument( + "--encoder-attn-head-select", + action="store_true", + default=False, + help="encoder head selection" + ) + parser.add_argument( + "--total-encoder-attention-heads", + type=int, + help="total number of encoder attention heads" + ) + # decoder self attention + parser.add_argument( + "--decoder-self-attn-head-select", + action="store_true", + default=False, + help="decoder self-attention head selection" + ) + # decoder-encoder attention + parser.add_argument( + "--dec-enc-attn-head-select", + action="store_true", + default=False, + help="decoder-encoder attention head selection" + ) + parser.add_argument( + "--total-decoder-attention-heads", + type=int, + help="total number of decoder attention heads" + ) + # selection strategy + parser.add_argument( + "--attn-head-select-strategy", + type=str, + help="attention head selection strategy, subset or group" + ) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select: + return HeadSelectionTransformerEncoder( + args, src_dict, embed_tokens + ) + else: + return TransformerEncoder(args, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select): + return HeadSelectionTransformerDecoder( + args, tgt_dict, embed_tokens + ) + else: + return TransformerDecoder(args, tgt_dict, embed_tokens) + + +class HeadSelectionTransformerEncoder(TransformerEncoder): + + def __init__(self, args, dictionary, embed_tokens): + self.num_tasks = args.encoder_tasks + self.num_layers = args.encoder_layers + self.total_num_heads = args.total_encoder_attention_heads + self.num_heads = args.encoder_attention_heads + self.select_strategy = args.attn_head_select_strategy + + super().__init__(args, dictionary, embed_tokens) + self.attn_head_selector = AttnHeadSelector( + self.num_tasks, + self.num_layers, + self.total_num_heads, + self.num_heads, + self.select_strategy + ) + self.task_ids = None + self.layers = nn.ModuleList( + [self.build_encoder_layer(args, i) for i in range(args.encoder_layers)] + ) + + def set_task_ids(self, task_ids): + self.task_ids = task_ids + + def build_encoder_layer(self, args, layer_idx=None): + return HeadSelectionTransformerEncoderLayer( + args, + layer_idx, + attn_head_selector=self.attn_head_selector + ) + + def forward( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + ): + self.attn_head_selector.head_select(self.task_ids) + return super().forward(src_tokens, src_lengths, return_all_hiddens, 
token_embeddings) + + +class HeadSelectionTransformerDecoder(TransformerDecoder): + + def __init__( + self, + args, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + self.num_tasks = args.decoder_tasks + self.num_layers = args.decoder_layers + self.total_num_heads = args.total_decoder_attention_heads + self.num_heads = args.decoder_attention_heads + self.select_strategy = args.attn_head_select_strategy + super().__init__( + args, dictionary, embed_tokens, + no_encoder_attn=no_encoder_attn, + output_projection=output_projection + ) + self.self_attn_head_selector = None + self.enc_attn_head_selector = None + if safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select: + self.self_attn_head_selector = AttnHeadSelector( + self.num_tasks, + self.num_layers, + self.total_num_heads, + self.num_heads, + self.select_strategy + ) + if safe_hasattr(args, "dec_enc_attn_head_select") and args.dec_enc_attn_head_select: + self.enc_attn_head_selector = AttnHeadSelector( + self.num_tasks, + self.num_layers, + self.total_num_heads, + self.num_heads, + self.select_strategy + ) + self.task_ids = None + self.layers = nn.ModuleList( + [ + self.build_head_selection_decoder_layer(args, no_encoder_attn, idx) for idx in range(args.decoder_layers) + ] + ) + + def set_task_ids(self, task_ids): + self.task_ids = task_ids + + def build_head_selection_decoder_layer(self, args, no_encoder_attn=False, layer_idx=None): + return HeadSelectionTransformerDecoderLayer( + args, + layer_idx, + self.self_attn_head_selector, + self.enc_attn_head_selector, + no_encoder_attn=no_encoder_attn + ) + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + if self.self_attn_head_selector is not None: + self.self_attn_head_selector.head_select(self.task_ids) + if self.enc_attn_head_selector is not None: + self.enc_attn_head_selector.head_select(self.task_ids) + return super().forward( + prev_output_tokens=prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + features_only=features_only, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + src_lengths=src_lengths, + return_all_hiddens=return_all_hiddens + ) diff --git a/examples/linformer/src/modules/__init__.py b/examples/attention_head_selection/src/modules/__init__.py similarity index 100% rename from examples/linformer/src/modules/__init__.py rename to examples/attention_head_selection/src/modules/__init__.py diff --git a/examples/attention_head_selection/src/modules/attn_head_selector.py b/examples/attention_head_selection/src/modules/attn_head_selector.py new file mode 100644 index 0000000000..346fc62308 --- /dev/null +++ b/examples/attention_head_selection/src/modules/attn_head_selector.py @@ -0,0 +1,81 @@ +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+import torch
+import torch.nn as nn
+import math
+
+
+class AttnHeadSelector(nn.Module):
+    """
+    Latent variable modeling of attention head selection
+    """
+    def __init__(
+        self, num_tasks, num_layers,
+        total_num_heads, num_heads,
+        select_strategy="group",
+        head_select_temp=5.0
+    ):
+        super(AttnHeadSelector, self).__init__()
+        self.num_tasks = num_tasks
+        self.num_layers = num_layers
+        self.total_num_heads = total_num_heads
+        self.num_heads = num_heads
+        self.select_strategy = select_strategy
+        self.temp = head_select_temp
+
+        self.head_logits = torch.nn.Parameter(
+            torch.Tensor(self.num_tasks, self.num_layers, total_num_heads),
+            requires_grad=True
+        )
+        nn.init.uniform_(
+            self.head_logits, a=math.log(0.01),
+            b=math.log(1.0)
+        )
+
+    def gumbel_sample(self, logits, tau=1.0):
+        # Gumbel-sigmoid (binary Concrete) relaxation: the difference of two
+        # Gumbel samples passed through a sigmoid, with temperature tau
+        gumbels1 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
+        gumbels2 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
+        gumbels1 = (logits + gumbels1 - gumbels2) / tau
+        y_soft = gumbels1.sigmoid()
+        return y_soft
+
+    def subset_select(self, y_soft, topk, dim=-1):
+        # pick the top-k heads over all heads; the returned weights are
+        # straight-through (1.0 in the forward pass, gradient of the soft values)
+        top_values, top_inds = torch.topk(y_soft, k=topk, dim=dim)
+        top_ret = 1.0 - top_values.detach() + top_values
+        return top_inds.detach(), top_ret
+
+    def group_select(self, y_soft, topk, dim=-1):
+        # partition the total heads into `topk` groups (head index modulo topk)
+        # and pick the highest-scoring head from each group
+        # top_values: (num_tasks, num_layers, topk)
+        top_values, top_inds = torch.max(
+            y_soft.view(self.num_tasks, self.num_layers, -1, topk), dim=2
+        )
+        top_inds = top_inds * topk + torch.arange(topk, device=top_inds.device).unsqueeze(0).unsqueeze(1)
+        top_ret = 1.0 - top_values.detach() + top_values
+        return top_inds.detach(), top_ret
+
+    def head_select(self, task_ids=None):
+        # gumbel_sample
+        self.head_samples = self.gumbel_sample(self.head_logits, tau=self.temp)
+        # head select
+        if self.select_strategy == "subset":
+            self.subset_heads, self.subset_weights = self.subset_select(
+                self.head_samples,
+                topk=self.num_heads,
+            )
+        elif self.select_strategy == "group":
+            self.subset_heads, self.subset_weights = self.group_select(
+                self.head_samples,
+                topk=self.num_heads,
+            )
+        else:
+            raise ValueError("{} is not supported".format(self.select_strategy))
+
+        self.batch_subset = self.subset_heads[task_ids, :, :]
+        self.batch_weights = self.subset_weights[task_ids, :, :]
+
+    def forward(self, layer_idx):
+        assert layer_idx is not None
+        batch_subset = self.batch_subset[:, layer_idx, :]
+        batch_weights = self.batch_weights[:, layer_idx, :]
+        return batch_subset, batch_weights
diff --git a/examples/attention_head_selection/src/modules/head_selection_transformer_layer.py b/examples/attention_head_selection/src/modules/head_selection_transformer_layer.py
new file mode 100644
index 0000000000..c792143503
--- /dev/null
+++ b/examples/attention_head_selection/src/modules/head_selection_transformer_layer.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+ +from fairseq.utils import safe_getattr +from fairseq.modules import TransformerEncoderLayer, TransformerDecoderLayer +from ..modules.multihead_attention_selection import MultiheadAttentionSelection + + +class HeadSelectionTransformerEncoderLayer(TransformerEncoderLayer): + + def __init__(self, args, layer_idx, attn_head_selector=None): + super().__init__(args) + self.layer_idx = layer_idx + self.self_attn = self.build_self_attention_selection( + self.embed_dim, args, attn_head_selector + ) + + def build_self_attention_selection(self, embed_dim, args, attn_head_selector=None): + return MultiheadAttentionSelection( + embed_dim, + args.total_encoder_attention_heads, + args.encoder_attention_heads, + dropout=args.attention_dropout, + self_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + layer_idx=self.layer_idx, + attn_head_selector=attn_head_selector + ) + + +class HeadSelectionTransformerDecoderLayer(TransformerDecoderLayer): + + def __init__( + self, + args, + layer_idx, + self_attn_head_selector=None, + enc_attn_head_selector=None, + no_encoder_attn=False, + add_bias_kv=False, + add_zero_attn=False, + ): + self.layer_idx = layer_idx + super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn) + if self_attn_head_selector is not None: + self.self_attn = self.build_self_attention_selection( + self.embed_dim, args, + self_attn_head_selector=self_attn_head_selector, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn + ) + if enc_attn_head_selector is not None: + self.encoder_attn = self.build_encoder_attention_selection( + self.embed_dim, args, + enc_attn_head_selector=enc_attn_head_selector + ) + + def build_self_attention_selection( + self, embed_dim, args, self_attn_head_selector=None, + add_bias_kv=False, add_zero_attn=False + ): + return MultiheadAttentionSelection( + embed_dim, + args.total_decoder_attention_heads, + args.decoder_attention_heads, + dropout=args.attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=not safe_getattr(args, "cross_self_attention"), + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + layer_idx=self.layer_idx, + attn_head_selector=self_attn_head_selector, + ) + + def build_encoder_attention_selection(self, embed_dim, args, enc_attn_head_selector=None): + return MultiheadAttentionSelection( + embed_dim, + args.total_decoder_attention_heads, + args.decoder_attention_heads, + kdim=args.encoder_embed_dim, + vdim=args.encoder_embed_dim, + dropout=args.attention_dropout, + encoder_decoder_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + layer_idx=self.layer_idx, + attn_head_selector=enc_attn_head_selector, + ) diff --git a/examples/attention_head_selection/src/modules/multihead_attention_selection.py b/examples/attention_head_selection/src/modules/multihead_attention_selection.py new file mode 100644 index 0000000000..566ad822ac --- /dev/null +++ b/examples/attention_head_selection/src/modules/multihead_attention_selection.py @@ -0,0 +1,355 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
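+#
+# Note: this attention module widens the Q/K/V projections to `total_num_heads`
+# heads and computes attention over all of them; the `num_heads` heads returned
+# by the shared AttnHeadSelector for this layer are then gathered and scaled by
+# the selector's straight-through weights before the output projection.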
+ +from typing import Dict, Optional, Tuple +import torch +from fairseq import utils +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor, nn +from torch.nn import Parameter + +from fairseq.modules.multihead_attention import MultiheadAttention +from ..modules.multihead_functional import multi_head_attention_forward + + +class MultiheadAttentionSelection(MultiheadAttention): + + def __init__( + self, + embed_dim, + total_num_heads, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + q_noise=0.0, + qn_block_size=8, + layer_idx=0, + attn_head_selector=None + ): + super().__init__( + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=self_attention, + encoder_decoder_attention=encoder_decoder_attention, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + self.layer_idx = layer_idx + self.attn_head_selector = attn_head_selector + self.total_num_heads = total_num_heads + self.total_embed_dim = self.head_dim * total_num_heads + self.k_proj = quant_noise( + nn.Linear(self.kdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size + ) + self.v_proj = quant_noise( + nn.Linear(self.vdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size + ) + self.q_proj = quant_noise( + nn.Linear(embed_dim, self.total_embed_dim, bias=bias), q_noise, qn_block_size + ) + if add_bias_kv: + self.bias_k = Parameter(torch.Tensor(1, 1, self.total_embed_dim)) + self.bias_v = Parameter(torch.Tensor(1, 1, self.total_embed_dim)) + else: + self.bias_k = self.bias_v = None + self.reset_parameters() + + def forward( + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + need_weights: bool = True, + static_kv: bool = False, + attn_mask: Optional[Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + # subset_heads: Optional[Tensor] = None, + # subset_weights: Optional[Tensor] = None + ) -> Tuple[Tensor, Optional[Tensor]]: + if need_head_weights: + need_weights = True + + is_tpu = query.device.type == "xla" + + subset_heads, subset_weights = self.attn_head_selector(self.layer_idx) + + tgt_len, bsz, embed_dim = query.size() + src_len = tgt_len + assert list(query.size()) == [tgt_len, bsz, self.embed_dim] + if key is not None: + src_len, key_bsz, _ = key.size() + if not torch.jit.is_scripting(): + assert key_bsz == bsz + assert value is not None + assert src_len, bsz == value.shape[:2] + + if ( + not self.onnx_trace + and not is_tpu # don't use PyTorch version on TPUs + and incremental_state is None + and not static_kv + # A workaround for quantization to work. Otherwise JIT compilation + # treats bias in linear module as method. 
+ and not torch.jit.is_scripting() + ): + assert key is not None and value is not None + return multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.total_num_heads, + self.num_heads, + torch.empty([0]), + torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout_module.p, + self.out_proj.weight, + self.out_proj.bias, + self.training or self.dropout_module.apply_during_inference, + key_padding_mask, + need_weights, + attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + subset_heads=subset_heads, + subset_weights=subset_weights + ) + + if incremental_state is not None: + saved_state = self._get_input_buffer(incremental_state) + if saved_state is not None and "prev_key" in saved_state: + # previous time steps are cached - no need to recompute + # key and value if they are static + if static_kv: + assert self.encoder_decoder_attention and not self.self_attention + key = value = None + else: + saved_state = None + + if self.self_attention: + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + elif self.encoder_decoder_attention: + # encoder-decoder attention + q = self.q_proj(query) + if key is None: + assert value is None + k = v = None + else: + k = self.k_proj(key) + v = self.v_proj(key) + + else: + assert key is not None and value is not None + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + q *= self.scaling + + if self.bias_k is not None: + assert self.bias_v is not None + k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + key_padding_mask.new_zeros(key_padding_mask.size(0), 1), + ], + dim=1, + ) + + q = ( + q.contiguous() + .view(tgt_len, bsz * self.total_num_heads, self.head_dim) + .transpose(0, 1) + ) + if k is not None: + k = ( + k.contiguous() + .view(-1, bsz * self.total_num_heads, self.head_dim) + .transpose(0, 1) + ) + if v is not None: + v = ( + v.contiguous() + .view(-1, bsz * self.total_num_heads, self.head_dim) + .transpose(0, 1) + ) + + if saved_state is not None: + # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) + if "prev_key" in saved_state: + _prev_key = saved_state["prev_key"] + assert _prev_key is not None + prev_key = _prev_key.view(bsz * self.total_num_heads, -1, self.head_dim) + if static_kv: + k = prev_key + else: + assert k is not None + k = torch.cat([prev_key, k], dim=1) + src_len = k.size(1) + if "prev_value" in saved_state: + _prev_value = saved_state["prev_value"] + assert _prev_value is not None + prev_value = _prev_value.view(bsz * self.total_num_heads, -1, self.head_dim) + if static_kv: + v = prev_value + else: + assert v is not None + v = torch.cat([prev_value, v], dim=1) + prev_key_padding_mask: Optional[Tensor] = None + if "prev_key_padding_mask" in saved_state: + prev_key_padding_mask = saved_state["prev_key_padding_mask"] + assert k is not None and v is not None + key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( + key_padding_mask=key_padding_mask, + prev_key_padding_mask=prev_key_padding_mask, + batch_size=bsz, + src_len=k.size(1), + static_kv=static_kv, + ) + + saved_state["prev_key"] = k.view(bsz, 
self.total_num_heads, -1, self.head_dim) + saved_state["prev_value"] = v.view(bsz, self.total_num_heads, -1, self.head_dim) + saved_state["prev_key_padding_mask"] = key_padding_mask + # In this branch incremental_state is never None + assert incremental_state is not None + incremental_state = self._set_input_buffer(incremental_state, saved_state) + assert k is not None + assert k.size(1) == src_len + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if self.add_zero_attn: + assert v is not None + src_len += 1 + k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) + v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + torch.zeros(key_padding_mask.size(0), 1).type_as( + key_padding_mask + ), + ], + dim=1, + ) + + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + assert list(attn_weights.size()) == [bsz * self.total_num_heads, tgt_len, src_len] + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0) + if self.onnx_trace: + attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) + attn_weights += attn_mask + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.total_num_heads, tgt_len, src_len) + if not is_tpu: + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + else: + attn_weights = attn_weights.transpose(0, 2) + attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) + attn_weights = attn_weights.transpose(0, 2) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v + + attn_weights_float = utils.softmax( + attn_weights, dim=-1, onnx_trace=self.onnx_trace + ) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = self.dropout_module(attn_weights) + + assert v is not None + + # evaluation + if subset_heads is not None and subset_heads.numel() == 1: + subset_heads = subset_heads.repeat(bsz) + subset_weights = subset_weights.repeat(bsz) + + if subset_heads is None: + attn = torch.bmm(attn_probs, v) + else: + # training with head selection + mixed_attn = torch.bmm(attn_probs, v).contiguous().view(bsz, self.total_num_heads, tgt_len, self.head_dim) + attn = torch.stack( + [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1 + ) + attn = attn * subset_weights.unsqueeze(2).unsqueeze(3) + attn = attn.contiguous().view(bsz * self.num_heads, tgt_len, self.head_dim) + + assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] + if self.onnx_trace and attn.size(1) == 1: + # when ONNX tracing a single decoder step (sequence length == 1) + # the transpose is a no-op copy before view, thus unnecessary + attn = attn.contiguous().view(tgt_len, bsz, embed_dim) + else: + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + attn_weights: Optional[Tensor] = 
None + if need_weights: + if subset_heads is None: + attn_weights = attn_weights_float.view( + bsz, self.num_heads, tgt_len, src_len + ).transpose(1, 0) + else: + mixed_attn_weights = attn_weights_float.view( + bsz, self.total_num_heads, tgt_len, src_len + ) + attn_weights = torch.stack( + [mixed_attn_weights[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1 + ).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + + return attn, attn_weights diff --git a/examples/attention_head_selection/src/modules/multihead_functional.py b/examples/attention_head_selection/src/modules/multihead_functional.py new file mode 100644 index 0000000000..d5edc777e3 --- /dev/null +++ b/examples/attention_head_selection/src/modules/multihead_functional.py @@ -0,0 +1,278 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple +import torch +from torch import Tensor +from torch.nn.functional import ( + linear, softmax, dropout, pad, + has_torch_function, + handle_torch_function, + _in_projection_packed, +) +import math +import warnings + + +def _scaled_dot_product_attention( + q: Tensor, + k: Tensor, + v: Tensor, + attn_mask: Optional[Tensor] = None, + dropout_p: float = 0.0, + bsz: int = 1, + subset_heads: Optional[Tensor] = None, + subset_weights: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor]: + B, Nt, E = q.shape + q = q / math.sqrt(E) + # B: bsz * total_num_heads + # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns) + attn = torch.bmm(q, k.transpose(-2, -1)) + if attn_mask is not None: + attn += attn_mask + attn = softmax(attn, dim=-1) + if dropout_p > 0.0: + attn = dropout(attn, p=dropout_p) + if subset_heads is None: + # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E) + output = torch.bmm(attn, v) + else: + mixed_output = torch.bmm(attn, v).contiguous().view(bsz, -1, Nt, E) + output = torch.stack( + [mixed_output[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], + dim=1 + ) + output = output * subset_weights.unsqueeze(2).unsqueeze(3) + output = output.contiguous().view(-1, Nt, E) + if subset_heads is not None: + _, Nt, Ns = attn.size() + mixed_attn = attn.view(bsz, -1, Nt, Ns) + attn = torch.stack( + [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1 + ) + return output, attn + + +def _in_projection( + q: Tensor, + k: Tensor, + v: Tensor, + w_q: Tensor, + w_k: Tensor, + w_v: Tensor, + b_q: Optional[Tensor] = None, + b_k: Optional[Tensor] = None, + b_v: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor, Tensor]: + return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v) + + +def multi_head_attention_forward( + query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + total_num_heads: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Optional[Tensor], + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Optional[Tensor], + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = 
None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + subset_heads: Optional[Tensor] = None, + subset_weights: Optional[Tensor] = None, +): + tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias) + if has_torch_function(tens_ops): + return handle_torch_function( + multi_head_attention_forward, + tens_ops, + query, + key, + value, + embed_dim_to_check, + total_num_heads, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout_p, + out_proj_weight, + out_proj_bias, + training=training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=use_separate_proj_weight, + q_proj_weight=q_proj_weight, + k_proj_weight=k_proj_weight, + v_proj_weight=v_proj_weight, + static_k=static_k, + static_v=static_v, + subset_heads=subset_heads, + subset_weights=subset_weights + ) + + # set up shape vars + tgt_len, bsz, embed_dim = query.shape + src_len, _, _ = key.shape + assert embed_dim == embed_dim_to_check, \ + f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}" + if isinstance(embed_dim, torch.Tensor): + # embed_dim can be a tensor when JIT tracing + head_dim = embed_dim.div(num_heads, rounding_mode='trunc') + else: + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}" + if use_separate_proj_weight: + # allow MHA to have different embedding dimensions when separate projection weights are used + assert key.shape[:2] == value.shape[:2], \ + f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}" + else: + assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}" + + # + # compute in-projection + # + if not use_separate_proj_weight: + q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias) + else: + assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None" + assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None" + assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None" + if in_proj_bias is None: + b_q = b_k = b_v = None + else: + b_q, b_k, b_v = in_proj_bias.chunk(3) + q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v) + + # prep attention mask + if attn_mask is not None: + if attn_mask.dtype == torch.uint8: + warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. 
Use bool tensor instead.") + attn_mask = attn_mask.to(torch.bool) + else: + assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \ + f"Only float, byte, and bool types are supported for attn_mask, not {attn_mask.dtype}" + # ensure attn_mask's dim is 3 + if attn_mask.dim() == 2: + correct_2d_size = (tgt_len, src_len) + if attn_mask.shape != correct_2d_size: + raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.") + attn_mask = attn_mask.unsqueeze(0) + elif attn_mask.dim() == 3: + correct_3d_size = (bsz * total_num_heads, tgt_len, src_len) + if attn_mask.shape != correct_3d_size: + raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.") + else: + raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported") + + # prep key padding mask + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.") + key_padding_mask = key_padding_mask.to(torch.bool) + + # add bias along batch dimension (currently second) + if bias_k is not None and bias_v is not None: + assert static_k is None, "bias cannot be added to static key." + assert static_v is None, "bias cannot be added to static value." + k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + else: + assert bias_k is None + assert bias_v is None + + # + # reshape q, k, v for multihead attention and make em batch first + # + q = q.contiguous().view(tgt_len, bsz * total_num_heads, head_dim).transpose(0, 1) + if static_k is None: + k = k.contiguous().view(k.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1) + else: + # TODO finish disentangling control flow so we don't do in-projections when statics are passed + assert static_k.size(0) == bsz * total_num_heads, \ + f"expecting static_k.size(0) of {bsz * total_num_heads}, but got {static_k.size(0)}" + assert static_k.size(2) == head_dim, \ + f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}" + k = static_k + if static_v is None: + v = v.contiguous().view(v.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1) + else: + # TODO finish disentangling control flow so we don't do in-projections when statics are passed + assert static_v.size(0) == bsz * total_num_heads, \ + f"expecting static_v.size(0) of {bsz * total_num_heads}, but got {static_v.size(0)}" + assert static_v.size(2) == head_dim, \ + f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}" + v = static_v + + # add zero attention along batch dimension (now first) + if add_zero_attn: + zero_attn_shape = (bsz * total_num_heads, 1, head_dim) + k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1) + v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + + # update source sequence length after adjustments + src_len = k.size(1) + + # merge key padding and attention masks + if key_padding_mask is not None: + assert key_padding_mask.shape == (bsz, src_len), \ + f"expecting key_padding_mask shape of {(bsz, src_len)}, but got 
{key_padding_mask.shape}" + key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \ + expand(-1, total_num_heads, -1, -1).reshape(bsz * total_num_heads, 1, src_len) + if attn_mask is None: + attn_mask = key_padding_mask + elif attn_mask.dtype == torch.bool: + attn_mask = attn_mask.logical_or(key_padding_mask) + else: + attn_mask = attn_mask.masked_fill(key_padding_mask, float("-inf")) + + # convert mask to float + if attn_mask is not None and attn_mask.dtype == torch.bool: + new_attn_mask = torch.zeros_like(attn_mask, dtype=torch.float) + new_attn_mask.masked_fill_(attn_mask, float("-inf")) + attn_mask = new_attn_mask + + # adjust dropout probability + if not training: + dropout_p = 0.0 + + # + # (deep breath) calculate attention and out projection + # + attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, bsz, subset_heads, subset_weights) + attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + return attn_output, attn_output_weights.sum(dim=1) / num_heads + else: + return attn_output, None diff --git a/examples/attention_head_selection/src/speech_to_text_head_selection.py b/examples/attention_head_selection/src/speech_to_text_head_selection.py new file mode 100644 index 0000000000..6e0ce11d63 --- /dev/null +++ b/examples/attention_head_selection/src/speech_to_text_head_selection.py @@ -0,0 +1,180 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
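+#
+# Note: this task couples SpeechToTextTask with attention head selection.
+# Train subsets are expected to be named train_<src>_<tgt>_<domain> so that
+# language/domain ids can be mapped to head-selection task ids, and a KL term
+# (weighted by --kl-weight) pulls the sampled head distributions toward the
+# prior num_heads / total_num_heads.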
+ +import torch +from fairseq.optim.amp_optimizer import AMPOptimizer +from fairseq.tasks import register_task +from fairseq.tasks.speech_to_text import SpeechToTextTask + +from .data.speech_to_text_dataset_with_domain import SpeechToTextDatasetCreatorWithDomain +from .loss.attention_head_selection import HeadSelectionLoss + + +@register_task("speech_to_text_head_selection") +class SpeechToTextHeadSelectionTask(SpeechToTextTask): + + @classmethod + def add_args(cls, parser): + SpeechToTextTask.add_args(parser) + parser.add_argument( + "--task-type", + type=str, + default="lang", + help="task type for head selection, lang or domain" + ) + parser.add_argument( + "--kl-weight", + type=float, + default=0.0, + help="the weight of KL loss" + ) + + def __init__(self, args, tgt_dict): + super().__init__(args, tgt_dict) + self.task_type = args.task_type + assert self.task_type in ["lang", "domain"], "invalid task_type: {}, should be either lang or domain".format(self.task_type) + self.map_task_to_id(args.train_subset) + self.encoder_head_prior = float(args.decoder_attention_heads) / args.total_decoder_attention_heads + self.decoder_head_prior = float(args.encoder_attention_heads) / args.total_encoder_attention_heads + self.kl_loss = HeadSelectionLoss(args) + + def map_task_to_id(self, train_subset): + src_lang_set, tgt_lang_set, domain_set = set(), set(), set() + for split in train_subset.split(","): + seq = split.split("_") + assert len(seq) == 4, "subset {} should be in the format of train_src_tgt_domain".format(split) + _, src_lang, tgt_lang, domain = seq + src_lang_set.add(src_lang) + tgt_lang_set.add(tgt_lang) + domain_set.add(domain) + src_langs = sorted(src_lang_set) + tgt_langs = sorted(tgt_lang_set) + domains = sorted(domain_set) + self.src_lang_map = {src_lang: i for (i, src_lang) in enumerate(src_langs)} + self.tgt_lang_map = {tgt_lang: i for (i, tgt_lang) in enumerate(tgt_langs)} + self.domain_map = {domain: i for (i, domain) in enumerate(domains)} + if self.task_type == "lang": + self.encoder_tasks = len(self.src_lang_map) + self.decoder_tasks = len(self.tgt_lang_map) + elif self.task_type == "domain": + self.encoder_tasks = len(self.domain_map) + self.decoder_tasks = len(self.domain_map) + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + is_train_split = split.startswith("train") + pre_tokenizer = self.build_tokenizer(self.args) + bpe_tokenizer = self.build_bpe(self.args) + self.datasets[split] = SpeechToTextDatasetCreatorWithDomain.from_tsv( + self.args.data, + self.data_cfg, + split, + self.tgt_dict, + pre_tokenizer, + bpe_tokenizer, + is_train_split=is_train_split, + epoch=epoch, + seed=self.args.seed, + src_lang_map=self.src_lang_map, + tgt_lang_map=self.tgt_lang_map, + domain_map=self.domain_map, + speaker_to_id=self.speaker_to_id + ) + + def build_model(self, args): + args.encoder_tasks = self.encoder_tasks + args.decoder_tasks = self.decoder_tasks + return super(SpeechToTextHeadSelectionTask, self).build_model(args) + + def get_sample_sizes(self, sample, task_ids, num_tasks): + """ + task_ids: (bsz,) + get sample sizes for each task + """ + bsz = task_ids.size(0) + mat = torch.zeros((num_tasks, bsz), device=task_ids.device) + mat[task_ids, torch.arange(bsz)] = 1.0 + ntokens = torch.sum(sample['target'] != 1, dim=-1) + sample_sizes = torch.matmul(mat, ntokens.float()) + return sample_sizes + + def train_step( + self, sample, model, criterion, optimizer, update_num, ignore_grad=False + ): + model.train() + model.set_num_updates(update_num) + # task ids + if 
self.task_type == "lang":
+            encoder_task_ids = sample["src_lang_ids"]
+            decoder_task_ids = sample["tgt_lang_ids"]
+        elif self.task_type == "domain":
+            encoder_task_ids = sample["domain_ids"]
+            decoder_task_ids = sample["domain_ids"]
+        model.encoder.set_task_ids(encoder_task_ids)
+        model.decoder.set_task_ids(decoder_task_ids)
+
+        with torch.autograd.profiler.record_function("forward"):
+            with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))):
+                loss, sample_size, logging_output = criterion(model, sample)
+                # KL loss
+                if self.args.encoder_attn_head_select:
+                    sample_sizes = self.get_sample_sizes(sample, encoder_task_ids, self.encoder_tasks)
+                    loss += self.kl_loss(
+                        model.encoder.attn_head_selector.head_samples,
+                        sample_sizes,
+                        self.encoder_head_prior
+                    )
+                if self.args.decoder_self_attn_head_select:
+                    sample_sizes = self.get_sample_sizes(sample, decoder_task_ids, self.decoder_tasks)
+                    loss += self.kl_loss(
+                        model.decoder.self_attn_head_selector.head_samples,
+                        sample_sizes,
+                        self.decoder_head_prior
+                    )
+                if self.args.dec_enc_attn_head_select:
+                    sample_sizes = self.get_sample_sizes(sample, decoder_task_ids, self.decoder_tasks)
+                    loss += self.kl_loss(
+                        model.decoder.enc_attn_head_selector.head_samples,
+                        sample_sizes,
+                        self.decoder_head_prior
+                    )
+
+        if ignore_grad:
+            loss *= 0
+        with torch.autograd.profiler.record_function("backward"):
+            optimizer.backward(loss)
+        return loss, sample_size, logging_output
+
+    def valid_step(self, sample, model, criterion):
+        model.eval()
+        # task ids
+        if self.task_type == "lang":
+            encoder_task_ids = sample["src_lang_ids"]
+            decoder_task_ids = sample["tgt_lang_ids"]
+        elif self.task_type == "domain":
+            encoder_task_ids = sample["domain_ids"]
+            decoder_task_ids = sample["domain_ids"]
+        model.encoder.set_task_ids(encoder_task_ids)
+        model.decoder.set_task_ids(decoder_task_ids)
+        with torch.no_grad():
+            loss, sample_size, logging_output = criterion(model, sample)
+        return loss, sample_size, logging_output
+
+    def inference_step(
+        self, generator, models, sample, prefix_tokens=None, constraints=None
+    ):
+        with torch.no_grad():
+            # task ids
+            if self.task_type == "lang":
+                encoder_task_ids = sample["src_lang_ids"][:1]
+                decoder_task_ids = sample["tgt_lang_ids"][:1]
+            elif self.task_type == "domain":
+                encoder_task_ids = sample["domain_ids"][:1]
+                decoder_task_ids = sample["domain_ids"][:1]
+            for model in models:
+                model.encoder.set_task_ids(encoder_task_ids)
+                model.decoder.set_task_ids(decoder_task_ids)
+            return generator.generate(
+                models, sample, prefix_tokens=prefix_tokens, constraints=constraints
+            )
diff --git a/examples/audio_nlp/nlu/README.md b/examples/audio_nlp/nlu/README.md
new file mode 100644
index 0000000000..a11b3f3065
--- /dev/null
+++ b/examples/audio_nlp/nlu/README.md
@@ -0,0 +1,53 @@
+# End-to-end NLU
+
+End-to-end spoken language understanding (SLU) predicts intent directly from audio using a single model. It promises to improve the performance of assistant systems by leveraging acoustic information lost in the intermediate textual representation and preventing cascading errors from Automatic Speech Recognition (ASR). Further, having one unified model has efficiency advantages when deploying assistant systems on-device.
+
+This page releases the code for reproducing the results in [STOP: A dataset for Spoken Task Oriented Semantic Parsing](https://arxiv.org/abs/2207.10643)
+
+The dataset can be downloaded here: [download link](https://dl.fbaipublicfiles.com/stop/stop.tar.gz)
+
+The low-resource splits can be downloaded here: [download link](http://dl.fbaipublicfiles.com/stop/low_resource_splits.tar.gz)
+
+## Pretrained End-to-end NLU Models
+
+| Speech Pretraining | ASR Pretraining | Test EM Accuracy | Test EM-Tree Accuracy | Link |
+| ----------- | ----------- |----------|----------|----------|
+| None | None | 36.54 | 57.01 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-none-none.pt) |
+| Wav2Vec | None | 68.05 | 82.53 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-none.pt) |
+| HuBERT | None | 68.40 | 82.85 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-none.pt) |
+| Wav2Vec | STOP | 68.70 | 82.78 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-stop.pt) |
+| HuBERT | STOP | 69.23 | 82.87 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-stop.pt) |
+| Wav2Vec | Librispeech | 68.47 | 82.49 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-ls.pt) |
+| HuBERT | Librispeech | 68.70 | 82.78 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-ls.pt) |
+
+## Pretrained ASR Models
+| Speech Pre-training | ASR Dataset | STOP Eval WER | STOP Test WER | dev\_other WER | dev\_clean WER | test\_clean WER | test\_other WER | Link |
+| ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
+| HuBERT | Librispeech | 8.47 | 2.99 | 3.25 | 8.06 | 25.68 | 26.19 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-ls.pt) |
+| Wav2Vec | Librispeech | 9.215 | 3.204 | 3.334 | 9.006 | 27.257 | 27.588 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-ls.pt) |
+| HuBERT | STOP | 46.31 | 31.30 | 31.52 | 47.16 | 4.29 | 4.26 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-stop.pt) |
+| Wav2Vec | STOP | 43.103 | 27.833 | 28.479 | 28.479 | 4.679 | 4.667 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-stop.pt) |
+| HuBERT | Librispeech + STOP | 9.015 | 3.211 | 3.372 | 8.635 | 5.133 | 5.056 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-ls-stop.pt) |
+| Wav2Vec | Librispeech + STOP | 9.549 | 3.537 | 3.625 | 9.514 | 5.59 | 5.562 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-ls-stop.pt) |
+
+## Creating the fairseq datasets from STOP
+
+First, create the audio file manifests and label files:
+
+```
+python examples/audio_nlp/nlu/generate_manifests.py --stop_root $STOP_DOWNLOAD_DIR/stop --output $FAIRSEQ_DATASET_OUTPUT/
+```
+
+
+Run `./examples/audio_nlp/nlu/create_dict_stop.sh $FAIRSEQ_DATASET_OUTPUT` to generate the fairseq dictionaries.
+
+
+## Training an End-to-end NLU Model
+
+
+Download a HuBERT model from [link](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) or a wav2vec model from [link](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec).
+
+
+```
+python fairseq_cli/hydra_train.py --config-dir examples/audio_nlp/nlu/configs/ --config-name nlu_finetuning task.data=$FAIRSEQ_DATA_OUTPUT model.w2v_path=$PRETRAINED_MODEL_PATH
+```
diff --git a/examples/audio_nlp/nlu/configs/nlu_finetuning.yaml b/examples/audio_nlp/nlu/configs/nlu_finetuning.yaml
new file mode 100644
index 0000000000..bb90f45a30
--- /dev/null
+++ b/examples/audio_nlp/nlu/configs/nlu_finetuning.yaml
@@ -0,0 +1,59 @@
+# @package _group_
+
+common:
+  fp16: true
+  log_format: json
+  log_interval: 10
+  tensorboard_logdir: tb
+
+checkpoint:
+  no_epoch_checkpoints: true
+  best_checkpoint_metric: em_error
+  save_interval: 10
+
+task:
+  _name: nlu_finetuning
+  data: ???
+  labels: parse
+  eval_wer_parse: true
+  autoregressive: true
+
+dataset:
+  num_workers: 6
+  max_tokens: 1600000
+  skip_invalid_size_inputs_valid_test: true
+  valid_subset: eval,test
+  train_subset: train
+  validate_interval: 10
+
+criterion:
+  _name: label_smoothed_cross_entropy
+
+optimization:
+  max_update: 320000
+  lr: [0.0001]
+  sentence_avg: true
+  update_freq: [1]
+
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-08
+
+lr_scheduler:
+  _name: tri_stage
+  phase_ratio: [0.1, 0.4, 0.5]
+  final_lr_scale: 0.05
+
+model:
+  _name: wav2vec_seq2seq
+  w2v_path: ???
+  autoregressive: true
+  apply_mask: true
+  mask_prob: 0.5
+  mask_channel_prob: 0.5
+  mask_channel_length: 64
+  layerdrop: 0.1
+  activation_dropout: 0.1
+  feature_grad_mult: 0.0
+  freeze_finetune_updates: 0
diff --git a/examples/audio_nlp/nlu/create_dict_stop.sh b/examples/audio_nlp/nlu/create_dict_stop.sh
new file mode 100755
index 0000000000..753393284d
--- /dev/null
+++ b/examples/audio_nlp/nlu/create_dict_stop.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+### Script handling creation of data binaries
+### for model training within fairseq
+
+
+fairseq_root="."
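+# NOTE: fairseq_root="." assumes this script is invoked from the fairseq
+# repository root so that $fairseq_root/fairseq_cli/preprocess.py resolves.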
+ +data_root=$1 +train_prefix="${data_root}/train" +valid_prefix="${data_root}/eval" +test_prefix="${data_root}/test" + +dest_dir="$data_root/" + +#echo "src dict: $src_dict" > "$dest_dir/src_dict.txt" +#echo "trg dict: $tgt_dict" > "$dest_dir/tgt_dict.txt" + + #--tgtdict $tgt_dict \ +PYTHONPATH=$fairseq_root \ + python $fairseq_root/fairseq_cli/preprocess.py \ + --source-lang "parse" \ + --trainpref "$train_prefix" \ + --validpref "$valid_prefix" \ + --destdir "$dest_dir" \ + --only-source \ + --dict-only \ + --workers 60; + +PYTHONPATH=$fairseq_root \ + python $fairseq_root/fairseq_cli/preprocess.py \ + --source-lang "ltr" \ + --trainpref "$train_prefix" \ + --validpref "$valid_prefix" \ + --destdir "$dest_dir" \ + --only-source \ + --dict-only \ + --workers 60; diff --git a/examples/audio_nlp/nlu/generate_manifests.py b/examples/audio_nlp/nlu/generate_manifests.py new file mode 100644 index 0000000000..e2176099cb --- /dev/null +++ b/examples/audio_nlp/nlu/generate_manifests.py @@ -0,0 +1,83 @@ +import argparse +from pathlib import Path +import soundfile + +def get_insl_frame(parse): + out = [] + def is_ont_token(tok): + return tok[0] in ["[", "]"]; + + res = [] + x = [] + for tok in parse.split(): + if is_ont_token(tok): + res.extend('_'.join(x)) + x = [] + res.append(tok.upper()) + else: + x.append(tok.upper()) + + return " ".join(res) + ' | ' + +def sequencify_utterance(utterance): + utterance = utterance.upper() + utterance = utterance.replace(' ', '|') + '|' + utterance = list(utterance) + utterance = ' '.join(utterance) + return utterance + + +def generate_fairseq_manifests(manifest, output_path, audio_root=None): + + with open(manifest, 'r') as i: + parses = [] + utterances = [] + filepaths = [] + keys = None + for (idx, line) in enumerate(i): + if idx == 0: keys = line.strip().split('\t') + else: + data = { k: v for (k, v) in zip(keys, line.split('\t'))} + parses.append(get_insl_frame(data['decoupled_normalized_seqlogical'])) + utterances.append(sequencify_utterance(data['normalized_utterance'])) + filepaths.append(data['file_id']) + + parses_fp = output_path.with_suffix('.parse') + with open(str(parses_fp), 'w') as o: + for p in parses: + o.write(p + '\n') + + utterances_fp = output_path.with_suffix('.ltr') + with open(str(utterances_fp), 'w') as o: + for u in utterances: + o.write(u + '\n') + + filepaths_fp = output_path.with_suffix('.tsv') + with open(str(filepaths_fp), 'w') as o: + o.write(str(audio_root) + '\n') + for f in filepaths: + fullpath = audio_root / f + assert fullpath.exists(), f'{fullpath}' + frames = soundfile.info(fullpath).frames + o.write(f'{f}\t{frames}\n') + +def main(args): + + splits = ['train', 'eval', 'test'] + root = Path(args.stop_root) + output_root = Path(args.output) + + for split in splits: + stop_manifest_path = root / 'manifests' / (split + '.tsv') + output_path = output_root / (split) + + generate_fairseq_manifests(stop_manifest_path, output_path, root) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument('--stop_root', type=str, + help='path to stop root directory') + parser.add_argument('--output', type=str, + help='output directory') + args = parser.parse_args() + main(args) diff --git a/examples/bart/README.md b/examples/bart/README.md index 76857a99a2..4050a724ee 100644 --- a/examples/bart/README.md +++ b/examples/bart/README.md @@ -1,6 +1,6 @@ # BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension 
-[https://arxiv.org/pdf/1910.13461.pdf] +[https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461) ## Introduction @@ -100,7 +100,7 @@ bart.predict('mnli', tokens).argmax() # 2: entailment ##### Register a new (randomly initialized) classification head: ```python bart.register_classification_head('new_task', num_classes=3) -logprobs = bart.predict('new_task', tokens) +logprobs = bart.predict('new_task', tokens) ``` ##### Batched prediction: @@ -137,15 +137,23 @@ BART can be used to fill multiple `` tokens in the input. ```python bart = torch.hub.load('pytorch/fairseq', 'bart.base') bart.eval() -bart.fill_mask('The cat on the .', topk=3, beam=10) -# [('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))] +bart.fill_mask(['The cat on the .'], topk=3, beam=10) +# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]] ``` Note that by default we enforce the output length to match the input length. This can be disabled by setting ``match_source_len=False``: ``` -bart.fill_mask('The cat on the .', topk=3, beam=10, match_source_len=False) -# [('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))] +bart.fill_mask(['The cat on the .'], topk=3, beam=10, match_source_len=False) +# [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]] +``` + +Example code to fill masks for a batch of sentences using GPU +``` +bart.cuda() +bart.fill_mask(['The cat on the .', 'The dog on the .'], topk=3, beam=10) +# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)), +('The dog was asleep on the couch', tensor(-0.6796))]] ``` #### Evaluating the `bart.large.mnli` model: @@ -171,38 +179,23 @@ with open('glue_data/MNLI/dev_matched.tsv') as fin: ``` #### Evaluating the `bart.large.cnn` model: -Follow instructions [here](https://github.com/abisee/cnn-dailymail) to download and process into data-files such that `test.source` and `test.target` has one line for each non-tokenized sample. +- Follow instructions [here](https://github.com/abisee/cnn-dailymail) to download and process into data-files such that `test.source` and `test.target` has one line for each non-tokenized sample. +- For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores +- `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search. + In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`. 
-```python -bart = torch.hub.load('pytorch/fairseq', 'bart.large.cnn') -bart.cuda() -bart.eval() -bart.half() -count = 1 -bsz = 32 -with open('test.source') as source, open('test.hypo', 'w') as fout: - sline = source.readline().strip() - slines = [sline] - for sline in source: - if count % bsz == 0: - with torch.no_grad(): - hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3) - - for hypothesis in hypotheses_batch: - fout.write(hypothesis + '\n') - fout.flush() - slines = [] - - slines.append(sline.strip()) - count += 1 - if slines != []: - hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3) - for hypothesis in hypotheses_batch: - fout.write(hypothesis + '\n') - fout.flush() -``` - -Install `files2rouge` from [here](https://github.com/pltrdy/files2rouge). +In `fairseq`, summaries can be generated using: + +```bash +cp data-bin/cnn_dm/dict.source.txt checkpoints/ +python examples/bart/summarize.py \ + --model-dir pytorch/fairseq \ + --model-file bart.large.cnn \ + --src cnn_dm/test.source \ + --out cnn_dm/test.hypo +``` + +For calculating rouge, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge). ```bash export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar diff --git a/examples/bart/README.summarization.md b/examples/bart/README.summarization.md index d7fecc9ce6..8727584f2b 100644 --- a/examples/bart/README.summarization.md +++ b/examples/bart/README.summarization.md @@ -80,42 +80,23 @@ Expected training time is about `5 hours`. Training time can be reduced with dis Use TOTAL_NUM_UPDATES=15000 UPDATE_FREQ=2 for Xsum task ### Inference for CNN-DM test data using above trained checkpoint. 
-After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using following python code snippet: +After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using `eval_cnn.py`, for example -```python -import torch -from fairseq.models.bart import BARTModel - -bart = BARTModel.from_pretrained( - 'checkpoints/', - checkpoint_file='checkpoint_best.pt', - data_name_or_path='cnn_dm-bin' -) - -bart.cuda() -bart.eval() -bart.half() -count = 1 -bsz = 32 -with open('cnn_dm/test.source') as source, open('cnn_dm/test.hypo', 'w') as fout: - sline = source.readline().strip() - slines = [sline] - for sline in source: - if count % bsz == 0: - with torch.no_grad(): - hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3) - - for hypothesis in hypotheses_batch: - fout.write(hypothesis + '\n') - fout.flush() - slines = [] - - slines.append(sline.strip()) - count += 1 - if slines != []: - hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3) - for hypothesis in hypotheses_batch: - fout.write(hypothesis + '\n') - fout.flush() +```bash +cp data-bin/cnn_dm/dict.source.txt checkpoints/ +python examples/bart/summarize.py \ + --model-dir checkpoints \ + --model-file checkpoint_best.pt \ + --src cnn_dm/test.source \ + --out cnn_dm/test.hypo +``` +For XSUM, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10: +```bash +cp data-bin/cnn_dm/dict.source.txt checkpoints/ +python examples/bart/summarize.py \ + --model-dir checkpoints \ + --model-file checkpoint_best.pt \ + --src cnn_dm/test.source \ + --out cnn_dm/test.hypo \ + --xsum-kwargs ``` -Use beam=6, lenpen=1.0, max_len_b=60, min_len=10 for Xsum Generation diff --git a/examples/bart/summarize.py b/examples/bart/summarize.py new file mode 100644 index 0000000000..04435f80e3 --- /dev/null +++ b/examples/bart/summarize.py @@ -0,0 +1,100 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from fairseq.models.bart import BARTModel +import argparse + +XSUM_KWARGS = dict(beam=6, lenpen=1.0, max_len_b=60, min_len=10, no_repeat_ngram_size=3) +CNN_KWARGS = dict(beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3) + + +@torch.no_grad() +def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs): + count = 1 + + # if n_obs is not None: bsz = min(bsz, n_obs) + + with open(infile) as source, open(outfile, "w") as fout: + sline = source.readline().strip() + slines = [sline] + for sline in source: + if n_obs is not None and count > n_obs: + break + if count % bsz == 0: + hypotheses_batch = bart.sample(slines, **eval_kwargs) + for hypothesis in hypotheses_batch: + fout.write(hypothesis + "\n") + fout.flush() + slines = [] + + slines.append(sline.strip()) + count += 1 + + if slines != []: + hypotheses_batch = bart.sample(slines, **eval_kwargs) + for hypothesis in hypotheses_batch: + fout.write(hypothesis + "\n") + fout.flush() + + +def main(): + """ + Usage:: + + python examples/bart/summarize.py \ + --model-dir $HOME/bart.large.cnn \ + --model-file model.pt \ + --src $HOME/data-bin/cnn_dm/test.source + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--model-dir", + required=True, + type=str, + default="bart.large.cnn/", + help="path containing model file and src_dict.txt", + ) + parser.add_argument( + "--model-file", + default="checkpoint_best.pt", + help="where in model_dir are weights saved", + ) + parser.add_argument( + "--src", default="test.source", help="text to summarize", type=str + ) + parser.add_argument( + "--out", default="test.hypo", help="where to save summaries", type=str + ) + parser.add_argument("--bsz", default=32, help="where to save summaries", type=int) + parser.add_argument( + "--n", default=None, help="how many examples to summarize", type=int + ) + parser.add_argument( + "--xsum-kwargs", + action="store_true", + default=False, + help="if true use XSUM_KWARGS else CNN_KWARGS", + ) + args = parser.parse_args() + eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS + if args.model_dir == "pytorch/fairseq": + bart = torch.hub.load("pytorch/fairseq", args.model_file) + else: + bart = BARTModel.from_pretrained( + args.model_dir, + checkpoint_file=args.model_file, + data_name_or_path=args.model_dir, + ) + bart = bart.eval() + if torch.cuda.is_available(): + bart = bart.cuda().half() + generate( + bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs + ) + + +if __name__ == "__main__": + main() diff --git a/examples/constrained_decoding/README.md b/examples/constrained_decoding/README.md index cfca9c91fd..e04b8b6a01 100644 --- a/examples/constrained_decoding/README.md +++ b/examples/constrained_decoding/README.md @@ -12,7 +12,7 @@ Constrained search is enabled by adding the command-line argument `--constraints Constraints are appended to each line of input, separated by tabs. Each constraint (one or more tokens) is a separate field. -The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md), +The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/main/examples/wmt19/README.md), translates the sentence *Die maschinelle Übersetzung ist schwer zu kontrollieren.* with the constraints "hard" and "to influence". 
diff --git a/examples/criss/mining/mine.py b/examples/criss/mining/mine.py index c86f73ae87..c872da196f 100644 --- a/examples/criss/mining/mine.py +++ b/examples/criss/mining/mine.py @@ -7,7 +7,12 @@ import glob from subprocess import check_call -import faiss +try: + import faiss + + has_faiss = True +except ImportError: + has_faiss = False import numpy as np @@ -40,6 +45,8 @@ def load_batch(emb_file, dim): def knnGPU_sharded(x_batches_f, y_batches_f, dim, k, direction="x2y"): + if not has_faiss: + raise ImportError("Please install Faiss") sims = [] inds = [] xfrom = 0 diff --git a/examples/criss/save_encoder.py b/examples/criss/save_encoder.py index d911d066e3..24a842e409 100644 --- a/examples/criss/save_encoder.py +++ b/examples/criss/save_encoder.py @@ -11,6 +11,7 @@ import torch from fairseq import checkpoint_utils, options, progress_bar, tasks, utils from fairseq.sequence_generator import EnsembleModel +from fairseq.utils import safe_hasattr def get_avg_pool( @@ -109,9 +110,9 @@ def main(args): shard_id = 0 all_avg_pool = None encoder_has_langtok = ( - hasattr(task.args, "encoder_langtok") + safe_hasattr(task.args, "encoder_langtok") and task.args.encoder_langtok is not None - and hasattr(task.args, "lang_tok_replacing_bos_eos") + and safe_hasattr(task.args, "lang_tok_replacing_bos_eos") and not task.args.lang_tok_replacing_bos_eos ) with progress_bar.build_progress_bar(args, itr) as t: diff --git a/examples/cross_lingual_language_model/README.md b/examples/cross_lingual_language_model/README.md index a78f86d8da..af9128e39e 100644 --- a/examples/cross_lingual_language_model/README.md +++ b/examples/cross_lingual_language_model/README.md @@ -61,14 +61,14 @@ fairseq-train \ --max-update 2400000 --save-interval 1 --no-epoch-checkpoints \ --arch xlm_base \ --optimizer adam --lr-scheduler reduce_lr_on_plateau \ ---lr-shrink 0.5 --lr 0.0001 --min-lr 1e-09 \ +--lr-shrink 0.5 --lr 0.0001 --stop-min-lr 1e-09 \ --dropout 0.1 \ --criterion legacy_masked_lm_loss \ --max-tokens 2048 --tokens-per-sample 256 --attention-dropout 0.1 \ --dataset-impl lazy --seed 0 \ --masked-lm-only \ --monolingual-langs 'ar,de,en,hi,fr' --num-segment 5 \ ---ddp-backend=no_c10d +--ddp-backend=legacy_ddp ``` Some Notes: diff --git a/examples/data2vec/README.md b/examples/data2vec/README.md new file mode 100644 index 0000000000..a0ff21b82a --- /dev/null +++ b/examples/data2vec/README.md @@ -0,0 +1,261 @@ +# data2vec 2.0 + +data2vec 2.0 improves the training efficiency of the original data2vec algorithm. We make the following improvements for efficiency considerations - we forward only the unmasked timesteps through the encoder, we use convolutional decoder and we use multimasking to amortize the compute overhead of the teacher model. You can find details in the paper [Efficient Self-supervised Learning with Contextualized Target Representations for Vision, Speech and Language](https://arxiv.org/abs/2212.07525) and our [blog post](https://ai.facebook.com/blog/ai-self-supervised-learning-data2vec/). 
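+
+To make the three efficiency ideas above concrete, below is a minimal, self-contained PyTorch sketch of one training step. It is an illustration only, not the fairseq implementation; `TinyEncoder`, `ConvDecoder` and `train_step` are hypothetical stand-ins. An EMA teacher encodes the full input once, the student encodes only the unmasked timesteps, a small convolutional decoder predicts the teacher's contextualized targets, and several masks are drawn per sample to amortize the teacher forward.
+
+```python
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class TinyEncoder(nn.Module):
+    def __init__(self, dim=64):
+        super().__init__()
+        self.layer = nn.TransformerEncoderLayer(dim, nhead=4, batch_first=True)
+
+    def forward(self, x):  # x: (B, T, C)
+        return self.layer(x)
+
+
+class ConvDecoder(nn.Module):
+    def __init__(self, dim=64):
+        super().__init__()
+        self.conv = nn.Conv1d(dim, dim, kernel_size=3, padding=1)
+
+    def forward(self, x):  # x: (B, T, C)
+        return self.conv(x.transpose(1, 2)).transpose(1, 2)
+
+
+def train_step(student, teacher, decoder, x, num_masks=4, mask_ratio=0.5, ema=0.999):
+    B, T, C = x.shape
+    with torch.no_grad():
+        targets = teacher(x)  # one teacher forward: contextualized targets
+    loss = 0.0
+    for _ in range(num_masks):  # multi-masking amortizes the teacher pass
+        keep = torch.rand(B, T, device=x.device).argsort(dim=1)[:, : int(T * (1 - mask_ratio))]
+        idx = keep.unsqueeze(-1).expand(-1, -1, C)
+        enc = student(torch.gather(x, 1, idx))  # encode unmasked timesteps only
+        pred = decoder(torch.zeros_like(x).scatter(1, idx, enc))  # decoder fills masked slots
+        loss = loss + F.mse_loss(pred, targets)  # simplified: regression over all timesteps
+    with torch.no_grad():  # EMA update of the teacher
+        for p_t, p_s in zip(teacher.parameters(), student.parameters()):
+            p_t.mul_(ema).add_(p_s, alpha=1 - ema)
+    return loss / num_masks
+
+
+student = TinyEncoder()
+teacher = copy.deepcopy(student).requires_grad_(False)
+decoder = ConvDecoder()
+train_step(student, teacher, decoder, torch.randn(2, 32, 64)).backward()
+```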
+
+## Pretrained and finetuned models
+### Vision
+| Model | Finetuning split | Link
+|---|---|---
+data2vec ViT-B | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_imagenet.pt)
+data2vec ViT-B | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_imagenet_ft.pt)
+data2vec ViT-L | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_imagenet.pt)
+data2vec ViT-L | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_imagenet_ft.pt)
+data2vec ViT-H | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/huge_imagenet.pt)
+data2vec ViT-H | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/huge_imagenet_ft.pt)
+
+Only the vision models are licensed under CC-BY-NC.
+### Speech
+
+| Model | Finetuning split | Dataset | Link
+|---|---|---|---
+data2vec Base | No fine-tuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_libri.pt)
+data2vec Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_libri_960h.pt)
+data2vec Large | No fine-tuning | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_vox.pt)
+data2vec Large | 960 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_vox_960h.pt)
+
+### NLP
+
+| Model | Fine-tuning data | Dataset | Link | Dict | BPE
+|---|---|---|---|---|---
+data2vec Base | No fine-tuning | Books + Wiki | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/nlp_base.pt) | [dict](https://dl.fbaipublicfiles.com/fairseq/data2vec2/dict.txt) | [encoder](https://dl.fbaipublicfiles.com/fairseq/data2vec2/encoder.json) / [vocab](https://dl.fbaipublicfiles.com/fairseq/data2vec2/vocab.bpe)
+
+[//]: # (## Data Preparation)
+
+[//]: # ()
+[//]: # (### Vision)
+
+[//]: # (add details)
+
+[//]: # (### Speech)
+
+[//]: # (add details)
+
+[//]: # ()
+[//]: # (### NLP)
+
+[//]: # (add details)
+
+
+## Commands to train different models using data2vec 2.0
+
+### Vision
+
+Commands to pretrain different model configurations:
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
+--config-name base_images_only_task task.data=/path/to/dir
+```
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
+--config-name large_images_only_task task.data=/path/to/dir
+```
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \
+--config-name huge_images14_only_task task.data=/path/to/dir
+```
+
+Commands to finetune different model configurations:
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \
+--config-name mae_imagenet_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model
+```
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \
+--config-name mae_imagenet_large_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model
+```
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \
+--config-name mae_imagenet_huge_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model
+``` + +### Speech + +```shell script +$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ +--config-name base_audio_only_task task.data=/path/to/manifests +``` + +```shell script +$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ +--config-name large_audio_only_task task.data=/path/to/manifests +``` + +Finetuning: + +```shell script +$ python fairseq_cli/hydra_train.py -m --config-dir examples/wav2vec/config/finetuning --config-name vox_10h \ +task.data=/path/to/manifests model.w2v_path=/path/to/pretrained/model common.user_dir=examples/data2vec +``` + +Replace vox_10h with the right config depending on your model and fine-tuning split. +See examples/wav2vec/config/finetuning for all available configs. + +### NLP + +Commands to pretrain +```shell script +$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ +--config-name base_text_only_task task.data=/path/to/file +``` + +Commands to fine-tune all GLUE tasks +```shell script +$ task=cola # choose from [cola|qnli|mrpc|rte|sst_2|mnli|qqp|sts_b] +$ lr=1e-5 # sweep [1e-5|2e-5|4e-5|6e-5] for each task +$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2/text_finetuning \ +--config-name $task task.data=/path/to/file model.model_path=/path/to/pretrained/model "optimization.lr=[${lr}]" +``` + +# data2vec + +data2vec is a framework for self-supervised representation learning for images, speech, and text as described in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language (Baevski et al., 2022)](https://ai.facebook.com/research/data2vec-a-general-framework-for-self-supervised-learning-in-speech-vision-and-language). The algorithm uses the same learning mechanism for different modalities. + + +## Pre-trained models + +### Vision + +Code and pre-trained models for data2vec visions can be found [here](https://github.com/facebookresearch/data2vec_vision/tree/main/beit). 
+
+### Speech
+
+| Model | Finetuning split | Dataset | Link
+|---|---|---|---
+data2vec Base | No fine-tuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls.pt)
+data2vec Base | 10 minutes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_10m.pt)
+data2vec Base | 100 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_100h.pt)
+data2vec Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_960h.pt)
+data2vec Large | No fine-tuning | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_pretrained.pt)
+data2vec Large | 10 minutes | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_10m.pt)
+data2vec Large | 100 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_100h.pt)
+data2vec Large | 960 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_960h.pt)
+---
+
+### NLP
+
+Model | Fine-tuning data | Dataset | Link
+|---|---|---|---|
+data2vec Base | No fine-tuning | Books + Wiki | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/nlp_base.pt)
+
+## Training a new speech model with the CLI tools
+
+Start from a directory containing the wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length).
+
+### Prepare training data manifest:
+
+First, install the `soundfile` library:
+```shell script
+pip install soundfile
+```
+
+Next, run:
+
+```shell script
+$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext --valid-percent $valid
+```
+
+`$ext` should be set to `flac`, `wav`, or whatever other format your dataset uses, as long as `soundfile` can read it.
+
+`$valid` should be set to some reasonable percentage (like 0.01) of training data to use for validation.
+To use a pre-defined validation set (like dev-other from Librispeech), set it to 0 and then overwrite valid.tsv with a
+separately pre-processed manifest file.
+
+### Train a data2vec Base model:
+
+This configuration was used for the base model trained on the Librispeech dataset in the data2vec paper.
+
+Note that the input is expected to be single-channel audio sampled at 16 kHz.
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/audio/pretraining \
+--config-name base_librispeech task.data=/path/to/manifests common.user_dir=examples/data2vec
+```
+
+Note: you can simulate 16 GPUs by using k GPUs and adding the command-line parameters
+`distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 16/k.
+
+### Fine-tune a pre-trained model with CTC:
+
+Fine-tuning a model requires parallel audio and label files, as well as a vocabulary file in fairseq format.
+A letter vocabulary can be downloaded [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
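+
+For orientation, the sketch below shows the expected layout of the tsv manifest produced by wav2vec_manifest.py and of a matching letter-label (`.ltr`) file (paths, sample counts, and transcripts are hypothetical; the layout follows the standard wav2vec-style conventions referenced above):
+
+```python
+# Hypothetical illustration of the expected file layouts; not a fairseq utility.
+# train.tsv: first line is the audio root, then "relative_path<TAB>num_samples".
+with open("train.tsv", "w") as f:
+    f.write("/path/to/waves\n")
+    f.write("clip_0001.flac\t160000\n")   # 10 s at 16 kHz
+    f.write("clip_0002.flac\t320000\n")   # 20 s at 16 kHz
+
+# train.ltr: one transcript per audio line, spelled out as letters with "|" as
+# the word boundary, matching the letter vocabulary (dict.ltr.txt) linked above.
+with open("train.ltr", "w") as f:
+    f.write("H E L L O | W O R L D |\n")
+    f.write("G O O D | M O R N I N G |\n")
+```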
+An example [script](../wav2vec/libri_labels.py) that generates labels for the Librispeech dataset from the tsv file produced by wav2vec_manifest.py can be used as follows:
+
+```shell script
+split=train
+$ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $split
+```
+
+Fine-tuning on 100h of Librispeech with letter targets:
+```shell script
+$ fairseq-hydra-train \
+    distributed_training.distributed_port=$PORT \
+    task.data=/path/to/data \
+    model.w2v_path=/path/to/model.pt \
+    --config-dir /path/to/fairseq-py/examples/wav2vec/config/finetuning \
+    --config-name base_100h common.user_dir=examples/data2vec
+```
+
+There are other config files in the config/finetuning directory that can be used to fine-tune on other splits.
+You can specify the right config via the `--config-name` parameter.
+
+Decoding with a language model during training requires the flashlight [python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)).
+If you want to use a language model, add `+criterion.wer_args='[/path/to/kenlm, /path/to/lexicon, 2, -1]'` to the command line.
+
+### Evaluating a CTC model:
+
+Evaluating a CTC model with a language model requires the [flashlight python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) to be installed.
+
+The Fairseq transformer language model used in the wav2vec 2.0 paper can be obtained from the [wav2letter model repository](https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019).
+Be sure to upper-case the language model vocab after downloading it.
+
+The letter dictionary for pre-trained models can be found [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
+
+Next, run the evaluation command:
+
+```shell script
+python examples/speech_recognition/new/infer.py --config-dir examples/speech_recognition/new/conf \
+--config-name infer task=audio_finetuning task.data=/path/to/manifests common.user_dir=examples/data2vec \
+task.labels=ltr decoding.type=kenlm \
+decoding.lmweight=${lmweight} decoding.wordscore=${wordscore} decoding.silweight=${silscore} \
+decoding.lexicon=/path/to/lexicon \
+decoding.lmpath=/path/to/lm decoding.unique_wer_file=True \
+dataset.gen_subset=dev_clean,dev_other,test_clean,test_other \
+common_eval.path=/path/to/checkpoint.pt decoding.beam=1500 distributed_training.distributed_world_size=${num_gpus}
+```
+
+To get raw numbers, use `decoding.type=viterbi` and omit the lexicon. To use the transformer language model, use `decoding.type=fairseqlm`.
+
+## Training a new NLP model with the CLI tools
+
+Please follow the [RoBERTa](../roberta/README.md) instructions to preprocess your data. To train a data2vec text model, run:
+
+```shell script
+$ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/text/pretraining \
+--config-name base task.data=/path/to/data common.user_dir=examples/data2vec
+```
+
+As with the speech models, you can simulate 16 GPUs by using the `update_freq` parameter (see the short sketch below).
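+
+The 16/k rule from the speech section can be written down as a tiny helper that prints the matching overrides (an illustration only, not a fairseq utility):
+
+```python
+# Hedged helper: given k local GPUs, print the overrides for emulating the
+# 16-GPU setup used above, with update_freq = 16 / k.
+def simulate_16_gpu_overrides(k: int) -> str:
+    assert 16 % k == 0, "k should divide 16 so update_freq stays an integer"
+    return (
+        f"distributed_training.distributed_world_size={k} "
+        f"+optimization.update_freq='[{16 // k}]'"
+    )
+
+print(simulate_16_gpu_overrides(4))
+# distributed_training.distributed_world_size=4 +optimization.update_freq='[4]'
+```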
+ +### Finetuning data2vec-text on GLUE + +Please use a command similar to this: + +```shell +$ python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task task.data=$data_path checkpoint.restore_file="${/path/to/pretrained/model.pt}" +``` diff --git a/examples/data2vec/__init__.py b/examples/data2vec/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/data2vec/config/audio/classification/base_classification.yaml b/examples/data2vec/config/audio/classification/base_classification.yaml new file mode 100644 index 0000000000..fdb9c8d3d7 --- /dev/null +++ b/examples/data2vec/config/audio/classification/base_classification.yaml @@ -0,0 +1,70 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + all_gather_list_size: 70000 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: mAP + maximize_best_checkpoint_metric: true + +task: + _name: audio_classification + data: ??? + normalize: true + labels: lbl + +dataset: + num_workers: 6 + max_tokens: 2560000 + skip_invalid_size_inputs_valid_test: true + valid_subset: eval + validate_interval: 5 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: model + can_sum: false + log_keys: + - _predictions + - _targets + +optimization: + max_update: 30000 + lr: [0.00006] # scratch 53-5 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 5000 + +model: + _name: audio_classification + model_path: ??? + apply_mask: true + mask_prob: 0.6 + mask_length: 5 # scratch 1 + mask_channel_prob: 0 + mask_channel_length: 64 + layerdrop: 0.1 + dropout: 0.1 + activation_dropout: 0.1 + attention_dropout: 0.2 + feature_grad_mult: 0 # scratch 1 + label_mixup: true + source_mixup: 0.5 + prediction_mode: lin_softmax # scratch average_sigmoid + diff --git a/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml b/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml new file mode 100644 index 0000000000..881a1583f8 --- /dev/null +++ b/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml b/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml new file mode 100644 index 0000000000..de7894d9cf --- /dev/null +++ b/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +hydra: + 
job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 100 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml b/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml new file mode 100644 index 0000000000..b016cac9b5 --- /dev/null +++ b/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/audioset.yaml b/examples/data2vec/config/audio/pretraining/audioset.yaml new file mode 100644 index 0000000000..dd30fbedd5 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/audioset.yaml @@ -0,0 +1,91 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: /private/home/abaevski/data/audioset + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + +dataset: + num_workers: 6 + max_tokens: 3400000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 24 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var +# - avg_self_attn +# - weights + +optimization: + max_update: 200000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +model: + _name: data2vec_audio + extractor_mode: layer_norm + encoder_layerdrop: 0.05 + dropout_input: 0.0 + dropout_features: 0.0 + feature_grad_mult: 1.0 + 
encoder_embed_dim: 768 + + mask_prob: 0.65 + mask_length: 10 + + loss_beta: 0 + loss_scale: null + + instance_norm_target_layer: true + layer_norm_targets: true + average_top_k_layers: 12 + + self_attn_norm_type: deepnorm + final_norm_type: deepnorm + + pos_conv_depth: 5 + conv_pos: 95 + + ema_decay: 0.999 + ema_end_decay: 0.9999 + ema_anneal_end_step: 30000 + ema_transformer_only: true + ema_layers_only: false + + require_same_masks: true + mask_dropout: 0 diff --git a/examples/data2vec/config/audio/pretraining/base_librispeech.yaml b/examples/data2vec/config/audio/pretraining/base_librispeech.yaml new file mode 100644 index 0000000000..c332c5a3f8 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/base_librispeech.yaml @@ -0,0 +1,83 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + +dataset: + num_workers: 6 + max_tokens: 3800000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + +optimization: + max_update: 400000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.03,0.9,0.07] + +model: + _name: data2vec_audio + extractor_mode: layer_norm + encoder_layerdrop: 0.05 + dropout_input: 0.0 + dropout_features: 0.0 + feature_grad_mult: 1.0 + encoder_embed_dim: 768 + + mask_prob: 0.65 + mask_length: 10 + + loss_beta: 0 + loss_scale: null + + instance_norm_target_layer: true + average_top_k_layers: 8 + + pos_conv_depth: 5 + conv_pos: 95 + + ema_decay: 0.999 + ema_end_decay: 0.9999 + ema_anneal_end_step: 30000 + ema_transformer_only: true + ema_layers_only: true + + require_same_masks: true + mask_dropout: 0 diff --git a/examples/data2vec/config/audio/pretraining/run_config/local.yaml b/examples/data2vec/config/audio/pretraining/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml new file mode 100644 index 0000000000..732f018899 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: 
/checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000..e2bab5675a --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000..ec53dc2a98 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..70cc8cbb5b --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - 
task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000..14b47d14e6 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml new file mode 100644 index 0000000000..c54d735fb2 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000..0231b2690d --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - 
distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml new file mode 100644 index 0000000000..9a4e43a987 --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 6 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml b/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000..78c9f57aeb --- /dev/null +++ b/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/text/pretraining/base.yaml b/examples/data2vec/config/text/pretraining/base.yaml new file mode 100644 index 0000000000..c6b07c4052 --- /dev/null +++ b/examples/data2vec/config/text/pretraining/base.yaml @@ -0,0 +1,77 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb 
+ +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 50000 + keep_interval_updates: 1 + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete_doc + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + mask_prob: 0.35 + mask_multiple_length: 4 + +criterion: model + +dataset: + max_tokens: 8192 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +optimization: + clip_norm: 5 + lr: [0.0002] + max_update: 1000000 + update_freq: [1] + +model: + _name: data2vec_text + head_layers: 2 + average_top_k_layers: 10 + layer_norm_target_layer: true + loss_scale: 1 + ema_decay: 0.999 + ema_end_decay: 0.9999 + ema_anneal_end_step: 300000 + loss_beta: 4 + ema_transformer_layers_only: true + + transformer: + dropout: 0.1 + attention_dropout: 0.1 + layernorm_embedding: true + activation_fn: gelu + no_scale_embedding: true + max_source_positions: 512 + encoder: + embed_dim: 768 + ffn_embed_dim: 3072 + layers: 12 + attention_heads: 12 + normalize_before: false + learned_pos: true + layerdrop: 0 diff --git a/examples/data2vec/config/text/pretraining/run_config/local.yaml b/examples/data2vec/config/text/pretraining/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000..4bac45a58d --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000..006a0f2116 --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - 
distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..4292198b4e --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000..0e1555d20f --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml new file mode 100644 index 0000000000..c54d735fb2 --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml @@ 
-0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000..5df84cd6da --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml @@ -0,0 +1,41 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 + +distributed_training: + distributed_world_size: 32 + ddp_backend: legacy_ddp diff --git a/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml b/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000..5b32c23a66 --- /dev/null +++ b/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml @@ -0,0 +1,41 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: pt + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 + +distributed_training: + distributed_world_size: 64 + ddp_backend: legacy_ddp diff --git 
a/examples/data2vec/config/v2/base_audio_only_task.yaml b/examples/data2vec/config/v2/base_audio_only_task.yaml new file mode 100644 index 0000000000..65a9ab3e73 --- /dev/null +++ b/examples/data2vec/config/v2/base_audio_only_task.yaml @@ -0,0 +1,113 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: false + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: /private/home/abaevski/data/librispeech/full + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + precompute_mask_config: {} + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 8 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 400000 + lr: [0.00075] + debug_param_names: true + +optimizer: + _name: adam + adam_betas: [ 0.9,0.98 ] + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 8000 + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: null + + depth: 12 + embed_dim: 768 + clone_batch: 8 + + ema_decay: 0.999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 75000 + ema_encoder_only: false + + average_top_k_layers: 8 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: false + + layerdrop: 0.05 + norm_eps: 1e-5 + + supported_modality: AUDIO + + modalities: + audio: + feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' + conv_pos_depth: 5 + conv_pos_width: 95 + conv_pos_groups: 16 + prenet_depth: 0 + mask_prob: 0.5 + mask_prob_adjust: 0.05 + inverse_mask: false + mask_length: 5 + mask_noise_std: 0.01 + mask_dropout: 0 + add_masks: false + ema_local_encoder: false + use_alibi_encoder: true + prenet_layerdrop: 0.05 + prenet_dropout: 0.1 + learned_alibi_scale: true + learned_alibi_scale_per_head: true + decoder: + input_dropout: 0.1 + decoder_dim: 384 + decoder_groups: 16 + decoder_kernel: 7 + decoder_layers: 4 diff --git a/examples/data2vec/config/v2/base_images_only_task.yaml b/examples/data2vec/config/v2/base_images_only_task.yaml new file mode 100644 index 0000000000..ff0c247b13 --- /dev/null +++ b/examples/data2vec/config/v2/base_images_only_task.yaml @@ -0,0 +1,116 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 16 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + 
+optimization: + max_update: 375300 + lr: [ 0.001 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 1e-3 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 10 + average_top_k_layers: 10 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + inverse_mask: true + mask_prob: 0.8 + mask_prob_adjust: 0.07 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 2 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + decoder: + decoder_dim: 768 + decoder_groups: 16 + decoder_kernel: 3 + decoder_layers: 6 + input_dropout: 0 \ No newline at end of file diff --git a/examples/data2vec/config/v2/base_text_only_task.yaml b/examples/data2vec/config/v2/base_text_only_task.yaml new file mode 100644 index 0000000000..62f22eb0fe --- /dev/null +++ b/examples/data2vec/config/v2/base_text_only_task.yaml @@ -0,0 +1,112 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 50000 + keep_interval_updates: 1 + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +task: + _name: masked_lm + data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin + sample_break_mode: none + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + include_index: True + skip_masking: True + d2v2_multi: True + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +dataset: + batch_size: 4 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + disable_validation: true + +optimization: + clip_norm: 1 + lr: [0.0002] + max_update: 1000000 + update_freq: [1] + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0002 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: 1 + + depth: 12 + embed_dim: 768 + clone_batch: 8 + + ema_decay: 0.9999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + ema_encoder_only: true + + average_top_k_layers: 12 + layer_norm_target_layer: false + instance_norm_target_layer: true + batch_norm_target_layer: false + instance_norm_targets: false + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: TEXT + + modalities: + text: + mask_prob: 0.48 + mask_length: 1 + mask_noise_std: 0.01 + prenet_depth: 0 + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 1 + decoder_kernel: 9 + decoder_layers: 5 + decoder_residual: false + projection_layers: 2 + projection_ratio: 2.0 diff --git 
a/examples/data2vec/config/v2/huge_images14_only_task.yaml b/examples/data2vec/config/v2/huge_images14_only_task.yaml new file mode 100644 index 0000000000..a8a15253f2 --- /dev/null +++ b/examples/data2vec/config/v2/huge_images14_only_task.yaml @@ -0,0 +1,122 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 8 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 32 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 500000 + lr: [ 0.0004 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 4e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 1 + ema_anneal_end_step: 300000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 32 + embed_dim: 1280 + num_heads: 16 + + average_top_k_layers: 24 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + patch_size: 14 + inverse_mask: true + mask_prob: 0.75 + mask_prob_adjust: 0.1 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 0 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + embed_dim: 1280 + decoder: + decoder_dim: 1024 + decoder_groups: 16 + decoder_kernel: 5 + decoder_layers: 3 + final_layer_norm: false + input_dropout: 0 \ No newline at end of file diff --git a/examples/data2vec/config/v2/huge_images_only_task.yaml b/examples/data2vec/config/v2/huge_images_only_task.yaml new file mode 100644 index 0000000000..7a352ac3c7 --- /dev/null +++ b/examples/data2vec/config/v2/huge_images_only_task.yaml @@ -0,0 +1,120 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 8 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + 
log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 375300 + lr: [ 0.0004 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 4e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 0.99995 + ema_anneal_end_step: 150000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 32 + embed_dim: 1280 + num_heads: 16 + + average_top_k_layers: 24 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + inverse_mask: true + mask_prob: 0.75 + mask_prob_adjust: 0.1 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 0 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + embed_dim: 1280 + decoder: + decoder_dim: 1024 + decoder_groups: 16 + decoder_kernel: 5 + decoder_layers: 3 + input_dropout: 0 \ No newline at end of file diff --git a/examples/data2vec/config/v2/large_audio_only_task.yaml b/examples/data2vec/config/v2/large_audio_only_task.yaml new file mode 100644 index 0000000000..3f61589721 --- /dev/null +++ b/examples/data2vec/config/v2/large_audio_only_task.yaml @@ -0,0 +1,122 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: /fsx-wav2vec/abaevski/data/librivox/no_silence + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + precompute_mask_config: {} + +dataset: + num_workers: 8 + max_tokens: 320000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 48 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 600000 + debug_param_names: true + clip_norm: 1 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0004 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 10000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: null + + depth: 16 + embed_dim: 1024 + num_heads: 16 + + clone_batch: 12 + + ema_decay: 0.9997 + ema_end_decay: 1 + ema_anneal_end_step: 300000 + ema_encoder_only: false + + average_top_k_layers: 16 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: AUDIO + + modalities: + audio: + feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' + conv_pos_depth: 5 + conv_pos_width: 95 + conv_pos_groups: 16 + prenet_depth: 8 + mask_prob: 0.55 
+ mask_prob_adjust: 0.1 + inverse_mask: false + mask_length: 5 + mask_noise_std: 0.01 + mask_dropout: 0 + add_masks: false + ema_local_encoder: false + use_alibi_encoder: true + prenet_layerdrop: 0 + prenet_dropout: 0.1 + learned_alibi_scale: true + learned_alibi_scale_per_head: true + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 16 + decoder_kernel: 7 + decoder_layers: 4 diff --git a/examples/data2vec/config/v2/large_images_only_task.yaml b/examples/data2vec/config/v2/large_images_only_task.yaml new file mode 100644 index 0000000000..6b957fc129 --- /dev/null +++ b/examples/data2vec/config/v2/large_images_only_task.yaml @@ -0,0 +1,120 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 8 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 375300 + lr: [ 0.0004 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 4e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 0.99999 + ema_anneal_end_step: 150000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 24 + embed_dim: 1024 + num_heads: 16 + + average_top_k_layers: 18 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + inverse_mask: true + mask_prob: 0.75 + mask_prob_adjust: 0.1 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 0 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + embed_dim: 1024 + decoder: + decoder_dim: 1024 + decoder_groups: 16 + decoder_kernel: 5 + decoder_layers: 3 + input_dropout: 0 \ No newline at end of file diff --git a/examples/data2vec/config/v2/large_text_only_task.yaml b/examples/data2vec/config/v2/large_text_only_task.yaml new file mode 100644 index 0000000000..fd69048e77 --- /dev/null +++ b/examples/data2vec/config/v2/large_text_only_task.yaml @@ -0,0 +1,112 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval_updates: 50000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: masked_lm + data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin + sample_break_mode: none + 
tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + include_index: True + skip_masking: True + d2v2_multi: True + +dataset: + batch_size: 2 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + disable_validation: true + +distributed_training: + distributed_world_size: 32 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 600000 + clip_norm: 1 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0001 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: 1 + + depth: 24 + num_heads: 16 + embed_dim: 1024 + clone_batch: 8 + + ema_decay: 0.9999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + ema_encoder_only: true + + average_top_k_layers: 24 + layer_norm_target_layer: true + instance_norm_target_layer: false + batch_norm_target_layer: false + instance_norm_targets: true + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: TEXT + + modalities: + text: + mask_prob: 0.5 + mask_length: 1 + mask_noise_std: 0.01 + prenet_depth: 0 + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 1 + decoder_kernel: 9 + decoder_layers: 5 + decoder_residual: false + projection_layers: 2 + projection_ratio: 2.0 diff --git a/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml b/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml new file mode 100644 index 0000000000..739e6f6724 --- /dev/null +++ b/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml @@ -0,0 +1,123 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 50000 + keep_interval_updates: 1 + +distributed_training: + distributed_world_size: 32 + ddp_backend: legacy_ddp + +task: + _name: masked_lm + data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin + sample_break_mode: none + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + include_index: True + skip_masking: True + d2v2_multi: True + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +dataset: + batch_size: 2 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + disable_validation: true + +optimization: + clip_norm: 1 + lr: [3e-4] + max_update: 1000000 + update_freq: [1] + +optimizer: + _name: composite + groups: + default: + lr_float: 1e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + decoder: + lr_float: 1e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 4 + loss_scale: 1 + + depth: 24 + num_heads: 16 + embed_dim: 1024 + clone_batch: 8 + + ema_decay: 0.9999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + ema_encoder_only: true + + average_top_k_layers: 
24 + layer_norm_target_layer: true + instance_norm_target_layer: false + batch_norm_target_layer: false + instance_norm_targets: true + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: TEXT + decoder_group: true + + modalities: + text: + mask_prob: 0.5 + mask_length: 1 + mask_noise_std: 0.01 + prenet_depth: 0 + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 1 + decoder_kernel: 9 + decoder_layers: 5 + decoder_residual: false + projection_layers: 2 + projection_ratio: 2.0 diff --git a/examples/data2vec/config/v2/run_config/local.yaml b/examples/data2vec/config/v2/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/data2vec/config/v2/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/data2vec/config/v2/run_config/slurm_1.yaml b/examples/data2vec/config/v2/run_config/slurm_1.yaml new file mode 100644 index 0000000000..732f018899 --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml b/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000..b2184f8cfa --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_2.yaml b/examples/data2vec/config/v2/run_config/slurm_2.yaml new file mode 100644 
index 0000000000..ec53dc2a98 --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_2_aws.yaml b/examples/data2vec/config/v2/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..553765597f --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_2_aws.yaml @@ -0,0 +1,39 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - model.model_path + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_3.yaml b/examples/data2vec/config/v2/run_config/slurm_3.yaml new file mode 100644 index 0000000000..14b47d14e6 --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_4.yaml 
b/examples/data2vec/config/v2/run_config/slurm_4.yaml new file mode 100644 index 0000000000..c54d735fb2 --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_4_aws.yaml b/examples/data2vec/config/v2/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000..a77f62aece --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_4_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_6_aws.yaml b/examples/data2vec/config/v2/run_config/slurm_6_aws.yaml new file mode 100644 index 0000000000..20e06582be --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_6_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 6 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_8.yaml 
b/examples/data2vec/config/v2/run_config/slurm_8.yaml new file mode 100644 index 0000000000..e3ec2c2847 --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_8.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/run_config/slurm_8_aws.yaml b/examples/data2vec/config/v2/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000..a9dce876cc --- /dev/null +++ b/examples/data2vec/config/v2/run_config/slurm_8_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/v2/text_finetuning/cola.yaml b/examples/data2vec/config/v2/text_finetuning/cola.yaml new file mode 100644 index 0000000000..d4ac4ec8b8 --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/cola.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: mcc + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + report_mcc: True + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/v2/text_finetuning/mnli.yaml b/examples/data2vec/config/v2/text_finetuning/mnli.yaml new file mode 100644 index 0000000000..1a9d6e52f0 --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/mnli.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + valid_subset: valid,valid1 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/v2/text_finetuning/mrpc.yaml b/examples/data2vec/config/v2/text_finetuning/mrpc.yaml new file mode 100644 index 0000000000..8f93d9d9ea --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/mrpc.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: acc_and_f1 + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + report_acc_and_f1: True + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 137 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2296 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? 
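Each of these GLUE text-finetuning configs pairs a fixed peak learning rate with a linear-warmup polynomial_decay schedule and a per-task warmup_updates / max_update budget. As a rough sketch of how such a schedule behaves (assuming the conventional definition of polynomial decay with power 1.0, i.e. linear warmup followed by linear decay to zero, and using the MRPC numbers above; this is an illustration only, not fairseq's scheduler code):

# Sketch of a linear-warmup + polynomial-decay LR schedule, using the MRPC
# budget above (peak_lr = 2e-05, warmup_updates = 137, max_update = 2296).
# Assumptions: decay power 1.0 and an end learning rate of 0 -- this is not
# copied from fairseq's polynomial_decay implementation.

def polynomial_decay_lr(step: int, peak_lr: float, warmup_updates: int,
                        total_updates: int, end_lr: float = 0.0,
                        power: float = 1.0) -> float:
    if warmup_updates > 0 and step < warmup_updates:
        # linear warmup from 0 to peak_lr
        return peak_lr * step / warmup_updates
    if step >= total_updates:
        return end_lr
    # polynomial (here linear) decay from peak_lr down to end_lr
    remaining = (total_updates - step) / (total_updates - warmup_updates)
    return end_lr + (peak_lr - end_lr) * remaining ** power

if __name__ == "__main__":
    for step in (0, 137, 1000, 2296):
        print(step, polynomial_decay_lr(step, 2e-05, 137, 2296))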
diff --git a/examples/data2vec/config/v2/text_finetuning/qnli.yaml b/examples/data2vec/config/v2/text_finetuning/qnli.yaml new file mode 100644 index 0000000000..739fb53b69 --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/qnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/v2/text_finetuning/qqp.yaml b/examples/data2vec/config/v2/text_finetuning/qqp.yaml new file mode 100644 index 0000000000..9accbaa521 --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/qqp.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: acc_and_f1 + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + report_acc_and_f1: True + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/v2/text_finetuning/rte.yaml b/examples/data2vec/config/v2/text_finetuning/rte.yaml new file mode 100644 index 0000000000..ea07764d98 --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/rte.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/v2/text_finetuning/run_config/local.yaml b/examples/data2vec/config/v2/text_finetuning/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/data2vec/config/v2/text_finetuning/sst_2.yaml b/examples/data2vec/config/v2/text_finetuning/sst_2.yaml new file mode 100644 index 0000000000..a273e5b943 --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/sst_2.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/v2/text_finetuning/sts_b.yaml b/examples/data2vec/config/v2/text_finetuning/sts_b.yaml new file mode 100644 index 0000000000..fb009ab95b --- /dev/null +++ b/examples/data2vec/config/v2/text_finetuning/sts_b.yaml @@ -0,0 +1,61 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: pearson_and_spearman + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + regression_target: true + report_pearson_and_spearman: True + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 214 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/examples/data2vec/config/vision/finetuning/imagenet.yaml b/examples/data2vec/config/vision/finetuning/imagenet.yaml new file mode 100644 index 0000000000..d6d4864cca --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/imagenet.yaml @@ -0,0 +1,52 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + +task: + _name: image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 64 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + valid_subset: val + +distributed_training: + distributed_world_size: 8 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 100000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +model: + _name: data2vec_image_classification + model_path: ??? diff --git a/examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml b/examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml new file mode 100644 index 0000000000..17d4c0a8f5 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml @@ -0,0 +1,65 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + +task: + _name: mae_image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 32 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + valid_subset: val + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 250200 + lr: [0.001] + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.001 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 16000 + min_lr: 1e-6 + + +lr_scheduler: pass_through + +model: + _name: mae_image_classification + mixup: 0.7 + mixup_prob: 0.9 + + model_path: ??? 
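The MAE ImageNet finetuning config above enables mixup (mixup: 0.7, mixup_prob: 0.9). Below is a minimal sketch of standard mixup augmentation (Zhang et al., 2018), under the assumption that mixup sets the Beta(alpha, alpha) concentration and mixup_prob the probability of applying it to a batch; it illustrates the idea only and is not the fairseq implementation.

import torch
import torch.nn.functional as F

def mixup_batch(x, y, alpha=0.7, prob=0.9):
    # Returns (possibly mixed) inputs plus the two label sets and the mixing weight.
    # Assumption: alpha corresponds to "mixup" and prob to "mixup_prob" above.
    if alpha <= 0 or torch.rand(()) > prob:
        return x, y, y, 1.0
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(x.size(0))
    mixed_x = lam * x + (1.0 - lam) * x[perm]
    return mixed_x, y, y[perm], lam

def mixup_loss(logits, y_a, y_b, lam):
    # Convex combination of the losses against both label sets.
    return lam * F.cross_entropy(logits, y_a) + (1.0 - lam) * F.cross_entropy(logits, y_b)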
diff --git a/examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml b/examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml new file mode 100644 index 0000000000..2d2eb57bac --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml @@ -0,0 +1,68 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + +task: + _name: mae_image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 32 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + valid_subset: val + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 125200 + lr: [0.0005] + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0005 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 16000 + min_lr: 1e-20 + + +lr_scheduler: pass_through + +model: + _name: mae_image_classification + mixup: 0.7 + mixup_prob: 0.9 + layer_decay: 0.75 + drop_path_rate: 0.2 + + model_path: ??? diff --git a/examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml b/examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml new file mode 100644 index 0000000000..3a9413cef6 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml @@ -0,0 +1,68 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + +task: + _name: mae_image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 32 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + valid_subset: val + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 125200 + lr: [0.0005] + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0005 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 16000 + min_lr: 1e-7 + + +lr_scheduler: pass_through + +model: + _name: mae_image_classification + mixup: 0.7 + mixup_prob: 0.9 + layer_decay: 0.75 + drop_path_rate: 0.2 + + model_path: ??? 
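The huge/large finetuning configs above additionally set layer_decay: 0.75, i.e. layer-wise learning-rate decay in the style popularized by BEiT/MAE finetuning: the classification head and the last transformer blocks train at the full rate while earlier blocks and the patch embedding get geometrically smaller rates. A rough sketch of how such per-layer multipliers are usually computed is shown below; this is an illustration under that assumption, not the exact parameter-grouping logic of fairseq's composite optimizer.

def layer_lr_scales(num_layers: int, layer_decay: float = 0.75):
    # Multipliers for [patch embedding, block_1, ..., block_N, head]:
    # the head (last entry) gets 1.0, earlier parameters decay geometrically.
    return [layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)]

scales = layer_lr_scales(num_layers=24, layer_decay=0.75)  # depth 24 for the large model
base_lr = 5e-4  # lr_float in the config above
per_layer_lr = [base_lr * s for s in scales]
print(f"embedding lr ~ {per_layer_lr[0]:.2e}, "
      f"last block lr ~ {per_layer_lr[-2]:.2e}, head lr ~ {per_layer_lr[-1]:.2e}")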
diff --git a/examples/data2vec/config/vision/finetuning/run_config/local.yaml b/examples/data2vec/config/vision/finetuning/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml new file mode 100644 index 0000000000..732f018899 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_1_aws.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000..e2bab5675a --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_1_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_2.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_2.yaml new file mode 100644 index 0000000000..c8b0f02a9b --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_2.yaml @@ -0,0 +1,38 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - 
model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - task.local_cache_path + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_2_aws.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..93d0d9c20a --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_2_aws.yaml @@ -0,0 +1,38 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - task.local_cache_path + - model.model_path + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_3.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_3.yaml new file mode 100644 index 0000000000..14b47d14e6 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml new file mode 100644 index 0000000000..c54d735fb2 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + 
override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_4_aws.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000..d5d11cb755 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_4_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_6_aws.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_6_aws.yaml new file mode 100644 index 0000000000..906f08a602 --- /dev/null +++ b/examples/data2vec/config/vision/finetuning/run_config/slurm_6_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 6 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/finetuning/run_config/slurm_8_aws.yaml b/examples/data2vec/config/vision/finetuning/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000..d60e13f8ba --- /dev/null +++ 
b/examples/data2vec/config/vision/finetuning/run_config/slurm_8_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/base_imagenet.yaml b/examples/data2vec/config/vision/pretraining/base_imagenet.yaml new file mode 100644 index 0000000000..9bfc0f32b6 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/base_imagenet.yaml @@ -0,0 +1,52 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: image_pretraining + data: /datasets01/imagenet_full_size/061417/ + +dataset: + num_workers: 6 + batch_size: 64 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + +optimization: + max_update: 400000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +model: + _name: data2vec_vision diff --git a/examples/data2vec/config/vision/pretraining/base_imagenet_d2v1.yaml b/examples/data2vec/config/vision/pretraining/base_imagenet_d2v1.yaml new file mode 100644 index 0000000000..5fd399b117 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/base_imagenet_d2v1.yaml @@ -0,0 +1,64 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: image_pretraining + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 128 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + +optimization: + max_update: 375300 #300*1251 + lr: [0.0005] + clip_norm: 3.0 + +optimizer: + _name: adam + adam_betas: (0.9,0.999) + adam_eps: 1e-08 + weight_decay: 0.05 + +lr_scheduler: + _name: cosine + warmup_updates: 12510 # it should be 10 epochs + +model: + _name: data2vec_vision + + attention_dropout: 0.05 + + ema_decay: 0.999 + ema_end_decay: 0.9998 + layer_norm_targets: True + average_top_k_layers: 6 + + loss_beta: 2.0 + + drop_path: 0.25 diff --git 
a/examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml b/examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml new file mode 100644 index 0000000000..d7872b5e04 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml @@ -0,0 +1,64 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + +dataset: + num_workers: 6 + batch_size: 64 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + +optimization: + max_update: 375300 + lr: [0.0006] + +optimizer: + _name: composite + groups: + with_decay: + lr_float: 6e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + no_decay: + lr_float: 6e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: mae diff --git a/examples/data2vec/config/vision/pretraining/run_config/local.yaml b/examples/data2vec/config/vision/pretraining/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_1.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_1.yaml new file mode 100644 index 0000000000..732f018899 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000..e2bab5675a --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + 
kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000..c8b0f02a9b --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,38 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - task.local_cache_path + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_2_aws.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..032e53a304 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - task.local_cache_path + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_3.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000..14b47d14e6 
--- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_4.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_4.yaml new file mode 100644 index 0000000000..c54d735fb2 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_4_aws.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000..d5d11cb755 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_4_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git 
a/examples/data2vec/config/vision/pretraining/run_config/slurm_6_aws.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_6_aws.yaml new file mode 100644 index 0000000000..906f08a602 --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_6_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 6 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/config/vision/pretraining/run_config/slurm_8_aws.yaml b/examples/data2vec/config/vision/pretraining/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000..d60e13f8ba --- /dev/null +++ b/examples/data2vec/config/vision/pretraining/run_config/slurm_8_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/data2vec/data/__init__.py b/examples/data2vec/data/__init__.py new file mode 100644 index 0000000000..d76112bfc2 --- /dev/null +++ b/examples/data2vec/data/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .image_dataset import ImageDataset +from .path_dataset import PathDataset +from .mae_image_dataset import MaeImageDataset +from .mae_finetuning_image_dataset import MaeFinetuningImageDataset + + +__all__ = [ + "ImageDataset", + "MaeImageDataset", + "MaeFinetuningImageDataset", + "PathDataset", +] \ No newline at end of file diff --git a/examples/data2vec/data/add_class_target_dataset.py b/examples/data2vec/data/add_class_target_dataset.py new file mode 100644 index 0000000000..c346c83e58 --- /dev/null +++ b/examples/data2vec/data/add_class_target_dataset.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from fairseq.data import BaseWrapperDataset, data_utils + + +class AddClassTargetDataset(BaseWrapperDataset): + def __init__( + self, + dataset, + labels, + multi_class, + num_classes=None, + label_indices=None, + add_to_input=True, + ): + super().__init__(dataset) + + self.label_indices = label_indices + self.labels = labels + self.multi_class = multi_class + self.add_to_input = add_to_input + if num_classes is None and multi_class: + assert self.label_indices is not None + num_classes = len(self.label_indices) + + self.num_classes = num_classes + + def __getitem__(self, index): + item = self.dataset[index] + item_labels = self.labels[index] + if self.multi_class: + item["label"] = torch.zeros(self.num_classes) + for il in item_labels: + if self.label_indices is not None: + il = self.label_indices[il] + item["label"][il] = 1.0 + else: + item["label"] = torch.tensor( + self.labels[index] + if self.label_indices is None + else self.label_indices[self.labels[index]] + ) + + return item + + def collater(self, samples): + collated = self.dataset.collater(samples) + if len(collated) == 0: + return collated + + indices = set(collated["id"].tolist()) + target = [s["label"] for s in samples if s["id"] in indices] + collated["label"] = torch.stack(target, dim=0) + + if self.add_to_input: + collated["net_input"]["label"] = collated["label"] + + return collated diff --git a/examples/data2vec/data/image_dataset.py b/examples/data2vec/data/image_dataset.py new file mode 100644 index 0000000000..7f551057e8 --- /dev/null +++ b/examples/data2vec/data/image_dataset.py @@ -0,0 +1,127 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
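+# Illustrative usage of ImageDataset (paths, extensions and indices below are placeholders):
+#
+#   dataset = ImageDataset(
+#       root="/path/to/images",        # directory tree of images, optionally one subfolder per class
+#       extensions={".jpg", ".png"},   # compared against the lowercased file extension
+#       load_classes=True,             # derive integer labels from the subfolder names
+#   )
+#   sample = dataset[0]                # {"id": 0, "img": <C x H x W tensor>, "label": <int>}
+#   # images must share a size (or pass a resizing `transform`) so that collation can stack them
+#   batch = dataset.collater([dataset[0], dataset[1]])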
+ + +import logging + +import numpy as np +import os +from typing import Optional, Callable, Set + +import torch + +from torchvision.datasets.vision import VisionDataset +from torchvision.transforms import ToTensor + +from fairseq.data import FairseqDataset + + +logger = logging.getLogger(__name__) + + +class ImageDataset(FairseqDataset, VisionDataset): + def __init__( + self, + root: str, + extensions: Set[str], + load_classes: bool, + transform: Optional[Callable] = None, + shuffle=True, + ): + FairseqDataset.__init__(self) + VisionDataset.__init__(self, root=root, transform=transform) + + self.shuffle = shuffle + self.tensor_transform = ToTensor() + + self.classes = None + self.labels = None + if load_classes: + classes = [d.name for d in os.scandir(root) if d.is_dir()] + classes.sort() + self.classes = {cls_name: i for i, cls_name in enumerate(classes)} + logger.info(f"loaded {len(self.classes)} classes") + self.labels = [] + + def walk_path(root_path): + for root, _, fnames in sorted(os.walk(root_path, followlinks=True)): + for fname in sorted(fnames): + fname_ext = os.path.splitext(fname) + if fname_ext[-1].lower() not in extensions: + continue + + path = os.path.join(root, fname) + yield path + + logger.info(f"finding images in {root}") + if self.classes is not None: + self.files = [] + self.labels = [] + for c, i in self.classes.items(): + for f in walk_path(os.path.join(root, c)): + self.files.append(f) + self.labels.append(i) + else: + self.files = [f for f in walk_path(root)] + + logger.info(f"loaded {len(self.files)} examples") + + def __getitem__(self, index): + from PIL import Image + + fpath = self.files[index] + + with open(fpath, "rb") as f: + img = Image.open(f).convert("RGB") + + if self.transform is None: + img = self.tensor_transform(img) + else: + img = self.transform(img) + assert torch.is_tensor(img) + + res = {"id": index, "img": img} + + if self.labels is not None: + res["label"] = self.labels[index] + + return res + + def __len__(self): + return len(self.files) + + def collater(self, samples): + if len(samples) == 0: + return {} + + collated_img = torch.stack([s["img"] for s in samples], dim=0) + + res = { + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": { + "img": collated_img, + }, + } + + if "label" in samples[0]: + res["net_input"]["label"] = torch.LongTensor([s["label"] for s in samples]) + + return res + + def num_tokens(self, index): + return 1 + + def size(self, index): + return 1 + + def ordered_indices(self): + """Return an ordered list of indices. Batches will be constructed based + on this order.""" + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + return order[0] diff --git a/examples/data2vec/data/mae_finetuning_image_dataset.py b/examples/data2vec/data/mae_finetuning_image_dataset.py new file mode 100644 index 0000000000..28cbcb38ac --- /dev/null +++ b/examples/data2vec/data/mae_finetuning_image_dataset.py @@ -0,0 +1,135 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
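+# Illustrative usage of MaeFinetuningImageDataset (the root/split layout follows torchvision's
+# ImageFolder convention; the path and values below are placeholders):
+#
+#   dataset = MaeFinetuningImageDataset(
+#       root="/path/to/imagenet", split="train", is_train=True, input_size=224,
+#   )
+#   batch = dataset.collater([dataset[0], dataset[1]])
+#   # -> {"id": LongTensor, "net_input": {"imgs": 2 x 3 x 224 x 224, "labels": LongTensor}}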
+ + +import logging + +import numpy as np +import os + +import torch + +from torchvision import datasets, transforms + +from timm.data import create_transform +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +import PIL + +from fairseq.data import FairseqDataset +from .mae_image_dataset import caching_loader + + +logger = logging.getLogger(__name__) + + +def build_transform(is_train, input_size, color_jitter, aa, reprob, remode, recount): + mean = IMAGENET_DEFAULT_MEAN + std = IMAGENET_DEFAULT_STD + # train transform + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=input_size, + is_training=True, + color_jitter=color_jitter, + auto_augment=aa, + interpolation="bicubic", + re_prob=reprob, + re_mode=remode, + re_count=recount, + mean=mean, + std=std, + ) + return transform + + # eval transform + t = [] + if input_size <= 224: + crop_pct = 224 / 256 + else: + crop_pct = 1.0 + size = int(input_size / crop_pct) + t.append( + transforms.Resize( + size, interpolation=PIL.Image.BICUBIC + ), # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(input_size)) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(mean, std)) + return transforms.Compose(t) + + +class MaeFinetuningImageDataset(FairseqDataset): + def __init__( + self, + root: str, + split: str, + is_train: bool, + input_size, + color_jitter=None, + aa="rand-m9-mstd0.5-inc1", + reprob=0.25, + remode="pixel", + recount=1, + local_cache_path=None, + shuffle=True, + ): + FairseqDataset.__init__(self) + + self.shuffle = shuffle + + transform = build_transform( + is_train, input_size, color_jitter, aa, reprob, remode, recount + ) + + path = os.path.join(root, split) + loader = caching_loader(local_cache_path, datasets.folder.default_loader) + + self.dataset = datasets.ImageFolder(path, loader=loader, transform=transform) + + logger.info(f"loaded {len(self.dataset)} examples") + + def __getitem__(self, index): + img, label = self.dataset[index] + return {"id": index, "img": img, "label": label} + + def __len__(self): + return len(self.dataset) + + def collater(self, samples): + if len(samples) == 0: + return {} + + collated_img = torch.stack([s["img"] for s in samples], dim=0) + + res = { + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": { + "imgs": collated_img, + }, + } + + if "label" in samples[0]: + res["net_input"]["labels"] = torch.LongTensor([s["label"] for s in samples]) + + return res + + def num_tokens(self, index): + return 1 + + def size(self, index): + return 1 + + def ordered_indices(self): + """Return an ordered list of indices. Batches will be constructed based + on this order.""" + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + return order[0] diff --git a/examples/data2vec/data/mae_image_dataset.py b/examples/data2vec/data/mae_image_dataset.py new file mode 100644 index 0000000000..4aacb94895 --- /dev/null +++ b/examples/data2vec/data/mae_image_dataset.py @@ -0,0 +1,418 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
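+# Illustrative usage of MaeImageDataset (path and option values below are placeholders):
+#
+#   dataset = MaeImageDataset(
+#       root="/path/to/imagenet", split="train", input_size=224,
+#       compute_mask=True, patch_size=16, mask_prob=0.75, mask_length=3, clone_batch=1,
+#   )
+#   item = dataset[0]   # {"id", "imgs", plus "precomputed_mask" since compute_mask=True}
+#   batch = dataset.collater([item, dataset[1]])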
+ + +from functools import partial +import logging +import math +import random +import time + +import numpy as np +import os + +import torch + +from torchvision import datasets, transforms +from .path_dataset import PathDataset + +from fairseq.data import FairseqDataset +from fairseq.data.data_utils import compute_block_mask_1d, compute_block_mask_2d + +from shutil import copyfile + +logger = logging.getLogger(__name__) + + +def load(path, loader, cache): + if hasattr(caching_loader, "cache_root"): + cache = caching_loader.cache_root + + cached_path = cache + path + + num_tries = 3 + for curr_try in range(num_tries): + try: + if curr_try == 2: + return loader(path) + if not os.path.exists(cached_path) or curr_try > 0: + os.makedirs(os.path.dirname(cached_path), exist_ok=True) + copyfile(path, cached_path) + os.chmod(cached_path, 0o777) + return loader(cached_path) + except Exception as e: + logger.warning(str(e)) + if "Errno 13" in str(e): + caching_loader.cache_root = f"/scratch/{random.randint(0, 69420)}" + logger.warning(f"setting cache root to {caching_loader.cache_root}") + cached_path = caching_loader.cache_root + path + if curr_try == (num_tries - 1): + raise + time.sleep(2) + + +def caching_loader(cache_root: str, loader): + if cache_root is None: + return loader + + if cache_root == "slurm_tmpdir": + cache_root = os.environ["SLURM_TMPDIR"] + assert len(cache_root) > 0 + + if not cache_root.endswith("/"): + cache_root += "/" + + return partial(load, loader=loader, cache=cache_root) + + +class RandomResizedCropAndInterpolationWithTwoPic: + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop + is finally resized to given size. + This is popularly used to train the Inception networks. + + Args: + size: expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__( + self, + size, + second_size=None, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation="bilinear", + second_interpolation="lanczos", + ): + if isinstance(size, tuple): + self.size = size + else: + self.size = (size, size) + if second_size is not None: + if isinstance(second_size, tuple): + self.second_size = second_size + else: + self.second_size = (second_size, second_size) + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + logger.warning("range should be of kind (min, max)") + + if interpolation == "random": + from PIL import Image + + self.interpolation = (Image.BILINEAR, Image.BICUBIC) + else: + self.interpolation = self._pil_interp(interpolation) + + self.second_interpolation = ( + self._pil_interp(second_interpolation) + if second_interpolation is not None + else None + ) + self.scale = scale + self.ratio = ratio + + def _pil_interp(self, method): + from PIL import Image + + if method == "bicubic": + return Image.BICUBIC + elif method == "lanczos": + return Image.LANCZOS + elif method == "hamming": + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? + return Image.BILINEAR + + @staticmethod + def get_params(img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image): Image to be cropped. 
+ scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img): + import torchvision.transforms.functional as F + + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Randomly cropped and resized image. + """ + i, j, h, w = self.get_params(img, self.scale, self.ratio) + if isinstance(self.interpolation, (tuple, list)): + interpolation = random.choice(self.interpolation) + else: + interpolation = self.interpolation + if self.second_size is None: + return F.resized_crop(img, i, j, h, w, self.size, interpolation) + else: + return F.resized_crop( + img, i, j, h, w, self.size, interpolation + ), F.resized_crop( + img, i, j, h, w, self.second_size, self.second_interpolation + ) + + +class MaeImageDataset(FairseqDataset): + def __init__( + self, + root: str, + split: str, + input_size, + local_cache_path=None, + shuffle=True, + key="imgs", + beit_transforms=False, + target_transform=False, + no_transform=False, + compute_mask=False, + patch_size: int = 16, + mask_prob: float = 0.75, + mask_prob_adjust: float = 0, + mask_length: int = 1, + inverse_mask: bool = False, + expand_adjacent: bool = False, + mask_dropout: float = 0, + non_overlapping: bool = False, + require_same_masks: bool = True, + clone_batch: int = 1, + dataset_type: str = "imagefolder", + ): + FairseqDataset.__init__(self) + + self.shuffle = shuffle + self.key = key + + loader = caching_loader(local_cache_path, datasets.folder.default_loader) + + self.transform_source = None + self.transform_target = None + + if target_transform: + self.transform_source = transforms.ColorJitter(0.4, 0.4, 0.4) + self.transform_target = transforms.ColorJitter(0.4, 0.4, 0.4) + + if no_transform: + if input_size <= 224: + crop_pct = 224 / 256 + else: + crop_pct = 1.0 + size = int(input_size / crop_pct) + + self.transform_train = transforms.Compose( + [ + transforms.Resize(size, interpolation=3), + transforms.CenterCrop(input_size), + ] + ) + + self.transform_train = transforms.Resize((input_size, input_size)) + elif beit_transforms: + beit_transform_list = [] + if not target_transform: + beit_transform_list.append(transforms.ColorJitter(0.4, 0.4, 0.4)) + beit_transform_list.extend( + [ + transforms.RandomHorizontalFlip(p=0.5), + RandomResizedCropAndInterpolationWithTwoPic( + size=input_size, + second_size=None, + interpolation="bicubic", + second_interpolation=None, + ), + ] + ) + self.transform_train = transforms.Compose(beit_transform_list) + else: + self.transform_train = 
transforms.Compose( + [ + transforms.RandomResizedCrop( + input_size, scale=(0.2, 1.0), interpolation=3 + ), # 3 is bicubic + transforms.RandomHorizontalFlip(), + ] + ) + self.final_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + if dataset_type == "imagefolder": + self.dataset = datasets.ImageFolder( + os.path.join(root, split), loader=loader + ) + elif dataset_type == "path": + self.dataset = PathDataset( + root, + loader, + None, + None, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + ) + else: + raise Exception(f"invalid dataset type {dataset_type}") + + logger.info( + f"initial transform: {self.transform_train}, " + f"source transform: {self.transform_source}, " + f"target transform: {self.transform_target}, " + f"final transform: {self.final_transform}" + ) + logger.info(f"loaded {len(self.dataset)} examples") + + self.is_compute_mask = compute_mask + self.patches = (input_size // patch_size) ** 2 + self.mask_prob = mask_prob + self.mask_prob_adjust = mask_prob_adjust + self.mask_length = mask_length + self.inverse_mask = inverse_mask + self.expand_adjacent = expand_adjacent + self.mask_dropout = mask_dropout + self.non_overlapping = non_overlapping + self.require_same_masks = require_same_masks + self.clone_batch = clone_batch + + def __getitem__(self, index): + img, _ = self.dataset[index] + + img = self.transform_train(img) + + source = None + target = None + if self.transform_source is not None: + source = self.final_transform(self.transform_source(img)) + if self.transform_target is not None: + target = self.final_transform(self.transform_target(img)) + + if source is None: + img = self.final_transform(img) + + v = {"id": index, self.key: source if source is not None else img} + if target is not None: + v["target"] = target + + if self.is_compute_mask: + if self.mask_length == 1: + mask = compute_block_mask_1d( + shape=(self.clone_batch, self.patches), + mask_prob=self.mask_prob, + mask_length=self.mask_length, + mask_prob_adjust=self.mask_prob_adjust, + inverse_mask=self.inverse_mask, + require_same_masks=True, + ) + else: + mask = compute_block_mask_2d( + shape=(self.clone_batch, self.patches), + mask_prob=self.mask_prob, + mask_length=self.mask_length, + mask_prob_adjust=self.mask_prob_adjust, + inverse_mask=self.inverse_mask, + require_same_masks=True, + expand_adjcent=self.expand_adjacent, + mask_dropout=self.mask_dropout, + non_overlapping=self.non_overlapping, + ) + + v["precomputed_mask"] = mask + + return v + + def __len__(self): + return len(self.dataset) + + def collater(self, samples): + if len(samples) == 0: + return {} + + collated_img = torch.stack([s[self.key] for s in samples], dim=0) + + res = { + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": { + self.key: collated_img, + }, + } + + if "target" in samples[0]: + collated_target = torch.stack([s["target"] for s in samples], dim=0) + res["net_input"]["target"] = collated_target + + if "precomputed_mask" in samples[0]: + collated_mask = torch.cat([s["precomputed_mask"] for s in samples], dim=0) + res["net_input"]["precomputed_mask"] = collated_mask + + return res + + def num_tokens(self, index): + return 1 + + def size(self, index): + return 1 + + @property + def sizes(self): + return np.full((len(self),), 1) + + def ordered_indices(self): + """Return an ordered list of indices. 
Batches will be constructed based + on this order.""" + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + return order[0] diff --git a/examples/data2vec/data/modality.py b/examples/data2vec/data/modality.py new file mode 100644 index 0000000000..aa23ac94f7 --- /dev/null +++ b/examples/data2vec/data/modality.py @@ -0,0 +1,14 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from enum import Enum, auto + + +class Modality(Enum): + AUDIO = auto() + IMAGE = auto() + TEXT = auto() diff --git a/examples/data2vec/data/path_dataset.py b/examples/data2vec/data/path_dataset.py new file mode 100644 index 0000000000..02010058e6 --- /dev/null +++ b/examples/data2vec/data/path_dataset.py @@ -0,0 +1,64 @@ +import glob +import os +from typing import List, Optional, Tuple + +import logging +import numpy as np +import torchvision.transforms.functional as TF +import PIL +from PIL import Image +from torchvision.datasets import VisionDataset + +logger = logging.getLogger(__name__) + + +class PathDataset(VisionDataset): + def __init__( + self, + root: List[str], + loader: None = None, + transform: Optional[str] = None, + extra_transform: Optional[str] = None, + mean: Optional[List[float]] = None, + std: Optional[List[float]] = None, + ): + super().__init__(root=root) + + PIL.Image.MAX_IMAGE_PIXELS = 256000001 + + self.files = [] + for folder in self.root: + self.files.extend( + sorted(glob.glob(os.path.join(folder, "**", "*.jpg"), recursive=True)) + ) + self.files.extend( + sorted(glob.glob(os.path.join(folder, "**", "*.png"), recursive=True)) + ) + + self.transform = transform + self.extra_transform = extra_transform + self.mean = mean + self.std = std + + self.loader = loader + + logger.info(f"loaded {len(self.files)} samples from {root}") + + assert (mean is None) == (std is None) + + def __len__(self) -> int: + return len(self.files) + + def __getitem__(self, idx) -> Tuple[np.ndarray, np.ndarray]: + path = self.files[idx] + + if self.loader is not None: + return self.loader(path), None + + img = Image.open(path).convert("RGB") + if self.transform is not None: + img = self.transform(img) + img = TF.to_tensor(img) + if self.mean is not None and self.std is not None: + img = TF.normalize(img, self.mean, self.std) + return img, None diff --git a/examples/data2vec/fb_convert_beit_cp.py b/examples/data2vec/fb_convert_beit_cp.py new file mode 100644 index 0000000000..cf42ace762 --- /dev/null +++ b/examples/data2vec/fb_convert_beit_cp.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
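+# Illustrative invocations (paths are placeholders; run from examples/data2vec so the local
+# `tasks` and `models` packages resolve):
+#
+#   python fb_convert_beit_cp.py /path/to/beit_checkpoint.pt --output /path/to/data2vec_vision.pt --type vision
+#   python fb_convert_beit_cp.py /path/to/beit_checkpoint.pt --output /path/to/d2v_classifier.pt \
+#       --type image_classification --inception_norms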
+ +import argparse +import torch + +from omegaconf import OmegaConf + +from fairseq.criterions.model_criterion import ModelCriterionConfig +from fairseq.dataclass.configs import FairseqConfig + +from tasks import ImageClassificationConfig, ImagePretrainingConfig +from models.data2vec_image_classification import ( + Data2VecImageClassificationConfig, + Data2VecImageClassificationModel, +) +from models.data2vec_vision import Data2VecVisionConfig, Data2VecVisionModel + + +def get_parser(): + parser = argparse.ArgumentParser( + description="convert beit checkpoint into data2vec - vision checkpoint" + ) + # fmt: off + parser.add_argument('checkpoint', help='checkpoint to convert') + parser.add_argument('--output', required=True, metavar='PATH', help='where to output converted checkpoint') + parser.add_argument('--type', type=str, choices=['vision', 'image_classification'], default='image_classification', help='type of model to upgrade') + parser.add_argument('--inception_norms', action='store_true', default=False) + # fmt: on + + return parser + + +def update_checkpoint(model_dict, prefix, is_nested): + + replace_paths = { + "cls_token": "model.cls_emb" if is_nested else "cls_emb", + "patch_embed": "model.patch_embed" if is_nested else "patch_embed", + "mask_token": "mask_emb", + } + + starts_with = { + "patch_embed.proj": "model.patch_embed.conv" + if is_nested + else "patch_embed.conv", + "lm_head": "final_proj", + "fc_norm": "fc_norm", + "head": "head", + } + + partial = { + "mlp.fc1": "mlp.0", + "mlp.fc2": "mlp.2", + } + + for k in list(model_dict.keys()): + for sw, r in starts_with.items(): + if k.startswith(sw): + replace_paths[k] = k.replace(sw, r) + for p, r in partial.items(): + if p in k: + replace_paths[k] = prefix + k.replace(p, r) + + if prefix != "": + for k in list(model_dict.keys()): + if k not in replace_paths: + replace_paths[k] = prefix + k + + for k in list(model_dict.keys()): + if k in replace_paths: + model_dict[replace_paths[k]] = model_dict[k] + if k != replace_paths[k]: + del model_dict[k] + + return model_dict + + +def main(): + parser = get_parser() + args = parser.parse_args() + + cp = torch.load(args.checkpoint, map_location="cpu") + + cfg = FairseqConfig( + criterion=ModelCriterionConfig(_name="model", log_keys=["correct"]), + ) + + if args.type == "image_classification": + + cfg.task = ImageClassificationConfig( + _name="image_classification", + data=".", + ) + + if args.inception_norms: + cfg.task.normalization_mean = [0.5, 0.5, 0.5] + cfg.task.normalization_std = [0.5, 0.5, 0.5] + + cfg.model = Data2VecImageClassificationConfig( + _name="data2vec_image_classification", + ) + cfg.model.pretrained_model_args = FairseqConfig( + model=Data2VecVisionConfig( + _name="data2vec_vision", shared_rel_pos_bias=False + ), + task=ImagePretrainingConfig( + _name="image_pretraining", + ), + ) + + cfg = OmegaConf.create(cfg) + + state = { + "cfg": OmegaConf.to_container(cfg, resolve=True, enum_to_str=True), + "model": cp["module"], + "best_loss": None, + "optimizer": None, + "extra_state": {}, + } + + model = Data2VecImageClassificationModel(cfg.model) + model.load_state_dict( + update_checkpoint(state["model"], prefix="model.encoder.", is_nested=True), + strict=True, + ) + elif args.type == "vision": + cfg.task = ImagePretrainingConfig( + _name="image_pretraining", + data=".", + ) + + if args.inception_norms: + cfg.task.normalization_mean = [0.5, 0.5, 0.5] + cfg.task.normalization_std = [0.5, 0.5, 0.5] + + cfg.model = Data2VecVisionConfig( + _name="data2vec_vision", + ) + cfg = 
OmegaConf.create(cfg) + + state = { + "cfg": OmegaConf.to_container(cfg, resolve=True, enum_to_str=True), + "model": cp["model"], + "best_loss": None, + "optimizer": None, + "extra_state": {}, + } + + model = Data2VecVisionModel(cfg.model) + model.load_state_dict( + update_checkpoint(state["model"], prefix="encoder.", is_nested=False), + strict=True, + ) + else: + raise Exception("unsupported type " + args.type) + + print(state["cfg"], state.keys()) + torch.save(state, args.output) + + +if __name__ == "__main__": + main() diff --git a/examples/data2vec/models/__init__.py b/examples/data2vec/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/data2vec/models/audio_classification.py b/examples/data2vec/models/audio_classification.py new file mode 100644 index 0000000000..06d2158267 --- /dev/null +++ b/examples/data2vec/models/audio_classification.py @@ -0,0 +1,614 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import logging +import re +from dataclasses import dataclass, field +from typing import Any, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from omegaconf import II, MISSING, open_dict + +from fairseq import checkpoint_utils, tasks +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models import ( + BaseFairseqModel, + register_model, +) +from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES +from fairseq.modules import TransposeLast +from fairseq.tasks import FairseqTask + +logger = logging.getLogger(__name__) + + +@dataclass +class AudioClassificationConfig(FairseqDataclass): + model_path: str = field( + default=MISSING, metadata={"help": "path to wav2vec 2.0 model"} + ) + no_pretrained_weights: bool = field( + default=False, metadata={"help": "if true, does not load pretrained weights"} + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + final_dropout: float = field( + default=0.0, + metadata={"help": "dropout after transformer and before final projection"}, + ) + dropout: float = field( + default=0.0, metadata={"help": "dropout probability inside wav2vec 2.0 model"} + ) + attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights inside wav2vec 2.0 model" + }, + ) + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN inside wav2vec 2.0 model" + }, + ) + + # masking + apply_mask: bool = field( + default=False, metadata={"help": "apply masking during fine-tuning"} + ) + mask_length: int = field( + default=10, metadata={"help": "repeat the mask indices multiple times"} + ) + mask_prob: float = field( + default=0.5, + metadata={ + "help": "probability of replacing a token with mask (normalized by length)" + }, + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose masks"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + mask_min_space: Optional[int] = 
field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + require_same_masks: bool = field( + default=True, + metadata={ + "help": "whether the number of masked timesteps must be the same across all " + "examples in a batch" + }, + ) + mask_dropout: float = field( + default=0.0, + metadata={"help": "percent of masks to unmask for each sample"}, + ) + + # channel masking + mask_channel_length: int = field( + default=10, metadata={"help": "length of the mask for features (channels)"} + ) + mask_channel_prob: float = field( + default=0.0, metadata={"help": "probability of replacing a feature with 0"} + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, metadata={"help": "whether to allow channel masks to overlap"} + ) + freeze_finetune_updates: int = field( + default=0, metadata={"help": "don't finetune wav2vec for this many updates"} + ) + feature_grad_mult: float = field( + default=0.0, metadata={"help": "reset feature grad mult in wav2vec 2.0 to this"} + ) + layerdrop: float = field( + default=0.0, metadata={"help": "probability of dropping a layer in wav2vec 2.0"} + ) + mask_channel_min_space: Optional[int] = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + mask_channel_before: bool = False + normalize: bool = II("task.normalize") + data: str = II("task.data") + # this holds the loaded wav2vec args + d2v_args: Any = None + offload_activations: bool = field( + default=False, metadata={"help": "offload_activations"} + ) + min_params_to_wrap: int = field( + default=int(1e8), + metadata={ + "help": "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed."
+ }, + ) + + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, + ) + ddp_backend: str = II("distributed_training.ddp_backend") + + prediction_mode: str = "lin_softmax" + eval_prediction_mode: Optional[str] = None + conv_kernel: int = -1 + conv_stride: int = 1 + two_convs: bool = False + extreme_factor: float = 1.0 + + conv_feature_layers: Optional[str] = field( + default=None, + metadata={ + "help": "string describing convolutional feature extraction layers in form of a python list that contains " + "[(dim, kernel_size, stride), ...]" + }, + ) + + mixup_prob: float = 1.0 + source_mixup: float = -1 + same_mixup: bool = True + label_mixup: bool = False + + gain_mode: str = "none" + + +@register_model("audio_classification", dataclass=AudioClassificationConfig) +class AudioClassificationModel(BaseFairseqModel): + def __init__(self, cfg: AudioClassificationConfig, num_classes): + super().__init__() + + self.apply_mask = cfg.apply_mask + self.cfg = cfg + + arg_overrides = { + "dropout": cfg.dropout, + "activation_dropout": cfg.activation_dropout, + "dropout_input": cfg.dropout_input, + "attention_dropout": cfg.attention_dropout, + "mask_length": cfg.mask_length, + "mask_prob": cfg.mask_prob, + "require_same_masks": getattr(cfg, "require_same_masks", True), + "mask_dropout": getattr(cfg, "mask_dropout", 0), + "mask_selection": cfg.mask_selection, + "mask_other": cfg.mask_other, + "no_mask_overlap": cfg.no_mask_overlap, + "mask_channel_length": cfg.mask_channel_length, + "mask_channel_prob": cfg.mask_channel_prob, + "mask_channel_before": cfg.mask_channel_before, + "mask_channel_selection": cfg.mask_channel_selection, + "mask_channel_other": cfg.mask_channel_other, + "no_mask_channel_overlap": cfg.no_mask_channel_overlap, + "encoder_layerdrop": cfg.layerdrop, + "feature_grad_mult": cfg.feature_grad_mult, + "checkpoint_activations": cfg.checkpoint_activations, + "offload_activations": cfg.offload_activations, + "min_params_to_wrap": cfg.min_params_to_wrap, + "mixup": -1, + } + + if cfg.conv_feature_layers is not None: + arg_overrides["conv_feature_layers"] = cfg.conv_feature_layers + + if cfg.d2v_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu( + cfg.model_path, arg_overrides + ) + d2v_args = state.get("cfg", None) + if d2v_args is None: + d2v_args = convert_namespace_to_omegaconf(state["args"]) + d2v_args.criterion = None + d2v_args.lr_scheduler = None + cfg.d2v_args = d2v_args + + logger.info(d2v_args) + + else: + state = None + d2v_args = cfg.d2v_args + + model_normalized = d2v_args.task.get( + "normalize", d2v_args.model.get("normalize", False) + ) + assert cfg.normalize == model_normalized, ( + "Fine-tuning works best when data normalization is the same. 
" + "Please check that --normalize is set or unset for both pre-training and here" + ) + + if hasattr(cfg, "checkpoint_activations") and cfg.checkpoint_activations: + with open_dict(d2v_args): + d2v_args.model.checkpoint_activations = cfg.checkpoint_activations + + d2v_args.task.data = cfg.data + task = tasks.setup_task(d2v_args.task) + model = task.build_model(d2v_args.model, from_checkpoint=True) + + model.remove_pretraining_modules() + + if state is not None and not cfg.no_pretrained_weights: + self.load_model_weights(state, model, cfg) + + d = d2v_args.model.encoder_embed_dim + + self.d2v_model = model + + self.final_dropout = nn.Dropout(cfg.final_dropout) + self.freeze_finetune_updates = cfg.freeze_finetune_updates + self.num_updates = 0 + + for p in self.parameters(): + p.param_group = "pretrained" + + if cfg.prediction_mode == "proj_avg_proj": + self.proj = nn.Linear(d, d * 2) + self.proj2 = nn.Linear(d * 2, num_classes) + + for p in self.proj.parameters(): + p.param_group = "projection" + for p in self.proj2.parameters(): + p.param_group = "projection" + elif self.cfg.prediction_mode == "summary_proj": + self.proj = nn.Linear(d // 3, num_classes) + for p in self.proj.parameters(): + p.param_group = "projection" + elif self.cfg.conv_kernel > 1 and not self.cfg.two_convs: + self.proj = nn.Sequential( + TransposeLast(), + nn.Conv1d(d, num_classes, kernel_size=self.cfg.conv_kernel, stride=self.cfg.conv_stride), + TransposeLast(), + ) + for p in self.proj.parameters(): + p.param_group = "projection" + elif self.cfg.conv_kernel > 0 and self.cfg.two_convs: + self.proj = nn.Sequential( + TransposeLast(), + nn.Conv1d(d, d, kernel_size=self.cfg.conv_kernel, stride=self.cfg.conv_stride), + TransposeLast(), + nn.GELU(), + nn.Linear(d, num_classes), + ) + for p in self.proj.parameters(): + p.param_group = "projection" + else: + self.proj = nn.Linear(d, num_classes) + for p in self.proj.parameters(): + p.param_group = "projection" + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: AudioClassificationConfig, task: FairseqTask): + """Build a new model instance.""" + + assert hasattr(task, "labels"), f"Task {task} must have an attribute 'labels'" + + return cls(cfg, len(task.labels)) + + def load_model_weights(self, state, model, cfg): + if cfg.ddp_backend == "fully_sharded": + from fairseq.distributed import FullyShardedDataParallel + + for name, module in model.named_modules(): + if "encoder.layers" in name and len(name.split(".")) == 3: + # Only for layers, we do a special handling and load the weights one by one + # We dont load all weights together as that wont be memory efficient and may + # cause oom + new_dict = { + k.replace(name + ".", ""): v + for (k, v) in state["model"].items() + if name + "." in k + } + assert isinstance(module, FullyShardedDataParallel) + with module.summon_full_params(): + module.load_state_dict(new_dict, strict=True) + module._reset_lazy_init() + + # Once layers are loaded, filter them out and load everything else. 
+ r = re.compile("encoder.layers.\d.") + filtered_list = list(filter(r.match, state["model"].keys())) + + new_big_dict = { + k: v for (k, v) in state["model"].items() if k not in filtered_list + } + + model.load_state_dict(new_big_dict, strict=False) + else: + if "_ema" in state["model"]: + del state["model"]["_ema"] + model.load_state_dict(state["model"], strict=False) + + def set_num_updates(self, num_updates): + """Set the number of parameters updates.""" + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def compute_gain(self, sound, fs=16_000, min_db=-80.0, mode="A_weighting"): + if fs == 16000: + n_fft = 2048 + elif fs == 44100: + n_fft = 4096 + else: + raise Exception("Invalid fs {}".format(fs)) + stride = n_fft // 2 + + def a_weight(fs, n_fft, min_db=-80.0): + freq = np.linspace(0, fs // 2, n_fft // 2 + 1) + freq_sq = np.power(freq, 2) + freq_sq[0] = 1.0 + weight = 2.0 + 20.0 * ( + 2 * np.log10(12194) + + 2 * np.log10(freq_sq) + - np.log10(freq_sq + 12194 ** 2) + - np.log10(freq_sq + 20.6 ** 2) + - 0.5 * np.log10(freq_sq + 107.7 ** 2) + - 0.5 * np.log10(freq_sq + 737.9 ** 2) + ) + weight = np.maximum(weight, min_db) + + return weight + + gain = [] + for i in range(0, len(sound) - n_fft + 1, stride): + if mode == "RMSE": + g = np.mean(sound[i : i + n_fft] ** 2) + elif mode == "A_weighting": + spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i : i + n_fft]) + power_spec = np.abs(spec) ** 2 + a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10) + g = np.sum(a_weighted_spec) + else: + raise Exception("Invalid mode {}".format(mode)) + gain.append(g) + + gain = np.array(gain) + gain = np.maximum(gain, np.power(10, min_db / 10)) + gain_db = 10 * np.log10(gain) + + return gain_db + + # adapted from https://github.com/mil-tokyo/bc_learning_sound/blob/master/utils.py + def compute_gain_torch(self, sound, fs=16_000, min_db=-80.0, mode="A_weighting"): + if fs == 16000: + n_fft = 2048 + elif fs == 44100: + n_fft = 4096 + else: + raise Exception("Invalid fs {}".format(fs)) + + if mode == "A_weighting": + if not hasattr(self, f"a_weight"): + self.a_weight = {} + + if fs not in self.a_weight: + + def a_weight(fs, n_fft, min_db=-80.0): + freq = np.linspace(0, fs // 2, n_fft // 2 + 1) + freq_sq = freq ** 2 + freq_sq[0] = 1.0 + weight = 2.0 + 20.0 * ( + 2 * np.log10(12194) + + 2 * np.log10(freq_sq) + - np.log10(freq_sq + 12194 ** 2) + - np.log10(freq_sq + 20.6 ** 2) + - 0.5 * np.log10(freq_sq + 107.7 ** 2) + - 0.5 * np.log10(freq_sq + 737.9 ** 2) + ) + weight = np.maximum(weight, min_db) + + return weight + + self.a_weight[fs] = torch.from_numpy( + np.power(10, a_weight(fs, n_fft, min_db) / 10) + ).to(device=sound.device) + + sound = sound.unfold(-1, n_fft, n_fft // 2) + + if mode == "RMSE": + sound = sound ** 2 + g = sound.mean(-1) + elif mode == "A_weighting": + w = torch.hann_window(n_fft, device=sound.device) * sound + spec = torch.fft.rfft(w) + power_spec = spec.abs() ** 2 + a_weighted_spec = power_spec * self.a_weight[fs] + g = a_weighted_spec.sum(-1) + else: + raise Exception("Invalid mode {}".format(mode)) + + gain = torch.maximum(g, torch.tensor(10 ** (min_db / 10), device=g.device)) + gain_db = 10 * torch.log10(gain) + + return gain_db + + def forward(self, source, padding_mask, label=None, **kwargs): + + if self.cfg.source_mixup >= 0 and self.training and self.cfg.mixup_prob > 0: + with torch.no_grad(): + mixed_source = source + mix_mask = None + if self.cfg.mixup_prob < 1: + mix_mask = ( + torch.empty((source.size(0),), device=source.device) 
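+ # per-sample Bernoulli draw: True marks the batch elements that will be mixed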
+ .bernoulli_(self.cfg.mixup_prob) + .bool() + ) + mixed_source = source[mix_mask] + + r = ( + torch.FloatTensor( + 1 if self.cfg.same_mixup else mixed_source.size(0) + ) + .uniform_(max(1e-6, self.cfg.source_mixup), 1) + .to(dtype=source.dtype, device=source.device) + ) + + mixup_perm = torch.randperm(source.size(0)) + s2 = source[mixup_perm] + + if self.cfg.gain_mode == "none": + p = r.unsqueeze(-1) + if mix_mask is not None: + s2 = s2[mix_mask] + else: + if self.cfg.gain_mode == "naive_rms": + G1 = source.pow(2).mean(dim=-1).sqrt() + else: + G1, _ = self.compute_gain_torch( + source, mode=self.cfg.gain_mode + ).max(-1) + G1 = G1.to(dtype=source.dtype) + + G2 = G1[mixup_perm] + + if mix_mask is not None: + G1 = G1[mix_mask] + G2 = G2[mix_mask] + s2 = s2[mix_mask] + + p = 1 / (1 + 10 ** ((G1 - G2) / 20) * (1 - r) / r) + p = p.unsqueeze(-1) + + mixed = (p * mixed_source) + (1 - p) * s2 + + if mix_mask is None: + source = mixed / torch.sqrt(p ** 2 + (1 - p) ** 2) + else: + source[mix_mask] = mixed / torch.sqrt(p ** 2 + (1 - p) ** 2) + + if label is not None and self.cfg.label_mixup: + r = r.unsqueeze(-1) + if mix_mask is None: + label = label * r + (1 - r) * label[mixup_perm] + else: + label[mix_mask] = ( + label[mix_mask] * r + (1 - r) * label[mixup_perm][mix_mask] + ) + + d2v_args = { + "source": source, + "padding_mask": padding_mask, + "mask": self.apply_mask and self.training, + } + + ft = self.freeze_finetune_updates <= self.num_updates + + with torch.no_grad() if not ft else contextlib.ExitStack(): + res = self.d2v_model.extract_features(**d2v_args) + + x = res["x"] + padding_mask = res["padding_mask"] + if padding_mask is not None: + x[padding_mask] = 0 + + x = self.final_dropout(x) + + if self.training or ( + self.cfg.eval_prediction_mode is None or self.cfg.eval_prediction_mode == "" + ): + prediction_mode = self.cfg.prediction_mode + else: + prediction_mode = self.cfg.eval_prediction_mode + + if prediction_mode == "average_before": + x = x.mean(dim=1) + + if prediction_mode != "summary_mha" and prediction_mode != "summary_proj" and prediction_mode != "cls": + x = self.proj(x) + + logits = True + if prediction_mode == "lin_softmax": + x = F.logsigmoid(x.float()) + x = torch.logsumexp(x + x, dim=1) - torch.logsumexp(x, dim=1) + x = x.clamp(max=0) + x = x - torch.log(-(torch.expm1(x))) + elif prediction_mode == "extremized_odds": + x = x.float().sum(dim=1) + x = x * self.cfg.extreme_factor + elif prediction_mode == "average_before": + x = x.float() + elif prediction_mode == "average": + x = x.float().mean(dim=1) + elif prediction_mode == "average_sigmoid": + x = torch.sigmoid(x.float()) + x = x.mean(dim=1) + logits = False + elif prediction_mode == "max": + x, _ = x.float().max(dim=1) + elif prediction_mode == "max_sigmoid": + x = torch.sigmoid(x.float()) + x, _ = x.float().max(dim=1) + logits = False + elif prediction_mode == "proj_avg_proj": + x = x.mean(dim=1) + x = self.proj2(x) + elif prediction_mode == "summary_mha" or prediction_mode == "summary_proj": + x = self.d2v_model.summary( + x, padding_mask, proj=prediction_mode == "summary_proj" + ) + x = x.type_as(source) + x = self.proj(x) + elif prediction_mode == "cls": + x = x[:,0] + x = self.proj(x) + else: + raise Exception(f"unknown prediction mode {prediction_mode}") + + if label is None: + return torch.sigmoid(x) if logits else x + + x = torch.nan_to_num(x) + + if logits: + loss = F.binary_cross_entropy_with_logits( + x, label.float(), reduction="none" + ) + else: + loss = F.binary_cross_entropy(x, label.float(), 
reduction="none") + + result = { + "losses": { + "main": loss, + }, + "sample_size": label.sum(), + } + + if not self.training: + result["_predictions"] = torch.sigmoid(x) if logits else x + result["_targets"] = label + + return result diff --git a/examples/data2vec/models/data2vec2.py b/examples/data2vec/models/data2vec2.py new file mode 100644 index 0000000000..0c61b37081 --- /dev/null +++ b/examples/data2vec/models/data2vec2.py @@ -0,0 +1,813 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import math +from dataclasses import dataclass, field +from typing import Optional, Callable +from functools import partial +import numpy as np + +from omegaconf import II + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist + +from fairseq.modules import EMAModule, EMAModuleConfig + +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model + +from examples.data2vec.data.modality import Modality + +from examples.data2vec.models.modalities.base import ( + MaskSeed, + D2vModalityConfig, + ModalitySpecificEncoder, + get_annealed_rate, +) +from examples.data2vec.models.modalities.modules import ( + D2vDecoderConfig, + AltBlock, + Decoder1d, +) + +from examples.data2vec.models.modalities.audio import ( + D2vAudioConfig, + AudioEncoder, +) +from examples.data2vec.models.modalities.images import ( + D2vImageConfig, + ImageEncoder, +) +from examples.data2vec.models.modalities.text import ( + D2vTextConfig, + TextEncoder, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class D2vModalitiesConfig(FairseqDataclass): + audio: D2vAudioConfig = D2vAudioConfig() + image: D2vImageConfig = D2vImageConfig() + text: D2vTextConfig = D2vTextConfig() + + +@dataclass +class Data2VecMultiConfig(FairseqDataclass): + + loss_beta: float = field( + default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} + ) + loss_scale: Optional[float] = field( + default=None, + metadata={ + "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" + }, + ) + + depth: int = 8 + start_drop_path_rate: float = 0 + end_drop_path_rate: float = 0 + num_heads: int = 12 + norm_eps: float = 1e-6 + norm_affine: bool = True + encoder_dropout: float = 0.1 + post_mlp_drop: float = 0.1 + attention_dropout: float = 0.1 + activation_dropout: float = 0.0 + dropout_input: float = 0.0 + layerdrop: float = 0.0 + embed_dim: int = 768 + mlp_ratio: float = 4 + layer_norm_first: bool = False + + average_top_k_layers: int = field( + default=8, metadata={"help": "how many layers to average"} + ) + + end_of_block_targets: bool = False + + clone_batch: int = 1 + + layer_norm_target_layer: bool = False + batch_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_same_dtype: bool = True + log_norms: bool = True + ema_end_decay: float = field( + default=0.9999, metadata={"help": "final ema decay rate"} + ) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_encoder_only: bool = field( + default=True, + metadata={ + "help": "whether to momentum update only the shared transformer encoder" + }, + ) + + max_update: int = II("optimization.max_update") + + modalities: D2vModalitiesConfig = D2vModalitiesConfig() + + shared_decoder: Optional[D2vDecoderConfig] = None + + min_target_var: float = field( + default=0.1, metadata={"help": "stop training if target var falls below this"} + ) + min_pred_var: float = field( + default=0.01, + metadata={"help": "stop training if prediction var falls below this"}, + ) + + supported_modality: Optional[Modality] = None + mae_init: bool = False + + seed: int = II("common.seed") + + skip_ema: bool = False + + cls_loss: float = 0 + recon_loss: float = 0 + d2v_loss: float = 1 + + decoder_group: bool = False + + +@register_model("data2vec_multi", dataclass=Data2VecMultiConfig) +class Data2VecMultiModel(BaseFairseqModel): + def make_modality_encoder( + self, + cfg: D2vModalityConfig, + embed_dim: int, + make_block: Callable[[float], nn.ModuleList], + norm_layer: Callable[[int], nn.LayerNorm], + layer_norm_first: bool, + alibi_biases, + task, + ) -> ModalitySpecificEncoder: + if cfg.type == Modality.AUDIO: + enc_cls = AudioEncoder + elif cfg.type == Modality.IMAGE: + enc_cls = ImageEncoder + elif cfg.type == Modality.TEXT: + enc_cls = TextEncoder + if hasattr(task, "text_task"): + task = task.text_task + else: + raise Exception(f"unsupported modality {cfg.type}") + + return enc_cls( + cfg, + embed_dim, + make_block, + norm_layer, + layer_norm_first, + alibi_biases, + task, + ) + + def __init__(self, cfg: Data2VecMultiConfig, modalities, skip_ema=False, task=None): + super().__init__() + self.cfg = cfg + self.modalities = modalities + self.task = task + + make_layer_norm = partial( + nn.LayerNorm, eps=cfg.norm_eps, elementwise_affine=cfg.norm_affine + ) + + def make_block(drop_path, dim=None, heads=None): + return AltBlock( + cfg.embed_dim if dim is None else dim, + cfg.num_heads if heads is None else heads, + cfg.mlp_ratio, + qkv_bias=True, + drop=cfg.encoder_dropout, + attn_drop=cfg.attention_dropout, + mlp_drop=cfg.activation_dropout, + post_mlp_drop=cfg.post_mlp_drop, + drop_path=drop_path, + norm_layer=make_layer_norm, + layer_norm_first=cfg.layer_norm_first, + ffn_targets=not cfg.end_of_block_targets, + ) + + self.alibi_biases = {} + self.modality_encoders 
= nn.ModuleDict() + for mod in self.modalities: + mod_cfg = getattr(cfg.modalities, mod.name.lower()) + enc = self.make_modality_encoder( + mod_cfg, + cfg.embed_dim, + make_block, + make_layer_norm, + cfg.layer_norm_first, + self.alibi_biases, + task, + ) + self.modality_encoders[mod.name] = enc + + self.ema = None + + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_beta = cfg.loss_beta + self.loss_scale = cfg.loss_scale + + self.dropout_input = nn.Dropout(cfg.dropout_input) + + dpr = np.linspace(cfg.start_drop_path_rate, cfg.end_drop_path_rate, cfg.depth) + + self.blocks = nn.ModuleList([make_block(dpr[i]) for i in range(cfg.depth)]) + + self.norm = None + if cfg.layer_norm_first: + self.norm = make_layer_norm(cfg.embed_dim) + + if self.cfg.mae_init: + self.apply(self._init_weights) + else: + from fairseq.modules.transformer_sentence_encoder import init_bert_params + + self.apply(init_bert_params) + + for mod_enc in self.modality_encoders.values(): + mod_enc.reset_parameters() + + if not skip_ema: + self.ema = self.make_ema_teacher(cfg.ema_decay) + self.shared_decoder = ( + Decoder1d(cfg.shared_decoder, cfg.embed_dim) + if self.cfg.shared_decoder is not None + else None + ) + if self.shared_decoder is not None: + self.shared_decoder.apply(self._init_weights) + + self.recon_proj = None + if cfg.recon_loss > 0: + self.recon_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim) + + for pn, p in self.named_parameters(): + if len(p.shape) == 1 or pn.endswith(".bias") or "alibi_scale" in pn: + p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} + if cfg.decoder_group and "decoder" in pn: + p.param_group = "decoder" + + self.num_updates = 0 + + def _init_weights(self, m): + + try: + from apex.normalization import FusedLayerNorm + + fn = FusedLayerNorm + except: + fn = nn.LayerNorm + + if isinstance(m, nn.Linear): + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, fn): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + if m.weight is not None: + nn.init.constant_(m.weight, 1.0) + + @torch.no_grad() + def make_ema_teacher(self, ema_decay): + ema_config = EMAModuleConfig( + ema_decay=ema_decay, + ema_fp32=True, + log_norms=self.cfg.log_norms, + add_missing_params=False, + ) + + model_copy = self.make_target_model() + + return EMAModule( + model_copy, + ema_config, + copy_model=False, + ) + + def make_target_model(self): + logger.info("making target model") + + model_copy = Data2VecMultiModel( + self.cfg, self.modalities, skip_ema=True, task=self.task + ) + + if self.cfg.ema_encoder_only: + model_copy = model_copy.blocks + for p_s, p_t in zip(self.blocks.parameters(), model_copy.parameters()): + p_t.data.copy_(p_s.data) + else: + for p_s, p_t in zip(self.parameters(), model_copy.parameters()): + p_t.data.copy_(p_s.data) + + for mod_enc in model_copy.modality_encoders.values(): + mod_enc.decoder = None + if not mod_enc.modality_cfg.ema_local_encoder: + mod_enc.local_encoder = None + mod_enc.project_features = None + + model_copy.requires_grad_(False) + return model_copy + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is not None and ( + (self.num_updates == 0 and num_updates > 1) + or self.num_updates >= num_updates + ): + pass + elif self.training and self.ema is not None: + ema_weight_decay = None + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: 
+ decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema.set_decay(decay, weight_decay=ema_weight_decay) + if self.ema.get_decay() < 1: + self.ema.step(self.blocks if self.cfg.ema_encoder_only else self) + + self.num_updates = num_updates + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + k = prefix + "_ema" + if self.ema is not None: + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + elif k in state_dict: + del state_dict[k] + + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + @classmethod + def build_model(cls, cfg: Data2VecMultiConfig, task=None): + """Build a new model instance.""" + if task is None or not hasattr(task, "supported_modalities"): + modalities = ( + [cfg.supported_modality] + if cfg.supported_modality is not None + else [ + Modality.AUDIO, + Modality.IMAGE, + Modality.TEXT, + ] + ) + else: + modalities = task.supported_modalities + return cls(cfg, modalities, task=task, skip_ema=cfg.skip_ema) + + def forward( + self, + source, + target=None, + id=None, + mode=None, + padding_mask=None, + mask=True, + features_only=False, + force_remove_masked=False, + remove_extra_tokens=True, + precomputed_mask=None, + ): + if mode is None: + assert self.cfg.supported_modality is not None + mode = self.cfg.supported_modality + + if isinstance(mode, Modality): + mode = mode.name + + feature_extractor = self.modality_encoders[mode] + + mask_seeds = None + if id is not None: + mask_seeds = MaskSeed(seed=self.cfg.seed, update=self.num_updates, ids=id) + + extractor_out = feature_extractor( + source, + padding_mask, + mask, + remove_masked=not features_only or force_remove_masked, + clone_batch=self.cfg.clone_batch if not features_only else 1, + mask_seeds=mask_seeds, + precomputed_mask=precomputed_mask, + ) + + x = extractor_out["x"] + encoder_mask = extractor_out["encoder_mask"] + masked_padding_mask = extractor_out["padding_mask"] + masked_alibi_bias = extractor_out.get("alibi_bias", None) + alibi_scale = extractor_out.get("alibi_scale", None) + + if self.dropout_input is not None: + x = self.dropout_input(x) + + layer_results = [] + for i, blk in enumerate(self.blocks): + if ( + not self.training + or self.cfg.layerdrop == 0 + or (np.random.random() > self.cfg.layerdrop) + ): + ab = masked_alibi_bias + if ab is not None and alibi_scale is not None: + scale = ( + alibi_scale[i] + if alibi_scale.size(0) > 1 + else alibi_scale.squeeze(0) + ) + ab = ab * scale.type_as(ab) + + x, lr = blk( + x, + padding_mask=masked_padding_mask, + alibi_bias=ab, + ) + if features_only: + layer_results.append(lr) + + if self.norm is not None: + x = self.norm(x) + + if features_only: + if remove_extra_tokens: + x = x[:, feature_extractor.modality_cfg.num_extra_tokens :] + if masked_padding_mask is not None: + masked_padding_mask = masked_padding_mask[ + :, feature_extractor.modality_cfg.num_extra_tokens : + ] + + return { + "x": x, + "padding_mask": masked_padding_mask, + "layer_results": layer_results, + "mask": encoder_mask, + } + + xs = [] + + if self.shared_decoder is not None: + dx = self.forward_decoder( + x, + feature_extractor, + self.shared_decoder, + encoder_mask, + ) + 
xs.append(dx) + if feature_extractor.decoder is not None: + dx = self.forward_decoder( + x, + feature_extractor, + feature_extractor.decoder, + encoder_mask, + ) + xs.append(dx) + orig_x = x + + assert len(xs) > 0 + + p = next(self.ema.model.parameters()) + device = x.device + dtype = x.dtype + ema_device = p.device + ema_dtype = p.dtype + + if not self.cfg.ema_same_dtype: + dtype = ema_dtype + + if ema_device != device or ema_dtype != dtype: + logger.info(f"adjusting ema dtype to {dtype} and device to {device}") + self.ema.model = self.ema.model.to(dtype=dtype, device=device) + ema_dtype = dtype + + def to_device(d): + for k, p in d.items(): + if isinstance(d[k], dict): + to_device(d[k]) + else: + d[k] = p.to(device=device) + + to_device(self.ema.fp32_params) + tm = self.ema.model + + with torch.no_grad(): + tm.eval() + + if self.cfg.ema_encoder_only: + assert target is None + ema_input = extractor_out["local_features"] + ema_input = feature_extractor.contextualized_features( + ema_input.to(dtype=ema_dtype), + padding_mask, + mask=False, + remove_masked=False, + ) + ema_blocks = tm + else: + ema_blocks = tm.blocks + if feature_extractor.modality_cfg.ema_local_encoder: + inp = ( + target.to(dtype=ema_dtype) + if target is not None + else source.to(dtype=ema_dtype) + ) + ema_input = tm.modality_encoders[mode]( + inp, + padding_mask, + mask=False, + remove_masked=False, + ) + else: + assert target is None + ema_input = extractor_out["local_features"] + ema_feature_enc = tm.modality_encoders[mode] + ema_input = ema_feature_enc.contextualized_features( + ema_input.to(dtype=ema_dtype), + padding_mask, + mask=False, + remove_masked=False, + ) + + ema_padding_mask = ema_input["padding_mask"] + ema_alibi_bias = ema_input.get("alibi_bias", None) + ema_alibi_scale = ema_input.get("alibi_scale", None) + ema_input = ema_input["x"] + + y = [] + ema_x = [] + extra_tokens = feature_extractor.modality_cfg.num_extra_tokens + for i, blk in enumerate(ema_blocks): + ab = ema_alibi_bias + if ab is not None and alibi_scale is not None: + scale = ( + ema_alibi_scale[i] + if ema_alibi_scale.size(0) > 1 + else ema_alibi_scale.squeeze(0) + ) + ab = ab * scale.type_as(ab) + + ema_input, lr = blk( + ema_input, + padding_mask=ema_padding_mask, + alibi_bias=ab, + ) + y.append(lr[:, extra_tokens:]) + ema_x.append(ema_input[:, extra_tokens:]) + + y = self.make_targets(y, self.average_top_k_layers) + orig_targets = y + + if self.cfg.clone_batch > 1: + y = y.repeat_interleave(self.cfg.clone_batch, 0) + + masked = encoder_mask.mask.unsqueeze(-1) + masked_b = encoder_mask.mask.bool() + y = y[masked_b] + + if xs[0].size(1) == masked_b.size(1): + xs = [x[masked_b] for x in xs] + else: + xs = [x.reshape(-1, x.size(-1)) for x in xs] + + sample_size = masked.sum().long() + + result = { + "losses": {}, + "sample_size": sample_size, + } + + sample_size = result["sample_size"] + + if self.cfg.cls_loss > 0: + assert extra_tokens > 0 + cls_target = orig_targets.mean(dim=1) + if self.cfg.clone_batch > 1: + cls_target = cls_target.repeat_interleave(self.cfg.clone_batch, 0) + cls_pred = x[:, extra_tokens - 1] + result["losses"]["cls"] = self.d2v_loss(cls_pred, cls_target) * ( + self.cfg.cls_loss * sample_size + ) + + if self.cfg.recon_loss > 0: + + with torch.no_grad(): + target = feature_extractor.patchify(source) + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.0e-6) ** 0.5 + + if self.cfg.clone_batch > 1: + target = target.repeat_interleave(self.cfg.clone_batch, 
0) + + if masked_b is not None: + target = target[masked_b] + + recon = xs[0] + if self.recon_proj is not None: + recon = self.recon_proj(recon) + + result["losses"]["recon"] = ( + self.d2v_loss(recon, target.float()) * self.cfg.recon_loss + ) + + if self.cfg.d2v_loss > 0: + for i, x in enumerate(xs): + reg_loss = self.d2v_loss(x, y) + n = f"{mode}_regression_{i}" if len(xs) > 1 else f"{mode}_regression" + result["losses"][n] = reg_loss * self.cfg.d2v_loss + + suffix = "" if len(self.modalities) == 1 else f"_{mode}" + with torch.no_grad(): + if encoder_mask is not None: + result["masked_pct"] = 1 - ( + encoder_mask.ids_keep.size(1) / encoder_mask.ids_restore.size(1) + ) + for i, x in enumerate(xs): + n = f"pred_var{suffix}_{i}" if len(xs) > 1 else f"pred_var{suffix}" + result[n] = self.compute_var(x.float()) + if self.ema is not None: + for k, v in self.ema.logs.items(): + result[k] = v + + y = y.float() + result[f"target_var{suffix}"] = self.compute_var(y) + + if self.num_updates > 5000: + if result[f"target_var{suffix}"] < self.cfg.min_target_var: + logger.error( + f"target var is {result[f'target_var{suffix}'].item()} < {self.cfg.min_target_var}, exiting ({mode})" + ) + raise Exception( + f"target var is {result[f'target_var{suffix}'].item()} < {self.cfg.min_target_var}, exiting ({mode})" + ) + + for k in result.keys(): + if k.startswith("pred_var") and result[k] < self.cfg.min_pred_var: + logger.error( + f"{k} is {result[k].item()} < {self.cfg.min_pred_var}, exiting ({mode})" + ) + raise Exception( + f"{k} is {result[k].item()} < {self.cfg.min_pred_var}, exiting ({mode})" + ) + + result["ema_decay"] = self.ema.get_decay() * 1000 + + return result + + def forward_decoder( + self, + x, + feature_extractor, + decoder, + mask_info, + ): + x = feature_extractor.decoder_input(x, mask_info) + x = decoder(*x) + + return x + + def d2v_loss(self, x, y): + x = x.view(-1, x.size(-1)).float() + y = y.view(-1, x.size(-1)) + + if self.loss_beta == 0: + loss = F.mse_loss(x, y, reduction="none") + else: + loss = F.smooth_l1_loss(x, y, reduction="none", beta=self.loss_beta) + + if self.loss_scale is not None: + scale = self.loss_scale + else: + scale = 1 / math.sqrt(x.size(-1)) + + reg_loss = loss * scale + + return reg_loss + + def make_targets(self, y, num_layers): + + with torch.no_grad(): + target_layer_results = y[-num_layers:] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + target_layer_results = [ + tl.transpose(1, 2) for tl in target_layer_results # BTC -> BCT + ] + permuted = True + if self.cfg.batch_norm_target_layer: + target_layer_results = [ + F.batch_norm( + tl.float(), running_mean=None, running_var=None, training=True + ) + for tl in target_layer_results + ] + if self.cfg.instance_norm_target_layer: + target_layer_results = [ + F.instance_norm(tl.float()) for tl in target_layer_results + ] + if permuted: + target_layer_results = [ + tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC + ] + if self.cfg.layer_norm_target_layer: + target_layer_results = [ + F.layer_norm(tl.float(), tl.shape[-1:]) + for tl in target_layer_results + ] + + y = target_layer_results[0].float() + for tl in target_layer_results[1:]: + y.add_(tl.float()) + y = y.div_(len(target_layer_results)) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y, y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) + + return y + + @staticmethod + def compute_var(y): + y = y.view(-1, y.size(-1)) + if 
dist.is_initialized(): + zc = torch.tensor(y.size(0)).cuda() + zs = y.sum(dim=0) + zss = (y**2).sum(dim=0) + + dist.all_reduce(zc) + dist.all_reduce(zs) + dist.all_reduce(zss) + + var = zss / (zc - 1) - (zs**2) / (zc * (zc - 1)) + return torch.sqrt(var + 1e-6).mean() + else: + return torch.sqrt(y.var(dim=0) + 1e-6).mean() + + def extract_features( + self, source, mode=None, padding_mask=None, mask=False, remove_extra_tokens=True + ): + res = self.forward( + source, + mode=mode, + padding_mask=padding_mask, + mask=mask, + features_only=True, + remove_extra_tokens=remove_extra_tokens, + ) + return res + + def remove_pretraining_modules(self, modality=None, keep_decoder=False): + self.ema = None + self.cfg.clone_batch = 1 + self.recon_proj = None + + if not keep_decoder: + self.shared_decoder = None + + modality = modality.lower() if modality is not None else None + for k in list(self.modality_encoders.keys()): + if modality is not None and k.lower() != modality: + del self.modality_encoders[k] + else: + self.modality_encoders[k].remove_pretraining_modules( + keep_decoder=keep_decoder + ) + if not keep_decoder: + self.modality_encoders[k].decoder = None diff --git a/examples/data2vec/models/data2vec_audio.py b/examples/data2vec/models/data2vec_audio.py new file mode 100644 index 0000000000..261c2f104c --- /dev/null +++ b/examples/data2vec/models/data2vec_audio.py @@ -0,0 +1,537 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import math +from dataclasses import dataclass, field +from typing import Optional + +from omegaconf import II + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist + +from fairseq.modules import EMAModule, EMAModuleConfig +from fairseq.data.data_utils import compute_mask_indices +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec import ( + ConvFeatureExtractionModel, + Wav2Vec2Config, + TransformerEncoder, +) +from fairseq.modules import ( + GradMultiply, + LayerNorm, +) +from fairseq.utils import index_put + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecAudioConfig(Wav2Vec2Config): + + loss_beta: float = field( + default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} + ) + loss_scale: Optional[float] = field( + default=None, + metadata={ + "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" + }, + ) + average_top_k_layers: int = field( + default=8, metadata={"help": "how many layers to average"} + ) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + batch_norm_target_layer: bool = False + group_norm_target_layer: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field( + default=0.9999, metadata={"help": "final ema decay rate"} + ) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer"}, + ) + ema_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + max_update: int = II("optimization.max_update") + + min_target_var: float = field( + default=0.1, metadata={"help": "stop training if target var falls below this"} + ) + min_pred_var: float = field( + default=0.01, + metadata={"help": "stop training if prediction var falls below this"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_audio", dataclass=Data2VecAudioConfig) +class Data2VecAudioModel(BaseFairseqModel): + def __init__(self, cfg: Data2VecAudioConfig): + super().__init__() + self.cfg = cfg + + feature_enc_layers = eval(cfg.conv_feature_layers) + self.extractor_embed = feature_enc_layers[-1][0] + + self.ema = None + self.embed = cfg.encoder_embed_dim + + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_beta = cfg.loss_beta + self.loss_scale = cfg.loss_scale + + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, + ) + + self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim) + + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space + + self.mask_channel_prob = cfg.mask_channel_prob + self.mask_channel_before = cfg.mask_channel_before + self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space + + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult + + self.mask_emb = nn.Parameter( + torch.FloatTensor(cfg.encoder_embed_dim).uniform_() + ) + + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.extractor_embed) + + self.final_proj = nn.Linear(self.embed, self.embed) + + self.num_updates = 0 + + def make_ema_teacher(self): + ema_config = EMAModuleConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_layers_only: + self.cfg.ema_transformer_only = True + for k, _ in self.encoder.pos_conv.named_parameters(): + skip_keys.add(f"pos_conv.{k}") + + self.ema = EMAModule( + self.encoder if 
self.cfg.ema_transformer_only else self, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.final_proj is not None: + logger.info(f"making ema teacher") + self.make_ema_teacher() + elif self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema.set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.encoder if self.cfg.ema_transformer_only else self) + + self.num_updates = num_updates + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + @classmethod + def build_model(cls, cfg: Data2VecAudioConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def apply_mask( + self, + x, + padding_mask, + mask_indices=None, + mask_channel_indices=None, + ): + B, T, C = x.shape + + if self.mask_channel_prob > 0 and self.mask_channel_before: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + if self.mask_prob > 0: + if mask_indices is None: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=1, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + require_same_masks=self.cfg.require_same_masks, + mask_dropout=self.cfg.mask_dropout, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x = index_put(x, mask_indices, self.mask_emb) + else: + mask_indices = None + + if self.mask_channel_prob > 0 and not self.mask_channel_before: + if mask_channel_indices is None: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x = index_put(x, mask_channel_indices, 0) + + return x, mask_indices + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + conv_cfg_list = eval(self.cfg.conv_feature_layers) + + for i in range(len(conv_cfg_list)): + input_lengths = _conv_out_length( + input_lengths, conv_cfg_list[i][1], 
conv_cfg_list[i][2] + ) + + return input_lengths.to(torch.long) + + def forward( + self, + source, + padding_mask=None, + mask=True, + features_only=False, + layer=None, + mask_indices=None, + mask_channel_indices=None, + padding_count=None, + ): + features = source + + if self.feature_grad_mult > 0: + features = self.feature_extractor(features) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: + with torch.no_grad(): + features = self.feature_extractor(features) + + features = features.transpose(1, 2) + + features = self.layer_norm(features) + + orig_padding_mask = padding_mask + + if padding_mask is not None and padding_mask.any(): + input_lengths = (1 - padding_mask.long()).sum(-1) + # apply conv formula to get real output_lengths + output_lengths = self._get_feat_extract_output_lengths(input_lengths) + + padding_mask = torch.zeros( + features.shape[:2], dtype=features.dtype, device=features.device + ) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() + else: + padding_mask = None + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + pre_encoder_features = None + if self.cfg.ema_transformer_only: + pre_encoder_features = features.clone() + + features = self.dropout_input(features) + + if mask: + x, mask_indices = self.apply_mask( + features, + padding_mask, + mask_indices=mask_indices, + mask_channel_indices=mask_channel_indices, + ) + else: + x = features + mask_indices = None + + x, layer_results = self.encoder( + x, + padding_mask=padding_mask, + layer=layer, + ) + + if features_only: + return { + "x": x, + "padding_mask": padding_mask, + "layer_results": layer_results, + } + + result = { + "losses": {}, + } + + with torch.no_grad(): + self.ema.model.eval() + + if self.cfg.ema_transformer_only: + y, layer_results = self.ema.model.extract_features( + pre_encoder_features, + padding_mask=padding_mask, + min_layer=self.cfg.encoder_layers - self.average_top_k_layers, + ) + y = { + "x": y, + "padding_mask": padding_mask, + "layer_results": layer_results, + } + else: + y = self.ema.model.extract_features( + source=source, + padding_mask=orig_padding_mask, + mask=False, + ) + + target_layer_results = [l[2] for l in y["layer_results"]] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + target_layer_results = [ + tl.permute(1, 2, 0) for tl in target_layer_results # TBC -> BCT + ] + permuted = True + + if self.cfg.batch_norm_target_layer: + target_layer_results = [ + F.batch_norm( + tl.float(), running_mean=None, running_var=None, training=True + ) + for tl in target_layer_results + ] + + if self.cfg.instance_norm_target_layer: + target_layer_results = [ + F.instance_norm(tl.float()) for tl in target_layer_results + ] + + if permuted: + target_layer_results = [ + tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC + ] + + if self.cfg.group_norm_target_layer: + target_layer_results = [ + F.layer_norm(tl.float(), tl.shape[-2:]) + for tl in target_layer_results + ] + + if self.cfg.layer_norm_target_layer: + target_layer_results = [ + F.layer_norm(tl.float(), tl.shape[-1:]) + for tl in target_layer_results + ] + + y = sum(target_layer_results) / len(target_layer_results) + + if 
self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) + + if not permuted: + y = y.transpose(0, 1) + + y = y[mask_indices] + + x = x[mask_indices] + x = self.final_proj(x) + + sz = x.size(-1) + + if self.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss( + x.float(), y.float(), reduction="none", beta=self.loss_beta + ).sum(dim=-1) + + if self.loss_scale is not None: + scale = self.loss_scale + else: + scale = 1 / math.sqrt(sz) + + result["losses"]["regression"] = loss.sum() * scale + + if "sample_size" not in result: + result["sample_size"] = loss.numel() + + with torch.no_grad(): + result["target_var"] = self.compute_var(y) + result["pred_var"] = self.compute_var(x.float()) + + if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var: + logger.error( + f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting" + ) + raise Exception( + f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting" + ) + if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var: + logger.error( + f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting" + ) + raise Exception( + f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting" + ) + + if self.ema is not None: + result["ema_decay"] = self.ema.get_decay() * 1000 + + return result + + @staticmethod + def compute_var(y): + y = y.view(-1, y.size(-1)) + if dist.is_initialized(): + zc = torch.tensor(y.size(0)).cuda() + zs = y.sum(dim=0) + zss = (y ** 2).sum(dim=0) + + dist.all_reduce(zc) + dist.all_reduce(zs) + dist.all_reduce(zss) + + var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) + return torch.sqrt(var + 1e-6).mean() + else: + return torch.sqrt(y.var(dim=0) + 1e-6).mean() + + def extract_features( + self, source, padding_mask, mask=False, layer=None + ): + res = self.forward( + source, + padding_mask, + mask=mask, + features_only=True, + layer=layer, + ) + return res + + def remove_pretraining_modules(self, last_layer=None): + self.final_proj = None + self.ema = None + if last_layer is not None: + self.encoder.layers = nn.ModuleList( + l for i, l in enumerate(self.encoder.layers) if i <= last_layer + ) diff --git a/examples/data2vec/models/data2vec_image_classification.py b/examples/data2vec/models/data2vec_image_classification.py new file mode 100644 index 0000000000..851c9ce455 --- /dev/null +++ b/examples/data2vec/models/data2vec_image_classification.py @@ -0,0 +1,143 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
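+#
+# Overview: fine-tuning wrapper for image classification on top of a pretrained data2vec
+# image model. It loads the checkpoint given by `model_path`, strips the pretraining-only
+# modules, and adds a LayerNorm (`fc_norm`) plus a linear head with `num_classes` outputs.
+# The first output token is dropped and the remaining patch features are mean-pooled before
+# the head. During training, mixup/cutmix from timm can be applied, in which case the loss
+# is a soft-target cross entropy; otherwise label-smoothed cross entropy is used.
+# forward(img) with no label returns the class logits; with a label it returns a dict of
+# losses (plus, in eval mode, the number of correct predictions).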
+ +# The code in this file is adapted from the BeiT implementation which can be found here: +# https://github.com/microsoft/unilm/tree/master/beit + +import logging + +from dataclasses import dataclass +from typing import Any + +from omegaconf import II, MISSING + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import checkpoint_utils, tasks + +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecImageClassificationConfig(FairseqDataclass): + model_path: str = MISSING + no_pretrained_weights: bool = False + num_classes: int = 1000 + mixup: float = 0.8 + cutmix: float = 1.0 + label_smoothing: float = 0.1 + + pretrained_model_args: Any = None + data: str = II("task.data") + + +@register_model( + "data2vec_image_classification", dataclass=Data2VecImageClassificationConfig +) +class Data2VecImageClassificationModel(BaseFairseqModel): + def __init__(self, cfg: Data2VecImageClassificationConfig): + super().__init__() + self.cfg = cfg + + if cfg.pretrained_model_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.model_path, {}) + pretrained_args = state.get("cfg", None) + pretrained_args.criterion = None + pretrained_args.lr_scheduler = None + cfg.pretrained_model_args = pretrained_args + + logger.info(pretrained_args) + else: + state = None + pretrained_args = cfg.pretrained_model_args + + pretrained_args.task.data = cfg.data + task = tasks.setup_task(pretrained_args.task) + model = task.build_model(pretrained_args.model, from_checkpoint=True) + + model.remove_pretraining_modules() + + self.model = model + + if state is not None and not cfg.no_pretrained_weights: + self.load_model_weights(state, model, cfg) + + self.fc_norm = nn.LayerNorm(pretrained_args.model.embed_dim) + self.head = nn.Linear(pretrained_args.model.embed_dim, cfg.num_classes) + + self.head.weight.data.mul_(1e-3) + self.head.bias.data.mul_(1e-3) + + self.mixup_fn = None + + if cfg.mixup > 0 or cfg.cutmix > 0: + from timm.data import Mixup + + self.mixup_fn = Mixup( + mixup_alpha=cfg.mixup, + cutmix_alpha=cfg.cutmix, + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode="batch", + label_smoothing=cfg.label_smoothing, + num_classes=cfg.num_classes, + ) + + def load_model_weights(self, state, model, cfg): + if "_ema" in state["model"]: + del state["model"]["_ema"] + model.load_state_dict(state["model"], strict=True) + + @classmethod + def build_model(cls, cfg: Data2VecImageClassificationConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def forward( + self, + img, + label=None, + ): + if self.training and self.mixup_fn is not None and label is not None: + img, label = self.mixup_fn(img, label) + + x = self.model(img, mask=False) + x = x[:, 1:] + x = self.fc_norm(x.mean(1)) + x = self.head(x) + + if label is None: + return x + + if self.training and self.mixup_fn is not None: + loss = -label * F.log_softmax(x.float(), dim=-1) + else: + loss = F.cross_entropy( + x.float(), + label, + label_smoothing=self.cfg.label_smoothing if self.training else 0, + reduction="none", + ) + + result = { + "losses": {"regression": loss}, + "sample_size": img.size(0), + } + + if not self.training: + with torch.no_grad(): + pred = x.argmax(-1) + correct = (pred == label).sum() + result["correct"] = correct + + return result diff --git a/examples/data2vec/models/data2vec_text.py b/examples/data2vec/models/data2vec_text.py new file mode 
100644 index 0000000000..cb3c8b383a --- /dev/null +++ b/examples/data2vec/models/data2vec_text.py @@ -0,0 +1,517 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field +from typing import Optional +import logging +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from omegaconf import II + +from fairseq.dataclass import FairseqDataclass +from fairseq.modules import EMAModule, EMAModuleConfig +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + register_model, +) +from fairseq.models.roberta.model import RobertaLMHead, RobertaClassificationHead +from fairseq.models.transformer import TransformerEncoder, TransformerConfig +from fairseq.modules.transformer_sentence_encoder import init_bert_params + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecTextConfig(FairseqDataclass): + max_positions: int = II("task.tokens_per_sample") + + head_layers: int = 1 + + transformer: TransformerConfig = TransformerConfig() + + load_checkpoint_heads: bool = field( + default=False, + metadata={"help": "(re-)register and load heads when loading checkpoints"}, + ) + + loss_beta: float = field( + default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} + ) + loss_scale: Optional[float] = field( + default=None, + metadata={ + "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)" + }, + ) + average_top_k_layers: int = field( + default=8, metadata={"help": "how many layers to average"} + ) + + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + batch_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field( + default=0.9999, metadata={"help": "final ema decay rate"} + ) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_layers_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_text", dataclass=Data2VecTextConfig) +class Data2VecTextModel(FairseqEncoderModel): + def __init__(self, cfg: Data2VecTextConfig, encoder): + super().__init__(encoder) + self.cfg = cfg + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @classmethod + def build_model(cls, cfg, task): + """Build a new model instance.""" + + encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) + + return cls(cfg, encoder) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + + res = self.encoder( + src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs + ) + + if isinstance(res, tuple): + x, extra = res + else: + return res + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra 
+ + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output[0].float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, dim=-1) + + def register_classification_head( + self, name, num_classes=None, inner_dim=None, **kwargs + ): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = RobertaClassificationHead( + input_dim=self.cfg.transformer.encoder.embed_dim, + inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, + num_classes=num_classes, + activation_fn="tanh", + pooler_dropout=0, + ) + + @property + def supported_targets(self): + return {"self"} + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." if name != "" else "" + + # rename decoder -> encoder before upgrading children modules + for k in list(state_dict.keys()): + if k.startswith(prefix + "decoder"): + new_k = prefix + "encoder" + k[len(prefix + "decoder") :] + state_dict[new_k] = state_dict[k] + del state_dict[k] + + # rename emb_layer_norm -> layernorm_embedding + for k in list(state_dict.keys()): + if ".emb_layer_norm." in k: + new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + + if self.encoder.regression_head is not None: + if ".lm_head." in k: + new_k = k.replace(".lm_head.", ".regression_head.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + else: + if ".regression_head." in k: + del state_dict[k] + + # upgrade children modules + super().upgrade_state_dict_named(state_dict, name) + + # Handle new classification heads present in the state dict. + current_head_names = ( + [] + if not hasattr(self, "classification_heads") + or self.classification_heads is None + else self.classification_heads.keys() + ) + keys_to_delete = [] + for k in state_dict.keys(): + if not k.startswith(prefix + "classification_heads."): + continue + + head_name = k[len(prefix + "classification_heads.") :].split(".")[0] + num_classes = state_dict[ + prefix + "classification_heads." + head_name + ".out_proj.weight" + ].size(0) + inner_dim = state_dict[ + prefix + "classification_heads." 
+ head_name + ".dense.weight" + ].size(0) + + if self.cfg.load_checkpoint_heads: + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + logger.warning( + "deleting classification head ({}) from checkpoint " + "not present in current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes + != self.classification_heads[head_name].out_proj.out_features + or inner_dim + != self.classification_heads[head_name].dense.out_features + ): + logger.warning( + "deleting classification head ({}) from checkpoint " + "with different dimensions than current model: {}".format( + head_name, k + ) + ) + keys_to_delete.append(k) + for k in keys_to_delete: + del state_dict[k] + + # Copy any newly-added classification heads into the state dict + # with their current weights. + if ( + hasattr(self, "classification_heads") + and self.classification_heads is not None + and len(self.classification_heads) > 0 + ): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + "classification_heads." + k not in state_dict: + logger.info("Overwriting " + prefix + "classification_heads." + k) + state_dict[prefix + "classification_heads." + k] = v + + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.lm_head.") or k.startswith( + prefix + "encoder.emb_head." + ): + del state_dict[k] + + self.encoder.lm_head = None + + if self.encoder.target_model is None: + for k in list(state_dict.keys()): + if k.startswith(prefix + "encoder.target_model."): + del state_dict[k] + + if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): + del state_dict[prefix + "encoder._ema"] + + def remove_pretraining_modules(self, last_layer=None): + self.encoder.lm_head = None + self.encoder.regression_head = None + self.encoder.ema = None + self.classification_heads = None + + if last_layer is not None: + self.encoder.sentence_encoder.layers = nn.ModuleList( + l + for i, l in enumerate(self.encoder.sentence_encoder.layers) + if i <= last_layer + ) + self.encoder.sentence_encoder.layer_norm = None + + +class Data2VecTextEncoder(FairseqEncoder): + def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): + super().__init__(dictionary) + + self.cfg = cfg + + embed_tokens = self.build_embedding( + len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad() + ) + + self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) + self.mask_idx = dictionary.index("") + assert self.mask_idx != dictionary.unk(), dictionary.symbols + + self.ema = None + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_scale = cfg.loss_scale + + assert self.cfg.head_layers >= 1 + + embed_dim = cfg.transformer.encoder.embed_dim + curr_dim = embed_dim + projs = [] + for i in range(self.cfg.head_layers - 1): + next_dim = embed_dim * 2 if i == 0 else curr_dim + projs.append(nn.Linear(curr_dim, next_dim)) + projs.append(nn.GELU()) + curr_dim = next_dim + + projs.append(nn.Linear(curr_dim, embed_dim)) + self.regression_head = nn.Sequential(*projs) + + self.num_updates = 0 + + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return nn.Embedding(vocab_size, embedding_dim, padding_idx) + + def build_encoder(self, cfg, dictionary, embed_tokens): + encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) + encoder.apply(init_bert_params) + return encoder + + def 
build_lm_head(self, embed_dim, output_dim, activation_fn, weight): + return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) + + def make_ema_teacher(self): + ema_config = EMAModuleConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + skip_keys = set() + if self.cfg.ema_transformer_layers_only: + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_tokens.{k}") + for k, _ in self.sentence_encoder.embed_positions.named_parameters(): + skip_keys.add(f"embed_positions.{k}") + if self.sentence_encoder.layernorm_embedding is not None: + for ( + k, + _, + ) in self.sentence_encoder.layernorm_embedding.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + if self.sentence_encoder.layer_norm is not None: + for k, _ in self.sentence_encoder.layer_norm.named_parameters(): + skip_keys.add(f"layernorm_embedding.{k}") + + self.ema = EMAModule( + self.sentence_encoder, + ema_config, + skip_keys=skip_keys, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.regression_head is not None: + logger.info(f"making ema teacher") + self.make_ema_teacher() + elif self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema.set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.sentence_encoder) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + def forward( + self, + src_tokens, + target_tokens=None, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + **unused, + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape + `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' + is a list of hidden states. Note that the hidden + states have shape `(src_len, batch, vocab)`. 
+ """ + + x, extra = self.extract_features( + src_tokens, return_all_hiddens=return_all_hiddens + ) + + if features_only: + return x, extra + + assert target_tokens is not None + + with torch.no_grad(): + # use EMA parameter as the teacher + self.ema.model.eval() + + encoder_out = self.ema.model( + target_tokens, + return_all_hiddens=True, + ) + y = encoder_out["fc_results"] + + y = y[-self.average_top_k_layers :] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + y = [ + F.batch_norm( + tl.float(), running_mean=None, running_var=None, training=True + ) + for tl in y + ] + + if self.cfg.instance_norm_target_layer: + y = [F.instance_norm(tl.float()) for tl in y] + + if permuted: + y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC + + if self.cfg.layer_norm_target_layer: + y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] + + y = sum(y) / len(y) + + if not permuted: + y = y.transpose(0, 1) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) + + masked_indices = src_tokens.eq(self.mask_idx) + + x = x[masked_indices] + y = y[masked_indices] + + x = self.regression_head(x) + + sz = x.size(-1) + if self.cfg.loss_beta == 0: + loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss( + x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta + ).sum(dim=-1) + + result = { + "losses": { + "main": loss.sum() / math.sqrt(sz) + if self.loss_scale <= 0 + else loss.sum() * self.loss_scale, + }, + "sample_size": loss.numel(), + } + + # logging other values + other_logs = { + "ema_decay": self.ema.get_decay() * 1000 + } + result["logs"] = other_logs + return result + + def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): + encoder_out = self.sentence_encoder( + src_tokens, + return_all_hiddens=return_all_hiddens, + token_embeddings=kwargs.get("token_embeddings", None), + ) + # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, { + "inner_states": inner_states, + "encoder_embedding": encoder_out["encoder_embedding"][0], + } + + def output_layer(self, features, masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.cfg.max_positions diff --git a/examples/data2vec/models/data2vec_text_classification.py b/examples/data2vec/models/data2vec_text_classification.py new file mode 100644 index 0000000000..e787b916dc --- /dev/null +++ b/examples/data2vec/models/data2vec_text_classification.py @@ -0,0 +1,141 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +# The code in this file is adapted from the BeiT implementation which can be found here: +# https://github.com/microsoft/unilm/tree/master/beit + +import logging + +from dataclasses import dataclass +from typing import Any + +from omegaconf import II, MISSING + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import checkpoint_utils, tasks + +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.roberta.model import RobertaClassificationHead + +from examples.data2vec.data.modality import Modality + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecTextClassificationConfig(FairseqDataclass): + pooler_dropout: float = 0.0 + pooler_activation_fn: str = "tanh" + quant_noise_pq: int = 0 + quant_noise_pq_block_size: int = 8 + spectral_norm_classification_head: bool = False + + model_path: str = MISSING + no_pretrained_weights: bool = False + + pretrained_model_args: Any = None + + +@register_model( + "data2vec_text_classification", dataclass=Data2VecTextClassificationConfig +) +class Data2VecTextClassificationModel(BaseFairseqModel): + def __init__(self, cfg: Data2VecTextClassificationConfig): + super().__init__() + self.cfg = cfg + + if cfg.pretrained_model_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.model_path, {}) + pretrained_args = state.get("cfg", None) + pretrained_args.criterion = None + pretrained_args.lr_scheduler = None + cfg.pretrained_model_args = pretrained_args + + logger.info(pretrained_args) + else: + state = None + pretrained_args = cfg.pretrained_model_args + + task = tasks.setup_task(pretrained_args.task) + model = task.build_model(pretrained_args.model, from_checkpoint=True) + + model.remove_pretraining_modules() + + self.model = model + + if state is not None and not cfg.no_pretrained_weights: + self.load_model_weights(state, model, cfg) + + self.classification_heads = nn.ModuleDict() + + + def load_model_weights(self, state, model, cfg): + for k in list(state["model"].keys()): + if ( + k.startswith("shared_decoder") or + k.startswith("_ema") or + "decoder" in k + ): + logger.info(f"Deleting {k} from checkpoint") + del state["model"][k] + model.load_state_dict(state["model"], strict=True) + + @classmethod + def build_model(cls, cfg: Data2VecTextClassificationConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def register_classification_head( + self, name, num_classes=None, inner_dim=None, **kwargs + ): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + embed_dim = self.cfg.pretrained_model_args.model.embed_dim + self.classification_heads[name] = RobertaClassificationHead( + input_dim=embed_dim, + inner_dim=inner_dim or embed_dim, + num_classes=num_classes, + activation_fn=self.cfg.pooler_activation_fn, + pooler_dropout=self.cfg.pooler_dropout, + q_noise=self.cfg.quant_noise_pq, + qn_block_size=self.cfg.quant_noise_pq_block_size, + do_spectral_norm=self.cfg.spectral_norm_classification_head, + ) + + def forward( + self, + source, + id, + padding_mask, + 
features_only=True, + remove_extra_tokens=True, + classification_head_name=None, + ): + encoder_out = self.model( + source, + id=id, + mode=Modality.TEXT, + padding_mask=padding_mask, + mask=False, + features_only=features_only, + remove_extra_tokens=remove_extra_tokens + ) + logits = self.classification_heads[classification_head_name](encoder_out["x"]) + return logits, encoder_out diff --git a/examples/data2vec/models/data2vec_vision.py b/examples/data2vec/models/data2vec_vision.py new file mode 100644 index 0000000000..2f89894429 --- /dev/null +++ b/examples/data2vec/models/data2vec_vision.py @@ -0,0 +1,727 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# The code in this file is adapted from the BeiT implementation which can be found here: +# https://github.com/microsoft/unilm/tree/master/beit + +import logging +import math +import numpy as np +import random + +from dataclasses import dataclass, field +from typing import Optional + +from omegaconf import II + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist + +from fairseq.modules import EMAModule, EMAModuleConfig +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model + + +logger = logging.getLogger(__name__) + + +@dataclass +class Data2VecVisionConfig(FairseqDataclass): + layer_scale_init_value: float = field( + default=1e-4, metadata={"help": "rescale layer outputs, 0 to disable"} + ) + num_mask_patches: int = field( + default=75, + metadata={"help": "number of the visual tokens/patches need be masked"}, + ) + min_mask_patches_per_block: int = 16 + max_mask_patches_per_block: int = 196 + image_size: int = 224 + patch_size: int = 16 + in_channels: int = 3 + + shared_rel_pos_bias: bool = True + + drop_path: float = 0.1 + attention_dropout: float = 0.0 + + depth: int = 12 + embed_dim: int = 768 + num_heads: int = 12 + mlp_ratio: int = 4 + + loss_beta: float = field( + default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} + ) + loss_scale: Optional[float] = field( + default=None, + metadata={ + "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" + }, + ) + average_top_k_layers: int = field( + default=8, metadata={"help": "how many layers to average"} + ) + + end_of_block_targets: bool = True + layer_norm_target_layer: bool = False + instance_norm_target_layer: bool = False + batch_norm_target_layer: bool = False + instance_norm_targets: bool = False + layer_norm_targets: bool = False + + ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) + ema_end_decay: float = field( + default=0.9999, metadata={"help": "final ema decay rate"} + ) + + # when to finish annealing ema decay rate + ema_anneal_end_step: int = II("optimization.max_update") + + ema_transformer_only: bool = field( + default=True, + metadata={"help": "whether to momentum update only the transformer layers"}, + ) + + +def get_annealed_rate(start, end, curr_step, total_steps): + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +@register_model("data2vec_vision", dataclass=Data2VecVisionConfig) +class Data2VecVisionModel(BaseFairseqModel): + def __init__(self, cfg: Data2VecVisionConfig): + super().__init__() + self.cfg = cfg + + self.ema = None + + self.average_top_k_layers = cfg.average_top_k_layers + self.loss_beta = cfg.loss_beta + self.loss_scale = ( + cfg.loss_scale + if cfg.loss_scale is not None + else 1 / math.sqrt(cfg.embed_dim) + ) + + self.patch_embed = PatchEmbed( + img_size=cfg.image_size, + patch_size=cfg.patch_size, + in_chans=cfg.in_channels, + embed_dim=cfg.embed_dim, + ) + + patch_size = self.patch_embed.patch_size + self.window_size = ( + cfg.image_size // patch_size[0], + cfg.image_size // patch_size[1], + ) + + self.cls_emb = nn.Parameter(torch.FloatTensor(1, 1, cfg.embed_dim)) + self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, cfg.embed_dim)) + + nn.init.trunc_normal_(self.cls_emb, 0.02) + nn.init.trunc_normal_(self.mask_emb, 0.02) + + self.encoder = TransformerEncoder(cfg, self.patch_embed.patch_shape) + + self.final_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim) + self.num_updates = 0 + + def make_ema_teacher(self): + ema_config = EMAModuleConfig( + ema_decay=self.cfg.ema_decay, + ema_fp32=True, + ) + self.ema = EMAModule( + self.encoder if self.cfg.ema_transformer_only else self, + ema_config, + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + + if self.ema is None and self.final_proj is not None: + logger.info(f"making ema teacher") + self.make_ema_teacher() + elif self.training and self.ema is not None: + if self.cfg.ema_decay != self.cfg.ema_end_decay: + if num_updates >= self.cfg.ema_anneal_end_step: + decay = self.cfg.ema_end_decay + else: + decay = get_annealed_rate( + self.cfg.ema_decay, + self.cfg.ema_end_decay, + num_updates, + self.cfg.ema_anneal_end_step, + ) + self.ema.set_decay(decay) + if self.ema.get_decay() < 1: + self.ema.step(self.encoder if self.cfg.ema_transformer_only else self) + + self.num_updates = num_updates + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = super().state_dict(destination, prefix, keep_vars) + + if self.ema is not None: + state[prefix + "_ema"] = self.ema.fp32_params + + return state + + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + if self.ema is not None: + k = prefix + "_ema" + assert k in state_dict + self.ema.restore(state_dict[k], True) + del state_dict[k] + return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + + @classmethod + def build_model(cls, cfg: 
Data2VecVisionConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def make_mask(self, bsz, num_masks, min_masks, max_masks): + height, width = self.window_size + + masks = np.zeros(shape=(bsz, height, width), dtype=np.int) + + for i in range(bsz): + mask = masks[i] + mask_count = 0 + + min_aspect = 0.3 + max_aspect = 1 / min_aspect + log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def _mask(mask, max_mask_patches): + delta = 0 + for attempt in range(10): + target_area = random.uniform(min_masks, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < width and h < height: + top = random.randint(0, height - h) + left = random.randint(0, width - w) + + num_masked = mask[top : top + h, left : left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + while mask_count < num_masks: + max_mask_patches = min(num_masks - mask_count, max_masks) + + delta = _mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return torch.from_numpy(masks) + + def forward( + self, + img, + mask: bool = True, + layer_results: bool = False, + ): + x = self.patch_embed(img) + batch_size, seq_len, _ = x.size() + + if mask: + mask_indices = self.make_mask( + img.size(0), + self.cfg.num_mask_patches, + self.cfg.min_mask_patches_per_block, + self.cfg.max_mask_patches_per_block, + ) + bool_mask = mask_indices.view(mask_indices.size(0), -1).bool() + else: + mask_indices = bool_mask = None + + cls_tokens = self.cls_emb.expand(batch_size, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + if self.ema is not None: + with torch.no_grad(): + self.ema.model.eval() + + if self.cfg.ema_transformer_only: + y = self.ema.model( + x, + layer_results="end" if self.cfg.end_of_block_targets else "fc", + ) + else: + y = self.ema.model( + img, + mask=False, + layer_results=True, + ) + + y = y[-self.cfg.average_top_k_layers :] + + permuted = False + if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: + y = [tl.transpose(1, 2) for tl in y] # BTC -> BCT + permuted = True + + if self.cfg.batch_norm_target_layer: + y = [ + F.batch_norm( + tl.float(), running_mean=None, running_var=None, training=True + ) + for tl in y + ] + + if self.cfg.instance_norm_target_layer: + y = [F.instance_norm(tl.float()) for tl in y] + + if permuted: + y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC + + if self.cfg.layer_norm_target_layer: + y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] + + y = sum(y) / len(y) + + if self.cfg.layer_norm_targets: + y = F.layer_norm(y.float(), y.shape[-1:]) + + if self.cfg.instance_norm_targets: + y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) + + y = y[bool_mask].float() + + if mask_indices is not None: + mask_token = self.mask_emb.expand(batch_size, seq_len, -1) + w = mask_indices.view(mask_indices.size(0), -1, 1).type_as(mask_token) + x[:, 1:] = x[:, 1:] * (1 - w) + mask_token * w + + if layer_results: + enc_layer_results = "end" if self.cfg.end_of_block_targets else "fc" + else: + enc_layer_results = None + + x = self.encoder(x, layer_results=enc_layer_results) + if layer_results or mask_indices is None: + return x + + x = x[bool_mask].float() + + if self.loss_beta == 0: + loss 
= F.mse_loss(x, y, reduction="none").sum(dim=-1) + else: + loss = F.smooth_l1_loss(x, y, reduction="none", beta=self.loss_beta).sum( + dim=-1 + ) + + if self.loss_scale > 0: + loss = loss * self.loss_scale + + result = { + "losses": {"regression": loss.sum()}, + "sample_size": loss.numel(), + "target_var": self.compute_var(y), + "pred_var": self.compute_var(x), + "ema_decay": self.ema.get_decay() * 1000, + } + return result + + @staticmethod + def compute_var(y): + y = y.view(-1, y.size(-1)) + if dist.is_initialized(): + zc = torch.tensor(y.size(0)).cuda() + zs = y.sum(dim=0) + zss = (y ** 2).sum(dim=0) + + dist.all_reduce(zc) + dist.all_reduce(zs) + dist.all_reduce(zss) + + var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) + return torch.sqrt(var + 1e-6).mean() + else: + return torch.sqrt(y.var(dim=0) + 1e-6).mean() + + def remove_pretraining_modules(self, last_layer=None): + self.final_proj = None + self.ema = None + self.encoder.norm = nn.Identity() + self.mask_emb = None + if last_layer is not None: + self.encoder.layers = nn.ModuleList( + l for i, l in enumerate(self.encoder.layers) if i <= last_layer + ) + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + if isinstance(img_size, int): + img_size = img_size, img_size + if isinstance(patch_size, int): + patch_size = patch_size, patch_size + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.conv = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + + def forward(self, x): + # BCHW -> BTC + x = self.conv(x).flatten(2).transpose(1, 2) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=True, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * 
window_size[1] + 1,) * 2,
+                dtype=relative_coords.dtype,
+            )
+            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+            relative_position_index[0, 0:] = self.num_relative_distance - 3
+            relative_position_index[0:, 0] = self.num_relative_distance - 2
+            relative_position_index[0, 0] = self.num_relative_distance - 1
+
+            self.register_buffer("relative_position_index", relative_position_index)
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        B, N, C = x.shape
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = torch.cat(
+                (
+                    self.q_bias,
+                    torch.zeros_like(self.v_bias, requires_grad=False),
+                    self.v_bias,
+                )
+            )
+        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        q, k, v = (
+            qkv[0],
+            qkv[1],
+            qkv[2],
+        )  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+
+        if self.relative_position_bias_table is not None:
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index.view(-1)
+            ].view(
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1,
+                -1,
+            )  # Wh*Ww,Wh*Ww,nH
+            relative_position_bias = relative_position_bias.permute(
+                2, 0, 1
+            ).contiguous()  # nH, Wh*Ww, Wh*Ww
+            attn = attn + relative_position_bias.unsqueeze(0)
+
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class RelativePositionBias(nn.Module):
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1
+        ) + 3
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(self.num_relative_distance, num_heads)
+        )
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(window_size[0])
+        coords_w = torch.arange(window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = (
+            coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        )  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(
+            1, 2, 0
+        ).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = torch.zeros(
+            size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
+        )
+        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def forward(self):
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)
+        ].view(
+            self.window_size[0] * self.window_size[1] + 1,
+            self.window_size[0] * self.window_size[1] + 1,
+            -1,
+        )  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        if self.drop_prob == 0.0 or not self.training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (x.shape[0],) + (1,) * (
+            x.ndim - 1
+        )  # work with diff dim tensors, not just 2D ConvNets
+        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+        random_tensor.floor_()
+        output = x.div(keep_prob) * random_tensor
+        return output
+
+    def extra_repr(self) -> str:
+        return "p={}".format(self.drop_prob)
+
+
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        init_values=None,
+        window_size=None,
+    ):
+        super().__init__()
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size,
+        )
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = nn.LayerNorm(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, mlp_hidden_dim),
+            nn.GELU(),
+            nn.Linear(mlp_hidden_dim, dim),
+            nn.Dropout(drop),
+        )
+
+        if init_values is not None and init_values > 0:
+            self.gamma_1 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True
+            )
+            self.gamma_2 = nn.Parameter(
+                init_values * torch.ones((dim)), requires_grad=True
+            )
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x, rel_pos_bias=None):
+        if self.gamma_1 is None:
+            x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+            fc_feature = self.drop_path(self.mlp(self.norm2(x)))
+            x = x + fc_feature
+        else:
+            x = x + self.drop_path(
+                self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+            )
+            fc_feature = self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+            x = x + fc_feature
+        return x, fc_feature
+
+
+class TransformerEncoder(nn.Module):
+    def __init__(self, cfg: Data2VecVisionConfig, patch_shape):
+        super().__init__()
+
+        self.rel_pos_bias = None
+        if cfg.shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=patch_shape, num_heads=cfg.num_heads
+            )
+
+        dpr = [
+            x.item() for x in torch.linspace(0, cfg.drop_path, cfg.depth)
+        ]  # stochastic depth decay rule
+
+        self.blocks = nn.ModuleList(
+            Block(
+                dim=cfg.embed_dim,
+                num_heads=cfg.num_heads,
+                attn_drop=cfg.attention_dropout,
+                drop_path=dpr[i],
+
init_values=cfg.layer_scale_init_value, + window_size=patch_shape if not cfg.shared_rel_pos_bias else None, + ) + for i in range(cfg.depth) + ) + + self.norm = nn.LayerNorm(cfg.embed_dim) + + self.apply(self.init_weights) + self.fix_init_weight() + + def init_weights(self, m): + std = 0.02 + if isinstance(m, nn.Linear): + nn.init.trunc_normal_(m.weight, std=std) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + nn.init.trunc_normal_(m.weight, std=std) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp[2].weight.data, layer_id + 1) + + def extract_features(self, x, layer_results): + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + + z = [] + for i, blk in enumerate(self.blocks): + x, fc_feature = blk(x, rel_pos_bias=rel_pos_bias) + if layer_results == "end": + z.append(x) + elif layer_results == "fc": + z.append(fc_feature) + + return z if layer_results else self.norm(x) + + def forward(self, x, layer_results=None): + x = self.extract_features(x, layer_results=layer_results) + if layer_results: + return [z[:, 1:] for z in x] + + x = x[:, 1:] + return x diff --git a/examples/data2vec/models/mae.py b/examples/data2vec/models/mae.py new file mode 100644 index 0000000000..a3b5f72a4a --- /dev/null +++ b/examples/data2vec/models/mae.py @@ -0,0 +1,829 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
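+
+# A minimal usage sketch of the MAE model defined in this file (illustrative
+# only; the config values and the dotted import path are assumptions, not part
+# of the original code or any training recipe):
+#
+#   import torch
+#   from examples.data2vec.models.mae import MaeConfig, MaeModel
+#
+#   cfg = MaeConfig(input_size=224, patch_size=16, mask_ratio=0.75)
+#   model = MaeModel(cfg)
+#   imgs = torch.randn(2, 3, 224, 224)           # (N, C, H, W) image batch
+#   out = model(imgs)                            # {"losses": ..., "sample_size": ...}
+#   feats = model(imgs, predictions_only=True)   # unmasked encoder features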
+ +# The code in this file is adapted from the BeiT implementation which can be found here: +# https://github.com/microsoft/unilm/tree/master/beit + +import logging +from dataclasses import dataclass +from functools import partial + +from timm.models.vision_transformer import PatchEmbed, Block + +import torch +import torch.nn as nn + +import numpy as np + +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec.wav2vec2 import TransformerSentenceEncoderLayer + +try: + from apex.normalization import FusedLayerNorm +except: + FusedLayerNorm = nn.LayerNorm + +import torch.nn.functional as F + + +logger = logging.getLogger(__name__) + + +@dataclass +class MaeConfig(FairseqDataclass): + input_size: int = 224 + in_chans: int = 3 + patch_size: int = 16 + embed_dim: int = 768 + depth: int = 12 + num_heads: int = 12 + decoder_embed_dim: int = 512 + decoder_depth: int = 8 + decoder_num_heads: int = 16 + mlp_ratio: int = 4 + norm_eps: float = 1e-6 + + drop_path_rate: float = 0.0 + + mask_ratio: float = 0.75 + norm_pix_loss: bool = True + + w2v_block: bool = False + alt_block: bool = False + alt_block2: bool = False + alt_attention: bool = False + block_dropout: float = 0 + attention_dropout: float = 0 + activation_dropout: float = 0 + layer_norm_first: bool = False + + fused_ln: bool = True + end_of_block_targets: bool = True + + no_decoder_embed: bool = False + no_decoder_pos_embed: bool = False + mask_noise_std: float = 0 + + single_qkv: bool = False + use_rel_pos_bias: bool = False + no_cls: bool = False + + +def modify_relative_position_bias(orig_bias, bsz, mask): + if mask is None: + return orig_bias.unsqueeze(0).repeat( + bsz, 1, 1, 1 + ) # heads x seq_len x seq_len => bsz x heads x seq_len x seq_len + heads, max_seq_len, max_seq_len = orig_bias.shape # includes CLS token + mask_for_rel_pos_bias = torch.cat( + (torch.zeros(bsz, 1, dtype=mask.dtype, device=mask.device), mask), dim=1 + ).bool() # bsz x seqlen (add CLS token) + unmasked_for_rel_pos_bias = ~mask_for_rel_pos_bias + unmasked_for_rel_pos_bias = unmasked_for_rel_pos_bias.unsqueeze(1).repeat( + 1, heads, 1 + ) # bsz x seq_len => bsz x heads x seq_len + b_t_t_rel_pos_bias = orig_bias.unsqueeze(0).repeat( + bsz, 1, 1, 1 + ) # heads x seq_len x seq_len => bsz x heads x seq_len x seq_len + b_t_t_rel_pos_bias = b_t_t_rel_pos_bias.masked_select( + unmasked_for_rel_pos_bias.unsqueeze(-1) + ) + b_t_t_rel_pos_bias = b_t_t_rel_pos_bias.view(bsz, heads, -1, max_seq_len) + new_len = b_t_t_rel_pos_bias.size(-2) + b_t_t_rel_pos_bias = b_t_t_rel_pos_bias.masked_select( + unmasked_for_rel_pos_bias.unsqueeze(-2) + ) + b_t_t_rel_pos_bias = b_t_t_rel_pos_bias.view(bsz, heads, new_len, new_len) + return b_t_t_rel_pos_bias + + +class AltBlock(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + layer_norm_first=True, + ffn_targets=False, + use_rel_pos_bias=False, + window_size=None, + alt_attention=False, + ): + super().__init__() + + self.layer_norm_first = layer_norm_first + self.ffn_targets = ffn_targets + + from timm.models.vision_transformer import Attention, DropPath, Mlp + + self.norm1 = norm_layer(dim) + self.use_rel_pos_bias = use_rel_pos_bias + if use_rel_pos_bias: + self.attn = AltAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + 
window_size=window_size, + ) + else: + if alt_attention: + from .multi.modules import AltAttention as AltAttention2 + self.attn = AltAttention2( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + else: + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x, rel_pos_bias=None, pos_mask=None): + if self.layer_norm_first: + if self.use_rel_pos_bias: + x = x + self.drop_path( + self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias, pos_mask=pos_mask + ) + ) + else: + x = x + self.drop_path(self.attn(self.norm1(x))) + t = self.mlp(self.norm2(x)) + x = x + self.drop_path(t) + if not self.ffn_targets: + t = x + return x, t + else: + if self.use_rel_pos_bias: + x = x + self.drop_path( + self.attn(x, rel_pos_bias=rel_pos_bias, pos_mask=pos_mask) + ) + else: + x = x + self.drop_path(self.attn(x)) + r = x = self.norm1(x) + x = self.mlp(x) + t = x + x = self.norm2(r + self.drop_path(x)) + if not self.ffn_targets: + t = x + return x, t + + +class AltAttention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", 
relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None, pos_mask=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + ( + self.q_bias, + torch.zeros_like(self.v_bias, requires_grad=False), + self.v_bias, + ) + ) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + modify_relative_position_bias( + relative_position_bias, x.size(0), pos_mask + ) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class RelativePositionBias(nn.Module): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1 + ) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype + ) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the 
grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def interpolate_pos_embed(model, checkpoint_model): + if "pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["pos_embed"] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print( + "Position interpolate from %dx%d to %dx%d" + % (orig_size, orig_size, new_size, new_size) + ) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape( + -1, orig_size, orig_size, embedding_size + ).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, + size=(new_size, new_size), + mode="bicubic", + align_corners=False, + ) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model["pos_embed"] = new_pos_embed + + +@register_model("mae", dataclass=MaeConfig) +class MaeModel(BaseFairseqModel): + def __init__(self, cfg: MaeConfig): + super().__init__() + self.cfg = cfg + + self.mask_ratio = cfg.mask_ratio + + # -------------------------------------------------------------------------- + # MAE encoder specifics + self.patch_embed = PatchEmbed( + cfg.input_size, cfg.patch_size, cfg.in_chans, cfg.embed_dim + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, cfg.embed_dim)) if not cfg.no_cls else None + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + int(not cfg.no_cls), cfg.embed_dim), requires_grad=False + ) # fixed sin-cos embedding + + norm_layer = partial(nn.LayerNorm, 
eps=cfg.norm_eps) + + dpr = [ + x.item() for x in torch.linspace(0, cfg.drop_path_rate, cfg.depth) + ] # stochastic depth decay rule + + def make_block(drop_path): + if cfg.w2v_block: + return TransformerSentenceEncoderLayer( + embedding_dim=cfg.embed_dim, + ffn_embedding_dim=cfg.embed_dim * cfg.mlp_ratio, + num_attention_heads=cfg.num_heads, + dropout=cfg.block_dropout, + attention_dropout=cfg.attention_dropout, + activation_dropout=cfg.activation_dropout, + activation_fn="gelu", + layer_norm_first=cfg.layer_norm_first, + drop_path=drop_path, + norm_eps=1e-6, + single_qkv=cfg.single_qkv, + fused_ln=cfg.fused_ln, + ) + elif cfg.alt_block: + window_size = ( + cfg.input_size // self.patch_embed.patch_size[0], + cfg.input_size // self.patch_embed.patch_size[1], + ) + return AltBlock( + cfg.embed_dim, + cfg.num_heads, + cfg.mlp_ratio, + qkv_bias=True, + qk_scale=None, + norm_layer=norm_layer, + drop_path=drop_path, + layer_norm_first=cfg.layer_norm_first, + ffn_targets=not cfg.end_of_block_targets, + use_rel_pos_bias=cfg.use_rel_pos_bias, + window_size=window_size + if (self.cfg.use_rel_pos_bias and not self.cfg.shared_rel_pos_bias) + else None, + alt_attention=cfg.alt_attention, + ) + elif cfg.alt_block2: + from .multi.modules import AltBlock as AltBlock2 + return AltBlock2( + cfg.embed_dim, + cfg.num_heads, + cfg.mlp_ratio, + qkv_bias=True, + qk_scale=None, + norm_layer=norm_layer, + drop_path=drop_path, + layer_norm_first=cfg.layer_norm_first, + ffn_targets=not cfg.end_of_block_targets, + ) + else: + return Block( + cfg.embed_dim, + cfg.num_heads, + cfg.mlp_ratio, + qkv_bias=True, + qk_scale=None, + norm_layer=norm_layer, + drop_path=drop_path, + ) + + self.blocks = nn.ModuleList([make_block(dpr[i]) for i in range(cfg.depth)]) + self.norm = norm_layer(cfg.embed_dim) + # -------------------------------------------------------------------------- + + # -------------------------------------------------------------------------- + # MAE decoder specifics + self.decoder_embed = ( + nn.Linear(cfg.embed_dim, cfg.decoder_embed_dim, bias=True) + if not cfg.no_decoder_embed + else None + ) + + self.mask_token = ( + nn.Parameter( + torch.zeros( + 1, + 1, + cfg.decoder_embed_dim + if not cfg.no_decoder_embed + else cfg.embed_dim, + ) + ) + if cfg.mask_noise_std <= 0 + else None + ) + + self.decoder_pos_embed = ( + nn.Parameter( + torch.zeros( + 1, + num_patches + 1, + cfg.decoder_embed_dim + if not cfg.no_decoder_embed + else cfg.embed_dim, + ), + requires_grad=False, + ) + if not cfg.no_decoder_pos_embed + else None + ) + + self.decoder_blocks = nn.ModuleList( + [ + Block( + cfg.decoder_embed_dim, + cfg.decoder_num_heads, + cfg.mlp_ratio, + qkv_bias=True, + qk_scale=None, + norm_layer=norm_layer, + ) + for _ in range(cfg.decoder_depth) + ] + ) + + self.decoder_norm = norm_layer(cfg.decoder_embed_dim) + self.decoder_pred = nn.Linear( + cfg.decoder_embed_dim, cfg.patch_size ** 2 * cfg.in_chans, bias=True + ) # decoder to patch + # -------------------------------------------------------------------------- + + self.norm_pix_loss = cfg.norm_pix_loss + + self.initialize_weights() + + for pn, p in self.named_parameters(): + if len(p.shape) == 1 or pn.endswith(".bias"): + p.param_group = "no_decay" + else: + p.param_group = "with_decay" + + def initialize_weights(self): + # initialization + # initialize (and freeze) pos_embed by sin-cos embedding + pos_embed = get_2d_sincos_pos_embed( + self.pos_embed.shape[-1], + int(self.patch_embed.num_patches ** 0.5), + cls_token=not self.cfg.no_cls, + ) + 
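+        # The helper above builds a fixed sin-cos table of shape
+        # (num_patches + 1, embed_dim) when a cls token is used, e.g.
+        # (14 * 14 + 1, 768) for the default 224px input with 16px patches;
+        # the first (cls) row is all zeros. The copy below loads it into the
+        # frozen self.pos_embed parameter (requires_grad=False).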
self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + if self.decoder_pos_embed is not None: + decoder_pos_embed = get_2d_sincos_pos_embed( + self.decoder_pos_embed.shape[-1], + int(self.patch_embed.num_patches ** 0.5), + cls_token=not self.cfg.no_cls, + ) + self.decoder_pos_embed.data.copy_( + torch.from_numpy(decoder_pos_embed).float().unsqueeze(0) + ) + + # initialize patch_embed like nn.Linear (instead of nn.Conv2d) + w = self.patch_embed.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.) + if self.cls_token is not None: + torch.nn.init.normal_(self.cls_token, std=0.02) + + if self.mask_token is not None: + torch.nn.init.normal_(self.mask_token, std=0.02) + + # initialize nn.Linear and nn.LayerNorm + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm) or isinstance(m, FusedLayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def patchify(self, imgs): + """ + imgs: (N, 3, H, W) + x: (N, L, patch_size**2 *3) + """ + p = self.patch_embed.patch_size[0] + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 + + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x = torch.einsum("nchpwq->nhwpqc", x) + x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * 3)) + return x + + def unpatchify(self, x): + """ + x: (N, L, patch_size**2 *3) + imgs: (N, 3, H, W) + """ + p = self.patch_embed.patch_size[0] + h = w = int(x.shape[1] ** 0.5) + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, 3)) + x = torch.einsum("nhwpqc->nchpwq", x) + imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p)) + return imgs + + def random_masking(self, x, mask_ratio): + """ + Perform per-sample random masking by per-sample shuffling. + Per-sample shuffling is done by argsort random noise. 
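+        For example (illustrative values), with L=4 and mask_ratio=0.5:
+        noise = [0.9, 0.1, 0.6, 0.3] -> ids_shuffle = argsort(noise) = [1, 3, 2, 0],
+        so tokens 1 and 3 are kept (len_keep = 2) and, after unshuffling with
+        ids_restore, mask = [1, 0, 1, 0] (0 = keep, 1 = remove).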
+ x: [N, L, D], sequence + """ + N, L, D = x.shape # batch, length, dim + len_keep = int(L * (1 - mask_ratio)) + + noise = torch.rand(N, L, device=x.device) # noise in [0, 1] + + # sort noise for each sample + ids_shuffle = torch.argsort( + noise, dim=1 + ) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D)) + + # generate the binary mask: 0 is keep, 1 is remove + mask = torch.ones([N, L], device=x.device) + mask[:, :len_keep] = 0 + # unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return x_masked, mask, ids_restore # x_masked is actually unmasked x + + @classmethod + def build_model(cls, cfg: MaeConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def forward_encoder(self, x, mask_ratio): + # embed patches + x = self.patch_embed(x) + + # add pos embed w/o cls token + # if self.cls_token is not None: + # x = x + self.pos_embed + # else: + x = x + self.pos_embed[:, 1:, :] + + # masking: length -> length * mask_ratio + if mask_ratio > 0: + x, mask, ids_restore = self.random_masking(x, mask_ratio) + else: + mask = ids_restore = None + + # append cls token + if self.cls_token is not None: + cls_token = self.cls_token + self.pos_embed[:, :1, :] + cls_tokens = cls_token.expand(x.shape[0], -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + # apply Transformer blocks + for blk in self.blocks: + x = blk(x) + + if self.norm is not None: + x = self.norm(x) + + return x, mask, ids_restore + + def forward_decoder(self, x, ids_restore): + # embed tokens + x = self.decoder_embed(x) + + # append mask tokens to sequence + mask_tokens = self.mask_token.repeat( + x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1 + ) + if self.cls_token is not None: + x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token + else: + x_ = torch.cat([x, mask_tokens], dim=1) # no cls token + + x_ = torch.gather( + x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]) + ) # unshuffle + + if self.cls_token is not None: + x = torch.cat([x[:, :1, :], x_], dim=1) # append cls token + + # add pos embed + x = x + self.decoder_pos_embed + + # apply Transformer blocks + for blk in self.decoder_blocks: + x = blk(x) + x = self.decoder_norm(x) + + # predictor projection + x = self.decoder_pred(x) + + if self.cls_token is not None: + # remove cls token + x = x[:, 1:, :] + + return x + + def forward_loss(self, imgs, pred, mask): + """ + imgs: [N, 3, H, W] + pred: [N, L, p*p*3] + mask: [N, L], 0 is keep, 1 is remove, + """ + target = self.patchify(imgs) + if self.norm_pix_loss: + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.0e-6) ** 0.5 + + loss = (pred - target) ** 2 + loss = loss.mean(dim=-1) # [N, L], mean loss per patch + + loss = (loss * mask).sum() + return loss, mask.sum() + + def forward(self, imgs, predictions_only=False): + latent, mask, ids_restore = self.forward_encoder( + imgs, self.mask_ratio if not predictions_only else 0 + ) + + if predictions_only: + return latent + + pred = self.forward_decoder(latent, ids_restore) # [N, L, p*p*3] + loss, sample_size = self.forward_loss(imgs, pred, mask) + + result = { + "losses": {"regression": loss}, + "sample_size": sample_size, + } + return result + + def remove_pretraining_modules(self): + self.decoder_embed = None + 
self.decoder_blocks = None + self.decoder_norm = None + self.decoder_pos_embed = None + self.decoder_pred = None + self.mask_token = None + if self.cfg.layer_norm_first: + self.norm = None diff --git a/examples/data2vec/models/mae_image_classification.py b/examples/data2vec/models/mae_image_classification.py new file mode 100644 index 0000000000..e304618dc5 --- /dev/null +++ b/examples/data2vec/models/mae_image_classification.py @@ -0,0 +1,386 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# The code in this file is adapted from the BeiT implementation which can be found here: +# https://github.com/microsoft/unilm/tree/master/beit + +import logging + +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any, Optional + +import numpy as np +from omegaconf import II, MISSING + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import checkpoint_utils, tasks +from omegaconf import open_dict + +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from .mae import interpolate_pos_embed + + +logger = logging.getLogger(__name__) + + +class PredictionMode(Enum): + MEAN_POOLING = auto() + CLS_TOKEN = auto() + LIN_SOFTMAX = auto() + + +@dataclass +class MaeImageClassificationConfig(FairseqDataclass): + model_path: str = MISSING + no_pretrained_weights: bool = False + linear_classifier: bool = False + num_classes: int = 1000 + mixup: float = 0.8 + cutmix: float = 1.0 + label_smoothing: float = 0.1 + + drop_path_rate: float = 0.1 + layer_decay: float = 0.65 + + mixup_prob: float = 1.0 + mixup_switch_prob: float = 0.5 + mixup_mode: str = "batch" + + pretrained_model_args: Any = None + data: str = II("task.data") + + norm_eps: Optional[float] = None + + remove_alibi: bool = False + + # regularization overwrites + encoder_dropout: float = 0 + post_mlp_drop: float = 0 + attention_dropout: float = 0 + activation_dropout: float = 0.0 + dropout_input: float = 0.0 + layerdrop: float = 0.0 + + prenet_layerdrop: float = 0 + prenet_dropout: float = 0 + + use_fc_norm: bool = True + prediction_mode: PredictionMode = PredictionMode.MEAN_POOLING + + no_decay_blocks: bool = True + + +def get_layer_id_for_vit(name, num_layers): + """ + Assign a parameter with its layer id + Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33 + """ + if name in ["cls_token", "pos_embed"]: + return 0 + elif name.startswith("patch_embed"): + return 0 + elif name.startswith("rel_pos_bias"): + return num_layers - 1 + elif name.startswith("blocks"): + return int(name.split(".")[1]) + 1 + else: + return num_layers + + +@register_model("mae_image_classification", dataclass=MaeImageClassificationConfig) +class MaeImageClassificationModel(BaseFairseqModel): + def __init__(self, cfg: MaeImageClassificationConfig): + super().__init__() + self.cfg = cfg + + if cfg.pretrained_model_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.model_path, {}) + pretrained_args = state.get("cfg", None) + + pretrained_args.criterion = None + pretrained_args.lr_scheduler = None + + logger.info(pretrained_args.model) + + with open_dict(pretrained_args.model): + pretrained_args.model.drop_path_rate = cfg.drop_path_rate + if cfg.norm_eps is not None: + pretrained_args.model.norm_eps = cfg.norm_eps + + cfg.pretrained_model_args = pretrained_args + + 
logger.info(pretrained_args) + else: + state = None + pretrained_args = cfg.pretrained_model_args + + if "data" in pretrained_args.task: + pretrained_args.task.data = cfg.data + elif "image" in pretrained_args.task: + pretrained_args.task.image.data = cfg.data + + if "modalities" in pretrained_args.model: + prenet_blocks = pretrained_args.model["modalities"]["image"]["prenet_depth"] + model_blocks = pretrained_args.model["depth"] + with open_dict(pretrained_args): + dpr = np.linspace(0, cfg.drop_path_rate, model_blocks).tolist() + pretrained_args.model["modalities"]["image"][ + "start_drop_path_rate" + ] = dpr[0] + pretrained_args.model["modalities"]["image"][ + "end_drop_path_rate" + ] = max(0, dpr[prenet_blocks - 1]) + pretrained_args.model["start_drop_path_rate"] = dpr[prenet_blocks] + pretrained_args.model["end_drop_path_rate"] = dpr[-1] + + if "mae_masking" in pretrained_args.model["modalities"]["image"]: + del pretrained_args.model["modalities"]["image"]["mae_masking"] + + if cfg.remove_alibi: + pretrained_args.model["modalities"]["image"][ + "use_alibi_encoder" + ] = False + if ( + state is not None + and "modality_encoders.IMAGE.alibi_bias" in state["model"] + ): + del state["model"]["modality_encoders.IMAGE.alibi_bias"] + + pretrained_args.model["encoder_dropout"] = cfg.encoder_dropout + pretrained_args.model["post_mlp_drop"] = cfg.post_mlp_drop + pretrained_args.model["attention_dropout"] = cfg.attention_dropout + pretrained_args.model["activation_dropout"] = cfg.activation_dropout + pretrained_args.model["dropout_input"] = cfg.dropout_input + pretrained_args.model["layerdrop"] = cfg.layerdrop + + pretrained_args.model["modalities"]["image"][ + "prenet_layerdrop" + ] = cfg.prenet_layerdrop + pretrained_args.model["modalities"]["image"][ + "prenet_dropout" + ] = cfg.prenet_dropout + else: + # not d2v multi + with open_dict(pretrained_args): + pretrained_args.model["drop_path_rate"] = cfg.drop_path_rate + pretrained_args.model["block_dropout"] = cfg.encoder_dropout + pretrained_args.model["attention_dropout"] = cfg.attention_dropout + pretrained_args.model["activation_dropout"] = cfg.activation_dropout + + task = tasks.setup_task(pretrained_args.task) + model = task.build_model(pretrained_args.model, from_checkpoint=True) + + self.d2v_multi = "data2vec_multi" in pretrained_args.model._name + self.linear_classifier = cfg.linear_classifier + + self.model = model + + if state is not None and not cfg.no_pretrained_weights: + interpolate_pos_embed(model, state) + + if "modality_encoders.IMAGE.positional_encoder.pos_embed" in state["model"]: + state["model"][ + "modality_encoders.IMAGE.positional_encoder.positions" + ] = state["model"][ + "modality_encoders.IMAGE.positional_encoder.pos_embed" + ] + del state["model"][ + "modality_encoders.IMAGE.positional_encoder.pos_embed" + ] + if "modality_encoders.IMAGE.encoder_mask" in state["model"]: + del state["model"]["modality_encoders.IMAGE.encoder_mask"] + + model.load_state_dict(state["model"], strict=True) + + if self.d2v_multi: + model.remove_pretraining_modules(modality="image") + else: + model.remove_pretraining_modules() + + if self.linear_classifier: + model.requires_grad_(False) + + self.fc_norm = None + if self.cfg.use_fc_norm: + self.fc_norm = nn.LayerNorm(pretrained_args.model.embed_dim, eps=1e-6) + nn.init.constant_(self.fc_norm.bias, 0) + nn.init.constant_(self.fc_norm.weight, 1.0) + + self.head = nn.Linear(pretrained_args.model.embed_dim, cfg.num_classes) + + nn.init.trunc_normal_(self.head.weight, std=0.02) + 
nn.init.constant_(self.head.bias, 0) + + self.mixup_fn = None + + if cfg.mixup > 0 or cfg.cutmix > 0: + from timm.data import Mixup + + self.mixup_fn = Mixup( + mixup_alpha=cfg.mixup, + cutmix_alpha=cfg.cutmix, + cutmix_minmax=None, + prob=cfg.mixup_prob, + switch_prob=cfg.mixup_switch_prob, + mode=cfg.mixup_mode, + label_smoothing=cfg.label_smoothing, + num_classes=cfg.num_classes, + ) + + if self.model.norm is not None: + for pn, p in self.model.norm.named_parameters(): + if len(p.shape) == 1 or pn.endswith(".bias"): + p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} + + if self.fc_norm is not None: + for pn, p in self.fc_norm.named_parameters(): + if len(p.shape) == 1 or pn.endswith(".bias"): + p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} + + for pn, p in self.head.named_parameters(): + if len(p.shape) == 1 or pn.endswith(".bias"): + p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} + + if self.d2v_multi: + mod_encs = list(model.modality_encoders.values()) + assert len(mod_encs) == 1, len(mod_encs) + blocks = list(mod_encs[0].context_encoder.blocks) + list(model.blocks) + else: + blocks = model.blocks + + num_layers = len(blocks) + 1 + layer_scales = list( + cfg.layer_decay ** (num_layers - i) for i in range(num_layers + 1) + ) + + if self.d2v_multi: + for n, p in self.model.named_parameters(): + optimizer_override_dict = {} + + if len(p.shape) == 1 or n.endswith(".bias"): + optimizer_override_dict["weight_decay_scale"] = 0 + + p.optim_overrides = {"optimizer": optimizer_override_dict} + + if cfg.layer_decay > 0: + for i, b in enumerate(blocks): + lid = i + 1 + if layer_scales[lid] == 1.0: + continue + + for n, p in b.named_parameters(): + optim_override = getattr(p, "optim_overrides", {}) + if "optimizer" not in optim_override: + optim_override["optimizer"] = {} + + if cfg.no_decay_blocks: + optim_override["optimizer"]["lr_scale"] = layer_scales[lid] + p.optim_overrides = optim_override + else: + optim_override["optimizer"] = { + "lr_scale": layer_scales[lid] + } + p.optim_overrides = optim_override + + else: + for n, p in self.model.named_parameters(): + optimizer_override_dict = {} + layer_id = get_layer_id_for_vit(n, num_layers) + + if len(p.shape) == 1 or n.endswith(".bias"): + optimizer_override_dict["weight_decay_scale"] = 0 + + if cfg.layer_decay > 0: + optimizer_override_dict["lr_scale"] = layer_scales[layer_id] + p.optim_overrides = {"optimizer": optimizer_override_dict} + + @classmethod + def build_model(cls, cfg: MaeImageClassificationConfig, task=None): + """Build a new model instance.""" + + return cls(cfg) + + def forward( + self, + imgs, + labels=None, + ): + if self.training and self.mixup_fn is not None and labels is not None: + imgs, labels = self.mixup_fn(imgs, labels) + + if self.linear_classifier: + with torch.no_grad(): + x = self.model_forward(imgs) + else: + x = self.model_forward(imgs) + + if self.cfg.prediction_mode == PredictionMode.MEAN_POOLING: + x = x.mean(dim=1) + elif self.cfg.prediction_mode == PredictionMode.CLS_TOKEN: + x = x[:, 0] + elif self.cfg.prediction_mode == PredictionMode.LIN_SOFTMAX: + dtype = x.dtype + x = F.logsigmoid(x.float()) + x = torch.logsumexp(x + x, dim=1) - torch.logsumexp(x + 1e-6, dim=1) + x = x.clamp(max=0) + x = x - torch.log(-(torch.expm1(x))) + x = torch.nan_to_num(x, nan=0, posinf=0, neginf=0) + x = x.to(dtype=dtype) + else: + raise Exception(f"unknown prediction mode {self.cfg.prediction_mode.name}") + + if self.fc_norm is not None: + x = self.fc_norm(x) + + x = self.head(x) + + if 
labels is None: + return x + + if self.training and self.mixup_fn is not None: + loss = -labels * F.log_softmax(x.float(), dim=-1) + else: + loss = F.cross_entropy( + x.float(), + labels, + label_smoothing=self.cfg.label_smoothing if self.training else 0, + reduction="none", + ) + + result = { + "losses": {"regression": loss}, + "sample_size": imgs.size(0), + } + + if not self.training: + with torch.no_grad(): + pred = x.argmax(-1) + correct = (pred == labels).sum() + result["correct"] = correct + + return result + + def model_forward(self, imgs): + if self.d2v_multi: + x = self.model.extract_features( + imgs, + mode="IMAGE", + mask=False, + remove_extra_tokens=( + self.cfg.prediction_mode != PredictionMode.CLS_TOKEN + ), + )["x"] + else: + x = self.model(imgs, predictions_only=True) + if ( + "no_cls" not in self.model.cfg or not self.model.cfg.no_cls + ) and not self.cfg.prediction_mode == PredictionMode.CLS_TOKEN: + x = x[:, 1:] + return x diff --git a/examples/data2vec/models/modalities/__init__.py b/examples/data2vec/models/modalities/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/data2vec/models/modalities/audio.py b/examples/data2vec/models/modalities/audio.py new file mode 100644 index 0000000000..80d2857b24 --- /dev/null +++ b/examples/data2vec/models/modalities/audio.py @@ -0,0 +1,192 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +import torch +import torch.nn as nn +import numpy as np +from dataclasses import dataclass, field +from typing import Callable, Dict, Optional +from fairseq.models.wav2vec import ConvFeatureExtractionModel +from fairseq.modules import ( + LayerNorm, + SamePad, + TransposeLast, +) +from fairseq.tasks import FairseqTask +from .base import D2vModalityConfig, ModalitySpecificEncoder, get_alibi_bias +from .modules import BlockEncoder, Decoder1d +from examples.data2vec.data.modality import Modality + + +@dataclass +class D2vAudioConfig(D2vModalityConfig): + type: Modality = Modality.AUDIO + extractor_mode: str = "layer_norm" + feature_encoder_spec: str = field( + default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", + metadata={ + "help": "string describing convolutional feature extraction layers in form of a python list that contains " + "[(dim, kernel_size, stride), ...]" + }, + ) + conv_pos_width: int = field( + default=95, + metadata={"help": "number of filters for convolutional positional embeddings"}, + ) + conv_pos_groups: int = field( + default=16, + metadata={"help": "number of groups for convolutional positional embedding"}, + ) + conv_pos_depth: int = field( + default=5, + metadata={"help": "depth of positional encoder network"}, + ) + conv_pos_pre_ln: bool = False + + +class AudioEncoder(ModalitySpecificEncoder): + + modality_cfg: D2vAudioConfig + + def __init__( + self, + modality_cfg: D2vAudioConfig, + embed_dim: int, + make_block: Callable[[float], nn.ModuleList], + norm_layer: Callable[[int], nn.LayerNorm], + layer_norm_first: bool, + alibi_biases: Dict, + task: Optional[FairseqTask], + ): + + self.feature_enc_layers = eval(modality_cfg.feature_encoder_spec) + feature_embed_dim = self.feature_enc_layers[-1][0] + + local_encoder = ConvFeatureExtractionModel( + conv_layers=self.feature_enc_layers, + dropout=0.0, + mode=modality_cfg.extractor_mode, + conv_bias=False, + ) + + project_features = nn.Sequential( + 
TransposeLast(), + nn.LayerNorm(feature_embed_dim), + nn.Linear(feature_embed_dim, embed_dim), + ) + + num_pos_layers = modality_cfg.conv_pos_depth + k = max(3, modality_cfg.conv_pos_width // num_pos_layers) + + positional_encoder = nn.Sequential( + TransposeLast(), + *[ + nn.Sequential( + nn.Conv1d( + embed_dim, + embed_dim, + kernel_size=k, + padding=k // 2, + groups=modality_cfg.conv_pos_groups, + ), + SamePad(k), + TransposeLast(), + LayerNorm(embed_dim, elementwise_affine=False), + TransposeLast(), + nn.GELU(), + ) + for _ in range(num_pos_layers) + ], + TransposeLast(), + ) + + if modality_cfg.conv_pos_pre_ln: + positional_encoder = nn.Sequential(LayerNorm(embed_dim), positional_encoder) + + dpr = np.linspace( + modality_cfg.start_drop_path_rate, + modality_cfg.end_drop_path_rate, + modality_cfg.prenet_depth, + ) + context_encoder = BlockEncoder( + nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)), + norm_layer(embed_dim) if not layer_norm_first else None, + layer_norm_first, + modality_cfg.prenet_layerdrop, + modality_cfg.prenet_dropout, + ) + + decoder = ( + Decoder1d(modality_cfg.decoder, embed_dim) + if modality_cfg.decoder is not None + else None + ) + + alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases) + + super().__init__( + modality_cfg=modality_cfg, + embed_dim=embed_dim, + local_encoder=local_encoder, + project_features=project_features, + fixed_positional_encoder=None, + relative_positional_encoder=positional_encoder, + context_encoder=context_encoder, + decoder=decoder, + get_alibi_bias=alibi_bias_fn, + ) + + def convert_padding_mask(self, x, padding_mask): + def get_feat_extract_output_lengths(input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + for i in range(len(self.feature_enc_layers)): + input_lengths = _conv_out_length( + input_lengths, + self.feature_enc_layers[i][1], + self.feature_enc_layers[i][2], + ) + + return input_lengths.to(torch.long) + + if padding_mask is not None: + input_lengths = (1 - padding_mask.long()).sum(-1) + # apply conv formula to get real output_lengths + output_lengths = get_feat_extract_output_lengths(input_lengths) + + if padding_mask.any(): + padding_mask = torch.zeros(x.shape[:2], dtype=x.dtype, device=x.device) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = ( + 1 - padding_mask.flip([-1]).cumsum(-1).flip([-1]) + ).bool() + else: + padding_mask = torch.zeros( + x.shape[:2], dtype=torch.bool, device=x.device + ) + + return padding_mask + + def reset_parameters(self): + super().reset_parameters() + for mod in self.project_features.children(): + if isinstance(mod, nn.Linear): + mod.reset_parameters() + if self.decoder is not None: + self.decoder.reset_parameters() diff --git a/examples/data2vec/models/modalities/base.py b/examples/data2vec/models/modalities/base.py new file mode 100644 index 0000000000..642cc84661 --- /dev/null +++ b/examples/data2vec/models/modalities/base.py @@ -0,0 +1,684 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
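+
+# Rough summary of the abstractions defined below (descriptive only; the class
+# definitions are authoritative): ModalitySpecificEncoder composes, per modality,
+#
+#   local_encoder -> project_features -> fixed / relative positional encoding
+#       -> optional masking (compute_mask) -> context_encoder (prenet blocks)
+#       -> optional decoder
+#
+# while D2vModalityConfig holds the per-modality options (masking, prenet depth,
+# alibi bias, decoder config) used by concrete encoders such as AudioEncoder in
+# audio.py.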
+ +import logging +import math +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from collections import namedtuple +from dataclasses import dataclass +from functools import partial +from omegaconf import MISSING, II +from typing import Optional, Callable +from fairseq.data.data_utils import compute_mask_indices +from fairseq.modules import GradMultiply +from fairseq.utils import index_put +from examples.data2vec.data.modality import Modality +from .modules import D2vDecoderConfig + +logger = logging.getLogger(__name__) + + +@dataclass +class D2vModalityConfig: + type: Modality = MISSING + prenet_depth: int = 4 + prenet_layerdrop: float = 0 + prenet_dropout: float = 0 + start_drop_path_rate: float = 0 + end_drop_path_rate: float = 0 + + num_extra_tokens: int = 0 + init_extra_token_zero: bool = True + + mask_noise_std: float = 0.01 + mask_prob_min: Optional[float] = None + mask_prob: float = 0.7 + inverse_mask: bool = False + mask_prob_adjust: float = 0 + keep_masked_pct: float = 0 + + mask_length: int = 5 + add_masks: bool = False + remove_masks: bool = False + mask_dropout: float = 0.0 + encoder_zero_mask: bool = True + + mask_channel_prob: float = 0.0 + mask_channel_length: int = 64 + + ema_local_encoder: bool = False # used in data2vec_multi + local_grad_mult: float = 1.0 + + use_alibi_encoder: bool = False + alibi_scale: float = 1.0 + learned_alibi: bool = False + alibi_max_pos: Optional[int] = None + learned_alibi_scale: bool = False + learned_alibi_scale_per_head: bool = False + learned_alibi_scale_per_layer: bool = False + + num_alibi_heads: int = II("model.num_heads") + model_depth: int = II("model.depth") + + decoder: Optional[D2vDecoderConfig] = D2vDecoderConfig() + + +MaskSeed = namedtuple("MaskSeed", ["seed", "update", "ids"]) +MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"]) + + +class ModalitySpecificEncoder(nn.Module): + def __init__( + self, + modality_cfg: D2vModalityConfig, + embed_dim: int, + local_encoder: nn.Module, + project_features: nn.Module, + fixed_positional_encoder: Optional[nn.Module], + relative_positional_encoder: Optional[nn.Module], + context_encoder: nn.Module, + decoder: nn.Module, + get_alibi_bias: Optional[Callable[[int, int, str, str], torch.Tensor]], + ): + super().__init__() + + self.modality_cfg = modality_cfg + self.local_encoder = local_encoder + self.project_features = project_features + self.fixed_positional_encoder = fixed_positional_encoder + self.relative_positional_encoder = relative_positional_encoder + self.context_encoder = context_encoder + + self.decoder = decoder + self.get_alibi_bias = get_alibi_bias if modality_cfg.use_alibi_encoder else None + + self.local_grad_mult = self.modality_cfg.local_grad_mult + + self.extra_tokens = None + if modality_cfg.num_extra_tokens > 0: + self.extra_tokens = nn.Parameter( + torch.zeros(1, modality_cfg.num_extra_tokens, embed_dim) + ) + if not modality_cfg.init_extra_token_zero: + nn.init.normal_(self.extra_tokens) + elif self.extra_tokens.size(1) > 1: + nn.init.normal_(self.extra_tokens[:, 1:]) + + self.alibi_scale = None + if self.get_alibi_bias is not None: + self.alibi_scale = nn.Parameter( + torch.full( + ( + (modality_cfg.prenet_depth + modality_cfg.model_depth) + if modality_cfg.learned_alibi_scale_per_layer + else 1, + 1, + self.modality_cfg.num_alibi_heads + if modality_cfg.learned_alibi_scale_per_head + else 1, + 1, + 1, + ), + modality_cfg.alibi_scale, + dtype=torch.float, + ), + 
requires_grad=modality_cfg.learned_alibi_scale, + ) + + if modality_cfg.learned_alibi and self.get_alibi_bias is not None: + assert modality_cfg.alibi_max_pos is not None + alibi_bias = self.get_alibi_bias( + batch_size=1, + time_steps=modality_cfg.alibi_max_pos, + heads=modality_cfg.num_alibi_heads, + scale=1.0, + dtype=torch.float, + device="cpu", + ) + self.alibi_bias = nn.Parameter(alibi_bias) + self.get_alibi_bias = partial( + _learned_alibi_bias, alibi_bias=self.alibi_bias + ) + + def upgrade_state_dict_named(self, state_dict, name): + k = f"{name}.alibi_scale" + if k in state_dict and state_dict[k].dim() == 4: + state_dict[k] = state_dict[k].unsqueeze(0) + + return state_dict + + def convert_padding_mask(self, x, padding_mask): + return padding_mask + + def decoder_input(self, x, mask_info: MaskInfo): + inp_drop = self.modality_cfg.decoder.input_dropout + if inp_drop > 0: + x = F.dropout(x, inp_drop, training=self.training, inplace=True) + + num_extra = self.modality_cfg.num_extra_tokens + + if mask_info is not None: + num_masked = mask_info.ids_restore.shape[1] - x.shape[1] + num_extra + + mask_tokens = x.new_empty( + x.size(0), + num_masked, + x.size(-1), + ).normal_(0, self.modality_cfg.mask_noise_std) + + x_ = torch.cat([x[:, num_extra:], mask_tokens], dim=1) + x = torch.gather(x_, dim=1, index=mask_info.ids_restore) + + if self.modality_cfg.decoder.add_positions_masked: + assert self.fixed_positional_encoder is not None + pos = self.fixed_positional_encoder(x, None) + x = x + (pos * mask_info.mask.unsqueeze(-1)) + else: + x = x[:, num_extra:] + + if self.modality_cfg.decoder.add_positions_all: + assert self.fixed_positional_encoder is not None + x = x + self.fixed_positional_encoder(x, None) + + return x, mask_info + + def local_features(self, features): + if self.local_grad_mult > 0: + if self.local_grad_mult == 1.0: + x = self.local_encoder(features) + else: + x = GradMultiply.apply( + self.local_encoder(features), self.local_grad_mult + ) + else: + with torch.no_grad(): + x = self.local_encoder(features) + + x = self.project_features(x) + return x + + def contextualized_features( + self, + x, + padding_mask, + mask, + remove_masked, + clone_batch: int = 1, + mask_seeds: Optional[torch.Tensor] = None, + precomputed_mask=None, + ): + + if padding_mask is not None: + padding_mask = self.convert_padding_mask(x, padding_mask) + + local_features = x + if mask and clone_batch == 1: + local_features = local_features.clone() + + orig_B, orig_T, _ = x.shape + pre_mask_B = orig_B + mask_info = None + + x_pos = None + if self.fixed_positional_encoder is not None: + x = x + self.fixed_positional_encoder(x, padding_mask) + + if mask: + if clone_batch > 1: + x = x.repeat_interleave(clone_batch, 0) + if mask_seeds is not None: + clone_hash = [ + int(hash((mask_seeds.seed, ind)) % 1e10) + for ind in range(clone_batch - 1) + ] + clone_hash = torch.tensor([0] + clone_hash).long().view(1, -1) + + id = mask_seeds.ids + id = id.repeat_interleave(clone_batch, 0) + id = id.view(-1, clone_batch) + clone_hash.to(id) + id = id.view(-1) + mask_seeds = MaskSeed( + seed=mask_seeds.seed, update=mask_seeds.update, ids=id + ) + if padding_mask is not None: + padding_mask = padding_mask.repeat_interleave(clone_batch, 0) + + x, mask_info = self.compute_mask( + x, + padding_mask, + mask_seed=mask_seeds, + apply=self.relative_positional_encoder is not None or not remove_masked, + precomputed_mask=precomputed_mask, + ) + + if self.relative_positional_encoder is not None: + x_pos = 
self.relative_positional_encoder(x) + + masked_padding_mask = padding_mask + if mask and remove_masked: + x = mask_info.x_unmasked + if x_pos is not None: + x = x + gather_unmasked(x_pos, mask_info) + + if padding_mask is not None and padding_mask.any(): + masked_padding_mask = gather_unmasked_mask(padding_mask, mask_info) + if not masked_padding_mask.any(): + masked_padding_mask = None + else: + masked_padding_mask = None + + elif x_pos is not None: + x = x + x_pos + + alibi_bias = None + alibi_scale = self.alibi_scale + + if self.get_alibi_bias is not None: + alibi_bias = self.get_alibi_bias( + batch_size=pre_mask_B, + time_steps=orig_T, + heads=self.modality_cfg.num_alibi_heads, + dtype=torch.float32, + device=x.device, + ) + + if alibi_scale is not None: + alibi_scale = alibi_scale.clamp_min(0) + if alibi_scale.size(0) == 1: + alibi_bias = alibi_bias * alibi_scale.squeeze(0).type_as(alibi_bias) + alibi_scale = None + + if clone_batch > 1: + alibi_bias = alibi_bias.repeat_interleave(clone_batch, 0) + + if mask_info is not None and remove_masked: + alibi_bias = masked_alibi(alibi_bias, mask_info) + + if self.extra_tokens is not None: + num = self.extra_tokens.size(1) + x = torch.cat([self.extra_tokens.expand(x.size(0), -1, -1), x], dim=1) + if masked_padding_mask is not None: + # B x T + masked_padding_mask = F.pad(masked_padding_mask, (num, 0)) + if alibi_bias is not None: + # B x H x T x T + alibi_bias = F.pad(alibi_bias, (num, 0, num, 0)) + + x = self.context_encoder( + x, + masked_padding_mask, + alibi_bias, + alibi_scale[: self.modality_cfg.prenet_depth] + if alibi_scale is not None + else None, + ) + + return { + "x": x, + "local_features": local_features, + "padding_mask": masked_padding_mask, + "alibi_bias": alibi_bias, + "alibi_scale": alibi_scale[self.modality_cfg.prenet_depth :] + if alibi_scale is not None and alibi_scale.size(0) > 1 + else alibi_scale, + "encoder_mask": mask_info, + } + + def forward( + self, + features, + padding_mask, + mask: bool, + remove_masked: bool, + clone_batch: int = 1, + mask_seeds: Optional[torch.Tensor] = None, + precomputed_mask=None, + ): + x = self.local_features(features) + return self.contextualized_features( + x, + padding_mask, + mask, + remove_masked, + clone_batch, + mask_seeds, + precomputed_mask, + ) + + def reset_parameters(self): + pass + + def compute_mask( + self, + x, + padding_mask, + mask_seed: Optional[MaskSeed], + apply, + precomputed_mask, + ): + if precomputed_mask is not None: + mask = precomputed_mask + mask_info = self.make_maskinfo(x, mask) + else: + B, T, C = x.shape + cfg = self.modality_cfg + + mask_prob = cfg.mask_prob + + if ( + cfg.mask_prob_min is not None + and cfg.mask_prob_min >= 0 + and cfg.mask_prob_min < mask_prob + ): + mask_prob = np.random.uniform(cfg.mask_prob_min, mask_prob) + + if mask_prob > 0: + if cfg.mask_length == 1: + mask_info = random_masking(x, mask_prob, mask_seed) + else: + if self.modality_cfg.inverse_mask: + mask_prob = 1 - mask_prob + + mask = compute_mask_indices( + (B, T), + padding_mask, + mask_prob, + cfg.mask_length, + min_masks=1, + require_same_masks=True, + mask_dropout=cfg.mask_dropout, + add_masks=cfg.add_masks, + seed=mask_seed.seed if mask_seed is not None else None, + epoch=mask_seed.update if mask_seed is not None else None, + indices=mask_seed.ids if mask_seed is not None else None, + ) + + mask = torch.from_numpy(mask).to(device=x.device) + if self.modality_cfg.inverse_mask: + mask = 1 - mask + mask_info = self.make_maskinfo(x, mask) + else: + mask_info = None + + if 
apply: + x = self.apply_mask(x, mask_info) + + return x, mask_info + + def make_maskinfo(self, x, mask, shape=None): + if shape is None: + B, T, D = x.shape + else: + B, T, D = shape + + mask = mask.to(torch.uint8) + ids_shuffle = mask.argsort(dim=1) + ids_restore = ids_shuffle.argsort(dim=1).unsqueeze(-1).expand(-1, -1, D) + + len_keep = T - mask[0].sum() + if self.modality_cfg.keep_masked_pct > 0: + len_keep += round((T - int(len_keep)) * self.modality_cfg.keep_masked_pct) + + ids_keep = ids_shuffle[:, :len_keep] + + if shape is not None: + x_unmasked = None + else: + ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D) + x_unmasked = torch.gather(x, dim=1, index=ids_keep) + + mask_info = MaskInfo( + x_unmasked=x_unmasked, + mask=mask, + ids_restore=ids_restore, + ids_keep=ids_keep, + ) + return mask_info + + def apply_mask(self, x, mask_info): + cfg = self.modality_cfg + B, T, C = x.shape + + if mask_info is not None: + mask = mask_info.mask + if cfg.encoder_zero_mask: + x = x * (1 - mask.type_as(x).unsqueeze(-1)) + else: + num_masks = mask.sum().item() + masks = x.new_empty(num_masks, x.size(-1)).normal_( + 0, cfg.mask_noise_std + ) + x = index_put(x, mask, masks) + if cfg.mask_channel_prob > 0: + mask_channel = compute_mask_indices( + (B, C), + None, + cfg.mask_channel_prob, + cfg.mask_channel_length, + ) + mask_channel = ( + torch.from_numpy(mask_channel) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x = index_put(x, mask_channel, 0) + return x + + def remove_pretraining_modules(self, keep_decoder=False): + if not keep_decoder: + self.decoder = None + + +def get_annealed_rate(start, end, curr_step, total_steps): + if curr_step >= total_steps: + return end + r = end - start + pct_remaining = 1 - curr_step / total_steps + return end - r * pct_remaining + + +# adapted from MAE +def random_masking(x, mask_ratio, mask_seed: Optional[MaskSeed]): + N, L, D = x.shape # batch, length, dim + len_keep = int(L * (1 - mask_ratio)) + + generator = None + if mask_seed is not None: + seed = int( + hash((mask_seed.seed, mask_seed.update, mask_seed.ids.sum().item())) % 1e6 + ) + generator = torch.Generator(device=x.device) + generator.manual_seed(seed) + + noise = torch.rand(N, L, generator=generator, device=x.device) # noise in [0, 1] + + # sort noise for each sample + ids_shuffle = noise.argsort(dim=1) # ascend: small is keep, large is remove + ids_restore = ids_shuffle.argsort(dim=1) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D) + x_unmasked = torch.gather(x, dim=1, index=ids_keep) + + # generate the binary mask: 0 is keep, 1 is remove + mask = torch.ones([N, L], dtype=x.dtype, device=x.device) + mask[:, :len_keep] = 0 + # unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + ids_restore = ids_restore.unsqueeze(-1).expand(-1, -1, D) + + return MaskInfo( + x_unmasked=x_unmasked, mask=mask, ids_restore=ids_restore, ids_keep=ids_keep + ) + + +def gather_unmasked(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor: + return torch.gather( + x, + dim=1, + index=mask_info.ids_keep, + ) + + +def gather_unmasked_mask(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor: + return torch.gather( + x, + dim=1, + index=mask_info.ids_keep[..., 0], # ignore the feature dimension + ) + + +def get_alibi( + max_positions: int, + attention_heads: int, + dims: int = 1, + distance: str = "manhattan", +): + def get_slopes(n): + def get_slopes_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 
3))) + ratio = start + return [start * ratio**i for i in range(n)] + + # In the paper, we only train models that have 2^a heads for some + # a. This function has some good properties that only occur when + # the input is a power of 2. To maintain that even when the number + # of heads is not a power of 2, we use this workaround. + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + maxpos = max_positions + attn_heads = attention_heads + slopes = torch.Tensor(get_slopes(attn_heads)) + + if dims == 1: + # prepare alibi position linear bias. Note that wav2vec2 is non + # autoregressive model so we want a symmetric mask with 0 on the + # diagonal and other wise linear decreasing valuees + pos_bias = ( + torch.abs( + torch.arange(maxpos).unsqueeze(0) - torch.arange(maxpos).unsqueeze(1) + ) + * -1 + ) + elif dims == 2: + if distance == "manhattan": + df = lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2) + elif distance == "euclidean": + df = lambda x1, y1, x2, y2: math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2) + + n = math.sqrt(max_positions) + assert n.is_integer(), n + n = int(n) + + pos_bias = torch.zeros((max_positions, max_positions)) + + for i in range(n): + for j in range(n): + for k in range(n): + for l in range(n): + new_x = i * n + j + new_y = k * n + l + pos_bias[new_x, new_y] = -df(i, j, k, l) + + else: + raise Exception(f"unsupported number of alibi dims: {dims}") + + alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand( + attn_heads, -1, -1 + ) + + return alibi_bias + + +def get_alibi_bias( + alibi_biases, + batch_size, + time_steps, + heads, + dtype, + device, + dims=1, + distance="manhattan", +): + cache_key = f"{dims}_{heads}_{distance}" + + buffered = alibi_biases.get(cache_key, None) + + target_size = heads * batch_size + if ( + buffered is None + or buffered.size(0) < target_size + or buffered.size(1) < time_steps + or buffered.dtype != dtype + or buffered.device != device + ): + bt = max(time_steps, buffered.size(1) if buffered is not None else 0) + bn = max(target_size, buffered.size(0) if buffered is not None else 0) // heads + + buffered = ( + get_alibi(bt, heads, dims=dims, distance=distance) + .to(dtype=dtype, device=device) + .repeat(bn, 1, 1) + ) + + alibi_biases[cache_key] = buffered + + b = buffered[:target_size, :time_steps, :time_steps] + b = b.view(batch_size, heads, time_steps, time_steps) + return b + + +def _learned_alibi_bias( + alibi_bias, + batch_size, + time_steps, + heads, + scale, + dtype, + device, +): + assert alibi_bias.size(1) == heads, alibi_bias.shape + assert alibi_bias.dtype == dtype, alibi_bias.dtype + assert alibi_bias.device == device, alibi_bias.device + + if alibi_bias.size(-1) < time_steps: + psz = math.ceil((time_steps - alibi_bias.size(-1)) / 2) + alibi_bias = F.pad(alibi_bias, (psz, psz, psz, psz), mode="replicate") + + alibi_bias = alibi_bias.expand(batch_size, -1, -1, -1) * scale + return alibi_bias[..., :time_steps, :time_steps] + + +def masked_alibi(alibi_bias, mask_info): + H = alibi_bias.size(1) + + orig_bias = alibi_bias + + index = mask_info.ids_keep.unsqueeze(1)[..., 0].unsqueeze(-1) + alibi_bias = torch.gather( + orig_bias, + dim=-2, + index=index.expand(-1, H, -1, mask_info.ids_restore.size(1)), + ) + alibi_bias = torch.gather( + alibi_bias, + dim=-1, + index=index.transpose(-1, -2).expand(-1, H, 
alibi_bias.size(-2), -1), + ) + + return alibi_bias diff --git a/examples/data2vec/models/modalities/images.py b/examples/data2vec/models/modalities/images.py new file mode 100644 index 0000000000..a6b738cb07 --- /dev/null +++ b/examples/data2vec/models/modalities/images.py @@ -0,0 +1,256 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from functools import partial +from dataclasses import dataclass +from typing import Callable, Dict, Optional +from timm.models.layers import to_2tuple +from fairseq.tasks import FairseqTask +from examples.data2vec.models.mae import get_2d_sincos_pos_embed, PatchEmbed +from .base import ( + D2vModalityConfig, + ModalitySpecificEncoder, + get_alibi_bias, + MaskSeed, +) +from .modules import ( + BlockEncoder, + Decoder2d, + FixedPositionalEncoder, + TransformerDecoder, + EncDecTransformerDecoder, +) +from examples.data2vec.data.modality import Modality + + +@dataclass +class D2vImageConfig(D2vModalityConfig): + type: Modality = Modality.IMAGE + + input_size: int = 224 + in_chans: int = 3 + patch_size: int = 16 + embed_dim: int = 768 + + alibi_dims: int = 2 + alibi_distance: str = "manhattan" + + fixed_positions: bool = True + + transformer_decoder: bool = False + enc_dec_transformer: bool = False + + +class ImageEncoder(ModalitySpecificEncoder): + + modality_cfg: D2vImageConfig + + def __init__( + self, + modality_cfg: D2vImageConfig, + embed_dim: int, + make_block: Callable[[float, Optional[int], Optional[int]], nn.ModuleList], + norm_layer: Callable[[int], nn.LayerNorm], + layer_norm_first: bool, + alibi_biases: Dict, + task: Optional[FairseqTask], + ): + + img_size = to_2tuple(modality_cfg.input_size) + patch_size = to_2tuple(modality_cfg.patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + + local_encoder = PatchEmbed( + modality_cfg.input_size, + modality_cfg.patch_size, + modality_cfg.in_chans, + modality_cfg.embed_dim, + ) + + w = local_encoder.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + if modality_cfg.embed_dim != embed_dim: + local_encoder = nn.Sequential( + local_encoder, + nn.Linear(modality_cfg.embed_dim, embed_dim), + ) + + project_features = nn.Identity() + + pos_embed = nn.Parameter( + torch.zeros(1, num_patches, embed_dim), requires_grad=False + ) + + side_n = int(num_patches ** 0.5) + + emb = get_2d_sincos_pos_embed( + pos_embed.shape[-1], + side_n, + cls_token=False, + ) + pos_embed.data.copy_(torch.from_numpy(emb).float().unsqueeze(0)) + fixed_positional_encoder = ( + FixedPositionalEncoder(pos_embed) if modality_cfg.fixed_positions else None + ) + + dpr = np.linspace( + modality_cfg.start_drop_path_rate, + modality_cfg.end_drop_path_rate, + modality_cfg.prenet_depth, + ) + + context_encoder = BlockEncoder( + nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)), + norm_layer(embed_dim) if not layer_norm_first else None, + layer_norm_first, + modality_cfg.prenet_layerdrop, + modality_cfg.prenet_dropout, + ) + + if modality_cfg.transformer_decoder: + if modality_cfg.enc_dec_transformer: + decoder = EncDecTransformerDecoder(modality_cfg.decoder, embed_dim) + else: + dec_enc = BlockEncoder( + nn.ModuleList( + make_block(0, modality_cfg.decoder.decoder_dim, 8) + for _ in range(modality_cfg.decoder.decoder_layers) + ), + 
None, + layer_norm_first, + 0, + 0, + ) + decoder = TransformerDecoder(modality_cfg.decoder, embed_dim, dec_enc) + else: + decoder = ( + Decoder2d(modality_cfg.decoder, embed_dim, side_n, side_n) + if modality_cfg.decoder is not None + else None + ) + + alibi_bias_fn = partial( + get_alibi_bias, + alibi_biases=alibi_biases, + heads=modality_cfg.num_alibi_heads, + dims=modality_cfg.alibi_dims, + distance=modality_cfg.alibi_distance, + ) + + super().__init__( + modality_cfg=modality_cfg, + embed_dim=embed_dim, + local_encoder=local_encoder, + project_features=project_features, + fixed_positional_encoder=fixed_positional_encoder, + relative_positional_encoder=None, + context_encoder=context_encoder, + decoder=decoder, + get_alibi_bias=alibi_bias_fn, + ) + + def reset_parameters(self): + super().reset_parameters() + if self.decoder is not None: + self.decoder.reset_parameters() + + @torch.no_grad() + def patchify(self, imgs): + """ + imgs: (N, 3, H, W) + x: (N, L, patch_size**2 *3) + """ + p = self.modality_cfg.patch_size + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x = torch.einsum("nchpwq->nhwpqc", x) + x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * 3)) + + return x + + @torch.no_grad() + def unpatchify(self, x): + """ + x: (N, L, patch_size**2 *3) + imgs: (N, 3, H, W) + """ + p = self.modality_cfg.patch_size + h = w = int(x.shape[1] ** 0.5) + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, 3)) + x = torch.einsum("nhwpqc->nchpwq", x) + imgs = x.reshape(shape=(x.shape[0], 3, h * p, h * p)) + return imgs + + def compute_mask( + self, + x, + padding_mask, + mask_seed: Optional[MaskSeed], + apply, + shape=None, + precomputed_mask=None, + ): + mlen = self.modality_cfg.mask_length + if mlen <= 1: + return super().compute_mask( + x, padding_mask, mask_seed, apply, precomputed_mask + ) + + if precomputed_mask is not None: + mask = precomputed_mask + else: + from fairseq.data.data_utils import compute_block_mask_2d + + if shape is not None: + B, L, D = shape + else: + B, L, D = x.shape + + mask = compute_block_mask_2d( + shape=(B, L), + mask_prob=self.modality_cfg.mask_prob, + mask_length=self.modality_cfg.mask_length, + mask_prob_adjust=self.modality_cfg.mask_prob_adjust, + inverse_mask=self.modality_cfg.inverse_mask, + require_same_masks=True, + mask_dropout=self.modality_cfg.mask_dropout, + ) + + mask_info = self.make_maskinfo(x, mask, shape) + if apply: + x = self.apply_mask(x, mask_info) + + return x, mask_info + + def decoder_input(self, x, mask_info): + if ( + not self.modality_cfg.transformer_decoder + or not self.modality_cfg.enc_dec_transformer + ): + return super().decoder_input(x, mask_info) + + inp_drop = self.modality_cfg.decoder.input_dropout + if inp_drop > 0: + x = F.dropout(x, inp_drop, training=self.training, inplace=True) + + kv = x[:, self.modality_cfg.num_extra_tokens :] + + assert self.fixed_positional_encoder is not None + pos = self.fixed_positional_encoder(x, None).expand(x.size(0), -1, -1) + + mask = mask_info.mask.bool() + if self.modality_cfg.decoder.add_positions_all: + kv = kv + pos[~mask].view(kv.shape) + + q = pos[mask].view(x.size(0), -1, x.size(-1)) + + return q, kv diff --git a/examples/data2vec/models/modalities/modules.py b/examples/data2vec/models/modalities/modules.py new file mode 100644 index 0000000000..a4e1a4ea07 --- /dev/null +++ b/examples/data2vec/models/modalities/modules.py @@ -0,0 +1,589 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from dataclasses import dataclass +from fairseq.modules import ( + LayerNorm, + SamePad, + SamePad2d, + TransposeLast, +) + + +@dataclass +class D2vDecoderConfig: + decoder_dim: int = 384 + decoder_groups: int = 16 + decoder_kernel: int = 5 + decoder_layers: int = 5 + input_dropout: float = 0.1 + + add_positions_masked: bool = False + add_positions_all: bool = False + + decoder_residual: bool = True + projection_layers: int = 1 + projection_ratio: float = 2.0 + + +class FixedPositionalEncoder(nn.Module): + def __init__(self, pos_embed): + super().__init__() + self.positions = pos_embed + + def forward(self, x, padding_mask): + return self.positions + + +class TextFeatPositionalEncoder(nn.Module): + """ + Original encoder expects (B, T) long input. This module wraps it to take + local_encoder output which are (B, T, D) float tensors + """ + + def __init__(self, pos_encoder): + super().__init__() + self.pos_encoder = pos_encoder + + def forward(self, x, padding_mask): + # assume padded token embeddings are 0s + # TODO: consider using padding_mask as input + return self.pos_encoder(x[..., 0]) + + +class BlockEncoder(nn.Module): + def __init__(self, blocks, norm_layer, layer_norm_first, layerdrop, dropout): + super().__init__() + self.blocks = blocks + self.norm = norm_layer + self.layer_norm_first = layer_norm_first + self.layerdrop = layerdrop + self.dropout = nn.Dropout(dropout, inplace=True) + + def forward(self, x, padding_mask, alibi_bias, alibi_scale): + if self.norm is not None and not self.layer_norm_first: + x = self.norm(x) + + x = self.dropout(x) + + for i, blk in enumerate(self.blocks): + if ( + not self.training + or self.layerdrop == 0 + or (np.random.random() > self.layerdrop) + ): + ab = alibi_bias + if ab is not None and alibi_scale is not None: + scale = ( + alibi_scale[i] + if alibi_scale.size(0) > 1 + else alibi_scale.squeeze(0) + ) + ab = ab * scale.type_as(ab) + x, _ = blk(x, padding_mask, ab) + + if self.norm is not None and self.layer_norm_first: + x = self.norm(x) + + return x + + +class DecoderBase(nn.Module): + decoder_cfg: D2vDecoderConfig + + def __init__(self, cfg: D2vDecoderConfig): + super().__init__() + + self.decoder_cfg = cfg + + def reset_parameters(self): + for mod in self.proj.modules(): + if isinstance(mod, nn.Linear): + mod.reset_parameters() + + def add_residual(self, x, residual, i, mask_info): + if ( + residual is None + or not self.decoder_cfg.decoder_residual + or residual.size(1) != x.size(1) + ): + return x + + ret = x + residual + + return ret + + +class Decoder1d(DecoderBase): + def __init__(self, cfg: D2vDecoderConfig, input_dim): + super().__init__(cfg) + + def make_block(in_dim): + block = [ + nn.Conv1d( + in_dim, + cfg.decoder_dim, + kernel_size=cfg.decoder_kernel, + padding=cfg.decoder_kernel // 2, + groups=cfg.decoder_groups, + ), + SamePad(cfg.decoder_kernel), + TransposeLast(), + LayerNorm(cfg.decoder_dim, elementwise_affine=False), + TransposeLast(), + nn.GELU(), + ] + + return nn.Sequential(*block) + + self.blocks = nn.Sequential( + *[ + make_block(input_dim if i == 0 else cfg.decoder_dim) + for i in range(cfg.decoder_layers) + ] + ) + + projs = [] + curr_dim = cfg.decoder_dim + for i in range(cfg.projection_layers - 1): + next_dim = int(curr_dim * cfg.projection_ratio) if i == 0 else curr_dim + 
projs.append(nn.Linear(curr_dim, next_dim)) + projs.append(nn.GELU()) + curr_dim = next_dim + projs.append(nn.Linear(curr_dim, input_dim)) + if len(projs) == 1: + self.proj = projs[0] + else: + self.proj = nn.Sequential(*projs) + + def forward(self, x, mask_info): + + x = x.transpose(1, 2) + + residual = x + + for i, layer in enumerate(self.blocks): + x = layer(x) + x = self.add_residual(x, residual, i, mask_info) + residual = x + + x = x.transpose(1, 2) + x = self.proj(x) + return x + + +class Decoder2d(DecoderBase): + def __init__(self, cfg: D2vDecoderConfig, input_dim, h_size, w_size): + super().__init__(cfg) + + self.h_size = h_size + self.w_size = w_size + + def make_block(in_dim): + block = [ + nn.Conv2d( + in_dim, + cfg.decoder_dim, + kernel_size=cfg.decoder_kernel, + padding=cfg.decoder_kernel // 2, + groups=cfg.decoder_groups, + ), + SamePad2d(cfg.decoder_kernel), + TransposeLast(tranpose_dim=-3), + LayerNorm(cfg.decoder_dim, elementwise_affine=False), + TransposeLast(tranpose_dim=-3), + nn.GELU(), + ] + + return nn.Sequential(*block) + + self.blocks = nn.Sequential( + *[ + make_block(input_dim if i == 0 else cfg.decoder_dim) + for i in range(cfg.decoder_layers) + ] + ) + + self.proj = nn.Linear(cfg.decoder_dim, input_dim) + + def forward(self, x, mask_info): + B, T, C = x.shape + + x = x.transpose(1, 2).reshape(B, C, self.h_size, self.w_size) + + residual = x + + for i, layer in enumerate(self.blocks): + x = layer(x) + x = self.add_residual(x, residual, i, mask_info) + residual = x + + x = x.reshape(B, -1, T).transpose(1, 2) + x = self.proj(x) + return x + + +class TransformerDecoder(nn.Module): + decoder_cfg: D2vDecoderConfig + + def __init__(self, cfg: D2vDecoderConfig, input_dim, encoder): + super().__init__() + + self.decoder_cfg = cfg + + self.input_proj = nn.Linear(input_dim, cfg.decoder_dim) + + self.encoder = encoder + + self.proj = nn.Linear(cfg.decoder_dim, input_dim) + + def reset_parameters(self): + from fairseq.modules.transformer_sentence_encoder import init_bert_params + + self.apply(init_bert_params) + + def forward(self, x, mask_info): + x = self.input_proj(x) + x = self.encoder(x, None, None, 1) + x = self.proj(x) + return x + + +class AltBlock(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + mlp_drop=0.0, + post_mlp_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + layer_norm_first=True, + ffn_targets=False, + cosine_attention=False, + ): + super().__init__() + + self.layer_norm_first = layer_norm_first + self.ffn_targets = ffn_targets + + from timm.models.vision_transformer import DropPath, Mlp + + self.norm1 = norm_layer(dim) + self.attn = AltAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + cosine_attention=cosine_attention, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=mlp_drop, + ) + self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False) + + def forward(self, x, padding_mask=None, alibi_bias=None): + if self.layer_norm_first: + x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias)) + r = x = self.mlp(self.norm2(x)) + t = x + x = r + self.drop_path(self.post_mlp_dropout(x)) + if not self.ffn_targets: + t = x + else: + x = x + 
self.drop_path(self.attn(x, padding_mask, alibi_bias)) + r = x = self.norm1(x) + x = self.mlp(x) + t = x + x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x))) + if not self.ffn_targets: + t = x + + return x, t + + +class AltAttention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + cosine_attention=False, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.cosine_attention = cosine_attention + + if cosine_attention: + self.logit_scale = nn.Parameter( + torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True + ) + + def forward(self, x, padding_mask=None, alibi_bias=None): + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) # qkv x B x H x L x D + ) + q, k, v = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + dtype = q.dtype + + if self.cosine_attention: + # cosine attention + attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1) + logit_scale = torch.clamp( + self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01)) + ).exp() + attn = attn * logit_scale + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + if alibi_bias is not None: + attn = attn.type_as(alibi_bias) + attn[:, : alibi_bias.size(1)] += alibi_bias + + if padding_mask is not None and padding_mask.any(): + attn = attn.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + + attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype) + attn = self.attn_drop(attn) + x = (attn @ v).transpose(1, 2) # + x = x.reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class EncDecAttention(nn.Module): + def __init__( + self, + q_dim, + kv_dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + cosine_attention=False, + ): + super().__init__() + self.num_heads = num_heads + head_dim = q_dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q_proj = nn.Linear(q_dim, q_dim, bias=qkv_bias) + self.kv_proj = nn.Linear(kv_dim, 2 * q_dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(q_dim, q_dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.cosine_attention = cosine_attention + + if cosine_attention: + self.logit_scale = nn.Parameter( + torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True + ) + + def forward(self, q, kv, padding_mask=None, alibi_bias=None): + B, N, C = q.shape + + q = ( + self.q_proj(q) + .reshape(B, N, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) # B x H x L x D + kv = ( + self.kv_proj(kv) + .reshape(B, -1, 2, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) # kv x B x H x L x D + k, v = ( + kv[0], + kv[1], + ) # make torchscript happy (cannot use tensor as tuple) + + dtype = q.dtype + + if self.cosine_attention: + # cosine attention + attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1) + logit_scale = torch.clamp( + self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01)) + ).exp() + attn = attn * logit_scale + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + if alibi_bias is not None: + attn 
= attn.type_as(alibi_bias) + attn[:, : alibi_bias.size(1)] += alibi_bias + + if padding_mask is not None and padding_mask.any(): + attn = attn.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + + attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype) + attn = self.attn_drop(attn) + x = (attn @ v).transpose(1, 2) # + x = x.reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class EncDecBlock(nn.Module): + def __init__( + self, + q_dim, + kv_dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + mlp_drop=0.0, + post_mlp_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + layer_norm_first=True, + cosine_attention=False, + first_residual=True, + ): + super().__init__() + + self.layer_norm_first = layer_norm_first + + from timm.models.vision_transformer import DropPath, Mlp + + self.norm1 = norm_layer(q_dim) + self.attn = EncDecAttention( + q_dim, + kv_dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + cosine_attention=cosine_attention, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(q_dim) + mlp_hidden_dim = int(q_dim * mlp_ratio) + self.mlp = Mlp( + in_features=q_dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=mlp_drop, + ) + self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False) + self.first_residual = first_residual + + def forward(self, q, kv, padding_mask=None, alibi_bias=None): + r = q if self.first_residual else 0 + if self.layer_norm_first: + x = r + self.drop_path( + self.attn(self.norm1(q), kv, padding_mask, alibi_bias) + ) + r = x = self.mlp(self.norm2(x)) + x = r + self.drop_path(self.post_mlp_dropout(x)) + else: + x = r + self.drop_path(self.attn(q, kv, padding_mask, alibi_bias)) + r = x = self.norm1(x) + x = self.mlp(x) + x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x))) + + return x + + +class EncDecTransformerDecoder(nn.Module): + def __init__(self, cfg: D2vDecoderConfig, input_dim): + super().__init__() + + self.input_proj = nn.Linear(input_dim, cfg.decoder_dim) + + self.blocks = nn.Sequential( + *[ + EncDecBlock( + q_dim=cfg.decoder_dim, + kv_dim=input_dim, + num_heads=8, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + mlp_drop=0.0, + post_mlp_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + layer_norm_first=False, + cosine_attention=False, + first_residual=i > 0, + ) + for i in range(cfg.decoder_layers) + ] + ) + + self.proj = nn.Linear(cfg.decoder_dim, input_dim) + + def reset_parameters(self): + from fairseq.modules.transformer_sentence_encoder import init_bert_params + + self.apply(init_bert_params) + + def forward(self, x, kv): + x = self.input_proj(x) + for i, layer in enumerate(self.blocks): + x = layer(x, kv) + + x = self.proj(x) + return x diff --git a/examples/data2vec/models/modalities/text.py b/examples/data2vec/models/modalities/text.py new file mode 100644 index 0000000000..adfac1ca48 --- /dev/null +++ b/examples/data2vec/models/modalities/text.py @@ -0,0 +1,161 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
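+# Text modality encoder: TextLocalEncoder embeds tokens (with optional learned
+# positional embeddings), and the shared pre-net transformer blocks contextualize them.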
+ +import math +from dataclasses import dataclass +from functools import partial +from typing import Callable, Dict, Optional + +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from fairseq.modules import PositionalEmbedding, FairseqDropout, LayerNorm +from fairseq.tasks import FairseqTask +from .base import D2vModalityConfig, ModalitySpecificEncoder, get_alibi_bias +from .modules import BlockEncoder, Decoder1d +from examples.data2vec.data.modality import Modality + + +@dataclass +class D2vTextConfig(D2vModalityConfig): + type: Modality = Modality.TEXT + max_source_positions: int = 512 + learned_pos: bool = True + dropout: float = 0.1 # used for both local_encoder and contextualized encoder. tied with global transformer in data2vec_text + + no_scale_embedding: bool = True + layernorm_embedding: bool = True + no_token_positional_embeddings: bool = False + + +class TextEncoder(ModalitySpecificEncoder): + + modality_cfg: D2vTextConfig + + def __init__( + self, + modality_cfg: D2vTextConfig, + embed_dim: int, + make_block: Callable[[float], nn.ModuleList], + norm_layer: Callable[[int], nn.LayerNorm], + layer_norm_first: bool, + alibi_biases: Dict, + task: Optional[FairseqTask], + ): + self.pad_idx = task.source_dictionary.pad() + self.vocab_size = len(task.source_dictionary) + + local_encoder = TextLocalEncoder( + vocab_size=self.vocab_size, + embed_dim=embed_dim, + max_source_positions=modality_cfg.max_source_positions, + pad_idx=self.pad_idx, + no_scale_embedding=modality_cfg.no_scale_embedding, + layernorm_embedding=modality_cfg.layernorm_embedding, + dropout=modality_cfg.dropout, + no_token_positional_embeddings=modality_cfg.no_token_positional_embeddings, + learned_pos=modality_cfg.learned_pos, + ) + dpr = np.linspace( + modality_cfg.start_drop_path_rate, + modality_cfg.end_drop_path_rate, + modality_cfg.prenet_depth, + ) + context_encoder = BlockEncoder( + nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)), + norm_layer(embed_dim) + if not layer_norm_first and modality_cfg.prenet_depth > 0 + else None, + layer_norm_first, + modality_cfg.prenet_layerdrop, + modality_cfg.prenet_dropout if modality_cfg.prenet_depth > 0 else 0.0, + ) + decoder = ( + Decoder1d(modality_cfg.decoder, embed_dim) + if modality_cfg.decoder is not None + else None + ) + + alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases) + + super().__init__( + modality_cfg=modality_cfg, + embed_dim=embed_dim, + local_encoder=local_encoder, + project_features=nn.Identity(), + fixed_positional_encoder=None, + relative_positional_encoder=None, + context_encoder=context_encoder, + decoder=decoder, + get_alibi_bias=alibi_bias_fn, + ) + + def reset_parameters(self): + super().reset_parameters() + + def convert_padding_mask(self, x, padding_mask): + if padding_mask is None or padding_mask.size(1) == x.size(1): + return padding_mask + + diff = self.downsample - padding_mask.size(1) % self.downsample + if 0 < diff < self.downsample: + padding_mask = F.pad(padding_mask, (0, diff), value=True) + + padding_mask = padding_mask.view(padding_mask.size(0), -1, self.downsample) + padding_mask = padding_mask.all(-1) + if padding_mask.size(1) > x.size(1): + padding_mask = padding_mask[:, : x.size(1)] + + assert x.size(1) == padding_mask.size( + 1 + ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}" + + return padding_mask + + +class TextLocalEncoder(nn.Module): + def __init__( + self, + vocab_size, + embed_dim, + max_source_positions, + pad_idx, + no_scale_embedding, + 
layernorm_embedding, + dropout, + no_token_positional_embeddings, + learned_pos, + ): + super().__init__() + self.pad_idx = pad_idx + self.dropout_module = FairseqDropout(dropout) + + self.embed_tokens = nn.Embedding(vocab_size, embed_dim, pad_idx) + self.embed_scale = 1.0 if no_scale_embedding else math.sqrt(embed_dim) + self.embed_positions = ( + PositionalEmbedding( + max_source_positions, + embed_dim, + pad_idx, + learned=learned_pos, + ) + if not no_token_positional_embeddings + else None + ) + self.embed_scale = 1.0 if no_scale_embedding else math.sqrt(embed_dim) + + self.layernorm_embedding = None + if layernorm_embedding: + self.layernorm_embedding = LayerNorm(embed_dim) + + def forward(self, src_tokens): + x = self.embed_scale * self.embed_tokens(src_tokens) + if self.embed_positions is not None: + x = x + self.embed_positions(src_tokens) + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + x = self.dropout_module(x) + return x diff --git a/examples/data2vec/models/utils.py b/examples/data2vec/models/utils.py new file mode 100644 index 0000000000..0e2f240d4f --- /dev/null +++ b/examples/data2vec/models/utils.py @@ -0,0 +1,55 @@ +import math +import torch + +def get_alibi( + max_positions: int, + attention_heads: int, +): + def get_slopes(n): + def get_slopes_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + # In the paper, we only train models that have 2^a heads for some + # a. This function has some good properties that only occur when + # the input is a power of 2. To maintain that even when the number + # of heads is not a power of 2, we use this workaround. + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + maxpos = max_positions + attn_heads = attention_heads + slopes = torch.Tensor(get_slopes(attn_heads)) + # prepare alibi position linear bias. Note that wav2vec2 is non + # autoregressive model so we want a symmetric mask with 0 on the + # diagonal and other wise linear decreasing valuees + pos_bias = ( + torch.abs( + torch.arange(maxpos).unsqueeze(0) - torch.arange(maxpos).unsqueeze(1) + ) + * -1 + ) + alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand( + attn_heads, -1, -1 + ) + return alibi_bias + +def masked_alibi(alibi_bias, mask_indices, orig_B, orig_T): + alibi_bias = alibi_bias.view(orig_B, -1, orig_T, orig_T) + H = alibi_bias.size(1) + alibi_mask = mask_indices.unsqueeze(1) + alibi_bias = alibi_bias.masked_select(alibi_mask.unsqueeze(-1)) + alibi_bias = alibi_bias.view(orig_B, H, -1, orig_T) + M = alibi_bias.size(-2) + alibi_bias = alibi_bias.masked_select(alibi_mask.unsqueeze(-2)) + alibi_bias = alibi_bias.view(-1, M, M) + return alibi_bias + + diff --git a/examples/data2vec/scripts/convert_audioset_labels.py b/examples/data2vec/scripts/convert_audioset_labels.py new file mode 100644 index 0000000000..7d720e606a --- /dev/null +++ b/examples/data2vec/scripts/convert_audioset_labels.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
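+# Convert an AudioSet segments csv into tab-separated (start, end, labels) entries,
+# one line per item in a wav2vec-style manifest, remapping labels through the
+# provided descriptor file.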
+ +import argparse +import os + + +def get_parser(): + parser = argparse.ArgumentParser(description="convert audioset labels") + # fmt: off + parser.add_argument('in_file', help='audioset csv file to convert') + parser.add_argument('--manifest', required=True, metavar='PATH', help='wav2vec-like manifest') + parser.add_argument('--descriptors', required=True, metavar='PATH', help='path to label descriptor file') + parser.add_argument('--output', required=True, metavar='PATH', help='where to output converted labels') + # fmt: on + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + label_descriptors = {} + with open(args.descriptors, "r") as ldf: + next(ldf) + for line in ldf: + if line.strip() == "": + continue + + items = line.split(",") + assert len(items) > 2, line + idx = items[0] + lbl = items[1] + assert lbl not in label_descriptors, lbl + label_descriptors[lbl] = idx + + labels = {} + with open(args.in_file, "r") as ifd: + for line in ifd: + if line.lstrip().startswith("#"): + continue + items = line.rstrip().split(",") + id = items[0].strip() + start = items[1].strip() + end = items[2].strip() + lbls = [label_descriptors[it.strip(' "')] for it in items[3:]] + labels[id] = [start, end, ",".join(lbls)] + + with open(args.manifest, "r") as mf, open(args.output, "w") as of: + next(mf) + for line in mf: + path, _ = line.split("\t") + id = os.path.splitext(os.path.basename(path))[0] + lbl = labels[id] + print("\t".join(lbl), file=of) + + +if __name__ == "__main__": + main() diff --git a/examples/data2vec/scripts/multi/finetune_all_fair_aws_local_lr.sh b/examples/data2vec/scripts/multi/finetune_all_fair_aws_local_lr.sh new file mode 100755 index 0000000000..41bcd31fc5 --- /dev/null +++ b/examples/data2vec/scripts/multi/finetune_all_fair_aws_local_lr.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -eu + +job_id="$1" +task_id="$2" +dir="$3" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -d afterok:$job_id -o $dir/log/decode_sweep_%A.out" +sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh $dir + diff --git a/examples/data2vec/scripts/multi/finetune_all_fair_aws_local_lr_nodep.sh b/examples/data2vec/scripts/multi/finetune_all_fair_aws_local_lr_nodep.sh new file mode 100644 index 0000000000..fc85908b72 --- /dev/null +++ b/examples/data2vec/scripts/multi/finetune_all_fair_aws_local_lr_nodep.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -eu + +dir="$1" + +echo "dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -o $dir/log/decode_sweep_%A.out" +sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh $dir + diff --git a/examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh b/examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh new file mode 100755 index 0000000000..121226972b --- /dev/null +++ b/examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks 
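+# Associative array mapping each GLUE task to its binarized data directory (cluster-specific paths).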
+tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" +tasks[mnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MNLI-bin" +tasks[qqp]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QQP-bin" +tasks[sts_b]="/fsx-wav2vec/abaevski/data/nlp/GLUE/STS-B-bin" + +lrs=(5e-6 8e-6 1e-5 2e-5) + +for task data_path in ${(kv)tasks}; do + for lr in $lrs; do + echo $lr $task + PYTHONPATH=. PREFIX="${PREFIX}" SUFFIX="" \ + python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/multi/text_finetuning \ + --config-name $task +run_config=local task.data="$data_path" common.log_interval=200 dataset.num_workers=1 \ + model.model_path="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" +model=text_wrap + done +done diff --git a/examples/data2vec/scripts/text/finetune_all_char_fair_aws_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_char_fair_aws_local_lr.sh new file mode 100755 index 0000000000..18b862c240 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_char_fair_aws_local_lr.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eu + +job_id="$1" +task_id="$2" +dir="$3" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -d afterok:$job_id -o $dir/log/ft_%A.out" +sbatch_args="$sbatch_args -e $dir/log/ft_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_char_fair_local_lr.sh $dir diff --git a/examples/data2vec/scripts/text/finetune_all_fair.sh b/examples/data2vec/scripts/text/finetune_all_fair.sh new file mode 100755 index 0000000000..34a2df3990 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env zsh + +job_id=$1 +task_id=$2 +dir="$3" +cp="$dir/$task_id/checkpoints/checkpoint_last.pt" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +declare -A tasks +tasks[cola]="/private/home/jgu/data/GLUE/CoLA-bin" +tasks[qnli]="/private/home/jgu/data/GLUE/QNLI-bin" +tasks[mrpc]="/private/home/jgu/data/GLUE/MRPC-bin" +tasks[rte]="/private/home/jgu/data/GLUE/RTE-bin" +tasks[sst_2]="/private/home/jgu/data/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" +hydra.launcher.additional_parameters.dependency="afterok:$job_id" hydra.sweep.dir="$dir/finetune/$task" & +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_aws.sh b/examples/data2vec/scripts/text/finetune_all_fair_aws.sh new file mode 100755 index 0000000000..b417c20024 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_aws.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env zsh + +job_id=$1 +task_id=$2 +dir="$3" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + PYTHONPATH=. PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" +hydra.launcher.additional_parameters.dependency="afterok:$job_id" hydra.sweep.dir="$dir/finetune/$task" & +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_aws_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_fair_aws_local_lr.sh new file mode 100755 index 0000000000..64dbcb111e --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_aws_local_lr.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eu + +job_id="$1" +task_id="$2" +dir="$3" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -d afterok:$job_id -o $dir/log/decode_sweep_%A.out" +sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh $dir diff --git a/examples/data2vec/scripts/text/finetune_all_fair_aws_lr.sh b/examples/data2vec/scripts/text/finetune_all_fair_aws_lr.sh new file mode 100755 index 0000000000..d75c549573 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_aws_lr.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env zsh + +job_id=$1 +task_id=$2 +dir="$3" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + for lr in 5e-6 8e-6 1e-5 2e-5 5e-5 8e-5 1e-4 2e-4; do + PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" +hydra.launcher.additional_parameters.dependency="afterok:$job_id" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" & + done +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh new file mode 100755 index 0000000000..8be98c0847 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +lrs=(5e-6 8e-6 1e-5 2e-5) + +for task data_path in ${(kv)tasks}; do + for lr in $lrs; do + echo $lr $task + PYTHONPATH=. PREFIX="${PREFIX}" SUFFIX="" \ + python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task +run_config=local task.data="$data_path" common.log_interval=200 dataset.num_workers=1 \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" + done +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_nodep.sh b/examples/data2vec/scripts/text/finetune_all_fair_nodep.sh new file mode 100755 index 0000000000..d02bcc0f75 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_nodep.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[cola]="/private/home/jgu/data/GLUE/CoLA-bin" +tasks[qnli]="/private/home/jgu/data/GLUE/QNLI-bin" +tasks[mrpc]="/private/home/jgu/data/GLUE/MRPC-bin" +tasks[rte]="/private/home/jgu/data/GLUE/RTE-bin" +tasks[sst_2]="/private/home/jgu/data/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + PYTHONPATH=. PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune/$task" & +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws.sh b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws.sh new file mode 100755 index 0000000000..75538354e1 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune/$task" & +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_local_lr.sh new file mode 100755 index 0000000000..16c1358b2f --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_local_lr.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -eu + +dir="$1" + +echo "dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -o $dir/log/decode_sweep_%A.out" +sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh $dir diff --git a/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr.sh b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr.sh new file mode 100755 index 0000000000..fb5ddbe22c --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + for lr in 5e-6 8e-6 1e-5 2e-5 5e-5 8e-5 1e-4 2e-4; do + PYTHONPATH=. PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" & + done +done diff --git a/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr_nopos.sh b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr_nopos.sh new file mode 100755 index 0000000000..1ffab1c850 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr_nopos.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +for task data_path in ${(kv)tasks}; do + for lr in 5e-6 8e-6 1e-5 2e-5 5e-5 8e-5 1e-4 2e-4; do + PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" +model.encoder_learned_pos=False & + done +done diff --git a/examples/data2vec/scripts/text/finetune_all_large_fair_aws_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_large_fair_aws_local_lr.sh new file mode 100755 index 0000000000..c3c58adcb8 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_large_fair_aws_local_lr.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -eu + +job_id="$1" +task_id="$2" +dir="$3" + +echo "job_id: $job_id, task_id: $task_id, dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -d afterok:$job_id -o $dir/log/decode_sweep_%A.out" +sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh $dir diff --git a/examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh new file mode 100644 index 0000000000..5efb00e0df --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" +tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" +tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" +tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" +tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" + +lrs=(5e-6 8e-6 1e-5 2e-5) + +for task data_path in ${(kv)tasks}; do + for lr in $lrs; do + echo $lr $task + PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" \ + python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task +run_config=local task.data="$data_path" common.log_interval=200 dataset.num_workers=1 \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" \ + model._name=roberta_large + done +done diff --git a/examples/data2vec/scripts/text/finetune_all_large_fair_nodep_aws_local_lr.sh b/examples/data2vec/scripts/text/finetune_all_large_fair_nodep_aws_local_lr.sh new file mode 100755 index 0000000000..4fb21bce79 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_all_large_fair_nodep_aws_local_lr.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -eu + +dir="$1" + +echo "dir: $dir" + +mkdir -p "$dir/log" +sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" +sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" +sbatch_args="$sbatch_args -o $dir/log/decode_sweep_%A.out" +sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" + +sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh $dir diff --git a/examples/data2vec/scripts/text/finetune_sst2_qnli_sweep_fair_nodep.sh b/examples/data2vec/scripts/text/finetune_sst2_qnli_sweep_fair_nodep.sh new file mode 100755 index 0000000000..d7b43bee80 --- /dev/null +++ b/examples/data2vec/scripts/text/finetune_sst2_qnli_sweep_fair_nodep.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env zsh + +dir="$1" +cp="$dir/checkpoints/checkpoint_last.pt" + +echo "dir: $dir" + +declare -A tasks +tasks[qnli]="/private/home/jgu/data/GLUE/QNLI-bin" +tasks[sst_2]="/private/home/jgu/data/GLUE/SST-2-bin" + +lrs="5e-6 1e-5 2e-5 5e-5 1e-4 2e-4 5e-4 1e-3" + +for task data_path in ${(kv)tasks}; do + for lr in $(echo "$lrs"); do + PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ + --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ + checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_sweep/$task/lr_$lr" "optimization.lr=[${lr}]" & + done +done diff --git a/examples/data2vec/scripts/text/glue.py b/examples/data2vec/scripts/text/glue.py new file mode 100644 index 0000000000..5382d31834 --- /dev/null +++ b/examples/data2vec/scripts/text/glue.py @@ -0,0 +1,34 @@ +from valids import parser, main as valids_main +import os.path as osp + + +args = parser.parse_args() +args.target = "valid_accuracy" +args.best_biggest = True +args.best = True +args.last = 0 +args.path_contains = None + +res = valids_main(args, print_output=False) + +grouped = {} +for k, v in res.items(): + k = osp.dirname(k) + run = osp.dirname(k) + task = osp.basename(k) + val = v["valid_accuracy"] + + if run not in grouped: + grouped[run] = {} + + grouped[run][task] = val + +for run, tasks in grouped.items(): + print(run) + avg = sum(float(v) for v in tasks.values()) / len(tasks) + avg_norte = sum(float(v) for k,v in tasks.items() if k != 'rte') / (len(tasks) -1) + try: + print(f"{tasks['cola']}\t{tasks['qnli']}\t{tasks['mrpc']}\t{tasks['rte']}\t{tasks['sst_2']}\t{avg:.2f}\t{avg_norte:.2f}") + except: + print(tasks) + print() diff --git a/examples/data2vec/scripts/text/glue_lr.py b/examples/data2vec/scripts/text/glue_lr.py new file mode 100644 index 0000000000..75bdfe0368 --- /dev/null +++ b/examples/data2vec/scripts/text/glue_lr.py @@ -0,0 +1,143 @@ +import os.path as osp +import re +from collections import defaultdict + +from valids import parser, main as valids_main + + +TASK_TO_METRIC = { + "cola": "mcc", + "qnli": "accuracy", + "mrpc": "acc_and_f1", + "rte": "accuracy", + "sst_2": "accuracy", + "mnli": "accuracy", + "qqp": "acc_and_f1", + "sts_b": "pearson_and_spearman", +} +TASKS = ["cola", "qnli", "mrpc", "rte", "sst_2", "mnli", "qqp", "sts_b"] + + +def get_best_stat_str(task_vals, show_subdir): + task_to_best_val = {} + task_to_best_dir = {} + for task, subdir_to_val in task_vals.items(): + task_to_best_val[task] = max(subdir_to_val.values()) + task_to_best_dir[task] = max(subdir_to_val.keys(), key=lambda x: subdir_to_val[x]) + + # import pdb; pdb.set_trace() + N1 = len(task_to_best_val) + N2 = len([k for k in task_to_best_val if k != "rte"]) + avg1 = sum(task_to_best_val.values()) / N1 + avg2 = sum(v for task, v in task_to_best_val.items() if task != "rte") / N2 + + try: + msg = "" + for task in TASKS: + dir = task_to_best_dir.get(task, 'null') + val = task_to_best_val.get(task, -100) + msg += f"({dir}, {val})\t" if show_subdir else f"{val}\t" + msg += f"{avg1:.2f}\t{avg2:.2f}" + except Exception as e: + msg = str(e) + msg += str(sorted(task_vals.items())) + return msg + +def get_all_stat_str(task_vals): + msg = "" + for task in [task for task in TASKS if task in task_vals]: + msg += f"=== {task}\n" + for subdir in sorted(task_vals[task].keys()): + msg += f"\t{subdir}\t{task_vals[task][subdir]}\n" + return msg + +def get_tabular_stat_str(task_vals): + """assume subdir is /run_*/0""" + msg = "" + for task in [task for task in TASKS if task in task_vals]: + msg += f"=== {task}\n" + param_to_runs = defaultdict(dict) + for subdir in task_vals[task]: + match = re.match("(.*)/(run_.*)/0", subdir) + assert match, "subdir" + param, run = match.groups() + param_to_runs[param][run] = 
task_vals[task][subdir] + params = sorted(param_to_runs, key=lambda x: float(x)) + runs = sorted(set(run for runs in param_to_runs.values() for run in runs)) + msg += ("runs:" + "\t".join(runs) + "\n") + msg += ("params:" + "\t".join(params) + "\n") + for param in params: + msg += "\t".join([str(param_to_runs[param].get(run, None)) for run in runs]) + msg += "\n" + # for subdir in sorted(task_vals[task].keys()): + # msg += f"\t{subdir}\t{task_vals[task][subdir]}\n" + return msg + + + +def main(): + parser.add_argument("--show_glue", action="store_true", help="show glue metric for each task instead of accuracy") + parser.add_argument("--print_mode", default="best", help="best|all|tabular") + parser.add_argument("--show_subdir", action="store_true", help="print the subdir that has the best results for each run") + parser.add_argument("--override_target", default="valid_accuracy", help="override target") + + args = parser.parse_args() + args.target = args.override_target + args.best_biggest = True + args.best = True + args.last = 0 + args.path_contains = None + + res = valids_main(args, print_output=False) + grouped_acc = {} + grouped_met = {} # use official metric for each task + for path, v in res.items(): + path = "/".join([args.base, path]) + path = re.sub("//*", "/", path) + match = re.match("(.*)finetune[^/]*/([^/]*)/(.*)", path) + if not match: + continue + run, task, subdir = match.groups() + + if run not in grouped_acc: + grouped_acc[run] = {} + grouped_met[run] = {} + if task not in grouped_acc[run]: + grouped_acc[run][task] = {} + grouped_met[run][task] = {} + + if v is not None: + grouped_acc[run][task][subdir] = float(v.get("valid_accuracy", -100)) + grouped_met[run][task][subdir] = float(v.get(f"valid_{TASK_TO_METRIC[task]}", -100)) + else: + print(f"{path} has None return") + + header = "\t".join(TASKS) + for run in sorted(grouped_acc): + print(run) + if args.print_mode == "all": + if args.show_glue: + print("===== GLUE =====") + print(get_all_stat_str(grouped_met[run])) + else: + print("===== ACC =====") + print(get_all_stat_str(grouped_acc[run])) + elif args.print_mode == "best": + print(f" {header}") + if args.show_glue: + print(f"GLEU: {get_best_stat_str(grouped_met[run], args.show_subdir)}") + else: + print(f"ACC: {get_best_stat_str(grouped_acc[run], args.show_subdir)}") + elif args.print_mode == "tabular": + if args.show_glue: + print("===== GLUE =====") + print(get_tabular_stat_str(grouped_met[run])) + else: + print("===== ACC =====") + print(get_tabular_stat_str(grouped_acc[run])) + else: + raise ValueError(args.print_mode) + print() + +if __name__ == "__main__": + main() diff --git a/examples/data2vec/scripts/text/unprocess_data.py b/examples/data2vec/scripts/text/unprocess_data.py new file mode 100644 index 0000000000..f1acb624b8 --- /dev/null +++ b/examples/data2vec/scripts/text/unprocess_data.py @@ -0,0 +1,188 @@ +import json +import os +import tqdm +from fairseq.data import Dictionary, data_utils + + +def load_dictionary(dict_path): + return Dictionary.load(dict_path) + +def load_dataset(split_path, src_dict): + dataset = data_utils.load_indexed_dataset( + split_path, + src_dict, + combine=False, # set to true for loading `train*` + ) + if dataset is None: + raise FileNotFoundError(f"Dataset not found: {split_path}") + return dataset + +def load_bpe(enc_path): + with open(enc_path) as f: + bpe2idx = json.load(f) + idx2bpe = {v: k for k, v in bpe2idx.items()} + return bpe2idx, idx2bpe + +def detokenize(tokens, src_dict, idx2bpe): + raw_inds = map(int, 
src_dict.string(tokens).split()) + raw_chrs = "".join([idx2bpe[raw_ind] for raw_ind in raw_inds]) + raw_chrs = raw_chrs.replace("\u0120", " ") + return raw_chrs + +def _main(src_root, src_dict_path, src_bpe_path, src_splits, tgt_root, tgt_splits): + src_dict = load_dictionary(src_dict_path) + bpe2idx, idx2bpe = load_bpe(src_bpe_path) + + assert len(src_splits) == len(tgt_splits) + for src_split, tgt_split in zip(src_splits, tgt_splits): + src_dataset = load_dataset(f"{src_root}/{src_split}", src_dict) + tgt_path = f"{tgt_root}/{tgt_split}.txt" + print(f"processing {src_split} (dump to {tgt_path})...") + os.makedirs(os.path.dirname(tgt_path), exist_ok=True) + with open(tgt_path, "w") as f: + for tokens in tqdm.tqdm(src_dataset): + raw_str = detokenize(tokens, src_dict, idx2bpe) + f.write(raw_str + "\n") + +def main_pt(): + src_root = "/datasets01/bookwiki_CC-NEWS_openwebtext_stories-mmap2-bin/121219/bookwiki_CC-NEWS_openwebtext_stories-mmap2-bin" + src_dict_path = f"{src_root}/dict.txt" + src_bpe_path = f"{src_root}/encoder.json" + src_splits = [ + "bookwiki_aml-mmap2-bin/shard0/train", + "bookwiki_aml-mmap2-bin/shard1/train", + "bookwiki_aml-mmap2-bin/shard2/train", + "bookwiki_aml-mmap2-bin/shard3/train", + "bookwiki_aml-mmap2-bin/shard4/train", + "bookwiki_aml-mmap2-bin/valid/valid", + ] + + tgt_root = "/checkpoint/wnhsu/data/data2vec2/data/text/bookwiki_aml-full-mmap2-txt" + tgt_splits = [ + "train0", + "train1", + "train2", + "train3", + "train4", + "valid", + ] + _main(src_root, src_dict_path, src_bpe_path, src_splits, tgt_root, tgt_splits) + +def main_ft(): + src_root = "/fsx-wav2vec/wnhsu/data/data2vec2/data/text/GLUE" + src_dict_path = f"{src_root}/dict.txt" + src_bpe_path = f"{src_root}/encoder.json" + src_splits = [ + "CoLA-bin/input0/train", + "CoLA-bin/input0/valid", + "CoLA-bin/input0/test", + + "MNLI-bin/input0/train", + "MNLI-bin/input0/valid", + "MNLI-bin/input0/test", + "MNLI-bin/input0/test1", + "MNLI-bin/input1/train", + "MNLI-bin/input1/valid", + "MNLI-bin/input1/test", + "MNLI-bin/input1/test1", + + "MRPC-bin/input0/train", + "MRPC-bin/input0/valid", + "MRPC-bin/input0/test", + "MRPC-bin/input1/train", + "MRPC-bin/input1/valid", + "MRPC-bin/input1/test", + + "QNLI-bin/input0/train", + "QNLI-bin/input0/valid", + "QNLI-bin/input0/test", + "QNLI-bin/input1/train", + "QNLI-bin/input1/valid", + "QNLI-bin/input1/test", + + "QQP-bin/input0/train", + "QQP-bin/input0/valid", + "QQP-bin/input0/test", + "QQP-bin/input1/train", + "QQP-bin/input1/valid", + "QQP-bin/input1/test", + + "RTE-bin/input0/train", + "RTE-bin/input0/valid", + "RTE-bin/input0/test", + "RTE-bin/input1/train", + "RTE-bin/input1/valid", + "RTE-bin/input1/test", + + "SST-2-bin/input0/train", + "SST-2-bin/input0/valid", + "SST-2-bin/input0/test", + + "STS-B-bin/input0/train", + "STS-B-bin/input0/valid", + "STS-B-bin/input0/test", + "STS-B-bin/input1/train", + "STS-B-bin/input1/valid", + "STS-B-bin/input1/test", + ] + + tgt_root = "/fsx-wav2vec/wnhsu/data/data2vec2/data/text/GLUE_chr" + tgt_splits = [ + "CoLA-bin/input0/train", + "CoLA-bin/input0/valid", + "CoLA-bin/input0/test", + + "MNLI-bin/input0/train", + "MNLI-bin/input0/valid", + "MNLI-bin/input0/test", + "MNLI-bin/input0/test1", + "MNLI-bin/input1/train", + "MNLI-bin/input1/valid", + "MNLI-bin/input1/test", + "MNLI-bin/input1/test1", + + "MRPC-bin/input0/train", + "MRPC-bin/input0/valid", + "MRPC-bin/input0/test", + "MRPC-bin/input1/train", + "MRPC-bin/input1/valid", + "MRPC-bin/input1/test", + + "QNLI-bin/input0/train", + "QNLI-bin/input0/valid", + 
"QNLI-bin/input0/test", + "QNLI-bin/input1/train", + "QNLI-bin/input1/valid", + "QNLI-bin/input1/test", + + "QQP-bin/input0/train", + "QQP-bin/input0/valid", + "QQP-bin/input0/test", + "QQP-bin/input1/train", + "QQP-bin/input1/valid", + "QQP-bin/input1/test", + + "RTE-bin/input0/train", + "RTE-bin/input0/valid", + "RTE-bin/input0/test", + "RTE-bin/input1/train", + "RTE-bin/input1/valid", + "RTE-bin/input1/test", + + "SST-2-bin/input0/train", + "SST-2-bin/input0/valid", + "SST-2-bin/input0/test", + + "STS-B-bin/input0/train", + "STS-B-bin/input0/valid", + "STS-B-bin/input0/test", + "STS-B-bin/input1/train", + "STS-B-bin/input1/valid", + "STS-B-bin/input1/test", + ] + _main(src_root, src_dict_path, src_bpe_path, src_splits, tgt_root, tgt_splits) + + +if __name__ == "__main__": + main_pt() + main_ft() diff --git a/examples/data2vec/scripts/text/valids.py b/examples/data2vec/scripts/text/valids.py new file mode 100644 index 0000000000..b2e5cfb25d --- /dev/null +++ b/examples/data2vec/scripts/text/valids.py @@ -0,0 +1,301 @@ +import os, argparse, re, json, copy, math +from collections import OrderedDict +import numpy as np + +parser = argparse.ArgumentParser(description='Process some integers.') +parser.add_argument('base', help='base log path') +parser.add_argument('--file_name', default='train.log', help='the log file name') +parser.add_argument('--target', default='valid_loss', help='target metric') +parser.add_argument('--last', type=int, default=999999999, help='print last n matches') +parser.add_argument('--last_files', type=int, default=None, help='print last x files') +parser.add_argument('--everything', action='store_true', help='print everything instead of only last match') +parser.add_argument('--path_contains', help='only consider matching file pattern') +parser.add_argument('--group_on', help='if set, groups by this metric and shows table of differences') +parser.add_argument('--epoch', help='epoch for comparison', type=int) +parser.add_argument('--skip_empty', action='store_true', help='skip empty results') +parser.add_argument('--skip_containing', help='skips entries containing this attribute') +parser.add_argument('--unique_epochs', action='store_true', help='only consider the last line fore each epoch') +parser.add_argument('--best', action='store_true', help='print the last best result') +parser.add_argument('--avg_params', help='average these params through entire log') +parser.add_argument('--extract_prev', help='extracts this metric from previous line') + +parser.add_argument('--remove_metric', help='extracts this metric from previous line') + +parser.add_argument('--compact', action='store_true', help='if true, just prints checkpoint best val') +parser.add_argument('--hydra', action='store_true', help='if true, uses hydra param conventions') + +parser.add_argument('--best_biggest', action='store_true', help='if true, best is the biggest number, not smallest') +parser.add_argument('--key_len', type=int, default=10, help='max length of key') + +parser.add_argument('--best_only', action='store_true', help='if set, only prints the best value') +parser.add_argument('--flat', action='store_true', help='just print the best results') + + +def main(args, print_output): + ret = {} + + entries = [] + + def extract_metric(s, metric): + try: + j = json.loads(s) + except: + return None + if args.epoch is not None and ('epoch' not in j or j['epoch'] != args.epoch): + return None + return j[metric] if metric in j else None + + + def extract_params(s): + s = s.replace(args.base, '', 1) + 
if args.path_contains is not None: + s = s.replace(args.path_contains, '', 1) + + if args.hydra: + num_matches = re.findall(r'(?:/|__)([^/:]+):(\d+\.?\d*)', s) + # str_matches = re.findall(r'(?:/|__)([^/:]+):([^\.]*[^\d\.]+)(?:/|__)', s) + str_matches = re.findall(r'(?:/|__)?((?:(?!(?:\:|__)).)+):([^\.]*[^\d\.]+\d*)(?:/|__)', s) + lr_matches = re.findall(r'optimization.(lr):\[([\d\.,]+)\]', s) + task_matches = re.findall(r'.*/(\d+)$', s) + else: + num_matches = re.findall(r'\.?([^\.]+?)(\d+(e\-\d+)?(?:\.\d+)?)(\.|$)', s) + str_matches = re.findall(r'[/\.]([^\.]*[^\d\.]+\d*)(?=\.)', s) + lr_matches = [] + task_matches = [] + + cp_matches = re.findall(r'checkpoint(?:_\d+)?_(\d+).pt', s) + + items = OrderedDict() + for m in str_matches: + if isinstance(m, tuple): + if 'checkpoint' not in m[0]: + items[m[0]] = m[1] + else: + items[m] = '' + + for m in num_matches: + items[m[0]] = m[1] + + for m in lr_matches: + items[m[0]] = m[1] + + for m in task_matches: + items["hydra_task"] = m + + for m in cp_matches: + items['checkpoint'] = m + + return items + + abs_best = None + + sources = [] + for root, _, files in os.walk(args.base): + if args.path_contains is not None and not args.path_contains in root: + continue + for f in files: + if f.endswith(args.file_name): + sources.append((root, f)) + + if args.last_files is not None: + sources = sources[-args.last_files:] + + for root, file in sources: + with open(os.path.join(root, file), 'r') as fin: + found = [] + avg = {} + prev = None + for line in fin: + line = line.rstrip() + if line.find(args.target) != -1 and ( + args.skip_containing is None or line.find(args.skip_containing) == -1): + try: + idx = line.index("{") + line = line[idx:] + line_json = json.loads(line) + except: + continue + if prev is not None: + try: + prev.update(line_json) + line_json = prev + except: + pass + if args.target in line_json: + found.append(line_json) + if args.avg_params: + avg_params = args.avg_params.split(',') + for p in avg_params: + m = extract_metric(line, p) + if m is not None: + prev_v, prev_c = avg.get(p, (0, 0)) + avg[p] = prev_v + float(m), prev_c + 1 + if args.extract_prev: + try: + prev = json.loads(line) + except: + pass + best = None + if args.best: + curr_best = None + for i in range(len(found)): + cand_best = found[i][args.target] if args.target in found[i] else None + + def cmp(a, b): + a = float(a) + b = float(b) + if args.best_biggest: + return a > b + return a < b + + if cand_best is not None and not math.isnan(float(cand_best)) and ( + curr_best is None or cmp(cand_best, curr_best)): + curr_best = cand_best + if abs_best is None or cmp(curr_best, abs_best): + abs_best = curr_best + best = found[i] + if args.unique_epochs or args.epoch: + last_found = [] + last_epoch = None + for i in reversed(range(len(found))): + epoch = found[i]['epoch'] + if args.epoch and args.epoch != epoch: + continue + if epoch != last_epoch: + last_epoch = epoch + last_found.append(found[i]) + found = list(reversed(last_found)) + + if len(found) == 0: + if print_output and (args.last_files is not None or not args.skip_empty): + # print(root.split('/')[-1]) + print(root[len(args.base):]) + print('Nothing') + else: + if not print_output: + ret[root[len(args.base):]] = best + continue + + if args.compact: + # print('{}\t{}'.format(root.split('/')[-1], curr_best)) + print('{}\t{}'.format(root[len(args.base)+1:], curr_best)) + continue + + if args.group_on is None and not args.best_only: + # print(root.split('/')[-1]) + print(root[len(args.base):]) + if not args.everything: + 
if best is not None and args.group_on is None and not args.best_only and not args.flat: + print(best, '(best)') + if args.group_on is None and args.last and not args.best_only and not args.flat: + for f in found[-args.last:]: + if args.extract_prev is not None: + try: + print('{}\t{}'.format(f[args.extract_prev], f[args.target])) + except Exception as e: + print('Exception!', e) + else: + print(f) + try: + metric = found[-1][args.target] if not args.best or best is None else best[args.target] + except: + print(found[-1]) + raise + if metric is not None: + entries.append((extract_params(root), metric)) + else: + for f in found: + print(f) + if not args.group_on and print_output: + print() + + if len(avg) > 0: + for k, (v, c) in avg.items(): + print(f'{k}: {v/c}') + + if args.best_only: + print(abs_best) + + if args.flat: + print("\t".join(m for _, m in entries)) + + if args.group_on is not None: + by_val = OrderedDict() + for e, m in entries: + k = args.group_on + if k not in e: + m_keys = [x for x in e.keys() if x.startswith(k)] + if len(m_keys) == 0: + val = "False" + else: + assert len(m_keys) == 1 + k = m_keys[0] + val = m_keys[0] + else: + val = e[args.group_on] + if val == "": + val = "True" + scrubbed_entry = copy.deepcopy(e) + if k in scrubbed_entry: + del scrubbed_entry[k] + if args.remove_metric and args.remove_metric in scrubbed_entry: + val += '_' + scrubbed_entry[args.remove_metric] + del scrubbed_entry[args.remove_metric] + by_val.setdefault(tuple(scrubbed_entry.items()), dict())[val] = m + distinct_vals = set() + for v in by_val.values(): + distinct_vals.update(v.keys()) + try: + distinct_vals = {int(d) for d in distinct_vals} + except: + print(distinct_vals) + print() + print("by_val", len(by_val)) + for k,v in by_val.items(): + print(k, '=>', v) + print() + + # , by_val, entries) + raise + from natsort import natsorted + svals = list(map(str, natsorted(distinct_vals))) + print('{}\t{}'.format(args.group_on, '\t'.join(svals))) + sums = OrderedDict({n:[] for n in svals}) + for k, v in by_val.items(): + kstr = '.'.join(':'.join(x) for x in k) + vstr = '' + for mv in svals: + x = v[mv] if mv in v else '' + vstr += '\t{}'.format(round(x, 5) if isinstance(x, float) else x) + try: + sums[mv].append(float(x)) + except: + pass + print('{}{}'.format(kstr[:args.key_len], vstr)) + if any(len(x) > 0 for x in sums.values()): + print('min:', end='') + for v in sums.values(): + min = np.min(v) + print(f'\t{round(min, 5)}', end='') + print() + print('max:', end='') + for v in sums.values(): + max = np.max(v) + print(f'\t{round(max, 5)}', end='') + print() + print('avg:', end='') + for v in sums.values(): + mean = np.mean(v) + print(f'\t{round(mean, 5)}', end='') + print() + print('median:', end='') + for v in sums.values(): + median = np.median(v) + print(f'\t{round(median, 5)}', end='') + print() + + return ret + +if __name__ == "__main__": + args = parser.parse_args() + main(args, print_output=True) \ No newline at end of file diff --git a/examples/data2vec/tasks/__init__.py b/examples/data2vec/tasks/__init__.py new file mode 100644 index 0000000000..a7422e4b30 --- /dev/null +++ b/examples/data2vec/tasks/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from .image_pretraining import ImagePretrainingTask, ImagePretrainingConfig +from .image_classification import ImageClassificationTask, ImageClassificationConfig +from .mae_image_pretraining import MaeImagePretrainingTask, MaeImagePretrainingConfig + + +__all__ = [ + "ImageClassificationTask", + "ImageClassificationConfig", + "ImagePretrainingTask", + "ImagePretrainingConfig", + "MaeImagePretrainingTask", + "MaeImagePretrainingConfig", +] \ No newline at end of file diff --git a/examples/data2vec/tasks/audio_classification.py b/examples/data2vec/tasks/audio_classification.py new file mode 100644 index 0000000000..2925a04cf9 --- /dev/null +++ b/examples/data2vec/tasks/audio_classification.py @@ -0,0 +1,167 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import logging +import os +import numpy as np +import math +import torch + +from sklearn import metrics as sklearn_metrics +from dataclasses import dataclass + +from fairseq.tasks.audio_pretraining import AudioPretrainingTask, AudioPretrainingConfig +from fairseq.tasks import register_task +from fairseq.logging import metrics + +from ..data.add_class_target_dataset import AddClassTargetDataset + + +logger = logging.getLogger(__name__) + + +@dataclass +class AudioClassificationConfig(AudioPretrainingConfig): + label_descriptors: str = "label_descriptors.csv" + labels: str = "lbl" + + +@register_task("audio_classification", dataclass=AudioClassificationConfig) +class AudioClassificationTask(AudioPretrainingTask): + """ """ + + cfg: AudioClassificationConfig + + def __init__( + self, + cfg: AudioClassificationConfig, + ): + super().__init__(cfg) + + self.state.add_factory("labels", self.load_labels) + + def load_labels(self): + labels = {} + path = os.path.join(self.cfg.data, self.cfg.label_descriptors) + with open(path, "r") as ldf: + for line in ldf: + if line.strip() == "": + continue + items = line.split(",") + idx = items[0] + lbl = items[1] + assert lbl not in labels, lbl + labels[lbl] = idx + return labels + + @property + def labels(self): + return self.state.labels + + def load_dataset( + self, split: str, task_cfg: AudioClassificationConfig = None, **kwargs + ): + super().load_dataset(split, task_cfg, **kwargs) + + task_cfg = task_cfg or self.cfg + + data_path = self.cfg.data + label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}") + skipped_indices = getattr(self.datasets[split], "skipped_indices", set()) + labels = [] + with open(label_path, "r") as f: + for i, line in enumerate(f): + if i not in skipped_indices: + lbl_items = line.rstrip().split("\t") + labels.append([int(x) for x in lbl_items[2].split(",")]) + + assert len(labels) == len(self.datasets[split]), ( + f"labels length ({len(labels)}) and dataset length " + f"({len(self.datasets[split])}) do not match" + ) + + self.datasets[split] = AddClassTargetDataset( + self.datasets[split], + labels, + multi_class=True, + add_to_input=True, + num_classes=len(self.labels), + ) + + def calculate_stats(self, output, target): + + classes_num = target.shape[-1] + stats = [] + + # Accuracy, only used for single-label classification such as esc-50, not for multiple label one such as AudioSet + # acc = sklearn_metrics.accuracy_score(np.argmax(target, 1), np.argmax(output, 1)) + + # Class-wise statistics + for k in 
range(classes_num): + # Average precision + avg_precision = sklearn_metrics.average_precision_score( + target[:, k], output[:, k], average=None + ) + + dict = { + "AP": avg_precision, + } + + # # AUC + # try: + # auc = sklearn_metrics.roc_auc_score(target[:, k], output[:, k], average=None) + # except: + # auc = 0 + # + # # Precisions, recalls + # (precisions, recalls, thresholds) = sklearn_metrics.precision_recall_curve( + # target[:, k], output[:, k] + # ) + # + # # FPR, TPR + # (fpr, tpr, thresholds) = sklearn_metrics.roc_curve(target[:, k], output[:, k]) + # + # save_every_steps = 1000 # Sample statistics to reduce size + # dict = { + # "precisions": precisions[0::save_every_steps], + # "recalls": recalls[0::save_every_steps], + # "AP": avg_precision, + # "fpr": fpr[0::save_every_steps], + # "fnr": 1.0 - tpr[0::save_every_steps], + # "auc": auc, + # # note acc is not class-wise, this is just to keep consistent with other metrics + # "acc": acc, + # } + stats.append(dict) + + return stats + + def valid_step(self, sample, model, criterion): + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + return loss, sample_size, logging_output + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + if "_predictions" in logging_outputs[0]: + metrics.log_concat_tensor( + "_predictions", + torch.cat([l["_predictions"].cpu() for l in logging_outputs], dim=0), + ) + metrics.log_concat_tensor( + "_targets", + torch.cat([l["_targets"].cpu() for l in logging_outputs], dim=0), + ) + + def compute_stats(meters): + if meters["_predictions"].tensor.shape[0] < 100: + return 0 + stats = self.calculate_stats( + meters["_predictions"].tensor, meters["_targets"].tensor + ) + return np.nanmean([stat["AP"] for stat in stats]) + + metrics.log_derived("mAP", compute_stats) diff --git a/examples/data2vec/tasks/image_classification.py b/examples/data2vec/tasks/image_classification.py new file mode 100644 index 0000000000..1ea4c2afee --- /dev/null +++ b/examples/data2vec/tasks/image_classification.py @@ -0,0 +1,129 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import os.path as osp +import logging + +from dataclasses import dataclass +import torch +from torchvision import transforms + +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import register_task +from fairseq.logging import metrics + +try: + from ..data import ImageDataset +except: + import sys + + sys.path.append("..") + from data import ImageDataset + +from .image_pretraining import ( + ImagePretrainingConfig, + ImagePretrainingTask, + IMG_EXTENSIONS, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class ImageClassificationConfig(ImagePretrainingConfig): + pass + + +@register_task("image_classification", dataclass=ImageClassificationConfig) +class ImageClassificationTask(ImagePretrainingTask): + + cfg: ImageClassificationConfig + + @classmethod + def setup_task(cls, cfg: ImageClassificationConfig, **kwargs): + return cls(cfg) + + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + data_path = self.cfg.data + cfg = task_cfg or self.cfg + + path_with_split = osp.join(data_path, split) + if osp.exists(path_with_split): + data_path = path_with_split + + from timm.data import create_transform + + if split == "train": + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=cfg.input_size, + is_training=True, + auto_augment="rand-m9-mstd0.5-inc1", + interpolation="bicubic", + re_prob=0.25, + re_mode="pixel", + re_count=1, + mean=cfg.normalization_mean, + std=cfg.normalization_std, + ) + if not cfg.input_size > 32: + transform.transforms[0] = transforms.RandomCrop( + cfg.input_size, padding=4 + ) + else: + t = [] + if cfg.input_size > 32: + crop_pct = 1 + if cfg.input_size < 384: + crop_pct = 224 / 256 + size = int(cfg.input_size / crop_pct) + t.append( + transforms.Resize( + size, interpolation=3 + ), # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(cfg.input_size)) + + t.append(transforms.ToTensor()) + t.append( + transforms.Normalize(cfg.normalization_mean, cfg.normalization_std) + ) + transform = transforms.Compose(t) + logger.info(transform) + + self.datasets[split] = ImageDataset( + root=data_path, + extensions=IMG_EXTENSIONS, + load_classes=True, + transform=transform, + ) + for k in self.datasets.keys(): + if k != split: + assert self.datasets[k].classes == self.datasets[split].classes + + def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False): + model = super().build_model(model_cfg, from_checkpoint) + + actualized_cfg = getattr(model, "cfg", None) + if actualized_cfg is not None: + if hasattr(actualized_cfg, "pretrained_model_args"): + model_cfg.pretrained_model_args = actualized_cfg.pretrained_model_args + + return model + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + if "correct" in logging_outputs[0]: + zero = torch.scalar_tensor(0.0) + correct = sum(log.get("correct", zero) for log in logging_outputs) + metrics.log_scalar_sum("_correct", correct) + + metrics.log_derived( + "accuracy", + lambda meters: 100 * meters["_correct"].sum / meters["sample_size"].sum, + ) diff --git a/examples/data2vec/tasks/image_pretraining.py b/examples/data2vec/tasks/image_pretraining.py new file mode 100644 index 0000000000..cd688fd136 --- /dev/null +++ b/examples/data2vec/tasks/image_pretraining.py @@ -0,0 +1,110 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. 
+# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import logging +import sys +import os.path as osp + +from dataclasses import dataclass, field +from typing import List +from omegaconf import MISSING + +import torch +from torchvision import transforms + +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task + +try: + from ..data import ImageDataset +except: + sys.path.append("..") + from data import ImageDataset + +logger = logging.getLogger(__name__) + +IMG_EXTENSIONS = { + ".jpg", + ".jpeg", + ".png", + ".ppm", + ".bmp", + ".pgm", + ".tif", + ".tiff", + ".webp", +} + + +@dataclass +class ImagePretrainingConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + input_size: int = 224 + normalization_mean: List[float] = (0.485, 0.456, 0.406) + normalization_std: List[float] = (0.229, 0.224, 0.225) + + +@register_task("image_pretraining", dataclass=ImagePretrainingConfig) +class ImagePretrainingTask(FairseqTask): + """ """ + + cfg: ImagePretrainingConfig + + @classmethod + def setup_task(cls, cfg: ImagePretrainingConfig, **kwargs): + """Setup the task (e.g., load dictionaries). + + Args: + cfg (AudioPretrainingConfig): configuration of this task + """ + + return cls(cfg) + + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + data_path = self.cfg.data + cfg = task_cfg or self.cfg + + path_with_split = osp.join(data_path, split) + if osp.exists(path_with_split): + data_path = path_with_split + + transform = transforms.Compose( + [ + transforms.ColorJitter(0.4, 0.4, 0.4), + transforms.RandomHorizontalFlip(p=0.5), + transforms.RandomResizedCrop( + size=cfg.input_size, + interpolation=transforms.InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(cfg.normalization_mean), + std=torch.tensor(cfg.normalization_std), + ), + ] + ) + + logger.info(transform) + + self.datasets[split] = ImageDataset( + root=data_path, + extensions=IMG_EXTENSIONS, + load_classes=False, + transform=transform, + ) + + @property + def source_dictionary(self): + return None + + @property + def target_dictionary(self): + return None + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return sys.maxsize, sys.maxsize diff --git a/examples/data2vec/tasks/mae_image_classification.py b/examples/data2vec/tasks/mae_image_classification.py new file mode 100644 index 0000000000..1bf935879f --- /dev/null +++ b/examples/data2vec/tasks/mae_image_classification.py @@ -0,0 +1,100 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import logging +import sys +import torch + +from typing import Optional +from dataclasses import dataclass, field +from omegaconf import MISSING + +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task +from fairseq.logging import metrics + +try: + from ..data import MaeFinetuningImageDataset +except: + sys.path.append("..") + from data import MaeFinetuningImageDataset + +logger = logging.getLogger(__name__) + + +@dataclass +class MaeImageClassificationConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + input_size: int = 224 + local_cache_path: Optional[str] = None + + rebuild_batches: bool = True + + +@register_task("mae_image_classification", dataclass=MaeImageClassificationConfig) +class MaeImageClassificationTask(FairseqTask): + """ """ + + cfg: MaeImageClassificationConfig + + @classmethod + def setup_task(cls, cfg: MaeImageClassificationConfig, **kwargs): + """Setup the task (e.g., load dictionaries). + + Args: + cfg (AudioPretrainingConfig): configuration of this task + """ + + return cls(cfg) + + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + data_path = self.cfg.data + cfg = task_cfg or self.cfg + + self.datasets[split] = MaeFinetuningImageDataset( + root=data_path, + split=split, + is_train=split == "train", + input_size=cfg.input_size, + local_cache_path=cfg.local_cache_path, + shuffle=split == "train", + ) + + def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False): + model = super().build_model(model_cfg, from_checkpoint) + + actualized_cfg = getattr(model, "cfg", None) + if actualized_cfg is not None: + if hasattr(actualized_cfg, "pretrained_model_args"): + model_cfg.pretrained_model_args = actualized_cfg.pretrained_model_args + + return model + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + if "correct" in logging_outputs[0]: + zero = torch.scalar_tensor(0.0) + correct = sum(log.get("correct", zero) for log in logging_outputs) + metrics.log_scalar_sum("_correct", correct) + + metrics.log_derived( + "accuracy", + lambda meters: 100 * meters["_correct"].sum / meters["sample_size"].sum, + ) + + @property + def source_dictionary(self): + return None + + @property + def target_dictionary(self): + return None + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return sys.maxsize, sys.maxsize diff --git a/examples/data2vec/tasks/mae_image_pretraining.py b/examples/data2vec/tasks/mae_image_pretraining.py new file mode 100644 index 0000000000..35a14891ca --- /dev/null +++ b/examples/data2vec/tasks/mae_image_pretraining.py @@ -0,0 +1,119 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import logging +import sys + +from typing import Optional, List +from dataclasses import dataclass, field +from omegaconf import MISSING, II + +from fairseq.data import SubsampleDataset +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task + +try: + from ..data import MaeImageDataset +except: + sys.path.append("..") + from data import MaeImageDataset + +logger = logging.getLogger(__name__) + + +@dataclass +class ImageMaskingConfig: + patch_size: int = II("model.modalities.image.patch_size") + mask_prob: float = II("model.modalities.image.mask_prob") + mask_prob_adjust: float = II("model.modalities.image.mask_prob_adjust") + mask_length: int = II("model.modalities.image.mask_length") + inverse_mask: bool = II("model.modalities.image.inverse_mask") + mask_dropout: float = II("model.modalities.image.mask_dropout") + clone_batch: int = II("model.clone_batch") + expand_adjacent: bool = False + non_overlapping: bool = False + + +@dataclass +class MaeImagePretrainingConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + multi_data: Optional[List[str]] = None + input_size: int = 224 + local_cache_path: Optional[str] = None + key: str = "imgs" + + beit_transforms: bool = False + target_transform: bool = False + no_transform: bool = False + + rebuild_batches: bool = True + + precompute_mask_config: Optional[ImageMaskingConfig] = None + + subsample: float = 1 + seed: int = II("common.seed") + dataset_type: str = "imagefolder" + + +@register_task("mae_image_pretraining", dataclass=MaeImagePretrainingConfig) +class MaeImagePretrainingTask(FairseqTask): + """ """ + + cfg: MaeImagePretrainingConfig + + @classmethod + def setup_task(cls, cfg: MaeImagePretrainingConfig, **kwargs): + """Setup the task (e.g., load dictionaries). + + Args: + cfg (AudioPretrainingConfig): configuration of this task + """ + + return cls(cfg) + + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + data_path = self.cfg.data + cfg = task_cfg or self.cfg + + compute_mask = cfg.precompute_mask_config is not None + mask_args = {} + if compute_mask: + mask_args = cfg.precompute_mask_config + + self.datasets[split] = MaeImageDataset( + root=data_path if cfg.multi_data is None else cfg.multi_data, + split=split, + input_size=cfg.input_size, + local_cache_path=cfg.local_cache_path, + key=cfg.key, + beit_transforms=cfg.beit_transforms, + target_transform=cfg.target_transform, + no_transform=cfg.no_transform, + compute_mask=compute_mask, + dataset_type=cfg.dataset_type, + **mask_args, + ) + + if cfg.subsample < 1: + self.datasets[split] = SubsampleDataset( + self.datasets[split], + cfg.subsample, + shuffle=True, + seed=cfg.seed, + ) + + @property + def source_dictionary(self): + return None + + @property + def target_dictionary(self): + return None + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return sys.maxsize, sys.maxsize diff --git a/examples/data2vec/tasks/multimodal.py b/examples/data2vec/tasks/multimodal.py new file mode 100644 index 0000000000..74648e918f --- /dev/null +++ b/examples/data2vec/tasks/multimodal.py @@ -0,0 +1,165 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import sys + +from dataclasses import dataclass +from typing import Optional, List +from omegaconf import II + +from fairseq.data.iterators import GroupedEpochBatchIterator + +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task +from fairseq.tasks.audio_pretraining import AudioPretrainingConfig, AudioPretrainingTask +from fairseq.tasks.masked_lm import MaskedLMConfig, MaskedLMTask +from .mae_image_pretraining import MaeImagePretrainingConfig, MaeImagePretrainingTask +from examples.data2vec.data.modality import Modality + +from fairseq.data.audio.multi_modality_dataset import ( + MultiModalityDataset, + ModalityDatasetItem, +) + + +@dataclass +class MultimodalPretrainingConfig(FairseqDataclass): + audio: Optional[AudioPretrainingConfig] = None + image: Optional[MaeImagePretrainingConfig] = None + text: Optional[MaskedLMConfig] = None + + audio_ratio: float = 1 + image_ratio: float = 1 + text_ratio: float = 1 + + max_tokens: Optional[int] = II("dataset.max_tokens") + batch_size: Optional[int] = II("dataset.batch_size") + update_freq: List[int] = II("optimization.update_freq") + + rebuild_batches: bool = True + + +@register_task("multimodal_pretraining", dataclass=MultimodalPretrainingConfig) +class MultimodalPretrainingTask(FairseqTask): + """ """ + + cfg: MultimodalPretrainingConfig + + def __init__(self, cfg: MultimodalPretrainingConfig): + super().__init__(cfg) + self.audio_task = ( + AudioPretrainingTask(cfg.audio) if cfg.audio is not None else None + ) + self.image_task = ( + MaeImagePretrainingTask(cfg.image) if cfg.image is not None else None + ) + self.text_task = MaskedLMTask(cfg.text) if cfg.text is not None else None + + self.mult_ratios = [] + + @classmethod + def setup_task(cls, cfg: MultimodalPretrainingConfig, **kwargs): + """Setup the task (e.g., load dictionaries). 
+ + Args: + cfg (AudioPretrainingConfig): configuration of this task + """ + + return cls(cfg) + + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + datasets = [] + self.mult_ratios = [] + + def load_ds(task, name, ratio): + if task is not None: + task.load_dataset(split) + ds = ModalityDatasetItem( + datasetname=name, + dataset=task.dataset(split), + max_positions=task.max_positions(), + max_tokens=self.cfg.max_tokens, + max_sentences=self.cfg.batch_size, + ) + datasets.append(ds) + self.mult_ratios.append(ratio) + + load_ds(self.audio_task, Modality.AUDIO, self.cfg.audio_ratio) + load_ds(self.image_task, Modality.IMAGE, self.cfg.image_ratio) + load_ds(self.text_task, Modality.TEXT, self.cfg.text_ratio) + + assert len(datasets) > 0 + + self.datasets[split] = MultiModalityDataset(datasets) + + @property + def supported_modalities(self): + modalities = [] + if self.cfg.text is not None: + modalities.append(Modality.TEXT) + if self.cfg.audio is not None: + modalities.append(Modality.AUDIO) + if self.cfg.image is not None: + modalities.append(Modality.IMAGE) + + return modalities + + def get_batch_iterator( + self, + dataset, + max_tokens=None, + max_sentences=None, + max_positions=None, + ignore_invalid_inputs=False, + required_batch_size_multiple=1, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=0, + data_buffer_size=0, + disable_iterator_cache=False, + skip_remainder_batch=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, + ): + + # initialize the dataset with the correct starting epoch + dataset.set_epoch(epoch) + + batch_samplers = dataset.get_batch_samplers( + self.mult_ratios, required_batch_size_multiple, seed + ) + + # return a reusable, sharded iterator + epoch_iter = GroupedEpochBatchIterator( + dataset=dataset, + collate_fn=dataset.collater, + batch_samplers=batch_samplers, + seed=seed, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + epoch=epoch, + mult_rate=max(self.cfg.update_freq), + buffer_size=data_buffer_size, + skip_remainder_batch=skip_remainder_batch, + ) + self.dataset_to_epoch_iter[dataset] = {} # refresh it every epoch + return epoch_iter + + @property + def source_dictionary(self): + return None + + @property + def target_dictionary(self): + return None + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return sys.maxsize, sys.maxsize diff --git a/examples/discriminative_reranking_nmt/README.md b/examples/discriminative_reranking_nmt/README.md new file mode 100644 index 0000000000..b155e855f2 --- /dev/null +++ b/examples/discriminative_reranking_nmt/README.md @@ -0,0 +1,202 @@ +# Discriminative Reranking for Neural Machine Translation +https://aclanthology.org/2021.acl-long.563/ + +This folder contains source code for training DrNMT, a discriminatively trained reranker for neural machine translation. + +## Data preparation +1. Follow the instructions under `examples/translation` to build a base MT model. Prepare three files, one with source sentences, one with ground truth target sentences, and one with hypotheses generated from the base MT model. Each line in the file contains one sentence in raw text (i.e. no sentencepiece, etc.). Below is an example of the files with _N_ hypotheses for each source sentence. + +``` +# Example of the source sentence file: (The file should contain L lines.) + +source_sentence_1 +source_sentence_2 +source_sentence_3 +... 
+source_sentence_L
+
+# Example of the target sentence file: (The file should contain L lines.)
+
+target_sentence_1
+target_sentence_2
+target_sentence_3
+...
+target_sentence_L
+
+# Example of the hypotheses file: (The file should contain L*N lines.)
+
+source_sentence_1_hypo_1
+source_sentence_1_hypo_2
+...
+source_sentence_1_hypo_N
+source_sentence_2_hypo_1
+...
+source_sentence_2_hypo_N
+...
+source_sentence_L_hypo_1
+...
+source_sentence_L_hypo_N
+```
+
+2. Download the [XLMR model](https://github.com/fairinternal/fairseq-py/tree/main/examples/xlmr#pre-trained-models).
+```
+wget https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz
+tar zxvf xlmr.base.tar.gz
+
+# The folder should contain dict.txt, model.pt and sentencepiece.bpe.model.
+```
+
+3. Prepare scores and BPE data.
+* `N`: Number of hypotheses per source sentence. We use 50 in the paper.
+* `SPLIT`: Name of the data split, i.e. train, valid, test. Use split_name, split_name1, split_name2, ..., if there are multiple datasets for a split, e.g. train, train1, valid, valid1.
+* `NUM_SHARDS`: Number of shards. Set this to 1 for non-train splits.
+* `METRIC`: The metric for DrNMT to optimize. We support either `bleu` or `ter`.
+```
+# For each data split, e.g. train, valid, test, etc., run the following:
+
+SOURCE_FILE=/path/to/source_sentence_file
+TARGET_FILE=/path/to/target_sentence_file
+HYPO_FILE=/path/to/hypo_file
+XLMR_DIR=/path/to/xlmr
+OUTPUT_DIR=/path/to/output
+
+python scripts/prep_data.py \
+    --input-source ${SOURCE_FILE} \
+    --input-target ${TARGET_FILE} \
+    --input-hypo ${HYPO_FILE} \
+    --output-dir ${OUTPUT_DIR} \
+    --split $SPLIT \
+    --beam $N \
+    --sentencepiece-model ${XLMR_DIR}/sentencepiece.bpe.model \
+    --metric $METRIC \
+    --num-shards ${NUM_SHARDS}
+
+# The script will create ${OUTPUT_DIR}/$METRIC with ${NUM_SHARDS} splits.
+# Under split*/input_src, split*/input_tgt and split*/$METRIC, there will be $SPLIT.bpe and $SPLIT.$METRIC files, respectively.
+
+```
+
+4. Pre-process the data into fairseq format.
+```
+# use comma to separate if there is more than one train or valid set
+for suffix in src tgt ; do
+  fairseq-preprocess --only-source \
+    --trainpref ${OUTPUT_DIR}/$METRIC/split1/input_${suffix}/train.bpe \
+    --validpref ${OUTPUT_DIR}/$METRIC/split1/input_${suffix}/valid.bpe \
+    --destdir ${OUTPUT_DIR}/$METRIC/split1/input_${suffix} \
+    --workers 60 \
+    --srcdict ${XLMR_DIR}/dict.txt
+done
+
+for i in `seq 2 ${NUM_SHARDS}`; do
+  for suffix in src tgt ; do
+    fairseq-preprocess --only-source \
+      --trainpref ${OUTPUT_DIR}/$METRIC/split${i}/input_${suffix}/train.bpe \
+      --destdir ${OUTPUT_DIR}/$METRIC/split${i}/input_${suffix} \
+      --workers 60 \
+      --srcdict ${XLMR_DIR}/dict.txt
+
+    ln -s ${OUTPUT_DIR}/$METRIC/split1/input_${suffix}/valid* ${OUTPUT_DIR}/$METRIC/split${i}/input_${suffix}/.
+  done
+
+  ln -s ${OUTPUT_DIR}/$METRIC/split1/$METRIC/valid* ${OUTPUT_DIR}/$METRIC/split${i}/$METRIC/.
+done
+```
+
+## Training
+
+```
+EXP_DIR=/path/to/exp
+
+# An example of training the model with the config for the De-En experiment in the paper.
+# The config uses 16 GPUs and 50 hypotheses.
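+# For example, following the x = 16/k rule below, a run on 8 GPUs (k=8) would use:
+#   distributed_training.distributed_world_size=8 optimization.update_freq='[2]'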
+# For training with fewer GPUs, set
+# distributed_training.distributed_world_size=k +optimization.update_freq='[x]' where x = 16/k
+# For training with fewer hypotheses, set
+# task.mt_beam=N dataset.batch_size=N dataset.required_batch_size_multiple=N
+
+fairseq-hydra-train -m \
+    --config-dir config/ --config-name deen \
+    task.data=${OUTPUT_DIR}/$METRIC/split1/ \
+    task.num_data_splits=${NUM_SHARDS} \
+    model.pretrained_model=${XLMR_DIR}/model.pt \
+    common.user_dir=${FAIRSEQ_ROOT}/examples/discriminative_reranking_nmt \
+    checkpoint.save_dir=${EXP_DIR}
+
+```
+
+## Inference & scoring
+Perform DrNMT reranking (fw + reranker score).
+1. Tune weights on valid sets.
+```
+# generate N hypotheses with the base MT model (fw score)
+VALID_SOURCE_FILE=/path/to/source_sentences # one sentence per line, converted to the sentencepiece used by the base MT model
+VALID_TARGET_FILE=/path/to/target_sentences # one sentence per line in raw text, i.e. no sentencepiece or tokenization
+MT_MODEL=/path/to/mt_model
+MT_DATA_PATH=/path/to/mt_data
+
+cat ${VALID_SOURCE_FILE} | \
+    fairseq-interactive ${MT_DATA_PATH} \
+    --max-tokens 4000 --buffer-size 16 \
+    --num-workers 32 --path ${MT_MODEL} \
+    --beam $N --nbest $N \
+    --post-process sentencepiece &> valid-hypo.out
+
+# replace "bleu" with "ter" to optimize for TER
+python drnmt_rerank.py \
+    ${OUTPUT_DIR}/$METRIC/split1/ \
+    --path ${EXP_DIR}/checkpoint_best.pt \
+    --in-text valid-hypo.out \
+    --results-path ${EXP_DIR} \
+    --gen-subset valid \
+    --target-text ${VALID_TARGET_FILE} \
+    --user-dir ${FAIRSEQ_ROOT}/examples/discriminative_reranking_nmt \
+    --bpe sentencepiece \
+    --sentencepiece-model ${XLMR_DIR}/sentencepiece.bpe.model \
+    --beam $N \
+    --batch-size $N \
+    --metric bleu \
+    --tune
+
+```
+
+2. Apply the best weights on test sets.
+```
+# generate N hypotheses with the base MT model (fw score)
+TEST_SOURCE_FILE=/path/to/source_sentences # one sentence per line, converted to the sentencepiece used by the base MT model
+
+cat ${TEST_SOURCE_FILE} | \
+    fairseq-interactive ${MT_DATA_PATH} \
+    --max-tokens 4000 --buffer-size 16 \
+    --num-workers 32 --path ${MT_MODEL} \
+    --beam $N --nbest $N \
+    --post-process sentencepiece &> test-hypo.out
+
+# replace "bleu" with "ter" to evaluate TER
+# Add --target-text for evaluating BLEU/TER,
+# otherwise the script will only generate the hypotheses with the highest scores.
+python drnmt_rerank.py \
+    ${OUTPUT_DIR}/$METRIC/split1/ \
+    --path ${EXP_DIR}/checkpoint_best.pt \
+    --in-text test-hypo.out \
+    --results-path ${EXP_DIR} \
+    --gen-subset test \
+    --user-dir ${FAIRSEQ_ROOT}/examples/discriminative_reranking_nmt \
+    --bpe sentencepiece \
+    --sentencepiece-model ${XLMR_DIR}/sentencepiece.bpe.model \
+    --beam $N \
+    --batch-size $N \
+    --metric bleu \
+    --fw-weight ${BEST_FW_WEIGHT} \
+    --lenpen ${BEST_LENPEN}
+```
+
+## Citation
+```bibtex
+@inproceedings{lee2021discriminative,
+  title={Discriminative Reranking for Neural Machine Translation},
+  author={Lee, Ann and Auli, Michael and Ranzato, Marc'Aurelio},
+  booktitle={ACL},
+  year={2021}
+}
+```
diff --git a/examples/discriminative_reranking_nmt/__init__.py b/examples/discriminative_reranking_nmt/__init__.py new file mode 100644 index 0000000000..0278f6a273 --- /dev/null +++ b/examples/discriminative_reranking_nmt/__init__.py @@ -0,0 +1 @@ +from .
import criterions, models, tasks # noqa diff --git a/examples/discriminative_reranking_nmt/config/deen.yaml b/examples/discriminative_reranking_nmt/config/deen.yaml new file mode 100644 index 0000000000..3fc2d5fcf5 --- /dev/null +++ b/examples/discriminative_reranking_nmt/config/deen.yaml @@ -0,0 +1,56 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 50 + seed: 2 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: bleu + maximize_best_checkpoint_metric: true + +task: + _name: discriminative_reranking_nmt + data: ??? + num_data_splits: ??? + include_src: true + mt_beam: 50 + eval_target_metric: true + target_metric: bleu + +dataset: + batch_size: 50 + num_workers: 6 + required_batch_size_multiple: 50 + valid_subset: ??? + +criterion: + _name: kl_divergence_rereanking + target_dist_norm: minmax + temperature: 0.5 + +optimization: + max_epoch: 200 + lr: [0.00005] + update_freq: [32] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 8000 + total_num_update: 320000 + +model: + _name: discriminative_nmt_reranker + pretrained_model: ??? + classifier_dropout: 0.2 + +distributed_training: + ddp_backend: no_c10d + distributed_world_size: 16 diff --git a/examples/discriminative_reranking_nmt/criterions/__init__.py b/examples/discriminative_reranking_nmt/criterions/__init__.py new file mode 100644 index 0000000000..7c257c2700 --- /dev/null +++ b/examples/discriminative_reranking_nmt/criterions/__init__.py @@ -0,0 +1,6 @@ +from .discriminative_reranking_criterion import KLDivergenceRerankingCriterion + + +__all__ = [ + "KLDivergenceRerankingCriterion", +] diff --git a/examples/discriminative_reranking_nmt/criterions/discriminative_reranking_criterion.py b/examples/discriminative_reranking_nmt/criterions/discriminative_reranking_criterion.py new file mode 100644 index 0000000000..c8f19e3858 --- /dev/null +++ b/examples/discriminative_reranking_nmt/criterions/discriminative_reranking_criterion.py @@ -0,0 +1,139 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
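+#
+# KL-divergence reranking criterion: hypotheses are scored in groups of mt_beam per
+# source sentence, the per-hypothesis metric scores (optionally min-max normalized)
+# are turned into a soft target distribution with a temperature softmax, and the
+# loss is the KL divergence between that distribution and the model's score
+# distribution over the hypotheses.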
+ +import math +from dataclasses import dataclass, field + +import torch +import torch.nn.functional as F + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import ChoiceEnum, FairseqDataclass + + +_EPSILON = torch.finfo(torch.float32).eps +TARGET_DIST_NORM_CHOICES = ChoiceEnum(["none", "minmax"]) + + +@dataclass +class KLDivergenceRerankingCriterionConfig(FairseqDataclass): + target_dist_norm: TARGET_DIST_NORM_CHOICES = field( + default="none", + metadata={"help": "method to normalize the range of target scores"}, + ) + temperature: float = field( + default=1.0, + metadata={"help": "temperature in softmax for target distributions"}, + ) + forward_batch_size: int = field( + default=32, + metadata={ + "help": "number of hypotheses per batch for model forward (set a value smaller than --mt-beam to avoid OOM when training with a large beam size)" + }, + ) + + +@register_criterion( + "kl_divergence_rereanking", dataclass=KLDivergenceRerankingCriterionConfig +) +class KLDivergenceRerankingCriterion(FairseqCriterion): + def __init__( + self, task, target_dist_norm, temperature, forward_batch_size, + ): + super().__init__(task) + self.target_dist_norm = target_dist_norm + self.temperature = temperature + self.forward_batch_size = forward_batch_size + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + + sample_size = sample["id"].numel() + assert sample_size % self.task.cfg.mt_beam == 0, ( + f"sample_size ({sample_size}) cannot be divided by beam size ({self.task.cfg.mt_beam})." + f"Please set --required-batch-size-multiple={self.task.cfg.mt_beam}." 
+ ) + + # split into smaller batches for model forward + batch_out = [] + for i in range(0, sample_size, self.forward_batch_size): + j = min(i + self.forward_batch_size, sample_size) + + out = model( + src_tokens=sample["net_input"]["src_tokens"][i:j, :], + src_lengths=sample["net_input"]["src_lengths"][i:j], + ) + + batch_out.append( + model.sentence_forward(out, sample["net_input"]["src_tokens"][i:j, :]) + ) + + batch_out = torch.cat(batch_out, dim=0).view( + self.task.cfg.mt_beam, sample_size // self.task.cfg.mt_beam, -1 + ) # T x B x C + if model.joint_classification == "sent": + batch_out = model.joint_forward(batch_out) + scores = model.classification_forward(batch_out.view(sample_size, 1, -1)).view( + -1, self.task.cfg.mt_beam + ) # input: B x T x C + + loss = self.compute_kl_loss( + scores, sample["target"][:, 0].view(-1, self.task.cfg.mt_beam) + ) + + sample_size = sample_size // self.task.cfg.mt_beam + + logging_output = { + "loss": loss.detach(), + "ntokens": sample["ntokens"], + "nsentences": sample_size * self.task.cfg.mt_beam, + "sample_size": sample_size, + "scores": scores.detach(), + } + + return loss, sample_size, logging_output + + def compute_kl_loss(self, logits, target): + norm_target = target + if self.target_dist_norm == "minmax": + min_v = torch.min(target, 1, keepdim=True).values + max_v = torch.max(target, 1, keepdim=True).values + norm_target = (target - min_v) / (max_v - min_v + _EPSILON) + + target_dist = F.softmax( + norm_target / self.temperature, dim=-1, dtype=torch.float32 + ) + model_dist = F.log_softmax(logits, dim=-1, dtype=torch.float32) + loss = -(target_dist * model_dist - target_dist * target_dist.log()).sum() + return loss + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) + + sample_size = utils.item( + sum(log.get("sample_size", 0) for log in logging_outputs) + ) + + loss = loss_sum / sample_size / math.log(2) + metrics.log_scalar("loss", loss, sample_size, round=3) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/examples/discriminative_reranking_nmt/drnmt_rerank.py b/examples/discriminative_reranking_nmt/drnmt_rerank.py new file mode 100644 index 0000000000..2e0fc2bd29 --- /dev/null +++ b/examples/discriminative_reranking_nmt/drnmt_rerank.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Score raw text with a trained model. 
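+
+The --in-text file is the output of fairseq-interactive (S-/D- lines). Each
+hypothesis is scored by the trained reranker, and the reranker score is combined
+with the forward MT score using a weight (--fw-weight) and length penalty
+(--lenpen) that can either be given directly or tuned by random search
+(--tune, which requires --target-text).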
+""" + +from collections import namedtuple +import logging +from multiprocessing import Pool +import sys +import os +import random + +import numpy as np +import sacrebleu +import torch + +from fairseq import checkpoint_utils, options, utils + + +logger = logging.getLogger("fairseq_cli.drnmt_rerank") +logger.setLevel(logging.INFO) + +Batch = namedtuple("Batch", "ids src_tokens src_lengths") + + +pool_init_variables = {} + + +def init_loaded_scores(mt_scores, model_scores, hyp, ref): + global pool_init_variables + pool_init_variables["mt_scores"] = mt_scores + pool_init_variables["model_scores"] = model_scores + pool_init_variables["hyp"] = hyp + pool_init_variables["ref"] = ref + + +def parse_fairseq_gen(filename, task): + source = {} + hypos = {} + scores = {} + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line.startswith("S-"): # source + uid, text = line.split("\t", 1) + uid = int(uid[2:]) + source[uid] = text + elif line.startswith("D-"): # hypo + uid, score, text = line.split("\t", 2) + uid = int(uid[2:]) + if uid not in hypos: + hypos[uid] = [] + scores[uid] = [] + hypos[uid].append(text) + scores[uid].append(float(score)) + else: + continue + + source_out = [source[i] for i in range(len(hypos))] + hypos_out = [h for i in range(len(hypos)) for h in hypos[i]] + scores_out = [s for i in range(len(scores)) for s in scores[i]] + + return source_out, hypos_out, scores_out + + +def read_target(filename): + with open(filename, "r", encoding="utf-8") as f: + output = [line.strip() for line in f] + return output + + +def make_batches(args, src, hyp, task, max_positions, encode_fn): + assert len(src) * args.beam == len( + hyp + ), f"Expect {len(src) * args.beam} hypotheses for {len(src)} source sentences with beam size {args.beam}. Got {len(hyp)} hypotheses intead." 
+ hyp_encode = [ + task.source_dictionary.encode_line(encode_fn(h), add_if_not_exist=False).long() + for h in hyp + ] + if task.cfg.include_src: + src_encode = [ + task.source_dictionary.encode_line( + encode_fn(s), add_if_not_exist=False + ).long() + for s in src + ] + tokens = [(src_encode[i // args.beam], h) for i, h in enumerate(hyp_encode)] + lengths = [(t1.numel(), t2.numel()) for t1, t2 in tokens] + else: + tokens = [(h,) for h in hyp_encode] + lengths = [(h.numel(),) for h in hyp_encode] + + itr = task.get_batch_iterator( + dataset=task.build_dataset_for_inference(tokens, lengths), + max_tokens=args.max_tokens, + max_sentences=args.batch_size, + max_positions=max_positions, + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + ).next_epoch_itr(shuffle=False) + + for batch in itr: + yield Batch( + ids=batch["id"], + src_tokens=batch["net_input"]["src_tokens"], + src_lengths=batch["net_input"]["src_lengths"], + ) + + +def decode_rerank_scores(args): + if args.max_tokens is None and args.batch_size is None: + args.batch_size = 1 + + logger.info(args) + + use_cuda = torch.cuda.is_available() and not args.cpu + + # Load ensemble + logger.info("loading model(s) from {}".format(args.path)) + models, _model_args, task = checkpoint_utils.load_model_ensemble_and_task( + [args.path], arg_overrides=eval(args.model_overrides), + ) + + for model in models: + if args.fp16: + model.half() + if use_cuda: + model.cuda() + + # Initialize generator + generator = task.build_generator(args) + + # Handle tokenization and BPE + tokenizer = task.build_tokenizer(args) + bpe = task.build_bpe(args) + + def encode_fn(x): + if tokenizer is not None: + x = tokenizer.encode(x) + if bpe is not None: + x = bpe.encode(x) + return x + + max_positions = utils.resolve_max_positions( + task.max_positions(), *[model.max_positions() for model in models] + ) + + src, hyp, mt_scores = parse_fairseq_gen(args.in_text, task) + model_scores = {} + logger.info("decode reranker score") + for batch in make_batches(args, src, hyp, task, max_positions, encode_fn): + src_tokens = batch.src_tokens + src_lengths = batch.src_lengths + if use_cuda: + src_tokens = src_tokens.cuda() + src_lengths = src_lengths.cuda() + + sample = { + "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths}, + } + scores = task.inference_step(generator, models, sample) + + for id, sc in zip(batch.ids.tolist(), scores.tolist()): + model_scores[id] = sc[0] + + model_scores = [model_scores[i] for i in range(len(model_scores))] + + return src, hyp, mt_scores, model_scores + + +def get_score(mt_s, md_s, w1, lp, tgt_len): + return mt_s / (tgt_len ** lp) * w1 + md_s + + +def get_best_hyps(mt_scores, md_scores, hypos, fw_weight, lenpen, beam): + assert len(mt_scores) == len(md_scores) and len(mt_scores) == len(hypos) + hypo_scores = [] + best_hypos = [] + best_scores = [] + offset = 0 + for i in range(len(hypos)): + tgt_len = len(hypos[i].split()) + hypo_scores.append( + get_score(mt_scores[i], md_scores[i], fw_weight, lenpen, tgt_len) + ) + + if (i + 1) % beam == 0: + max_i = np.argmax(hypo_scores) + best_hypos.append(hypos[offset + max_i]) + best_scores.append(hypo_scores[max_i]) + hypo_scores = [] + offset += beam + return best_hypos, best_scores + + +def eval_metric(args, hypos, ref): + if args.metric == "bleu": + score = sacrebleu.corpus_bleu(hypos, [ref]).score + else: + score = sacrebleu.corpus_ter(hypos, [ref]).score + + return score + + +def score_target_hypo(args, fw_weight, lp): + mt_scores = pool_init_variables["mt_scores"] + 
model_scores = pool_init_variables["model_scores"] + hyp = pool_init_variables["hyp"] + ref = pool_init_variables["ref"] + best_hypos, _ = get_best_hyps( + mt_scores, model_scores, hyp, fw_weight, lp, args.beam + ) + rerank_eval = None + if ref: + rerank_eval = eval_metric(args, best_hypos, ref) + print(f"fw_weight {fw_weight}, lenpen {lp}, eval {rerank_eval}") + + return rerank_eval + + +def print_result(best_scores, best_hypos, output_file): + for i, (s, h) in enumerate(zip(best_scores, best_hypos)): + print(f"{i}\t{s}\t{h}", file=output_file) + + +def main(args): + utils.import_user_module(args) + + src, hyp, mt_scores, model_scores = decode_rerank_scores(args) + + assert ( + not args.tune or args.target_text is not None + ), "--target-text has to be set when tuning weights" + if args.target_text: + ref = read_target(args.target_text) + assert len(src) == len( + ref + ), f"different numbers of source and target sentences ({len(src)} vs. {len(ref)})" + + orig_best_hypos = [hyp[i] for i in range(0, len(hyp), args.beam)] + orig_eval = eval_metric(args, orig_best_hypos, ref) + + if args.tune: + logger.info("tune weights for reranking") + + random_params = np.array( + [ + [ + random.uniform( + args.lower_bound_fw_weight, args.upper_bound_fw_weight + ), + random.uniform(args.lower_bound_lenpen, args.upper_bound_lenpen), + ] + for k in range(args.num_trials) + ] + ) + + logger.info("launching pool") + with Pool( + 32, + initializer=init_loaded_scores, + initargs=(mt_scores, model_scores, hyp, ref), + ) as p: + rerank_scores = p.starmap( + score_target_hypo, + [ + (args, random_params[i][0], random_params[i][1],) + for i in range(args.num_trials) + ], + ) + if args.metric == "bleu": + best_index = np.argmax(rerank_scores) + else: + best_index = np.argmin(rerank_scores) + best_fw_weight = random_params[best_index][0] + best_lenpen = random_params[best_index][1] + else: + assert ( + args.lenpen is not None and args.fw_weight is not None + ), "--lenpen and --fw-weight should be set" + best_fw_weight, best_lenpen = args.fw_weight, args.lenpen + + best_hypos, best_scores = get_best_hyps( + mt_scores, model_scores, hyp, best_fw_weight, best_lenpen, args.beam + ) + + if args.results_path is not None: + os.makedirs(args.results_path, exist_ok=True) + output_path = os.path.join( + args.results_path, "generate-{}.txt".format(args.gen_subset), + ) + with open(output_path, "w", buffering=1, encoding="utf-8") as o: + print_result(best_scores, best_hypos, o) + else: + print_result(best_scores, best_hypos, sys.stdout) + + if args.target_text: + rerank_eval = eval_metric(args, best_hypos, ref) + print(f"before reranking, {args.metric.upper()}:", orig_eval) + print( + f"after reranking with fw_weight={best_fw_weight}, lenpen={best_lenpen}, {args.metric.upper()}:", + rerank_eval, + ) + + +def cli_main(): + parser = options.get_generation_parser(interactive=True) + + parser.add_argument( + "--in-text", + default=None, + required=True, + help="text from fairseq-interactive output, containing source sentences and hypotheses", + ) + parser.add_argument("--target-text", default=None, help="reference text") + parser.add_argument("--metric", type=str, choices=["bleu", "ter"], default="bleu") + parser.add_argument( + "--tune", + action="store_true", + help="if set, tune weights on fw scores and lenpen instead of applying fixed weights for reranking", + ) + parser.add_argument( + "--lower-bound-fw-weight", + default=0.0, + type=float, + help="lower bound of search space", + ) + parser.add_argument( + 
"--upper-bound-fw-weight", + default=3, + type=float, + help="upper bound of search space", + ) + parser.add_argument( + "--lower-bound-lenpen", + default=0.0, + type=float, + help="lower bound of search space", + ) + parser.add_argument( + "--upper-bound-lenpen", + default=3, + type=float, + help="upper bound of search space", + ) + parser.add_argument( + "--fw-weight", type=float, default=None, help="weight on the fw model score" + ) + parser.add_argument( + "--num-trials", + default=1000, + type=int, + help="number of trials to do for random search", + ) + + args = options.parse_args_and_arch(parser) + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/discriminative_reranking_nmt/models/__init__.py b/examples/discriminative_reranking_nmt/models/__init__.py new file mode 100644 index 0000000000..c593ea5f18 --- /dev/null +++ b/examples/discriminative_reranking_nmt/models/__init__.py @@ -0,0 +1,6 @@ +from .discriminative_reranking_model import DiscriminativeNMTReranker + + +__all__ = [ + "DiscriminativeNMTReranker", +] diff --git a/examples/discriminative_reranking_nmt/models/discriminative_reranking_model.py b/examples/discriminative_reranking_nmt/models/discriminative_reranking_model.py new file mode 100644 index 0000000000..e4b5887f82 --- /dev/null +++ b/examples/discriminative_reranking_nmt/models/discriminative_reranking_model.py @@ -0,0 +1,365 @@ +from dataclasses import dataclass, field +import os + +import torch +import torch.nn as nn + +from fairseq import utils +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.models import ( + BaseFairseqModel, + register_model, +) + +from fairseq.models.roberta.model import RobertaClassificationHead + +from fairseq.modules import ( + LayerNorm, + TransformerSentenceEncoder, + TransformerSentenceEncoderLayer, +) + + +ACTIVATION_FN_CHOICES = ChoiceEnum(utils.get_available_activation_fns()) +JOINT_CLASSIFICATION_CHOICES = ChoiceEnum(["none", "sent"]) +SENTENCE_REP_CHOICES = ChoiceEnum(["head", "meanpool", "maxpool"]) + + +def update_init_roberta_model_state(state): + """ + update the state_dict of a Roberta model for initializing + weights of the BertRanker + """ + for k in list(state.keys()): + if ".lm_head." in k or "version" in k: + del state[k] + continue + # remove 'encoder/decoder.sentence_encoder.' from the key + assert k.startswith("encoder.sentence_encoder.") or k.startswith( + "decoder.sentence_encoder." 
+ ), f"Cannot recognize parameter name {k}" + if "layernorm_embedding" in k: + new_k = k.replace(".layernorm_embedding.", ".emb_layer_norm.") + state[new_k[25:]] = state[k] + else: + state[k[25:]] = state[k] + del state[k] + + +class BaseRanker(nn.Module): + def __init__(self, args, task): + super().__init__() + + self.separator_token = task.dictionary.eos() + self.padding_idx = task.dictionary.pad() + + def forward(self, src_tokens): + raise NotImplementedError + + def get_segment_labels(self, src_tokens): + segment_boundary = (src_tokens == self.separator_token).long() + segment_labels = ( + segment_boundary.cumsum(dim=1) + - segment_boundary + - (src_tokens == self.padding_idx).long() + ) + + return segment_labels + + def get_positions(self, src_tokens, segment_labels): + segment_positions = ( + torch.arange(src_tokens.shape[1]) + .to(src_tokens.device) + .repeat(src_tokens.shape[0], 1) + ) + segment_boundary = (src_tokens == self.separator_token).long() + _, col_idx = (segment_positions * segment_boundary).nonzero(as_tuple=True) + col_idx = torch.cat([torch.zeros(1).type_as(col_idx), col_idx]) + offset = torch.cat( + [ + torch.zeros(1).type_as(segment_boundary), + segment_boundary.sum(dim=1).cumsum(dim=0)[:-1], + ] + ) + segment_positions -= col_idx[segment_labels + offset.unsqueeze(1)] * ( + segment_labels != 0 + ) + + padding_mask = src_tokens.ne(self.padding_idx) + segment_positions = (segment_positions + 1) * padding_mask.type_as( + segment_positions + ) + self.padding_idx + + return segment_positions + + +class BertRanker(BaseRanker): + def __init__(self, args, task): + super(BertRanker, self).__init__(args, task) + + init_model = getattr(args, "pretrained_model", "") + self.joint_layers = nn.ModuleList() + if os.path.isfile(init_model): + print(f"initialize weight from {init_model}") + + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + os.path.dirname(init_model), + checkpoint_file=os.path.basename(init_model), + ) + + in_state_dict = x["models"][0].state_dict() + init_args = x["args"].model + + num_positional_emb = init_args.max_positions + task.dictionary.pad() + 1 + + # follow the setup in roberta + self.model = TransformerSentenceEncoder( + padding_idx=task.dictionary.pad(), + vocab_size=len(task.dictionary), + num_encoder_layers=getattr( + args, "encoder_layers", init_args.encoder_layers + ), + embedding_dim=init_args.encoder_embed_dim, + ffn_embedding_dim=init_args.encoder_ffn_embed_dim, + num_attention_heads=init_args.encoder_attention_heads, + dropout=init_args.dropout, + attention_dropout=init_args.attention_dropout, + activation_dropout=init_args.activation_dropout, + num_segments=2, # add language embeddings + max_seq_len=num_positional_emb, + offset_positions_by_padding=False, + encoder_normalize_before=True, + apply_bert_init=True, + activation_fn=init_args.activation_fn, + freeze_embeddings=args.freeze_embeddings, + n_trans_layers_to_freeze=args.n_trans_layers_to_freeze, + ) + + # still need to learn segment embeddings as we added a second language embedding + if args.freeze_embeddings: + for p in self.model.segment_embeddings.parameters(): + p.requires_grad = False + + update_init_roberta_model_state(in_state_dict) + print("loading weights from the pretrained model") + self.model.load_state_dict( + in_state_dict, strict=False + ) # ignore mismatch in language embeddings + + ffn_embedding_dim = init_args.encoder_ffn_embed_dim + num_attention_heads = init_args.encoder_attention_heads + dropout = init_args.dropout + attention_dropout = 
init_args.attention_dropout + activation_dropout = init_args.activation_dropout + activation_fn = init_args.activation_fn + + classifier_embed_dim = getattr( + args, "embed_dim", init_args.encoder_embed_dim + ) + if classifier_embed_dim != init_args.encoder_embed_dim: + self.transform_layer = nn.Linear( + init_args.encoder_embed_dim, classifier_embed_dim + ) + else: + self.model = TransformerSentenceEncoder( + padding_idx=task.dictionary.pad(), + vocab_size=len(task.dictionary), + num_encoder_layers=args.encoder_layers, + embedding_dim=args.embed_dim, + ffn_embedding_dim=args.ffn_embed_dim, + num_attention_heads=args.attention_heads, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + max_seq_len=task.max_positions() + if task.max_positions() + else args.tokens_per_sample, + num_segments=2, + offset_positions_by_padding=False, + encoder_normalize_before=args.encoder_normalize_before, + apply_bert_init=args.apply_bert_init, + activation_fn=args.activation_fn, + ) + + classifier_embed_dim = args.embed_dim + ffn_embedding_dim = args.ffn_embed_dim + num_attention_heads = args.attention_heads + dropout = args.dropout + attention_dropout = args.attention_dropout + activation_dropout = args.activation_dropout + activation_fn = args.activation_fn + + self.joint_classification = args.joint_classification + if args.joint_classification == "sent": + if args.joint_normalize_before: + self.joint_layer_norm = LayerNorm(classifier_embed_dim) + else: + self.joint_layer_norm = None + + self.joint_layers = nn.ModuleList( + [ + TransformerSentenceEncoderLayer( + embedding_dim=classifier_embed_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=dropout, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + ) + for _ in range(args.num_joint_layers) + ] + ) + + self.classifier = RobertaClassificationHead( + classifier_embed_dim, + classifier_embed_dim, + 1, # num_classes + "tanh", + args.classifier_dropout, + ) + + def forward(self, src_tokens, src_lengths): + segment_labels = self.get_segment_labels(src_tokens) + positions = self.get_positions(src_tokens, segment_labels) + + inner_states, _ = self.model( + tokens=src_tokens, + segment_labels=segment_labels, + last_state_only=True, + positions=positions, + ) + + return inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C + + def sentence_forward(self, encoder_out, src_tokens=None, sentence_rep="head"): + # encoder_out: B x T x C + if sentence_rep == "head": + x = encoder_out[:, :1, :] + else: # 'meanpool', 'maxpool' + assert src_tokens is not None, "meanpool requires src_tokens input" + segment_labels = self.get_segment_labels(src_tokens) + padding_mask = src_tokens.ne(self.padding_idx) + encoder_mask = segment_labels * padding_mask.type_as(segment_labels) + + if sentence_rep == "meanpool": + ntokens = torch.sum(encoder_mask, dim=1, keepdim=True) + x = torch.sum( + encoder_out * encoder_mask.unsqueeze(2), dim=1, keepdim=True + ) / ntokens.unsqueeze(2).type_as(encoder_out) + else: # 'maxpool' + encoder_out[ + (encoder_mask == 0).unsqueeze(2).repeat(1, 1, encoder_out.shape[-1]) + ] = -float("inf") + x, _ = torch.max(encoder_out, dim=1, keepdim=True) + + if hasattr(self, "transform_layer"): + x = self.transform_layer(x) + + return x # B x 1 x C + + def joint_forward(self, x): + # x: T x B x C + if self.joint_layer_norm: + x = self.joint_layer_norm(x.transpose(0, 1)) + x = x.transpose(0, 1) + + 
for layer in self.joint_layers: + x, _ = layer(x, self_attn_padding_mask=None) + return x + + def classification_forward(self, x): + # x: B x T x C + return self.classifier(x) + + +@dataclass +class DiscriminativeNMTRerankerConfig(FairseqDataclass): + pretrained_model: str = field( + default="", metadata={"help": "pretrained model to load"} + ) + sentence_rep: SENTENCE_REP_CHOICES = field( + default="head", + metadata={ + "help": "method to transform the output of the transformer stack to a sentence-level representation" + }, + ) + + dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) + attention_dropout: float = field( + default=0.0, metadata={"help": "dropout probability for attention weights"} + ) + activation_dropout: float = field( + default=0.0, metadata={"help": "dropout probability after activation in FFN"} + ) + classifier_dropout: float = field( + default=0.0, metadata={"help": "classifier dropout probability"} + ) + embed_dim: int = field(default=768, metadata={"help": "embedding dimension"}) + ffn_embed_dim: int = field( + default=2048, metadata={"help": "embedding dimension for FFN"} + ) + encoder_layers: int = field(default=12, metadata={"help": "num encoder layers"}) + attention_heads: int = field(default=8, metadata={"help": "num attention heads"}) + encoder_normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each encoder block"} + ) + apply_bert_init: bool = field( + default=False, metadata={"help": "use custom param initialization for BERT"} + ) + activation_fn: ACTIVATION_FN_CHOICES = field( + default="relu", metadata={"help": "activation function to use"} + ) + freeze_embeddings: bool = field( + default=False, metadata={"help": "freeze embeddings in the pretrained model"} + ) + n_trans_layers_to_freeze: int = field( + default=0, + metadata={ + "help": "number of layers to freeze in the pretrained transformer model" + }, + ) + + # joint classfication + joint_classification: JOINT_CLASSIFICATION_CHOICES = field( + default="none", + metadata={"help": "method to compute joint features for classification"}, + ) + num_joint_layers: int = field( + default=1, metadata={"help": "number of joint layers"} + ) + joint_normalize_before: bool = field( + default=False, + metadata={"help": "apply layer norm on the input to the joint layer"}, + ) + + +@register_model( + "discriminative_nmt_reranker", dataclass=DiscriminativeNMTRerankerConfig +) +class DiscriminativeNMTReranker(BaseFairseqModel): + @classmethod + def build_model(cls, args, task): + model = BertRanker(args, task) + return DiscriminativeNMTReranker(args, model) + + def __init__(self, args, model): + super().__init__() + + self.model = model + self.sentence_rep = args.sentence_rep + self.joint_classification = args.joint_classification + + def forward(self, src_tokens, src_lengths, **kwargs): + return self.model(src_tokens, src_lengths) + + def sentence_forward(self, encoder_out, src_tokens): + return self.model.sentence_forward(encoder_out, src_tokens, self.sentence_rep) + + def joint_forward(self, x): + return self.model.joint_forward(x) + + def classification_forward(self, x): + return self.model.classification_forward(x) diff --git a/examples/discriminative_reranking_nmt/scripts/prep_data.py b/examples/discriminative_reranking_nmt/scripts/prep_data.py new file mode 100755 index 0000000000..7aa7d37edc --- /dev/null +++ b/examples/discriminative_reranking_nmt/scripts/prep_data.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +import argparse +from 
multiprocessing import Pool +from pathlib import Path + +import sacrebleu +import sentencepiece as spm + + +def read_text_file(filename): + with open(filename, "r") as f: + output = [line.strip() for line in f] + + return output + + +def get_bleu(in_sent, target_sent): + bleu = sacrebleu.corpus_bleu([in_sent], [[target_sent]]) + out = " ".join( + map(str, [bleu.score, bleu.sys_len, bleu.ref_len] + bleu.counts + bleu.totals) + ) + return out + + +def get_ter(in_sent, target_sent): + ter = sacrebleu.corpus_ter([in_sent], [[target_sent]]) + out = " ".join(map(str, [ter.score, ter.num_edits, ter.ref_length])) + return out + + +def init(sp_model): + global sp + sp = spm.SentencePieceProcessor() + sp.Load(sp_model) + + +def process(source_sent, target_sent, hypo_sent, metric): + source_bpe = " ".join(sp.EncodeAsPieces(source_sent)) + hypo_bpe = [" ".join(sp.EncodeAsPieces(h)) for h in hypo_sent] + + if metric == "bleu": + score_str = [get_bleu(h, target_sent) for h in hypo_sent] + else: # ter + score_str = [get_ter(h, target_sent) for h in hypo_sent] + + return source_bpe, hypo_bpe, score_str + + +def main(args): + assert ( + args.split.startswith("train") or args.num_shards == 1 + ), "--num-shards should be set to 1 for valid and test sets" + assert ( + args.split.startswith("train") + or args.split.startswith("valid") + or args.split.startswith("test") + ), "--split should be set to train[n]/valid[n]/test[n]" + + source_sents = read_text_file(args.input_source) + target_sents = read_text_file(args.input_target) + + num_sents = len(source_sents) + assert num_sents == len( + target_sents + ), f"{args.input_source} and {args.input_target} should have the same number of sentences." + + hypo_sents = read_text_file(args.input_hypo) + assert ( + len(hypo_sents) % args.beam == 0 + ), f"Number of hypotheses ({len(hypo_sents)}) cannot be divided by beam size ({args.beam})." + + hypo_sents = [ + hypo_sents[i : i + args.beam] for i in range(0, len(hypo_sents), args.beam) + ] + assert num_sents == len( + hypo_sents + ), f"{args.input_hypo} should contain {num_sents * args.beam} hypotheses but only has {len(hypo_sents) * args.beam}. 
(--beam={args.beam})" + + output_dir = args.output_dir / args.metric + for ns in range(args.num_shards): + print(f"processing shard {ns+1}/{args.num_shards}") + shard_output_dir = output_dir / f"split{ns+1}" + source_output_dir = shard_output_dir / "input_src" + hypo_output_dir = shard_output_dir / "input_tgt" + metric_output_dir = shard_output_dir / args.metric + + source_output_dir.mkdir(parents=True, exist_ok=True) + hypo_output_dir.mkdir(parents=True, exist_ok=True) + metric_output_dir.mkdir(parents=True, exist_ok=True) + + if args.n_proc > 1: + with Pool( + args.n_proc, initializer=init, initargs=(args.sentencepiece_model,) + ) as p: + output = p.starmap( + process, + [ + (source_sents[i], target_sents[i], hypo_sents[i], args.metric) + for i in range(ns, num_sents, args.num_shards) + ], + ) + else: + init(args.sentencepiece_model) + output = [ + process(source_sents[i], target_sents[i], hypo_sents[i], args.metric) + for i in range(ns, num_sents, args.num_shards) + ] + + with open(source_output_dir / f"{args.split}.bpe", "w") as s_o, open( + hypo_output_dir / f"{args.split}.bpe", "w" + ) as h_o, open(metric_output_dir / f"{args.split}.{args.metric}", "w") as m_o: + for source_bpe, hypo_bpe, score_str in output: + assert len(hypo_bpe) == len(score_str) + for h, m in zip(hypo_bpe, score_str): + s_o.write(f"{source_bpe}\n") + h_o.write(f"{h}\n") + m_o.write(f"{m}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input-source", type=Path, required=True) + parser.add_argument("--input-target", type=Path, required=True) + parser.add_argument("--input-hypo", type=Path, required=True) + parser.add_argument("--output-dir", type=Path, required=True) + parser.add_argument("--split", type=str, required=True) + parser.add_argument("--beam", type=int, required=True) + parser.add_argument("--sentencepiece-model", type=str, required=True) + parser.add_argument("--metric", type=str, choices=["bleu", "ter"], default="bleu") + parser.add_argument("--num-shards", type=int, default=1) + parser.add_argument("--n-proc", type=int, default=8) + + args = parser.parse_args() + + main(args) diff --git a/examples/discriminative_reranking_nmt/tasks/__init__.py b/examples/discriminative_reranking_nmt/tasks/__init__.py new file mode 100644 index 0000000000..2d78ca9870 --- /dev/null +++ b/examples/discriminative_reranking_nmt/tasks/__init__.py @@ -0,0 +1,6 @@ +from .discriminative_reranking_task import DiscriminativeRerankingNMTTask + + +__all__ = [ + "DiscriminativeRerankingNMTTask", +] diff --git a/examples/discriminative_reranking_nmt/tasks/discriminative_reranking_task.py b/examples/discriminative_reranking_nmt/tasks/discriminative_reranking_task.py new file mode 100644 index 0000000000..b4ed2a69aa --- /dev/null +++ b/examples/discriminative_reranking_nmt/tasks/discriminative_reranking_task.py @@ -0,0 +1,490 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
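+#
+# Discriminative reranking task: loads N-best lists (mt_beam hypotheses per source
+# sentence) together with their sentence-level metric labels, keeps each beam
+# contiguous when batching and shuffling, and exposes RerankerScorer so that
+# "generation" returns one reranker score per hypothesis.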
+ +from dataclasses import dataclass, field + +import itertools +import logging +import os + +import numpy as np +import torch + +from fairseq.logging import metrics +from fairseq.data import ( + ConcatDataset, + ConcatSentencesDataset, + data_utils, + Dictionary, + IdDataset, + indexed_dataset, + NestedDictionaryDataset, + NumSamplesDataset, + NumelDataset, + PrependTokenDataset, + RawLabelDataset, + RightPadDataset, + SortDataset, + TruncateDataset, + TokenBlockDataset, +) +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.tasks import FairseqTask, register_task +from omegaconf import II, MISSING + + +EVAL_BLEU_ORDER = 4 +TARGET_METRIC_CHOICES = ChoiceEnum(["bleu", "ter"]) + +logger = logging.getLogger(__name__) + + +@dataclass +class DiscriminativeRerankingNMTConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + num_data_splits: int = field( + default=1, metadata={"help": "total number of data splits"} + ) + no_shuffle: bool = field( + default=False, metadata={"help": "do not shuffle training data"} + ) + max_positions: int = field( + default=512, metadata={"help": "number of positional embeddings to learn"} + ) + include_src: bool = field( + default=False, metadata={"help": "include source sentence"} + ) + mt_beam: int = field(default=50, metadata={"help": "beam size of input hypotheses"}) + eval_target_metric: bool = field( + default=False, + metadata={"help": "evaluation with the target metric during validation"}, + ) + target_metric: TARGET_METRIC_CHOICES = field( + default="bleu", metadata={"help": "name of the target metric to optimize for"} + ) + train_subset: str = field( + default=II("dataset.train_subset"), + metadata={"help": "data subset to use for training (e.g. train, valid, test)"}, + ) + seed: int = field( + default=II("common.seed"), + metadata={"help": "pseudo random number generator seed"}, + ) + + +class RerankerScorer(object): + """Scores the target for a given (source (optional), target) input.""" + + def __init__(self, args, mt_beam): + self.mt_beam = mt_beam + + @torch.no_grad() + def generate(self, models, sample, **kwargs): + """Score a batch of translations.""" + net_input = sample["net_input"] + + assert len(models) == 1, "does not support model ensemble" + model = models[0] + + bs = net_input["src_tokens"].shape[0] + assert ( + model.joint_classification == "none" or bs % self.mt_beam == 0 + ), f"invalid batch size ({bs}) for joint classification with beam size ({self.mt_beam})" + + model.eval() + logits = model(**net_input) + + batch_out = model.sentence_forward(logits, net_input["src_tokens"]) + if model.joint_classification == "sent": + batch_out = model.joint_forward( + batch_out.view(self.mt_beam, bs // self.mt_beam, -1) + ) + scores = model.classification_forward( + batch_out.view(bs, 1, -1) + ) # input: B x T x C + + return scores + + +@register_task( + "discriminative_reranking_nmt", dataclass=DiscriminativeRerankingNMTConfig +) +class DiscriminativeRerankingNMTTask(FairseqTask): + """ + Translation rerank task. + The input can be either (src, tgt) sentence pairs or tgt sentence only. 
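+    Targets are per-hypothesis metric scores (BLEU, or negated TER so that higher
+    is always better), loaded from the $METRIC label files created by
+    scripts/prep_data.py.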
+ """ + + cfg: DiscriminativeRerankingNMTConfig + + def __init__(self, cfg: DiscriminativeRerankingNMTConfig, data_dictionary=None): + super().__init__(cfg) + self.dictionary = data_dictionary + self._max_positions = cfg.max_positions + # args.tokens_per_sample = self._max_positions + # self.num_classes = 1 # for model + + @classmethod + def load_dictionary(cls, cfg, filename): + """Load the dictionary from the filename""" + dictionary = Dictionary.load(filename) + dictionary.add_symbol("") # for loading pretrained XLMR model + + return dictionary + + @classmethod + def setup_task(cls, cfg: DiscriminativeRerankingNMTConfig, **kwargs): + # load data dictionary (assume joint dictionary) + data_path = cfg.data + data_dict = cls.load_dictionary( + cfg, os.path.join(data_path, "input_src/dict.txt") + ) + + logger.info("[input] src dictionary: {} types".format(len(data_dict))) + + return DiscriminativeRerankingNMTTask(cfg, data_dict) + + def load_dataset(self, split, epoch=0, combine=False, **kwargs): + """Load a given dataset split (e.g., train, valid, test).""" + if self.cfg.data.endswith("1"): + data_shard = (epoch - 1) % self.cfg.num_data_splits + 1 + data_path = self.cfg.data[:-1] + str(data_shard) + else: + data_path = self.cfg.data + + def get_path(type, data_split): + return os.path.join(data_path, str(type), data_split) + + def make_dataset(type, dictionary, data_split, combine): + split_path = get_path(type, data_split) + + dataset = data_utils.load_indexed_dataset( + split_path, + dictionary, + combine=combine, + ) + return dataset + + def load_split(data_split, metric): + input_src = None + if self.cfg.include_src: + input_src = make_dataset( + "input_src", self.dictionary, data_split, combine=False + ) + assert input_src is not None, "could not find dataset: {}".format( + get_path("input_src", data_split) + ) + + input_tgt = make_dataset( + "input_tgt", self.dictionary, data_split, combine=False + ) + assert input_tgt is not None, "could not find dataset: {}".format( + get_path("input_tgt", data_split) + ) + + label_path = f"{get_path(metric, data_split)}.{metric}" + assert os.path.exists(label_path), f"could not find dataset: {label_path}" + + np_labels = np.loadtxt(label_path) + if self.cfg.target_metric == "ter": + np_labels = -np_labels + label = RawLabelDataset(np_labels) + + return input_src, input_tgt, label + + src_datasets = [] + tgt_datasets = [] + label_datasets = [] + + if split == self.cfg.train_subset: + for k in itertools.count(): + split_k = "train" + (str(k) if k > 0 else "") + prefix = os.path.join(data_path, "input_tgt", split_k) + if not indexed_dataset.dataset_exists(prefix, impl=None): + if k > 0: + break + else: + raise FileNotFoundError(f"Dataset not found: {prefix}") + input_src, input_tgt, label = load_split( + split_k, self.cfg.target_metric + ) + src_datasets.append(input_src) + tgt_datasets.append(input_tgt) + label_datasets.append(label) + else: + input_src, input_tgt, label = load_split(split, self.cfg.target_metric) + src_datasets.append(input_src) + tgt_datasets.append(input_tgt) + label_datasets.append(label) + + if len(tgt_datasets) == 1: + input_tgt, label = tgt_datasets[0], label_datasets[0] + if self.cfg.include_src: + input_src = src_datasets[0] + else: + input_tgt = ConcatDataset(tgt_datasets) + label = ConcatDataset(label_datasets) + if self.cfg.include_src: + input_src = ConcatDataset(src_datasets) + + input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions) + if self.cfg.include_src: + input_src = PrependTokenDataset(input_src, 
self.dictionary.bos()) + input_src = TruncateDataset(input_src, self.cfg.max_positions) + src_lengths = NumelDataset(input_src, reduce=False) + src_tokens = ConcatSentencesDataset(input_src, input_tgt) + else: + src_tokens = PrependTokenDataset(input_tgt, self.dictionary.bos()) + src_lengths = NumelDataset(src_tokens, reduce=False) + + dataset = { + "id": IdDataset(), + "net_input": { + "src_tokens": RightPadDataset( + src_tokens, + pad_idx=self.source_dictionary.pad(), + ), + "src_lengths": src_lengths, + }, + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(src_tokens, reduce=True), + "target": label, + } + + dataset = NestedDictionaryDataset( + dataset, + sizes=[src_tokens.sizes], + ) + + assert ( + len(dataset) % self.cfg.mt_beam == 0 + ), "dataset size (%d) is not a multiple of beam size (%d)" % ( + len(dataset), + self.cfg.mt_beam, + ) + + # no need to shuffle valid/test sets + if not self.cfg.no_shuffle and split == self.cfg.train_subset: + + # need to keep all hypothese together + start_idx = np.arange(0, len(dataset), self.cfg.mt_beam) + with data_utils.numpy_seed(self.cfg.seed + epoch): + np.random.shuffle(start_idx) + + idx = np.arange(0, self.cfg.mt_beam) + shuffle = np.tile(idx, (len(start_idx), 1)).reshape(-1) + np.tile( + start_idx, (self.cfg.mt_beam, 1) + ).transpose().reshape(-1) + + dataset = SortDataset( + dataset, + sort_order=[shuffle], + ) + + logger.info(f"Loaded {split} with #samples: {len(dataset)}") + + self.datasets[split] = dataset + return self.datasets[split] + + def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): + assert not self.cfg.include_src or len(src_tokens[0]) == 2 + input_src = None + if self.cfg.include_src: + input_src = TokenBlockDataset( + [t[0] for t in src_tokens], + [l[0] for l in src_lengths], + block_size=None, # ignored for "eos" break mode + pad=self.source_dictionary.pad(), + eos=self.source_dictionary.eos(), + break_mode="eos", + ) + input_src = PrependTokenDataset(input_src, self.dictionary.bos()) + input_src = TruncateDataset(input_src, self.cfg.max_positions) + + input_tgt = TokenBlockDataset( + [t[-1] for t in src_tokens], + [l[-1] for l in src_lengths], + block_size=None, # ignored for "eos" break mode + pad=self.source_dictionary.pad(), + eos=self.source_dictionary.eos(), + break_mode="eos", + ) + input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions) + if self.cfg.include_src: + src_tokens = ConcatSentencesDataset(input_src, input_tgt) + src_lengths = NumelDataset(input_src, reduce=False) + else: + input_tgt = PrependTokenDataset(input_tgt, self.dictionary.bos()) + src_tokens = input_tgt + src_lengths = NumelDataset(src_tokens, reduce=False) + + dataset = { + "id": IdDataset(), + "net_input": { + "src_tokens": RightPadDataset( + src_tokens, + pad_idx=self.source_dictionary.pad(), + ), + "src_lengths": src_lengths, + }, + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(src_tokens, reduce=True), + } + + return NestedDictionaryDataset( + dataset, + sizes=[src_tokens.sizes], + ) + + def build_model(self, cfg: FairseqDataclass, from_checkpoint: bool = False): + return super().build_model(cfg) + + def build_generator(self, args): + return RerankerScorer(args, mt_beam=self.cfg.mt_beam) + + def max_positions(self): + return self._max_positions + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.dictionary + + def create_dummy_batch(self, device): + dummy_target = ( + torch.zeros(self.cfg.mt_beam, 
EVAL_BLEU_ORDER * 2 + 3).long().to(device) + if not self.cfg.eval_ter + else torch.zeros(self.cfg.mt_beam, 3).long().to(device) + ) + + return { + "id": torch.zeros(self.cfg.mt_beam, 1).long().to(device), + "net_input": { + "src_tokens": torch.zeros(self.cfg.mt_beam, 4).long().to(device), + "src_lengths": torch.ones(self.cfg.mt_beam, 1).long().to(device), + }, + "nsentences": 0, + "ntokens": 0, + "target": dummy_target, + } + + def train_step( + self, sample, model, criterion, optimizer, update_num, ignore_grad=False + ): + if ignore_grad and sample is None: + sample = self.create_dummy_batch(model.device) + + return super().train_step( + sample, model, criterion, optimizer, update_num, ignore_grad + ) + + def valid_step(self, sample, model, criterion): + if sample is None: + sample = self.create_dummy_batch(model.device) + + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + + if not self.cfg.eval_target_metric: + return loss, sample_size, logging_output + + scores = logging_output["scores"] + + if self.cfg.target_metric == "bleu": + assert sample["target"].shape[1] == EVAL_BLEU_ORDER * 2 + 3, ( + "target does not contain enough information (" + + str(sample["target"].shape[1]) + + "for evaluating BLEU" + ) + + max_id = torch.argmax(scores, dim=1) + select_id = max_id + torch.arange( + 0, sample_size * self.cfg.mt_beam, self.cfg.mt_beam + ).to(max_id.device) + bleu_data = sample["target"][select_id, 1:].sum(0).data + + logging_output["_bleu_sys_len"] = bleu_data[0] + logging_output["_bleu_ref_len"] = bleu_data[1] + + for i in range(EVAL_BLEU_ORDER): + logging_output["_bleu_counts_" + str(i)] = bleu_data[2 + i] + logging_output["_bleu_totals_" + str(i)] = bleu_data[ + 2 + EVAL_BLEU_ORDER + i + ] + + elif self.cfg.target_metric == "ter": + assert sample["target"].shape[1] == 3, ( + "target does not contain enough information (" + + str(sample["target"].shape[1]) + + "for evaluating TER" + ) + + max_id = torch.argmax(scores, dim=1) + select_id = max_id + torch.arange( + 0, sample_size * self.cfg.mt_beam, self.cfg.mt_beam + ).to(max_id.device) + ter_data = sample["target"][select_id, 1:].sum(0).data + + logging_output["_ter_num_edits"] = -ter_data[0] + logging_output["_ter_ref_len"] = -ter_data[1] + + return loss, sample_size, logging_output + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + if not self.cfg.eval_target_metric: + return + + def sum_logs(key): + return sum(log.get(key, 0) for log in logging_outputs) + + if self.cfg.target_metric == "bleu": + counts, totals = [], [] + for i in range(EVAL_BLEU_ORDER): + counts.append(sum_logs("_bleu_counts_" + str(i))) + totals.append(sum_logs("_bleu_totals_" + str(i))) + + if max(totals) > 0: + # log counts as numpy arrays -- log_scalar will sum them correctly + metrics.log_scalar("_bleu_counts", np.array(counts)) + metrics.log_scalar("_bleu_totals", np.array(totals)) + metrics.log_scalar("_bleu_sys_len", sum_logs("_bleu_sys_len")) + metrics.log_scalar("_bleu_ref_len", sum_logs("_bleu_ref_len")) + + def compute_bleu(meters): + import inspect + import sacrebleu + + fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0] + if "smooth_method" in fn_sig: + smooth = {"smooth_method": "exp"} + else: + smooth = {"smooth": "exp"} + bleu = sacrebleu.compute_bleu( + correct=meters["_bleu_counts"].sum, + total=meters["_bleu_totals"].sum, + sys_len=meters["_bleu_sys_len"].sum, + ref_len=meters["_bleu_ref_len"].sum, + **smooth, + ) + return round(bleu.score, 2) 
+ + metrics.log_derived("bleu", compute_bleu) + elif self.cfg.target_metric == "ter": + num_edits = sum_logs("_ter_num_edits") + ref_len = sum_logs("_ter_ref_len") + + if ref_len > 0: + metrics.log_scalar("_ter_num_edits", num_edits) + metrics.log_scalar("_ter_ref_len", ref_len) + + def compute_ter(meters): + score = meters["_ter_num_edits"].sum / meters["_ter_ref_len"].sum + return round(score.item(), 2) + + metrics.log_derived("ter", compute_ter) diff --git a/examples/emotion_conversion/README.md b/examples/emotion_conversion/README.md new file mode 100644 index 0000000000..caf22befe4 --- /dev/null +++ b/examples/emotion_conversion/README.md @@ -0,0 +1,214 @@ +# Textless speech emotion conversion using decomposed and discrete representations +[Felix Kreuk](https://felixkreuk.github.io), Adam Polyak, Jade Copet, Eugene Kharitonov, Tu-Anh Nguyen, Morgane Rivière, Wei-Ning Hsu, Abdelrahman Mohamed, Emmanuel Dupoux, [Yossi Adi](https://adiyoss.github.io) + +_abstract_: Speech emotion conversion is the task of modifying the perceived emotion of a speech utterance while preserving the lexical content and speaker identity. In this study, we cast the problem of emotion conversion as a spoken language translation task. We decompose speech into discrete and disentangled learned representations, consisting of content units, F0, speaker, and emotion. First, we modify the speech content by translating the content units to a target emotion, and then predict the prosodic features based on these units. Finally, the speech waveform is generated by feeding the predicted representations into a neural vocoder. Such a paradigm allows us to go beyond spectral and parametric changes of the signal, and model non-verbal vocalizations, such as laughter insertion, yawning removal, etc. We demonstrate objectively and subjectively that the proposed method is superior to the baselines in terms of perceived emotion and audio quality. We rigorously evaluate all components of such a complex system and conclude with an extensive model analysis and ablation study to better emphasize the architectural choices, strengths and weaknesses of the proposed method. Samples and code will be publicly available under the following link: https://speechbot.github.io/emotion. 
+
+## Installation
+First, create a conda virtual environment and activate it:
+```
+conda create -n emotion python=3.8 -y
+conda activate emotion
+```
+
+Then, clone this repository:
+```
+git clone https://github.com/facebookresearch/fairseq.git
+cd fairseq/examples/emotion_conversion
+git clone https://github.com/felixkreuk/speech-resynthesis
+```
+
+Next, download the EmoV discrete tokens:
+```
+wget https://dl.fbaipublicfiles.com/textless_nlp/emotion_conversion/data.tar.gz # (still in fairseq/examples/emotion_conversion)
+tar -xzvf data.tar.gz
+```
+
+Your `fairseq/examples/emotion_conversion` directory should look like this:
+```
+drwxrwxr-x 3 felixkreuk felixkreuk 0 Feb 6 2022 data
+drwxrwxr-x 3 felixkreuk felixkreuk 0 Sep 28 10:41 emotion_models
+drwxr-xr-x 3 felixkreuk felixkreuk 0 Jun 29 05:43 fairseq_models
+drwxr-xr-x 3 felixkreuk felixkreuk 0 Sep 28 10:41 preprocess
+-rw-rw-r-- 1 felixkreuk felixkreuk 11K Dec 5 09:00 README.md
+-rw-rw-r-- 1 felixkreuk felixkreuk 88 Mar 6 2022 requirements.txt
+-rw-rw-r-- 1 felixkreuk felixkreuk 13K Jun 29 06:26 synthesize.py
+```
+
+Lastly, install fairseq and the other packages:
+```
+pip install --editable ./
+pip install -r examples/emotion_conversion/requirements.txt
+```
+
+## Data preprocessing
+
+### Convert your audio to discrete representations
+Please follow the steps described [here](https://github.com/pytorch/fairseq/tree/main/examples/hubert/simple_kmeans).
+To generate the same discrete representations, please use the following:
+1. [HuBERT checkpoint](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt)
+2. k-means model at `data/hubert_base_ls960_layer9_clusters200/data_hubert_base_ls960_layer9_clusters200.bin`
+
+### Construct data splits
+This step will use the discrete representations from the previous step and split them into train/valid/test sets for 3 tasks:
+1. Translation model pre-training (BART language denoising)
+2. Translation model training (content units emotion translation mechanism)
+3. HiFiGAN model training (for synthesizing audio from discrete representations)
+
+Your processed data should be at `data/`:
+1. `hubert_base_ls960_layer9_clusters200` - discrete representations extracted using HuBERT layer 9, clustered into 200 clusters.
+2. `data.tsv` - a tsv file pointing to the EmoV dataset in your environment (please edit the first line of this file according to your path).
+
+The following command will create the above splits:
+```
+python examples/emotion_conversion/preprocess/create_core_manifest.py \
+    --tsv data/data.tsv \
+    --emov-km data/hubert_base_ls960_layer9_clusters200/data.km \
+    --km data/hubert_base_ls960_layer9_clusters200/vctk.km \
+    --dict data/hubert_base_ls960_layer9_clusters200/dict.txt \
+    --manifests-dir $DATA
+```
+* Set `$DATA` as the directory that will contain the processed data.
+
+### Extract F0
+To train the HiFiGAN vocoder we first need to extract the F0 curves:
+```
+python examples/emotion_conversion/preprocess/extract_f0.py \
+    --tsv data/data.tsv \
+    --extractor pyaapt
+```
+
+## HiFiGAN training
+Now we are all set to train the HiFiGAN vocoder:
+```
+python examples/emotion_conversion/speech-resynthesis/train.py \
+    --checkpoint_path \
+    --config examples/emotion_conversion/speech-resynthesis/configs/EmoV/emov_hubert-layer9-cluster200_fixed-spkr-embedder_f0-raw_gst.json
+```
+
+## Translation Pre-training
+Before translating emotions, we first need to pre-train the translation model as a denoising autoencoder (similarly to BART).
+``` +python train.py \ + $DATA/fairseq-data/emov_multilingual_denoising_cross-speaker_dedup_nonzeroshot/tokenized \ + --save-dir \ + --tensorboard-logdir \ + --langs neutral,amused,angry,sleepy,disgusted,vctk.km \ + --dataset-impl mmap \ + --task multilingual_denoising \ + --arch transformer_small --criterion cross_entropy \ + --multilang-sampling-alpha 1.0 --sample-break-mode eos --max-tokens 16384 \ + --update-freq 1 --max-update 3000000 \ + --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.0 \ + --optimizer adam --weight-decay 0.01 --adam-eps 1e-06 \ + --clip-norm 0.1 --lr-scheduler polynomial_decay --lr 0.0003 \ + --total-num-update 3000000 --warmup-updates 10000 --fp16 \ + --poisson-lambda 3.5 --mask 0.3 --mask-length span-poisson --replace-length 1 --rotate 0 --mask-random 0.1 --insert 0 --permute-sentences 1.0 \ + --skip-invalid-size-inputs-valid-test \ + --user-dir examples/emotion_conversion/fairseq_models +``` + +## Translation Training +Now we are ready to train our emotion translation model: +``` +python train.py \ + --distributed-world-size 1 \ + $DATA/fairseq-data/emov_multilingual_translation_cross-speaker_dedup/tokenized/ \ + --save-dir \ + --tensorboard-logdir \ + --arch multilingual_small --task multilingual_translation \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ + --lang-pairs neutral-amused,neutral-sleepy,neutral-disgusted,neutral-angry,amused-sleepy,amused-disgusted,amused-neutral,amused-angry,angry-amused,angry-sleepy,angry-disgusted,angry-neutral,disgusted-amused,disgusted-sleepy,disgusted-neutral,disgusted-angry,sleepy-amused,sleepy-neutral,sleepy-disgusted,sleepy-angry \ + --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --lr 1e-05 --clip-norm 0 --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.01 --warmup-updates 2000 --lr-scheduler inverse_sqrt \ + --max-tokens 4096 --update-freq 1 --max-update 100000 \ + --required-batch-size-multiple 8 --fp16 --num-workers 4 \ + --seed 2 --log-format json --log-interval 25 --save-interval-updates 1000 \ + --no-epoch-checkpoints --keep-best-checkpoints 1 --keep-interval-updates 1 \ + --finetune-from-model \ + --user-dir examples/emotion_conversion/fairseq_models +``` +* To share encoders/decoders use the `--share-encoders` and `--share-decoders` flags. +* To add source/target emotion tokens use the `--encoder-langtok {'src'|'tgt'}` and `--decoder-langtok` flags. + +## F0-predictor Training +The following command trains the F0 prediction module: +``` +cd examples/emotion_conversion +python -m emotion_models.pitch_predictor n_tokens=200 \ + train_tsv="$DATA/denoising/emov/train.tsv" \ + train_km="$DATA/denoising/emov/train.km" \ + valid_tsv="$DATA/denoising/emov/valid.tsv" \ + valid_km="$DATA/denoising/emov/valid.km" +``` +* See `hyra.run.dir` to configure directory for saving models. + +## Duration-predictor Training +The following command trains the duration prediction modules: +``` +cd examples/emotion_conversion +for emotion in "neutral" "amused" "angry" "disgusted" "sleepy"; do + python -m emotion_models.duration_predictor n_tokens=200 substring=$emotion \ + train_tsv="$DATA/denoising/emov/train.tsv" \ + train_km="$DATA/denoising/emov/train.km" \ + valid_tsv="$DATA/denoising/emov/valid.tsv" \ + valid_km="$DATA/denoising/emov/valid.km" +done +``` +* See `hyra.run.dir` to configure directory for saving models. 
+* After the above command, you should have 5 duration models in your checkpoint directory:
+```
+❯ ll duration_predictor/
+total 21M
+-rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 amused.ckpt
+-rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 angry.ckpt
+-rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 disgusted.ckpt
+-rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 neutral.ckpt
+-rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 sleepy.ckpt
+```
+
+## Token Generation
+The following command uses `fairseq-generate` to generate the token sequences based on the source and target emotions:
+```
+fairseq-generate \
+    $DATA/fairseq-data/emov_multilingual_translation_cross-speaker_dedup/tokenized/ \
+    --task multilingual_translation \
+    --gen-subset test \
+    --path \
+    --beam 5 \
+    --batch-size 4 --max-len-a 1.8 --max-len-b 10 --lenpen 1 --min-len 1 \
+    --skip-invalid-size-inputs-valid-test --distributed-world-size 1 \
+    --source-lang neutral --target-lang amused \
+    --lang-pairs neutral-amused,neutral-sleepy,neutral-disgusted,neutral-angry,amused-sleepy,amused-disgusted,amused-neutral,amused-angry,angry-amused,angry-sleepy,angry-disgusted,angry-neutral,disgusted-amused,disgusted-sleepy,disgusted-neutral,disgusted-angry,sleepy-amused,sleepy-neutral,sleepy-disgusted,sleepy-angry \
+    --results-path \
+    --user-dir examples/emotion_conversion/fairseq_models
+```
+* Modify `--source-lang` and `--target-lang` to select the source and target emotions.
+* See [fairseq documentation](https://fairseq.readthedocs.io/en/latest/command_line_tools.html#fairseq-generate) for a full overview of generation parameters (e.g., top-k/top-p sampling).
+
+## Waveform Synthesis
+Using the output of the above command, the HiFiGAN vocoder, and the prosody prediction modules (F0 and duration), we can now generate the output waveforms:
+```
+python examples/emotion_conversion/synthesize.py \
+    --result-path /generate-test.txt \
+    --data $DATA/fairseq-data/emov_multilingual_translation_cross-speaker_dedup/neutral-amused \
+    --orig-tsv examples/emotion_conversion/data/data.tsv \
+    --orig-km examples/emotion_conversion/data/hubert_base_ls960_layer9_clusters200/data.km \
+    --checkpoint-file /g_00400000 \
+    --dur-model duration_predictor/ \
+    --f0-model pitch_predictor/pitch_predictor.ckpt \
+    -s neutral -t amused \
+    --outdir ~/tmp/emotion_results/wavs/neutral-amused
+```
+* Please make sure the source and target emotions here match those of the previous command.
+
+## Citation
+If you find this useful in your research, please use the following BibTeX entry for citation.
+``` +@article{kreuk2021textless, + title={Textless speech emotion conversion using decomposed and discrete representations}, + author={Kreuk, Felix and Polyak, Adam and Copet, Jade and Kharitonov, Eugene and Nguyen, Tu-Anh and Rivi{\`e}re, Morgane and Hsu, Wei-Ning and Mohamed, Abdelrahman and Dupoux, Emmanuel and Adi, Yossi}, + journal={Conference on Empirical Methods in Natural Language Processing (EMNLP)}, + year={2022} +} +``` diff --git a/examples/emotion_conversion/emotion_models/__init__.py b/examples/emotion_conversion/emotion_models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/emotion_conversion/emotion_models/duration_predictor.py b/examples/emotion_conversion/emotion_models/duration_predictor.py new file mode 100644 index 0000000000..eb47df0a21 --- /dev/null +++ b/examples/emotion_conversion/emotion_models/duration_predictor.py @@ -0,0 +1,243 @@ +import logging +import os + +import hydra +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.layers.torch import Rearrange +from torch.utils.data import DataLoader, Dataset + +from .utils import Accuracy + +logger = logging.getLogger(__name__) + + +def save_ckpt(model, path, model_class): + ckpt = { + "state_dict": model.state_dict(), + "padding_token": model.padding_token, + "model_class": model_class, + } + torch.save(ckpt, path) + + +def load_ckpt(path): + ckpt = torch.load(path) + ckpt["model_class"]["_target_"] = "emotion_models.duration_predictor.CnnPredictor" + model = hydra.utils.instantiate(ckpt["model_class"]) + model.load_state_dict(ckpt["state_dict"]) + model.padding_token = ckpt["padding_token"] + model = model.cpu() + model.eval() + return model + + +class Collator: + def __init__(self, padding_idx): + self.padding_idx = padding_idx + + def __call__(self, batch): + x = [item[0] for item in batch] + lengths = [len(item) for item in x] + x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=self.padding_idx) + y = [item[1] for item in batch] + y = torch.nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=self.padding_idx) + mask = (x != self.padding_idx) + return x, y, mask, lengths + + +class Predictor(nn.Module): + def __init__(self, n_tokens, emb_dim): + super(Predictor, self).__init__() + self.n_tokens = n_tokens + self.emb_dim = emb_dim + self.padding_token = n_tokens + # add 1 extra embedding for padding token, set the padding index to be the last token + # (tokens from the clustering start at index 0) + self.emb = nn.Embedding(n_tokens + 1, emb_dim, padding_idx=self.padding_token) + + def inflate_input(self, batch): + """ get a sequence of tokens, predict their durations + and inflate them accordingly """ + batch_durs = self.forward(batch) + batch_durs = torch.exp(batch_durs) - 1 + batch_durs = batch_durs.round() + output = [] + for seq, durs in zip(batch, batch_durs): + inflated_seq = [] + for token, n in zip(seq, durs): + if token == self.padding_token: + break + n = int(n.item()) + token = int(token.item()) + inflated_seq.extend([token for _ in range(n)]) + output.append(inflated_seq) + output = torch.LongTensor(output) + return output + + +class CnnPredictor(Predictor): + def __init__(self, n_tokens, emb_dim, channels, kernel, output_dim, dropout, n_layers): + super(CnnPredictor, self).__init__(n_tokens=n_tokens, emb_dim=emb_dim) + layers = [ + Rearrange("b t c -> b c t"), + nn.Conv1d(emb_dim, channels, kernel_size=kernel, padding=(kernel - 1) // 2), + Rearrange("b c t -> b t c"), + nn.ReLU(), + 
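+            # channels are last after the Rearrange above, so the LayerNorm below
+            # normalizes each frame over the channel dimension.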
nn.LayerNorm(channels), + nn.Dropout(dropout), + ] + for _ in range(n_layers-1): + layers += [ + Rearrange("b t c -> b c t"), + nn.Conv1d(channels, channels, kernel_size=kernel, padding=(kernel - 1) // 2), + Rearrange("b c t -> b t c"), + nn.ReLU(), + nn.LayerNorm(channels), + nn.Dropout(dropout), + ] + self.conv_layer = nn.Sequential(*layers) + self.proj = nn.Linear(channels, output_dim) + + def forward(self, x): + x = self.emb(x) + x = self.conv_layer(x) + x = self.proj(x) + x = x.squeeze(-1) + return x + + +def l2_log_loss(input, target): + return F.mse_loss( + input=input.float(), + target=torch.log(target.float() + 1), + reduce=False + ) + + +class DurationDataset(Dataset): + def __init__(self, tsv_path, km_path, substring=""): + lines = open(tsv_path, "r").readlines() + self.root, self.tsv = lines[0], lines[1:] + self.km = open(km_path, "r").readlines() + logger.info(f"loaded {len(self.km)} files") + + if substring != "": + tsv, km = [], [] + for tsv_line, km_line in zip(self.tsv, self.km): + if substring.lower() in tsv_line.lower(): + tsv.append(tsv_line) + km.append(km_line) + self.tsv, self.km = tsv, km + logger.info(f"after filtering: {len(self.km)} files") + + def __len__(self): + return len(self.km) + + def __getitem__(self, i): + x = self.km[i] + x = x.split(" ") + x = list(map(int, x)) + + y = [] + xd = [] + count = 1 + for x1, x2 in zip(x[:-1], x[1:]): + if x1 == x2: + count += 1 + continue + else: + y.append(count) + xd.append(x1) + count = 1 + + xd = torch.LongTensor(xd) + y = torch.LongTensor(y) + return xd, y + + +def train(cfg): + device = "cuda:0" + model = hydra.utils.instantiate(cfg[cfg.model]).to(device) + optimizer = hydra.utils.instantiate(cfg.optimizer, model.parameters()) + # add 1 extra embedding for padding token, set the padding index to be the last token + # (tokens from the clustering start at index 0) + collate_fn = Collator(padding_idx=model.padding_token) + logger.info(f"data: {cfg.train_tsv}") + train_ds = DurationDataset(cfg.train_tsv, cfg.train_km, substring=cfg.substring) + valid_ds = DurationDataset(cfg.valid_tsv, cfg.valid_km, substring=cfg.substring) + train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn) + valid_dl = DataLoader(valid_ds, batch_size=32, shuffle=False, collate_fn=collate_fn) + + best_loss = float("inf") + for epoch in range(cfg.epochs): + train_loss, train_loss_scaled = train_epoch(model, train_dl, l2_log_loss, optimizer, device) + valid_loss, valid_loss_scaled, *acc = valid_epoch(model, valid_dl, l2_log_loss, device) + acc0, acc1, acc2, acc3 = acc + if valid_loss_scaled < best_loss: + path = f"{os.getcwd()}/{cfg.substring}.ckpt" + save_ckpt(model, path, cfg[cfg.model]) + best_loss = valid_loss_scaled + logger.info(f"saved checkpoint: {path}") + logger.info(f"[epoch {epoch}] train loss: {train_loss:.3f}, train scaled: {train_loss_scaled:.3f}") + logger.info(f"[epoch {epoch}] valid loss: {valid_loss:.3f}, valid scaled: {valid_loss_scaled:.3f}") + logger.info(f"acc: {acc0,acc1,acc2,acc3}") + + +def train_epoch(model, loader, criterion, optimizer, device): + model.train() + epoch_loss = 0 + epoch_loss_scaled = 0 + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + yhat = model(x) + loss = criterion(yhat, y) * mask + loss = torch.mean(loss) + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + epoch_loss += loss.item() + # get normal scale loss + yhat_scaled = torch.exp(yhat) - 1 + yhat_scaled = torch.round(yhat_scaled) + 
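+        # the model regresses log(duration + 1), so exp(.) - 1 followed by rounding recovers
+        # durations in frames; the scaled loss below averages the absolute duration error
+        # in frames (padded positions are zeroed out by the mask).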
scaled_loss = torch.mean(torch.abs(yhat_scaled - y) * mask) + epoch_loss_scaled += scaled_loss.item() + return epoch_loss / len(loader), epoch_loss_scaled / len(loader) + + +def valid_epoch(model, loader, criterion, device): + model.eval() + epoch_loss = 0 + epoch_loss_scaled = 0 + acc = Accuracy() + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + yhat = model(x) + loss = criterion(yhat, y) * mask + loss = torch.mean(loss) + epoch_loss += loss.item() + # get normal scale loss + yhat_scaled = torch.exp(yhat) - 1 + yhat_scaled = torch.round(yhat_scaled) + scaled_loss = torch.sum(torch.abs(yhat_scaled - y) * mask) / mask.sum() + acc.update(yhat_scaled[mask].view(-1).float(), y[mask].view(-1).float()) + epoch_loss_scaled += scaled_loss.item() + logger.info(f"example y: {y[0, :10].tolist()}") + logger.info(f"example yhat: {yhat_scaled[0, :10].tolist()}") + acc0 = acc.acc(tol=0) + acc1 = acc.acc(tol=1) + acc2 = acc.acc(tol=2) + acc3 = acc.acc(tol=3) + logger.info(f"accs: {acc0,acc1,acc2,acc3}") + return epoch_loss / len(loader), epoch_loss_scaled / len(loader), acc0, acc1, acc2, acc3 + + +@hydra.main(config_path=".", config_name="duration_predictor.yaml") +def main(cfg): + logger.info(f"{cfg}") + train(cfg) + + +if __name__ == "__main__": + main() diff --git a/examples/emotion_conversion/emotion_models/duration_predictor.yaml b/examples/emotion_conversion/emotion_models/duration_predictor.yaml new file mode 100644 index 0000000000..0e976f4843 --- /dev/null +++ b/examples/emotion_conversion/emotion_models/duration_predictor.yaml @@ -0,0 +1,48 @@ +train_tsv: "/denoising/emov/train.tsv" +train_km: "/denoising/emov/train.km" +valid_tsv: "/denoising/emov/valid.tsv" +valid_km: "/denoising/emov/valid.km" + +n_tokens: 200 +batch_size: 32 +lr: 0.0001 +epochs: 300 +model: "cnn" +substring: "" + +rnn: + _target_: emotion_models.duration_predictor.RnnPredictor + n_tokens: ${n_tokens} + emb_dim: 128 + rnn_hidden: 128 + output_dim: 1 + dropout: 0 + n_layers: 1 + +optimizer: + _target_: torch.optim.Adam + lr: ${lr} + betas: [0.9, 0.98] + eps: 0.000000001 + weight_decay: 0 + +cnn: + _target_: emotion_models.duration_predictor.CnnPredictor + n_tokens: ${n_tokens} + emb_dim: 128 + channels: 256 + kernel: 3 + output_dim: 1 + dropout: 0.5 + n_layers: 1 + +hydra: + run: + dir: /checkpoint/felixkreuk/experiments/duration_predictor/${hydra.job.override_dirname} + job: + config: + # configuration for the ${hydra.job.override_dirname} runtime variable + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: ['train_tsv', 'train_km', 'valid_tsv', 'valid_km'] diff --git a/examples/emotion_conversion/emotion_models/pitch_predictor.py b/examples/emotion_conversion/emotion_models/pitch_predictor.py new file mode 100644 index 0000000000..431446996c --- /dev/null +++ b/examples/emotion_conversion/emotion_models/pitch_predictor.py @@ -0,0 +1,559 @@ +import logging +import os +import random +import sys +from collections import defaultdict + +import hydra +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from einops.layers.torch import Rearrange +from scipy.io.wavfile import read +from scipy.ndimage import gaussian_filter1d +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm + +dir_path = os.path.dirname(__file__) +resynth_path = os.path.dirname(dir_path) + "/speech-resynthesis" +sys.path.append(resynth_path) +from dataset import parse_speaker, parse_style +from .utils import F0Stat + 
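+# NOTE: `parse_speaker` and `parse_style` are provided by the speech-resynthesis
+# repository (cloned into examples/emotion_conversion, see the README); the sys.path
+# manipulation above makes its `dataset` module importable here.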
+MAX_WAV_VALUE = 32768.0 +logger = logging.getLogger(__name__) + + +def quantize_f0(speaker_to_f0, nbins, normalize, log): + f0_all = [] + for speaker, f0 in speaker_to_f0.items(): + f0 = f0.raw_data + if log: + f0 = f0.log() + mean = speaker_to_f0[speaker].mean_log if log else speaker_to_f0[speaker].mean + std = speaker_to_f0[speaker].std_log if log else speaker_to_f0[speaker].std + if normalize == "mean": + f0 = f0 - mean + elif normalize == "meanstd": + f0 = (f0 - mean) / std + f0_all.extend(f0.tolist()) + + hist, bin_x = np.histogram(f0_all, 100000) + cum_hist = np.cumsum(hist) / len(f0_all) * 100 + + bin_offset = [] + bin_size = 100 / nbins + threshold = bin_size + for i in range(nbins - 1): + index = (np.abs(cum_hist - threshold)).argmin() + bin_offset.append(bin_x[index]) + threshold += bin_size + bins = np.array(bin_offset) + bins = torch.FloatTensor(bins) + + return bins + + +def save_ckpt(model, path, model_class, f0_min, f0_max, f0_bins, speaker_stats): + ckpt = { + "state_dict": model.state_dict(), + "padding_token": model.padding_token, + "model_class": model_class, + "speaker_stats": speaker_stats, + "f0_min": f0_min, + "f0_max": f0_max, + "f0_bins": f0_bins, + } + torch.save(ckpt, path) + + +def load_ckpt(path): + ckpt = torch.load(path) + ckpt["model_class"]["_target_"] = "emotion_models.pitch_predictor.CnnPredictor" + model = hydra.utils.instantiate(ckpt["model_class"]) + model.load_state_dict(ckpt["state_dict"]) + model.setup_f0_stats( + ckpt["f0_min"], + ckpt["f0_max"], + ckpt["f0_bins"], + ckpt["speaker_stats"], + ) + return model + + +def freq2bin(f0, f0_min, f0_max, bins): + f0 = f0.clone() + f0[f0 < f0_min] = f0_min + f0[f0 > f0_max] = f0_max + f0 = torch.bucketize(f0, bins) + return f0 + + +def bin2freq(x, f0_min, f0_max, bins, mode): + n_bins = len(bins) + 1 + assert x.shape[-1] == n_bins + bins = torch.cat([torch.tensor([f0_min]), bins]).to(x.device) + if mode == "mean": + f0 = (x * bins).sum(-1, keepdims=True) / x.sum(-1, keepdims=True) + elif mode == "argmax": + idx = F.one_hot(x.argmax(-1), num_classes=n_bins) + f0 = (idx * bins).sum(-1, keepdims=True) + else: + raise NotImplementedError() + return f0[..., 0] + + +def load_wav(full_path): + sampling_rate, data = read(full_path) + return data, sampling_rate + + +def l1_loss(input, target): + return F.l1_loss(input=input.float(), target=target.float(), reduce=False) + + +def l2_loss(input, target): + return F.mse_loss(input=input.float(), target=target.float(), reduce=False) + + +class Collator: + def __init__(self, padding_idx): + self.padding_idx = padding_idx + + def __call__(self, batch): + tokens = [item[0] for item in batch] + lengths = [len(item) for item in tokens] + tokens = torch.nn.utils.rnn.pad_sequence( + tokens, batch_first=True, padding_value=self.padding_idx + ) + f0 = [item[1] for item in batch] + f0 = torch.nn.utils.rnn.pad_sequence( + f0, batch_first=True, padding_value=self.padding_idx + ) + f0_raw = [item[2] for item in batch] + f0_raw = torch.nn.utils.rnn.pad_sequence( + f0_raw, batch_first=True, padding_value=self.padding_idx + ) + spk = [item[3] for item in batch] + spk = torch.LongTensor(spk) + gst = [item[4] for item in batch] + gst = torch.LongTensor(gst) + mask = tokens != self.padding_idx + return tokens, f0, f0_raw, spk, gst, mask, lengths + + +class CnnPredictor(nn.Module): + def __init__( + self, + n_tokens, + emb_dim, + channels, + kernel, + dropout, + n_layers, + spk_emb, + gst_emb, + n_bins, + f0_pred, + f0_log, + f0_norm, + ): + super(CnnPredictor, self).__init__() + 
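+        # the predictor embeds the discrete units, concatenates a global style (emotion)
+        # embedding broadcast over time, applies a stack of Conv1d blocks (with residual
+        # connections after the first), and projects every frame onto n_bins quantized F0
+        # bins (decoded by weighted mean or argmax, depending on `f0_pred`).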
self.n_tokens = n_tokens + self.emb_dim = emb_dim + self.f0_log = f0_log + self.f0_pred = f0_pred + self.padding_token = n_tokens + self.f0_norm = f0_norm + # add 1 extra embedding for padding token, set the padding index to be the last token + # (tokens from the clustering start at index 0) + self.token_emb = nn.Embedding( + n_tokens + 1, emb_dim, padding_idx=self.padding_token + ) + + self.spk_emb = spk_emb + self.gst_emb = nn.Embedding(20, gst_emb) + self.setup = False + + feats = emb_dim + gst_emb + # feats = emb_dim + gst_emb + (256 if spk_emb else 0) + layers = [ + nn.Sequential( + Rearrange("b t c -> b c t"), + nn.Conv1d( + feats, channels, kernel_size=kernel, padding=(kernel - 1) // 2 + ), + Rearrange("b c t -> b t c"), + nn.ReLU(), + nn.LayerNorm(channels), + nn.Dropout(dropout), + ) + ] + for _ in range(n_layers - 1): + layers += [ + nn.Sequential( + Rearrange("b t c -> b c t"), + nn.Conv1d( + channels, + channels, + kernel_size=kernel, + padding=(kernel - 1) // 2, + ), + Rearrange("b c t -> b t c"), + nn.ReLU(), + nn.LayerNorm(channels), + nn.Dropout(dropout), + ) + ] + self.conv_layer = nn.ModuleList(layers) + self.proj = nn.Linear(channels, n_bins) + + def forward(self, x, gst=None): + x = self.token_emb(x) + feats = [x] + + if gst is not None: + gst = self.gst_emb(gst) + gst = rearrange(gst, "b c -> b c 1") + gst = F.interpolate(gst, x.shape[1]) + gst = rearrange(gst, "b c t -> b t c") + feats.append(gst) + + x = torch.cat(feats, dim=-1) + + for i, conv in enumerate(self.conv_layer): + if i != 0: + x = conv(x) + x + else: + x = conv(x) + + x = self.proj(x) + x = x.squeeze(-1) + + if self.f0_pred == "mean": + x = torch.sigmoid(x) + elif self.f0_pred == "argmax": + x = torch.softmax(x, dim=-1) + else: + raise NotImplementedError + return x + + def setup_f0_stats(self, f0_min, f0_max, f0_bins, speaker_stats): + self.f0_min = f0_min + self.f0_max = f0_max + self.f0_bins = f0_bins + self.speaker_stats = speaker_stats + self.setup = True + + def inference(self, x, spk_id=None, gst=None): + assert ( + self.setup == True + ), "make sure that `setup_f0_stats` was called before inference!" 
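+        # predict per-bin scores, convert them back to Hz using the stored f0 bins, then
+        # undo the per-speaker normalization (and the log scale, if `f0_log` was used).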
+ probs = self(x, gst) + f0 = bin2freq(probs, self.f0_min, self.f0_max, self.f0_bins, self.f0_pred) + for i in range(f0.shape[0]): + mean = ( + self.speaker_stats[spk_id[i].item()].mean_log + if self.f0_log + else self.speaker_stats[spk_id[i].item()].mean + ) + std = ( + self.speaker_stats[spk_id[i].item()].std_log + if self.f0_log + else self.speaker_stats[spk_id[i].item()].std + ) + if self.f0_norm == "mean": + f0[i] = f0[i] + mean + if self.f0_norm == "meanstd": + f0[i] = (f0[i] * std) + mean + if self.f0_log: + f0 = f0.exp() + return f0 + + +class PitchDataset(Dataset): + def __init__( + self, + tsv_path, + km_path, + substring, + spk, + spk2id, + gst, + gst2id, + f0_bins, + f0_bin_type, + f0_smoothing, + f0_norm, + f0_log, + ): + lines = open(tsv_path, "r").readlines() + self.root, self.tsv = lines[0], lines[1:] + self.root = self.root.strip() + self.km = open(km_path, "r").readlines() + print(f"loaded {len(self.km)} files") + + self.spk = spk + self.spk2id = spk2id + self.gst = gst + self.gst2id = gst2id + + self.f0_bins = f0_bins + self.f0_smoothing = f0_smoothing + self.f0_norm = f0_norm + self.f0_log = f0_log + + if substring != "": + tsv, km = [], [] + for tsv_line, km_line in zip(self.tsv, self.km): + if substring.lower() in tsv_line.lower(): + tsv.append(tsv_line) + km.append(km_line) + self.tsv, self.km = tsv, km + print(f"after filtering: {len(self.km)} files") + + self.speaker_stats = self._compute_f0_stats() + self.f0_min, self.f0_max = self._compute_f0_minmax() + if f0_bin_type == "adaptive": + self.f0_bins = quantize_f0( + self.speaker_stats, self.f0_bins, self.f0_norm, self.f0_log + ) + elif f0_bin_type == "uniform": + self.f0_bins = torch.linspace(self.f0_min, self.f0_max, self.f0_bins + 1)[ + 1:-1 + ] + else: + raise NotImplementedError + print(f"f0 min: {self.f0_min}, f0 max: {self.f0_max}") + print(f"bins: {self.f0_bins} (shape: {self.f0_bins.shape})") + + def __len__(self): + return len(self.km) + + def _load_f0(self, tsv_line): + tsv_line = tsv_line.split("\t")[0] + f0 = self.root + "/" + tsv_line.replace(".wav", ".yaapt.f0.npy") + f0 = np.load(f0) + f0 = torch.FloatTensor(f0) + return f0 + + def _preprocess_f0(self, f0, spk): + mask = f0 != -999999 # process all frames + # mask = (f0 != 0) # only process voiced frames + mean = ( + self.speaker_stats[spk].mean_log + if self.f0_log + else self.speaker_stats[spk].mean + ) + std = ( + self.speaker_stats[spk].std_log + if self.f0_log + else self.speaker_stats[spk].std + ) + if self.f0_log: + f0[f0 == 0] = 1e-5 + f0[mask] = f0[mask].log() + if self.f0_norm == "mean": + f0[mask] = f0[mask] - mean + if self.f0_norm == "meanstd": + f0[mask] = (f0[mask] - mean) / std + return f0 + + def _compute_f0_minmax(self): + f0_min, f0_max = float("inf"), -float("inf") + for tsv_line in tqdm(self.tsv, desc="computing f0 minmax"): + spk = self.spk2id[parse_speaker(tsv_line, self.spk)] + f0 = self._load_f0(tsv_line) + f0 = self._preprocess_f0(f0, spk) + f0_min = min(f0_min, f0.min().item()) + f0_max = max(f0_max, f0.max().item()) + return f0_min, f0_max + + def _compute_f0_stats(self): + from functools import partial + + speaker_stats = defaultdict(partial(F0Stat, True)) + for tsv_line in tqdm(self.tsv, desc="computing speaker stats"): + spk = self.spk2id[parse_speaker(tsv_line, self.spk)] + f0 = self._load_f0(tsv_line) + mask = f0 != 0 + f0 = f0[mask] # compute stats only on voiced parts + speaker_stats[spk].update(f0) + return speaker_stats + + def __getitem__(self, i): + x = self.km[i] + x = x.split(" ") + x = list(map(int, x)) + x = 
torch.LongTensor(x) + + gst = parse_style(self.tsv[i], self.gst) + gst = self.gst2id[gst] + spk = parse_speaker(self.tsv[i], self.spk) + spk = self.spk2id[spk] + + f0_raw = self._load_f0(self.tsv[i]) + f0 = self._preprocess_f0(f0_raw.clone(), spk) + + f0 = F.interpolate(f0.unsqueeze(0).unsqueeze(0), x.shape[0])[0, 0] + f0_raw = F.interpolate(f0_raw.unsqueeze(0).unsqueeze(0), x.shape[0])[0, 0] + + f0 = freq2bin(f0, f0_min=self.f0_min, f0_max=self.f0_max, bins=self.f0_bins) + f0 = F.one_hot(f0.long(), num_classes=len(self.f0_bins) + 1).float() + if self.f0_smoothing > 0: + f0 = torch.tensor( + gaussian_filter1d(f0.float().numpy(), sigma=self.f0_smoothing) + ) + return x, f0, f0_raw, spk, gst + + +def train(cfg): + device = "cuda:0" + # add 1 extra embedding for padding token, set the padding index to be the last token + # (tokens from the clustering start at index 0) + padding_token = cfg.n_tokens + collate_fn = Collator(padding_idx=padding_token) + train_ds = PitchDataset( + cfg.train_tsv, + cfg.train_km, + substring=cfg.substring, + spk=cfg.spk, + spk2id=cfg.spk2id, + gst=cfg.gst, + gst2id=cfg.gst2id, + f0_bins=cfg.f0_bins, + f0_bin_type=cfg.f0_bin_type, + f0_smoothing=cfg.f0_smoothing, + f0_norm=cfg.f0_norm, + f0_log=cfg.f0_log, + ) + valid_ds = PitchDataset( + cfg.valid_tsv, + cfg.valid_km, + substring=cfg.substring, + spk=cfg.spk, + spk2id=cfg.spk2id, + gst=cfg.gst, + gst2id=cfg.gst2id, + f0_bins=cfg.f0_bins, + f0_bin_type=cfg.f0_bin_type, + f0_smoothing=cfg.f0_smoothing, + f0_norm=cfg.f0_norm, + f0_log=cfg.f0_log, + ) + train_dl = DataLoader( + train_ds, + num_workers=0, + batch_size=cfg.batch_size, + shuffle=True, + collate_fn=collate_fn, + ) + valid_dl = DataLoader( + valid_ds, num_workers=0, batch_size=16, shuffle=False, collate_fn=collate_fn + ) + + f0_min = train_ds.f0_min + f0_max = train_ds.f0_max + f0_bins = train_ds.f0_bins + speaker_stats = train_ds.speaker_stats + + model = hydra.utils.instantiate(cfg["model"]).to(device) + model.setup_f0_stats(f0_min, f0_max, f0_bins, speaker_stats) + + optimizer = hydra.utils.instantiate(cfg.optimizer, model.parameters()) + + best_loss = float("inf") + for epoch in range(cfg.epochs): + train_loss, train_l2_loss, train_l2_voiced_loss = run_epoch( + model, train_dl, optimizer, device, cfg, mode="train" + ) + valid_loss, valid_l2_loss, valid_l2_voiced_loss = run_epoch( + model, valid_dl, None, device, cfg, mode="valid" + ) + print( + f"[epoch {epoch}] train loss: {train_loss:.3f}, l2 loss: {train_l2_loss:.3f}, l2 voiced loss: {train_l2_voiced_loss:.3f}" + ) + print( + f"[epoch {epoch}] valid loss: {valid_loss:.3f}, l2 loss: {valid_l2_loss:.3f}, l2 voiced loss: {valid_l2_voiced_loss:.3f}" + ) + if valid_l2_voiced_loss < best_loss: + path = f"{os.getcwd()}/pitch_predictor.ckpt" + save_ckpt(model, path, cfg["model"], f0_min, f0_max, f0_bins, speaker_stats) + best_loss = valid_l2_voiced_loss + print(f"saved checkpoint: {path}") + print(f"[epoch {epoch}] best loss: {best_loss:.3f}") + + +def run_epoch(model, loader, optimizer, device, cfg, mode): + if mode == "train": + model.train() + else: + model.eval() + + epoch_loss = 0 + l1 = 0 + l1_voiced = 0 + for x, f0_bin, f0_raw, spk_id, gst, mask, _ in tqdm(loader): + x, f0_bin, f0_raw, spk_id, gst, mask = ( + x.to(device), + f0_bin.to(device), + f0_raw.to(device), + spk_id.to(device), + gst.to(device), + mask.to(device), + ) + b, t, n_bins = f0_bin.shape + yhat = model(x, gst) + nonzero_mask = (f0_raw != 0).logical_and(mask) + yhat_raw = model.inference(x, spk_id, gst) + expanded_mask = 
mask.unsqueeze(-1).expand(-1, -1, n_bins) + if cfg.f0_pred == "mean": + loss = F.binary_cross_entropy( + yhat[expanded_mask], f0_bin[expanded_mask] + ).mean() + elif cfg.f0_pred == "argmax": + loss = F.cross_entropy( + rearrange(yhat, "b t d -> (b t) d"), + rearrange(f0_bin.argmax(-1), "b t -> (b t)"), + reduce=False, + ) + loss = rearrange(loss, "(b t) -> b t", b=b, t=t) + loss = (loss * mask).sum() / mask.float().sum() + else: + raise NotImplementedError + l1 += F.l1_loss(yhat_raw[mask], f0_raw[mask]).item() + l1_voiced += F.l1_loss(yhat_raw[nonzero_mask], f0_raw[nonzero_mask]).item() + epoch_loss += loss.item() + + if mode == "train": + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + + print(f"{mode} example y: {f0_bin.argmax(-1)[0, 50:60].tolist()}") + print(f"{mode} example yhat: {yhat.argmax(-1)[0, 50:60].tolist()}") + print(f"{mode} example y: {f0_raw[0, 50:60].round().tolist()}") + print(f"{mode} example yhat: {yhat_raw[0, 50:60].round().tolist()}") + return epoch_loss / len(loader), l1 / len(loader), l1_voiced / len(loader) + + +@hydra.main(config_path=dir_path, config_name="pitch_predictor.yaml") +def main(cfg): + np.random.seed(1) + random.seed(1) + torch.manual_seed(1) + from hydra.core.hydra_config import HydraConfig + + overrides = { + x.split("=")[0]: x.split("=")[1] + for x in HydraConfig.get().overrides.task + if "/" not in x + } + print(f"{cfg}") + train(cfg) + + +if __name__ == "__main__": + main() diff --git a/examples/emotion_conversion/emotion_models/pitch_predictor.yaml b/examples/emotion_conversion/emotion_models/pitch_predictor.yaml new file mode 100644 index 0000000000..d2dbb862c3 --- /dev/null +++ b/examples/emotion_conversion/emotion_models/pitch_predictor.yaml @@ -0,0 +1,64 @@ +train_tsv: "/denoising/emov/train.tsv" +train_km: "/denoising/emov/train.km" +valid_tsv: "/denoising/emov/valid.tsv" +valid_km: "/denoising/emov/valid.km" + +n_tokens: 200 +batch_size: 64 +lr: 0.0001 +epochs: 1000 + +substring: "" +loss: "l2" +spk: "parent_parent_name" +gst: "emotion" + +f0_bins: 50 +f0_pred: "mean" # [argmax, mean] +f0_smoothing: 0.1 +f0_norm: "mean" +f0_log: false +f0_bin_type: "adaptive" # [uniform, adaptive] + +spk2id: + bea: 0 + jenie: 1 + josh: 2 + sam: 3 + +gst2id: + amused: 0 + angry: 1 + disgusted: 2 + neutral: 3 + sleepy: 4 + +optimizer: + _target_: torch.optim.Adam + lr: ${lr} + +model: + _target_: emotion_models.pitch_predictor.CnnPredictor + n_tokens: ${n_tokens} + emb_dim: 256 + channels: 256 + kernel: 5 + dropout: 0.1 + n_layers: 6 + spk_emb: true + gst_emb: 8 + n_bins: ${f0_bins} + f0_pred: ${f0_pred} + f0_log: ${f0_log} + f0_norm: ${f0_norm} + +hydra: + run: + dir: /checkpoint/felixkreuk/experiments/pitch_predictor/${hydra.job.override_dirname} + job: + config: + # configuration for the ${hydra.job.override_dirname} runtime variable + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: ['train_tsv', 'train_km', 'valid_tsv', 'valid_km'] diff --git a/examples/emotion_conversion/emotion_models/utils.py b/examples/emotion_conversion/emotion_models/utils.py new file mode 100644 index 0000000000..4199c310f8 --- /dev/null +++ b/examples/emotion_conversion/emotion_models/utils.py @@ -0,0 +1,78 @@ +import torch + + +class Stat: + def __init__(self, keep_raw=False): + self.x = 0.0 + self.x2 = 0.0 + self.z = 0.0 # z = logx + self.z2 = 0.0 + self.n = 0.0 + self.u = 0.0 + self.keep_raw = keep_raw + self.raw = [] + + def update(self, new_x): + new_z = new_x.log() + + self.x += new_x.sum() + self.x2 += 
(new_x**2).sum() + self.z += new_z.sum() + self.z2 += (new_z**2).sum() + self.n += len(new_x) + self.u += 1 + + if self.keep_raw: + self.raw.append(new_x) + + @property + def mean(self): + return self.x / self.n + + @property + def std(self): + return (self.x2 / self.n - self.mean**2) ** 0.5 + + @property + def mean_log(self): + return self.z / self.n + + @property + def std_log(self): + return (self.z2 / self.n - self.mean_log**2) ** 0.5 + + @property + def n_frms(self): + return self.n + + @property + def n_utts(self): + return self.u + + @property + def raw_data(self): + assert self.keep_raw, "does not support storing raw data!" + return torch.cat(self.raw) + + +class F0Stat(Stat): + def update(self, new_x): + # assume unvoiced frames are 0 and consider only voiced frames + if new_x is not None: + super().update(new_x[new_x != 0]) + + +class Accuracy: + def __init__(self): + self.y, self.yhat = [], [] + + def update(self, yhat, y): + self.yhat.append(yhat) + self.y.append(y) + + def acc(self, tol): + yhat = torch.cat(self.yhat) + y = torch.cat(self.y) + acc = torch.abs(yhat - y) <= tol + acc = acc.float().mean().item() + return acc diff --git a/examples/emotion_conversion/fairseq_models/__init__.py b/examples/emotion_conversion/fairseq_models/__init__.py new file mode 100644 index 0000000000..441bc03db4 --- /dev/null +++ b/examples/emotion_conversion/fairseq_models/__init__.py @@ -0,0 +1,226 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq import utils +from fairseq.models import ( + FairseqMultiModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + Embedding, + base_architecture, +) +from fairseq.models.multilingual_transformer import ( + MultilingualTransformerModel, + base_multilingual_architecture, +) +from fairseq.utils import safe_hasattr +from collections import OrderedDict + + +@register_model("multilingual_transformer_from_mbart") +class MultilingualTransformerModelFromMbart(MultilingualTransformerModel): + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + from fairseq.tasks.multilingual_translation import MultilingualTranslationTask + + assert isinstance(task, MultilingualTranslationTask) + + # make sure all arguments are present in older models + base_multilingual_architecture(args) + + if not safe_hasattr(args, "max_source_positions"): + args.max_source_positions = 1024 + if not safe_hasattr(args, "max_target_positions"): + args.max_target_positions = 1024 + + src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs] + tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs] + + if args.share_encoders: + args.share_encoder_embeddings = True + if args.share_decoders: + args.share_decoder_embeddings = True + + def build_embedding(dictionary, embed_dim, path=None): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + emb = Embedding(num_embeddings, embed_dim, padding_idx) + # if provided, load from preloaded dictionaries + if path: + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + return emb + + # build shared embeddings (if applicable) + shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None + if args.share_all_embeddings: + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + "--share-all-embeddings 
requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( + dicts=task.dicts, + langs=task.langs, + embed_dim=args.encoder_embed_dim, + build_embedding=build_embedding, + pretrained_embed_path=args.encoder_embed_path, + ) + shared_decoder_embed_tokens = shared_encoder_embed_tokens + args.share_decoder_input_output_embed = True + else: + if args.share_encoder_embeddings: + shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( + dicts=task.dicts, + langs=src_langs, + embed_dim=args.encoder_embed_dim, + build_embedding=build_embedding, + pretrained_embed_path=args.encoder_embed_path, + ) + if args.share_decoder_embeddings: + shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( + dicts=task.dicts, + langs=tgt_langs, + embed_dim=args.decoder_embed_dim, + build_embedding=build_embedding, + pretrained_embed_path=args.decoder_embed_path, + ) + + # encoders/decoders for each language + lang_encoders, lang_decoders = {}, {} + + def get_encoder(lang): + if lang not in lang_encoders: + if shared_encoder_embed_tokens is not None: + encoder_embed_tokens = shared_encoder_embed_tokens + else: + encoder_embed_tokens = build_embedding( + task.dicts[lang], + args.encoder_embed_dim, + args.encoder_embed_path, + ) + lang_encoders[lang] = MultilingualTransformerModel._get_module_class( + True, args, task.dicts[lang], encoder_embed_tokens, src_langs + ) + return lang_encoders[lang] + + def get_decoder(lang): + if lang not in lang_decoders: + if shared_decoder_embed_tokens is not None: + decoder_embed_tokens = shared_decoder_embed_tokens + else: + decoder_embed_tokens = build_embedding( + task.dicts[lang], + args.decoder_embed_dim, + args.decoder_embed_path, + ) + lang_decoders[lang] = MultilingualTransformerModel._get_module_class( + False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs + ) + return lang_decoders[lang] + + # shared encoders/decoders (if applicable) + shared_encoder, shared_decoder = None, None + if args.share_encoders: + shared_encoder = get_encoder(src_langs[0]) + if args.share_decoders: + shared_decoder = get_decoder(tgt_langs[0]) + + encoders, decoders = OrderedDict(), OrderedDict() + for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs): + encoders[lang_pair] = ( + shared_encoder if shared_encoder is not None else get_encoder(src) + ) + decoders[lang_pair] = ( + shared_decoder if shared_decoder is not None else get_decoder(tgt) + ) + + return MultilingualTransformerModelFromMbart(encoders, decoders) + + def load_state_dict(self, state_dict, strict=True, model_cfg=None): + state_dict_subset = state_dict.copy() + lang_pairs = set([x.split(".")[1] for x in state_dict.keys()]) + finetune_mode = not any("neutral" in lp for lp in lang_pairs) + + if finetune_mode: + # load a pre-trained mBART/BART model + # we need this code because mBART/BART are not of type FairseqMultiModel but FairseqModel + # so we hackishly load the weights by replicating them for all lang pairs + print("loading pre-trained BART") + self_state_dict = self.state_dict() + for k, v in state_dict.items(): + for lang_pair in self.models: + new_key = k if "models." 
in k else f"models.{lang_pair}.{k}" + # print(new_key) + if self_state_dict[new_key].shape == v.shape: + state_dict_subset[new_key] = v + elif any( + w in k + for w in [ + "encoder.embed_tokens.weight", + "decoder.embed_tokens.weight", + "decoder.output_projection.weight", + ] + ): + # why vocab_size - 5? because there are `vocab_size` tokens from the language + # and 5 additional tokens in the denoising task: eos,bos,pad,unk,mask. + # but in the translation task there are only `vocab_size` + 4 (no mask). + print( + f"{k}: {self_state_dict[new_key].shape} != {v.shape}", + end="", + flush=True, + ) + vocab_size = v.shape[0] - 5 + state_dict_subset[new_key] = self_state_dict[new_key] + state_dict_subset[new_key] = v[: vocab_size + 4] + print(f" => fixed by using first {vocab_size + 4} dims") + else: + raise ValueError("unable to load model due to mimatched dims!") + del state_dict_subset[k] + else: + print("loading pre-trained emotion translation model") + for k, _ in state_dict.items(): + assert k.startswith("models.") + lang_pair = k.split(".")[1] + if lang_pair not in self.models: + del state_dict_subset[k] + + super().load_state_dict(state_dict_subset, strict=strict, model_cfg=model_cfg) + + +@register_model_architecture("transformer", "transformer_small") +def transformer_small(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.encoder_layers = getattr(args, "encoder_layers", 3) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 512) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.decoder_layers = getattr(args, "decoder_layers", 3) + base_architecture(args) + + +@register_model_architecture( + "multilingual_transformer_from_mbart", "multilingual_small" +) +def multilingual_small(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.encoder_layers = getattr(args, "encoder_layers", 3) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 512) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.decoder_layers = getattr(args, "decoder_layers", 3) + base_multilingual_architecture(args) diff --git a/examples/emotion_conversion/preprocess/__init__.py b/examples/emotion_conversion/preprocess/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/emotion_conversion/preprocess/build_hifigan_manifest.py b/examples/emotion_conversion/preprocess/build_hifigan_manifest.py new file mode 100644 index 0000000000..29c0d79cee --- /dev/null +++ b/examples/emotion_conversion/preprocess/build_hifigan_manifest.py @@ -0,0 +1,38 @@ +import torchaudio +import argparse +import json + +def main(): + parser = argparse.ArgumentParser(description="example: python create_hifigan_manifest.py --tsv /checkpoint/felixkreuk/datasets/vctk/splits/vctk_16khz/train.tsv --km /checkpoint/felixkreuk/experiments/hubert/hubert_feats/vctk_16khz_km_100/train.km --km_type hubert_100km > ~/tmp/tmp_mani.txt") + parser.add_argument("--tsv", required=True, help="path to fairseq tsv file") + parser.add_argument("--km", 
required=True, help="path to a km file generated by HuBERT clustering") + parser.add_argument("--km_type", required=True, help="name of the codes in the output json (for example: 'cpc_100km')") + args = parser.parse_args() + + km_lines = open(args.km, "r").readlines() + tsv_lines = open(args.tsv, "r").readlines() + assert len(km_lines) == len(tsv_lines) - 1, "tsv and km files are not of the same length!" + + wav_root = tsv_lines[0].strip() + tsv_lines = tsv_lines[1:] + + for tsv_line, km_line in zip(tsv_lines, km_lines): + tsv_line, km_line = tsv_line.strip(), km_line.strip() + wav_basename, wav_num_frames = tsv_line.split("\t") + wav_path = wav_root + "/" + wav_basename + wav_info = torchaudio.info(wav_path) + assert int(wav_num_frames) == wav_info.num_frames, "tsv duration and actual duration don't match!" + wav_duration = wav_info.num_frames / wav_info.sample_rate + manifest_line = {"audio": wav_path, "duration": wav_duration, args.km_type: km_line} + print(json.dumps(manifest_line)) + +if __name__ == "__main__": + """ + usage: + python create_hifigan_manifest.py \ + --tsv /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/valid.tsv \ + --km /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/hubert_km_100/valid.km \ + --km_type hubert \ + > /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/hubert_km_100/hifigan_valid_manifest.txt + """ + main() diff --git a/examples/emotion_conversion/preprocess/build_translation_manifests.py b/examples/emotion_conversion/preprocess/build_translation_manifests.py new file mode 100644 index 0000000000..d38454a713 --- /dev/null +++ b/examples/emotion_conversion/preprocess/build_translation_manifests.py @@ -0,0 +1,258 @@ +from glob import glob +import argparse +from collections import defaultdict, Counter +from itertools import combinations, product, groupby +from pathlib import Path +import os +from sklearn.utils import shuffle +import numpy as np +import random +from shutil import copy +from subprocess import check_call + +np.random.seed(42) +random.seed(42) + + +def get_fname(s): + return s.split("\t")[0] + +def get_emotion(s): + return get_fname(s).split("_")[0].split("/")[1].lower() + +def get_utt_id(s): + return get_fname(s).split(".")[0].split("_")[-1] + +def dedup(seq): + """ >> remove_repetitions("1 2 2 3 100 2 2 1") + '1 2 3 100 2 1' """ + seq = seq.strip().split(" ") + result = seq[:1] + reps = [] + rep_counter = 1 + for k in seq[1:]: + if k != result[-1]: + result += [k] + reps += [rep_counter] + rep_counter = 1 + else: + rep_counter += 1 + reps += [rep_counter] + assert len(reps) == len(result) and sum(reps) == len(seq) + return " ".join(result) + "\n" #, reps + +def remove_under_k(seq, k): + """ remove tokens that repeat less then k times in a row + >> remove_under_k("a a a a b c c c", 1) ==> a a a a c c c """ + seq = seq.strip().split(" ") + result = [] + + freqs = [(k,len(list(g))) for k, g in groupby(seq)] + for c, f in freqs: + if f > k: + result += [c for _ in range(f)] + return " ".join(result) + "\n" #, reps + + +def call(cmd): + print(cmd) + check_call(cmd, shell=True) + + +def denoising_preprocess(path, lang, dict): + bin = 'fairseq-preprocess' + cmd = [ + bin, + f'--trainpref {path}/train.{lang} --validpref {path}/valid.{lang} --testpref {path}/test.{lang}', + f'--destdir {path}/tokenized/{lang}', + '--only-source', + '--task multilingual_denoising', + '--workers 40', + ] + if dict != "": + cmd += [f'--srcdict {dict}'] + cmd = " ".join(cmd) + call(cmd) + + +def translation_preprocess(path, src_lang, trg_lang, 
dict, only_train=False): + bin = 'fairseq-preprocess' + cmd = [ + bin, + f'--source-lang {src_lang} --target-lang {trg_lang}', + f'--trainpref {path}/train', + f'--destdir {path}/tokenized', + '--workers 40', + ] + if not only_train: + cmd += [f'--validpref {path}/valid --testpref {path}/test'] + if dict != "": + cmd += [ + f'--srcdict {dict}', + f'--tgtdict {dict}', + ] + cmd = " ".join(cmd) + call(cmd) + + +def load_tsv_km(tsv_path, km_path): + assert tsv_path.exists() and km_path.exists() + tsv_lines = open(tsv_path, "r").readlines() + root, tsv_lines = tsv_lines[0], tsv_lines[1:] + km_lines = open(km_path, "r").readlines() + assert len(tsv_lines) == len(km_lines), ".tsv and .km should be the same length!" + return root, tsv_lines, km_lines + + +def main(): + desc = """ + this script takes as input .tsv and .km files for EMOV dataset, and a pairs of emotions. + it generates parallel .tsv and .km files for these emotions. for exmaple: + ❯ python build_emov_translation_manifests.py \ + /checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/train.tsv \ + /checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/emov_16khz_km_100/train.km \ + ~/tmp/emov_pairs \ + --src-emotion amused --trg-emotion neutral \ + --dedup --shuffle --cross-speaker --dry-run + """ + parser = argparse.ArgumentParser(description=desc) + parser.add_argument("data", type=Path, help="path to a dir containing .tsv and .km files containing emov dataset") + parser.add_argument("output_path", type=Path, help="output directory with the manifests will be created") + parser.add_argument("-cs", "--cross-speaker", action='store_true', help="if set then translation will occur also between speakers, meaning the same sentence can be translated between different speakers (default: false)") + parser.add_argument("-dd", "--dedup", action='store_true', help="remove repeated tokens (example: 'aaabc=>abc')") + parser.add_argument("-sh", "--shuffle", action='store_true', help="shuffle the data") + parser.add_argument("-ae", "--autoencode", action='store_true', help="include training pairs from the same emotion (this includes examples of the same sentence uttered by different people and examples where the src and trg are the exact same seq)") + parser.add_argument("-dr", "--dry-run", action='store_true', help="don't write anything to disk") + parser.add_argument("-zs", "--zero-shot", action='store_true', help="if true, the denoising task will train on the same splits as the translation task (split by utterance id). 
if false, the denoising task will train on randomly sampled splits (not split by utterance id)") + parser.add_argument("--km-ext", default="km", help="") + parser.add_argument("--dict", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/fairseq.dict.txt", help="") + args = parser.parse_args() + SPEAKERS = ["bea", "jenie", "josh", "sam", "SAME"] + EMOTIONS = ['neutral', 'amused', 'angry', 'disgusted', 'sleepy'] + + suffix = "" + if args.cross_speaker: suffix += "_cross-speaker" + if args.dedup: suffix += "_dedup" + translation_suffix = "" + if args.autoencode: translation_suffix += "_autoencode" + denoising_suffix = "" + denoising_suffix += "_zeroshot" if args.zero_shot else "_nonzeroshot" + + translation_dir = Path(args.output_path) / ("emov_multilingual_translation" + suffix + translation_suffix) + os.makedirs(translation_dir, exist_ok=True) + denoising_dir = Path(args.output_path) / ("emov_multilingual_denoising" + suffix + denoising_suffix) + os.makedirs(denoising_dir, exist_ok=True) + + denoising_data = [p.name for p in (args.data / "denoising").glob("*") if "emov" not in p.name] + + for split in ["train", "valid", "test"]: + root, tsv_lines, km_lines = load_tsv_km( + tsv_path = args.data / "denoising" / "emov" / f"{split}.tsv", + km_path = args.data / "denoising" / "emov" / f"{split}.{args.km_ext}" + ) + + # generate data for the multilingual denoising task + for EMOTION in EMOTIONS: + print("---") + print(split) + print(f"denoising: {EMOTION}") + emotion_tsv, emotion_km = [], [] + for tsv_line, km_line in zip(tsv_lines, km_lines): + if EMOTION.lower() in tsv_line.lower(): + km_line = km_line if not args.dedup else dedup(km_line) + emotion_tsv.append(tsv_line) + emotion_km.append(km_line) + print(f"{len(emotion_km)} samples") + open(denoising_dir / f"files.{split}.{EMOTION}", "w").writelines([root] + emotion_tsv) + open(denoising_dir / f"{split}.{EMOTION}", "w").writelines(emotion_km) + + for data in denoising_data: + with open(args.data / "denoising" / data / f"{split}.{args.km_ext}", "r") as f1: + with open(denoising_dir / f"{split}.{data}", "w") as f2: + f2.writelines([l if not args.dedup else dedup(l) for l in f1.readlines()]) + + # start of translation preprocessing + root, tsv_lines, km_lines = load_tsv_km( + tsv_path = args.data / "translation" / f"{split}.tsv", + km_path = args.data / "translation" / f"{split}.{args.km_ext}" + ) + + # generate data for the multilingual translation task + for SRC_EMOTION in EMOTIONS: + TRG_EMOTIONS = EMOTIONS if args.autoencode else set(EMOTIONS) - set([SRC_EMOTION]) + for TRG_EMOTION in TRG_EMOTIONS: + # when translating back to the same emotion - we dont want these emotion + # pairs to be part of the validation/test sets (because its not really emotion conversino) + # if SRC_EMOTION == TRG_EMOTION and split in ["valid", "test"]: continue + print("---") + print(split) + print(f"src emotions: {SRC_EMOTION}\ntrg emotions: {TRG_EMOTION}") + + # create a dictionary with the following structure: + # output[SPEAKER][UTT_ID] = list with indexes of line from the tsv file + # that match the speaker and utterance id. for exmaple: + # output = {'sam': {'0493': [875, 1608, 1822], ...}, ...} + # meaning, for speaker 'sam', utterance id '0493', the indexes in tsv_lines + # are 875, 1608, 1822 + spkr2utts = defaultdict(lambda: defaultdict(list)) + for i, tsv_line in enumerate(tsv_lines): + speaker = tsv_line.split("/")[0] + if args.cross_speaker: speaker = "SAME" + assert speaker in SPEAKERS, "unknown speaker! 
make sure the .tsv contains EMOV data" + utt_id = get_utt_id(tsv_line) + spkr2utts[speaker][utt_id].append(i) + + # create a tsv and km files with all the combinations for translation + src_tsv, trg_tsv, src_km, trg_km = [], [], [], [] + for speaker, utt_ids in spkr2utts.items(): + for utt_id, indices in utt_ids.items(): + # generate all pairs + pairs = [(x,y) for x in indices for y in indices] + # self-translation + if SRC_EMOTION == TRG_EMOTION: + pairs = [(x,y) for (x,y) in pairs if x == y] + # filter according to src and trg emotions + pairs = [(x,y) for (x,y) in pairs + if get_emotion(tsv_lines[x]) == SRC_EMOTION and get_emotion(tsv_lines[y]) == TRG_EMOTION] + + for idx1, idx2 in pairs: + assert get_utt_id(tsv_lines[idx1]) == get_utt_id(tsv_lines[idx2]) + src_tsv.append(tsv_lines[idx1]) + trg_tsv.append(tsv_lines[idx2]) + km_line_idx1 = km_lines[idx1] + km_line_idx2 = km_lines[idx2] + km_line_idx1 = km_line_idx1 if not args.dedup else dedup(km_line_idx1) + km_line_idx2 = km_line_idx2 if not args.dedup else dedup(km_line_idx2) + src_km.append(km_line_idx1) + trg_km.append(km_line_idx2) + assert len(src_tsv) == len(trg_tsv) == len(src_km) == len(trg_km) + print(f"{len(src_tsv)} pairs") + + if len(src_tsv) == 0: + raise Exception("ERROR: generated 0 pairs!") + + if args.dry_run: continue + + # create files + os.makedirs(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}", exist_ok=True) + open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"files.{split}.{SRC_EMOTION}", "w").writelines([root] + src_tsv) + open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"files.{split}.{TRG_EMOTION}", "w").writelines([root] + trg_tsv) + open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"{split}.{SRC_EMOTION}", "w").writelines(src_km) + open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"{split}.{TRG_EMOTION}", "w").writelines(trg_km) + + + # fairseq-preprocess the denoising data + for EMOTION in EMOTIONS + denoising_data: + denoising_preprocess(denoising_dir, EMOTION, args.dict) + os.system(f"cp {args.dict} {denoising_dir}/tokenized/dict.txt") + + # fairseq-preprocess the translation data + os.makedirs(translation_dir / "tokenized", exist_ok=True) + for SRC_EMOTION in EMOTIONS: + TRG_EMOTIONS = EMOTIONS if args.autoencode else set(EMOTIONS) - set([SRC_EMOTION]) + for TRG_EMOTION in TRG_EMOTIONS: + translation_preprocess(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}", SRC_EMOTION, TRG_EMOTION, args.dict)#, only_train=SRC_EMOTION==TRG_EMOTION) + os.system(f"cp -rf {translation_dir}/**/tokenized/* {translation_dir}/tokenized") + +if __name__ == "__main__": + main() diff --git a/examples/emotion_conversion/preprocess/create_core_manifest.py b/examples/emotion_conversion/preprocess/create_core_manifest.py new file mode 100644 index 0000000000..b55740e00b --- /dev/null +++ b/examples/emotion_conversion/preprocess/create_core_manifest.py @@ -0,0 +1,91 @@ +from pathlib import Path +import os +import sys +import subprocess +import argparse +from datetime import datetime +import logging + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] %(message)s', + handlers=[logging.FileHandler('debug.log'), logging.StreamHandler()] +) +logger = logging.getLogger(__name__) + + +def verify_dict_size(km, dict): + logger.info(f"verifying: {km}") + dict_size = len(open(dict, "r").readlines()) + km_vocab = set(open(km, "r").read().replace("\n", " ").split(" ")) + if "" in km_vocab: km_vocab.remove("") + km_vocab_size = len(km_vocab) + return dict_size == km_vocab_size + + 
+def verify_files_exist(l): + for f in l: + if not f.exists(): + logging.error(f"{f} doesn't exist!") + return False + return True + + +def run_cmd(cmd, print_output=True): + try: + out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True, shell=True) + if print_output: + logger.info(f"command output:\n{out}") + return out + except subprocess.CalledProcessError as grepexc: + logger.info(f"error executing command!:\n{cmd}") + logger.info(grepexc.output) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--tsv", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/data.tsv", type=Path) + parser.add_argument("--emov-km", required=True, type=Path) + parser.add_argument("--km", nargs='+', required=True, type=Path) + parser.add_argument("--seed", type=int, default=1) + parser.add_argument("--dict", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/fairseq.dict.txt") + parser.add_argument("--manifests-dir", type=Path, default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz") + args = parser.parse_args() + + manifests_dir = args.manifests_dir + date = datetime.now().strftime('%d%m%y') + outdir = manifests_dir / f"{date}" + + # verify input and create folders + all_kms = args.km + [args.emov_km] + assert verify_files_exist(all_kms), "make sure the km dir contains: train-clean-all.km, blizzard2013.km, data.km" + for codes in all_kms: + assert verify_dict_size(codes, args.dict), "dict argument doesn't match the vocabulary of the km file!" + assert not outdir.exists(), "data dir already exists!" + outdir.mkdir(parents=True, exist_ok=True) + + logger.info("generating denoising split (emov)") + run_cmd(f"python preprocess/split_km_tsv.py {args.tsv} {args.emov_km} --destdir {outdir}/denoising/emov -sh --seed {args.seed}") + for codes in args.km: + codes_name = os.path.basename(codes) + run_cmd(f"python preprocess/split_km.py {codes} --destdir {outdir}/denoising/{codes_name} -sh --seed {args.seed}") + + logger.info("generating translation split") + run_cmd(f"python preprocess/split_emov_km_tsv_by_uttid.py {args.tsv} {args.emov_km} --destdir {outdir}/translation --seed {args.seed}") + + emov_code_name = os.path.basename(args.emov_km) + logger.info("generating hifigan split") + run_cmd( + f"mkdir -p {outdir}/hifigan &&" + f"python preprocess/build_hifigan_manifest.py --km_type hubert --tsv {outdir}/denoising/emov/train.tsv --km {outdir}/denoising/emov/train.km > {outdir}/hifigan/train.txt &&" + f"python preprocess/build_hifigan_manifest.py --km_type hubert --tsv {outdir}/denoising/emov/valid.tsv --km {outdir}/denoising/emov/valid.km > {outdir}/hifigan/valid.txt &&" + f"python preprocess/build_hifigan_manifest.py --km_type hubert --tsv {outdir}/denoising/emov/test.tsv --km {outdir}/denoising/emov/test.km > {outdir}/hifigan/test.txt" + ) + + logger.info("generating fairseq manifests") + run_cmd(f"python preprocess/build_translation_manifests.py {outdir} {outdir}/fairseq-data -dd -cs --dict {args.dict}") + + logger.info(f"finished processing data at:\n{outdir}") + + +if __name__ == "__main__": + main() diff --git a/examples/emotion_conversion/preprocess/extract_f0.py b/examples/emotion_conversion/preprocess/extract_f0.py new file mode 100644 index 0000000000..4204aa4db1 --- /dev/null +++ b/examples/emotion_conversion/preprocess/extract_f0.py @@ -0,0 +1,57 @@ +import argparse +from tqdm import tqdm +from multiprocessing import Manager, Pool + +from scipy.io.wavfile import read +from librosa.util import normalize 
+import numpy as np +import amfm_decompy.pYAAPT as pYAAPT +import amfm_decompy.basic_tools as basic + +MAX_WAV_VALUE = 32768.0 + +parser = argparse.ArgumentParser(description="") +parser.add_argument("tsv", help="") +parser.add_argument("--extractor", choices=["crepe", "pyaapt"], default="pyaapt", help="") +parser.add_argument("--interp", action="store_true", help="") +parser.add_argument("--n_workers", type=int, default=40, help="") +args = parser.parse_args() + +tsv_lines = open(args.tsv, "r").readlines() +root, tsv_lines = tsv_lines[0].strip(), tsv_lines[1:] + + +def extract_f0(tsv_line): + wav_path, _ = tsv_line.split("\t") + wav_path = root.strip() + "/" + wav_path + sr, wav = read(wav_path) + wav = wav / MAX_WAV_VALUE + wav = normalize(wav) * 0.95 + + if args.extractor == "pyaapt": + frame_length = 20.0 + pad = int(frame_length / 1000 * sr) // 2 + wav = np.pad(wav.squeeze(), (pad, pad), "constant", constant_values=0) + signal = basic.SignalObj(wav, sr) + pitch = pYAAPT.yaapt( + signal, + **{ + 'frame_length': frame_length, + 'frame_space': 5.0, + 'nccf_thresh1': 0.25, + 'tda_frame_length': 25.0 + }) + pitch = pitch.samp_interp[None, None, :] if args.interp else pitch.samp_values[None, None, :] + pitch = pitch[0, 0] + f0_path = wav_path.replace(".wav", ".yaapt") + f0_path += ".interp.f0" if args.interp else ".f0" + np.save(f0_path, pitch) + + +def main(): + with Pool(args.n_workers) as p: + r = list(tqdm(p.imap(extract_f0, tsv_lines), total=len(tsv_lines))) + + +if __name__ == "__main__": + main() diff --git a/examples/emotion_conversion/preprocess/process_km.py b/examples/emotion_conversion/preprocess/process_km.py new file mode 100644 index 0000000000..864a022105 --- /dev/null +++ b/examples/emotion_conversion/preprocess/process_km.py @@ -0,0 +1,40 @@ +import sys +import argparse +from tqdm import tqdm +from build_emov_translation_manifests import dedup, remove_under_k + + +if __name__ == "__main__": + """ + this is a standalone script to process a km file + specifically, to dedup or remove tokens that repeat less + than k times in a row + """ + parser = argparse.ArgumentParser(description="") + parser.add_argument("km", type=str, help="path to km file") + parser.add_argument("--dedup", action='store_true') + parser.add_argument("--remove-under-k", type=int, default=0) + parser.add_argument("--output", default=None) + args = parser.parse_args() + + if not args.dedup and args.remove_under_k == 0: + print("nothing to do! 
quitting...") + sys.exit(0) + + km = open(args.km, "r").readlines() + out = [] + for line in tqdm(km): + if args.remove_under_k > 0: + line = remove_under_k(line, args.remove_under_k) + if args.dedup: + line = dedup(line) + out.append(line) + + path = args.km if args.output is None else args.output + if args.remove_under_k > 0: + path = path.replace(".km", f"-k{args.remove_under_k}.km") + if args.dedup: + path = path.replace(".km", f"-deduped.km") + + open(path, "w").writelines(out) + print(f"written to {path}") diff --git a/examples/emotion_conversion/preprocess/split_emov_km_tsv_by_uttid.py b/examples/emotion_conversion/preprocess/split_emov_km_tsv_by_uttid.py new file mode 100644 index 0000000000..94221afba7 --- /dev/null +++ b/examples/emotion_conversion/preprocess/split_emov_km_tsv_by_uttid.py @@ -0,0 +1,70 @@ +from pathlib import Path +import os +import sys +import argparse +import random +import numpy as np +from tqdm import tqdm +from sklearn.model_selection import train_test_split +from build_translation_manifests import get_utt_id + + +def train_val_test_split(tsv_lines, km_lines, valid_percent, test_percent, seed=42): + utt_ids = list(sorted(set([get_utt_id(x) for x in tsv_lines]))) + utt_ids, valid_utt_ids, _, _ = train_test_split(utt_ids, utt_ids, test_size=valid_percent, shuffle=True, random_state=seed) + train_utt_ids, test_utt_ids, _, _ = train_test_split(utt_ids, utt_ids, test_size=test_percent, shuffle=True, random_state=seed) + + train_idx = [i for i, line in enumerate(tsv_lines) if get_utt_id(line) in train_utt_ids] + valid_idx = [i for i, line in enumerate(tsv_lines) if get_utt_id(line) in valid_utt_ids] + test_idx = [i for i, line in enumerate(tsv_lines) if get_utt_id(line) in test_utt_ids] + + train_tsv, train_km = [tsv_lines[i] for i in train_idx], [km_lines[i] for i in train_idx] + valid_tsv, valid_km = [tsv_lines[i] for i in valid_idx], [km_lines[i] for i in valid_idx] + test_tsv, test_km = [tsv_lines[i] for i in test_idx], [km_lines[i] for i in test_idx] + + print(f"train {len(train_km)}") + print(f"valid {len(valid_km)}") + print(f"test {len(test_km)}") + + return train_tsv, train_km, valid_tsv, valid_km, test_tsv, test_km + + +if __name__ == "__main__": + """ + this is a standalone script to process a km file + specifically, to dedup or remove tokens that repeat less + than k times in a row + """ + parser = argparse.ArgumentParser(description="") + parser.add_argument("tsv", type=str, help="path to tsv file") + parser.add_argument("km", type=str, help="path to km file") + parser.add_argument("--destdir", required=True, type=str) + parser.add_argument("--valid-percent", type=float, default=0.05, help="percent to allocate to validation set") + parser.add_argument("--test-percent", type=float, default=0.05, help="percent to allocate to test set") + parser.add_argument("--seed", type=int, default=42, help="") + args = parser.parse_args() + + np.random.seed(args.seed) + random.seed(args.seed) + + os.makedirs(args.destdir, exist_ok=True) + km = open(args.km, "r").readlines() + tsv = open(args.tsv, "r").readlines() + root, tsv = tsv[0], tsv[1:] + + assert args.tsv.endswith(".tsv") and args.km.endswith(".km") + assert len(tsv) == len(km) + + train_tsv, train_km, valid_tsv, valid_km, test_tsv, test_km = train_val_test_split(tsv, km, args.valid_percent, args.test_percent, args.seed) + + assert len(train_tsv) + len(valid_tsv) + len(test_tsv) == len(tsv) + assert len(train_tsv) == len(train_km) and len(valid_tsv) == len(valid_km) and len(test_tsv) == len(test_km) + + dir = 
Path(args.destdir) + open(dir / f"train.tsv", "w").writelines([root] + train_tsv) + open(dir / f"valid.tsv", "w").writelines([root] + valid_tsv) + open(dir / f"test.tsv", "w").writelines([root] + test_tsv) + open(dir / f"train.km", "w").writelines(train_km) + open(dir / f"valid.km", "w").writelines(valid_km) + open(dir / f"test.km", "w").writelines(test_km) + print("done") diff --git a/examples/emotion_conversion/preprocess/split_km.py b/examples/emotion_conversion/preprocess/split_km.py new file mode 100644 index 0000000000..d145fc2bde --- /dev/null +++ b/examples/emotion_conversion/preprocess/split_km.py @@ -0,0 +1,50 @@ +from pathlib import Path +import os +import argparse +import random +import numpy as np +from sklearn.utils import shuffle + + +if __name__ == "__main__": + """ + this is a standalone script to process a km file + specifically, to dedup or remove tokens that repeat less + than k times in a row + """ + parser = argparse.ArgumentParser(description="") + parser.add_argument("km", type=str, help="path to km file") + parser.add_argument("--destdir", required=True, type=str) + parser.add_argument("--valid-percent", type=float, default=0.05, help="percent to allocate to validation set") + parser.add_argument("--test-percent", type=float, default=0.05, help="percent to allocate to test set") + parser.add_argument("-sh", "--shuffle", action="store_true", help="path to km file") + parser.add_argument("--seed", type=int, default=42, help="") + args = parser.parse_args() + + np.random.seed(args.seed) + random.seed(args.seed) + + os.makedirs(args.destdir, exist_ok=True) + km = open(args.km, "r").readlines() + + if args.shuffle: + km = shuffle(km) + print(f"shuffled") + + N = len(km) + N_tt = int(N * args.test_percent) + N_cv = int(N * args.valid_percent) + N_tr = N - N_tt - N_cv + + train_km = km[:N_tr] + valid_km = km[N_tr:N_tr + N_cv] + test_km = km[N_tr + N_cv:] + + dir = Path(args.destdir) + open(dir / f"train.km", "w").writelines(train_km) + open(dir / f"valid.km", "w").writelines(valid_km) + open(dir / f"test.km", "w").writelines(test_km) + print(f"train: {len(train_km)}") + print(f"valid: {len(valid_km)}") + print(f"test: {len(test_km)}") + print("done") diff --git a/examples/emotion_conversion/preprocess/split_km_tsv.py b/examples/emotion_conversion/preprocess/split_km_tsv.py new file mode 100644 index 0000000000..2113aa718d --- /dev/null +++ b/examples/emotion_conversion/preprocess/split_km_tsv.py @@ -0,0 +1,65 @@ +from pathlib import Path +import os +import argparse +import random +import numpy as np +from sklearn.utils import shuffle + + +if __name__ == "__main__": + """ + this is a standalone script to process a km file + specifically, to dedup or remove tokens that repeat less + than k times in a row + """ + parser = argparse.ArgumentParser(description="") + parser.add_argument("tsv", type=str, help="path to tsv file") + parser.add_argument("km", type=str, help="path to km file") + parser.add_argument("--destdir", required=True, type=str) + parser.add_argument("--valid-percent", type=float, default=0.05, help="percent to allocate to validation set") + parser.add_argument("--test-percent", type=float, default=0.05, help="percent to allocate to test set") + parser.add_argument("-sh", "--shuffle", action="store_true", help="path to km file") + parser.add_argument("--seed", type=int, default=42, help="") + args = parser.parse_args() + + np.random.seed(args.seed) + random.seed(args.seed) + + os.makedirs(args.destdir, exist_ok=True) + km = open(args.km, "r").readlines() + tsv 
= open(args.tsv, "r").readlines() + root, tsv = tsv[0], tsv[1:] + + assert args.tsv.endswith(".tsv") and args.km.endswith(".km") + assert len(tsv) == len(km) + + if args.shuffle: + tsv, km = shuffle(tsv, km) + print(f"shuffled") + + N = len(tsv) + N_tt = int(N * args.test_percent) + N_cv = int(N * args.valid_percent) + N_tr = N - N_tt - N_cv + + train_tsv = tsv[:N_tr] + valid_tsv = tsv[N_tr:N_tr + N_cv] + test_tsv = tsv[N_tr + N_cv:] + train_km = km[:N_tr] + valid_km = km[N_tr:N_tr + N_cv] + test_km = km[N_tr + N_cv:] + + assert len(train_tsv) + len(valid_tsv) + len(test_tsv) == len(tsv) + assert len(train_tsv) == len(train_km) and len(valid_tsv) == len(valid_km) and len(test_tsv) == len(test_km) + + dir = Path(args.destdir) + open(dir / f"train.tsv", "w").writelines([root] + train_tsv) + open(dir / f"valid.tsv", "w").writelines([root] + valid_tsv) + open(dir / f"test.tsv", "w").writelines([root] + test_tsv) + open(dir / f"train.km", "w").writelines(train_km) + open(dir / f"valid.km", "w").writelines(valid_km) + open(dir / f"test.km", "w").writelines(test_km) + print(f"train: {len(train_km)}") + print(f"valid: {len(valid_km)}") + print(f"test: {len(test_km)}") + print("done") diff --git a/examples/emotion_conversion/requirements.txt b/examples/emotion_conversion/requirements.txt new file mode 100644 index 0000000000..fc94c5a547 --- /dev/null +++ b/examples/emotion_conversion/requirements.txt @@ -0,0 +1,11 @@ +scipy +einops +amfm_decompy +joblib +numba +decorator +requests +appdirs +packaging +six +sklearn diff --git a/examples/emotion_conversion/synthesize.py b/examples/emotion_conversion/synthesize.py new file mode 100644 index 0000000000..327fdaf4ea --- /dev/null +++ b/examples/emotion_conversion/synthesize.py @@ -0,0 +1,322 @@ +import logging +import argparse +import random +import sys +import os +import numpy as np +import torch +import soundfile as sf +import shutil +import librosa +import json +from pathlib import Path +from tqdm import tqdm +import amfm_decompy.basic_tools as basic +import amfm_decompy.pYAAPT as pYAAPT + +dir_path = os.path.dirname(__file__) +resynth_path = os.path.dirname(os.path.abspath(__file__)) + "/speech-resynthesis" +sys.path.append(resynth_path) + +from models import CodeGenerator +from inference import scan_checkpoint, load_checkpoint, generate +from emotion_models.pitch_predictor import load_ckpt as load_pitch_predictor +from emotion_models.duration_predictor import load_ckpt as load_duration_predictor +from dataset import load_audio, MAX_WAV_VALUE, parse_style, parse_speaker, EMOV_SPK2ID, EMOV_STYLE2ID + + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] %(message)s', + handlers=[logging.FileHandler('debug.log'), logging.StreamHandler()] +) +logger = logging.getLogger(__name__) + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def parse_generation_file(fname): + lines = open(fname).read() + lines = lines.split('\n') + + results = {} + for l in lines: + if len(l) == 0: + continue + + if l[0] == 'H': + parts = l[2:].split('\t') + if len(parts) == 2: + sid, utt = parts + else: + sid, _, utt = parts + sid = int(sid) + utt = [int(x) for x in utt.split()] + if sid in results: + results[sid]['H'] = utt + else: + results[sid] = {'H': utt} + elif l[0] == 'S': + sid, utt = l[2:].split('\t') + sid = int(sid) + utt = [x for x in utt.split()] + if sid in results: + results[sid]['S'] = utt + else: + results[sid] = {'S': utt} + elif l[0] == 'T': 
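+            # 'T' lines in the fairseq-generate output hold the reference (target) unit sequence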
+ sid, utt = l[2:].split('\t') + sid = int(sid) + utt = [int(x) for x in utt.split()] + if sid in results: + results[sid]['T'] = utt + else: + results[sid] = {'T': utt} + + for d, result in results.items(): + if 'H' not in result: + result['H'] = result['S'] + + return results + + +def get_code_to_fname(manifest, tokens): + if tokens is None: + code_to_fname = {} + with open(manifest) as f: + for line in f: + line = line.strip() + fname, code = line.split() + code = code.replace(',', ' ') + code_to_fname[code] = fname + + return code_to_fname + + with open(manifest) as f: + fnames = [l.strip() for l in f.readlines()] + root = Path(fnames[0]) + fnames = fnames[1:] + if '\t' in fnames[0]: + fnames = [x.split()[0] for x in fnames] + + with open(tokens) as f: + codes = [l.strip() for l in f.readlines()] + + code_to_fname = {} + for fname, code in zip(fnames, codes): + code = code.replace(',', ' ') + code_to_fname[code] = str(root / fname) + + return root, code_to_fname + + +def code_to_str(s): + k = ' '.join([str(x) for x in s]) + return k + + +def get_praat_f0(audio, rate=16000, interp=False): + frame_length = 20.0 + to_pad = int(frame_length / 1000 * rate) // 2 + + f0s = [] + for y in audio.astype(np.float64): + y_pad = np.pad(y.squeeze(), (to_pad, to_pad), "constant", constant_values=0) + signal = basic.SignalObj(y_pad, rate) + pitch = pYAAPT.yaapt(signal, **{'frame_length': frame_length, 'frame_space': 5.0, 'nccf_thresh1': 0.25, + 'tda_frame_length': 25.0}) + if interp: + f0s += [pitch.samp_interp[None, None, :]] + else: + f0s += [pitch.samp_values[None, None, :]] + + f0 = np.vstack(f0s) + return f0 + + +def generate_from_code(generator, h, code, spkr=None, f0=None, gst=None, device="cpu"): + batch = { + 'code': torch.LongTensor(code).to(device).view(1, -1), + } + if spkr is not None: + batch['spkr'] = spkr.to(device).unsqueeze(0) + if f0 is not None: + batch['f0'] = f0.to(device) + if gst is not None: + batch['style'] = gst.to(device) + + with torch.no_grad(): + audio, rtf = generate(h, generator, batch) + audio = librosa.util.normalize(audio / 2 ** 15) + + return audio + + +@torch.no_grad() +def synth(argv, interactive=False): + parser = argparse.ArgumentParser() + parser.add_argument('--result-path', type=Path, help='Translation Model Output', required=True) + parser.add_argument('--data', type=Path, help='a directory with the files: src.tsv, src.km, trg.tsv, trg.km, orig.tsv, orig.km') + parser.add_argument("--orig-tsv", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/data.tsv") + parser.add_argument("--orig-km", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/core_manifests/emov_16khz_km_100/data.km") + + parser.add_argument('--checkpoint-file', type=Path, help='Generator Checkpoint', required=True) + parser.add_argument('--dur-model', type=Path, help='a token duration prediction model (if tokens were deduped)') + parser.add_argument('--f0-model', type=Path, help='a f0 prediction model') + + parser.add_argument('-s', '--src-emotion', default=None) + parser.add_argument('-t', '--trg-emotion', default=None) + parser.add_argument('-N', type=int, default=10) + parser.add_argument('--split', default="test") + + parser.add_argument('--outdir', type=Path, default=Path('results')) + parser.add_argument('--orig-filename', action='store_true') + + parser.add_argument('--device', type=int, default=0) + a = parser.parse_args(argv) + + seed = 52 + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + if os.path.isdir(a.checkpoint_file): + 
config_file = os.path.join(a.checkpoint_file, 'config.json') + else: + config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json') + with open(config_file) as f: + data = f.read() + json_config = json.loads(data) + h = AttrDict(json_config) + + generator = CodeGenerator(h).to(a.device) + if os.path.isdir(a.checkpoint_file): + cp_g = scan_checkpoint(a.checkpoint_file, 'g_') + else: + cp_g = a.checkpoint_file + state_dict_g = load_checkpoint(cp_g) + generator.load_state_dict(state_dict_g['generator']) + + generator.eval() + generator.remove_weight_norm() + + dur_models = { + "neutral": load_duration_predictor(f"{a.dur_model}/neutral.ckpt"), + "amused": load_duration_predictor(f"{a.dur_model}/amused.ckpt"), + "disgusted": load_duration_predictor(f"{a.dur_model}/disgusted.ckpt"), + "angry": load_duration_predictor(f"{a.dur_model}/angry.ckpt"), + "sleepy": load_duration_predictor(f"{a.dur_model}/sleepy.ckpt"), + } + logger.info(f"loaded duration prediction model from {a.dur_model}") + + f0_model = load_pitch_predictor(a.f0_model).to(a.device) + logger.info(f"loaded f0 prediction model from {a.f0_model}") + + # we need to know how to map code back to the filename + # (if we want the original files names as output) + results = parse_generation_file(a.result_path) + _, src_code_to_fname = get_code_to_fname(f'{a.data}/files.{a.split}.{a.src_emotion}', f'{a.data}/{a.split}.{a.src_emotion}') + _, tgt_code_to_fname = get_code_to_fname(f'{a.data}/files.{a.split}.{a.trg_emotion}', f'{a.data}/{a.split}.{a.trg_emotion}') + + # we need the originals (before dedup) to get the ground-truth durations + orig_tsv = open(a.orig_tsv, 'r').readlines() + orig_tsv_root, orig_tsv = orig_tsv[0].strip(), orig_tsv[1:] + orig_km = open(a.orig_km, 'r').readlines() + fname_to_idx = {orig_tsv_root + "/" + line.split("\t")[0]: i for i, line in enumerate(orig_tsv)} + + outdir = a.outdir + outdir.mkdir(parents=True, exist_ok=True) + (outdir / '0-source').mkdir(exist_ok=True) + (outdir / '1-src-tokens-src-style-src-f0').mkdir(exist_ok=True) + (outdir / '2-src-tokens-trg-style-src-f0').mkdir(exist_ok=True) + (outdir / '2.5-src-tokens-trg-style-src-f0').mkdir(exist_ok=True) + (outdir / '3-src-tokens-trg-style-pred-f0').mkdir(exist_ok=True) + (outdir / '4-gen-tokens-trg-style-pred-f0').mkdir(exist_ok=True) + (outdir / '5-target').mkdir(exist_ok=True) + + N = 0 + results = list(results.items()) + random.shuffle(results) + for i, (sid, result) in tqdm(enumerate(results)): + N += 1 + if N > a.N and a.N != -1: + break + + if '[' in result['S'][0]: + result['S'] = result['S'][1:] + if '_' in result['S'][-1]: + result['S'] = result['S'][:-1] + src_ref = src_code_to_fname[code_to_str(result['S'])] + trg_ref = tgt_code_to_fname[code_to_str(result['T'])] + + src_style, trg_style = None, None + src_spkr, trg_spkr = None, None + src_f0 = None + src_audio = (load_audio(src_ref)[0] / MAX_WAV_VALUE) * 0.95 + trg_audio = (load_audio(trg_ref)[0] / MAX_WAV_VALUE) * 0.95 + src_audio = torch.FloatTensor(src_audio).unsqueeze(0).cuda() + trg_audio = torch.FloatTensor(trg_audio).unsqueeze(0).cuda() + + src_spkr = parse_speaker(src_ref, h.multispkr) + src_spkr = src_spkr if src_spkr in EMOV_SPK2ID else random.choice(list(EMOV_SPK2ID.keys())) + src_spkr = EMOV_SPK2ID[src_spkr] + src_spkr = torch.LongTensor([src_spkr]) + trg_spkr = parse_speaker(trg_ref, h.multispkr) + trg_spkr = trg_spkr if trg_spkr in EMOV_SPK2ID else random.choice(list(EMOV_SPK2ID.keys())) + trg_spkr = EMOV_SPK2ID[trg_spkr] + trg_spkr = torch.LongTensor([trg_spkr]) + 
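+        # map the source and target emotions to style ids (used as `gst` conditioning when generating audio)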
+ src_style = EMOV_STYLE2ID[a.src_emotion] + src_style = torch.LongTensor([src_style]).cuda() + trg_style_str = a.trg_emotion + trg_style = EMOV_STYLE2ID[a.trg_emotion] + trg_style = torch.LongTensor([trg_style]).cuda() + + src_tokens = list(map(int, orig_km[fname_to_idx[src_ref]].strip().split(" "))) + src_tokens = torch.LongTensor(src_tokens).unsqueeze(0) + src_tokens_dur_pred = torch.LongTensor(list(map(int, result['S']))).unsqueeze(0) + src_tokens_dur_pred = dur_models[trg_style_str].inflate_input(src_tokens_dur_pred) + gen_tokens = torch.LongTensor(result['H']).unsqueeze(0) + gen_tokens = dur_models[trg_style_str].inflate_input(gen_tokens) + trg_tokens = torch.LongTensor(result['T']).unsqueeze(0) + trg_tokens = dur_models[trg_style_str].inflate_input(trg_tokens) + + src_f0 = get_praat_f0(src_audio.unsqueeze(0).cpu().numpy()) + src_f0 = torch.FloatTensor(src_f0).cuda() + + pred_src_f0 = f0_model.inference(torch.LongTensor(src_tokens).to(a.device), src_spkr, trg_style).unsqueeze(0) + pred_src_dur_pred_f0 = f0_model.inference(torch.LongTensor(src_tokens_dur_pred).to(a.device), src_spkr, trg_style).unsqueeze(0) + pred_gen_f0 = f0_model.inference(torch.LongTensor(gen_tokens).to(a.device), src_spkr, trg_style).unsqueeze(0) + pred_trg_f0 = f0_model.inference(torch.LongTensor(trg_tokens).to(a.device), src_spkr, trg_style).unsqueeze(0) + + if a.orig_filename: + path = src_code_to_fname[code_to_str(result['S'])] + sid = str(sid) + "__" + Path(path).stem + shutil.copy(src_code_to_fname[code_to_str(result['S'])], outdir / '0-source' / f'{sid}.wav') + + audio = generate_from_code(generator, h, src_tokens, spkr=src_spkr, f0=src_f0, gst=src_style, device=a.device) + sf.write(outdir / '1-src-tokens-src-style-src-f0' / f'{sid}.wav', audio, samplerate=h.sampling_rate) + + audio = generate_from_code(generator, h, src_tokens, spkr=src_spkr, f0=src_f0, gst=trg_style, device=a.device) + sf.write(outdir / '2-src-tokens-trg-style-src-f0' / f'{sid}.wav', audio, samplerate=h.sampling_rate) + + audio = generate_from_code(generator, h, src_tokens_dur_pred, spkr=src_spkr, f0=src_f0, gst=trg_style, device=a.device) + sf.write(outdir / '2.5-src-tokens-trg-style-src-f0' / f'{sid}.wav', audio, samplerate=h.sampling_rate) + + audio = generate_from_code(generator, h, src_tokens_dur_pred, spkr=src_spkr, f0=pred_src_dur_pred_f0, gst=trg_style, device=a.device) + sf.write(outdir / '3-src-tokens-trg-style-pred-f0' / f'{sid}.wav', audio, samplerate=h.sampling_rate) + + audio = generate_from_code(generator, h, gen_tokens, spkr=src_spkr, f0=pred_gen_f0, gst=trg_style, device=a.device) + sf.write(outdir / '4-gen-tokens-trg-style-pred-f0' / f'{sid}.wav', audio, samplerate=h.sampling_rate) + + shutil.copy(tgt_code_to_fname[code_to_str(result['T'])], outdir / '5-target' / f'{sid}.wav') + + logger.info("Done.") + + +if __name__ == '__main__': + synth(sys.argv[1:]) diff --git a/examples/fast_noisy_channel/README.md b/examples/fast_noisy_channel/README.md new file mode 100644 index 0000000000..f2631a8c34 --- /dev/null +++ b/examples/fast_noisy_channel/README.md @@ -0,0 +1,345 @@ +# Language Models not just for Pre-training: Fast Online Neural Noisy Channel Modeling + +## Introduction +- [Yee et al. (2019)](https://www.aclweb.org/anthology/D19-1571.pdf) introduce a simple and effective noisy channel modeling approach for neural machine translation. However, the noisy channel online decoding approach introduced in this paper is too slow to be practical. +- To address this, [Bhosale et al. 
(2020)](http://www.statmt.org/wmt20/pdf/2020.wmt-1.68.pdf) introduce 3 simple approximations to make this approach very fast and practical without much loss in accuracy.
+This README provides instructions on how to run online decoding or generation with the noisy channel modeling approach, including ways to make it very fast without much loss in accuracy.
+
+## Noisy Channel Modeling
+
+[Yee et al. (2019)](https://www.aclweb.org/anthology/D19-1571.pdf) apply Bayes' rule to predict `P(y|x)`, the probability of the target `y` given the source `x`.
+```P(y|x) = P(x|y) * P(y) / P(x)```
+- `P(x|y)` predicts the source `x` given the target `y` and is referred to as the **channel model**
+- `P(y)` is a **language model** over the target `y`
+- `P(x)` is generally not modeled since it is constant for all `y`.
+
+We use Transformer models to parameterize the direct model `P(y|x)`, the channel model `P(x|y)` and the language model `P(y)`.
+
+During online decoding with beam search, we generate the top `K2` candidates per beam and score them with the following linear combination of the channel model, language model and direct model scores.
+
+```(1 / t) * log(P(y|x)) + (1 / s) * ( λ1 * log(P(x|y)) + λ2 * log(P(y)) )```
+- `t` - Target Prefix Length
+- `s` - Source Length
+- `λ1` - Channel Model Weight
+- `λ2` - Language Model Weight
+
+The top `beam_size` candidates based on the above combined scores are chosen to continue the beams in beam search. With a direct model alone, only the direct model scores `P(y|x)` are used to choose the top candidates. (A small illustrative sketch of this score combination is given further below, just before the generation commands.)
+
+This framework provides a great way to utilize strong target language models trained on large amounts of unlabeled data. Language models can prefer targets unrelated to the source, so we also need a channel model whose role is to ensure that the target preferred by the language model also translates back to the source.
+
+### Training Translation Models and Language Models
+
+For training Transformer models in fairseq for machine translation, refer to the instructions [here](https://github.com/pytorch/fairseq/tree/main/examples/translation).
+
+For training Transformer models in fairseq for language modeling, refer to the instructions [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model).
+
+### Generation with Language Model for German-English translation with fairseq
+
+Here are instructions to generate using a direct model and a target-side language model.
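+
+As a concrete illustration, the following minimal sketch (plain Python written for this README, not fairseq's implementation; all names are illustrative) computes the combined score defined in the [Noisy Channel Modeling](#noisy-channel-modeling) section for a single candidate:
+
+```python
+def combined_score(log_p_y_given_x, log_p_x_given_y, log_p_y,
+                   tgt_prefix_len, src_len, ch_weight, lm_weight):
+    # (1 / t) * log P(y|x) + (1 / s) * (λ1 * log P(x|y) + λ2 * log P(y))
+    direct = log_p_y_given_x / tgt_prefix_len
+    channel_and_lm = (ch_weight * log_p_x_given_y + lm_weight * log_p_y) / src_len
+    return direct + channel_and_lm
+
+# Toy numbers: a candidate with target prefix length 3 and source length 7.
+print(combined_score(-2.1, -3.4, -1.8, tgt_prefix_len=3, src_len=7,
+                     ch_weight=0.3, lm_weight=0.5))
+```
+
+During decoding, the candidates within a beam are ranked by this value; `--lm-wt` and `--ch-wt` in the commands below correspond to `λ2` and `λ1` respectively.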
+ +Note: +- Download and install fairseq as per instructions [here](https://github.com/pytorch/fairseq) +- Preprocess and binarize the dataset as per instructions in section [Test Data Preprocessing](#test-data-preprocessing) + +```sh +binarized_data=data_dir/binarized +direct_model=de_en_seed4.pt +lm_model=en_lm.pt +lm_data=lm_data +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt -O ${direct_model} +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt -O ${lm_model} +mkdir -p ${lm_data} +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/dict.txt -O ${lm_data}/dict.txt + +k2=10 +lenpen=0.16 +lm_wt=0.14 +fairseq-generate ${binarized_data} \ + --user-dir examples/fast_noisy_channel \ + --beam 5 \ + --path ${direct_model} \ + --lm-model ${lm_model} \ + --lm-data ${lm_data} \ + --k2 ${k2} \ + --combine-method lm_only \ + --task noisy_channel_translation \ + --lenpen ${lenpen} \ + --lm-wt ${lm_wt} \ + --gen-subset valid \ + --remove-bpe \ + --fp16 \ + --batch-size 10 +``` +### Noisy Channel Generation for German-English translation with fairseq + +Here are instructions for noisy channel generation with a direct model, channel model and language model as explained in section [Noisy Channel Modeling](#noisy-channel-modeling). + +Note: +- Download and install fairseq as per instructions [here](https://github.com/pytorch/fairseq) +- Preprocess and binarize the dataset as per instructions in section [Test Data Preprocessing](#test-data-preprocessing) + +```sh +binarized_data=data_dir/binarized +direct_model=de_en_seed4.pt +lm_model=en_lm.pt +lm_data=lm_data +ch_model=en_de.big.seed4.pt +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt -O ${direct_model} +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt -O ${lm_model} +mkdir -p ${lm_data} +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/dict.txt -O ${lm_data}/dict.txt +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed4.pt -O ${ch_model} + +k2=10 +lenpen=0.21 +lm_wt=0.50 +bw_wt=0.30 +fairseq-generate ${binarized_data} \ + --user-dir examples/fast_noisy_channel \ + --beam 5 \ + --path ${direct_model} \ + --lm-model ${lm_model} \ + --lm-data ${lm_data} \ + --channel-model ${ch_model} \ + --k2 ${k2} \ + --combine-method noisy_channel \ + --task noisy_channel_translation \ + --lenpen ${lenpen} \ + --lm-wt ${lm_wt} \ + --ch-wt ${bw_wt} \ + --gen-subset test \ + --remove-bpe \ + --fp16 \ + --batch-size 1 +``` +## Fast Noisy Channel Modeling + +[Bhosale et al. (2020)](http://www.statmt.org/wmt20/pdf/2020.wmt-1.68.pdf) introduces 3 approximations that speed up online noisy channel decoding - +- Smaller channel models (`Tranformer Base` with 1 encoder and decoder layer each vs. `Transformer Big`) + - This involves training a channel model that is possibly smaller and less accurate in terms of BLEU than a channel model of the same size as the direct model. + - Since the role of the channel model is mainly to assign low scores to generations from the language model if they don't translate back to the source, we may not need the most accurate channel model for this purpose. +- Smaller output vocabulary size for the channel model (~30,000 -> ~1000) + - The channel model doesn't need to score the full output vocabulary, it just needs to score the source tokens, which are completely known. 
+ - This is specified using the arguments `--channel-scoring-type src_vocab --top-k-vocab 500` + - This means that the output vocabulary for the channel model will be the source tokens for all examples in the batch and the top-K most frequent tokens in the vocabulary + - This reduces the memory consumption needed to store channel model scores significantly +- Smaller number of candidates (`k2`) scored per beam + - This is specified by reducing the argument `--k2` + + +### Fast Noisy Channel Generation for German-English translation with fairseq + +Here are instructions for **fast** noisy channel generation with a direct model, channel model and language model as explained in section [Fast Noisy Channel Modeling](#fast-noisy-channel-modeling). The main differences are that we use a smaller channel model, reduce `--k2`, set `--channel-scoring-type src_vocab --top-k-vocab 500` and increase the `--batch-size`. + +Note: +- Download and install fairseq as per instructions [here](https://github.com/pytorch/fairseq) +- Preprocess and binarize the dataset as per instructions in section [Test Data Preprocessing](#test-data-preprocessing) + +```sh +binarized_data=data_dir/binarized +direct_model=de_en_seed4.pt +lm_model=en_lm.pt +lm_data=lm_data +small_ch_model=en_de.base_1_1.seed4.pt +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt -O ${direct_model} +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt -O ${lm_model} +mkdir -p ${lm_data} +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/dict.txt -O ${lm_data}/dict.txt +wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed4.pt -O ${small_ch_model} + +k2=3 +lenpen=0.23 +lm_wt=0.58 +bw_wt=0.26 +fairseq-generate ${binarized_data} \ + --user-dir examples/fast_noisy_channel \ + --beam 5 \ + --path ${direct_model} \ + --lm-model ${lm_model} \ + --lm-data ${lm_data} \ + --channel-model ${small_ch_model} \ + --k2 ${k2} \ + --combine-method noisy_channel \ + --task noisy_channel_translation \ + --lenpen ${lenpen} \ + --lm-wt ${lm_wt} \ + --ch-wt ${bw_wt} \ + --gen-subset test \ + --remove-bpe \ + --fp16 \ + --batch-size 50 \ + --channel-scoring-type src_vocab --top-k-vocab 500 +``` + +## Test Data Preprocessing + +For preprocessing and binarizing the test sets for Romanian-English and German-English translation, we use the following script - + +```sh +FAIRSEQ=/path/to/fairseq +cd $FAIRSEQ +SCRIPTS=$FAIRSEQ/mosesdecoder/scripts +if [ ! -d "${SCRIPTS}" ]; then + echo 'Cloning Moses github repository (for tokenization scripts)...' 
+ git clone https://github.com/moses-smt/mosesdecoder.git +fi +TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl +NORMALIZE=$SCRIPTS/tokenizer/normalize-punctuation.perl + +s=de +t=en +test=wmt18 + +mkdir -p data_dir + +# Tokenization +if [ $s == "ro" ] ; then + # Note: Get normalise-romanian.py and remove-diacritics.py from + # https://github.com/rsennrich/wmt16-scripts/tree/master/preprocess + sacrebleu -t $test -l $s-$t --echo src | \ + $NORMALIZE -l $s | \ + python normalise-romanian.py | \ + python remove-diacritics.py | \ + $TOKENIZER -l $s -a -q > data_dir/$test.$s-$t.$s +else + sacrebleu -t $test -l $s-$t --echo src | perl $NORMALIZE -l $s | perl $TOKENIZER -threads 8 -a -l $s > data_dir/$test.$s-$t.$s +fi + +sacrebleu -t $test -l $s-$t --echo ref | perl $NORMALIZE -l $t | perl $TOKENIZER -threads 8 -a -l $t > data_dir/$test.$s-$t.$t + + +# Applying BPE +src_bpe_code=/path/to/source/language/bpe/code +tgt_bpe_code=/path/to/target/language/bpe/code +src_dict=/path/to/source/language/dict +tgt_dict=/path/to/target/language/dict + +FASTBPE=$FAIRSEQ/fastBPE +if [ ! -d "${FASTBPE}" ] ; then + git clone https://github.com/glample/fastBPE.git + # Follow compilation instructions at https://github.com/glample/fastBPE + g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast +fi + +${FASTBPE}/fast applybpe data_dir/bpe.$test.$s-$t.$s data_dir/$test.$s-$t.$s ${src_bpe_code} +${FASTBPE}/fast applybpe data_dir/bpe.$test.$s-$t.$s data_dir/$test.$s-$t.$s ${tgt_bpe_code} + +fairseq-preprocess -s $s -t $t \ + --testpref data_dir/bpe.$test.$s-$t \ + --destdir data_dir/binarized \ + --srcdict ${src_dict} \ + --tgtdict ${tgt_dict} +``` + +## Calculating BLEU + +```sh +DETOKENIZER=$SCRIPTS/tokenizer/detokenizer.perl +cat ${generation_output} | grep -P "^H" | sort -V | cut -f 3- | $DETOKENIZER -l $t -q -a | sacrebleu -t $test -l $s-$t +``` + + +## Romanian-English Translation + +The direct and channel models are trained using bitext data (WMT16) combined with backtranslated data (The monolingual data used for backtranslation comes from http://data.statmt.org/rsennrich/wmt16_backtranslations/ (Sennrich et al., 2016c)) + +The backtranslated data is generated using an ensemble of 3 English-Romanian models trained on bitext training data (WMT16) with unrestricted sampling. + +### BPE Codes and Dictionary + +We learn a joint BPE vocabulary of 18K types on the bitext training data which is used for both the source and target. +||Path| +|----------|------| +| BPE Code | [joint_bpe_18k](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/bpe_18k) | +| Dictionary | [dict](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/dict) | + +### Direct Models +For Ro-En with backtranslation, the direct and channel models use a Transformer-Big architecture. + +| Seed | Model | +|----|----| +| 2 | [ro_en_seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/direct_models/seed2.pt) +| 4 | [ro_en_seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/direct_models/seed4.pt) +| 6 | [ro_en_seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/direct_models/seed6.pt) + +### Channel Models +For channel models, we follow the same steps as for the direct models. But backtranslated data is generated in the opposite direction using [this Romanian monolingual data](http://data.statmt.org/rsennrich/wmt16_backtranslations/). +The best lenpen, LM weight and CH weight are obtained by sweeping over the validation set (wmt16/dev) using beam 5. 
+| Model Size | Lenpen | LM Weight | CH Weight | Seed 2 | Seed 4 | Seed 6 | +|----|----|----|----|----|----|----| +| `big` | 0.84 | 0.64 | 0.56 | [big.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/big.seed2.pt) | [big.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/big.seed2.pt) | [big.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/big.seed2.pt) | +| `base_1_1` | 0.63 | 0.40 | 0.37 | [base_1_1.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/base_1_1.seed2.pt) | [base_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/base_1_1.seed4.pt) | [base_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/base_1_1.seed6.pt) | + +### Language Model +The model is trained on de-duplicated English Newscrawl data from 2007-2018 comprising 186 million sentences or 4.5B words after normalization and tokenization. +| | Path | +|----|----| +| `--lm-model` | [transformer_en_lm](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/lm_model/transformer_lm.pt) | +| `--lm-data` | [lm_data](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/lm_model/lm_dict) + +## German-English Translation + +### BPE Codes and Dictionaries + +| | Path| +|----------|------| +| Source BPE Code | [de_bpe_code_24K](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/de_bpe_code_24K) | +| Target BPE Code | [en_bpe_code_24K](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/en_bpe_code_24K) +| Source Dictionary | [de_dict](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/de_dict) | +| Target Dictionary | [en_dict](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/en_dict) | + +### Direct Models +We train on WMT’19 training data. Following [Ng et al., 2019](http://statmt.org/wmt19/pdf/53/WMT33.pdf), we apply language identification filtering and remove sentences longer than 250 tokens as well as sentence pairs with a source/target length ratio exceeding 1.5. This results in 26.8M sentence pairs. +We use the Transformer-Big architecture for the direct model. + +| Seed | Model | +|:----:|----| +| 4 | [de_en_seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt) +| 5 | [de_en_seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed5.pt) +| 6 | [de_en_seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed6.pt) + +### Channel Models + +We train on WMT’19 training data. Following [Ng et al., 2019](http://statmt.org/wmt19/pdf/53/WMT33.pdf), we apply language identification filtering and remove sentences longer than 250 tokens as well as sentence pairs with a source/target length ratio exceeding 1.5. This results in 26.8M sentence pairs. 
+ +| Model Size | Seed 4 | Seed 5 | Seed 6 | +|----|----|----|----| +| `big` | [big.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed4.pt) | [big.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed5.pt) | [big.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed6.pt) | +| `big_1_1` | [big_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big_1_1.seed4.pt) | [big_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big_1_1.seed5.pt) | [big_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big_1_1.seed6.pt) | +| `base` | [base.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base.seed4.pt) | [base.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base.seed5.pt) | [base.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base.seed6.pt) | +| `base_1_1` | [base_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed4.pt) | [base_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed5.pt) | [base_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed6.pt) | +| `half` | [half.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half.seed4.pt) | [half.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half.seed5.pt) | [half.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half.seed6.pt) | +| `half_1_1` | [half_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half_1_1.seed4.pt) | [half_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half_1_1.seed5.pt) | [half_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half_1_1.seed6.pt) | +| `quarter` | [quarter.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter.seed4.pt) | [quarter.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter.seed5.pt) | [quarter.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter.seed6.pt) | +| `quarter_1_1` | [quarter_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter_1_1.seed4.pt) | [quarter_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter_1_1.seed5.pt) | [quarter_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter_1_1.seed6.pt) | +| `8th` | [8th.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th.seed4.pt) | [8th.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th.seed5.pt) | [8th.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th.seed6.pt) | +| `8th_1_1` | [8th_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th_1_1.seed4.pt) | [8th_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th_1_1.seed5.pt) | [8th_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th_1_1.seed6.pt) | +| `16th` | 
[16th.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th.seed4.pt) | [16th.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th.seed5.pt) | [16th.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th.seed6.pt) | +| `16th_1_1` | [16th_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th_1_1.seed4.pt) | [16th_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th_1_1.seed5.pt) | [16th_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th_1_1.seed6.pt) | + +### Language Model +The model is trained on de-duplicated English Newscrawl data from 2007-2018 comprising 186 million sentences or 4.5B words after normalization and tokenization. +| | Path | +|----|----| +| `--lm-model` | [transformer_en_lm](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt) | +| `--lm-data` | [lm_data](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/) + + +## Citation + +```bibtex +@inproceedings{bhosale2020language, + title={Language Models not just for Pre-training: Fast Online Neural Noisy Channel Modeling}, + author={Shruti Bhosale and Kyra Yee and Sergey Edunov and Michael Auli}, + booktitle={Proceedings of the Fifth Conference on Machine Translation (WMT)}, + year={2020}, +} + +@inproceedings{yee2019simple, + title={Simple and Effective Noisy Channel Modeling for Neural Machine Translation}, + author={Yee, Kyra and Dauphin, Yann and Auli, Michael}, + booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, + pages={5700--5705}, + year={2019} +} +``` diff --git a/examples/fast_noisy_channel/__init__.py b/examples/fast_noisy_channel/__init__.py new file mode 100644 index 0000000000..9b248c3a24 --- /dev/null +++ b/examples/fast_noisy_channel/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import noisy_channel_translation # noqa +from . import noisy_channel_sequence_generator # noqa +from . import noisy_channel_beam_search # noqa diff --git a/examples/fast_noisy_channel/noisy_channel_beam_search.py b/examples/fast_noisy_channel/noisy_channel_beam_search.py new file mode 100644 index 0000000000..23869ebcd0 --- /dev/null +++ b/examples/fast_noisy_channel/noisy_channel_beam_search.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
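+#
+# NoisyChannelBeamSearch (below) implements the beam-search step that ranks
+# candidates by the combined direct-model, channel-model and language-model scores.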
+ +import torch +from fairseq.search import Search + + +class NoisyChannelBeamSearch(Search): + + def __init__(self, tgt_dict): + super().__init__(tgt_dict) + self.fw_scores_buf = None + self.lm_scores_buf = None + + def _init_buffers(self, t): + # super()._init_buffers(t) + if self.fw_scores_buf is None: + self.scores_buf = t.new() + self.indices_buf = torch.LongTensor().to(device=t.device) + self.beams_buf = torch.LongTensor().to(device=t.device) + self.fw_scores_buf = t.new() + self.lm_scores_buf = t.new() + + def combine_fw_bw(self, combine_method, fw_cum, bw, step): + if combine_method == "noisy_channel": + fw_norm = fw_cum.div(step + 1) + lprobs = bw + fw_norm + elif combine_method == "lm_only": + lprobs = bw + fw_cum + + return lprobs + + def step(self, step, fw_lprobs, scores, bw_lprobs, lm_lprobs, combine_method): + self._init_buffers(fw_lprobs) + bsz, beam_size, vocab_size = fw_lprobs.size() + + if step == 0: + # at the first step all hypotheses are equally likely, so use + # only the first beam + fw_lprobs = fw_lprobs[:, ::beam_size, :].contiguous() + bw_lprobs = bw_lprobs[:, ::beam_size, :].contiguous() + # nothing to add since we are at the first step + fw_lprobs_cum = fw_lprobs + + else: + # make probs contain cumulative scores for each hypothesis + raw_scores = (scores[:, :, step - 1].unsqueeze(-1)) + fw_lprobs_cum = (fw_lprobs.add(raw_scores)) + + combined_lprobs = self.combine_fw_bw(combine_method, fw_lprobs_cum, bw_lprobs, step) + + # choose the top k according to the combined noisy channel model score + torch.topk( + combined_lprobs.view(bsz, -1), + k=min( + # Take the best 2 x beam_size predictions. We'll choose the first + # beam_size of these which don't predict eos to continue with. + beam_size * 2, + combined_lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad + ), + out=(self.scores_buf, self.indices_buf), + ) + # save corresponding fw and lm scores + self.fw_scores_buf = torch.gather(fw_lprobs_cum.view(bsz, -1), 1, self.indices_buf) + self.lm_scores_buf = torch.gather(lm_lprobs.view(bsz, -1), 1, self.indices_buf) + # Project back into relative indices and beams + self.beams_buf = self.indices_buf // vocab_size + self.indices_buf.fmod_(vocab_size) + return self.scores_buf, self.fw_scores_buf, self.lm_scores_buf, self.indices_buf, self.beams_buf diff --git a/examples/fast_noisy_channel/noisy_channel_sequence_generator.py b/examples/fast_noisy_channel/noisy_channel_sequence_generator.py new file mode 100644 index 0000000000..ea8fae98e8 --- /dev/null +++ b/examples/fast_noisy_channel/noisy_channel_sequence_generator.py @@ -0,0 +1,842 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
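+#
+# NoisyChannelSequenceGenerator (below) performs beam search that rescores the
+# top-k2 candidates of each beam with a channel model and a language model.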
+ +from typing import Dict, List, Optional + +import math +import numpy as np + +import torch +import torch.nn.functional as F +from torch import Tensor + +from .noisy_channel_beam_search import NoisyChannelBeamSearch +from fairseq.sequence_generator import EnsembleModel + + +class NoisyChannelSequenceGenerator(object): + def __init__( + self, + combine_method, + tgt_dict, + src_dict=None, + beam_size=1, + max_len_a=0, + max_len_b=200, + min_len=1, + len_penalty=1.0, + unk_penalty=0.0, + retain_dropout=False, + temperature=1.0, + match_source_len=False, + no_repeat_ngram_size=0, + normalize_scores=True, + channel_models=None, + k2=10, + ch_weight=1.0, + channel_scoring_type='log_norm', + top_k_vocab=0, + lm_models=None, + lm_dict=None, + lm_weight=1.0, + normalize_lm_scores_by_tgt_len=False, + ): + """Generates translations of a given source sentence, + using beam search with noisy channel decoding. + + Args: + combine_method (string, optional): Method to combine direct, LM and + channel model scores (default: None) + tgt_dict (~fairseq.data.Dictionary): target dictionary + src_dict (~fairseq.data.Dictionary): source dictionary + beam_size (int, optional): beam width (default: 1) + max_len_a/b (int, optional): generate sequences of maximum length + ax + b, where x is the source length + min_len (int, optional): the minimum length of the generated output + (not including end-of-sentence) + len_penalty (float, optional): length penalty, where <1.0 favors + shorter, >1.0 favors longer sentences (default: 1.0) + unk_penalty (float, optional): unknown word penalty, where <0 + produces more unks, >0 produces fewer (default: 0.0) + retain_dropout (bool, optional): use dropout when generating + (default: False) + temperature (float, optional): temperature, where values + >1.0 produce more uniform samples and values <1.0 produce + sharper samples (default: 1.0) + match_source_len (bool, optional): outputs should match the source + length (default: False) + no_repeat_ngram_size (int, optional): Size of n-grams that we avoid + repeating in the generation (default: 0) + normalize_scores (bool, optional): normalize scores by the length + of the output (default: True) + channel_models (List[~fairseq.models.FairseqModel]): ensemble of models + translating from the target to the source + k2 (int, optional): Top K2 candidates to score per beam at each step (default:10) + ch_weight (int, optional): Weight associated with the channel model score + assuming that the direct model score has weight 1.0 (default: 1.0) + channel_scoring_type (str, optional): String specifying how to score + the channel model (default: 'log_norm') + top_k_vocab (int, optional): If `channel_scoring_type` is `'src_vocab'` or + `'src_vocab_batched'`, then this parameter specifies the number of + most frequent tokens to include in the channel model output vocabulary, + in addition to the source tokens in the input batch (default: 0) + lm_models (List[~fairseq.models.FairseqModel]): ensemble of models + generating text in the target language + lm_dict (~fairseq.data.Dictionary): LM Model dictionary + lm_weight (int, optional): Weight associated with the LM model score + assuming that the direct model score has weight 1.0 (default: 1.0) + normalize_lm_scores_by_tgt_len (bool, optional): Should we normalize LM scores + by the target length? 
By default, we normalize the combination of + LM and channel model scores by the source length + """ + self.pad = tgt_dict.pad() + self.unk = tgt_dict.unk() + self.eos = tgt_dict.eos() + self.vocab_size = len(tgt_dict) + self.beam_size = beam_size + # the max beam size is the dictionary size - 1, since we never select pad + self.beam_size = min(beam_size, self.vocab_size - 1) + self.max_len_a = max_len_a + self.max_len_b = max_len_b + self.min_len = min_len + self.normalize_scores = normalize_scores + self.len_penalty = len_penalty + self.unk_penalty = unk_penalty + self.retain_dropout = retain_dropout + self.temperature = temperature + self.match_source_len = match_source_len + self.no_repeat_ngram_size = no_repeat_ngram_size + self.channel_models = channel_models + self.src_dict = src_dict + self.tgt_dict = tgt_dict + self.combine_method = combine_method + self.k2 = k2 + self.ch_weight = ch_weight + self.channel_scoring_type = channel_scoring_type + self.top_k_vocab = top_k_vocab + self.lm_models = lm_models + self.lm_dict = lm_dict + self.lm_weight = lm_weight + self.log_softmax_fn = torch.nn.LogSoftmax(dim=1) + self.normalize_lm_scores_by_tgt_len = normalize_lm_scores_by_tgt_len + + self.share_tgt_dict = (self.lm_dict == self.tgt_dict) + self.tgt_to_lm = make_dict2dict(tgt_dict, lm_dict) + + self.ch_scoring_bsz = 3072 + + assert temperature > 0, '--temperature must be greater than 0' + + self.search = NoisyChannelBeamSearch(tgt_dict) + + @torch.no_grad() + def generate( + self, + models, + sample, + prefix_tokens=None, + bos_token=None, + **kwargs + ): + """Generate a batch of translations. + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models + sample (dict): batch + prefix_tokens (torch.LongTensor, optional): force decoder to begin + with these tokens + """ + model = EnsembleModel(models) + incremental_states = torch.jit.annotate( + List[Dict[str, Dict[str, Optional[Tensor]]]], + [ + torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) + for i in range(model.models_size) + ], + ) + if not self.retain_dropout: + model.eval() + + # model.forward normally channels prev_output_tokens into the decoder + # separately, but SequenceGenerator directly calls model.encoder + encoder_input = { + k: v for k, v in sample['net_input'].items() + if k != 'prev_output_tokens' + } + src_tokens = encoder_input['src_tokens'] + src_lengths_no_eos = (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1) + input_size = src_tokens.size() + # batch dimension goes first followed by source lengths + bsz = input_size[0] + src_len = input_size[1] + beam_size = self.beam_size + + if self.match_source_len: + max_len = src_lengths_no_eos.max().item() + else: + max_len = min( + int(self.max_len_a * src_len + self.max_len_b), + # exclude the EOS marker + model.max_decoder_positions() - 1, + ) + + # compute the encoder output for each beam + encoder_outs = model.forward_encoder(encoder_input) + new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) + new_order = new_order.to(src_tokens.device).long() + encoder_outs = model.reorder_encoder_out(encoder_outs, new_order) + + src_lengths = encoder_input['src_lengths'] + # initialize buffers + scores = src_tokens.new(bsz * beam_size, max_len + 1).float().fill_(0) + lm_prefix_scores = src_tokens.new(bsz * beam_size).float().fill_(0) + + scores_buf = scores.clone() + tokens = src_tokens.new(bsz * beam_size, max_len + 2).long().fill_(self.pad) + tokens_buf = tokens.clone() + tokens[:, 0] = self.eos if bos_token is 
None else bos_token + + # reorder source tokens so they may be used as a reference in generating P(S|T) + src_tokens = reorder_all_tokens(src_tokens, src_lengths, self.src_dict.eos_index) + + src_tokens = src_tokens.repeat(1, beam_size).view(-1, src_len) + src_lengths = src_lengths.view(bsz, -1).repeat(1, beam_size).view(bsz*beam_size, -1) + + attn, attn_buf = None, None + nonpad_idxs = None + + # The cands_to_ignore indicates candidates that should be ignored. + # For example, suppose we're sampling and have already finalized 2/5 + # samples. Then the cands_to_ignore would mark 2 positions as being ignored, + # so that we only finalize the remaining 3 samples. + cands_to_ignore = src_tokens.new_zeros(bsz, beam_size).eq(-1) # forward and backward-compatible False mask + + # list of completed sentences + finalized = [[] for i in range(bsz)] + finished = [False for i in range(bsz)] + num_remaining_sent = bsz + + # number of candidate hypos per step + cand_size = 2 * beam_size # 2 x beam size in case half are EOS + + # offset arrays for converting between different indexing schemes + bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) + cand_offsets = torch.arange(0, cand_size).type_as(tokens) + + # helper function for allocating buffers on the fly + buffers = {} + + def buffer(name, type_of=tokens): # noqa + if name not in buffers: + buffers[name] = type_of.new() + return buffers[name] + + def is_finished(sent, step, unfin_idx): + """ + Check whether we've finished generation for a given sentence, by + comparing the worst score among finalized hypotheses to the best + possible score among unfinalized hypotheses. + """ + assert len(finalized[sent]) <= beam_size + if len(finalized[sent]) == beam_size: + return True + return False + + def finalize_hypos(step, bbsz_idx, eos_scores, combined_noisy_channel_eos_scores): + """ + Finalize the given hypotheses at this step, while keeping the total + number of finalized hypotheses per sentence <= beam_size. + + Note: the input must be in the desired finalization order, so that + hypotheses that appear earlier in the input are preferred to those + that appear later. 
+ + Args: + step: current time step + bbsz_idx: A vector of indices in the range [0, bsz*beam_size), + indicating which hypotheses to finalize + eos_scores: A vector of the same size as bbsz_idx containing + fw scores for each hypothesis + combined_noisy_channel_eos_scores: A vector of the same size as bbsz_idx containing + combined noisy channel scores for each hypothesis + """ + assert bbsz_idx.numel() == eos_scores.numel() + + # clone relevant token and attention tensors + tokens_clone = tokens.index_select(0, bbsz_idx) + tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS + assert not tokens_clone.eq(self.eos).any() + tokens_clone[:, step] = self.eos + attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None + + # compute scores per token position + pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1] + pos_scores[:, step] = eos_scores + # convert from cumulative to per-position scores + pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] + + # normalize sentence-level scores + if self.normalize_scores: + combined_noisy_channel_eos_scores /= (step + 1) ** self.len_penalty + + cum_unfin = [] + prev = 0 + for f in finished: + if f: + prev += 1 + else: + cum_unfin.append(prev) + + sents_seen = set() + for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), combined_noisy_channel_eos_scores.tolist())): + unfin_idx = idx // beam_size + sent = unfin_idx + cum_unfin[unfin_idx] + + sents_seen.add((sent, unfin_idx)) + + if self.match_source_len and step > src_lengths_no_eos[unfin_idx]: + score = -math.inf + + def get_hypo(): + + if attn_clone is not None: + # remove padding tokens from attn scores + hypo_attn = attn_clone[i][nonpad_idxs[sent]] + _, alignment = hypo_attn.max(dim=0) + else: + hypo_attn = None + alignment = None + + return { + 'tokens': tokens_clone[i], + 'score': score, + 'attention': hypo_attn, # src_len x tgt_len + 'alignment': alignment, + 'positional_scores': pos_scores[i], + } + + if len(finalized[sent]) < beam_size: + finalized[sent].append(get_hypo()) + + newly_finished = [] + for sent, unfin_idx in sents_seen: + # check termination conditions for this sentence + if not finished[sent] and is_finished(sent, step, unfin_idx): + finished[sent] = True + newly_finished.append(unfin_idx) + return newly_finished + + def noisy_channel_rescoring(lprobs, beam_size, bsz, src_tokens, tokens, k): + """Rescore the top k hypothesis from each beam using noisy channel modeling + Returns: + new_fw_lprobs: the direct model probabilities after pruning the top k + new_ch_lm_lprobs: the combined channel and language model probabilities + new_lm_lprobs: the language model probabilities after pruning the top k + """ + with torch.no_grad(): + lprobs_size = lprobs.size() + if prefix_tokens is not None and step < prefix_tokens.size(1): + probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] + cand_scores = torch.gather( + probs_slice, dim=1, + index=prefix_tokens[:, step].view(-1, 1).data + ).expand(-1, beam_size).contiguous().view(bsz*beam_size, 1) + cand_indices = prefix_tokens[:, step].view(-1, 1).expand(bsz, beam_size).data.contiguous().view(bsz*beam_size, 1) + + # need to calculate and save fw and lm probs for prefix tokens + fw_top_k = cand_scores + fw_top_k_idx = cand_indices + k = 1 + else: + # take the top k best words for every sentence in batch*beam + fw_top_k, fw_top_k_idx = torch.topk(lprobs.view(beam_size*bsz, -1), k=k) + eos_idx = torch.nonzero(fw_top_k_idx.view(bsz*beam_size*k, -1) == self.eos)[:, 0] + 
ch_scores = fw_top_k.new_full((beam_size*bsz*k, ), 0) + src_size = torch.sum(src_tokens[:, :] != self.src_dict.pad_index, dim=1, keepdim=True, dtype=fw_top_k.dtype) + + if self.combine_method != "lm_only": + temp_src_tokens_full = src_tokens[:, :].repeat(1, k).view(bsz*beam_size*k, -1) + not_padding = temp_src_tokens_full[:, 1:] != self.src_dict.pad_index + cur_tgt_size = step+2 + + # add eos to all candidate sentences except those that already end in eos + eos_tokens = tokens[:, 0].repeat(1, k).view(-1, 1) + eos_tokens[eos_idx] = self.tgt_dict.pad_index + + if step == 0: + channel_input = torch.cat((fw_top_k_idx.view(-1, 1), eos_tokens), 1) + else: + # move eos from beginning to end of target sentence + channel_input = torch.cat((tokens[:, 1:step + 1].repeat(1, k).view(-1, step), fw_top_k_idx.view(-1, 1), eos_tokens), 1) + + ch_input_lengths = torch.tensor(np.full(channel_input.size(0), cur_tgt_size)) + ch_input_lengths[eos_idx] = cur_tgt_size-1 + if self.channel_scoring_type == "unnormalized": + ch_encoder_output = channel_model.encoder(channel_input, src_lengths=ch_input_lengths) + ch_decoder_output, _ = channel_model.decoder(temp_src_tokens_full, encoder_out=ch_encoder_output, features_only=True) + del ch_encoder_output + ch_intermed_scores = channel_model.decoder.unnormalized_scores_given_target(ch_decoder_output, target_ids=temp_src_tokens_full[:, 1:]) + ch_intermed_scores = ch_intermed_scores.float() + ch_intermed_scores *= not_padding.float() + ch_scores = torch.sum(ch_intermed_scores, dim=1) + elif self.channel_scoring_type == "k2_separate": + for k_idx in range(k): + k_eos_tokens = eos_tokens[k_idx::k, :] + if step == 0: + k_ch_input = torch.cat((fw_top_k_idx[:, k_idx:k_idx+1], k_eos_tokens), 1) + else: + # move eos from beginning to end of target sentence + k_ch_input = torch.cat((tokens[:, 1:step + 1], fw_top_k_idx[:, k_idx:k_idx+1], k_eos_tokens), 1) + k_ch_input_lengths = ch_input_lengths[k_idx::k] + k_ch_output = channel_model(k_ch_input, k_ch_input_lengths, src_tokens) + k_ch_lprobs = channel_model.get_normalized_probs(k_ch_output, log_probs=True) + k_ch_intermed_scores = torch.gather(k_ch_lprobs[:, :-1, :], 2, src_tokens[:, 1:].unsqueeze(2)).squeeze(2) + k_ch_intermed_scores *= not_padding.float() + ch_scores[k_idx::k] = torch.sum(k_ch_intermed_scores, dim=1) + elif self.channel_scoring_type == "src_vocab": + ch_encoder_output = channel_model.encoder(channel_input, src_lengths=ch_input_lengths) + ch_decoder_output, _ = channel_model.decoder(temp_src_tokens_full, encoder_out=ch_encoder_output, features_only=True) + + del ch_encoder_output + ch_lprobs = normalized_scores_with_batch_vocab( + channel_model.decoder, + ch_decoder_output, src_tokens, k, bsz, beam_size, + self.src_dict.pad_index, top_k=self.top_k_vocab) + ch_scores = torch.sum(ch_lprobs, dim=1) + elif self.channel_scoring_type == "src_vocab_batched": + ch_bsz_size = temp_src_tokens_full.shape[0] + ch_lprobs_list = [None] * len(range(0, ch_bsz_size, self.ch_scoring_bsz)) + for i, start_idx in enumerate(range(0, ch_bsz_size, self.ch_scoring_bsz)): + end_idx = min(start_idx + self.ch_scoring_bsz, ch_bsz_size) + temp_src_tokens_full_batch = temp_src_tokens_full[start_idx:end_idx, :] + channel_input_batch = channel_input[start_idx:end_idx, :] + ch_input_lengths_batch = ch_input_lengths[start_idx:end_idx] + ch_encoder_output_batch = channel_model.encoder(channel_input_batch, src_lengths=ch_input_lengths_batch) + ch_decoder_output_batch, _ = channel_model.decoder(temp_src_tokens_full_batch, 
encoder_out=ch_encoder_output_batch, features_only=True) + ch_lprobs_list[i] = normalized_scores_with_batch_vocab( + channel_model.decoder, + ch_decoder_output_batch, src_tokens, k, bsz, beam_size, + self.src_dict.pad_index, top_k=self.top_k_vocab, + start_idx=start_idx, end_idx=end_idx) + ch_lprobs = torch.cat(ch_lprobs_list, dim=0) + ch_scores = torch.sum(ch_lprobs, dim=1) + else: + ch_output = channel_model(channel_input, ch_input_lengths, temp_src_tokens_full) + ch_lprobs = channel_model.get_normalized_probs(ch_output, log_probs=True) + ch_intermed_scores = torch.gather(ch_lprobs[:, :-1, :], 2, temp_src_tokens_full[:, 1:].unsqueeze(2)).squeeze().view(bsz*beam_size*k, -1) + ch_intermed_scores *= not_padding.float() + ch_scores = torch.sum(ch_intermed_scores, dim=1) + + else: + cur_tgt_size = 0 + ch_scores = ch_scores.view(bsz*beam_size, k) + expanded_lm_prefix_scores = lm_prefix_scores.unsqueeze(1).expand(-1, k).flatten() + + if self.share_tgt_dict: + lm_scores = get_lm_scores(lm, tokens[:, :step + 1].view(-1, step+1), lm_incremental_states, fw_top_k_idx.view(-1, 1), torch.tensor(np.full(tokens.size(0), step+1)), k) + else: + new_lm_input = dict2dict(tokens[:, :step + 1].view(-1, step+1), self.tgt_to_lm) + new_cands = dict2dict(fw_top_k_idx.view(-1, 1), self.tgt_to_lm) + lm_scores = get_lm_scores(lm, new_lm_input, lm_incremental_states, new_cands, torch.tensor(np.full(tokens.size(0), step+1)), k) + + lm_scores.add_(expanded_lm_prefix_scores) + ch_lm_scores = combine_ch_lm(self.combine_method, ch_scores, lm_scores, src_size, cur_tgt_size) + # initialize all as min value + new_fw_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1) + new_ch_lm_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1) + new_lm_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1) + new_fw_lprobs[:, self.pad] = -math.inf + new_ch_lm_lprobs[:, self.pad] = -math.inf + new_lm_lprobs[:, self.pad] = -math.inf + + new_fw_lprobs.scatter_(1, fw_top_k_idx, fw_top_k) + new_ch_lm_lprobs.scatter_(1, fw_top_k_idx, ch_lm_scores) + new_lm_lprobs.scatter_(1, fw_top_k_idx, lm_scores.view(-1, k)) + return new_fw_lprobs, new_ch_lm_lprobs, new_lm_lprobs + + def combine_ch_lm(combine_type, ch_scores, lm_scores1, src_size, tgt_size): + if self.channel_scoring_type == "unnormalized": + ch_scores = self.log_softmax_fn( + ch_scores.view(-1, self.beam_size * self.k2) + ).view(ch_scores.shape) + ch_scores = ch_scores * self.ch_weight + lm_scores1 = lm_scores1 * self.lm_weight + + if combine_type == "lm_only": + # log P(T|S) + log P(T) + ch_scores = lm_scores1.view(ch_scores.size()) + elif combine_type == "noisy_channel": + # 1/t log P(T|S) + 1/s log P(S|T) + 1/t log P(T) + if self.normalize_lm_scores_by_tgt_len: + ch_scores.div_(src_size) + lm_scores_norm = lm_scores1.view(ch_scores.size()).div(tgt_size) + ch_scores.add_(lm_scores_norm) + # 1/t log P(T|S) + 1/s log P(S|T) + 1/s log P(T) + else: + ch_scores.add_(lm_scores1.view(ch_scores.size())) + ch_scores.div_(src_size) + + return ch_scores + + if self.channel_models is not None: + channel_model = self.channel_models[0] # assume only one channel_model model + else: + channel_model = None + + lm = EnsembleModel(self.lm_models) + lm_incremental_states = torch.jit.annotate( + List[Dict[str, Dict[str, Optional[Tensor]]]], + [ + torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) + for i in range(lm.models_size) + ], + ) + + reorder_state = None + batch_idxs = None + for step in range(max_len + 1): # one extra step for EOS 
marker + # reorder decoder internal states based on the prev choice of beams + if reorder_state is not None: + if batch_idxs is not None: + # update beam indices to take into account removed sentences + corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs) + reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size) + model.reorder_incremental_state(incremental_states, reorder_state) + encoder_outs = model.reorder_encoder_out(encoder_outs, reorder_state) + + lm.reorder_incremental_state(lm_incremental_states, reorder_state) + + fw_lprobs, avg_attn_scores = model.forward_decoder( + tokens[:, :step + 1], encoder_outs, incremental_states, temperature=self.temperature, + ) + + fw_lprobs[:, self.pad] = -math.inf # never select pad + fw_lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty + fw_lprobs, ch_lm_lprobs, lm_lprobs = noisy_channel_rescoring(fw_lprobs, beam_size, bsz, src_tokens, tokens, self.k2) + + # handle min and max length constraints + if step >= max_len: + fw_lprobs[:, :self.eos] = -math.inf + fw_lprobs[:, self.eos + 1:] = -math.inf + elif step < self.min_len: + fw_lprobs[:, self.eos] = -math.inf + + # handle prefix tokens (possibly with different lengths) + if prefix_tokens is not None and step < prefix_tokens.size(1): + prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) + prefix_mask = prefix_toks.ne(self.pad) + + prefix_fw_lprobs = fw_lprobs.gather(-1, prefix_toks.unsqueeze(-1)) + fw_lprobs[prefix_mask] = -math.inf + fw_lprobs[prefix_mask] = fw_lprobs[prefix_mask].scatter_( + -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_fw_lprobs + ) + + prefix_ch_lm_lprobs = ch_lm_lprobs.gather(-1, prefix_toks.unsqueeze(-1)) + ch_lm_lprobs[prefix_mask] = -math.inf + ch_lm_lprobs[prefix_mask] = ch_lm_lprobs[prefix_mask].scatter_( + -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_ch_lm_lprobs + ) + + prefix_lm_lprobs = lm_lprobs.gather(-1, prefix_toks.unsqueeze(-1)) + lm_lprobs[prefix_mask] = -math.inf + lm_lprobs[prefix_mask] = lm_lprobs[prefix_mask].scatter_( + -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lm_lprobs + ) + + # if prefix includes eos, then we should make sure tokens and + # scores are the same across all beams + eos_mask = prefix_toks.eq(self.eos) + if eos_mask.any(): + # validate that the first beam matches the prefix + first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[:, 0, 1:step + 1] + eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] + target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step] + assert (first_beam == target_prefix).all() + + def replicate_first_beam(tensor, mask): + tensor = tensor.view(-1, beam_size, tensor.size(-1)) + tensor[mask] = tensor[mask][:, :1, :] + return tensor.view(-1, tensor.size(-1)) + + # copy tokens, scores and lprobs from the first beam to all beams + tokens = replicate_first_beam(tokens, eos_mask_batch_dim) + scores = replicate_first_beam(scores, eos_mask_batch_dim) + + fw_lprobs = replicate_first_beam(fw_lprobs, eos_mask_batch_dim) + ch_lm_lprobs = replicate_first_beam(ch_lm_lprobs, eos_mask_batch_dim) + lm_lprobs = replicate_first_beam(lm_lprobs, eos_mask_batch_dim) + + if self.no_repeat_ngram_size > 0: + # for each beam and batch sentence, generate a list of previous ngrams + gen_ngrams = [{} for bbsz_idx in range(bsz * beam_size)] + for bbsz_idx in range(bsz * beam_size): + gen_tokens = tokens[bbsz_idx].tolist() + for ngram in zip(*[gen_tokens[i:] for i in range(self.no_repeat_ngram_size)]): + 
gen_ngrams[bbsz_idx][tuple(ngram[:-1])] = \ + gen_ngrams[bbsz_idx].get(tuple(ngram[:-1]), []) + [ngram[-1]] + + # Record attention scores + if avg_attn_scores is not None: + if attn is None: + attn = scores.new(bsz * beam_size, src_tokens.size(1), max_len + 2) + attn_buf = attn.clone() + nonpad_idxs = src_tokens.ne(self.pad) + attn[:, :, step + 1].copy_(avg_attn_scores) + + scores = scores.type_as(fw_lprobs) + scores_buf = scores_buf.type_as(fw_lprobs) + + self.search.set_src_lengths(src_lengths_no_eos) + + if self.no_repeat_ngram_size > 0: + def calculate_banned_tokens(bbsz_idx): + # before decoding the next token, prevent decoding of ngrams that have already appeared + ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist()) + return gen_ngrams[bbsz_idx].get(ngram_index, []) + + if step + 2 - self.no_repeat_ngram_size >= 0: + # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)] + else: + banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)] + + for bbsz_idx in range(bsz * beam_size): + fw_lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf + + combined_noisy_channel_scores, fw_lprobs_top_k, lm_lprobs_top_k, cand_indices, cand_beams = self.search.step( + step, + fw_lprobs.view(bsz, -1, self.vocab_size), + scores.view(bsz, beam_size, -1)[:, :, :step], ch_lm_lprobs.view(bsz, -1, self.vocab_size), + lm_lprobs.view(bsz, -1, self.vocab_size), self.combine_method + ) + + # cand_bbsz_idx contains beam indices for the top candidate + # hypotheses, with a range of values: [0, bsz*beam_size), + # and dimensions: [bsz, cand_size] + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + + # finalize hypotheses that end in eos (except for candidates to be ignored) + eos_mask = cand_indices.eq(self.eos) + eos_mask[:, :beam_size] &= ~cands_to_ignore + + # only consider eos when it's among the top beam_size indices + eos_bbsz_idx = torch.masked_select( + cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + + finalized_sents = set() + if eos_bbsz_idx.numel() > 0: + eos_scores = torch.masked_select( + fw_lprobs_top_k[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + combined_noisy_channel_eos_scores = torch.masked_select( + combined_noisy_channel_scores[:, :beam_size], + mask=eos_mask[:, :beam_size], + ) + + # finalize hypo using channel model score + finalized_sents = finalize_hypos( + step, eos_bbsz_idx, eos_scores, combined_noisy_channel_eos_scores) + + num_remaining_sent -= len(finalized_sents) + + assert num_remaining_sent >= 0 + if num_remaining_sent == 0: + break + + if len(finalized_sents) > 0: + new_bsz = bsz - len(finalized_sents) + + # construct batch_idxs which holds indices of batches to keep for the next pass + batch_mask = cand_indices.new_ones(bsz) + batch_mask[cand_indices.new(finalized_sents)] = 0 + batch_idxs = torch.nonzero(batch_mask).squeeze(-1) + + eos_mask = eos_mask[batch_idxs] + cand_beams = cand_beams[batch_idxs] + bbsz_offsets.resize_(new_bsz, 1) + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + + lm_lprobs_top_k = lm_lprobs_top_k[batch_idxs] + + fw_lprobs_top_k = fw_lprobs_top_k[batch_idxs] + cand_indices = cand_indices[batch_idxs] + if prefix_tokens is not None: + prefix_tokens = prefix_tokens[batch_idxs] + src_lengths_no_eos = src_lengths_no_eos[batch_idxs] + cands_to_ignore = cands_to_ignore[batch_idxs] + + scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + scores_buf.resize_as_(scores) + 
tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + tokens_buf.resize_as_(tokens) + src_tokens = src_tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + src_lengths = src_lengths.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + lm_prefix_scores = lm_prefix_scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1).squeeze() + + if attn is not None: + attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1) + attn_buf.resize_as_(attn) + bsz = new_bsz + else: + batch_idxs = None + + # Set active_mask so that values > cand_size indicate eos or + # ignored hypos and values < cand_size indicate candidate + # active hypos. After this, the min values per row are the top + # candidate active hypos. + eos_mask[:, :beam_size] |= cands_to_ignore + active_mask = torch.add( + eos_mask.type_as(cand_offsets) * cand_size, + cand_offsets[: eos_mask.size(1)], + ) + + # get the top beam_size active hypotheses, which are just the hypos + # with the smallest values in active_mask + active_hypos, new_cands_to_ignore = buffer('active_hypos'), buffer('new_cands_to_ignore') + torch.topk( + active_mask, k=beam_size, dim=1, largest=False, + out=(new_cands_to_ignore, active_hypos) + ) + + # update cands_to_ignore to ignore any finalized hypos + cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] + assert (~cands_to_ignore).any(dim=1).all() + + active_bbsz_idx = buffer('active_bbsz_idx') + torch.gather( + cand_bbsz_idx, dim=1, index=active_hypos, + out=active_bbsz_idx, + ) + active_scores = torch.gather( + fw_lprobs_top_k, dim=1, index=active_hypos, + out=scores[:, step].view(bsz, beam_size), + ) + + active_bbsz_idx = active_bbsz_idx.view(-1) + active_scores = active_scores.view(-1) + + # copy tokens and scores for active hypotheses + torch.index_select( + tokens[:, :step + 1], dim=0, index=active_bbsz_idx, + out=tokens_buf[:, :step + 1], + ) + torch.gather( + cand_indices, dim=1, index=active_hypos, + out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], + ) + if step > 0: + torch.index_select( + scores[:, :step], dim=0, index=active_bbsz_idx, + out=scores_buf[:, :step], + ) + torch.gather( + fw_lprobs_top_k, dim=1, index=active_hypos, + out=scores_buf.view(bsz, beam_size, -1)[:, :, step], + ) + torch.gather( + lm_lprobs_top_k, dim=1, index=active_hypos, + out=lm_prefix_scores.view(bsz, beam_size) + ) + + # copy attention for active hypotheses + if attn is not None: + torch.index_select( + attn[:, :, :step + 2], dim=0, index=active_bbsz_idx, + out=attn_buf[:, :, :step + 2], + ) + + # swap buffers + tokens, tokens_buf = tokens_buf, tokens + scores, scores_buf = scores_buf, scores + if attn is not None: + attn, attn_buf = attn_buf, attn + + # reorder incremental state in decoder + reorder_state = active_bbsz_idx + + # sort by score descending + for sent in range(len(finalized)): + finalized[sent] = sorted(finalized[sent], key=lambda r: r['score'], reverse=True) + + return finalized + + +def get_lm_scores(model, input_tokens, incremental_states, cand_tokens, input_len, k): + with torch.no_grad(): + lm_lprobs, avg_attn_scores = model.forward_decoder( + input_tokens, encoder_outs=None, incremental_states=incremental_states, + ) + + lm_lprobs_size = lm_lprobs.size(0) + probs_next_wrd = torch.gather(lm_lprobs.repeat(1, k).view(lm_lprobs_size*k, -1), 1, cand_tokens).squeeze().view(-1) + + return probs_next_wrd + + +def make_dict2dict(old_dict, new_dict): + dict2dict_map = {} + for sym in old_dict.symbols: + 
dict2dict_map[old_dict.index(sym)] = new_dict.index(sym) + return dict2dict_map + + +def dict2dict(tokens, dict2dict_map): + if tokens.device == torch.device('cpu'): + tokens_tmp = tokens + else: + tokens_tmp = tokens.cpu() + return tokens_tmp.map_( + tokens_tmp, + lambda _, val, dict2dict_map=dict2dict_map : dict2dict_map[float(val)] + ).to(tokens.device) + + +def reorder_tokens(tokens, lengths, eos): + # reorder source tokens so they may be used as reference for P(S|T) + return torch.cat((tokens.new([eos]), tokens[-lengths:-1], tokens[:-lengths]), 0) + + +def reorder_all_tokens(tokens, lengths, eos): + # used to reorder src tokens from [ .. ] to [ ...] + # so source tokens can be used to predict P(S|T) + return torch.stack([reorder_tokens(token, length, eos) for token, length in zip(tokens, lengths)]) + + +def normalized_scores_with_batch_vocab( + model_decoder, features, target_ids, k, bsz, beam_size, + pad_idx, top_k=0, vocab_size_meter=None, start_idx=None, + end_idx=None, **kwargs): + """ + Get normalized probabilities (or log probs) from a net's output + w.r.t. vocab consisting of target IDs in the batch + """ + if model_decoder.adaptive_softmax is None: + weight = model_decoder.output_projection.weight + vocab_ids = torch.unique( + torch.cat( + (torch.unique(target_ids), torch.arange(top_k, device=target_ids.device)) + ) + ) + id_map = dict(zip(vocab_ids.tolist(), range(len(vocab_ids)))) + mapped_target_ids = target_ids.cpu().apply_( + lambda x, id_map=id_map: id_map[x] + ).to(target_ids.device) + expanded_target_ids = mapped_target_ids[:, :].repeat(1, k).view(bsz*beam_size*k, -1) + if start_idx is not None and end_idx is not None: + expanded_target_ids = expanded_target_ids[start_idx:end_idx, :] + logits = F.linear(features, weight[vocab_ids, :]) + log_softmax = F.log_softmax(logits, dim=-1, dtype=torch.float32) + intermed_scores = torch.gather( + log_softmax[:, :-1, :], + 2, + expanded_target_ids[:, 1:].unsqueeze(2), + ).squeeze() + not_padding = expanded_target_ids[:, 1:] != pad_idx + intermed_scores *= not_padding.float() + return intermed_scores + else: + raise ValueError("adaptive softmax doesn't work with " + + "`normalized_scores_with_batch_vocab()`") diff --git a/examples/fast_noisy_channel/noisy_channel_translation.py b/examples/fast_noisy_channel/noisy_channel_translation.py new file mode 100644 index 0000000000..b74bdfd456 --- /dev/null +++ b/examples/fast_noisy_channel/noisy_channel_translation.py @@ -0,0 +1,127 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.tasks.translation import TranslationTask +from fairseq.tasks.language_modeling import LanguageModelingTask +from fairseq import checkpoint_utils +import argparse +from fairseq.tasks import register_task +import torch + + +@register_task("noisy_channel_translation") +class NoisyChannelTranslation(TranslationTask): + """ + Rescore the top k candidates from each beam using noisy channel modeling + """ + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + TranslationTask.add_args(parser) + # fmt: off + parser.add_argument('--channel-model', metavar='FILE', + help='path to P(S|T) model. P(S|T) and P(T|S) must share source and target dictionaries.') + parser.add_argument('--combine-method', default='lm_only', + choices=['lm_only', 'noisy_channel'], + help="""method for combining direct and channel model scores. 
+ lm_only: decode with P(T|S)P(T) + noisy_channel: decode with 1/t P(T|S) + 1/s(P(S|T)P(T))""") + parser.add_argument('--normalize-lm-scores-by-tgt-len', action='store_true', default=False, + help='normalize lm score by target length instead of source length') + parser.add_argument('--channel-scoring-type', default='log_norm', choices=['unnormalized', 'log_norm', 'k2_separate', 'src_vocab', 'src_vocab_batched'], + help="Normalize bw scores with log softmax or return bw scores without log softmax") + parser.add_argument('--top-k-vocab', default=0, type=int, + help='top k vocab IDs to use with `src_vocab` in channel model scoring') + parser.add_argument('--k2', default=50, type=int, + help='the top k2 candidates to rescore with the noisy channel model for each beam') + parser.add_argument('--ch-wt', default=1, type=float, + help='weight for the channel model') + parser.add_argument('--lm-model', metavar='FILE', + help='path to lm model file, to model P(T). P(T) must share the same vocab as the direct model on the target side') + parser.add_argument('--lm-data', metavar='FILE', + help='path to lm model training data for target language, used to properly load LM with correct dictionary') + parser.add_argument('--lm-wt', default=1, type=float, + help='the weight of the lm in joint decoding') + # fmt: on + + def build_generator( + self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None + ): + if getattr(args, "score_reference", False): + raise NotImplementedError() + else: + from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator + use_cuda = torch.cuda.is_available() and not self.args.cpu + assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!' + assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs' + if self.args.channel_model is not None: + import copy + ch_args_task = copy.deepcopy(self.args) + tmp = ch_args_task.source_lang + ch_args_task.source_lang = ch_args_task.target_lang + ch_args_task.target_lang = tmp + ch_args_task._name = 'translation' + channel_task = TranslationTask.setup_task(ch_args_task) + + arg_dict = {} + arg_dict['task'] = 'language_modeling' + arg_dict['sample_break_mode'] = 'eos' + arg_dict['data'] = self.args.lm_data + arg_dict['output_dictionary_size'] = -1 + lm_args = argparse.Namespace(**arg_dict) + lm_task = LanguageModelingTask.setup_task(lm_args) + lm_dict = lm_task.output_dictionary + + if self.args.channel_model is not None: + channel_models, _ = checkpoint_utils.load_model_ensemble(self.args.channel_model.split(':'), task=channel_task) + + for model in channel_models: + model.make_generation_fast_( + beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, + need_attn=args.print_alignment, + ) + if self.args.fp16: + model.half() + if use_cuda: + model.cuda() + else: + channel_models = None + + lm_models, _ = checkpoint_utils.load_model_ensemble(self.args.lm_model.split(':'), task=lm_task) + + for model in lm_models: + model.make_generation_fast_( + beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, + need_attn=args.print_alignment, + ) + if self.args.fp16: + model.half() + if use_cuda: + model.cuda() + return NoisyChannelSequenceGenerator( + combine_method=self.args.combine_method, + tgt_dict=self.target_dictionary, + src_dict=self.source_dictionary, + beam_size=getattr(args, 'beam', 5), + max_len_a=getattr(args, 'max_len_a', 0), + max_len_b=getattr(args, 'max_len_b', 200), + min_len=getattr(args, 
'min_len', 1), + len_penalty=getattr(args, 'lenpen', 1), + unk_penalty=getattr(args, 'unkpen', 0), + temperature=getattr(args, 'temperature', 1.), + match_source_len=getattr(args, 'match_source_len', False), + no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0), + normalize_scores=(not getattr(args, 'unnormalized', False)), + channel_models=channel_models, + k2=getattr(self.args, 'k2', 50), + ch_weight=getattr(self.args, 'ch_wt', 1), + channel_scoring_type=self.args.channel_scoring_type, + top_k_vocab=self.args.top_k_vocab, + lm_models=lm_models, + lm_dict=lm_dict, + lm_weight=getattr(self.args, 'lm_wt', 1), + normalize_lm_scores_by_tgt_len=getattr(self.args, 'normalize_lm_scores_by_tgt_len', False), + ) diff --git a/examples/flores101/README.md b/examples/flores101/README.md new file mode 100644 index 0000000000..635c13f40b --- /dev/null +++ b/examples/flores101/README.md @@ -0,0 +1,223 @@ +

+ +

+ +# Flores101: Large-Scale Multilingual Machine Translation + +## Introduction + +Baseline pretrained models for small and large tracks of WMT 21 Large-Scale Multilingual Machine Translation competition. + +Flores Task at WMT 21: http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html + +Flores announement blog post: https://ai.facebook.com/blog/flores-researchers-kick-off-multilingual-translation-challenge-at-wmt-and-call-for-compute-grants/ + + + +## Pretrained models + +Model | Num layers | Embed dimension | FFN dimension| Vocab Size | #params | Download +---|---|---|---|---|---|--- +`flores101_mm100_615M` | 12 | 1024 | 4096 | 256,000 | 615M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz +`flores101_mm100_175M` | 6 | 512 | 2048 | 256,000 | 175M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz + + +These models are trained similar to [M2M-100](https://arxiv.org/abs/2010.11125) with additional support for the languages that are part of the WMT Large-Scale Multilingual Machine Translation track. Full list of languages can be found at the bottom. + + +## Example Generation code + +### Download model, sentencepiece vocab + +```bash +fairseq=/path/to/fairseq +cd $fairseq + +# Download 615M param model. +wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz + +# Extract +tar -xvzf flores101_mm100_615M.tar.gz +``` + +### Encode using our SentencePiece Model +Note: Install SentencePiece from [here](https://github.com/google/sentencepiece) + + +```bash +fairseq=/path/to/fairseq +cd $fairseq + +# Download example dataset From German to French +sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de +sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr + +for lang in de fr ; do + python scripts/spm_encode.py \ + --model flores101_mm100_615M/sentencepiece.bpe.model \ + --output_format=piece \ + --inputs=raw_input.de-fr.${lang} \ + --outputs=spm.de-fr.${lang} +done +``` + +### Binarization + +```bash +fairseq-preprocess \ + --source-lang de --target-lang fr \ + --testpref spm.de-fr \ + --thresholdsrc 0 --thresholdtgt 0 \ + --destdir data_bin \ + --srcdict flores101_mm100_615M/dict.txt --tgtdict flores101_mm100_615M/dict.txt +``` + +### Generation + + +```bash +fairseq-generate \ + data_bin \ + --batch-size 1 \ + --path flores101_mm100_615M/model.pt \ + --fixed-dictionary flores101_mm100_615M/dict.txt \ + -s de -t fr \ + --remove-bpe 'sentencepiece' \ + --beam 5 \ + --task translation_multi_simple_epoch \ + --lang-pairs flores101_mm100_615M/language_pairs.txt \ + --decoder-langtok --encoder-langtok src \ + --gen-subset test \ + --fp16 \ + --dataset-impl mmap \ + --distributed-world-size 1 --distributed-no-spawn +``` + +### Supported Languages and lang code + +Language | lang code +---|--- +Akrikaans | af +Amharic | am +Arabic | ar +Assamese | as +Asturian | ast +Aymara | ay +Azerbaijani | az +Bashkir | ba +Belarusian | be +Bulgarian | bg +Bengali | bn +Breton | br +Bosnian | bs +Catalan | ca +Cebuano | ceb +Chokwe | cjk +Czech | cs +Welsh | cy +Danish | da +German | de +Dyula| dyu +Greek | el +English | en +Spanish | es +Estonian | et +Persian | fa +Fulah | ff +Finnish | fi +French | fr +Western Frisian | fy +Irish | ga +Scottish Gaelic | gd +Galician | gl +Gujarati | gu +Hausa | ha +Hebrew | he +Hindi | hi +Croatian | hr +Haitian Creole | ht +Hungarian | hu +Armenian | hy +Indonesian | id +Igbo | ig +Iloko | ilo +Icelandic | 
is +Italian | it +Japanese | ja +Javanese | jv +Georgian | ka +Kachin | kac +Kamba | kam +Kabuverdianu | kea +Kongo | kg +Kazakh | kk +Central Khmer | km +Kimbundu | kmb +Northern Kurdish | kmr +Kannada | kn +Korean | ko +Kurdish | ku +Kyrgyz | ky +Luxembourgish | lb +Ganda | lg +Lingala | ln +Lao | lo +Lithuanian | lt +Luo | luo +Latvian | lv +Malagasy | mg +Maori | mi +Macedonian | mk +Malayalam | ml +Mongolian | mn +Marathi | mr +Malay | ms +Maltese | mt +Burmese | my +Nepali | ne +Dutch | nl +Norwegian | no +Northern Sotho | ns +Nyanja | ny +Occitan | oc +Oromo | om +Oriya | or +Punjabi | pa +Polish | pl +Pashto | ps +Portuguese | pt +Quechua | qu +Romanian | ro +Russian | ru +Sindhi | sd +Shan | shn +Sinhala | si +Slovak | sk +Slovenian | sl +Shona | sn +Somali | so +Albanian | sq +Serbian | sr +Swati | ss +Sundanese | su +Swedish | sv +Swahili | sw +Tamil | ta +Telugu | te +Tajik | tg +Thai | th +Tigrinya | ti +Tagalog | tl +Tswana | tn +Turkish | tr +Ukrainian | uk +Umbundu | umb +Urdu | ur +Uzbek | uz +Vietnamese | vi +Wolof | wo +Xhosa | xh +Yiddish | yi +Yoruba | yo +Chinese| zh +Zulu | zu diff --git a/examples/flores101/flores_logo.png b/examples/flores101/flores_logo.png new file mode 100644 index 0000000000..d4d1455c6e Binary files /dev/null and b/examples/flores101/flores_logo.png differ diff --git a/examples/fully_sharded_data_parallel/README.md b/examples/fully_sharded_data_parallel/README.md new file mode 100644 index 0000000000..b9e44fef48 --- /dev/null +++ b/examples/fully_sharded_data_parallel/README.md @@ -0,0 +1,177 @@ +# Fully Sharded Data Parallel (FSDP) + +## Overview +Recent work by [Microsoft](https://arxiv.org/abs/1910.02054) and +[Google](https://arxiv.org/abs/2004.13336) has shown that data parallel +training can be made significantly more efficient by sharding the model +parameters and optimizer state across data parallel workers. These ideas are +encapsulated in the new **`FullyShardedDataParallel` (FSDP)** wrapper provided +by [fairscale](https://github.com/facebookresearch/fairscale/). + +Compared to PyTorch DDP: +* FSDP produces identical results as PyTorch DDP (it's still synchronous data parallel training) +* FSDP shards parameters (FP16 + FP32) and optimizer state across data parallel GPUs +* FSDP is faster than PyTorch DDP because the optimizer step is sharded, and the communication can be overlapped with the forward pass +* FSDP enables training 13B parameter models on 8 GPUs and 175B parameter models on 128 GPUs + +FSDP is fully supported in fairseq via the following new arguments: +* `--ddp-backend=fully_sharded`: enables full sharding via FSDP +* `--cpu-offload`: offloads the optimizer state and FP32 model copy to CPU (combine with `--optimizer=cpu_adam`) +* `--no-reshard-after-forward`: increases training speed for large models (1B+ params) and is similar to ZeRO stage 2 +* other popular options (`--fp16`, `--update-freq`, `--checkpoint-activations`, `--offload-activations`, etc.) continue to work as normal + +
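As a rough mental model of what `--cpu-offload` combined with `--optimizer=cpu_adam` buys you (an illustrative sketch, not fairseq's actual code path; the tensor shapes and names below are made up for the example): the FP16 parameters and gradients stay on the GPU, while the FP32 master copy and the Adam state live on CPU, where the optimizer step runs.

```python
# Illustrative sketch only (NOT fairseq's implementation) of CPU-offloaded
# optimization: FP16 weights/grads on GPU, FP32 master copy + Adam state on CPU.
import torch

assert torch.cuda.is_available()  # the sketch assumes a CUDA device

gpu_param = torch.nn.Parameter(torch.zeros(1024, 1024, device="cuda", dtype=torch.float16))
cpu_master = gpu_param.detach().float().cpu().requires_grad_()  # FP32 copy on CPU
opt = torch.optim.Adam([cpu_master], lr=1e-4)                   # Adam state allocated on CPU

def offloaded_step():
    # assumes a backward pass has populated gpu_param.grad
    cpu_master.grad = gpu_param.grad.detach().float().cpu()  # move grad to CPU as FP32
    opt.step()                                               # optimizer step runs on CPU
    opt.zero_grad(set_to_none=True)
    with torch.no_grad():
        gpu_param.copy_(cpu_master.half().to("cuda"))        # refresh the FP16 GPU copy
```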
+## Limitations

+ +FSDP currently has several limitations compared to fairseq's default DDP backend (PyTorch DDP): +* while FSDP is fully compatible with pointwise optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.), it is not currently compatible with non-pointwise optimizers (e.g., Adagrad, Adafactor, LAMB, etc.) +* FSDP depends on flattening the parameters, so models that currently require `--fp16-no-flatten-grads` may not be supported + +See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed +explanation of these and other limitations. + +

+ +
+## How it works

+ +*(Figure: Fully Sharded Data Parallel)* + +See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed +explanation of how FSDP works. +
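For orientation, here is a minimal sketch of the wrapper that `--ddp-backend=fully_sharded` selects. It is not fairseq's internal wrapping logic, and it assumes fairscale is installed and `torch.distributed.init_process_group()` has already been called; keyword argument names may differ slightly across fairscale versions.

```python
# Minimal sketch: wrap a module with fairscale's FullyShardedDataParallel so its
# parameters, gradients and optimizer state are sharded across the process group.
# Not fairseq's internal code; assumes torch.distributed is already initialized.
import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

def shard_model(module: torch.nn.Module) -> FSDP:
    # flatten_parameters mirrors the parameter flattening noted in the
    # limitations above; reshard_after_forward=False roughly corresponds to
    # `--no-reshard-after-forward` (names are version-dependent assumptions).
    return FSDP(module, flatten_parameters=True, reshard_after_forward=True)

# usage: sharded = shard_model(torch.nn.Linear(1024, 1024))
```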

+ +## Example usage + +The following examples illustrate how to train a very large language model with +13 billion parameters on 1 GPU by offloading parameters and optimizer states to +CPU, or on 8 GPUs by fully sharding the params and optimizer states across GPUs. + +These examples use the WikiText-103 dataset for demonstration purposes, but +in practice a much larger dataset will be needed to achieve good results. +Follow the [instructions here](https://github.com/pytorch/fairseq/blob/main/examples/roberta/README.pretraining.md#1-preprocess-the-data) +to preprocess the WikiText-103 dataset using the GPT-2/RoBERTa vocabulary. + +### 13B params on 1 V100 GPU (with CPU offloading) + +The following command trains a 13B parameter GPT-3 model on a single V100 GPU +using the `--cpu-offload` feature to offload parameters and optimizer states to +CPU. In this setting, the optimizer step (Adam) happens on CPU. We also use the +`--checkpoint-activations` feature (sometimes called [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html)), +which further saves memory in exchange for a small increase in computation. + +**Requirements:** +- Install the latest master version of fairscale: `pip install git+https://github.com/facebookresearch/fairscale.git@master` +- You'll need 32GB of GPU memory and ~256GB of system memory to train the 13B param model. +- If you have less system memory, the 6.7B param model can be trained with ~128GB of system memory, just set `--arch transformer_lm_gpt3_6_7` +- We use the CPU Adam optimizer from [DeepSpeed](https://github.com/microsoft/DeepSpeed), so you'll need to `pip install deepspeed` before running the command. + +**Notes:** +- The command will take ~5 minutes to start training, during which time it will appear to be hung, since randomly initializing 13B weights can be slow. +- The `--cpu-offload` feature requires training in mixed precision (`--fp16`). +- Tune the `OMP_NUM_THREADS` env variable for best performance with CPU offloading. +- The example command below stops training after 10 steps (`--max-update 10`) and does not save checkpoints (`--no-save`). + +```bash +OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0 \ + fairseq-train data-bin/wikitext-103-roberta-bpe-bin \ + --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ + --cpu-offload --checkpoint-activations \ + --task language_modeling --tokens-per-sample 2048 --batch-size 8 \ + --arch transformer_lm_gpt3_13 \ + --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ + --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ + --max-update 10 --no-save --log-format json --log-interval 1 +``` + +
+**Example output:**

+ +``` +(...) +2021-03-08 12:29:51 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920) +(...) +2021-03-08 12:29:51 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs) +2021-03-08 12:29:51 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8 +(...) +Adam Optimizer #0 is created with AVX2 arithmetic capability. +Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1 +(...) +2021-03-08 12:31:36 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.475", "ppl": "91120.8", "wps": "0", "ups": "0", "wpb": "16384", "bsz": "8", "num_updates": "1", "lr": "2e-05", "gnorm": "20.751", "loss_scale": "4", "train_wall": "99", "gb_free": "9.3", "wall": "105"} +2021-03-08 12:32:33 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.446", "ppl": "89281.6", "wps": "288.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "2", "lr": "4e-05", "gnorm": "19.777", "loss_scale": "4", "train_wall": "57", "gb_free": "9.3", "wall": "161"} +2021-03-08 12:33:12 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0 +2021-03-08 12:33:51 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0 +2021-03-08 12:34:45 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "25.22", "ppl": "3.90691e+07", "wps": "123.4", "ups": "0.01", "wpb": "16384", "bsz": "8", "num_updates": "3", "lr": "6e-05", "gnorm": "131.281", "loss_scale": "1", "train_wall": "133", "gb_free": "9.3", "wall": "294"} +2021-03-08 12:35:43 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.079", "ppl": "276809", "wps": "285.5", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "4", "lr": "8e-05", "gnorm": "13.776", "loss_scale": "1", "train_wall": "57", "gb_free": "9.3", "wall": "351"} +2021-03-08 12:36:35 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "23.729", "ppl": "1.39088e+07", "wps": "316.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "5", "lr": "0.0001", "gnorm": "72.774", "loss_scale": "1", "train_wall": "52", "gb_free": "9.3", "wall": "403"} +2021-03-08 12:37:28 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "20.429", "ppl": "1.41203e+06", "wps": "307.6", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "6", "lr": "8e-05", "gnorm": "60.846", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "456"} +2021-03-08 12:38:27 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.965", "ppl": "511684", "wps": "279.4", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "7", "lr": "6e-05", "gnorm": "22.687", "loss_scale": "1", "train_wall": "59", "gb_free": "9.3", "wall": "515"} +2021-03-08 12:39:18 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.345", "ppl": "332887", "wps": "319.1", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "8", "lr": "4e-05", "gnorm": "8.451", "loss_scale": "1", "train_wall": "51", "gb_free": "9.3", "wall": "566"} +2021-03-08 12:40:11 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "18.262", "ppl": "314336", "wps": "305.9", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "9", "lr": "2e-05", "gnorm": "6.457", "loss_scale": "1", "train_wall": "54", "gb_free": "9.3", "wall": "620"} +2021-03-08 12:41:04 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "17.556", "ppl": "192686", "wps": "311.8", "ups": "0.02", "wpb": "16384", 
"bsz": "8", "num_updates": "10", "lr": "0", "gnorm": "5.796", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "673"} +2021-03-08 12:41:04 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10 +2021-03-08 12:41:04 | INFO | fairseq_cli.train | begin validation on "valid" subset +2021-03-08 12:43:15 | INFO | valid | {"epoch": 1, "valid_loss": "17.953", "valid_ppl": "253807", "valid_wps": "1868.4", "valid_wpb": "15400.2", "valid_bsz": "7.6", "valid_num_updates": "10"} +2021-03-08 12:43:15 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below) +2021-03-08 12:43:15 | INFO | train | {"epoch": 1, "train_loss": "19.351", "train_ppl": "668509", "train_wps": "210.9", "train_ups": "0.01", "train_wpb": "16384", "train_bsz": "8", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "36.26", "train_loss_scale": "1", "train_train_wall": "667", "train_gb_free": "9.3", "train_wall": "804"} +2021-03-08 12:43:15 | INFO | fairseq_cli.train | done training in 798.6 seconds +``` + +

+ +### 13B params on 8 V100 GPUs (with full parameter + optimizer state sharding) + +FSDP can also shard the parameters and optimizer states across multiple GPUs, +reducing memory requirements significantly. On 8 x 32GB GPUs, sharding enables +training the same 13B parameter model *without offloading the parameters to +CPU*. However, without CPU offloading we'd only be able to fit a batch size of +1 per GPU, which would cause training speed to suffer. + +We obtain the best performance on 8 GPUs by combining full sharding and CPU +offloading. The following command trains the same 13B parameter GPT-3 model as +before on 8 x 32GB V100 GPUs; training speed increases superlinearly from ~310 +words per second to ~3200 words per second. + +```bash +OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + fairseq-train data-bin/wikitext-103-roberta-bpe-bin \ + --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ + --cpu-offload --checkpoint-activations \ + --task language_modeling --tokens-per-sample 2048 --batch-size 8 \ + --arch transformer_lm_gpt3_13 \ + --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ + --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ + --max-update 10 --no-save --log-format json --log-interval 1 +``` + +
+**Example output:**

+ +``` +(...) +2021-03-08 18:04:09 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920) +(...) +2021-03-08 18:04:09 | INFO | fairseq_cli.train | training on 8 devices (GPUs/TPUs) +2021-03-08 18:04:09 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8 +(...) +Adam Optimizer #0 is created with AVX2 arithmetic capability. +Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1 +(...) +2021-03-08 18:05:06 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "16.408", "ppl": "86945.6", "wps": "0", "ups": "0", "wpb": "131072", "bsz": "64", "num_updates": "1", "lr": "2e-05", "gnorm": "18.27", "loss_scale": "4", "train_wall": "47", "gb_free": "9.3", "wall": "56"} +2021-03-08 18:05:45 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "16.352", "ppl": "83644.3", "wps": "3283.4", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "2", "lr": "4e-05", "gnorm": "18.411", "loss_scale": "4", "train_wall": "40", "gb_free": "9.3", "wall": "96"} +2021-03-08 18:06:21 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0 +2021-03-08 18:06:56 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0 +2021-03-08 18:07:37 | INFO | train_inner | {"epoch": 1, "update": 0.006, "loss": "23.682", "ppl": "1.34537e+07", "wps": "1176.6", "ups": "0.01", "wpb": "131072", "bsz": "64", "num_updates": "3", "lr": "6e-05", "gnorm": "119.682", "loss_scale": "1", "train_wall": "111", "gb_free": "9.3", "wall": "208"} +2021-03-08 18:08:18 | INFO | train_inner | {"epoch": 1, "update": 0.007, "loss": "18.988", "ppl": "519921", "wps": "3189.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "4", "lr": "8e-05", "gnorm": "14.934", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "249"} +2021-03-08 18:08:59 | INFO | train_inner | {"epoch": 1, "update": 0.008, "loss": "20.08", "ppl": "1.10798e+06", "wps": "3223.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "5", "lr": "0.0001", "gnorm": "59.92", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "289"} +2021-03-08 18:09:39 | INFO | train_inner | {"epoch": 1, "update": 0.009, "loss": "18.323", "ppl": "327980", "wps": "3256.6", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "6", "lr": "8e-05", "gnorm": "37.425", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "330"} +2021-03-08 18:10:20 | INFO | train_inner | {"epoch": 1, "update": 0.01, "loss": "17.264", "ppl": "157354", "wps": "3188.7", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "7", "lr": "6e-05", "gnorm": "10.824", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "371"} +2021-03-08 18:11:01 | INFO | train_inner | {"epoch": 1, "update": 0.011, "loss": "16.794", "ppl": "113647", "wps": "3230", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "8", "lr": "4e-05", "gnorm": "5.616", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "411"} +2021-03-08 18:11:39 | INFO | train_inner | {"epoch": 1, "update": 0.012, "loss": "16.706", "ppl": "106938", "wps": "3384", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "9", "lr": "2e-05", "gnorm": "5.318", "loss_scale": "1", "train_wall": "39", "gb_free": "9.3", "wall": "450"} +2021-03-08 18:12:19 | INFO | train_inner | {"epoch": 1, "update": 0.013, "loss": "16.548", "ppl": "95796.2", "wps": "3274.4", "ups": "0.02", 
"wpb": "131072", "bsz": "64", "num_updates": "10", "lr": "0", "gnorm": "5.22", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "490"} +2021-03-08 18:12:19 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10 +2021-03-08 18:12:19 | INFO | fairseq_cli.train | begin validation on "valid" subset +2021-03-08 18:12:45 | INFO | valid | {"epoch": 1, "valid_loss": "16.624", "valid_ppl": "101000", "valid_wps": "10855.9", "valid_wpb": "123202", "valid_bsz": "60.5", "valid_num_updates": "10"} +2021-03-08 18:12:45 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below) +2021-03-08 18:12:45 | INFO | train | {"epoch": 1, "train_loss": "18.114", "train_ppl": "283776", "train_wps": "2567.8", "train_ups": "0.02", "train_wpb": "131072", "train_bsz": "64", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "29.562", "train_loss_scale": "1", "train_train_wall": "480", "train_gb_free": "9.3", "train_wall": "516"} +2021-03-08 18:12:45 | INFO | fairseq_cli.train | done training in 509.9 seconds +``` + +

diff --git a/examples/gottbert/README.md b/examples/gottbert/README.md new file mode 100644 index 0000000000..1d58feb279 --- /dev/null +++ b/examples/gottbert/README.md @@ -0,0 +1,64 @@ +# GottBERT: a pure German language model + +## Introduction + +[GottBERT](http://arxiv.org/abs/2012.02110) is a pretrained language model trained on 145GB of German text based on RoBERTa. + +## Example usage + +### fairseq +##### Load GottBERT from torch.hub (PyTorch >= 1.1): +```python +import torch +gottbert = torch.hub.load('pytorch/fairseq', 'gottbert-base') +gottbert.eval() # disable dropout (or leave in train mode to finetune) +``` + +##### Load GottBERT (for PyTorch 1.0 or custom models): +```python +# Download gottbert model +wget https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz +tar -xzvf gottbert.tar.gz + +# Load the model in fairseq +from fairseq.models.roberta import GottbertModel +gottbert = GottbertModel.from_pretrained('/path/to/gottbert') +gottbert.eval() # disable dropout (or leave in train mode to finetune) +``` + +##### Filling masks: +```python +masked_line = 'Gott ist ! :)' +gottbert.fill_mask(masked_line, topk=3) +# [('Gott ist gut ! :)', 0.3642110526561737, ' gut'), +# ('Gott ist überall ! :)', 0.06009674072265625, ' überall'), +# ('Gott ist großartig ! :)', 0.0370681993663311, ' großartig')] +``` + +##### Extract features from GottBERT + +```python +# Extract the last layer's features +line = "Der erste Schluck aus dem Becher der Naturwissenschaft macht atheistisch , aber auf dem Grunde des Bechers wartet Gott !" +tokens = gottbert.encode(line) +last_layer_features = gottbert.extract_features(tokens) +assert last_layer_features.size() == torch.Size([1, 27, 768]) + +# Extract all layer's features (layer 0 is the embedding layer) +all_layers = gottbert.extract_features(tokens, return_all_hiddens=True) +assert len(all_layers) == 13 +assert torch.all(all_layers[-1] == last_layer_features) +``` +## Citation +If you use our work, please cite: + +```bibtex +@misc{scheible2020gottbert, + title={GottBERT: a pure German Language Model}, + author={Raphael Scheible and Fabian Thomczyk and Patric Tippmann and Victor Jaravine and Martin Boeker}, + year={2020}, + eprint={2012.02110}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/examples/hubert/README.md b/examples/hubert/README.md new file mode 100644 index 0000000000..6695d81971 --- /dev/null +++ b/examples/hubert/README.md @@ -0,0 +1,116 @@ +# HuBERT + +## Pre-trained and fine-tuned (ASR) models +Model | Pretraining Data | Finetuning Dataset | Model | Quantizer +|---|---|---|---|--- +HuBERT Base (~95M params) | [Librispeech](http://www.openslr.org/12) 960 hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | [L9 km500](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin) +HuBERT Large (~316M params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt) +HuBERT Extra Large (~1B params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt) +HuBERT Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt) +HuBERT Extra Large | 
[Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt) + +## Load a model +``` +import fairseq + +ckpt_path = "/path/to/the/checkpoint.pt" +models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path]) +model = models[0] +``` + +## Train a new model + +### Data preparation + +Follow the steps in `./simple_kmeans` to create: +- `{train,valid}.tsv` waveform list files +- `{train,valid}.km` frame-aligned pseudo label files +- `dict.km.txt` a dummy dictionary + +The `label_rate` is the same as the feature frame rate used for clustering, +which is 100Hz for MFCC features and 50Hz for HuBERT features by default. + +### Pre-train a HuBERT model + +Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km` +are saved at `/path/to/labels`, and the label rate is 100Hz. + +To train a base model (12-layer transformer), run: +```sh +$ python fairseq_cli/hydra_train.py \ + --config-dir /path/to/fairseq-py/examples/hubert/config/pretrain \ + --config-name hubert_base_librispeech \ + task.data=/path/to/data task.label_dir=/path/to/labels task.labels='["km"]' model.label_rate=100 +``` + +### Fine-tune a HuBERT model with a CTC loss + +Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their +corresponding character transcripts `{train,valid}.ltr` are saved at +`/path/to/trans`. + +To fine-tune a pre-trained HuBERT model at `/path/to/checkpoint`, run +```sh +$ python fairseq_cli/hydra_train.py \ + --config-dir /path/to/fairseq-py/examples/hubert/config/finetune \ + --config-name base_10h \ + task.data=/path/to/data task.label_dir=/path/to/trans \ + model.w2v_path=/path/to/checkpoint +``` + +### Decode a HuBERT model + +Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of +the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is +saved at `/path/to/checkpoint`. We support three decoding modes: +- Viterbi decoding: greedy decoding without a language model +- KenLM decoding: decoding with an arpa-format KenLM n-gram language model +- Fairseq-LM decoding: decoding with a Fairseq neural language model + + +#### Viterbi decoding + +`task.normalize` needs to be consistent with the value used during fine-tuning. +Decoding results will be saved at +`/path/to/experiment/directory/decode/viterbi/test`. + +```sh +$ python examples/speech_recognition/new/infer.py \ + --config-dir /path/to/fairseq-py/examples/hubert/config/decode \ + --config-name infer_viterbi \ + task.data=/path/to/data \ + task.normalize=[true|false] \ + decoding.exp_dir=/path/to/experiment/directory \ + common_eval.path=/path/to/checkpoint \ + dataset.gen_subset=test +``` + +#### KenLM / Fairseq-LM decoding + +Suppose the pronunciation lexicon and the n-gram LM are saved at +`/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be +saved at `/path/to/experiment/directory/decode/kenlm/test`. 
+ +```sh +$ python examples/speech_recognition/new/infer.py \ + --config-dir /path/to/fairseq-py/examples/hubert/config/decode \ + --config-name infer_kenlm \ + task.data=/path/to/data \ + task.normalize=[true|false] \ + decoding.exp_dir=/path/to/experiment/directory \ + common_eval.path=/path/to/checkpoint + dataset.gen_subset=test \ + decoding.decoder.lexicon=/path/to/lexicon \ + decoding.decoder.lmpath=/path/to/arpa +``` + +The command above uses the default decoding hyperparameter, which can be found +in `examples/speech_recognition/hydra/decoder.py`. These parameters can be +configured from the command line. For example, to search with a beam size of +500, we can append the command above with `decoding.decoder.beam=500`. +Important parameters include: +- decoding.decoder.beam +- decoding.decoder.beamthreshold +- decoding.decoder.lmweight +- decoding.decoder.wordscore +- decoding.decoder.silweight + +To decode with a Fairseq LM, use `--config-name infer_fsqlm` instead, and +change the path of lexicon and LM accordingly. diff --git a/examples/hubert/config/decode/ax_sweep/ngram.yaml b/examples/hubert/config/decode/ax_sweep/ngram.yaml new file mode 100644 index 0000000000..5a02df1f7d --- /dev/null +++ b/examples/hubert/config/decode/ax_sweep/ngram.yaml @@ -0,0 +1,33 @@ +# @package _global_ + +common_eval: + results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset} + +hydra: + sweeper: + ax_config: + max_trials: 60 + early_stop: + minimize: true + max_epochs_without_improvement: 10 + epsilon: 0.025 + experiment: + name: ${dataset.gen_subset} + objective_name: wer + minimize: true + parameter_constraints: null + outcome_constraints: null + status_quo: null + client: + verbose_logging: false + random_seed: null + params: + decoding.decoder.lmweight: + type: range + bounds: [0.0, 8.0] + decoding.decoder.wordscore: + type: range + bounds: [-5.0, 5.0] + decoding.decoder.silweight: + type: range + bounds: [-10.0, 0.0] diff --git a/examples/hubert/config/decode/ax_sweep/transformer.yaml b/examples/hubert/config/decode/ax_sweep/transformer.yaml new file mode 100644 index 0000000000..85ed3bd1a5 --- /dev/null +++ b/examples/hubert/config/decode/ax_sweep/transformer.yaml @@ -0,0 +1,33 @@ +# @package _global_ + +common_eval: + results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset} + +hydra: + sweeper: + ax_config: + max_trials: 60 + early_stop: + minimize: true + max_epochs_without_improvement: 10 + epsilon: 0.025 + experiment: + name: ${dataset.gen_subset} + objective_name: wer + minimize: true + parameter_constraints: null + outcome_constraints: null + status_quo: null + client: + verbose_logging: false + random_seed: null + params: + decoding.decoder.lmweight: + type: range + bounds: [0.0, 4.0] + decoding.decoder.wordscore: + type: range + bounds: [-5.0, 5.0] + decoding.decoder.silweight: + type: range + bounds: [-8.0, 0.0] diff --git a/examples/hubert/config/decode/infer_fsqlm.yaml b/examples/hubert/config/decode/infer_fsqlm.yaml new file mode 100644 index 0000000000..026ad8db89 --- /dev/null +++ b/examples/hubert/config/decode/infer_fsqlm.yaml @@ -0,0 +1,36 @@ +# @package _group_ + +defaults: + - model: null + +hydra: + run: + dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} + sweep: + dir: ${common_eval.results_path} + subdir: 
beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} + +task: + _name: hubert_pretraining + single_target: true + fine_tuning: true + data: ??? + normalize: ??? + +decoding: + type: fairseqlm + lexicon: ??? + lmpath: ??? + beamthreshold: 25 + beam: 500 + lmweight: 2 + wordscore: -1 + silweight: 0 + unique_wer_file: true +common_eval: + results_path: ??? + path: ??? + post_process: letter +dataset: + max_tokens: 1100000 + gen_subset: ??? diff --git a/examples/hubert/config/decode/infer_kenlm.yaml b/examples/hubert/config/decode/infer_kenlm.yaml new file mode 100644 index 0000000000..04642aeb65 --- /dev/null +++ b/examples/hubert/config/decode/infer_kenlm.yaml @@ -0,0 +1,36 @@ +# @package _group_ + +defaults: + - model: null + +hydra: + run: + dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} + sweep: + dir: ${common_eval.results_path} + subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} + +task: + _name: hubert_pretraining + single_target: true + fine_tuning: true + data: ??? + normalize: ??? + +decoding: + type: kenlm + lexicon: ??? + lmpath: ??? + beamthreshold: 100 + beam: 500 + lmweight: 2 + wordscore: -1 + silweight: 0 + unique_wer_file: true +common_eval: + results_path: ??? + path: ??? + post_process: letter +dataset: + max_tokens: 1100000 + gen_subset: ??? diff --git a/examples/hubert/config/decode/infer_viterbi.yaml b/examples/hubert/config/decode/infer_viterbi.yaml new file mode 100644 index 0000000000..4afc74c18c --- /dev/null +++ b/examples/hubert/config/decode/infer_viterbi.yaml @@ -0,0 +1,29 @@ +# @package _group_ + +defaults: + - model: null + +hydra: + run: + dir: ${common_eval.results_path}/viterbi + sweep: + dir: ${common_eval.results_path} + subdir: viterbi + +task: + _name: hubert_pretraining + single_target: true + fine_tuning: true + data: ??? + normalize: ??? + +decoding: + type: viterbi + unique_wer_file: true +common_eval: + results_path: ??? + path: ??? + post_process: letter +dataset: + max_tokens: 1100000 + gen_subset: ??? 
diff --git a/examples/hubert/config/decode/run/submitit_slurm.yaml b/examples/hubert/config/decode/run/submitit_slurm.yaml new file mode 100644 index 0000000000..0b8065832e --- /dev/null +++ b/examples/hubert/config/decode/run/submitit_slurm.yaml @@ -0,0 +1,17 @@ +# @package _global_ +hydra: + launcher: + cpus_per_task: ${distributed_training.distributed_world_size} + gpus_per_node: ${distributed_training.distributed_world_size} + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 1 + mem_gb: 200 + timeout_min: 4320 + max_num_timeout: 50 + name: ${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir}/submitit + +distributed_training: + distributed_world_size: 1 + distributed_no_spawn: true + distributed_port: 29761 diff --git a/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml b/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml new file mode 100644 index 0000000000..2f669f3763 --- /dev/null +++ b/examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml @@ -0,0 +1,17 @@ +# @package _global_ +hydra: + launcher: + cpus_per_task: ${distributed_training.distributed_world_size} + gpus_per_node: ${distributed_training.distributed_world_size} + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 1 + mem_gb: 200 + timeout_min: 4320 + max_num_timeout: 50 + name: ${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir}/submitit + +distributed_training: + distributed_world_size: 8 + distributed_no_spawn: true + distributed_port: 29761 diff --git a/examples/hubert/config/finetune/base_10h.yaml b/examples/hubert/config/finetune/base_10h.yaml new file mode 100644 index 0000000000..a22c7c0347 --- /dev/null +++ b/examples/hubert/config/finetune/base_10h.yaml @@ -0,0 +1,100 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + save_interval: 5 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 1 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + normalize: false # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 3200000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 5 + train_subset: train + valid_subset: valid + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 25000 + lr: [2e-5] + sentence_avg: true + update_freq: [1] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: hubert_ctc + w2v_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.w2v_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? 
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/hubert/config/finetune/ckpt/it1.yaml b/examples/hubert/config/finetune/ckpt/it1.yaml new file mode 100644 index 0000000000..2af96b3f72 --- /dev/null +++ b/examples/hubert/config/finetune/ckpt/it1.yaml @@ -0,0 +1,7 @@ +# @package _global_ + +task: + normalize: false + +model: + w2v_path: /checkpoint/wnhsu/w2v/hubert_final/iter1/hubert.km.randcrop.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU400k.s1337.ngpu32/checkpoint_last.pt diff --git a/examples/hubert/config/finetune/lm/ls_4gram.yaml b/examples/hubert/config/finetune/lm/ls_4gram.yaml new file mode 100644 index 0000000000..8c7728ad29 --- /dev/null +++ b/examples/hubert/config/finetune/lm/ls_4gram.yaml @@ -0,0 +1,7 @@ +# @package _global_ + +criterion: + wer_kenlm_model: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/4-gram.bin + wer_lexicon: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/10h/raw/lexicon_ltr.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 diff --git a/examples/hubert/config/finetune/run/submitit_reg.yaml b/examples/hubert/config/finetune/run/submitit_reg.yaml new file mode 100644 index 0000000000..27509503e7 --- /dev/null +++ b/examples/hubert/config/finetune/run/submitit_reg.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +hydra: + launcher: + cpus_per_task: 8 + gpus_per_node: 8 + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 1 + comment: null + mem_gb: 384 + timeout_min: 4320 + max_num_timeout: 100 + constraint: volta32gb + name: ${hydra.job.config_name}/${hydra.job.override_dirname} + submitit_folder: ${hydra.sweep.dir}/submitit/%j + +distributed_training: + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 diff --git a/examples/hubert/config/pretrain/data/iter1.yaml b/examples/hubert/config/pretrain/data/iter1.yaml new file mode 100644 index 0000000000..0a1b65d802 --- /dev/null +++ b/examples/hubert/config/pretrain/data/iter1.yaml @@ -0,0 +1,8 @@ +# @package _global_ + +task: + label_dir: ??? + labels: ["km"] + +model: + label_rate: 100 diff --git a/examples/hubert/config/pretrain/data/iter2.yaml b/examples/hubert/config/pretrain/data/iter2.yaml new file mode 100644 index 0000000000..2d4bfe61cc --- /dev/null +++ b/examples/hubert/config/pretrain/data/iter2.yaml @@ -0,0 +1,8 @@ +# @package _global_ + +task: + label_dir: ??? + labels: ["km"] + +model: + label_rate: 50 diff --git a/examples/hubert/config/pretrain/hubert_base_librispeech.yaml b/examples/hubert/config/pretrain/hubert_base_librispeech.yaml new file mode 100644 index 0000000000..bd84461a16 --- /dev/null +++ b/examples/hubert/config/pretrain/hubert_base_librispeech.yaml @@ -0,0 +1,97 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + seed: 1337 + tensorboard_logdir: tblog + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + + +distributed_training: + ddp_backend: no_c10d + distributed_backend: 'nccl' + distributed_world_size: 32 + distributed_port: 29671 + nprocs_per_node: 8 + find_unused_parameters: true + +task: + _name: hubert_pretraining + data: ??? + label_dir: ??? + labels: ??? 
+ label_rate: ${model.label_rate} + sample_rate: 16000 + max_sample_size: 250000 + min_sample_size: 32000 + pad_audio: false + random_crop: true + normalize: false # must be consistent with extractor + +dataset: + num_workers: 6 + max_tokens: 1400000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 10000 + +criterion: + _name: hubert + pred_masked_weight: 1.0 + pred_nomask_weight: 0.0 + loss_weights: [10,] + +optimization: + max_update: 400000 + lr: [0.0005] + clip_norm: 10.0 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: hubert + label_rate: ??? + skip_masked: false + skip_nomask: false + mask_prob: 0.80 + extractor_mode: default + conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' + final_dim: 256 + encoder_layerdrop: 0.05 + dropout_input: 0.1 + dropout_features: 0.1 + dropout: 0.1 + attention_dropout: 0.1 + feature_grad_mult: 0.1 + untie_final_proj: true + activation_dropout: 0.0 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + run: + dir: ??? + sweep: + dir: ??? + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/hubert/config/pretrain/hubert_large_librivox.yaml b/examples/hubert/config/pretrain/hubert_large_librivox.yaml new file mode 100644 index 0000000000..a5192b5f29 --- /dev/null +++ b/examples/hubert/config/pretrain/hubert_large_librivox.yaml @@ -0,0 +1,101 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + seed: 1337 + tensorboard_logdir: tblog + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + + +distributed_training: + ddp_backend: no_c10d + distributed_backend: 'nccl' + distributed_world_size: 128 + distributed_port: 29671 + nprocs_per_node: 8 + find_unused_parameters: true + +task: + _name: hubert_pretraining + data: ??? + label_dir: ??? + labels: ??? + label_rate: ${model.label_rate} + sample_rate: 16000 + max_sample_size: 250000 + min_sample_size: 32000 + pad_audio: false + random_crop: true + normalize: true # must be consistent with extractor + +dataset: + num_workers: 6 + max_tokens: 900000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 10000 + +criterion: + _name: hubert + pred_masked_weight: 1.0 + pred_nomask_weight: 0.0 + loss_weights: [10,] + +optimization: + max_update: 400000 + lr: [0.0015] + clip_norm: 1.0 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: hubert + label_rate: ??? 
+ encoder_layers: 24 + encoder_embed_dim: 1024 + encoder_ffn_embed_dim: 4096 + encoder_attention_heads: 16 + final_dim: 768 + skip_masked: false + skip_nomask: false + mask_prob: 0.80 + extractor_mode: layer_norm + conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' + encoder_layerdrop: 0.0 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + layer_norm_first: true + feature_grad_mult: 1.0 + untie_final_proj: true + activation_dropout: 0.0 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + run: + dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt + sweep: + dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml b/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml new file mode 100644 index 0000000000..34e8f2bfb9 --- /dev/null +++ b/examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml @@ -0,0 +1,101 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + seed: 1337 + tensorboard_logdir: tblog + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + + +distributed_training: + ddp_backend: no_c10d + distributed_backend: 'nccl' + distributed_world_size: 256 + distributed_port: 29671 + nprocs_per_node: 8 + find_unused_parameters: true + +task: + _name: hubert_pretraining + data: ??? + label_dir: ??? + labels: ??? + label_rate: ${model.label_rate} + sample_rate: 16000 + max_sample_size: 250000 + min_sample_size: 32000 + pad_audio: false + random_crop: true + normalize: true # must be consistent with extractor + +dataset: + num_workers: 6 + max_tokens: 360000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 10000 + +criterion: + _name: hubert + pred_masked_weight: 1.0 + pred_nomask_weight: 0.0 + loss_weights: [10,] + +optimization: + max_update: 400000 + lr: [0.003] + clip_norm: 1.0 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: hubert + label_rate: ??? 
+ encoder_layers: 48 + encoder_embed_dim: 1280 + encoder_ffn_embed_dim: 5120 + encoder_attention_heads: 16 + final_dim: 1024 + skip_masked: false + skip_nomask: false + mask_prob: 0.80 + extractor_mode: layer_norm + conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' + encoder_layerdrop: 0.0 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + layer_norm_first: true + feature_grad_mult: 1.0 + untie_final_proj: true + activation_dropout: 0.0 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + run: + dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt + sweep: + dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/hubert/config/pretrain/run/submitit_reg.yaml b/examples/hubert/config/pretrain/run/submitit_reg.yaml new file mode 100644 index 0000000000..46c979cd28 --- /dev/null +++ b/examples/hubert/config/pretrain/run/submitit_reg.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +hydra: + launcher: + cpus_per_task: 8 + gpus_per_node: 8 + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 4 + comment: null + mem_gb: 384 + timeout_min: 4320 + max_num_timeout: 100 + constraint: volta32gb + name: ${hydra.job.config_name}/${hydra.job.override_dirname} + submitit_folder: ${hydra.sweep.dir}/submitit/%j + +distributed_training: + distributed_world_size: 32 + distributed_port: 29671 + nprocs_per_node: 8 diff --git a/examples/hubert/measure_teacher_quality.py b/examples/hubert/measure_teacher_quality.py new file mode 100644 index 0000000000..92279b2214 --- /dev/null +++ b/examples/hubert/measure_teacher_quality.py @@ -0,0 +1,241 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
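`measure_teacher_quality.py` (added below) evaluates a set of k-means "teacher" labels against frame-level phone transcripts by accumulating a joint count table over (phone, cluster) pairs and reporting purity, entropy, and normalized mutual information. An editor-added toy example of the two headline numbers, mirroring `comp_purity` and `comp_norm_mutual_info` below (the counts are made up):

```python
# Toy illustration of the purity / normalized-MI numbers reported by the script below.
import numpy as np

counts = np.array([[90.0, 5.0, 5.0],     # rows: reference phones
                   [10.0, 70.0, 20.0]])  # cols: k-means clusters
p_xy = counts / counts.sum()

# Purity: mass of the dominant phone in each cluster (cf. comp_purity with axis=0).
purity = p_xy.max(axis=0).sum()          # (90 + 70 + 20) / 200 = 0.90

# Mutual information normalized by the phone entropy (cf. comp_norm_mutual_info).
p_x = p_xy.sum(axis=1, keepdims=True)
p_y = p_xy.sum(axis=0, keepdims=True)
mi = (p_xy * np.log(p_xy / (p_x * p_y) + 1e-8)).sum()
h_x = (-p_x * np.log(p_x + 1e-8)).sum()
print(purity, mi / h_x)
```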
+ +import numpy as np +import os.path as op +import re +from tabulate import tabulate +from collections import Counter + + +def comp_purity(p_xy, axis): + max_p = p_xy.max(axis=axis) + marg_p = p_xy.sum(axis=axis) + indv_pur = max_p / marg_p + aggr_pur = max_p.sum() + return indv_pur, aggr_pur + + +def comp_entropy(p): + return (-p * np.log(p + 1e-8)).sum() + + +def comp_norm_mutual_info(p_xy): + p_x = p_xy.sum(axis=1, keepdims=True) + p_y = p_xy.sum(axis=0, keepdims=True) + pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8) + mi = (p_xy * pmi).sum() + h_x = comp_entropy(p_x) + h_y = comp_entropy(p_y) + return mi, mi / h_x, mi / h_y, h_x, h_y + + +def pad(labs, n): + if n == 0: + return np.array(labs) + return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n]) + + +def comp_avg_seg_dur(labs_list): + n_frms = 0 + n_segs = 0 + for labs in labs_list: + labs = np.array(labs) + edges = np.zeros(len(labs)).astype(bool) + edges[0] = True + edges[1:] = labs[1:] != labs[:-1] + n_frms += len(edges) + n_segs += edges.astype(int).sum() + return n_frms / n_segs + + +def comp_joint_prob(uid2refs, uid2hyps): + """ + Args: + pad: padding for spliced-feature derived labels + """ + cnts = Counter() + skipped = [] + abs_frmdiff = 0 + for uid in uid2refs: + if uid not in uid2hyps: + skipped.append(uid) + continue + refs = uid2refs[uid] + hyps = uid2hyps[uid] + abs_frmdiff += abs(len(refs) - len(hyps)) + min_len = min(len(refs), len(hyps)) + refs = refs[:min_len] + hyps = hyps[:min_len] + cnts.update(zip(refs, hyps)) + tot = sum(cnts.values()) + + ref_set = sorted({ref for ref, _ in cnts.keys()}) + hyp_set = sorted({hyp for _, hyp in cnts.keys()}) + ref2pid = dict(zip(ref_set, range(len(ref_set)))) + hyp2lid = dict(zip(hyp_set, range(len(hyp_set)))) + # print(hyp_set) + p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float) + for (ref, hyp), cnt in cnts.items(): + p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt + p_xy /= p_xy.sum() + return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped + + +def read_phn(tsv_path, rm_stress=True): + uid2phns = {} + with open(tsv_path) as f: + for line in f: + uid, phns = line.rstrip().split("\t") + phns = phns.split(",") + if rm_stress: + phns = [re.sub("[0-9]", "", phn) for phn in phns] + uid2phns[uid] = phns + return uid2phns + + +def read_lab(tsv_path, lab_path, pad_len=0, upsample=1): + """ + tsv is needed to retrieve the uids for the labels + """ + with open(tsv_path) as f: + f.readline() + uids = [op.splitext(op.basename(line.rstrip().split()[0]))[0] for line in f] + with open(lab_path) as f: + labs_list = [pad(line.rstrip().split(), pad_len).repeat(upsample) for line in f] + assert len(uids) == len(labs_list) + return dict(zip(uids, labs_list)) + + +def main_lab_lab( + tsv_dir, + lab_dir, + lab_name, + lab_sets, + ref_dir, + ref_name, + pad_len=0, + upsample=1, + verbose=False, +): + # assume tsv_dir is the same for both the reference and the hypotheses + tsv_dir = lab_dir if tsv_dir is None else tsv_dir + + uid2refs = {} + for s in lab_sets: + uid2refs.update(read_lab(f"{tsv_dir}/{s}.tsv", f"{ref_dir}/{s}.{ref_name}")) + + uid2hyps = {} + for s in lab_sets: + uid2hyps.update( + read_lab( + f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample + ) + ) + _main(uid2refs, uid2hyps, verbose) + + +def main_phn_lab( + tsv_dir, + lab_dir, + lab_name, + lab_sets, + phn_dir, + phn_sets, + pad_len=0, + upsample=1, + verbose=False, +): + uid2refs = {} + for s in phn_sets: + uid2refs.update(read_phn(f"{phn_dir}/{s}.tsv")) + + uid2hyps = {} + tsv_dir = lab_dir if 
tsv_dir is None else tsv_dir + for s in lab_sets: + uid2hyps.update( + read_lab( + f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample + ) + ) + _main(uid2refs, uid2hyps, verbose) + + +def _main(uid2refs, uid2hyps, verbose): + (p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped) = comp_joint_prob( + uid2refs, uid2hyps + ) + ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0) + hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1) + (mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy) + outputs = { + "ref pur": ref_pur, + "hyp pur": hyp_pur, + "H(ref)": h_ref, + "H(hyp)": h_hyp, + "MI": mi, + "MI/H(ref)": mi_norm_by_ref, + "ref segL": comp_avg_seg_dur(uid2refs.values()), + "hyp segL": comp_avg_seg_dur(uid2hyps.values()), + "p_xy shape": p_xy.shape, + "frm tot": tot, + "frm diff": frmdiff, + "utt tot": len(uid2refs), + "utt miss": len(skipped), + } + print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f")) + + +if __name__ == "__main__": + """ + compute quality of labels with respect to phone or another labels if set + """ + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("tsv_dir") + parser.add_argument("lab_dir") + parser.add_argument("lab_name") + parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+") + parser.add_argument( + "--phn_dir", + default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1", + ) + parser.add_argument( + "--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+" + ) + parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses") + parser.add_argument( + "--upsample", default=1, type=int, help="upsample factor for hypotheses" + ) + parser.add_argument("--ref_lab_dir", default="") + parser.add_argument("--ref_lab_name", default="") + parser.add_argument("--verbose", action="store_true") + args = parser.parse_args() + + if args.ref_lab_dir and args.ref_lab_name: + main_lab_lab( + args.tsv_dir, + args.lab_dir, + args.lab_name, + args.lab_sets, + args.ref_lab_dir, + args.ref_lab_name, + args.pad_len, + args.upsample, + args.verbose, + ) + else: + main_phn_lab( + args.tsv_dir, + args.lab_dir, + args.lab_name, + args.lab_sets, + args.phn_dir, + args.phn_sets, + args.pad_len, + args.upsample, + args.verbose, + ) diff --git a/examples/hubert/simple_kmeans/README.md b/examples/hubert/simple_kmeans/README.md new file mode 100644 index 0000000000..847475c23f --- /dev/null +++ b/examples/hubert/simple_kmeans/README.md @@ -0,0 +1,80 @@ +# Sharded Feature Extraction and K-means Application + +This folder contains scripts for preparing HUBERT labels from tsv files, the +steps are: +1. feature extraction +2. k-means clustering +3. k-means application + + +## Data preparation + +`*.tsv` files contains a list of audio, where each line is the root, and +following lines are the subpath for each audio: +``` + + + +... +``` + + +## Feature extraction + +### MFCC feature +Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D +mfcc+delta+ddelta features for the 1st iteration HUBERT training, run: +```sh +python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir} +``` +This would shard the tsv file into `${nshard}` and extract features for the +`${rank}`-th shard, where rank is an integer in `[0, nshard-1]`. Features would +be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`. 
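Each dumped shard is a flat `.npy` matrix of frame-level features plus a `.len` file holding the per-utterance frame counts in tsv order. An editor-added sketch of reading one shard back (paths are placeholders; this mirrors `get_feat_iterator` in `dump_km_label.py` later in this patch):

```python
# Minimal sketch for inspecting one dumped feature shard; paths are examples only.
import numpy as np

feat_path = "feat_dir/train_0_4.npy"   # hypothetical shard (rank 0 of 4)
leng_path = "feat_dir/train_0_4.len"

with open(leng_path) as f:
    lengths = [int(line.rstrip()) for line in f]

feats = np.load(feat_path, mmap_mode="r")          # (total_frames, feat_dim)
assert feats.shape[0] == sum(lengths)

offsets = np.cumsum([0] + lengths[:-1])
utt_feats = [feats[o:o + n] for o, n in zip(offsets, lengths)]
print(len(utt_feats), utt_feats[0].shape)          # one (n_frames, feat_dim) array per utterance
```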
+
+
+### HUBERT feature
+To extract features from the `${layer}`-th transformer layer of a trained
+HUBERT model saved at `${ckpt_path}`, run:
+```sh
+python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir}
+```
+Features would also be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
+
+- if you run out of memory, decrease the chunk size with `--max_chunk`
+
+
+## K-means clustering
+To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
+```sh
+python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1
+```
+This saves the k-means model to `${km_path}`.
+
+- set `--percent -1` to use all data
+- more k-means options can be found with the `-h` flag
+
+
+## K-means application
+To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
+```sh
+python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
+```
+This would extract labels for the `${rank}`-th shard out of `${nshard}` shards
+and dump them to `${lab_dir}/${split}_${rank}_${nshard}.km`.
+
+
+Finally, merge shards for `${split}` by running
+```sh
+for rank in $(seq 0 $((nshard - 1))); do
+  cat $lab_dir/${split}_${rank}_${nshard}.km
+done > $lab_dir/${split}.km
+```
+
+
+## Create a dummy dict
+To create a dummy dictionary, run
+```sh
+for x in $(seq 0 $((n_clusters - 1))); do
+  echo "$x 1"
+done >> $lab_dir/dict.km.txt
+```
diff --git a/examples/hubert/simple_kmeans/dump_hubert_feature.py b/examples/hubert/simple_kmeans/dump_hubert_feature.py
new file mode 100644
index 0000000000..7ea4ea0aa9
--- /dev/null
+++ b/examples/hubert/simple_kmeans/dump_hubert_feature.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
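The module added below wraps a trained HuBERT checkpoint in a `HubertFeatureReader` that extracts activations from one transformer layer, processing the waveform in chunks of at most `max_chunk` samples. An editor-added usage sketch (paths are placeholders; a CUDA device is assumed because the reader moves the model to GPU):

```python
# Hypothetical interactive use of the HubertFeatureReader defined below;
# normally this module is run as a script via its main().
from dump_hubert_feature import HubertFeatureReader

reader = HubertFeatureReader(
    ckpt_path="/path/to/hubert_base_ls960.pt",  # placeholder checkpoint path
    layer=9,                                    # e.g. layer 9 of the Base model
    max_chunk=1_600_000,                        # samples per forward pass; lower this if OOM
)
feat = reader.get_feats("/path/to/audio.flac")  # (n_frames, hidden_dim) tensor on GPU
print(feat.shape)
```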
+ +import logging +import os +import sys + +import fairseq +import soundfile as sf +import torch +import torch.nn.functional as F + +from feature_utils import get_path_iterator, dump_feature +from fairseq.data.audio.audio_utils import get_features_or_waveform + + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("dump_hubert_feature") + + +class HubertFeatureReader(object): + def __init__(self, ckpt_path, layer, max_chunk=1600000): + ( + model, + cfg, + task, + ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path]) + self.model = model[0].eval().cuda() + self.task = task + self.layer = layer + self.max_chunk = max_chunk + logger.info(f"TASK CONFIG:\n{self.task.cfg}") + logger.info(f" max_chunk = {self.max_chunk}") + + def read_audio(self, path, ref_len=None): + wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate) + if wav.ndim == 2: + wav = wav.mean(-1) + assert wav.ndim == 1, wav.ndim + if ref_len is not None and abs(ref_len - len(wav)) > 160: + logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") + return wav + + def get_feats(self, path, ref_len=None): + x = self.read_audio(path, ref_len=ref_len) + with torch.no_grad(): + x = torch.from_numpy(x).float().cuda() + if self.task.cfg.normalize: + x = F.layer_norm(x, x.shape) + x = x.view(1, -1) + + feat = [] + for start in range(0, x.size(1), self.max_chunk): + x_chunk = x[:, start : start + self.max_chunk] + feat_chunk, _ = self.model.extract_features( + source=x_chunk, + padding_mask=None, + mask=False, + output_layer=self.layer, + ) + feat.append(feat_chunk) + return torch.cat(feat, 1).squeeze(0) + + +def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk): + reader = HubertFeatureReader(ckpt_path, layer, max_chunk) + generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank) + dump_feature(reader, generator, num, split, nshard, rank, feat_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("tsv_dir") + parser.add_argument("split") + parser.add_argument("ckpt_path") + parser.add_argument("layer", type=int) + parser.add_argument("nshard", type=int) + parser.add_argument("rank", type=int) + parser.add_argument("feat_dir") + parser.add_argument("--max_chunk", type=int, default=1600000) + args = parser.parse_args() + logger.info(args) + + main(**vars(args)) diff --git a/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py b/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py new file mode 100644 index 0000000000..941bc1b675 --- /dev/null +++ b/examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import csv +import io +import logging +import os +import os.path as op +import sys + +from dump_hubert_feature import HubertFeatureReader +from feature_utils import get_shard_range, dump_feature +from fairseq.data.audio.audio_utils import get_features_or_waveform + + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("dump_hubert_feature_s2t") + + +class HubertFeatureReaderS2T(HubertFeatureReader): + def read_audio(self, path, ref_len=None): + wav = get_features_or_waveform( + path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate + ) + if wav.ndim == 2: + wav = wav.mean(-1) + assert wav.ndim == 1, wav.ndim + if ref_len is not None and abs(ref_len - len(wav)) > 160: + logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") + return wav + + +def get_path_iterator(root, tsv, nshard, rank, audio_col_name): + with open(tsv) as f: + reader = csv.DictReader( + f, + delimiter="\t", + quotechar=None, + doublequote=False, + lineterminator="\n", + quoting=csv.QUOTE_NONE, + ) + subpaths = [op.join(root, e[audio_col_name]) for e in reader] + start, end = get_shard_range(len(subpaths), nshard, rank) + subpaths = subpaths[start:end] + + def iterate(): + for subpath in subpaths: + yield op.join(root, subpath), None + + return iterate, len(subpaths) + + +def main( + root, + tsv_path, + ckpt_path, + layer, + nshard, + rank, + feat_dir, + split, + max_chunk, + audio_col_name, +): + reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk) + generator, num = get_path_iterator(root, tsv_path, nshard, rank, audio_col_name) + dump_feature(reader, generator, num, split, nshard, rank, feat_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("root") + parser.add_argument("tsv_path") + parser.add_argument("ckpt_path") + parser.add_argument("layer", type=int) + parser.add_argument("nshard", type=int) + parser.add_argument("rank", type=int) + parser.add_argument("feat_dir") + parser.add_argument("split") + parser.add_argument("--audio_col_name", type=str, default="audio") + parser.add_argument("--max_chunk", type=int, default=1600000) + args = parser.parse_args() + logger.info(args) + + main(**vars(args)) diff --git a/examples/hubert/simple_kmeans/dump_km_label.py b/examples/hubert/simple_kmeans/dump_km_label.py new file mode 100644 index 0000000000..8871307804 --- /dev/null +++ b/examples/hubert/simple_kmeans/dump_km_label.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
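`dump_km_label.py` (below) assigns every frame to its nearest k-means centroid without forming explicit differences: `ApplyKmeans` expands the squared distance ||x − c||² into ||x||² − 2xᵀc + ||c||² and precomputes the centroid norms once. An editor-added numpy check of that identity on toy data:

```python
# Toy check of the squared-distance expansion used by ApplyKmeans below.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(5, 4))        # 5 frames, 4-dim features
C = rng.normal(size=(4, 3))        # 3 centroids stored as columns, like C_np
Cnorm = (C ** 2).sum(0, keepdims=True)

dist = (x ** 2).sum(1, keepdims=True) - 2 * x @ C + Cnorm   # (5, 3) squared distances
brute = ((x[:, None, :] - C.T[None, :, :]) ** 2).sum(-1)    # direct computation

assert np.allclose(dist, brute)
labels = dist.argmin(axis=1)       # same assignment ApplyKmeans.__call__ returns for numpy inputs
print(labels)
```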
+ +import logging +import os +import sys + +import numpy as np + +import joblib +import torch +import tqdm + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("dump_km_label") + + +class ApplyKmeans(object): + def __init__(self, km_path): + self.km_model = joblib.load(km_path) + self.C_np = self.km_model.cluster_centers_.transpose() + self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True) + + self.C = torch.from_numpy(self.C_np) + self.Cnorm = torch.from_numpy(self.Cnorm_np) + if torch.cuda.is_available(): + self.C = self.C.cuda() + self.Cnorm = self.Cnorm.cuda() + + def __call__(self, x): + if isinstance(x, torch.Tensor): + dist = ( + x.pow(2).sum(1, keepdim=True) + - 2 * torch.matmul(x, self.C) + + self.Cnorm + ) + return dist.argmin(dim=1).cpu().numpy() + else: + dist = ( + (x ** 2).sum(1, keepdims=True) + - 2 * np.matmul(x, self.C_np) + + self.Cnorm_np + ) + return np.argmin(dist, axis=1) + + +def get_feat_iterator(feat_dir, split, nshard, rank): + feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy" + leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len" + with open(leng_path, "r") as f: + lengs = [int(line.rstrip()) for line in f] + offsets = [0] + np.cumsum(lengs[:-1]).tolist() + + def iterate(): + feat = np.load(feat_path, mmap_mode="r") + assert feat.shape[0] == (offsets[-1] + lengs[-1]) + for offset, leng in zip(offsets, lengs): + yield feat[offset: offset + leng] + + return iterate, len(lengs) + + +def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir): + apply_kmeans = ApplyKmeans(km_path) + generator, num = get_feat_iterator(feat_dir, split, nshard, rank) + iterator = generator() + + lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km" + os.makedirs(lab_dir, exist_ok=True) + with open(lab_path, "w") as f: + for feat in tqdm.tqdm(iterator, total=num): + # feat = torch.from_numpy(feat).cuda() + lab = apply_kmeans(feat).tolist() + f.write(" ".join(map(str, lab)) + "\n") + logger.info("finished successfully") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("feat_dir") + parser.add_argument("split") + parser.add_argument("km_path") + parser.add_argument("nshard", type=int) + parser.add_argument("rank", type=int) + parser.add_argument("lab_dir") + args = parser.parse_args() + logging.info(str(args)) + + dump_label(**vars(args)) diff --git a/examples/hubert/simple_kmeans/dump_mfcc_feature.py b/examples/hubert/simple_kmeans/dump_mfcc_feature.py new file mode 100644 index 0000000000..c3537784d1 --- /dev/null +++ b/examples/hubert/simple_kmeans/dump_mfcc_feature.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
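`dump_mfcc_feature.py` (below) produces the 39-dimensional features used as first-iteration targets: 13 Kaldi-style MFCCs, their deltas, and delta-deltas, concatenated along the feature axis and returned with shape `(time, 39)`. An editor-added shape check on random audio (16 kHz assumed):

```python
# Shape sanity check for the 39-D MFCC(+delta+ddelta) features computed below.
import torch
import torchaudio

wav = torch.randn(1, 16000)  # 1 second of fake 16 kHz audio
mfcc = torchaudio.compliance.kaldi.mfcc(
    waveform=wav, sample_frequency=16000.0, use_energy=False
)                            # (time, 13)
mfcc = mfcc.transpose(0, 1)  # (13, time) so deltas are computed along the time axis
delta = torchaudio.functional.compute_deltas(mfcc)
ddelta = torchaudio.functional.compute_deltas(delta)
feat = torch.cat([mfcc, delta, ddelta], dim=0).transpose(0, 1)
print(feat.shape)            # torch.Size([time, 39])
```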
+ +import logging +import os +import sys + +import soundfile as sf +import torch +import torchaudio + +from feature_utils import get_path_iterator, dump_feature +from fairseq.data.audio.audio_utils import get_features_or_waveform + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("dump_mfcc_feature") + + +class MfccFeatureReader(object): + def __init__(self, sample_rate): + self.sample_rate = sample_rate + + def read_audio(self, path, ref_len=None): + wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate) + if ref_len is not None and abs(ref_len - len(wav)) > 160: + logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") + return wav + + def get_feats(self, path, ref_len=None): + x = self.read_audio(path, ref_len=ref_len) + with torch.no_grad(): + x = torch.from_numpy(x).float() + x = x.view(1, -1) + + mfccs = torchaudio.compliance.kaldi.mfcc( + waveform=x, + sample_frequency=self.sample_rate, + use_energy=False, + ) # (time, freq) + mfccs = mfccs.transpose(0, 1) # (freq, time) + deltas = torchaudio.functional.compute_deltas(mfccs) + ddeltas = torchaudio.functional.compute_deltas(deltas) + concat = torch.cat([mfccs, deltas, ddeltas], dim=0) + concat = concat.transpose(0, 1).contiguous() # (freq, time) + return concat + + +def main(tsv_dir, split, nshard, rank, feat_dir, sample_rate): + reader = MfccFeatureReader(sample_rate) + generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank) + dump_feature(reader, generator, num, split, nshard, rank, feat_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("tsv_dir") + parser.add_argument("split") + parser.add_argument("nshard", type=int) + parser.add_argument("rank", type=int) + parser.add_argument("feat_dir") + parser.add_argument("--sample_rate", type=int, default=16000) + args = parser.parse_args() + logger.info(args) + + main(**vars(args)) diff --git a/examples/hubert/simple_kmeans/dump_w2v2_feature.py b/examples/hubert/simple_kmeans/dump_w2v2_feature.py new file mode 100644 index 0000000000..a1f0d902ac --- /dev/null +++ b/examples/hubert/simple_kmeans/dump_w2v2_feature.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import os +import sys + +import fairseq +import soundfile as sf +import torch +import torch.nn.functional as F + +from feature_utils import get_path_iterator, dump_feature + + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("dump_w2v2_feature") + + +class Wav2Vec2FeatureReader(object): + def __init__(self, ckpt_path, layer, max_chunk=1600000): + ( + model, + cfg, + task, + ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path]) + self.model = model[0].eval().cuda() + self.task = task + self.layer = layer # assume this is 1-based like HuBERT + self.max_chunk = max_chunk + logger.info(f"TASK CONFIG:\n{self.task.cfg}") + logger.info(f" max_chunk = {self.max_chunk}") + logger.info(f" model:\n{self.model}") + + def read_audio(self, path, ref_len=None): + wav, sr = sf.read(path) + assert sr == self.task.cfg.sample_rate, sr + if wav.ndim == 2: + wav = wav.mean(-1) + assert wav.ndim == 1, wav.ndim + if ref_len is not None and abs(ref_len - len(wav)) > 160: + logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") + return wav + + def get_feats(self, path, ref_len=None): + x = self.read_audio(path, ref_len) + with torch.no_grad(): + x = torch.from_numpy(x).float().cuda() + if self.task.cfg.normalize: + x = F.layer_norm(x, x.shape) + x = x.view(1, -1) + + feat = [] + for start in range(0, x.size(1), self.max_chunk): + x_chunk = x[:, start: start + self.max_chunk] + res = self.model.extract_features( + source=x_chunk, + padding_mask=None, + mask=False, + layer=self.layer - 1, + ) + feat_chunk = res["x"] + feat.append(feat_chunk) + return torch.cat(feat, 1).squeeze(0) + + +def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk): + reader = Wav2Vec2FeatureReader(ckpt_path, layer, max_chunk) + generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank) + dump_feature(reader, generator, num, split, nshard, rank, feat_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("tsv_dir") + parser.add_argument("split") + parser.add_argument("ckpt_path") + parser.add_argument("layer", type=int) + parser.add_argument("nshard", type=int) + parser.add_argument("rank", type=int) + parser.add_argument("feat_dir") + parser.add_argument("--max_chunk", type=int, default=1600000) + args = parser.parse_args() + logger.info(args) + + main(**vars(args)) diff --git a/examples/hubert/simple_kmeans/feature_utils.py b/examples/hubert/simple_kmeans/feature_utils.py new file mode 100644 index 0000000000..f80bc45697 --- /dev/null +++ b/examples/hubert/simple_kmeans/feature_utils.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import os +import sys + +import tqdm +from npy_append_array import NpyAppendArray + + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("feature_utils") + + +def get_shard_range(tot, nshard, rank): + assert rank < nshard and rank >= 0, f"invaid rank/nshard {rank}/{nshard}" + start = round(tot / nshard * rank) + end = round(tot / nshard * (rank + 1)) + assert start < end, f"start={start}, end={end}" + logger.info( + f"rank {rank} of {nshard}, process {end-start} " + f"({start}-{end}) out of {tot}" + ) + return start, end + + +def get_path_iterator(tsv, nshard, rank): + with open(tsv, "r") as f: + root = f.readline().rstrip() + lines = [line.rstrip() for line in f] + start, end = get_shard_range(len(lines), nshard, rank) + lines = lines[start:end] + def iterate(): + for line in lines: + subpath, nsample = line.split("\t") + yield f"{root}/{subpath}", int(nsample) + return iterate, len(lines) + + +def dump_feature(reader, generator, num, split, nshard, rank, feat_dir): + iterator = generator() + + feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy" + leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len" + + os.makedirs(feat_dir, exist_ok=True) + if os.path.exists(feat_path): + os.remove(feat_path) + + feat_f = NpyAppendArray(feat_path) + with open(leng_path, "w") as leng_f: + for path, nsample in tqdm.tqdm(iterator, total=num): + feat = reader.get_feats(path, nsample) + feat_f.append(feat.cpu().numpy()) + leng_f.write(f"{len(feat)}\n") + logger.info("finished successfully") + + diff --git a/examples/hubert/simple_kmeans/learn_kmeans.py b/examples/hubert/simple_kmeans/learn_kmeans.py new file mode 100644 index 0000000000..113ac655b8 --- /dev/null +++ b/examples/hubert/simple_kmeans/learn_kmeans.py @@ -0,0 +1,146 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
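All of these scripts shard work with the same `nshard`/`rank` convention, resolved by `get_shard_range` in `feature_utils.py` above: contiguous item ranges are obtained by rounding `tot / nshard * rank`. An editor-added worked example (arbitrary sizes) before the k-means script below:

```python
# Worked example of the sharding arithmetic in feature_utils.get_shard_range.
def shard_range(tot, nshard, rank):
    start = round(tot / nshard * rank)
    end = round(tot / nshard * (rank + 1))
    return start, end

print([shard_range(10, 3, r) for r in range(3)])
# [(0, 3), (3, 7), (7, 10)]  -> every item lands in exactly one shard
```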
+ +import logging +import os +import sys + +import numpy as np +from sklearn.cluster import MiniBatchKMeans + +import joblib + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("learn_kmeans") + + +def get_km_model( + n_clusters, + init, + max_iter, + batch_size, + tol, + max_no_improvement, + n_init, + reassignment_ratio, +): + return MiniBatchKMeans( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + batch_size=batch_size, + verbose=1, + compute_labels=False, + tol=tol, + max_no_improvement=max_no_improvement, + init_size=None, + n_init=n_init, + reassignment_ratio=reassignment_ratio, + ) + + +def load_feature_shard(feat_dir, split, nshard, rank, percent): + feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy" + leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len" + with open(leng_path, "r") as f: + lengs = [int(line.rstrip()) for line in f] + offsets = [0] + np.cumsum(lengs[:-1]).tolist() + + if percent < 0: + return np.load(feat_path, mmap_mode="r") + else: + nsample = int(np.ceil(len(lengs) * percent)) + indices = np.random.choice(len(lengs), nsample, replace=False) + feat = np.load(feat_path, mmap_mode="r") + sampled_feat = np.concatenate( + [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0 + ) + logger.info( + ( + f"sampled {nsample} utterances, {len(sampled_feat)} frames " + f"from shard {rank}/{nshard}" + ) + ) + return sampled_feat + + +def load_feature(feat_dir, split, nshard, seed, percent): + assert percent <= 1.0 + feat = np.concatenate( + [ + load_feature_shard(feat_dir, split, nshard, r, percent) + for r in range(nshard) + ], + axis=0, + ) + logging.info(f"loaded feature with dimension {feat.shape}") + return feat + + +def learn_kmeans( + feat_dir, + split, + nshard, + km_path, + n_clusters, + seed, + percent, + init, + max_iter, + batch_size, + tol, + n_init, + reassignment_ratio, + max_no_improvement, +): + np.random.seed(seed) + feat = load_feature(feat_dir, split, nshard, seed, percent) + km_model = get_km_model( + n_clusters, + init, + max_iter, + batch_size, + tol, + max_no_improvement, + n_init, + reassignment_ratio, + ) + km_model.fit(feat) + joblib.dump(km_model, km_path) + + inertia = -km_model.score(feat) / len(feat) + logger.info("total intertia: %.5f", inertia) + logger.info("finished successfully") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("feat_dir", type=str) + parser.add_argument("split", type=str) + parser.add_argument("nshard", type=int) + parser.add_argument("km_path", type=str) + parser.add_argument("n_clusters", type=int) + parser.add_argument("--seed", default=0, type=int) + parser.add_argument( + "--percent", default=-1, type=float, help="sample a subset; -1 for all" + ) + parser.add_argument("--init", default="k-means++") + parser.add_argument("--max_iter", default=100, type=int) + parser.add_argument("--batch_size", default=10000, type=int) + parser.add_argument("--tol", default=0.0, type=float) + parser.add_argument("--max_no_improvement", default=100, type=int) + parser.add_argument("--n_init", default=20, type=int) + parser.add_argument("--reassignment_ratio", default=0.0, type=float) + args = parser.parse_args() + logging.info(str(args)) + + learn_kmeans(**vars(args)) diff --git a/examples/hubert/tests/6313-76958-0021.flac b/examples/hubert/tests/6313-76958-0021.flac new file 
mode 100644 index 0000000000..e644b19871 Binary files /dev/null and b/examples/hubert/tests/6313-76958-0021.flac differ diff --git a/examples/hubert/tests/sample.base.L9.km500.km b/examples/hubert/tests/sample.base.L9.km500.km new file mode 100644 index 0000000000..656eef96e5 --- /dev/null +++ b/examples/hubert/tests/sample.base.L9.km500.km @@ -0,0 +1 @@ +17 17 17 17 296 296 20 20 20 461 461 20 184 20 20 20 184 289 144 445 445 213 213 213 213 252 215 129 401 20 354 180 494 44 416 416 416 192 192 180 180 84 84 84 16 88 88 88 88 319 242 240 348 35 35 117 404 197 226 209 83 55 55 55 322 67 94 199 118 118 118 118 118 118 402 219 219 219 222 222 222 353 59 245 245 251 251 241 241 431 367 367 178 35 35 35 458 192 351 41 324 324 324 252 464 464 139 139 424 424 424 497 497 497 122 90 42 42 147 380 380 499 319 319 319 348 348 33 33 394 90 76 465 74 425 425 386 386 431 319 319 319 319 319 240 203 53 473 34 340 340 340 340 116 64 212 384 377 123 123 123 216 216 216 114 114 57 57 57 203 381 381 117 48 13 47 80 20 80 80 320 7 7 364 345 141 141 141 141 281 281 9 86 221 198 198 22 283 455 236 239 239 107 107 395 286 286 286 468 468 406 406 467 176 176 176 328 200 200 248 464 145 365 365 365 365 330 385 457 77 77 77 54 224 300 334 334 382 304 304 271 186 31 342 342 342 198 22 283 5 38 162 232 232 482 68 26 26 359 359 81 444 213 213 252 143 458 41 324 324 324 422 143 445 445 445 351 180 486 315 315 450 450 450 203 53 473 291 89 116 379 243 478 478 66 482 482 105 105 336 336 354 29 498 498 498 498 396 396 313 37 314 198 22 222 222 222 222 245 129 74 74 437 437 496 496 496 413 94 199 41 41 324 324 318 318 269 342 9 168 106 106 284 426 426 426 426 348 64 76 401 259 108 123 153 153 153 153 372 372 396 313 24 314 90 401 259 445 445 351 351 365 365 365 365 282 282 215 233 233 229 427 20 247 126 126 126 326 326 326 326 326 326 326 101 101 101 149 228 228 20 289 20 7 217 70 65 189 189 151 240 285 300 300 495 406 467 176 135 135 339 248 466 114 222 222 222 313 313 239 384 371 490 490 38 31 54 54 224 494 494 236 129 259 74 190 487 288 288 288 288 374 173 173 280 280 302 302 175 175 69 69 223 130 129 401 75 108 119 295 295 295 295 143 192 192 135 135 135 135 200 200 464 255 255 255 251 251 241 431 235 235 235 348 348 465 192 44 44 236 8 8 354 319 319 383 348 36 310 107 107 395 462 462 8 32 32 32 354 153 153 153 153 153 387 387 387 387 85 207 318 318 318 49 453 9 168 125 125 125 125 125 466 199 44 44 143 129 144 445 351 351 351 486 486 460 285 285 302 302 497 497 122 239 161 161 79 79 499 499 499 265 265 265 85 85 85 299 299 173 352 352 427 229 170 247 15 15 15 15 15 15 193 193 193 17 diff --git a/examples/hubert/tests/sample.base.L9.len b/examples/hubert/tests/sample.base.L9.len new file mode 100644 index 0000000000..7d3028fa24 --- /dev/null +++ b/examples/hubert/tests/sample.base.L9.len @@ -0,0 +1 @@ +596 diff --git a/examples/hubert/tests/sample.base.L9.npy b/examples/hubert/tests/sample.base.L9.npy new file mode 100644 index 0000000000..574bef9c7c Binary files /dev/null and b/examples/hubert/tests/sample.base.L9.npy differ diff --git a/examples/hubert/tests/sample.large.L20.len b/examples/hubert/tests/sample.large.L20.len new file mode 100644 index 0000000000..7d3028fa24 --- /dev/null +++ b/examples/hubert/tests/sample.large.L20.len @@ -0,0 +1 @@ +596 diff --git a/examples/hubert/tests/sample.large.L20.npy b/examples/hubert/tests/sample.large.L20.npy new file mode 100644 index 0000000000..c58d221e3c Binary files /dev/null and b/examples/hubert/tests/sample.large.L20.npy differ diff --git 
a/examples/hubert/tests/sample.large.hypo.word b/examples/hubert/tests/sample.large.hypo.word new file mode 100644 index 0000000000..d77a4cfddc --- /dev/null +++ b/examples/hubert/tests/sample.large.hypo.word @@ -0,0 +1 @@ +KEEP A GOING AN IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0) diff --git a/examples/hubert/tests/sample.xlarge.L30.len b/examples/hubert/tests/sample.xlarge.L30.len new file mode 100644 index 0000000000..7d3028fa24 --- /dev/null +++ b/examples/hubert/tests/sample.xlarge.L30.len @@ -0,0 +1 @@ +596 diff --git a/examples/hubert/tests/sample.xlarge.L30.npy b/examples/hubert/tests/sample.xlarge.L30.npy new file mode 100644 index 0000000000..29d8c0dabd Binary files /dev/null and b/examples/hubert/tests/sample.xlarge.L30.npy differ diff --git a/examples/hubert/tests/sample.xlarge.hypo.word b/examples/hubert/tests/sample.xlarge.hypo.word new file mode 100644 index 0000000000..53e402d455 --- /dev/null +++ b/examples/hubert/tests/sample.xlarge.hypo.word @@ -0,0 +1 @@ +KEEP A GOIN AND IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0) diff --git a/examples/hubert/tests/test_feature_and_unit.sh b/examples/hubert/tests/test_feature_and_unit.sh new file mode 100644 index 0000000000..8cddb27758 --- /dev/null +++ b/examples/hubert/tests/test_feature_and_unit.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +set -e + +sizes="base large xlarge" + +declare -A ckpt_urls +ckpt_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt" +ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt" +ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt" + +declare -A km_layers +km_layers[base]=9 +km_layers[large]=20 +km_layers[xlarge]=30 + +declare -A km_urls +km_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin" + +declare -A km_nunits +km_nunits[base]=500 + +test_dir=./examples/hubert/tests +split=sample + +echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv" + +check_feature () { + echo "checking features..." + + size=$1 + ckpt_url=$2 + km_layer=$3 + ckpt_path="$test_dir/$(basename "$ckpt_url")" + + if [ ! -f "$ckpt_path" ]; then + echo "downloading $ckpt_url to $ckpt_path" + wget "$ckpt_url" -O "$ckpt_path" + fi + + python ./examples/hubert/simple_kmeans/dump_hubert_feature.py \ + "${test_dir}" "${split}" "${ckpt_path}" "${km_layer}" 1 0 "${test_dir}" + + if diff -q "${test_dir}/${split}.${size}.L${km_layer}.npy" "${test_dir}/${split}_0_1.npy" &>/dev/null; then + echo "...passed npy check" + else + echo "...failed npy check" + fi + + if diff -q "${test_dir}/${split}.${size}.L${km_layer}.len" "${test_dir}/${split}_0_1.len" &>/dev/null; then + echo "...passed len check" + else + echo "...failed len check" + fi +} + + +check_unit () { + echo "checking units..." + + size=$1 + km_url=$2 + km_layer=$3 + km_nunit=$4 + km_path="$test_dir/$(basename "$km_url")" + + if [ ! 
-f "$km_path" ]; then + echo "downloading $km_url to $km_path" + wget "$km_url" -O "$km_path" + fi + + python ./examples/hubert/simple_kmeans/dump_km_label.py \ + "${test_dir}" "${split}" "${km_path}" 1 0 "${test_dir}" + + if diff -q "${test_dir}/${split}.${size}.L${km_layer}.km${km_nunit}.km" "${test_dir}/${split}_0_1.km" &>/dev/null; then + echo "...passed unit check" + else + echo "...failed unit check" + fi +} + + +for size in $sizes; do + echo "=== Running unit test for HuBERT $size ===" + check_feature "$size" "${ckpt_urls[$size]}" "${km_layers[$size]}" + + if [ -n "${km_urls[$size]}" ]; then + check_unit "$size" "${km_urls[$size]}" "${km_layers[$size]}" "${km_nunits[$size]}" + fi + + rm -f $test_dir/${split}_0_1.* +done diff --git a/examples/hubert/tests/test_finetuned_asr.sh b/examples/hubert/tests/test_finetuned_asr.sh new file mode 100644 index 0000000000..3c0538b1f7 --- /dev/null +++ b/examples/hubert/tests/test_finetuned_asr.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -e + +sizes="large xlarge" + +declare -A ckpt_urls +ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt" +ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt" + +test_dir=$(pwd)/examples/hubert/tests +split=sample + +echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv" +echo -e "K E E P | A | G O I N G | A N D | I F | Y O U ' R E | L U C K Y | Y O U ' L L | R U N | P L U M B | I N T O | T H E M | W A S | T H E | J E E R I N G | A N S W E R | A S | T H E | S L E E P Y | C O W M E N | S P U R R E D | T H E I R | P O N I E S | O N | T O W A R D | C A M P | M U T T E R I N G | T H E I R | D I S A P P R O V A L | O F | T A K I N G | A L O N G | A | B U N C H | O F | B O Y S | O N | A | C A T T L E | D R I V E |" > "${test_dir}/${split}.ltr" + +check_asr () { + echo "checking asr outputs..." + + size=$1 + ckpt_url=$2 + ckpt_path="$test_dir/$(basename "$ckpt_url")" + + if [ ! -f "$ckpt_path" ]; then + echo "downloading $ckpt_url to $ckpt_path" + wget "$ckpt_url" -O "$ckpt_path" + fi + + python examples/speech_recognition/new/infer.py \ + --config-dir examples/hubert/config/decode --config-name infer_viterbi \ + common_eval.path="${ckpt_path}" task.data="${test_dir}" task.normalize=true \ + decoding.results_path="${test_dir}/pred" \ + common_eval.results_path="${test_dir}/pred" \ + common_eval.quiet=false dataset.gen_subset="${split}" + + if diff -q "${test_dir}/pred/hypo.word" "${test_dir}/${split}.${size}.hypo.word" &>/dev/null; then + echo "...passed word check" + else + echo "...failed word check" + fi + rm -rf "${test_dir}/pred" +} + +for size in $sizes; do + check_asr "$size" "${ckpt_urls[$size]}" +done diff --git a/examples/hubert/update_ckpt.py b/examples/hubert/update_ckpt.py new file mode 100644 index 0000000000..53c9e74ea6 --- /dev/null +++ b/examples/hubert/update_ckpt.py @@ -0,0 +1,22 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch + +src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt" +ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt" +new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt" + + +def update_state(state): + state["model"]["label_embs_concat"] = state["model"].pop("label_embs") + state["args"].task = "hubert_pretraining" + state["args"].labels = f"['{state['args'].labels}']" + return state + + +src_state = torch.load(src_ckpt) +src_state = update_state(src_state) +torch.save(src_state, new_ckpt) diff --git a/examples/language_model/README.adaptive_inputs.md b/examples/language_model/README.adaptive_inputs.md index 6873467115..6650d58f37 100644 --- a/examples/language_model/README.adaptive_inputs.md +++ b/examples/language_model/README.adaptive_inputs.md @@ -19,10 +19,10 @@ fairseq-train --task language_modeling \ data-bin/wikitext-103 \ --save-dir checkpoints/transformer_wikitext-103 \ --arch transformer_lm_wiki103 \ - --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \ - --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \ + --max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \ + --warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \ --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \ - --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d + --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp ``` ## Citation diff --git a/examples/language_model/README.conv.md b/examples/language_model/README.conv.md index f0b6a3a921..1ff8635906 100644 --- a/examples/language_model/README.conv.md +++ b/examples/language_model/README.conv.md @@ -17,7 +17,7 @@ fairseq-train --task language_modeling \ --optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \ --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ --max-tokens 1024 --tokens-per-sample 1024 \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --max-epoch 35 ``` diff --git a/examples/language_model/README.md b/examples/language_model/README.md index dc84d8c761..e78ea48e08 100644 --- a/examples/language_model/README.md +++ b/examples/language_model/README.md @@ -5,7 +5,7 @@ Model | Description | Dataset | Download ---|---|---|--- `transformer_lm.gbw.adaptive_huge` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2) -`transformer_lm.wiki103.adaptive` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
247M params | [WikiText-103](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2) +`transformer_lm.wiki103.adaptive` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
247M params | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2) `transformer_lm.wmt19.en` | English LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) `transformer_lm.wmt19.de` | German LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) `transformer_lm.wmt19.ru` | Russian LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) diff --git a/examples/laser/README.md b/examples/laser/README.md new file mode 100644 index 0000000000..66acada04f --- /dev/null +++ b/examples/laser/README.md @@ -0,0 +1,144 @@ +# LASER Language-Agnostic SEntence Representations + +LASER is a library to calculate and use multilingual sentence embeddings. + +You can find more information about LASER and how to use it on the official [LASER repository](https://github.com/facebookresearch/LASER). + +This folder contains source code for training LASER embeddings. + + +## Prepare data and configuration file + +Binarize your data with fairseq, as described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing). + +Create a json config file with this format: +``` +{ + "src_vocab": "/path/to/spm.src.cvocab", + "tgt_vocab": "/path/to/spm.tgt.cvocab", + "train": [ + { + "type": "translation", + "id": 0, + "src": "/path/to/srclang1-tgtlang0/train.srclang1", + "tgt": "/path/to/srclang1-tgtlang0/train.tgtlang0" + }, + { + "type": "translation", + "id": 1, + "src": "/path/to/srclang1-tgtlang1/train.srclang1", + "tgt": "/path/to/srclang1-tgtlang1/train.tgtlang1" + }, + { + "type": "translation", + "id": 0, + "src": "/path/to/srclang2-tgtlang0/train.srclang2", + "tgt": "/path/to/srclang2-tgtlang0/train.tgtlang0" + }, + { + "type": "translation", + "id": 1, + "src": "/path/to/srclang2-tgtlang1/train.srclang2", + "tgt": "/path/to/srclang2-tgtlang1/train.tgtlang1" + }, + ... + ], + "valid": [ + { + "type": "translation", + "id": 0, + "src": "/unused", + "tgt": "/unused" + } + ] +} +``` +where paths are paths to binarized indexed fairseq dataset files. +`id` represents the target language id. + + +## Training Command Line Example + +``` +fairseq-train \ + /path/to/configfile_described_above.json \ + --user-dir examples/laser/laser_src \ + --log-interval 100 --log-format simple \ + --task laser --arch laser_lstm \ + --save-dir . \ + --optimizer adam \ + --lr 0.001 \ + --lr-scheduler inverse_sqrt \ + --clip-norm 5 \ + --warmup-updates 90000 \ + --update-freq 2 \ + --dropout 0.0 \ + --encoder-dropout-out 0.1 \ + --max-tokens 2000 \ + --max-epoch 50 \ + --encoder-bidirectional \ + --encoder-layers 5 \ + --encoder-hidden-size 512 \ + --decoder-layers 1 \ + --decoder-hidden-size 2048 \ + --encoder-embed-dim 320 \ + --decoder-embed-dim 320 \ + --decoder-lang-embed-dim 32 \ + --warmup-init-lr 0.001 \ + --disable-validation +``` + + +## Applications + +We showcase several applications of multilingual sentence embeddings +with code to reproduce our results (in the directory "tasks"). 
+ +* [**Cross-lingual document classification**](https://github.com/facebookresearch/LASER/tree/master/tasks/mldoc) using the + [*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6] +* [**WikiMatrix**](https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix) + Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7] +* [**Bitext mining**](https://github.com/facebookresearch/LASER/tree/master/tasks/bucc) using the + [*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5] +* [**Cross-lingual NLI**](https://github.com/facebookresearch/LASER/tree/master/tasks/xnli) + using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6] +* [**Multilingual similarity search**](https://github.com/facebookresearch/LASER/tree/master/tasks/similarity) [1,6] +* [**Sentence embedding of text files**](https://github.com/facebookresearch/LASER/tree/master/tasks/embed) + an example of how to calculate sentence embeddings for arbitrary text files in any of the supported languages. + +**For all tasks, we use exactly the same multilingual encoder, without any task-specific optimization or fine-tuning.** + + + +## References + +[1] Holger Schwenk and Matthijs Douze, + [*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619), + ACL workshop on Representation Learning for NLP, 2017. + +[2] Holger Schwenk and Xian Li, + [*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf), + LREC, pages 3548-3551, 2018. + +[3] Holger Schwenk, + [*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037), + ACL, July 2018. + +[4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov, + [*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269), + EMNLP, 2018. + +[5] Mikel Artetxe and Holger Schwenk, + [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136), + arXiv, Nov 3 2018. + +[6] Mikel Artetxe and Holger Schwenk, + [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464), + arXiv, Dec 26 2018. + +[7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, + [*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791), + arXiv, July 11 2019. + +[8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin, + [*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944) diff --git a/examples/laser/laser_src/__init__.py b/examples/laser/laser_src/__init__.py new file mode 100644 index 0000000000..9ffbd656d8 --- /dev/null +++ b/examples/laser/laser_src/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree.
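As a convenience for the JSON configuration format documented in the LASER README above, here is a minimal sketch of a helper that writes such a config file programmatically. The helper name, all paths, and the language-pair list are hypothetical placeholders rather than part of fairseq; the only assumptions taken from the README are the `src_vocab`/`tgt_vocab` keys, the `translation` entry type, integer target-language `id`s, and the placeholder `valid` section.

```python
import json


def write_laser_config(src_vocab, tgt_vocab, train_pairs, out_path):
    """train_pairs: iterable of (src_path, tgt_path, tgt_lang_id) tuples
    pointing at binarized fairseq datasets."""
    config = {
        "src_vocab": src_vocab,
        "tgt_vocab": tgt_vocab,
        "train": [
            {"type": "translation", "id": lang_id, "src": src, "tgt": tgt}
            for src, tgt, lang_id in train_pairs
        ],
        # mirror the README's placeholder "valid" entry (training typically
        # runs with --disable-validation anyway)
        "valid": [
            {"type": "translation", "id": 0, "src": "/unused", "tgt": "/unused"}
        ],
    }
    with open(out_path, "w") as f:
        json.dump(config, f, indent=2)


if __name__ == "__main__":
    write_laser_config(
        "/path/to/spm.src.cvocab",
        "/path/to/spm.tgt.cvocab",
        [
            ("/path/to/srclang1-tgtlang0/train.srclang1",
             "/path/to/srclang1-tgtlang0/train.tgtlang0", 0),
            ("/path/to/srclang1-tgtlang1/train.srclang1",
             "/path/to/srclang1-tgtlang1/train.tgtlang1", 1),
        ],
        "laser_config.json",
    )
```

Note that the `laser` task below derives the number of target-language embeddings from the maximum `id` in the `train` section plus one, so the ids are expected to be small non-negative integers (0, 1, ...).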
+ +from .laser_task import * # noqa +from .laser_lstm import * # noqa +from .laser_transformer import * # noqa diff --git a/examples/laser/laser_src/laser_lstm.py b/examples/laser/laser_src/laser_lstm.py new file mode 100644 index 0000000000..10df90e002 --- /dev/null +++ b/examples/laser/laser_src/laser_lstm.py @@ -0,0 +1,585 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fairseq import options, utils + +from fairseq.models import ( + FairseqEncoder, + FairseqIncrementalDecoder, + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) + + +@register_model("laser_lstm") +class LSTMModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens=None, + tgt_tokens=None, + tgt_lengths=None, + target_language_id=None, + dataset_name="", + ): + assert target_language_id is not None + + src_encoder_out = self.encoder(src_tokens, src_lengths, dataset_name) + return self.decoder( + prev_output_tokens, src_encoder_out, lang_id=target_language_id + ) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--dropout", + default=0.1, + type=float, + metavar="D", + help="dropout probability", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-embed-path", + default=None, + type=str, + metavar="STR", + help="path to pre-trained encoder embedding", + ) + parser.add_argument( + "--encoder-hidden-size", type=int, metavar="N", help="encoder hidden size" + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="number of encoder layers" + ) + parser.add_argument( + "--encoder-bidirectional", + action="store_true", + help="make all layers of encoder bidirectional", + ) + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-embed-path", + default=None, + type=str, + metavar="STR", + help="path to pre-trained decoder embedding", + ) + parser.add_argument( + "--decoder-hidden-size", type=int, metavar="N", help="decoder hidden size" + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="number of decoder layers" + ) + parser.add_argument( + "--decoder-out-embed-dim", + type=int, + metavar="N", + help="decoder output embedding dimension", + ) + parser.add_argument( + "--decoder-zero-init", + type=str, + metavar="BOOL", + help="initialize the decoder hidden/cell state to zero", + ) + parser.add_argument( + "--decoder-lang-embed-dim", + type=int, + metavar="N", + help="decoder language embedding dimension", + ) + parser.add_argument( + "--fixed-embeddings", + action="store_true", + help="keep embeddings fixed (ENCODER ONLY)", + ) # TODO Also apply to decoder embeddings? 
+ + # Granular dropout settings (if not specified these default to --dropout) + parser.add_argument( + "--encoder-dropout-in", + type=float, + metavar="D", + help="dropout probability for encoder input embedding", + ) + parser.add_argument( + "--encoder-dropout-out", + type=float, + metavar="D", + help="dropout probability for encoder output", + ) + parser.add_argument( + "--decoder-dropout-in", + type=float, + metavar="D", + help="dropout probability for decoder input embedding", + ) + parser.add_argument( + "--decoder-dropout-out", + type=float, + metavar="D", + help="dropout probability for decoder output", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + # make sure that all args are properly defaulted (in case there are any new ones) + base_architecture(args) + + def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) + embed_dict = utils.parse_embedding(embed_path) + utils.print_embed_overlap(embed_dict, dictionary) + return utils.load_embedding(embed_dict, dictionary, embed_tokens) + + pretrained_encoder_embed = None + if args.encoder_embed_path: + pretrained_encoder_embed = load_pretrained_embedding_from_file( + args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim + ) + pretrained_decoder_embed = None + if args.decoder_embed_path: + pretrained_decoder_embed = load_pretrained_embedding_from_file( + args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim + ) + + num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0 + + encoder = LSTMEncoder( + dictionary=task.source_dictionary, + embed_dim=args.encoder_embed_dim, + hidden_size=args.encoder_hidden_size, + num_layers=args.encoder_layers, + dropout_in=args.encoder_dropout_in, + dropout_out=args.encoder_dropout_out, + bidirectional=args.encoder_bidirectional, + pretrained_embed=pretrained_encoder_embed, + fixed_embeddings=args.fixed_embeddings, + ) + decoder = LSTMDecoder( + dictionary=task.target_dictionary, + embed_dim=args.decoder_embed_dim, + hidden_size=args.decoder_hidden_size, + out_embed_dim=args.decoder_out_embed_dim, + num_layers=args.decoder_layers, + dropout_in=args.decoder_dropout_in, + dropout_out=args.decoder_dropout_out, + zero_init=options.eval_bool(args.decoder_zero_init), + encoder_embed_dim=args.encoder_embed_dim, + encoder_output_units=encoder.output_units, + pretrained_embed=pretrained_decoder_embed, + num_langs=num_langs, + lang_embed_dim=args.decoder_lang_embed_dim, + ) + return cls(encoder, decoder) + + +class LSTMEncoder(FairseqEncoder): + """LSTM encoder.""" + + def __init__( + self, + dictionary, + embed_dim=512, + hidden_size=512, + num_layers=1, + dropout_in=0.1, + dropout_out=0.1, + bidirectional=False, + left_pad=True, + pretrained_embed=None, + padding_value=0.0, + fixed_embeddings=False, + ): + super().__init__(dictionary) + self.num_layers = num_layers + self.dropout_in = dropout_in + self.dropout_out = dropout_out + self.bidirectional = bidirectional + self.hidden_size = hidden_size + + num_embeddings = len(dictionary) + self.padding_idx = dictionary.pad() + if pretrained_embed is None: + self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx) + else: + self.embed_tokens = pretrained_embed + if fixed_embeddings: + self.embed_tokens.weight.requires_grad = False + + self.lstm = LSTM( + input_size=embed_dim, + hidden_size=hidden_size, + 
num_layers=num_layers, + dropout=self.dropout_out if num_layers > 1 else 0.0, + bidirectional=bidirectional, + ) + self.left_pad = left_pad + self.padding_value = padding_value + + self.output_units = hidden_size + if bidirectional: + self.output_units *= 2 + + def forward(self, src_tokens, src_lengths, dataset_name): + if self.left_pad: + # convert left-padding to right-padding + src_tokens = utils.convert_padding_direction( + src_tokens, + self.padding_idx, + left_to_right=True, + ) + + bsz, seqlen = src_tokens.size() + + # embed tokens + x = self.embed_tokens(src_tokens) + x = F.dropout(x, p=self.dropout_in, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # pack embedded source tokens into a PackedSequence + try: + packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist()) + except BaseException: + raise Exception(f"Packing failed in dataset {dataset_name}") + + # apply LSTM + if self.bidirectional: + state_size = 2 * self.num_layers, bsz, self.hidden_size + else: + state_size = self.num_layers, bsz, self.hidden_size + h0 = x.data.new(*state_size).zero_() + c0 = x.data.new(*state_size).zero_() + packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0)) + + # unpack outputs and apply dropout + x, _ = nn.utils.rnn.pad_packed_sequence( + packed_outs, padding_value=self.padding_value + ) + x = F.dropout(x, p=self.dropout_out, training=self.training) + assert list(x.size()) == [seqlen, bsz, self.output_units] + + if self.bidirectional: + + def combine_bidir(outs): + return torch.cat( + [ + torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view( + 1, bsz, self.output_units + ) + for i in range(self.num_layers) + ], + dim=0, + ) + + final_hiddens = combine_bidir(final_hiddens) + final_cells = combine_bidir(final_cells) + + encoder_padding_mask = src_tokens.eq(self.padding_idx).t() + + # Set padded outputs to -inf so they are not selected by max-pooling + padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1) + if padding_mask.any(): + x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x) + + # Build the sentence embedding by max-pooling over the encoder outputs + sentemb = x.max(dim=0)[0] + + return { + "sentemb": sentemb, + "encoder_out": (x, final_hiddens, final_cells), + "encoder_padding_mask": encoder_padding_mask + if encoder_padding_mask.any() + else None, + } + + def reorder_encoder_out(self, encoder_out_dict, new_order): + encoder_out_dict["sentemb"] = encoder_out_dict["sentemb"].index_select( + 0, new_order + ) + encoder_out_dict["encoder_out"] = tuple( + eo.index_select(1, new_order) for eo in encoder_out_dict["encoder_out"] + ) + if encoder_out_dict["encoder_padding_mask"] is not None: + encoder_out_dict["encoder_padding_mask"] = encoder_out_dict[ + "encoder_padding_mask" + ].index_select(1, new_order) + return encoder_out_dict + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return int(1e5) # an arbitrary large number + + +class LSTMDecoder(FairseqIncrementalDecoder): + """LSTM decoder.""" + + def __init__( + self, + dictionary, + embed_dim=512, + hidden_size=512, + out_embed_dim=512, + num_layers=1, + dropout_in=0.1, + dropout_out=0.1, + zero_init=False, + encoder_embed_dim=512, + encoder_output_units=512, + pretrained_embed=None, + num_langs=1, + lang_embed_dim=0, + ): + super().__init__(dictionary) + self.dropout_in = dropout_in + self.dropout_out = dropout_out + self.hidden_size = hidden_size + + num_embeddings = len(dictionary) + padding_idx = 
dictionary.pad() + if pretrained_embed is None: + self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) + else: + self.embed_tokens = pretrained_embed + + self.layers = nn.ModuleList( + [ + LSTMCell( + input_size=encoder_output_units + embed_dim + lang_embed_dim + if layer == 0 + else hidden_size, + hidden_size=hidden_size, + ) + for layer in range(num_layers) + ] + ) + if hidden_size != out_embed_dim: + self.additional_fc = Linear(hidden_size, out_embed_dim) + self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out) + + if zero_init: + self.sentemb2init = None + else: + self.sentemb2init = Linear( + encoder_output_units, 2 * num_layers * hidden_size + ) + + if lang_embed_dim == 0: + self.embed_lang = None + else: + self.embed_lang = nn.Embedding(num_langs, lang_embed_dim) + nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1) + + def forward( + self, prev_output_tokens, encoder_out_dict, incremental_state=None, lang_id=0 + ): + sentemb = encoder_out_dict["sentemb"] + encoder_out = encoder_out_dict["encoder_out"] + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + bsz, seqlen = prev_output_tokens.size() + + # get outputs from encoder + encoder_outs, _, _ = encoder_out[:3] + srclen = encoder_outs.size(0) + + # embed tokens + x = self.embed_tokens(prev_output_tokens) + x = F.dropout(x, p=self.dropout_in, training=self.training) + + # embed language identifier + if self.embed_lang is not None: + lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id) + langemb = self.embed_lang(lang_ids) + # TODO Should we dropout here??? + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # initialize previous states (or get from cache during incremental generation) + cached_state = utils.get_incremental_state( + self, incremental_state, "cached_state" + ) + if cached_state is not None: + prev_hiddens, prev_cells, input_feed = cached_state + else: + num_layers = len(self.layers) + if self.sentemb2init is None: + prev_hiddens = [ + x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers) + ] + prev_cells = [ + x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers) + ] + else: + init = self.sentemb2init(sentemb) + prev_hiddens = [ + init[:, (2 * i) * self.hidden_size : (2 * i + 1) * self.hidden_size] + for i in range(num_layers) + ] + prev_cells = [ + init[ + :, + (2 * i + 1) * self.hidden_size : (2 * i + 2) * self.hidden_size, + ] + for i in range(num_layers) + ] + input_feed = x.data.new(bsz, self.hidden_size).zero_() + + attn_scores = x.data.new(srclen, seqlen, bsz).zero_() + outs = [] + for j in range(seqlen): + if self.embed_lang is None: + input = torch.cat((x[j, :, :], sentemb), dim=1) + else: + input = torch.cat((x[j, :, :], sentemb, langemb), dim=1) + + for i, rnn in enumerate(self.layers): + # recurrent cell + hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i])) + + # hidden state becomes the input to the next layer + input = F.dropout(hidden, p=self.dropout_out, training=self.training) + + # save state for next time step + prev_hiddens[i] = hidden + prev_cells[i] = cell + + out = hidden + out = F.dropout(out, p=self.dropout_out, training=self.training) + + # input feeding + input_feed = out + + # save final output + outs.append(out) + + # cache previous states (no-op except during incremental generation) + utils.set_incremental_state( + self, + incremental_state, + "cached_state", + (prev_hiddens, prev_cells, input_feed), + ) + + # collect outputs across time steps + x = torch.cat(outs, 
dim=0).view(seqlen, bsz, self.hidden_size) + + # T x B x C -> B x T x C + x = x.transpose(1, 0) + + # srclen x tgtlen x bsz -> bsz x tgtlen x srclen + attn_scores = attn_scores.transpose(0, 2) + + # project back to size of vocabulary + if hasattr(self, "additional_fc"): + x = self.additional_fc(x) + x = F.dropout(x, p=self.dropout_out, training=self.training) + x = self.fc_out(x) + + return x, attn_scores + + def reorder_incremental_state(self, incremental_state, new_order): + super().reorder_incremental_state(incremental_state, new_order) + cached_state = utils.get_incremental_state( + self, incremental_state, "cached_state" + ) + if cached_state is None: + return + + def reorder_state(state): + if isinstance(state, list): + return [reorder_state(state_i) for state_i in state] + return state.index_select(0, new_order) + + new_state = tuple(map(reorder_state, cached_state)) + utils.set_incremental_state(self, incremental_state, "cached_state", new_state) + + def max_positions(self): + """Maximum output length supported by the decoder.""" + return int(1e5) # an arbitrary large number + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.uniform_(m.weight, -0.1, 0.1) + nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def LSTM(input_size, hidden_size, **kwargs): + m = nn.LSTM(input_size, hidden_size, **kwargs) + for name, param in m.named_parameters(): + if "weight" in name or "bias" in name: + param.data.uniform_(-0.1, 0.1) + return m + + +def LSTMCell(input_size, hidden_size, **kwargs): + m = nn.LSTMCell(input_size, hidden_size, **kwargs) + for name, param in m.named_parameters(): + if "weight" in name or "bias" in name: + param.data.uniform_(-0.1, 0.1) + return m + + +def Linear(in_features, out_features, bias=True, dropout=0): + """Weight-normalized Linear layer (input: N x T x C)""" + m = nn.Linear(in_features, out_features, bias=bias) + m.weight.data.uniform_(-0.1, 0.1) + if bias: + m.bias.data.uniform_(-0.1, 0.1) + return m + + +@register_model_architecture("laser_lstm", "laser_lstm") +def base_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_hidden_size = getattr( + args, "encoder_hidden_size", args.encoder_embed_dim + ) + args.encoder_layers = getattr(args, "encoder_layers", 1) + args.encoder_bidirectional = getattr(args, "encoder_bidirectional", False) + args.encoder_dropout_in = getattr(args, "encoder_dropout_in", args.dropout) + args.encoder_dropout_out = getattr(args, "encoder_dropout_out", args.dropout) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_hidden_size = getattr( + args, "decoder_hidden_size", args.decoder_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 1) + args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512) + args.decoder_dropout_in = getattr(args, "decoder_dropout_in", args.dropout) + args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout) + args.decoder_zero_init = getattr(args, "decoder_zero_init", "0") + args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0) + args.fixed_embeddings = getattr(args, "fixed_embeddings", False) diff --git a/examples/laser/laser_src/laser_task.py b/examples/laser/laser_src/laser_task.py new file mode 100644 index 
0000000000..9bf2d7ad81 --- /dev/null +++ b/examples/laser/laser_src/laser_task.py @@ -0,0 +1,334 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from collections import OrderedDict, defaultdict +import json +import os +import logging +from argparse import ArgumentError + +from fairseq import options, models +from fairseq.data import ( + data_utils, + Dictionary, + LanguagePairDataset, + IndexedDataset, + FairseqDataset, +) +from .multitask_data_utils import ( + MultitaskDatasetWrapper, + MultidatasetEpochBatchIterator, +) + + +from fairseq.tasks import LegacyFairseqTask, register_task + +logger = logging.getLogger(__name__) + + +@register_task("laser") +class LaserTask(LegacyFairseqTask): + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument( + "configfile", metavar="PATH", help="dataset configuration file in json" + ) + parser.add_argument( + "--weighting-alpha", + type=float, + default=None, + help="alpha for automatic weighting", + ) + parser.add_argument( + "--raw-text", action="store_true", help="load raw text dataset" + ) + parser.add_argument( + "--left-pad-source", + default="True", + type=str, + metavar="BOOL", + help="pad the source on the left (default: True)", + ) + parser.add_argument( + "--left-pad-target", + default="False", + type=str, + metavar="BOOL", + help="pad the target on the left (default: False)", + ) + try: + parser.add_argument( + "--max-source-positions", + default=1024, + type=int, + metavar="N", + help="max number of tokens in the source sequence", + ) + parser.add_argument( + "--max-target-positions", + default=1024, + type=int, + metavar="N", + help="max number of tokens in the target sequence", + ) + except ArgumentError: + # this might have already been defined. Once we transition this to hydra it should be fine to add it here. 
+ pass + + def __init__(self, args, config, src_dictionary, tgt_dictionary, num_tasks): + super().__init__(args) + self.config = config + self.src_dictionary = src_dictionary + self.tgt_dictionary = tgt_dictionary + self.num_tasks = num_tasks + + @classmethod + def setup_task(cls, args, **kwargs): + with open(args.configfile, "r") as f: + config = json.load(f) + num_tasks = max(dataset["id"] for dataset in config["train"]) + 1 + + args.left_pad_source = options.eval_bool(args.left_pad_source) + args.left_pad_target = options.eval_bool(args.left_pad_target) + + src_dictionary = Dictionary.load(config["src_vocab"]) + tgt_dictionary = Dictionary.load(config["tgt_vocab"]) + + logger.info( + "| src Dictionary {} : {} types".format( + config["src_vocab"], len(src_dictionary) + ) + ) + logger.info( + "| tgt Dictionary {} : {} types".format( + config["tgt_vocab"], len(tgt_dictionary) + ) + ) + + return cls(args, config, src_dictionary, tgt_dictionary, num_tasks) + + # Experimental overriding for backtranslation + def build_model(self, args, from_checkpoint=False): + model = models.build_model(args, self) + return model + + def dataset(self, split): + if split not in self.datasets: + raise KeyError("Dataset not loaded: " + split) + return self.datasets[split] + + def load_dataset(self, split, epoch=1, **kwargs): + """Load a dataset split.""" + + def indexed_dataset(path, dictionary): + if self.args.raw_text: + raise Exception("Unable to handle raw text.") + dataset = IndexedDataset(path, fix_lua_indexing=True) + + return dataset + + pair_datasets = OrderedDict() + + if split == "valid": + self.datasets[split] = pair_datasets + return + + if split not in self.config: + raise FileNotFoundError( + "Dataset not found in config file: {}".format(split) + ) + + size_by_corpus = defaultdict(int) + size_sum = 0 + size_sum_with_subsampling = 0 + init_pair_datasets = {} + + for dataset_config in self.config[split]: + src_path = os.path.dirname(dataset_config["src"]) + corpus_name = src_path.split("/")[-2] + language_pair_name = src_path.split("/")[-1] + pair_datasets_key = corpus_name + "-" + language_pair_name + + logger.info(f"loading... {pair_datasets_key}") + if "src" in dataset_config: + src_dataset = indexed_dataset( + dataset_config["src"], self.src_dictionary + ) + else: + src_dataset = None + + if "tgt" in dataset_config: + tgt_dataset = indexed_dataset( + dataset_config["tgt"], self.tgt_dictionary + ) + else: + tgt_dataset = None + + dataset = LanguagePairDataset( + src_dataset, + src_dataset.sizes, + self.src_dictionary, + tgt_dataset, + tgt_dataset.sizes, + self.tgt_dictionary, + left_pad_source=self.args.left_pad_source, + left_pad_target=self.args.left_pad_target, + ) + + if pair_datasets_key in init_pair_datasets: + logger.warning( + f"Ignoring already added {pair_datasets_key}. " + f"Consider using `sample` key in order to upsample." 
+ ) + else: + init_pair_datasets[pair_datasets_key] = { + "dataset": dataset, + "sample": dataset_config.get("sample", None), + "id": dataset_config.get("id", None), + "len": len(dataset), + } + + length_sum = 0 + weighted_freqs_sum = 0 + freq_per_dataset = {} + vmax = 0 + vmin = 1 + weighted_freq_per_dataset = {} + + if self.args.weighting_alpha: + for key in init_pair_datasets: + if init_pair_datasets[key]["sample"] is None: + length_sum += len(init_pair_datasets[key]["dataset"]) + + for key in init_pair_datasets: + if init_pair_datasets[key]["sample"] is None: + val = float(init_pair_datasets[key]["len"]) / length_sum + freq_per_dataset[key] = val + weighted_freqs_sum += val ** self.args.weighting_alpha + + for key in freq_per_dataset: + val = ( + freq_per_dataset[key] ** self.args.weighting_alpha + / weighted_freqs_sum + ) + vmin = min(vmin, val) + vmax = max(vmax, val) + weighted_freq_per_dataset[key] = val + + for pair_datasets_key in init_pair_datasets: + dataset_config = init_pair_datasets[pair_datasets_key] + dataset = dataset_config["dataset"] + sample = dataset_config["sample"] + if sample is None: + sample = 1.0 + + if pair_datasets_key in weighted_freq_per_dataset: + w = vmax / weighted_freq_per_dataset[pair_datasets_key] + sample = w + + sample = round(sample) + + initial_sample = sample + initial_pair_datasets_key = pair_datasets_key + + while sample >= 1.0: + assert ( + pair_datasets_key not in pair_datasets + ), f"{pair_datasets_key} already in" + size_sum_with_subsampling += len(dataset) + pair_datasets[pair_datasets_key] = MultitaskDatasetWrapper( + dataset, dataset_config.get("id", 0), 1.0, name=pair_datasets_key + ) + size_sum += len(dataset) + sample -= 1.0 + pair_datasets_key += "-up" + + assert sample < 1e-6, f"sample remains > 0 {pair_datasets_key}" + + logger.info( + f"added pair {initial_pair_datasets_key} length {len(dataset)} new_length = {len(dataset)*initial_sample}" + ) + size_by_corpus[corpus_name] += len(dataset) + + self.datasets[split] = pair_datasets + logger.info( + f"Datasets number = {len(self.datasets[split])} size = {size_sum} size_sum_with_subsampling = {size_sum_with_subsampling}" + ) + + @property + def source_dictionary(self): + return self.src_dictionary + + @property + def target_dictionary(self): + return self.tgt_dictionary + + def get_batch_iterator( + self, + dataset, + max_tokens=None, + max_sentences=None, + max_positions=None, + ignore_invalid_inputs=False, + required_batch_size_multiple=1, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=1, + data_buffer_size=0, + disable_iterator_cache=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, + **kwargs, + ): + + assert isinstance(dataset, OrderedDict) + assert len(dataset) + assert isinstance(dataset[next(iter(dataset))], FairseqDataset) + + # initialize the dataset with the correct starting epoch + for _, dt in dataset.items(): + dt.set_epoch(epoch) + + indices = OrderedDict() + batch_sampler = OrderedDict() + + with data_utils.numpy_seed(seed + epoch): + for key, dt in dataset.items(): + logger.info(f"\t ordered_indices {key}") + indices[key] = dt.ordered_indices() + + # filter examples that are too large + if max_positions is not None: + for key, dt in dataset.items(): + logger.info(f"\t filter_by_size {key}") + indices[key], ignored = dt.filter_indices_by_size( + indices[key], max_positions + ) + + for key, dt in dataset.items(): + logger.info(f"\t batch_by_size {key}") + batch_sampler[key] = data_utils.batch_by_size( + indices[key], + dt.num_tokens, + 
max_tokens=max_tokens, + max_sentences=max_sentences, + required_batch_size_multiple=required_batch_size_multiple, + ) + + epoch_iter = MultidatasetEpochBatchIterator( + dataset=dataset, + batch_sampler=batch_sampler, + seed=seed, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + epoch=epoch, + ) + + return epoch_iter diff --git a/examples/laser/laser_src/laser_transformer.py b/examples/laser/laser_src/laser_transformer.py new file mode 100644 index 0000000000..0be030994f --- /dev/null +++ b/examples/laser/laser_src/laser_transformer.py @@ -0,0 +1,354 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from typing import Any, Dict, List, Optional +from torch import Tensor + +import torch +import torch.nn as nn + +from fairseq.models import ( + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + base_architecture, + Embedding, + TransformerModel, + TransformerEncoder, + TransformerDecoder, +) +from fairseq.modules import ( + TransformerDecoderLayer, +) + +logger = logging.getLogger(__name__) + + +@register_model("laser_transformer") +class LaserTransformerModel(FairseqEncoderDecoderModel): + """Train Transformer for LASER task + + Requires --task laser + """ + + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens=None, + tgt_tokens=None, + tgt_lengths=None, + target_language_id=-1, + dataset_name="", + ): + laser_encoder_out = self.encoder(src_tokens, src_lengths) + return self.decoder( + prev_output_tokens, laser_encoder_out, lang_id=target_language_id + ) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + TransformerModel.add_args(parser) + parser.add_argument( + "--decoder-lang-embed-dim", + type=int, + metavar="N", + help="decoder language embedding dimension", + ) + + @classmethod + def build_model(cls, args, task): + base_laser_transformer_architecture(args) + + num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0 + + def load_embed_tokens(dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + + return Embedding(num_embeddings, embed_dim, padding_idx) + + encoder_embed_tokens = load_embed_tokens( + task.source_dictionary, args.encoder_embed_dim + ) + decoder_embed_tokens = load_embed_tokens( + task.target_dictionary, args.decoder_embed_dim + ) + num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0 + + encoder = LaserTransformerEncoder( + args, task.source_dictionary, encoder_embed_tokens + ) + + decoder = LaserTransformerDecoder( + args, + task.target_dictionary, + decoder_embed_tokens, + num_langs=num_langs, + lang_embed_dim=args.decoder_lang_embed_dim, + ) + + return cls(encoder, decoder) + + +class LaserTransformerEncoder(TransformerEncoder): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, src_tokens, *args, **kwargs): + encoder_out = super().forward(src_tokens, *args, **kwargs) + + x = encoder_out["encoder_out"][0] # T x B x C + padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1) + + if padding_mask.any(): + x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x) + + # Build the sentence embedding by max-pooling over the encoder outputs + sentemb = x.max(dim=0)[0] 
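+ # sentemb: B x C; padded positions were filled with -inf above, so max-pooling over the time dimension only sees real token states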
+ + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `foward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. + return {"sentemb": [sentemb]} # B x C + + @torch.jit.export + def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): + """ + Same as the one in transformer.py, with new_sentemb + """ + if len(encoder_out["sentemb"]) == 0: + new_sentemb = [] + else: + new_sentemb = [encoder_out["sentemb"][0].index_select(0, new_order)] + + return { + "sentemb": new_sentemb, # B x C + } + + +class LaserTransformerDecoder(TransformerDecoder): + def __init__(self, args, dictionary, *kargs, **kwargs): + self.num_langs = kwargs.get("num_langs", 1) + self.lang_embed_dim = kwargs.get("lang_embed_dim", 0) + kwargs.pop("num_langs", None) + kwargs.pop("lang_embed_dim", None) + + super().__init__(args, dictionary, *kargs, **kwargs, no_encoder_attn=True) + + if self.lang_embed_dim == 0: + self.embed_lang = None + else: + self.embed_lang = nn.Embedding(self.num_langs, self.lang_embed_dim) + nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1) + + if self.output_projection is not None: + laser_output_embed_dim = ( + self.output_embed_dim + self.lang_embed_dim + args.encoder_embed_dim + ) + self.output_projection = nn.Linear( + laser_output_embed_dim, len(dictionary), bias=False + ) + nn.init.normal_( + self.output_projection.weight, + mean=0, + std=laser_output_embed_dim ** -0.5, + ) + + def build_decoder_layer(self, args, no_encoder_attn=False): + decoder_embed_dim = args.decoder_embed_dim + args.decoder_embed_dim = ( + decoder_embed_dim + self.lang_embed_dim + args.encoder_embed_dim + ) + res = TransformerDecoderLayer(args, no_encoder_attn=True) + args.decoder_embed_dim = decoder_embed_dim + + return res + + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + lang_id: Optional[int] = None, + ): + """ + Similar to *forward* but only return features. + + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). 
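+ lang_id (int, optional): target language index; used to look up the language embedding that is concatenated to each decoder input step (LASER-specific).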
+ + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + # embed positions + positions = ( + self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + if self.embed_positions is not None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + bsz, seqlen = prev_output_tokens.size() + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + if self.embed_lang is not None: + lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id) + langemb = self.embed_lang(lang_ids) + langemb = langemb.unsqueeze(0) + repeat_vals = [x.shape[0] // langemb.shape[0]] + [-1] * ( + len(langemb.shape) - 1 + ) + x = torch.cat((x, langemb.expand(*repeat_vals)), dim=-1) + + sentemb = encoder_out["sentemb"][0] + sentemb = sentemb.unsqueeze(0) + + repeat_vals = [x.shape[0] // sentemb.shape[0]] + [-1] * (len(sentemb.shape) - 1) + x = torch.cat((x, sentemb.expand(*repeat_vals)), dim=-1) + + self_attn_padding_mask: Optional[Tensor] = None + if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + + # decoder layers + attn: Optional[Tensor] = None + inner_states: List[Optional[Tensor]] = [x] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, _ = layer( + x, + None, + None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": [attn], "inner_states": inner_states} + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + lang_id: Optional[int] = None, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, 
optional): only return features without + applying output layer (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + assert lang_id is not None + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + lang_id=lang_id, + ) + if not features_only: + x = self.output_layer(x) + return x, extra + + +@register_model_architecture("laser_transformer", "laser_transformer") +def base_laser_transformer_architecture(args): + base_architecture(args) + args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0) diff --git a/examples/laser/laser_src/multitask_data_utils.py b/examples/laser/laser_src/multitask_data_utils.py new file mode 100644 index 0000000000..b05caea267 --- /dev/null +++ b/examples/laser/laser_src/multitask_data_utils.py @@ -0,0 +1,143 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from collections import OrderedDict + +import numpy as np + +from fairseq.data import BaseWrapperDataset, FairseqDataset, iterators + + +class MultiItr(object): + def __init__(self, itr): + self.itr = itr + self._counts = [0 for x in itr] + + def __len__(self): + return sum(len(itr) for itr in self.itr) + + def __iter__(self): + return self + + def __next__(self): + ratios = [count / len(itr) for count, itr in zip(self._counts, self.itr)] + idx = ratios.index(min(ratios)) + self._counts[idx] += 1 + return next(self.itr[idx]) + + +class MultidatasetEpochBatchIterator(iterators.EpochBatchIterating): + """A wrapper around multiple epoch batch iterators.""" + + def __init__( + self, + dataset, + batch_sampler, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=1, + ): + + assert isinstance(dataset, OrderedDict) + assert len(dataset) + assert isinstance(dataset[next(iter(dataset))], FairseqDataset) + + self.iterators = [] + + self.epoch = epoch + for key, dt in dataset.items(): + epoch_iter = iterators.EpochBatchIterator( + dataset=dt, + collate_fn=dt.collater, + batch_sampler=batch_sampler[key], + seed=seed, + num_shards=num_shards, + shard_id=shard_id, + num_workers=0, + epoch=epoch, + ) + self.iterators.append(epoch_iter) + + def __len__(self): + return sum(len(itr) for itr in self.iterators) + + def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): + # `self.epoch += 1` should be handled by underlying `EpochBatchIterator`s. 
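+ # MultiItr interleaves the per-dataset iterators, always drawing the next batch from the iterator that is proportionally furthest behind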
+ return MultiItr( + [ + itr.next_epoch_itr( + shuffle=shuffle, fix_batches_to_gpus=fix_batches_to_gpus + ) + for itr in self.iterators + ] + ) + + def end_of_epoch(self): + return all(itr.end_of_epoch() for itr in self.iterators) + + @property + def next_epoch_idx(self): + """Return the epoch index after *next_epoch_itr* is called.""" + + epochs = [itr.next_epoch_idx for itr in self.iterators] + self.epoch = epochs[0] + assert all(epoch == self.epoch for epoch in epochs) + + return self.epoch + + @property + def iterations_in_epoch(self): + return sum(itr.iterations_in_epoch for itr in self.iterators) + + def state_dict(self): + return { + "iterators": [it.state_dict() for it in self.iterators], + "epoch": self.epoch, + } + + def load_state_dict(self, state_dict): + self.epoch = state_dict["epoch"] + for it, d in zip(self.iterators, state_dict["iterators"]): + it.load_state_dict(d) + + +class MultitaskDatasetWrapper(BaseWrapperDataset): + """A wrapper for a multitask dataset.""" + + def __init__(self, dataset, target_language_id, sample=1.0, name=""): + super().__init__(dataset) + self.target_language_id = target_language_id + self.sample = sample + self.name = name + + def collater(self, *args, **kwargs): + ans = self.dataset.collater(*args, **kwargs) + if "net_input" in ans: + ans["net_input"]["target_language_id"] = self.target_language_id + ans["net_input"]["dataset_name"] = self.name + return ans + + def num_tokens(self, *args, **kwargs): + return self.dataset.num_tokens(*args, **kwargs) + + def ordered_indices(self, *args, **kwargs): + indices = self.dataset.ordered_indices(*args, **kwargs) + # Hacky solution for sampling + size = int(self.sample * indices.shape[0]) + + return indices.take(np.sort(np.random.permutation(indices.shape[0])[:size])) + + def size(self, index: int): + return self.dataset.size(index) + + @property + def supports_prefetch(self): + """Whether this dataset supports prefetching.""" + return getattr(self.dataset, "supports_prefetch", False) + + def prefetch(self, indices): + return self.dataset.prefetch(indices) diff --git a/examples/latent_depth/README.md b/examples/latent_depth/README.md index bc78ca8055..7774c33305 100644 --- a/examples/latent_depth/README.md +++ b/examples/latent_depth/README.md @@ -14,7 +14,7 @@ lang_pairs_str="eng-aze,eng-bel,eng-ces,eng-glg,eng-por,eng-rus,eng-slk,eng-tur" databin_dir= fairseq-train ${databin_dir} \ - --user-dir, examples/latent_depth/src \ + --user-dir examples/latent_depth/latent_depth_src \ --lang-pairs "${lang_pairs_str}" \ --arch multilingual_transformer_iwslt_de_en \ --task multilingual_translation_latent_depth \ @@ -25,12 +25,12 @@ fairseq-train ${databin_dir} \ --share-decoder-input-output-embed \ --dropout 0.3 --attention-dropout 0.3 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler inverse_sqrt --min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \ + --lr-scheduler inverse_sqrt --stop-min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \ --max-tokens 4096 --update-freq 1 \ --lr 0.0015 \ --clip-norm 1.0 \ --seed 2 \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --encoder-layers 12 \ --decoder-layers 24 \ --decoder-latent-layer \ diff --git a/examples/latent_depth/src/__init__.py b/examples/latent_depth/latent_depth_src/__init__.py similarity index 100% rename from examples/latent_depth/src/__init__.py rename to examples/latent_depth/latent_depth_src/__init__.py diff --git a/examples/latent_depth/latent_depth_src/loss/__init__.py 
b/examples/latent_depth/latent_depth_src/loss/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/latent_depth/src/loss/latent_depth.py b/examples/latent_depth/latent_depth_src/loss/latent_depth.py similarity index 100% rename from examples/latent_depth/src/loss/latent_depth.py rename to examples/latent_depth/latent_depth_src/loss/latent_depth.py diff --git a/examples/latent_depth/latent_depth_src/models/__init__.py b/examples/latent_depth/latent_depth_src/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/latent_depth/src/models/latent_multilingual_transformer.py b/examples/latent_depth/latent_depth_src/models/latent_multilingual_transformer.py similarity index 93% rename from examples/latent_depth/src/models/latent_multilingual_transformer.py rename to examples/latent_depth/latent_depth_src/models/latent_multilingual_transformer.py index 12b7e67d03..9e7b655fee 100644 --- a/examples/latent_depth/src/models/latent_multilingual_transformer.py +++ b/examples/latent_depth/latent_depth_src/models/latent_multilingual_transformer.py @@ -10,6 +10,7 @@ TransformerEncoder, base_architecture, ) +from fairseq.utils import safe_hasattr from .latent_transformer import LatentTransformerDecoder, LatentTransformerEncoder @@ -40,14 +41,14 @@ def add_args(parser): @classmethod def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs): if is_encoder: - if hasattr(args, "encoder_latent_layer") and args.encoder_latent_layer: + if safe_hasattr(args, "encoder_latent_layer") and args.encoder_latent_layer: return LatentTransformerEncoder( args, lang_dict, embed_tokens, num_logits=len(langs) ) else: return TransformerEncoder(args, lang_dict, embed_tokens) else: - if hasattr(args, "decoder_latent_layer") and args.decoder_latent_layer: + if safe_hasattr(args, "decoder_latent_layer") and args.decoder_latent_layer: return LatentTransformerDecoder( args, lang_dict, embed_tokens, num_logits=len(langs) ) diff --git a/examples/latent_depth/src/models/latent_transformer.py b/examples/latent_depth/latent_depth_src/models/latent_transformer.py similarity index 100% rename from examples/latent_depth/src/models/latent_transformer.py rename to examples/latent_depth/latent_depth_src/models/latent_transformer.py diff --git a/examples/latent_depth/latent_depth_src/modules/__init__.py b/examples/latent_depth/latent_depth_src/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/latent_depth/src/modules/latent_layers.py b/examples/latent_depth/latent_depth_src/modules/latent_layers.py similarity index 100% rename from examples/latent_depth/src/modules/latent_layers.py rename to examples/latent_depth/latent_depth_src/modules/latent_layers.py diff --git a/examples/latent_depth/src/multilingual_translation_latent_depth.py b/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py similarity index 98% rename from examples/latent_depth/src/multilingual_translation_latent_depth.py rename to examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py index b5cd51a470..8cc2a7174b 100644 --- a/examples/latent_depth/src/multilingual_translation_latent_depth.py +++ b/examples/latent_depth/latent_depth_src/multilingual_translation_latent_depth.py @@ -5,6 +5,7 @@ from fairseq.tasks import register_task from fairseq.tasks.multilingual_translation import MultilingualTranslationTask +from fairseq.utils import safe_hasattr from .loss.latent_depth import LatentLayersKLLoss, 
LatentLayersSparsityLoss @@ -174,14 +175,14 @@ def inference_step( @property def encoder_latent_layer(self): return ( - hasattr(self.args, "encoder_latent_layer") + safe_hasattr(self.args, "encoder_latent_layer") and self.args.encoder_latent_layer ) @property def decoder_latent_layer(self): return ( - hasattr(self.args, "decoder_latent_layer") + safe_hasattr(self.args, "decoder_latent_layer") and self.args.decoder_latent_layer ) diff --git a/examples/layerdrop/README.md b/examples/layerdrop/README.md index 394e710b0f..4d48ee9615 100644 --- a/examples/layerdrop/README.md +++ b/examples/layerdrop/README.md @@ -126,9 +126,9 @@ This model override command overrides the training parameters and updates the mo Looking to reproduce the results in the paper? -1. For Translation on WMT16 en-de, we followed this setting [here](https://github.com/pytorch/fairseq/blob/master/examples/scaling_nmt/README.md) -2. To train RoBERTa, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta) -3. To train Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/language_model) +1. For Translation on WMT16 en-de, we followed this setting [here](https://github.com/pytorch/fairseq/blob/main/examples/scaling_nmt/README.md) +2. To train RoBERTa, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/roberta) +3. To train Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model) ## Tips diff --git a/examples/linformer/README.md b/examples/linformer/README.md index cedd667835..f8b36bc691 100644 --- a/examples/linformer/README.md +++ b/examples/linformer/README.md @@ -6,7 +6,7 @@ This example contains code to train Linformer models as described in our paper ## Training a new Linformer RoBERTa model You can mostly follow the [RoBERTa pretraining README](/examples/roberta/README.pretraining.md), -updating your training command with `--user-dir examples/linformer/src --arch linformer_roberta_base`. +updating your training command with `--user-dir examples/linformer/linformer_src --arch linformer_roberta_base`. ## Citation diff --git a/examples/linformer/src/__init__.py b/examples/linformer/linformer_src/__init__.py similarity index 100% rename from examples/linformer/src/__init__.py rename to examples/linformer/linformer_src/__init__.py diff --git a/examples/linformer/linformer_src/models/__init__.py b/examples/linformer/linformer_src/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/linformer/linformer_src/models/linformer_roberta.py b/examples/linformer/linformer_src/models/linformer_roberta.py new file mode 100644 index 0000000000..b7bdbb1105 --- /dev/null +++ b/examples/linformer/linformer_src/models/linformer_roberta.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+""" +Linformer: Self-Attention with Linear Complexity +""" + +import logging + +import torch +from fairseq import utils +from fairseq.models import register_model, register_model_architecture +from fairseq.models.roberta import ( + init_bert_params, + roberta_base_architecture, + roberta_large_architecture, + RobertaEncoder, + RobertaModel, +) +from fairseq.utils import safe_hasattr + +from ..modules.linformer_sentence_encoder import LinformerTransformerEncoder + + +logger = logging.getLogger(__name__) + + +@register_model("linformer_roberta") +class LinformerModel(RobertaModel): + @staticmethod + def add_args(parser): + RobertaModel.add_args(parser) + + # add args for Linformer + parser.add_argument( + "--compressed", type=int, help="compressed ratio of sequence length" + ) + parser.add_argument( + "--shared-kv-compressed", + type=int, + help="share compressed matrix between k and v, in each layer", + ) + parser.add_argument( + "--shared-layer-kv-compressed", + type=int, + help="share compressed matrix between k and v and across all layers", + ) + parser.add_argument( + "--freeze-compress", + type=int, + help="freeze the parameters in compressed layer", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present + base_architecture(args) + + if not safe_hasattr(args, "max_positions"): + args.max_positions = args.tokens_per_sample + + encoder = LinformerEncoder(args, task.source_dictionary) + return cls(args, encoder) + + +class LinformerEncoder(RobertaEncoder): + """Linformer encoder.""" + + def __init__(self, args, dictionary): + super().__init__(args, dictionary) + self.register_buffer("version", torch.tensor(2)) + + def build_encoder(self, args, dictionary, embed_tokens): + encoder = LinformerTransformerEncoder(args, dictionary, embed_tokens) + encoder.apply(init_bert_params) + return encoder + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + prefix = name + "." 
if name != "" else "" + + # some old checkpoints had weight sharing implemented incorrectly + # (note: this was correct in the original paper code) + if utils.item(state_dict.get(f"{prefix}version", torch.tensor(1))) < 2: + state_dict[f"{prefix}version"] = torch.tensor(1) + # check if input embeddings and output embeddings were tied + if not torch.allclose( + state_dict[f"{prefix}sentence_encoder.embed_tokens.weight"], + state_dict[f"{prefix}lm_head.weight"], + ): + # they weren't tied, re-init the LM head without weight sharing + self.lm_head = self.build_lm_head( + embed_dim=self.args.encoder_embed_dim, + output_dim=len(self.dictionary), + activation_fn=self.args.activation_fn, + weight=None, # don't share weights + ) + + +@register_model_architecture("linformer_roberta", "linformer_roberta") +def base_architecture(args): + args.compressed = getattr(args, "compressed", 4) + args.shared_kv_compressed = getattr(args, "shared_kv_compressed", 0) + args.shared_layer_kv_compressed = getattr(args, "shared_layer_kv_compressed", 0) + args.freeze_compress = getattr(args, "freeze_compress", 0) + roberta_base_architecture(args) + + +@register_model_architecture("linformer_roberta", "linformer_roberta_base") +def linformer_roberta_base_architecture(args): + base_architecture(args) + + +@register_model_architecture("linformer_roberta", "linformer_roberta_large") +def linformer_roberta_large_architecture(args): + roberta_large_architecture(args) + base_architecture(args) diff --git a/examples/linformer/linformer_src/modules/__init__.py b/examples/linformer/linformer_src/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py b/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py new file mode 100644 index 0000000000..44f7989bd8 --- /dev/null +++ b/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch.nn as nn +from fairseq.models.transformer import TransformerEncoder + +from .linformer_sentence_encoder_layer import LinformerTransformerEncoderLayer + + +class LinformerTransformerEncoder(TransformerEncoder): + """ + Implementation for a Bi-directional Linformer based Sentence Encoder used + in BERT/XLM style pre-trained models. + + This first computes the token embedding using the token embedding matrix, + position embeddings (if specified) and segment embeddings + (if specified). After applying the specified number of + LinformerEncoderLayers, it outputs all the internal states of the + encoder as well as the final representation associated with the first + token (usually CLS token). + + Input: + - tokens: B x T matrix representing sentences + - segment_labels: B x T matrix representing segment label for tokens + + Output: + - a tuple of the following: + - a list of internal model states used to compute the + predictions where each tensor has shape T x B x C + - sentence representation associated with first input token + in format B x C. 
+ """ + + def __init__(self, args, dictionary, embed_tokens): + self.compress_layer = None + super().__init__(args, dictionary, embed_tokens) + + def build_encoder_layer(self, args): + if self.args.shared_layer_kv_compressed == 1 and self.compress_layer is None: + compress_layer = nn.Linear( + self.args.max_positions, + self.args.max_positions // self.args.compressed, + ) + # intialize parameters for compressed layer + nn.init.xavier_uniform_(compress_layer.weight, gain=1 / math.sqrt(2)) + if self.args.freeze_compress == 1: + compress_layer.weight.requires_grad = False + self.compress_layer = compress_layer + + return LinformerTransformerEncoderLayer(args, self.compress_layer) diff --git a/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py b/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py new file mode 100644 index 0000000000..7e2caa0340 --- /dev/null +++ b/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py @@ -0,0 +1,65 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from fairseq import utils +from fairseq.modules import TransformerEncoderLayer + +from .multihead_linear_attention import MultiheadLinearAttention + + +class LinformerTransformerEncoderLayer(TransformerEncoderLayer): + """ + Implements a Linformer Encoder Layer used in BERT/XLM style pre-trained + models. + """ + + def __init__(self, args, shared_compress_layer): + # wrap in a list so it's not automatically registered by PyTorch + self.shared_compress_layer = [shared_compress_layer] + + super().__init__(args) + + self.register_buffer("version", torch.tensor(2)) + + def build_self_attention(self, embed_dim, args): + return MultiheadLinearAttention( + embed_dim, + args.encoder_attention_heads, + dropout=args.dropout, + self_attention=True, + q_noise=args.quant_noise_pq, + qn_block_size=args.quant_noise_pq_block_size, + compressed=args.compressed, + max_seq_len=args.max_positions, + shared_kv_compressed=args.shared_kv_compressed, + shared_compress_layer=self.shared_compress_layer[0], + freeze_compress=args.freeze_compress, + ) + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + prefix = name + "." 
if name != "" else "" + + # some old checkpoints had weight sharing implemented incorrectly + # (note: this was correct in the original paper code) + if utils.item(state_dict.get(f"{prefix}version", torch.tensor(1))) < 2: + state_dict[f"{prefix}version"] = torch.tensor(1) + # check compression layer sharing + if f"{prefix}shared_compress_layer.weight" in state_dict: + # reinitialize block without sharing compression layer to match + # old behavior + self.shared_compress_layer = [ + torch.nn.Linear( + self.shared_compress_layer[0].weight.size(1), + self.shared_compress_layer[0].weight.size(0), + ) + ] + self.self_attn = self.build_self_attention(self.embed_dim, self.args) + # delete shared_compress_layer, since it's already copied to + # self_attn.compress_k.weight + del state_dict[f"{prefix}shared_compress_layer.weight"] + if f"{prefix}shared_compress_layer.bias" in state_dict: + del state_dict[f"{prefix}shared_compress_layer.bias"] diff --git a/examples/linformer/src/modules/multihead_linear_attention.py b/examples/linformer/linformer_src/modules/multihead_linear_attention.py similarity index 99% rename from examples/linformer/src/modules/multihead_linear_attention.py rename to examples/linformer/linformer_src/modules/multihead_linear_attention.py index ba2c36b1ef..6be1007279 100644 --- a/examples/linformer/src/modules/multihead_linear_attention.py +++ b/examples/linformer/linformer_src/modules/multihead_linear_attention.py @@ -111,14 +111,10 @@ def __init__( self.compress_v.weight.requires_grad = False self.onnx_trace = False - self.tpu = False def prepare_for_onnx_export_(self): self.onnx_trace = True - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - def reset_parameters(self): if self.qkv_same_dim: # Empirically observed the convergence to be much better with diff --git a/examples/linformer/src/models/linformer_roberta.py b/examples/linformer/src/models/linformer_roberta.py deleted file mode 100644 index 913351f238..0000000000 --- a/examples/linformer/src/models/linformer_roberta.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-""" -Linformer: Self-Attention with Linear Complexity -""" - -import logging - -from fairseq.models import register_model, register_model_architecture -from fairseq.models.roberta import RobertaEncoder, RobertaModel - -from ..modules.linformer_sentence_encoder import LinformerSentenceEncoder - - -logger = logging.getLogger(__name__) - - -@register_model("linformer_roberta") -class LinformerModel(RobertaModel): - @staticmethod - def add_args(parser): - RobertaModel.add_args(parser) - - # add args for Linformer - parser.add_argument( - "--compressed", type=int, help="compressed ratio of sequence length" - ) - parser.add_argument( - "--shared-kv-compressed", - type=int, - help="share compressed matrix between k and v, in each layer", - ) - parser.add_argument( - "--shared-layer-kv-compressed", - type=int, - help="share compressed matrix between k and v and across all layers", - ) - parser.add_argument( - "--freeze-compress", - type=int, - help="freeze the parameters in compressed layer", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present - base_architecture(args) - - if not hasattr(args, "max_positions"): - args.max_positions = args.tokens_per_sample - - encoder = LinformerEncoder(args, task.source_dictionary) - return cls(args, encoder) - - -class LinformerEncoder(RobertaEncoder): - """Linformer encoder.""" - - def __init__(self, args, dictionary): - super().__init__(args, dictionary) - - self.sentence_encoder = LinformerSentenceEncoder( - padding_idx=dictionary.pad(), - vocab_size=len(dictionary), - num_encoder_layers=args.encoder_layers, - embedding_dim=args.encoder_embed_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - layerdrop=args.encoder_layerdrop, - max_seq_len=args.max_positions, - num_segments=0, - encoder_normalize_before=True, - apply_bert_init=True, - activation_fn=args.activation_fn, - q_noise=args.quant_noise_pq, - qn_block_size=args.quant_noise_pq_block_size, - compressed=args.compressed, - shared_kv_compressed=args.shared_kv_compressed, - shared_layer_kv_compressed=args.shared_layer_kv_compressed, - freeze_compress=args.freeze_compress, - ) - - -@register_model_architecture("linformer_roberta", "linformer_roberta") -def base_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - args.compressed = getattr(args, "compressed", 4) - args.shared_kv_compressed = getattr(args, "shared_kv_compressed", 0) - args.shared_layer_kv_compressed = getattr(args, "shared_layer_kv_compressed", 0) - args.freeze_compress = getattr(args, "freeze_compress", 0) - - 
-@register_model_architecture("linformer_roberta", "linformer_roberta_base") -def linformer_roberta_base_architecture(args): - base_architecture(args) - - -@register_model_architecture("linformer_roberta", "linformer_roberta_large") -def linformer_roberta_large_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 24) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - args.compressed = getattr(args, "compressed", 4) - args.shared_kv_compressed = getattr(args, "shared_kv_compressed", 0) - args.shared_layer_kv_compressed = getattr(args, "shared_layer_kv_compressed", 0) diff --git a/examples/linformer/src/modules/linformer_sentence_encoder.py b/examples/linformer/src/modules/linformer_sentence_encoder.py deleted file mode 100644 index d6de9eeaae..0000000000 --- a/examples/linformer/src/modules/linformer_sentence_encoder.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch.nn as nn -from fairseq.modules import TransformerSentenceEncoder - -from .linformer_sentence_encoder_layer import LinformerSentenceEncoderLayer - - -class LinformerSentenceEncoder(TransformerSentenceEncoder): - """ - Implementation for a Bi-directional Linformer based Sentence Encoder used - in BERT/XLM style pre-trained models. - - This first computes the token embedding using the token embedding matrix, - position embeddings (if specified) and segment embeddings - (if specified). After applying the specified number of - LinformerEncoderLayers, it outputs all the internal states of the - encoder as well as the final representation associated with the first - token (usually CLS token). - - Input: - - tokens: B x T matrix representing sentences - - segment_labels: B x T matrix representing segment label for tokens - - Output: - - a tuple of the following: - - a list of internal model states used to compute the - predictions where each tensor has shape T x B x C - - sentence representation associated with first input token - in format B x C. 
- """ - - def __init__( - self, - padding_idx: int, - vocab_size: int, - num_encoder_layers: int = 6, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - layerdrop: float = 0.0, - max_seq_len: int = 256, - num_segments: int = 2, - use_position_embeddings: bool = True, - offset_positions_by_padding: bool = True, - encoder_normalize_before: bool = False, - apply_bert_init: bool = False, - activation_fn: str = "relu", - learned_pos_embedding: bool = True, - embed_scale: float = None, - freeze_embeddings: bool = False, - n_trans_layers_to_freeze: int = 0, - export: bool = False, - traceable: bool = False, - q_noise: float = 0.0, - qn_block_size: int = 8, - compressed: int = 4, - shared_kv_compressed: int = 0, - shared_layer_kv_compressed: int = 0, - freeze_compress: int = 0, - ) -> None: - - # Initialize linformer parameters - self.compressed = compressed - self.shared_kv_compressed = shared_kv_compressed - self.shared_layer_kv_compressed = shared_layer_kv_compressed - self.compress_layer = None - self.freeze_compress = freeze_compress - - super().__init__( - padding_idx=padding_idx, - vocab_size=vocab_size, - num_encoder_layers=num_encoder_layers, - embedding_dim=embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - layerdrop=layerdrop, - max_seq_len=max_seq_len, - num_segments=num_segments, - use_position_embeddings=use_position_embeddings, - offset_positions_by_padding=offset_positions_by_padding, - encoder_normalize_before=encoder_normalize_before, - apply_bert_init=apply_bert_init, - activation_fn=activation_fn, - learned_pos_embedding=learned_pos_embedding, - embed_scale=embed_scale, - freeze_embeddings=freeze_embeddings, - n_trans_layers_to_freeze=n_trans_layers_to_freeze, - export=export, - traceable=traceable, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - def build_transformer_sentence_encoder_layer( - self, - embedding_dim, - ffn_embedding_dim, - num_attention_heads, - dropout, - attention_dropout, - activation_dropout, - activation_fn, - export, - q_noise, - qn_block_size, - ): - if self.shared_layer_kv_compressed == 1: - compress_layer = nn.Linear( - self.max_seq_len, self.max_seq_len // self.compressed - ) - # intialize parameters for compressed layer - nn.init.xavier_uniform_(compress_layer.weight, gain=1 / math.sqrt(2)) - if self.freeze_compress == 1: - compress_layer.weight.requires_grad = False - self.compress_layer = compress_layer - - return LinformerSentenceEncoderLayer( - embedding_dim=embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - q_noise=q_noise, - qn_block_size=qn_block_size, - compressed=self.compressed, - max_seq_len=self.max_seq_len, - shared_kv_compressed=self.shared_kv_compressed, - shared_compress_layer=( - None if self.shared_layer_kv_compressed == 0 else self.compress_layer - ), - freeze_compress=self.freeze_compress, - ) - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." 
if name != "" else "" - items_to_add = {} - keys_to_remove = [] - - # update key name for shared layer in new version of code - for k in state_dict.keys(): - if k.startswith(prefix + "compress_layer"): - if self.shared_layer_kv_compressed: - for layer_idx in range(len(self.layers)): - new_k = prefix + "layers.{0}.shared_compress_layer.{1}".format( - layer_idx, - k[len(prefix + "compress_layer.") :], - ) - items_to_add[new_k] = state_dict[k] - - for k in keys_to_remove: - del state_dict[k] - - for key, value in items_to_add.items(): - state_dict[key] = value diff --git a/examples/linformer/src/modules/linformer_sentence_encoder_layer.py b/examples/linformer/src/modules/linformer_sentence_encoder_layer.py deleted file mode 100644 index d27c5afd09..0000000000 --- a/examples/linformer/src/modules/linformer_sentence_encoder_layer.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Callable - -from fairseq.modules import TransformerSentenceEncoderLayer - -from .multihead_linear_attention import MultiheadLinearAttention - - -class LinformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): - """ - Implements a Linformer Encoder Layer used in BERT/XLM style pre-trained - models. - """ - - def __init__( - self, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - export: bool = False, - q_noise: float = 0.0, - qn_block_size: int = 8, - init_fn: Callable = None, - compressed: int = 1, - max_seq_len: int = 256, - shared_kv_compressed: int = 0, - shared_compress_layer: any = None, - freeze_compress: int = 0, - ) -> None: - - # Initialize linformer parameters - self.compressed = compressed - self.max_seq_len = max_seq_len - self.shared_kv_compressed = shared_kv_compressed - self.freeze_compress = freeze_compress - - def init_fn(): - # This needs to be set after nn.Module.__init__ is called - self.shared_compress_layer = shared_compress_layer - - super().__init__( - embedding_dim=embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - q_noise=q_noise, - qn_block_size=qn_block_size, - init_fn=init_fn, - ) - - def build_self_attention( - self, - embed_dim, - num_attention_heads, - dropout, - self_attention, - q_noise, - qn_block_size, - ): - return MultiheadLinearAttention( - embed_dim, - num_attention_heads, - dropout=dropout, - self_attention=True, - q_noise=q_noise, - qn_block_size=qn_block_size, - compressed=self.compressed, - max_seq_len=self.max_seq_len, - shared_kv_compressed=self.shared_kv_compressed, - shared_compress_layer=self.shared_compress_layer, - freeze_compress=self.freeze_compress, - ) diff --git a/examples/m2m_100/README.md b/examples/m2m_100/README.md index a87c0f5748..02a68a5f09 100644 --- a/examples/m2m_100/README.md +++ b/examples/m2m_100/README.md @@ -14,8 +14,8 @@ sacrebleu -t wmt14 -l fr-en --echo src > wmt.test.fr-en.fr sacrebleu -t wmt14 -l fr-en --echo ref > wmt.test.fr-en.en # WAT -wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2019.my-en.zip -unzip wat2019.my-en.zip +wget 
http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip +unzip wat2020.my-en.zip # FLORES # download from: https://github.com/facebookresearch/flores @@ -82,7 +82,7 @@ fairseq-preprocess \ 3. **Training Scripts** -To reproduce the training of our models, we train with fairseq-py's multilingual translation [task](https://github.com/pytorch/fairseq/tree/master/examples/multilingual). If you are interested in model parallel training, also check out [fairscale](https://github.com/facebookresearch/fairscale). +To reproduce the training of our models, we train with fairseq-py's multilingual translation [task](https://github.com/pytorch/fairseq/tree/main/examples/multilingual). If you are interested in model parallel training, also check out [fairscale](https://github.com/facebookresearch/fairscale). 4. **Generation** @@ -116,11 +116,40 @@ If you use any of the resources listed here, please cite: ## Trained Models -Looking for other trained models? Check back soon. +### 418M and 1.2B Model +We include the last checkpoint for both of these models. -Model | Description | Download ----|---|--- -`12b_last_checkpoint` | 12B parameter model trained on many-to-many training data for 100 languages | [12b_last_checkpoint](https://dl.fbaipublicfiles.com/m2m_100/12b_last_checkpoint.pt) +```bash +wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt +wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt + +# 418M parameter model +wget https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt + +# 1.2B parameter model +wget https://dl.fbaipublicfiles.com/m2m_100/1.2B_last_checkpoint.pt + +# Generation: +fairseq-generate $binarized_data_path --batch-size 32 --path $path_to_model --fixed-dictionary model_dict.128k.txt -s en -t fr --remove-bpe 'sentencepiece' --beam 5 --task translation_multi_simple_epoch --lang-pairs language_pairs_small_models.txt --decoder-langtok --encoder-langtok src --gen-subset test > gen_out +``` + +### 12B Model +12B parameter model trained on many-to-many training data for 100 languages. We include the last checkpoint, the average of the last 5 checkpoints, and the average of the last 10 checkpoints. There isn't a universally best choice out of these three, and all three versions are close in accuracy. You can either sweep over the 3 checkpoints on a dev set and pick the best performing one for final testing, or simply use the last checkpoint as a reasonable default.
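+
+If you want to sweep the three checkpoints, one simple option is to score each checkpoint's dev-set generation output and keep the best one. The sketch below is only an illustration: it assumes you have already run the 12B generation command shown later in this README once per checkpoint with `--gen-subset valid`, redirecting each run to its own file (e.g. `gen_out.12b_last_chk_4_gpus`), and it relies on the `T-`/`H-` lines printed by `fairseq-generate` and on the `sacrebleu` Python package. The file names and the choice of the 4-GPU checkpoints are placeholders, not part of the release.
+
+```python
+# Illustrative only: pick the best 12B checkpoint on a dev set by corpus BLEU.
+# Assumes one fairseq-generate output file per checkpoint (gen_out.<name>),
+# produced with --gen-subset valid; extra cleaning (e.g. stripping language
+# tokens) may be needed depending on your setup.
+import sacrebleu
+
+checkpoints = [
+    "12b_last_chk_4_gpus",
+    "12b_avg5_chk_4_gpus",
+    "12b_avg10_chk_4_gpus",
+]
+
+
+def read_pairs(path):
+    """Collect references (T-*) and hypotheses (H-*) from fairseq-generate output."""
+    refs, hyps = {}, {}
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            if line.startswith("T-"):
+                key, text = line.rstrip("\n").split("\t", 1)
+                refs[key[2:]] = text
+            elif line.startswith("H-"):
+                key, _score, text = line.rstrip("\n").split("\t", 2)
+                hyps[key[2:]] = text
+    ids = sorted(set(refs) & set(hyps), key=int)
+    return [refs[i] for i in ids], [hyps[i] for i in ids]
+
+
+best = None
+for name in checkpoints:
+    refs, hyps = read_pairs(f"gen_out.{name}")
+    bleu = sacrebleu.corpus_bleu(hyps, [refs]).score
+    print(f"{name}: BLEU = {bleu:.2f}")
+    if best is None or bleu > best[1]:
+        best = (name, bleu)
+print(f"Best checkpoint on the dev set: {best[0]} ({best[1]:.2f} BLEU)")
+```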
+ +**Model Download Links** +Configuration | 2 32GB GPUs | 4 16GB GPUs | 6 12GB GPUs | 8 8GB GPUs +:--|:--|:--|:--|:-- +Last Checkpoint | [12b_last_chk_2_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_2_gpus.pt) | [12b_last_chk_4_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_4_gpus.pt) | [12b_last_chk_6_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_6_gpus.pt) | [12b_last_chk_8_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_8_gpus.pt) +Average of last 5 checkpoints | [12b_avg5_chk_2_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_2_gpus.pt) | [12b_avg5_chk_4_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_4_gpus.pt) | [12b_avg5_chk_6_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_6_gpus.pt) | [12b_avg5_chk_8_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_8_gpus.pt) +Average of last 10 checkpoints | [12b_avg10_chk_2_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_2_gpus.pt) | [12b_avg10_chk_4_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_4_gpus.pt) | [12b_avg10_chk_6_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_6_gpus.pt) | [12b_avg10_chk_8_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_8_gpus.pt) + +**Generation Arguments** +Configuration | 2 32GB GPUs | 4 16GB GPUs | 6 12GB GPUs | 8 8GB GPUs +:--|:--|:--|:--|:-- +`--pipeline-encoder-balance` | `[26]` | `[1,15,10]` | `[1,9,9,7]` | `[1,6,6,6,7]` +`--pipeline-encoder-devices` | `[0]` | `[0,1,0]` | `[0,1,2,0]` | `[0,4,5,1,0]` +`--pipeline-decoder-balance` | `[3,22,1]` | `[3,11,11,1]` | `[3,7,7,8,1]` | `[1,6,6,6,6,1]` +`--pipeline-decoder-devices` | `[0,1,0]` | `[0,2,3,0]` | `[0,3,4,5,0]` | `[0,2,6,7,3,0]` ## SentencePiece Model @@ -162,16 +191,19 @@ fairseq-preprocess \ --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt ``` -### Generation on a V100 GPU +### Generation for the 12B model + +Note that generation can currently be run using 2 32GB / 4 16GB / 6 12GB / 8 8GB GPUs, and the corresponding model checkpoints and pipeline arguments can be found in the [12B Model Section](#12b-model). +Generation on CPUs will be added in the future. 
```bash wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs.txt -wget https://dl.fbaipublicfiles.com/m2m_100/12b_last_checkpoint.pt +wget https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_4_gpus.pt fairseq-generate \ data_bin \ --batch-size 1 \ - --path 12b_last_checkpoint.pt \ + --path 12b_last_chk_4_gpus.pt \ --fixed-dictionary model_dict.128k.txt \ -s de -t fr \ --remove-bpe 'sentencepiece' \ @@ -185,10 +217,10 @@ fairseq-generate \ --distributed-world-size 1 --distributed-no-spawn \ --pipeline-model-parallel \ --pipeline-chunks 1 \ - --pipeline-encoder-balance '[26]' \ - --pipeline-encoder-devices '[0]' \ - --pipeline-decoder-balance '[1,24,1]' \ - --pipeline-decoder-devices '[0,1,0]' > gen_out + --pipeline-encoder-balance '[1,15,10]' \ + --pipeline-encoder-devices '[0,1,0]' \ + --pipeline-decoder-balance '[3,11,11,1]' \ + --pipeline-decoder-devices '[0,2,3,0]' > gen_out ``` ## Evaluation with M2M-100 diff --git a/examples/mbart/README.md b/examples/mbart/README.md index 510edeff64..a45e37243c 100644 --- a/examples/mbart/README.md +++ b/examples/mbart/README.md @@ -9,7 +9,7 @@ MBART is a sequence-to-sequence denoising auto-encoder pre-trained on large-scal Model | Description | # params | Download ---|---|---|--- -`mbart.CC25` | mBART model with 12 encoder and decoder layers trained on 25 languages' monolingual corpus | 610M | [mbart.CC25.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.CC25.tar.gz) +`mbart.CC25` | mBART model with 12 encoder and decoder layers trained on 25 languages' monolingual corpus | 610M | [mbart.CC25.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz) `mbart.ft.ro_en` | finetune mBART cc25 model on ro-en language pairs | 610M | [mbart.cc25.ft.enro.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.ft.enro.tar.gz) ## Results @@ -26,7 +26,7 @@ Model | en-ro | ro-en ## BPE data # download model -wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.CC25.tar.gz +wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz tar -xzvf mbart.CC25.tar.gz # bpe data install SPM [here](https://github.com/google/sentencepiece) @@ -73,7 +73,7 @@ fairseq-train path_2_data \ --source-lang en_XX --target-lang ro_RO \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \ + --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ @@ -81,7 +81,7 @@ fairseq-train path_2_data \ --restore-file $PRETRAIN \ --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ --langs $langs \ - --ddp-backend no_c10d + --ddp-backend legacy_ddp ``` ## Generate on EN-RO Get sacrebleu on finetuned en-ro model diff --git a/examples/mms/MODEL_CARD.md b/examples/mms/MODEL_CARD.md new file mode 100644 index 0000000000..63f997fb4d --- /dev/null +++ b/examples/mms/MODEL_CARD.md @@ -0,0 +1,63 @@ +# MMS Model Card + +## Model details + +**Organization developing the model** The FAIR team + +**Model version** This is version 1 of the model. + +**Model type** MMS is speech model, based on the transformer architecture. 
The pre-trained model comes in two sizes: 300M and 1B parameters. We fine-tune the model for speech recognition and make it available in the 1B variant. We also fine-tune the 1B variant for language identification. + +**License** CC BY-NC + +**Where to send questions or comments about the model** Questions and comments about MMS can be sent via the [GitHub repository](https://github.com/pytorch/fairseq/tree/master/examples/mms) of the project, by opening an issue and tagging it as MMS. + +## Uses + +**Primary intended uses** The primary use of MMS is to perform speech processing research for many more languages and to perform tasks such as automatic speech recognition, language identification, and speech synthesis. + +**Primary intended users** The primary intended users of the model are researchers in speech processing, machine learning and artificial intelligence. + +**Out-of-scope use cases** Fine-tuning the pre-trained models on other labeled datasets or downstream tasks requires further risk evaluation and mitigation. + +## Bias and Risks + +The MMS models were pre-trained on a blend of data from different domains, including readings of the New Testament. In the paper, we describe two studies analyzing gender bias and the use of religious language which conclude that models perform equally well for both genders and that on average, there is little bias for religious language (section 8 of the paper). + +# Training Details + +## Training Data + +MMS is pre-trained on VoxPopuli (parliamentary speech), MLS (read audiobooks), VoxLingua-107 (YouTube speech), CommonVoice (read Wikipedia text), BABEL (telephone conversations), MMS-lab-U (New Testament readings), and MMS-unlab (various read Christian texts). +Models are fine-tuned on FLEURS, VoxLingua-107, MLS, CommonVoice, and MMS-lab. We obtained the language information for MMS-lab, MMS-lab-U and MMS-unlab from our data source and did not manually verify it for every language. + +## Training Procedure + +Please refer to the research paper for details on this. + +# Evaluation + +## Testing Data, Factors & Metrics + +We evaluate the model on different benchmarks for the downstream tasks. The evaluation details are presented in the paper. The model's performance is measured using standard metrics such as character error rate, word error rate, and classification accuracy.
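+
+For reference, character error rate and word error rate are edit-distance based metrics. The snippet below is a generic illustration of how such metrics can be computed and is not the evaluation code used for MMS; the function names and toy strings are only examples.
+
+```python
+# Generic illustration: WER/CER as Levenshtein distance normalized by reference length.
+def edit_distance(ref, hyp):
+    """Minimum number of substitutions, insertions and deletions turning ref into hyp."""
+    dp = list(range(len(hyp) + 1))  # distances against an empty reference prefix
+    for i, r in enumerate(ref, start=1):
+        prev, dp[0] = dp[0], i
+        for j, h in enumerate(hyp, start=1):
+            cur = min(
+                dp[j] + 1,         # delete the reference token
+                dp[j - 1] + 1,     # insert the hypothesis token
+                prev + (r != h),   # substitute (or match when tokens are equal)
+            )
+            prev, dp[j] = dp[j], cur
+    return dp[-1]
+
+
+def wer(reference, hypothesis):
+    ref_words, hyp_words = reference.split(), hypothesis.split()
+    return edit_distance(ref_words, hyp_words) / max(len(ref_words), 1)
+
+
+def cer(reference, hypothesis):
+    return edit_distance(list(reference), list(hypothesis)) / max(len(reference), 1)
+
+
+print(wer("hello world", "hello there world"))  # 0.5: one insertion over two reference words
+print(cer("hello", "hallo"))  # 0.2: one substitution over five characters
+```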
+ + +# Citation + +**BibTeX:** + +``` +@article{pratap2023mms, + title={Scaling Speech Technology to 1,000+ Languages}, + author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli}, + journal={arXiv}, + year={2023} +} + +``` + +# Model Card Contact + +Please reach out to the authors at: [vineelkpratap@meta.com](mailto:vineelkpratap@meta.com) [androstj@meta.com](mailto:androstj@meta.com) [bshi@meta.com](mailto:bshi@meta.com) [michaelauli@meta.com](mailto:michaelauli@gmail.com) + + diff --git a/examples/mms/README.md b/examples/mms/README.md new file mode 100644 index 0000000000..0460dd5f93 --- /dev/null +++ b/examples/mms/README.md @@ -0,0 +1,215 @@ +# MMS: Scaling Speech Technology to 1000+ languages + +The Massively Multilingual Speech (MMS) project expands speech technology from about 100 languages to over 1,000 by building a single multilingual speech recognition model supporting over 1,100 languages (more than 10 times as many as before), language identification models able to identify over [4,000 languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) (40 times more than before), pretrained models supporting over 1,400 languages, and text-to-speech models for over 1,100 languages. Our goal is to make it easier for people to access information and to use devices in their preferred language. + +You can find details in the paper [Scaling Speech Technology to 1000+ languages](https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/) and the [blog post](https://ai.facebook.com/blog/multilingual-model-speech-recognition/). + +An overview of the languages covered by MMS can be found [here](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html). + +## 🤗 Transformers + +MMS has been added to Transformers. For more information, please refer to [Transformers' MMS docs](https://huggingface.co/docs/transformers/main/en/model_doc/mms). + +[Click here](https://huggingface.co/models?other=mms) to find all MMS checkpoints on the Hub. + +Checkout the demo here [![Open In HF Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm-dark.svg)](https://huggingface.co/spaces/facebook/MMS) + +## Finetuned models +### ASR + +| Model | Languages | Dataset | Model | Dictionary* | Supported languages | | +|---|---|---|---|---|---|--- +MMS-1B:FL102 | 102 | FLEURS | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_fl102.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_fl102_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-fl102) +MMS-1B:L1107| 1107 | MMS-lab | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_l1107.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_l1107/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_l1107_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-l1107) +MMS-1B-all| 1162 | MMS-lab + FLEURS
+ CV + VP + MLS | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_all.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_all/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_all_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-all) + +\* In the `Dictionary` column, we provide the download link for token dictionary in English language. To download token dictionary for a different language supported by the model, modify the language code in the URL appropriately. For example, to get token dictionary of FL102 model for Hindi language, use [this](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/hin.txt) link. + +### TTS +1. Download the list of [iso codes](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) of 1107 languages. +2. Find the iso code of the target language and download the checkpoint. Each folder contains 3 files: `G_100000.pth`, `config.json`, `vocab.txt`. The `G_100000.pth` is the generator trained for 100K updates, `config.json` is the training config, `vocab.txt` is the vocabulary for the TTS model. +``` +# Examples: +wget https://dl.fbaipublicfiles.com/mms/tts/eng.tar.gz # English (eng) +wget https://dl.fbaipublicfiles.com/mms/tts/azj-script_latin.tar.gz # North Azerbaijani (azj-script_latin) +``` +The above command downloads generator only, which is enough to run TTS inference. If you want the full model checkpoint which also includes the discriminator (`D_100000.pth`) and the optimizer states, download as follows. +``` +# Example (full checkpoint: generator + discriminator + optimizer): +wget https://dl.fbaipublicfiles.com/mms/tts/full_model/eng.tar.gz # English (eng) +``` + + +### LID + +\# Languages | Dataset | Model | Dictionary | Supported languages | | +|---|---|---|---|---|--- +126 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-lid-126) +256 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l256.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l256/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l256_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-lid-256) +512 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l512.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l512/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l512_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-512) +1024 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l1024.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l1024/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l1024_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-1024) +2048 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l2048.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l2048/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l2048_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-2048) +4017 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l4017.pt) | 
[download](https://dl.fbaipublicfiles.com/mms/lid/dict/l4017/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l4017_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-4017) + +## Commands to run inference + +### ASR +Run this command to transcribe one or more audio files: +```shell command +cd /path/to/fairseq-py/ +python examples/mms/asr/infer/mms_infer.py --model "/path/to/asr/model" --lang lang_code \ + --audio "/path/to/audio_1.wav" "/path/to/audio_2.wav" "/path/to/audio_3.wav" +``` +We also provide an Ipython notebook example inside `asr/tutorial` folder [ipynb](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/facebookresearch/fairseq/blob/main/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb) + + +For more advance configuration and calculate CER/WER, you could prepare manifest folder by creating a folder with this format: +``` +$ ls /path/to/manifest +dev.tsv +dev.wrd +dev.ltr +dev.uid + +# dev.tsv each line contains
`), which corresponds to embedding index `2`. +Thus **the model never saw newline characters during pretraining** and newlines should not be used during few-shot prompting. + +This is more clearly illustrated in the following example, which uses fairseq's Hub Interface to tokenize two documents in the desired format: +```python +from fairseq.models.transformer_lm import TransformerLanguageModel +model_dir = '/path/to/en_dense_lm_125m' +lm = TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2') + +data = """\ +This is the first paragraph of the first document. +This is the second paragraph of the first document. + +This is the first paragraph of the second document.\ +""" + +# The following is wrong, since it will encode newlines present in `data`. +tokens_bad = lm.score(data)['tokens'] +assert '\n' in lm.decode(tokens_bad) # oops, we encoded a newline + +# Instead pass the replace_newlines_with_eos option to get the correct behavior. +tokens_good = lm.score(data, replace_newline_with_eos=True)['tokens'] +assert '\n' not in lm.decode(tokens_good) # no newlines were encoded +``` + +## Citation + +Coming soon. diff --git a/examples/moe_lm/data_card.md b/examples/moe_lm/data_card.md new file mode 100644 index 0000000000..54e694b620 --- /dev/null +++ b/examples/moe_lm/data_card.md @@ -0,0 +1,221 @@ +# Data card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts" +## Version 1.0.0 + +We follow the recommendations of Gebru et al. (2018) and provide a datacard for the dataset used to train the 1.1T parameter model. + +## Motivation +* **For what purpose was the dataset created? Was there a specific task in mind? Was there a specific gap that needed to be filled? Please provide a description.** +The pre-training data for training the 1.1 T model was created by a union of six English language datasets, including five datasets used by RoBERTa (Liu et al 2019) and the English subset of CC 100. These purpose of creating this dataset was to pre-train the language model. + +* **Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g., company, institution, organization)?** +FAIR (Fundamental Artificial Intelligence Research) + +* **Who funded the creation of the dataset? If there is an associated grant, please provide the name of the grantor and the grant name and number.** +FAIR (Fundamental Artificial Intelligence Research) + +* **Any other comments?** +No. + +## Composition + +* **What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description.** +The instances are textual documents. 
The overall dataset is composed from a union of the following datasets - + * BookCorpus (Zhu et al., 2019) consists of more than 10K unpublished books (4GB); + * English Wikipedia, excluding lists, tables and headers (12GB); + * CC-News (Nagel,2016) contains 63 million English news articles crawled between September 2016 and February 2019 (76GB); + * OpenWebText (Gokaslan and Cohen, 2019), an open source recreation of the WebText dataset used to train GPT-2 (38GB); + * CC-Stories (Trinh and Le, 2018) contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas (31GB); + * English CC100 (Wenzek et al., 2020), a dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia (292GB). + +* **How many instances are there in total (of each type, if appropriate)?** +The training data contains 112B tokens corresponding to 453 GB of data. + +* **Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable).** +The English CC100 section of the dataset is a subset of CommonCrawl snapshots extracted between January 2018 to December 2018, filtered to match the style of Wikipedia. The CC-stories dataset contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas. + +* **What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description.** +Each instance consists of raw text data. + +* **Is there a label or target associated with each instance? If so, please provide a description.** +No. + +* **Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information, but might include, e.g., redacted text.** +No. + +* **Are relationships between individual instances made explicit (e.g., users' movie ratings, social network links)? If so, please describe how these relationships are made explicit.** +There are no explicit relationships between individual instances. + +* **Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.** +We hold out a random validation set of approximately 150MB from the pretraining data, sampled proportionally to each dataset's size in the pretraining corpus. + +* **Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.** +N/A + +* **Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)?** +It's self-contained. + +* **Does the dataset contain data that might be considered confidential (e.g., data that is protected by legal privilege or by doctor-patient confidentiality, data that includes the content of individuals' non-public communications)? 
If so, please provide a description.** +The datasets used are publicly available, and the information in them is not considered confidential. + +* **Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety? If so, please describe why.** +Parts of the dataset are a subset of public Common Crawl data, which could contain sentences that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety. + +* **Does the dataset relate to people? If not, you may skip the remaining questions in this section.** +Some documents of this data relate to people, such as news articles, Wikipedia descriptions, etc. + +* **Does the dataset identify any subpopulations (e.g., by age, gender)? If so, please describe how these subpopulations are identified and provide a description of their respective distributions within the dataset.** +No. + +* **Is it possible to identify individuals (i.e., one or more natural persons), either directly or indirectly (i.e., in combination with other data) from the dataset? If so, please describe how** +In addition to individuals who have Wikipedia pages (celebrities, politicians, etc.), it may be possible to identify other individuals by their names, Twitter account names, etc. if that information is present in Common Crawl. + +* **Does the dataset contain data that might be considered sensitive in any way (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history)? If so, please provide a description.** +The training dataset is partially derived from Common Crawl, which may contain some sensitive information. + +* **Any other comments?** +No + + +## Collection Process + +* **How was the data associated with each instance acquired? Was the data directly observable (e.g., raw text, movie ratings), reported by subjects (e.g., survey responses), or indirectly inferred/ derived from other data (e.g., part-of-speech tags, model-based guesses for age or language)? If data was reported by subjects or indirectly inferred/derived from other data, was the data validated/verified? If so, please describe how.** +N/A. The dataset is a union of six publicly available datasets. + +* **What mechanisms or procedures were used to collect the data (e.g., hardware apparatus or sensor, manual human curation, software program, software API)? How were these mechanisms or procedures validated?** +N/A + +* **If the dataset is a sample from a larger set, what was the sampling strategy (e.g., deterministic, probabilistic with specific sampling probabilities)?** +Please refer to the main document for details. + +* **Who was involved in the data collection process (e.g., students, crowdworkers, contractors) and how were they compensated (e.g., how much were crowdworkers paid)?** +This data is mined, filtered and sampled by machines. + +* **Over what timeframe was the data collected? Does this timeframe match the creation timeframe of the data associated with the instances (e.g., recent crawl of old news articles)? If not, please describe the timeframe in which the data associated with the instances was created.** +Different parts of the dataset were mined over different time periods. +1. 
The CC-News dataset contains English news articles crawled between September 2016 and February 2019. +2. The English CC-100 dataset was extracted from CommonCrawl snapshots between January 2018 and December 2018. + +* **Were any ethical review processes conducted (e.g., by an institutional review board)? If so, please provide a description of these review processes, including the outcomes, as well as a link or other access point to any supporting documentation.** +No. + +* **Does the dataset relate to people? If not, you may skip the remainder of the questions in this section.** +No. + +* **Did you collect the data from the individuals in question directly, or obtain it via third parties or other sources (e.g., websites)?** +N/A + +* **Were the individuals in question notified about the data collection? If so, please describe (or show with screenshots or other information) how notice was provided, and provide a link or other access point to, or otherwise reproduce, the exact language of the notification itself.** +N/A + +* **Did the individuals in question consent to the collection and use of their data? If so, please describe (or show with screenshots or other information) how consent was requested and provided, and provide a link or other access point to, or otherwise reproduce, the exact language to which the individuals consented.** +N/A + +* **If consent was obtained, were the consenting individuals provided with a mechanism to revoke their consent in the future or for certain uses? If so, please provide a description, as well as a link or other access point to the mechanism (if appropriate).** +N/A + +* **Has an analysis of the potential impact of the dataset and its use on data subjects (e.g., a data protection impact analysis) been conducted? If so, please provide a description of this analysis, including the outcomes, as well as a link or other access point to any supporting documentation.** +Some responsible AI related evaluations were performed. Please refer to the main document and the model card for the paper. + +* **Any other comments?** +No + + +## Preprocessing/cleaning/labeling + + +* **Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)? If so, please provide a description. If not, you may skip the remainder of the questions in this section.** +The component datasets went through standard cleaning and re-formatting practices, including removing repetitive/non informative text like "Chapter One", or "This ebook by Project Gutenberg". + +* **Was the “raw” data saved in addition to the preprocessed/cleaned/labeled data (e.g., to support unanticipated future uses)? If so, please provide a link or other access point to the “raw” data.** +The "raw" component datasets is publicly available in their respective locations (more details can be seen in the respective papers linked in references). + +* **Is the software used to preprocess/clean/label the instances available? If so, please provide a link or other access point.** +The software is proprietary to Meta Platforms and currently unavailable publicly. + +* **Any other comments?** +No + + +## Uses + +* **Has the dataset been used for any tasks already? If so, please provide a description.** +Yes, this dataset was used to pre-train the models described in the paper. + +* **Is there a repository that links to any or all papers or systems that use the dataset? 
If so, please provide a link or other access point.** +No. + +* **What (other) tasks could the dataset be used for?** +This data can be used to pretrain English language models, which are foundational to many current and future language tasks. + +* **Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks)? If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms?** +The pipeline for creating this dataset paves the way for building a scalable infrastructure for mining datasets to be used for training large-scale models. + +* **Are there tasks for which the dataset should not be used? If so, please provide a description.** +No. + +* **Any other comments?** +No. + +## Distribution + + +* **Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created? If so, please provide a description.** +No. + +* **How will the dataset be distributed (e.g., tarball on website, API, GitHub)? Does the dataset have a digital object identifier (DOI)?** +N/A + +* **When will the dataset be distributed?** +No. + +* **Will the dataset be distributed under a copyright or other intellectual property (IP) license, and/or under applicable terms of use (ToU)? If so, please describe this license and/or ToU, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms or ToU, as well as any fees associated with these restrictions.** +No. + +* **Have any third parties imposed IP-based or other restrictions on the data associated with the instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms, as well as any fees associated with these restrictions.** +No. + +* **Do any export controls or other regulatory restrictions apply to the dataset or to individual instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any supporting documentation.** +N/A + +* **Any other comments?** +No. + +## Maintenance + +* **Who is supporting/hosting/maintaining the dataset?** +FAIR (Fundamental Artificial Intelligence Research) + +* **How can the owner/curator/manager of the dataset be contacted (e.g., email address)?** +Refer to the main document. + +* **Is there an erratum? If so, please provide a link or other access point.** +N/A + +* **Will the dataset be updated (e.g., to correct labeling errors, add new instances, delete instances)? If so, please describe how often, by whom, and how updates will be communicated to users (e.g., mailing list, GitHub)?** +No plan for updating. + +* **If the dataset relates to people, are there applicable limits on the retention of the data associated with the instances (e.g., were individuals in question told that their data would be retained for a fixed period of time and then deleted)? If so, please describe these limits and explain how they will be enforced.** +N/A + +* **Will older versions of the dataset continue to be supported/hosted/maintained? If so, please describe how. 
If not, please describe how its obsolescence will be communicated to users.** +N/A + +* **If others want to extend/augment/build on/contribute to the dataset, is there a mechanism for them to do so? If so, please provide a description. Will these contributions be validated/ verified? If so, please describe how. If not, why not? Is there a process for communicating/ distributing these contributions to other users? If so, please provide a description.** +No. + +* **Any other comments?** +No. + +## References +Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692. + +Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724. + +Sebastian Nagel. 2016. Cc-news. http: //web.archive.org/save/http: //commoncrawl.org/2016/10/news-dataset-available. + +Aaron Gokaslan and Vanya Cohen. 2019. Openwebtext corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus + +Trieu H Trinh and Quoc V Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847. + +Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association. + diff --git a/examples/moe_lm/model_card.md b/examples/moe_lm/model_card.md new file mode 100644 index 0000000000..a1cd68116a --- /dev/null +++ b/examples/moe_lm/model_card.md @@ -0,0 +1,170 @@ +# Model card for the paper ``Efficient Large Scale Language Modeling with Mixtures of Experts" +## Version 1.0.0 + +### Model developer +FAIR (Fundamental Artificial Intelligence Research) + +### Model type +An autoregressive English language model trained on a union of six English language models. We explore dense and sparse (MoE based) architectures in the paper. +* Dense models - Our dense models range from 125M parameters to 13B parameters. +* Sparse (MoE) models - Our MoE based models range from 15B parameters to 1.1 Trillion parameters. +This model card focuses on the 1.1 Trillion parameter model, but the discussion +applies to all of the models explored in this work. + +### Citation details +Artetxe et al. (2021): Efficient Large Scale Language Modeling with Mixtures of Experts + +### Model Feedback Channel +fairseq + +## Intended use +### Primary intended use +For research purposes only, e.g. reproducing model evaluation results. Generation is only used in a limited capacity for explanation/justification or for prompting/probing/priming for class labels. + +### Out of scope uses +The primary purpose of the model is not to generate language, although the model is capable of doing that. + +## Factors influencing model performance +This section discusses potential risks associated with using the model. + +### Relevant factors +Based on known problems with NLP technology, potential relevant factors include bias (gender, profession, race and religion). + +### Evaluation factors +The 1.1T model was evaluated on StereoSet and CrowS-Pairs datasets to quantify encoded bias in the model. 
+ +## Metrics +### Model performance measures +The 1.1T parameter model was primarily evaluated on +1. In-domain and out-of-domain language modeling perplexity. +2. Zero-shot and few-shot priming. +3. Fully supervised finetuning. + +### Approaches to handle uncertainty +For few-shot learning, we report the average results across 25 runs, randomly sampling a different set of few-shot examples from the training set each time. + +## Evaluation data +## Zero Shot evaluation + +### HellaSwag +#### Description +HellaSwag is a dataset for evaluating commonsense reasoning. + +### PIQA +#### Description +PIQA is a dataset designed to evaluate reasoning about Physical Commonsense in Natural Language + +### ReCoRd +#### Description +Reading Comprehension with Commonsense Reasoning Dataset (ReCoRD) is a large-scale reading comprehension dataset which requires commonsense reasoning. ReCoRD consists of queries automatically generated from CNN/Daily Mail news articles; the answer to each query is a text span from a summarizing passage of the corresponding news. The goal of ReCoRD is to evaluate a machine's ability of commonsense reasoning in reading comprehension. + +## Few Shot evaluation +### Winogrande +#### Description +Winogrande is a benchmark for commonsense reasoning. The dataset contains pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations. + +### StoryCloze +#### Description +StoryCloze is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story. + +### OpenBookQA +#### Description +OpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding of a subject. It consists of 5,957 multiple-choice elementary-level science questions (4,957 train, 500 dev, 500 test), which probe the understanding of a small “book” of 1,326 core science facts and the application of these facts to novel situations. + +## Fully supervised evaluation + +### BoolQ +#### Description +BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally occurring – they are generated in unprompted and unconstrained settings. Each example is a triplet of (question, passage, answer), with the title of the page as optional additional context. + +### SST-2 +#### Description +SST-2 (or SST-binary) is a binary classification dataset where the goal is to differentiate between negative or somewhat negative vs somewhat positive or positive. + +### MNLI +#### Description +The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information. The corpus is modeled on the SNLI corpus, but differs in that covers a range of genres of spoken and written text, and supports a distinctive cross-genre generalization evaluation. 
+
+## Responsible AI (RAI) evaluation
+### StereoSet
+#### Description
+A large-scale natural dataset in English to measure stereotypical biases in four domains: gender, profession, race, and religion.
+
+#### Motivation for dataset use
+The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's stereotype bias in gender, profession, race, and religion.
+
+### CrowS-Pairs
+#### Description
+A challenge dataset for measuring social biases in masked language models.
+
+#### Motivation for dataset use
+The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's bias in the domains of race, religion, and age.
+
+----
+
+## Training data
+### BookCorpus
+#### Description
+A dataset consisting of more than 10K unpublished books. 4GB in size. (Zhu et al., 2019)
+
+### English Wikipedia
+#### Description
+Data from English Wikipedia, excluding lists, tables, and headers. 12GB in size.
+
+### CC-News
+#### Description
+A dataset containing 63 million English news articles crawled between September 2016 and February 2019. 76GB in size. (Nagel, 2016)
+
+### OpenWebText
+#### Description
+An open-source recreation of the WebText dataset used to train GPT-2. 38GB in size. (Gokaslan and Cohen, 2019)
+
+### CC-Stories
+#### Description
+A dataset containing a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas. 31GB in size. (Trinh and Le, 2018)
+
+### English CC100
+#### Description
+A dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia following the methodology introduced in CCNet (https://arxiv.org/abs/1911.00359). 292GB in size. (Wenzek et al., 2020)
+
+## Responsible AI (RAI) Dimensions
+### Fairness (Bias and inclusion)
+The 1.1T parameter model was evaluated on the StereoSet and CrowS-Pairs datasets for inherent bias in the model, and bias as a result of the data. On StereoSet, we observe that both the dense and MoE models get worse in terms of the Stereotype Score (SS) as they scale.
+
+### Privacy and security
+The 1.1T model did not require any special privacy and security considerations. The training data and evaluation data were both public and went through standard Meta privacy and licensing procedures.
+
+### Transparency and control
+In the spirit of transparency and accountability, we have created this model card for the 1.1T parameter model and a data card for the training data (referenced in Artetxe et al. (2021)).
+
+### Efficiency (Green AI)
+The 1.1T parameter model is trained as a Mixture of Experts (MoE) model. MoE models are efficient because they leverage sparse computation, i.e., only a small fraction of the parameters are active for any given input. For instance, our 1.1T parameter MoE model requires only 30% more FLOPS compared to a 6.7B parameter dense model, i.e., a 160x increase in parameters with only a 30% increase in FLOPS. Notably, MoE models achieve much better validation perplexity for a given compute budget compared to dense models.
+
+## References
+Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. 2019. HellaSwag: Can a machine really finish your sentence? In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 4791–4800, Florence, Italy. Association for Computational Linguistics.
+
+Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi. 2020. PIQA: Reasoning about physical commonsense in natural language. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):7432–7439.
+
+Sheng Zhang, Xiaodong Liu, Jingjing Liu, Jianfeng Gao, Kevin Duh, and Benjamin Van Durme. 2018. ReCoRD: Bridging the gap between human and machine commonsense reading comprehension. arXiv preprint arXiv:1810.12885.
+
+Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. 2020. Winogrande: An adversarial Winograd schema challenge at scale. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):8732–8740.
+
+Nasrin Mostafazadeh, Nathanael Chambers, Xiaodong He, Devi Parikh, Dhruv Batra, Lucy Vanderwende, Pushmeet Kohli, and James Allen. 2016. A corpus and cloze evaluation for deeper understanding of commonsense stories. In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 839–849, San Diego, California. Association for Computational Linguistics.
+
+Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. 2018. Can a suit of armor conduct electricity? A new dataset for open book question answering. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 2381–2391, Brussels, Belgium. Association for Computational Linguistics.
+
+Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova. 2019. BoolQ: Exploring the surprising difficulty of natural yes/no questions. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Minneapolis, Minnesota. Association for Computational Linguistics.
+
+Moin Nadeem, Anna Bethke, and Siva Reddy. 2021. StereoSet: Measuring stereotypical bias in pretrained language models. In Association for Computational Linguistics (ACL).
+
+Nikita Nangia, Clara Vania, Rasika Bhalerao, and Samuel R. Bowman. 2020. CrowS-Pairs: A challenge dataset for measuring social biases in masked language models. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1953–1967, Online. Association for Computational Linguistics.
+
+Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724.
+
+Sebastian Nagel. 2016. CC-News. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available.
+
+Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus
+
+Trieu H Trinh and Quoc V Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847.
+
+Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association.
diff --git a/examples/mr_hubert/README.md b/examples/mr_hubert/README.md
new file mode 100644
index 0000000000..e72c09c047
--- /dev/null
+++ b/examples/mr_hubert/README.md
@@ -0,0 +1,187 @@
+# MR-HuBERT
+
+## Pre-trained models
+
+### Main models
+Model | Pretraining Data | Download | Paper Reference
+|---|---|---|---
+MR-HuBERT Base (~97M) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_base/mrhubert_mono_base.pt) | mono\_base
+MR-HuBERT Large (~321M) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_large/mrhubert_mono_large.pt) | mono\_large
+Multilingual MR-HuBERT Base (~97M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_base/multi_base.pt) | multi\_base
+Multilingual MR-HuBERT Large (~321M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download 400k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_400k.pt) or [download 600k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_600k.pt) | Not in the paper
+
+
+### Ablation models
+Model | Pretraining Data | Download | Paper Reference
+|---|---|---|---
+MR-HuBERT Base (2-4-6 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-a/b1-a.pt) | (B.1)-a
+MR-HuBERT Base (5-2-5 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-b/b1-b.pt) | (B.1)-b
+MR-HuBERT Base (6-4-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-c/b1-c.pt) | (B.1)-c
+MR-HuBERT Base (3res 3-2-2-2-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-a/b2-a.pt) | (B.2)-a
+MR-HuBERT Base (3res 2-2-4-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-b/b2-b.pt) | (B.2)-b
+MR-HuBERT Base (3res 2-2-2-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-c/b2-c.pt) | (B.2)-c
+MR-HuBERT Base (Simple sampling) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b3-a/b3-a.pt) | (B.3)-a
+MR-HuBERT Base (Single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-a/b4-a.pt) | (B.4)-a
+MR-HuBERT Base (Simple sampling + single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-b/b4-b.pt) | (B.4)-b
+MR-HuBERT Base (Mono-resolution 20ms) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b5-a/b5-a.pt) | (B.5)-a
+MR-HuBERT Base (3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-a/b6-a.pt) | (B.6)-a
+MR-HuBERT Base (Mono-resolution 20ms, 3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-b/b6-b.pt) | (B.6)-b
+MR-HuBERT Base (HuBERT 20ms&40ms units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-a/b7-a.pt) | (B.7)-a
+MR-HuBERT Base (Encodec 50Hz unit) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-b/b7-b.pt) | (B.7)-b
+MR-HuBERT Base (Encodec 50Hz units and 25Hz units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-c/b7-c.pt) | (B.7)-c
+MR-HuBERT Base (Encodec 50Hz units stream 0&1) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-d/b7-d.pt) | (B.7)-d
+MR-HuBERT Large (no audio norm) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-a/b8-a.pt) | (B.8)-a
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-b/b8-b.pt) | (B.8)-b
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-c/b8-c.pt) | (B.8)-c
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-d/b8-d.pt) | (B.8)-d
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-e/b8-e.pt) | (B.8)-e
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-f/b8-f.pt) | (B.8)-f
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-g/b8-g.pt) | (B.8)-g
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-h/b8-h.pt) | (B.8)-h
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-i/b8-i.pt) | (B.8)-i
+MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-j/b8-j.pt) | (B.8)-j
+Multilingual MR-HuBERT Large (Simple sampling) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_large_simple/multi_large_simple.pt) | Not in paper
+MR-HuBERT xLarge (from HuBERT-base label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v1.pt) | Not in paper
+MR-HuBERT xLarge (from HuBERT-large label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v2.pt) | Not in paper
+
+## Load a model
+```python
+import fairseq
+
+ckpt_path = "/path/to/the/checkpoint.pt"
+models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
+model = models[0]
+```
+
+## Train a new model
+
+### Data preparation
+
+Follow the steps in `./simple_kmeans` to create:
+- `{train,valid}.tsv` waveform list files with length information
+```
+/path/to/your/audio/files
+file1.wav\t160000
+file2.wav\t154600
+...
+filen.wav\t54362
+```
+- `{train,valid}.km` frame-aligned pseudo label files (in the same order as the wave files in the tsv file).
+```
+44 44 44 48 48 962 962 962 962 962 962 962 962 967 967 967 967 967 967 967 967 370 852 370 ... 18 18 745 745
+44 44 44 48 48 962 962 962 147 147 147 147 147 147 147 147 147 147 147 147 176 176 271 271 ... 27 27 745 745
+...
+44 44 44 48 962 962 962 962 962 962 377 377 377 77 77 852 696 694 433 578 578 82 740 622 ... 27 27 745 745
+```
+- `dict.km.txt` a dummy dictionary (the first column is the unit id, the second column is a dummy count)
+```
+0 1
+1 1
+2 1
+...
+999 1
+```
+
+The `label_rate` is the same as the feature frame rate used for clustering,
+which is 100Hz for MFCC features and 50Hz for HuBERT features by default.
+
+### Pre-train an MR-HuBERT model
+
+Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
+are saved at `/path/to/labels`, and the label rate is 100Hz.
+
+To train a base model (12-layer transformer), run:
+```sh
+$ python fairseq_cli/hydra_train.py \
+  --config-dir /path/to/fairseq-py/examples/mr_hubert/config/pretrain \
+  --config-name mrhubert_base_librispeech \
+  task.data=/path/to/data task.label_dir=/path/to/labels \
+  task.labels='["km"]' model.label_rate=100 \
+  task.label_rate_ratios='[1, 2]'
+```
+
+Please see the sample pre-training script `train.sh` for a complete example.
+
+### Fine-tune an MR-HuBERT model with a CTC loss
+
+Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their
+corresponding character transcripts `{train,valid}.ltr` are saved at
+`/path/to/trans`. A typical `.ltr` file lists transcripts in the same order as the waveform files in the tsv file, e.g.
+```
+HOW | ARE | YOU
+...
+THANK | YOU
+```
+
+To fine-tune a pre-trained MR-HuBERT model at `/path/to/checkpoint`, run
+```sh
+$ python fairseq_cli/hydra_train.py \
+  --config-dir /path/to/fairseq-py/examples/mr_hubert/config/finetune \
+  --config-name base_10h \
+  task.data=/path/to/data task.label_dir=/path/to/trans \
+  model.multires_hubert_path=/path/to/checkpoint
+```
+
+Please see the sample fine-tuning script `finetune.sh` for a complete example.
+
+### Decode an MR-HuBERT model
+
+Suppose `test.tsv` and `test.ltr` are the waveform list and transcripts of
+the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
+saved at `/path/to/checkpoint`.
+
+We support three decoding modes:
+- Viterbi decoding: greedy decoding without a language model
+- KenLM decoding: decoding with an arpa-format KenLM n-gram language model
+- Fairseq-LM decoding: decoding with a Fairseq neural language model (not fully tested)
+
+#### Viterbi decoding
+
+`task.normalize` needs to be consistent with the value used during fine-tuning.
+Decoding results will be saved at
+`/path/to/experiment/directory/decode/viterbi/test`.
+
+```sh
+$ python examples/speech_recognition/new/infer.py \
+  --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
+  --config-name infer \
+  task.data=/path/to/data \
+  task.normalize=[true|false] \
+  decoding.exp_dir=/path/to/experiment/directory \
+  common_eval.path=/path/to/checkpoint \
+  dataset.gen_subset=test
+```
+
+#### KenLM / Fairseq-LM decoding
+
+Suppose the pronunciation lexicon and the n-gram LM are saved at
+`/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be
+saved at `/path/to/experiment/directory/decode/kenlm/test`.
+
+```sh
+$ python examples/speech_recognition/new/infer.py \
+  --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
+  --config-name infer_lm \
+  task.data=/path/to/data \
+  task.normalize=[true|false] \
+  decoding.exp_dir=/path/to/experiment/directory \
+  common_eval.path=/path/to/checkpoint \
+  dataset.gen_subset=test \
+  decoding.lexicon=/path/to/lexicon \
+  decoding.lmpath=/path/to/arpa
+```
+
+The command above uses the default decoding hyperparameters, which can be found
+in `examples/speech_recognition/hydra/decoder.py`. These parameters can be
+configured from the command line. For example, to search with a beam size of
+500, append `decoding.beam=500` to the command above.
+Important parameters include:
+- decoding.beam
+- decoding.beamthreshold
+- decoding.lmweight
+- decoding.wordscore
+- decoding.silweight
+
+To decode with a Fairseq LM, see the usage examples in the wav2vec 2.0 or HuBERT examples.
+
+Please see the sample decoding script `decode.sh` for a complete example.
diff --git a/examples/mr_hubert/config/decode/infer.yaml b/examples/mr_hubert/config/decode/infer.yaml
new file mode 100644
index 0000000000..eff39802e7
--- /dev/null
+++ b/examples/mr_hubert/config/decode/infer.yaml
@@ -0,0 +1,30 @@
+# @package _group_
+
+defaults:
+  - model: null
+
+hydra:
+  run:
+    dir: ${common_eval.results_path}/viterbi
+  sweep:
+    dir: ${common_eval.results_path}
+    subdir: viterbi
+
+task:
+  _name: multires_hubert_pretraining
+  single_target: true
+  fine_tuning: true
+  label_rate_ratios: ???
+  data: ???
+  normalize: false
+
+decoding:
+  type: viterbi
+  unique_wer_file: true
+common_eval:
+  results_path: ???
+  path: ???
+  post_process: letter
+dataset:
+  max_tokens: 1100000
+  gen_subset: ???
diff --git a/examples/mr_hubert/config/decode/infer_lm.yaml b/examples/mr_hubert/config/decode/infer_lm.yaml
new file mode 100644
index 0000000000..535b950775
--- /dev/null
+++ b/examples/mr_hubert/config/decode/infer_lm.yaml
@@ -0,0 +1,37 @@
+# @package _group_
+
+defaults:
+  - model: null
+
+hydra:
+  run:
+    dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+  sweep:
+    dir: ${common_eval.results_path}
+    subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
+
+task:
+  _name: multires_hubert_pretraining
+  single_target: true
+  fine_tuning: true
+  data: ???
+  label_rate_ratios: ???
+  normalize: ???
+
+decoding:
+  type: kenlm
+  lexicon: ???
+  lmpath: ???
+  beamthreshold: 100
+  beam: 500
+  lmweight: 1.5
+  wordscore: -1
+  silweight: 0
+  unique_wer_file: true
+common_eval:
+  results_path: ???
+  path: ???
+  post_process: letter
+dataset:
+  max_tokens: 1100000
+  gen_subset: ???
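+
+# Note: the mandatory fields marked ??? in this config (task.data,
+# task.label_rate_ratios, task.normalize, decoding.lexicon, decoding.lmpath,
+# common_eval.results_path, common_eval.path, dataset.gen_subset) are expected
+# to be filled in as command-line overrides when launching
+# examples/speech_recognition/new/infer.py; see the decoding section of the
+# README for an example invocation.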
diff --git a/examples/mr_hubert/config/decode/run/submitit_slurm.yaml b/examples/mr_hubert/config/decode/run/submitit_slurm.yaml new file mode 100644 index 0000000000..0b8065832e --- /dev/null +++ b/examples/mr_hubert/config/decode/run/submitit_slurm.yaml @@ -0,0 +1,17 @@ +# @package _global_ +hydra: + launcher: + cpus_per_task: ${distributed_training.distributed_world_size} + gpus_per_node: ${distributed_training.distributed_world_size} + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 1 + mem_gb: 200 + timeout_min: 4320 + max_num_timeout: 50 + name: ${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir}/submitit + +distributed_training: + distributed_world_size: 1 + distributed_no_spawn: true + distributed_port: 29761 diff --git a/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml b/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml new file mode 100644 index 0000000000..2f669f3763 --- /dev/null +++ b/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml @@ -0,0 +1,17 @@ +# @package _global_ +hydra: + launcher: + cpus_per_task: ${distributed_training.distributed_world_size} + gpus_per_node: ${distributed_training.distributed_world_size} + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 1 + mem_gb: 200 + timeout_min: 4320 + max_num_timeout: 50 + name: ${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir}/submitit + +distributed_training: + distributed_world_size: 8 + distributed_no_spawn: true + distributed_port: 29761 diff --git a/examples/mr_hubert/config/finetune/base_100h.yaml b/examples/mr_hubert/config/finetune/base_100h.yaml new file mode 100644 index 0000000000..c52a118cb8 --- /dev/null +++ b/examples/mr_hubert/config/finetune/base_100h.yaml @@ -0,0 +1,97 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: multires_hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + label_rate_ratios: ??? + normalize: false # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 3200000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 5 + train_subset: train_100h + valid_subset: dev_other + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 80000 + lr: [3e-5] + sentence_avg: true + update_freq: [1] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: multires_hubert_ctc + multires_hubert_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.multires_hubert_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? 
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/finetune/base_100h_large.yaml b/examples/mr_hubert/config/finetune/base_100h_large.yaml new file mode 100644 index 0000000000..1d0c0da3db --- /dev/null +++ b/examples/mr_hubert/config/finetune/base_100h_large.yaml @@ -0,0 +1,97 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: multires_hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + label_rate_ratios: ??? + normalize: true # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 1600000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 5 + train_subset: train_100h + valid_subset: dev_other + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 80000 + lr: [3e-5] + sentence_avg: true + update_freq: [2] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: multires_hubert_ctc + multires_hubert_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.multires_hubert_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/finetune/base_10h.yaml b/examples/mr_hubert/config/finetune/base_10h.yaml new file mode 100644 index 0000000000..25123e4481 --- /dev/null +++ b/examples/mr_hubert/config/finetune/base_10h.yaml @@ -0,0 +1,101 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + save_interval: 5 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: multires_hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + label_rate_ratios: ??? 
+ normalize: false # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 3200000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 5 + train_subset: train_10h + valid_subset: dev + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 25000 + lr: [2e-5] + sentence_avg: true + update_freq: [1] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: multires_hubert_ctc + multires_hubert_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.multires_hubert_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/finetune/base_10h_large.yaml b/examples/mr_hubert/config/finetune/base_10h_large.yaml new file mode 100644 index 0000000000..65448c7722 --- /dev/null +++ b/examples/mr_hubert/config/finetune/base_10h_large.yaml @@ -0,0 +1,101 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + save_interval: 5 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: multires_hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + label_rate_ratios: ??? + normalize: true # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 3200000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 5 + train_subset: train_10h + valid_subset: dev + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 25000 + lr: [2e-5] + sentence_avg: true + update_freq: [1] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: multires_hubert_ctc + multires_hubert_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.multires_hubert_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? 
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/finetune/base_1h.yaml b/examples/mr_hubert/config/finetune/base_1h.yaml new file mode 100644 index 0000000000..7459c3fc4c --- /dev/null +++ b/examples/mr_hubert/config/finetune/base_1h.yaml @@ -0,0 +1,100 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + save_interval: 50 + keep_interval_updates: 1 + save_interval_updates: 1000 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: multires_hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + label_rate_ratios: ??? + normalize: false # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 3200000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 1000 + train_subset: train_1h + valid_subset: dev_other + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 13000 + lr: [5e-5] + sentence_avg: true + update_freq: [4] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: multires_hubert_ctc + multires_hubert_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.multires_hubert_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/finetune/base_1h_large.yaml b/examples/mr_hubert/config/finetune/base_1h_large.yaml new file mode 100644 index 0000000000..34ef4dc19d --- /dev/null +++ b/examples/mr_hubert/config/finetune/base_1h_large.yaml @@ -0,0 +1,99 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tblog + seed: 1337 + +checkpoint: + save_interval: 1000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +distributed_training: + ddp_backend: c10d + find_unused_parameters: true + distributed_world_size: 8 + distributed_port: 29671 + nprocs_per_node: 8 + +task: + _name: multires_hubert_pretraining + data: ??? + fine_tuning: true + label_dir: ??? + label_rate_ratios: ??? 
+ normalize: true # must be consistent with pre-training + labels: ["ltr"] + single_target: true + +dataset: + num_workers: 0 + max_tokens: 1280000 + validate_after_updates: ${model.freeze_finetune_updates} + validate_interval: 5 + train_subset: train_10h + valid_subset: dev + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 25000 + lr: [3e-4] + sentence_avg: true + update_freq: [5] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: multires_hubert_ctc + multires_hubert_path: ??? + apply_mask: true + mask_selection: static + mask_length: 10 + mask_other: 0 + mask_prob: 0.75 + mask_channel_selection: static + mask_channel_length: 64 + mask_channel_other: 0 + mask_channel_prob: 0.5 + layerdrop: 0.1 + dropout: 0.0 + activation_dropout: 0.1 + attention_dropout: 0.0 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + - task.label_dir + - model.multires_hubert_path + - dataset.train_subset + - dataset.valid_subset + - criterion.wer_kenlm_model + - criterion.wer_lexicon + run: + dir: ??? + sweep: + dir: ??? + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml b/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml new file mode 100644 index 0000000000..16a35d340a --- /dev/null +++ b/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml @@ -0,0 +1,103 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + seed: 1337 + tensorboard_logdir: tblog + min_loss_scale: 1e-8 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +distributed_training: + ddp_backend: no_c10d + distributed_backend: 'nccl' + distributed_world_size: 32 + distributed_port: 29671 + nprocs_per_node: 8 + find_unused_parameters: true + +task: + _name: multires_hubert_pretraining + data: ??? + label_dir: ??? + labels: ??? + label_rate: ${model.label_rate} + label_rate_ratios: ??? + sample_rate: 16000 + max_sample_size: 250000 + min_sample_size: 32000 + pad_audio: false + random_crop: true + normalize: false # must be consistent with extractor + # max_keep_size: 300000 + # max_keep_size: 50000 + + +dataset: + num_workers: 0 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 10000 + +criterion: + _name: hubert + pred_masked_weight: 1.0 + pred_nomask_weight: 0.0 + loss_weights: [10,] + +optimization: + max_update: 400000 + lr: [0.0005] + clip_norm: 10.0 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: multires_hubert + label_rate: ??? 
+ label_rate_ratios: ${task.label_rate_ratios} + skip_masked: false + skip_nomask: false + mask_prob: 0.80 + extractor_mode: default + conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' + final_dim: 256 + encoder_layers: 4 + encoder_layerdrop: 0.05 + dropout_input: 0.1 + dropout_features: 0.1 + dropout: 0.1 + attention_dropout: 0.1 + feature_grad_mult: 0.1 + untie_final_proj: true + activation_dropout: 0.0 + conv_adapator_kernal: 1 + use_single_target: true + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '/' + exclude_keys: + - run + - task.data + - task.label_dir + - common.min_loss_scale + - common.log_interval + - optimization.clip_norm diff --git a/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml b/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml new file mode 100644 index 0000000000..423f3b25c2 --- /dev/null +++ b/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml @@ -0,0 +1,107 @@ +# @package _group_ + +common: + memory_efficient_fp16: true + log_format: json + log_interval: 200 + seed: 1337 + tensorboard_logdir: tblog + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + + +distributed_training: + ddp_backend: no_c10d + distributed_backend: 'nccl' + distributed_world_size: 128 + distributed_port: 29671 + nprocs_per_node: 8 + find_unused_parameters: true + +task: + _name: multires_hubert_pretraining + data: ??? + label_dir: ??? + labels: ??? + label_rate: ${model.label_rate} + label_rate_ratios: ??? + sample_rate: 16000 + max_sample_size: 250000 + min_sample_size: 32000 + pad_audio: false + random_crop: true + normalize: true # must be consistent with extractor + # max_keep_size: 50000 + +dataset: + num_workers: 0 + max_tokens: 300000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + validate_interval_updates: 10000 + +criterion: + _name: hubert + pred_masked_weight: 1.0 + pred_nomask_weight: 0.0 + loss_weights: [10,] + +optimization: + max_update: 400000 + lr: [0.0015] + clip_norm: 1.0 + update_freq: [3] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: multires_hubert + label_rate: ??? 
+ label_rate_ratios: ${task.label_rate_ratios} + encoder_layers: 8 + encoder_embed_dim: 1024 + encoder_ffn_embed_dim: 4096 + encoder_attention_heads: 16 + final_dim: 768 + skip_masked: false + skip_nomask: false + mask_prob: 0.80 + extractor_mode: layer_norm + conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' + encoder_layerdrop: 0.0 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + layer_norm_first: true + feature_grad_mult: 1.0 + untie_final_proj: true + activation_dropout: 0.0 + conv_adapator_kernal: 1 + use_single_target: true + +hydra: + job: + config: + override_dirname: + kv_sep: '-' + item_sep: '__' + exclude_keys: + - run + - task.data + run: + dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt + sweep: + dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt + subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} diff --git a/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml b/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml new file mode 100644 index 0000000000..46c979cd28 --- /dev/null +++ b/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +hydra: + launcher: + cpus_per_task: 8 + gpus_per_node: 8 + tasks_per_node: ${hydra.launcher.gpus_per_node} + nodes: 4 + comment: null + mem_gb: 384 + timeout_min: 4320 + max_num_timeout: 100 + constraint: volta32gb + name: ${hydra.job.config_name}/${hydra.job.override_dirname} + submitit_folder: ${hydra.sweep.dir}/submitit/%j + +distributed_training: + distributed_world_size: 32 + distributed_port: 29671 + nprocs_per_node: 8 diff --git a/examples/mr_hubert/decode.sh b/examples/mr_hubert/decode.sh new file mode 100755 index 0000000000..1ff423a84c --- /dev/null +++ b/examples/mr_hubert/decode.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +FAIRSEQ= # Setup your fairseq directory + +config_dir=${FAIRSEQ}/examples/mr_hubert/config +config_name=mr_hubert_base_librispeech + + +# Prepared Data Directory + +data_dir=librispeech +# -- data_dir +# -- test.tsv +# -- test.ltr +# -- dict.ltr.txt + + +exp_dir=exp # Target experiments directory (where you have your pre-trained model with checkpoint_best.pt) +ratios="[1, 2]" # Default label rate ratios + +_opts= + +# If use slurm, uncomment this line and modify the job submission at +# _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg" + +# If want to set additional experiment tag, uncomment this line +# _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}" + +# If use un-normalized audio, uncomment this line +# _opts="${_opts} task.normalize=false" + + + +PYTHONPATH=${FAIRSEQ} +python examples/speech_recognition/new/infer.py \ + --config-dir ${config_dir} \ + --config-name infer_multires \ + ${_opts} \ + task.data=${data_dir} \ + task.label_rate_ratios='${ratios}' \ + common_eval.results_path=${exp_dir} \ + common_eval.path=${exp_dir}/checkpoint_best.pt \ + dataset.max_tokens=2000000 \ + dataset.gen_subset=test \ + dataset.skip_invalid_size_inputs_valid_test=true + diff --git a/examples/mr_hubert/finetune.sh b/examples/mr_hubert/finetune.sh new file mode 100755 index 0000000000..31ba645560 --- /dev/null +++ b/examples/mr_hubert/finetune.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +FAIRSEQ= # Setup your fairseq directory + +config_dir=${FAIRSEQ}/examples/mr_hubert/config +config_name=mr_hubert_base_librispeech + +# override configs if need +max_tokens=3200000 +max_sample_size=1000000 +max_update=50000 + + +# Prepared Data Directory + 
+data_dir=librispeech +# -- data_dir +# -- train.tsv +# -- train.ltr +# -- valid.tsv +# -- valid.ltr +# -- dict.ltr.txt + + +exp_dir=exp # Target experiments directory +ratios="[1, 2]" # Default label rate ratios +hubert_path=/path/of/your/hubert.pt + +_opts= + +# If use slurm, uncomment this line and modify the job submission at +# _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg" + +# If want to set additional experiment tag, uncomment this line +# _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}" + + +python ${FAIRSEQ}/fairseq_cli/hydra_train.py \ + -m --config-dir ${config_dir} --config-name ${config_name} ${_opts} \ + task.data=${data_dir} +task.max_sample_size=${max_sample_size} \ + task.label_dir=${data_dir} \ + task.label_rate_ratios='${ratios}' \ + dataset.max_tokens=${max_tokens} \ + optimization.max_update=${max_update} \ + model.multires_hubert_path=${hubert_path} \ + hydra.sweep.dir=${exp_dir} & diff --git a/examples/mr_hubert/simple_kmeans b/examples/mr_hubert/simple_kmeans new file mode 120000 index 0000000000..4f95545122 --- /dev/null +++ b/examples/mr_hubert/simple_kmeans @@ -0,0 +1 @@ +../hubert/simple_kmeans \ No newline at end of file diff --git a/examples/mr_hubert/train.sh b/examples/mr_hubert/train.sh new file mode 100755 index 0000000000..da561eb171 --- /dev/null +++ b/examples/mr_hubert/train.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +FAIRSEQ= # Setup your fairseq directory + +config_dir=${FAIRSEQ}/examples/mr_hubert/config +config_name=mr_hubert_base_librispeech + +# Prepared Data Directory +data_dir=librispeech +# -- data_dir +# -- train.tsv +# -- valid.tsv + +label_dir=labels +# -- label_dir +# -- train.km +# -- valid.km +# -- dict.km.txt + + +exp_dir=exp # Target experiments directory +ratios="[1, 2]" # Default label rate ratios +label_rate=50 # Base label rate + + +_opts= + +# If use slurm, uncomment this line and modify the job submission at +# _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg" + +# If want to set additional experiment tag, uncomment this line +# _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}" + + +python ${FAIRSEQ}/fairseq_cli/hydra_train.py \ + -m --config-dir ${config_dir} --config-name ${config_name} ${_opts} \ + task.data=${data_dir} \ + task.label_dir=${label_dir} \ + task.labels='["km"]' \ + model.label_rate=${label_rate} \ + task.label_rate_ratios='${ratios}' \ + hydra.sweep.dir=${exp_dir} & + + + diff --git a/examples/multilingual/ML50_langs.txt b/examples/multilingual/ML50_langs.txt new file mode 100644 index 0000000000..558abbc785 --- /dev/null +++ b/examples/multilingual/ML50_langs.txt @@ -0,0 +1,52 @@ +ar_AR +cs_CZ +de_DE +en_XX +es_XX +et_EE +fi_FI +fr_XX +gu_IN +hi_IN +it_IT +ja_XX +kk_KZ +ko_KR +lt_LT +lv_LV +my_MM +ne_NP +nl_XX +ro_RO +ru_RU +si_LK +tr_TR +vi_VN +zh_CN +af_ZA +az_AZ +bn_IN +fa_IR +he_IL +hr_HR +id_ID +ka_GE +km_KH +mk_MK +ml_IN +mn_MN +mr_IN +pl_PL +ps_AF +pt_XX +sv_SE +sw_KE +ta_IN +te_IN +th_TH +tl_XX +uk_UA +ur_PK +xh_ZA +gl_ES +sl_SI \ No newline at end of file diff --git a/examples/multilingual/README.md b/examples/multilingual/README.md index 3559c244e2..46ff9c351b 100644 --- a/examples/multilingual/README.md +++ b/examples/multilingual/README.md @@ -17,9 +17,9 @@ This work is for training multilingual translation models with multiple bitext d - --finetune-from-model to specify the path from which to load the pretrained model ## Preprocessing data -Multilingual 
training requires a joint BPE vocab. Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/master/examples/mbart#bpe-data) to reuse our pretrained sentence-piece model. +Multilingual training requires a joint BPE vocab. Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/main/examples/mbart#bpe-data) to reuse our pretrained sentence-piece model. -You can also train a joint BPE model on your own dataset and then follow the steps in [[link]](https://github.com/pytorch/fairseq/tree/master/examples/translation#multilingual-translation). +You can also train a joint BPE model on your own dataset and then follow the steps in [[link]](https://github.com/pytorch/fairseq/tree/main/examples/translation#multilingual-translation). ## Training @@ -41,7 +41,7 @@ fairseq-train $path_2_data \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \ + --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ @@ -49,7 +49,7 @@ fairseq-train $path_2_data \ ``` ## Finetuning -We can also finetune multilingual models from a monolingual pretrained models, e.g. [mMBART](https://github.com/pytorch/fairseq/tree/master/examples/mbart). +We can also finetune multilingual models from a monolingual pretrained models, e.g. [mMBART](https://github.com/pytorch/fairseq/tree/main/examples/mbart). ```bash lang_pairs= path_2_data= @@ -69,7 +69,7 @@ fairseq-train $path_2_data \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \ + --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ @@ -108,7 +108,41 @@ cat {source_lang}_${target_lang}.txt | grep -P "^T" |sort -V |cut -f 2- |$TOK_CM sacrebleu -tok 'none' -s 'none' ${source_lang}_${target_lang}.ref < ${source_lang}_${target_lang}.hyp ``` +# mBART50 models +* [mMBART 50 pretrained model](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.pretrained.tar.gz). +* [mMBART 50 finetuned many-to-one](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.n1.tar.gz). +* [mMBART 50 finetuned one-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.1n.tar.gz). +* [mMBART 50 finetuned many-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.nn.tar.gz). + +Please download and extract from the above tarballs. 
Each tarball contains +* The fairseq model checkpoint: model.pt +* The list of supported languages: ML50_langs.txt +* Sentence piece model: sentence.bpe.model +* Fairseq dictionary of each language: dict.{lang}.txt (please replace lang with a language specified in ML50_langs.txt) + +To use the trained models, +* use the tool [binarize.py](./data_scripts/binarize.py) to binarize your data using sentence.bpe.model and dict.{lang}.txt, and copy the dictionaries to your data path +* then run the generation command: +```bash +path_2_data= +model=/model.pt +lang_list=/ML50_langs.txt +source_lang= +target_lang= + +fairseq-generate $path_2_data \ + --path $model \ + --task translation_multi_simple_epoch \ + --gen-subset test \ + --source-lang $source_lang \ + --target-lang $target_lang + --sacrebleu --remove-bpe 'sentencepiece'\ + --batch-size 32 \ + --encoder-langtok "src" \ + --decoder-langtok \ + --lang-dict "$lang_list" +``` ## Citation diff --git a/examples/multilingual/data_scripts/README.md b/examples/multilingual/data_scripts/README.md new file mode 100644 index 0000000000..cc610c0c9e --- /dev/null +++ b/examples/multilingual/data_scripts/README.md @@ -0,0 +1,24 @@ + +# Install dependency +```bash +pip install -r requirement.txt +``` + +# Download the data set +```bash +export WORKDIR_ROOT= + +``` +The downloaded data will be at $WORKDIR_ROOT/ML50 + +# preprocess the data +Install SPM [here](https://github.com/google/sentencepiece) +```bash +export WORKDIR_ROOT= +export SPM_PATH= +``` +* $WORKDIR_ROOT/ML50/raw: extracted raw data +* $WORKDIR_ROOT/ML50/dedup: dedup data +* $WORKDIR_ROOT/ML50/clean: data with valid and test sentences removed from the dedup data + + diff --git a/examples/multilingual/data_scripts/binarize.py b/examples/multilingual/data_scripts/binarize.py new file mode 100755 index 0000000000..ee54c6aabf --- /dev/null +++ b/examples/multilingual/data_scripts/binarize.py @@ -0,0 +1,200 @@ +import shutil +import os, sys +from subprocess import check_call, check_output +import glob +import argparse +import shutil +import pathlib +import itertools + +def call_output(cmd): + print(f"Executing: {cmd}") + ret = check_output(cmd, shell=True) + print(ret) + return ret + +def call(cmd): + print(cmd) + check_call(cmd, shell=True) + + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."') + sys.exit(-1) + +SPM_PATH = os.environ.get('SPM_PATH', None) + +if SPM_PATH is None or not SPM_PATH.strip(): + print("Please install sentence piecence from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. 
Exitting...") + sys.exit(-1) + + +SPM_MODEL = f'{WORKDIR_ROOT}/sentence.bpe.model' +SPM_VOCAB = f'{WORKDIR_ROOT}/dict_250k.txt' + +SPM_ENCODE = f'{SPM_PATH}' + +if not os.path.exists(SPM_MODEL): + call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model -O {SPM_MODEL}") + + +if not os.path.exists(SPM_VOCAB): + call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/dict_250k.txt -O {SPM_VOCAB}") + + + +def get_data_size(raw): + cmd = f'wc -l {raw}' + ret = call_output(cmd) + return int(ret.split()[0]) + +def encode_spm(model, direction, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None): + src, tgt = direction.split('-') + + for split in splits: + src_raw, tgt_raw = f'{RAW_DIR}/{split}{prefix}.{direction}.{src}', f'{RAW_DIR}/{split}{prefix}.{direction}.{tgt}' + if os.path.exists(src_raw) and os.path.exists(tgt_raw): + cmd = f"""python {SPM_ENCODE} \ + --model {model}\ + --output_format=piece \ + --inputs {src_raw} {tgt_raw} \ + --outputs {BPE_DIR}/{direction}{prefix}/{split}.bpe.{src} {BPE_DIR}/{direction}{prefix}/{split}.bpe.{tgt} """ + print(cmd) + call(cmd) + + +def binarize_( + bpe_dir, + databin_dir, + direction, spm_vocab=SPM_VOCAB, + splits=['train', 'test', 'valid'], +): + src, tgt = direction.split('-') + + try: + shutil.rmtree(f'{databin_dir}', ignore_errors=True) + os.mkdir(f'{databin_dir}') + except OSError as error: + print(error) + cmds = [ + "fairseq-preprocess", + f"--source-lang {src} --target-lang {tgt}", + f"--destdir {databin_dir}/", + f"--workers 8", + ] + if isinstance(spm_vocab, tuple): + src_vocab, tgt_vocab = spm_vocab + cmds.extend( + [ + f"--srcdict {src_vocab}", + f"--tgtdict {tgt_vocab}", + ] + ) + else: + cmds.extend( + [ + f"--joined-dictionary", + f"--srcdict {spm_vocab}", + ] + ) + input_options = [] + if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"): + input_options.append( + f"--trainpref {bpe_dir}/train.bpe", + ) + if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"): + input_options.append(f"--validpref {bpe_dir}/valid.bpe") + if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"): + input_options.append(f"--testpref {bpe_dir}/test.bpe") + if len(input_options) > 0: + cmd = " ".join(cmds + input_options) + print(cmd) + call(cmd) + + +def binarize( + databin_dir, + direction, spm_vocab=SPM_VOCAB, prefix='', + splits=['train', 'test', 'valid'], + pairs_per_shard=None, +): + def move_databin_files(from_folder, to_folder): + for bin_file in glob.glob(f"{from_folder}/*.bin") \ + + glob.glob(f"{from_folder}/*.idx") \ + + glob.glob(f"{from_folder}/dict*"): + try: + shutil.move(bin_file, to_folder) + except OSError as error: + print(error) + bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin" + bpe_dir = f"{BPE_DIR}/{direction}{prefix}" + if pairs_per_shard is None: + binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits) + move_databin_files(bpe_databin_dir, databin_dir) + else: + # binarize valid and test which will not be sharded + binarize_( + bpe_dir, bpe_databin_dir, direction, + spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"]) + for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"): + path_strs = os.path.split(shard_bpe_dir) + shard_str = path_strs[-1] + shard_folder = f"{bpe_databin_dir}/{shard_str}" + databin_shard_folder = f"{databin_dir}/{shard_str}" + print(f'working from {shard_folder} to {databin_shard_folder}') + os.makedirs(databin_shard_folder, exist_ok=True) + binarize_( + shard_bpe_dir, shard_folder, direction, + 
spm_vocab=spm_vocab, splits=["train"]) + + for test_data in glob.glob(f"{bpe_databin_dir}/valid.*") + glob.glob(f"{bpe_databin_dir}/test.*"): + filename = os.path.split(test_data)[-1] + try: + os.symlink(test_data, f"{databin_shard_folder}/{filename}") + except OSError as error: + print(error) + move_databin_files(shard_folder, databin_shard_folder) + + +def load_langs(path): + with open(path) as fr: + langs = [l.strip() for l in fr] + return langs + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50") + parser.add_argument("--raw-folder", default='raw') + parser.add_argument("--bpe-folder", default='bpe') + parser.add_argument("--databin-folder", default='databin') + + args = parser.parse_args() + + DATA_PATH = args.data_root #'/private/home/yuqtang/public_data/ML50' + RAW_DIR = f'{DATA_PATH}/{args.raw_folder}' + BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}' + DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}' + os.makedirs(BPE_DIR, exist_ok=True) + + raw_files = itertools.chain( + glob.glob(f'{RAW_DIR}/train*'), + glob.glob(f'{RAW_DIR}/valid*'), + glob.glob(f'{RAW_DIR}/test*'), + ) + + directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files] + + for direction in directions: + prefix = "" + splits = ['train', 'valid', 'test'] + try: + shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True) + os.mkdir(f'{BPE_DIR}/{direction}{prefix}') + os.makedirs(DATABIN_DIR, exist_ok=True) + except OSError as error: + print(error) + spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB + encode_spm(spm_model, direction=direction, splits=splits) + binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits) diff --git a/examples/multilingual/data_scripts/check_iswlt_test_data.py b/examples/multilingual/data_scripts/check_iswlt_test_data.py new file mode 100644 index 0000000000..f8e2eb0f15 --- /dev/null +++ b/examples/multilingual/data_scripts/check_iswlt_test_data.py @@ -0,0 +1,67 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import os, sys +import subprocess +import re +from subprocess import check_call, check_output + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."') + sys.exit(-1) + + +BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ") +def run_eval_bleu(cmd): + output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip() + print(output) + bleu = -1.0 + for line in output.strip().split('\n'): + m = BLEU_REGEX.search(line) + if m is not None: + bleu = m.groups()[0] + bleu = float(bleu) + break + return bleu + +def check_data_test_bleu(raw_folder, data_lang_pairs): + not_matchings = [] + for sacrebleu_set, src_tgts in data_lang_pairs: + for src_tgt in src_tgts: + print(f'checking test bleus for: {src_tgt} at {sacrebleu_set}') + src, tgt = src_tgt.split('-') + ssrc, stgt = src[:2], tgt[:2] + if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'): + # reversed direction may have different test set + test_src = f'{raw_folder}/test.{tgt}-{src}.{src}' + else: + test_src = f'{raw_folder}/test.{src}-{tgt}.{src}' + cmd1 = f'cat {test_src} | sacrebleu -t "{sacrebleu_set}" -l {stgt}-{ssrc}; [ $? 
-eq 0 ] || echo ""' + test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}' + cmd2 = f'cat {test_tgt} | sacrebleu -t "{sacrebleu_set}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""' + bleu1 = run_eval_bleu(cmd1) + if bleu1 != 100.0: + not_matchings.append(f'{sacrebleu_set}:{src_tgt} source side not matching: {test_src}') + bleu2 = run_eval_bleu(cmd2) + if bleu2 != 100.0: + not_matchings.append(f'{sacrebleu_set}:{src_tgt} target side not matching: {test_tgt}') + return not_matchings + +if __name__ == "__main__": + to_data_path = f'{WORKDIR_ROOT}/iwsltv2' + not_matching = check_data_test_bleu( + f'{to_data_path}/raw', + [ + ('iwslt17', ['en_XX-ar_AR', 'en_XX-ko_KR', 'ar_AR-en_XX', 'ko_KR-en_XX']), + ('iwslt17', ['en_XX-it_IT', 'en_XX-nl_XX', 'it_IT-en_XX', 'nl_XX-en_XX']), + ('iwslt17/tst2015', ['en_XX-vi_VN', "vi_VN-en_XX"]), + ] + ) + if len(not_matching) > 0: + print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching)) + diff --git a/examples/multilingual/data_scripts/check_self_overlaps.py b/examples/multilingual/data_scripts/check_self_overlaps.py new file mode 100644 index 0000000000..07b338dcfd --- /dev/null +++ b/examples/multilingual/data_scripts/check_self_overlaps.py @@ -0,0 +1,103 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import os +import glob +import argparse +from utils.dedup import deup +import sys + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."') + sys.exit(-1) + +def get_directions(folder): + raw_files = glob.glob(f'{folder}/train*') + directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files] + return directions + +def diff_list(lhs, rhs): + return set(lhs).difference(set(rhs)) + +def check_diff( + from_src_file, from_tgt_file, + to_src_file, to_tgt_file, +): + seen_in_from = set() + seen_src_in_from = set() + seen_tgt_in_from = set() + from_count = 0 + with open(from_src_file, encoding='utf-8') as fsrc, \ + open(from_tgt_file, encoding='utf-8') as ftgt: + for s, t in zip(fsrc, ftgt): + seen_in_from.add((s, t)) + seen_src_in_from.add(s) + seen_tgt_in_from.add(t) + from_count += 1 + common = 0 + common_src = 0 + common_tgt = 0 + to_count = 0 + seen = set() + + with open(to_src_file, encoding='utf-8') as fsrc, \ + open(to_tgt_file, encoding='utf-8') as ftgt: + for s, t in zip(fsrc, ftgt): + to_count += 1 + if (s, t) not in seen: + if (s, t) in seen_in_from: + common += 1 + if s in seen_src_in_from: + common_src += 1 + seen_src_in_from.remove(s) + if t in seen_tgt_in_from: + common_tgt += 1 + seen_tgt_in_from.remove(t) + seen.add((s, t)) + return common, common_src, common_tgt, from_count, to_count + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--folder", type=str, required=True, + help="the data folder ") + parser.add_argument("--split", type=str, default='test', + help="split (valid, test) to check against training data") + parser.add_argument('--directions', type=str, default=None, required=False) + + args = parser.parse_args() + + if args.directions is None: + directions = set(get_directions(args.folder)) + directions = sorted(directions) + else: + directions = args.directions.split(',') + directions = sorted(set(directions)) + + results = [] + print(f'checking where 
{args.split} split data are in training') + print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size') + + for direction in directions: + src, tgt = direction.split('-') + from_src_file = f'{args.folder}/{args.split}.{src}-{tgt}.{src}' + from_tgt_file = f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}' + if not os.path.exists(from_src_file): + # some test/valid data might in reverse directinos: + from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}' + from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}' + to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}' + to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}' + if not os.path.exists(to_src_file) or not os.path.exists(from_src_file): + continue + r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file) + results.append(r) + print(f'{direction}\t', '\t'.join(map(str, r))) + + +if __name__ == "__main__": + main() diff --git a/examples/multilingual/data_scripts/check_valid_test_overlaps.py b/examples/multilingual/data_scripts/check_valid_test_overlaps.py new file mode 100644 index 0000000000..40fa9aecdf --- /dev/null +++ b/examples/multilingual/data_scripts/check_valid_test_overlaps.py @@ -0,0 +1,124 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import os +import argparse +import pandas as pd +import sys + + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."') + sys.exit(-1) + +def load_langs(path): + with open(path) as fr: + langs = [l.strip() for l in fr] + return langs + + + +def load_sentences(raw_data, split, direction): + src, tgt = direction.split('-') + src_path = f"{raw_data}/{split}.{direction}.{src}" + tgt_path = f"{raw_data}/{split}.{direction}.{tgt}" + if os.path.exists(src_path) and os.path.exists(tgt_path): + return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())] + else: + return [] + +def swap_direction(d): + src, tgt = d.split('-') + return f'{tgt}-{src}' + +def get_all_test_data(raw_data, directions, split='test'): + test_data = [ + x + for dd in directions + for d in [dd, swap_direction(dd)] + for x in load_sentences(raw_data, split, d) + ] + # all_test_data = {s for _, d in test_data for s in d} + all_test_data = {} + for lang, d in test_data: + for s in d: + s = s.strip() + lgs = all_test_data.get(s, set()) + lgs.add(lang) + all_test_data[s] = lgs + return all_test_data, test_data + + +def check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train={}): + # src, tgt = direction.split('-') + print(f'check training data for {direction} in {src_path} and {tgt_path}') + size = 0 + overlapped_size_counted_dup = 0 + if not os.path.exists(tgt_path) or not os.path.exists(src_path): + return mess_up_train, size, overlapped_size_counted_dup + + with open(src_path) as f, open(tgt_path) as g: + for src_line, tgt_line in zip(f, g): + s = src_line.strip() + t = tgt_line.strip() + size += 1 + if s in all_test_data: + langs = mess_up_train.get(s, set()) + langs.add(direction) + mess_up_train[s] = langs + overlapped_size_counted_dup += 1 + if t in all_test_data: + langs = mess_up_train.get(t, set()) + langs.add(direction) + mess_up_train[t] = langs + overlapped_size_counted_dup += 1 + print(f'{direction}: size={size}, 
overlapped={overlapped_size_counted_dup}') + return mess_up_train, size, overlapped_size_counted_dup + +def check_train_all(raw_data, directions, all_test_data): + mess_up_train = {} + data_sizes = {} + # raw_data = '~chau/data-bin/MineBART/multilingual_mined_100M/en_XX/et_EE-en_XX/all.{en_XX, et_EE}' + print(f'checking training data againsts # {len(all_test_data)} sentences') + print(f'example test data: ', [s for i, s in enumerate(all_test_data.keys()) if i < 10]) + for direction in directions: + src, tgt = direction.split('-') + path = f'{raw_data}/en_XX/{direction}/all' + src_path = f'{path}.{src}' + tgt_path = f'{path}.{tgt}' + print(f'checking {src_path} {tgt_path}') + _, size, overlapped_size_counted_dup = check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train) + data_sizes[direction] = (size, overlapped_size_counted_dup) + return mess_up_train, data_sizes + + + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--folder", type=str, required=True, + help="the data folder ") + parser.add_argument("--test-data", type=str, required=True, + help="the test data folder ") + parser.add_argument('--directions', type=str, default=None, required=False) + + args = parser.parse_args() + directions = args.directions.split(',') + directions = sorted(set(directions)) + + results = [] + # print(f'checking where {args.split} split data are in training') + # print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size') + raw_data = args.folder + all_test_data, test_data = get_all_test_data(args.test_data, directions, split='test') + mess_up_train, data_sizes = check_train_all(raw_data, directions, all_test_data) + print(data_sizes) + + +if __name__ == "__main__": + main() diff --git a/examples/multilingual/data_scripts/dedup_all.py b/examples/multilingual/data_scripts/dedup_all.py new file mode 100644 index 0000000000..ef39c05ee6 --- /dev/null +++ b/examples/multilingual/data_scripts/dedup_all.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + + +import os +import glob +import argparse +from utils.dedup import deup + +import sys +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. 
Exitting..."') + sys.exit(-1) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--from-folder", type=str, required=True, + help="the data folder to be dedup") + parser.add_argument("--to-folder", type=str, required=True, + help="the data folder to save deduped data") + parser.add_argument('--directions', type=str, default=None, required=False) + + args = parser.parse_args() + + if args.directions is None: + raw_files = glob.glob(f'{args.from_folder}/train*') + + directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files] + else: + directions = args.directions.split(',') + directions = sorted(set(directions)) + + for direction in directions: + src, tgt = direction.split('-') + src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}' + tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}' + src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}' + tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}' + assert src_file != src_file_out + assert tgt_file != tgt_file_out + print(f'deduping {src_file}, {tgt_file}') + deup(src_file, tgt_file, src_file_out, tgt_file_out) + + +if __name__ == "__main__": + main() diff --git a/examples/multilingual/data_scripts/download_ML50_v1.sh b/examples/multilingual/data_scripts/download_ML50_v1.sh new file mode 100644 index 0000000000..99fbc75920 --- /dev/null +++ b/examples/multilingual/data_scripts/download_ML50_v1.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." + exit +fi + +# first run download_wmt20.sh; it will install a few useful tools for other scripts +# TODO: need to print out instructions on downloading a few files which requires manually authentication from the websites +bash ./download_wmt20.sh + +python ./download_wmt19_and_before.py +bash ./download_wat19_my.sh +python ./download_ted_and_extract.py +bash ./download_lotus.sh +bash ./download_iitb.sh +bash ./download_af_xh.sh + + +# IWSLT downloading URLs have changed in between; TODO: fix them: +bash ./download_iwslt_and_extract.sh + +# TODO: globalvoices URLs changed; need to be fixed +bash ./download_flores_data.sh diff --git a/examples/multilingual/data_scripts/download_af_xh.sh b/examples/multilingual/data_scripts/download_af_xh.sh new file mode 100644 index 0000000000..a78fbbbbcc --- /dev/null +++ b/examples/multilingual/data_scripts/download_af_xh.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# set -x -e + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." + exit +fi + + +# put intermediate files +TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2 +# output {train,valid,test} files to dest +DEST=${WORKDIR_ROOT}/ML50/raw + + + +ROOT=${WORKDIR_ROOT} +UTILS=$PWD/utils +TMX2CORPUS="${UTILS}/tmx2corpus" +TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py" + +mkdir -p $TMP_DIR +mkdir -p $DEST +mkdir -p $UTILS + +function download_opus(){ + src=$1 + tgt=$2 + subset=$3 + ulr=$4 + + mkdir extract_$subset.$src-$tgt + pushd extract_$subset.$src-$tgt + if [ ! 
-f "$subset.$src-$tgt.tmx.gz" ]; then + wget $url -O "$subset.$src-$tgt.tmx.gz" + gzip -d "$subset.$src-$tgt.tmx.gz" + f=$subset.$src-$tgt.tmx + $TMX_TOOL $f + mv bitext.$src ../$subset.$src-$tgt.$src + mv bitext.$tgt ../$subset.$src-$tgt.$tgt + fi + popd +} + +function concat_subsets(){ + src=$1 + tgt=$2 + subsets=$3 + src_train=raw_train.$src-$tgt.$src + tgt_train=raw_train.$src-$tgt.$tgt + > $src_train + > $tgt_train + for subset in $subsets; do + cat $subset.$src-$tgt.$src >> $src_train + cat $subset.$src-$tgt.$tgt >> $tgt_train + done +} + + + +function get_seeded_random() +{ + seed="$1" + openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \ + /dev/null +} + +function split_train_valid(){ + src=$1 + tgt=$2 + raw_src_train=raw_train.$src-$tgt.$src + raw_tgt_train=raw_train.$src-$tgt.$tgt + + shuf --random-source=<(get_seeded_random 43) $raw_src_train > shuffled.$src-$tgt.$src + shuf --random-source=<(get_seeded_random 43) $raw_tgt_train > shuffled.$src-$tgt.$tgt + + head -n 1500 shuffled.$src-$tgt.$src > valid.$src-$tgt.$src + head -n 1500 shuffled.$src-$tgt.$tgt > valid.$src-$tgt.$tgt + + tail +1501 shuffled.$src-$tgt.$src > train.$src-$tgt.$src + tail +1501 shuffled.$src-$tgt.$tgt > train.$src-$tgt.$tgt +} + +function copy2dst(){ + lsrc=$1 + ltgt=$2 + src=${lsrc:0:2} + tgt=${ltgt:0:2} + + + cp valid.$src-$tgt.$src $DEST/valid.$lsrc-$ltgt.$lsrc + cp valid.$src-$tgt.$tgt $DEST/valid.$lsrc-$ltgt.$ltgt + + cp train.$src-$tgt.$src $DEST/train.$lsrc-$ltgt.$lsrc + cp train.$src-$tgt.$tgt $DEST/train.$lsrc-$ltgt.$ltgt +} + + + + +#for xh-en +declare -A xh_en_urls +xh_en_urls=( + [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz + [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz + [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz + [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz + [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz + [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz + [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz + [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz +) + +mkdir $TMP_DIR/xh-en +pushd $TMP_DIR/xh-en +for k in "${!xh_en_urls[@]}" +do + name=$k + url=${xh_en_urls[$k]} + echo "$name: $url" + download_opus xh en $name $ulr +done +concat_subsets xh en "${!xh_en_urls[@]}" +split_train_valid xh en +copy2dst xh_ZA en_XX +popd + + +## +#for af-en +declare -A af_en_urls +af_en_urls=( + [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz + [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz + [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz + [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz + [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz + [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz + [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz + [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz +) + +mkdir $TMP_DIR/af-en +pushd $TMP_DIR/af-en +for k in "${!af_en_urls[@]}" +do + name=$k + url=${af_en_urls[$k]} + echo "$name: $url" + download_opus af en $name $ulr +done +concat_subsets af en "${!af_en_urls[@]}" +split_train_valid af en +copy2dst af_ZA en_XX +popd + + diff --git a/examples/multilingual/data_scripts/download_flores_data.sh b/examples/multilingual/data_scripts/download_flores_data.sh new file mode 100644 index 
0000000000..e6175ce0c3 --- /dev/null +++ b/examples/multilingual/data_scripts/download_flores_data.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." + exit +fi + + +set -e +set -o pipefail + +SRC=en +SI_TGT=si +NE_TGT=ne + +DESTDIR=${WORKDIR_ROOT}/ML50/raw/ + +ROOT=${WORKDIR_ROOT}/tmp +mkdir -p $ROOT +DATA=$ROOT/data +NE_ROOT=$DATA/all-clean-ne +SI_ROOT=$DATA/all-clean-si + +mkdir -p $DATA $NE_ROOT $SI_ROOT + +SI_OPUS_DATASETS=( + "$SI_ROOT/GNOME.en-si" + "$SI_ROOT/Ubuntu.en-si" + "$SI_ROOT/KDE4.en-si" + "$SI_ROOT/OpenSubtitles.en-si" +) + +SI_OPUS_URLS=( + "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-si.txt.zip" + "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-si.txt.zip" + "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-si.txt.zip" + "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-si.txt.zip" +) + +NE_OPUS_DATASETS=( + "$NE_ROOT/GNOME.en-ne" + "$NE_ROOT/Ubuntu.en-ne" + "$NE_ROOT/KDE4.en-ne" +) + +NE_OPUS_URLS=( + "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip" + "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip" + "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip" +) + +REMOVE_FILE_PATHS=() + +# Download data +download_data() { + CORPORA=$1 + URL=$2 + + if [ -f $CORPORA ]; then + echo "$CORPORA already exists, skipping download" + else + echo "Downloading $URL" + wget $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA + if [ -f $CORPORA ]; then + echo "$URL successfully downloaded." + else + echo "$URL not successfully downloaded." 
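+      # download failed: drop the partial file and abort so a re-run can retry this corpus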
+ rm -f $CORPORA + exit -1 + fi + fi +} + +# Example: download_opus_data $LANG_ROOT $TGT +download_opus_data() { + LANG_ROOT=$1 + TGT=$2 + + if [ "$TGT" = "si" ]; then + URLS=("${SI_OPUS_URLS[@]}") + DATASETS=("${SI_OPUS_DATASETS[@]}") + else + URLS=("${NE_OPUS_URLS[@]}") + DATASETS=("${NE_OPUS_DATASETS[@]}") + fi + + # Download and extract data + for ((i=0;i<${#URLS[@]};++i)); do + URL=${URLS[i]} + CORPORA=${DATASETS[i]} + + download_data $CORPORA $URL + unzip -o $CORPORA -d $LANG_ROOT + REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE ) + done + + cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC + cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT + + REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC ) + REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT ) +} + +download_opus_data $SI_ROOT $SI_TGT +cp ${SI_OPUS_DATASETS[3]}.$SRC $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SRC +cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TGT +REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT ) + +download_opus_data $NE_ROOT $NE_TGT + + +# Download and extract Global Voices data +GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en" +GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz" + +download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL +gunzip -Nf $GLOBAL_VOICES.gz + +sed -ne 's?.*\(.*\).*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT +sed -ne 's?.*]*>\(.*\).*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC + +REMOVE_FILE_PATHS+=( $GLOBAL_VOICES ) + +# Download and extract the bible dataset +BIBLE_TOOLS=bible-corpus-tools +XML_BIBLES=XML_Bibles +XML_BIBLES_DUP=XML_Bibles_dup + +if [ ! -e $BIBLE_TOOLS ]; then + echo "Cloning bible-corpus-tools repository..." 
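+  # the cloned sources are compiled with javac below and used to build verse-aligned bible books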
+ git clone https://github.com/christos-c/bible-corpus-tools.git +fi + +mkdir -p $BIBLE_TOOLS/bin $XML_BIBLES $XML_BIBLES_DUP +javac -cp "$BIBLE_TOOLS/lib/*" -d $BIBLE_TOOLS/bin $BIBLE_TOOLS/src/bible/readers/*.java $BIBLE_TOOLS/src/bible/*.java + +download_data bible.tar.gz "https://github.com/christos-c/bible-corpus/archive/v1.2.1.tar.gz" +tar xvzf bible.tar.gz + +cp bible-corpus-1.2.1/bibles/{Greek.xml,English.xml,Nepali.xml} $XML_BIBLES/ +cp bible-corpus-1.2.1/bibles/{Greek.xml,English-WEB.xml,Nepali.xml} $XML_BIBLES_DUP/ + +java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES +java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES_DUP +java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES +java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES_DUP + +cat $XML_BIBLES/aligned/*/English.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$SRC +cat $XML_BIBLES/aligned/*/Nepali.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$NE_TGT +cat $XML_BIBLES_DUP/aligned/*/English-WEB.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$SRC +cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_TGT +REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP ) + +# Download and extract the Penn Treebank dataset +NE_TAGGED=$ROOT/new_submissions_parallel_corpus_project_Nepal +NE_TAGGED_URL="http://www.cle.org.pk/Downloads/ling_resources/parallelcorpus/NepaliTaggedCorpus.zip" +EN_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.en.patch" +NE_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.ne.patch" +MOSES=mosesdecoder +MOSES_TOK=$MOSES/scripts/tokenizer +EN_PATCH_REGEX="{s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}" +NE_PATCH_REGEX="{s:\p{Cf}::g;s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}" + +download_data $DATA/nepali-penn-treebank.$SRC.patch $EN_TAGGED_PATCH_URL +download_data $DATA/nepali-penn-treebank.$NE_TGT.patch $NE_TAGGED_PATCH_URL +download_data original.zip $NE_TAGGED_URL +unzip -o original.zip -d $ROOT + +cat $NE_TAGGED/00.txt $NE_TAGGED/01.txt $NE_TAGGED/02.txt > $NE_TAGGED/nepali-penn-treebank.$SRC +cat $NE_TAGGED/00ne_revised.txt $NE_TAGGED/01ne_revised.txt $NE_TAGGED/02ne_revised.txt > $NE_TAGGED/nepali-penn-treebank.$NE_TGT + +patch $NE_TAGGED/nepali-penn-treebank.$SRC -i $DATA/nepali-penn-treebank.$SRC.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$SRC +patch $NE_TAGGED/nepali-penn-treebank.$NE_TGT -i $DATA/nepali-penn-treebank.$NE_TGT.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT + +if [ ! -e $MOSES ]; then + echo "Cloning moses repository..." 
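+  # mosesdecoder provides tokenizer.perl/detokenizer.perl, used below to normalize the Nepali Penn Treebank text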
+ git clone https://github.com/moses-smt/mosesdecoder.git +fi + +cat $NE_TAGGED/nepali-penn-treebank-patched.$SRC | \ + perl -anpe "$EN_PATCH_REGEX" | \ + $MOSES_TOK/tokenizer.perl -l $SRC | \ + $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$SRC + +cat $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT | \ + perl -CIO -anpe "$NE_PATCH_REGEX" | \ + $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$NE_TGT + + +# Download nepali dictionary data +NE_DICT=$NE_ROOT/dictionaries +download_data $NE_DICT "http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz" +tar xvzf $NE_DICT +cp dictionaries/dict.ne $NE_ROOT/dictionary.$NE_TGT-$SRC +REMOVE_FILE_PATHS+=( $NE_DICT dictionaries ) + +REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch ) + + +# Remove the temporary files +for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do + rm -rf ${REMOVE_FILE_PATHS[i]} +done + +# Copy the training data +si=si_LK +ne=ne_NP +en=en_XX +cat $SI_ROOT/GNOMEKDEUbuntu.en-si.si $SI_ROOT/OpenSubtitles2018.en-si.si > $DESTDIR/train.$si-$en.$si +cat $SI_ROOT/GNOMEKDEUbuntu.en-si.en $SI_ROOT/OpenSubtitles2018.en-si.en > $DESTDIR/train.$si-$en.$en + +cat $NE_ROOT/bible_dup.en-ne.ne $NE_ROOT/bible.en-ne.ne $NE_ROOT/globalvoices.2018q4.ne-en.ne $NE_ROOT/GNOMEKDEUbuntu.en-ne.ne $NE_ROOT/nepali-penn-treebank.ne > $DESTDIR/train.$ne-$en.$ne +cat $NE_ROOT/bible_dup.en-ne.en $NE_ROOT/bible.en-ne.en $NE_ROOT/globalvoices.2018q4.ne-en.en $NE_ROOT/GNOMEKDEUbuntu.en-ne.en $NE_ROOT/nepali-penn-treebank.en > $DESTDIR/train.$ne-$en.$en + + +#Download the test sets +wget https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz +tar -xvzf wikipedia_en_ne_si_test_sets.tgz + +cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.ne $DESTDIR/valid.$ne-$en.$ne +cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.en $DESTDIR/valid.$ne-$en.$en + +cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.si $DESTDIR/valid.$si-$en.$si +cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.en $DESTDIR/valid.$si-$en.$en + +cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.ne $DESTDIR/devtest.$ne-$en.$ne +cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.en $DESTDIR/devtest.$ne-$en.$en + +cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.si $DESTDIR/devtest.$si-$en.$si +cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.en $DESTDIR/devtest.$si-$en.$en + +cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.ne $DESTDIR/test.$ne-$en.$ne +cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.en $DESTDIR/test.$ne-$en.$en + +cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.si $DESTDIR/test.$si-$en.$si +cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.en $DESTDIR/test.$si-$en.$en + +rm -rf wikipedia_en_ne_si_test_sets.tgz wikipedia_en_ne_si_test_sets diff --git a/examples/multilingual/data_scripts/download_iitb.sh b/examples/multilingual/data_scripts/download_iitb.sh new file mode 100644 index 0000000000..a884e20839 --- /dev/null +++ b/examples/multilingual/data_scripts/download_iitb.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." 
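+  # stop here: the IITB downloads and the ML50/raw outputs below are all rooted at $WORKDIR_ROOT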
+ exit +fi + +IITB=$WORKDIR_ROOT/IITB +mkdir -p $IITB +pushd $IITB + +wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz +tar -xvzf parallel.tgz + +wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/dev_test.tgz +tar -xvzf dev_test.tgz + +DESTDIR=${WORKDIR_ROOT}/ML50/raw/ + +cp parallel/IITB.en-hi.en $DESTDIR/train.hi_IN-en_XX.en_XX +cp parallel/IITB.en-hi.hi $DESTDIR/train.hi_IN-en_XX.hi_IN + +cp dev_test/dev.en $DESTDIR/valid.hi_IN-en_XX.en_XX +cp dev_test/dev.hi $DESTDIR/valid.hi_IN-en_XX.hi_IN + +cp dev_test/test.en $DESTDIR/test.hi_IN-en_XX.en_XX +cp dev_test/test.hi $DESTDIR/test.hi_IN-en_XX.hi_IN +popd \ No newline at end of file diff --git a/examples/multilingual/data_scripts/download_iwslt_and_extract.sh b/examples/multilingual/data_scripts/download_iwslt_and_extract.sh new file mode 100644 index 0000000000..ca3591b3db --- /dev/null +++ b/examples/multilingual/data_scripts/download_iwslt_and_extract.sh @@ -0,0 +1,225 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +#echo 'Cloning Moses github repository (for tokenization scripts)...' +#git clone https://github.com/moses-smt/mosesdecoder.git + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." + exit +fi + + + +data_root=${WORKDIR_ROOT}/iwsltv2 +DESTDIR=${WORKDIR_ROOT}/ML50/raw + + +langs="ar_AR it_IT nl_XX ko_KR vi_VN" +echo "data_root: $data_root" + +download_path=${data_root}/downloads +raw=${DESTDIR} +tmp=${data_root}/tmp +orig=${data_root}/orig + +mkdir -p $download_path $orig $raw $tmp +####################### +download_iwslt(){ + iwslt_key=$1 + src=$2 + tgt=$3 + save_prefix=$4 + pushd ${download_path} + if [[ ! -f ${save_prefix}$src-$tgt.tgz ]]; then + wget https://wit3.fbk.eu/archive/${iwslt_key}/texts/$src/$tgt/$src-$tgt.tgz -O ${save_prefix}$src-$tgt.tgz + [ $? -eq 0 ] && return 0 + fi + popd +} + +extract_iwslt(){ + src=$1 + tgt=$2 + prefix=$3 + pushd $orig + tar zxvf ${download_path}/${prefix}$src-${tgt}.tgz + popd +} + +generate_train(){ + lsrc=$1 + ltgt=$2 + src=${lsrc:0:2} + tgt=${ltgt:0:2} + for ll in $lsrc $ltgt; do + l=${ll:0:2} + f="$orig/*/train.tags.$src-$tgt.$l" + f_raw=$raw/train.$lsrc-$ltgt.$ll + cat $f \ + | grep -v '' \ + | grep -v '' \ + | grep -v '' \ + | grep -v '' \ + | grep -v '' \ + | sed -e 's///g' \ + | sed -e 's/<\/title>//g' \ + | sed -e 's/<description>//g' \ + | sed -e 's/<\/description>//g' \ + | sed 's/^\s*//g' \ + | sed 's/\s*$//g' \ + > $f_raw + [ $? 
-eq 0 ] && echo "extracted $f to $f_raw" + done + return 0 +} + +convert_valid_test(){ + src=$1 + tgt=$2 + for l in $src $tgt; do + echo "lang: ${l}" + for o in `ls $orig/*/IWSLT*.TED*.$src-$tgt.$l.xml`; do + fname=${o##*/} + f=$tmp/${fname%.*} + echo "$o => $f" + grep '<seg id' $o \ + | sed -e 's/<seg id="[0-9]*">\s*//g' \ + | sed -e 's/\s*<\/seg>\s*//g' \ + | sed -e "s/\’/\'/g" \ + > $f + echo "" + done + done +} + +generate_subset(){ + lsrc=$1 + ltgt=$2 + src=${lsrc:0:2} + tgt=${ltgt:0:2} + subset=$3 + prefix=$4 + for ll in $lsrc $ltgt; do + l=${ll:0:2} + f=$tmp/$prefix.${src}-${tgt}.$l + if [[ -f $f ]]; then + cp $f $raw/$subset.${lsrc}-$ltgt.${ll} + fi + done +} +################# + +echo "downloading iwslt training and dev data" +# using multilingual for it, nl +download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo +download_iwslt "2017-01-trnted" ar en +download_iwslt "2017-01-trnted" en ar +download_iwslt "2017-01-trnted" ko en +download_iwslt "2017-01-trnted" en ko +download_iwslt "2015-01" vi en +download_iwslt "2015-01" en vi + +echo "donwloading iwslt test data" +download_iwslt "2017-01-mted-test" it en "test." +download_iwslt "2017-01-mted-test" en it "test." +download_iwslt "2017-01-mted-test" nl en "test." +download_iwslt "2017-01-mted-test" en nl "test." + +download_iwslt "2017-01-ted-test" ar en "test." +download_iwslt "2017-01-ted-test" en ar "test." +download_iwslt "2017-01-ted-test" ko en "test." +download_iwslt "2017-01-ted-test" en ko "test." +download_iwslt "2015-01-test" vi en "test." +download_iwslt "2015-01-test" en vi "test." + +echo "extract training data tar balls" +extract_iwslt DeEnItNlRo DeEnItNlRo +extract_iwslt ar en +extract_iwslt en ar +extract_iwslt ko en +extract_iwslt en ko +extract_iwslt vi en +extract_iwslt en vi + + +echo "extracting iwslt test data" +for lang in $langs; do + l=${lang:0:2} + extract_iwslt $l en "test." + extract_iwslt en $l "test." 
+done + +echo "convert dev and test data" +for lang in $langs; do + s_lang=${lang:0:2} + convert_valid_test $s_lang en + convert_valid_test en $s_lang +done + + + +echo "creating training data into $raw" +for lang in $langs; do + generate_train $lang en_XX + generate_train en_XX $lang +done + +echo "creating iwslt dev data into raw" +generate_subset en_XX vi_VN valid "IWSLT15.TED.tst2013" +generate_subset vi_VN en_XX valid "IWSLT15.TED.tst2013" + +generate_subset en_XX ar_AR valid "IWSLT17.TED.tst2016" +generate_subset ar_AR en_XX valid "IWSLT17.TED.tst2016" +generate_subset en_XX ko_KR valid "IWSLT17.TED.tst2016" +generate_subset ko_KR en_XX valid "IWSLT17.TED.tst2016" + + +generate_subset en_XX it_IT valid "IWSLT17.TED.tst2010" +generate_subset it_IT en_XX valid "IWSLT17.TED.tst2010" +generate_subset en_XX nl_XX valid "IWSLT17.TED.tst2010" +generate_subset nl_XX en_XX valid "IWSLT17.TED.tst2010" + +echo "creating iswslt test data into raw" +generate_subset en_XX vi_VN test "IWSLT15.TED.tst2015" +generate_subset vi_VN en_XX test "IWSLT15.TED.tst2015" + +generate_subset en_XX ar_AR test "IWSLT17.TED.tst2017" +generate_subset ar_AR en_XX test "IWSLT17.TED.tst2017" +generate_subset en_XX ko_KR test "IWSLT17.TED.tst2017" +generate_subset ko_KR en_XX test "IWSLT17.TED.tst2017" + +generate_subset en_XX it_IT test "IWSLT17.TED.tst2017.mltlng" +generate_subset it_IT en_XX test "IWSLT17.TED.tst2017.mltlng" +generate_subset en_XX nl_XX test "IWSLT17.TED.tst2017.mltlng" +generate_subset nl_XX en_XX test "IWSLT17.TED.tst2017.mltlng" + +# normalze iwslt directions into x-en +pushd $raw +for lang in $langs; do + for split in test valid; do + x_en_f1=$split.$lang-en_XX.en_XX + x_en_f2=$split.$lang-en_XX.${lang} + + en_x_f1=$split.en_XX-$lang.en_XX + en_x_f2=$split.en_XX-$lang.${lang} + + if [ -f $en_x_f1 ] && [ ! -f $x_en_f1 ]; then + echo "cp $en_x_f1 $x_en_f1" + cp $en_x_f1 $x_en_f1 + fi + if [ -f $x_en_f2 ] && [ ! -f $x_en_f2 ]; then + echo "cp $en_x_f2 $x_en_f2" + cp $en_x_f2 $x_en_f2 + fi + done +done +popd \ No newline at end of file diff --git a/examples/multilingual/data_scripts/download_lotus.sh b/examples/multilingual/data_scripts/download_lotus.sh new file mode 100644 index 0000000000..c08c701314 --- /dev/null +++ b/examples/multilingual/data_scripts/download_lotus.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." 
+ exit +fi + + +SRCDIR=$WORKDIR_ROOT/indic_languages_corpus +DESTDIR=${WORKDIR_ROOT}/ML50/raw/ +mkdir -p $SRCDIR +mkdir -p $DESTDIR + +cd $SRCDIR +wget http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_languages_corpus.tar.gz +tar -xvzf indic_languages_corpus.tar.gz + +SRC_EXTRACT_DIR=$SRCDIR/indic_languages_corpus/bilingual + +cp $SRC_EXTRACT_DIR/ml-en/train.ml $DESTDIR/train.ml_IN-en_XX.ml_IN +cp $SRC_EXTRACT_DIR/ml-en/train.en $DESTDIR/train.ml_IN-en_XX.en_XX +cp $SRC_EXTRACT_DIR/ml-en/dev.ml $DESTDIR/valid.ml_IN-en_XX.ml_IN +cp $SRC_EXTRACT_DIR/ml-en/dev.en $DESTDIR/valid.ml_IN-en_XX.en_XX +cp $SRC_EXTRACT_DIR/ml-en/test.ml $DESTDIR/test.ml_IN-en_XX.ml_IN +cp $SRC_EXTRACT_DIR/ml-en/test.en $DESTDIR/test.ml_IN-en_XX.en_XX + +cp $SRC_EXTRACT_DIR/ur-en/train.ur $DESTDIR/train.ur_PK-en_XX.ur_PK +cp $SRC_EXTRACT_DIR/ur-en/train.en $DESTDIR/train.ur_PK-en_XX.en_XX +cp $SRC_EXTRACT_DIR/ur-en/dev.ur $DESTDIR/valid.ur_PK-en_XX.ur_PK +cp $SRC_EXTRACT_DIR/ur-en/dev.en $DESTDIR/valid.ur_PK-en_XX.en_XX +cp $SRC_EXTRACT_DIR/ur-en/test.ur $DESTDIR/test.ur_PK-en_XX.ur_PK +cp $SRC_EXTRACT_DIR/ur-en/test.en $DESTDIR/test.ur_PK-en_XX.en_XX + +cp $SRC_EXTRACT_DIR/te-en/train.te $DESTDIR/train.te_IN-en_XX.te_IN +cp $SRC_EXTRACT_DIR/te-en/train.en $DESTDIR/train.te_IN-en_XX.en_XX +cp $SRC_EXTRACT_DIR/te-en/dev.te $DESTDIR/valid.te_IN-en_XX.te_IN +cp $SRC_EXTRACT_DIR/te-en/dev.en $DESTDIR/valid.te_IN-en_XX.en_XX +cp $SRC_EXTRACT_DIR/te-en/test.te $DESTDIR/test.te_IN-en_XX.te_IN +cp $SRC_EXTRACT_DIR/te-en/test.en $DESTDIR/test.te_IN-en_XX.en_XX diff --git a/examples/multilingual/data_scripts/download_ted_and_extract.py b/examples/multilingual/data_scripts/download_ted_and_extract.py new file mode 100644 index 0000000000..eb756680fa --- /dev/null +++ b/examples/multilingual/data_scripts/download_ted_and_extract.py @@ -0,0 +1,338 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import itertools +import os +import csv +from collections import defaultdict +from six.moves import zip +import io +import wget +import sys + +from subprocess import check_call, check_output + +# scripts and data locations +CWD = os.getcwd() +UTILS = f"{CWD}/utils" + +MOSES = f"{UTILS}/mosesdecoder" + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. 
Exitting..."') + sys.exit(-1) + + +# please donwload mosesdecoder here: +detok_cmd = f'{MOSES}/scripts/tokenizer/detokenizer.perl' + + +def call(cmd): + print(f"Executing: {cmd}") + check_call(cmd, shell=True) + +class MultiLingualAlignedCorpusReader(object): + """A class to read TED talk dataset + """ + + def __init__(self, corpus_path, delimiter='\t', + target_token=True, bilingual=True, corpus_type='file', + lang_dict={'source': ['fr'], 'target': ['en']}, + eval_lang_dict=None, zero_shot=False, + detok=True, + ): + + self.empty_line_flag = 'NULL' + self.corpus_path = corpus_path + self.delimiter = delimiter + self.bilingual = bilingual + self.lang_dict = lang_dict + self.lang_set = set() + self.target_token = target_token + self.zero_shot = zero_shot + self.eval_lang_dict = eval_lang_dict + self.corpus_type = corpus_type + self.detok = detok + + for list_ in self.lang_dict.values(): + for lang in list_: + self.lang_set.add(lang) + + self.data = dict() + self.data['train'] = self.read_aligned_corpus(split_type='train') + self.data['test'] = self.read_aligned_corpus(split_type='test') + self.data['dev'] = self.read_aligned_corpus(split_type='dev') + + def read_data(self, file_loc_): + data_list = list() + with io.open(file_loc_, 'r', encoding='utf8') as fp: + for line in fp: + try: + text = line.strip() + except IndexError: + text = self.empty_line_flag + data_list.append(text) + return data_list + + def filter_text(self, dict_): + if self.target_token: + field_index = 1 + else: + field_index = 0 + data_dict = defaultdict(list) + list1 = dict_['source'] + list2 = dict_['target'] + for sent1, sent2 in zip(list1, list2): + try: + src_sent = ' '.join(sent1.split()[field_index: ]) + except IndexError: + src_sent = 'NULL' + + if src_sent.find(self.empty_line_flag) != -1 or len(src_sent) == 0: + continue + + elif sent2.find(self.empty_line_flag) != -1 or len(sent2) == 0: + continue + + else: + data_dict['source'].append(sent1) + data_dict['target'].append(sent2) + return data_dict + + def read_file(self, split_type, data_type): + return self.data[split_type][data_type] + + def save_file(self, path_, split_type, data_type, lang): + tok_file = tok_file_name(path_, lang) + with io.open(tok_file, 'w', encoding='utf8') as fp: + for line in self.data[split_type][data_type]: + fp.write(line + '\n') + if self.detok: + de_tok(tok_file, lang) + + def add_target_token(self, list_, lang_id): + new_list = list() + token = '__' + lang_id + '__' + for sent in list_: + new_list.append(token + ' ' + sent) + return new_list + + def read_from_single_file(self, path_, s_lang, t_lang): + data_dict = defaultdict(list) + with io.open(path_, 'r', encoding='utf8') as fp: + reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE) + for row in reader: + data_dict['source'].append(row[s_lang]) + data_dict['target'].append(row[t_lang]) + + if self.target_token: + text = self.add_target_token(data_dict['source'], t_lang) + data_dict['source'] = text + + return data_dict['source'], data_dict['target'] + + def read_aligned_corpus(self, split_type='train'): + data_dict = defaultdict(list) + iterable = [] + s_list = [] + t_list = [] + + if self.zero_shot: + if split_type == "train": + iterable = zip(self.lang_dict['source'], self.lang_dict['target']) + else: + iterable = zip(self.eval_lang_dict['source'], self.eval_lang_dict['target']) + + elif self.bilingual: + iterable = itertools.product(self.lang_dict['source'], self.lang_dict['target']) + + for s_lang, t_lang in iterable: + if s_lang == t_lang: + continue + if 
self.corpus_type == 'file': + split_type_file_path = os.path.join(self.corpus_path, + "all_talks_{}.tsv".format(split_type)) + s_list, t_list = self.read_from_single_file(split_type_file_path, + s_lang=s_lang, + t_lang=t_lang) + data_dict['source'] += s_list + data_dict['target'] += t_list + new_data_dict = self.filter_text(data_dict) + return new_data_dict + + +def read_langs(corpus_path): + split_type_file_path = os.path.join(corpus_path, 'extracted', + "all_talks_dev.tsv") + with io.open(split_type_file_path, 'r', encoding='utf8') as fp: + reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE) + header = next(reader) + return [k for k in header.keys() if k != 'talk_name'] + +def extra_english(corpus_path, split): + split_type_file_path = os.path.join(corpus_path, + f"all_talks_{split}.tsv") + output_split_type_file_path = os.path.join(corpus_path, + f"all_talks_{split}.en") + with io.open(split_type_file_path, 'r', encoding='utf8') as fp, io.open(output_split_type_file_path, 'w', encoding='utf8') as fw: + reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE) + for row in reader: + line = row['en'] + fw.write(line + '\n') + de_tok(output_split_type_file_path, 'en') + + + +def tok_file_name(filename, lang): + seps = filename.split('.') + seps.insert(-1, 'tok') + tok_file = '.'.join(seps) + return tok_file + +def de_tok(tok_file, lang): + # seps = tok_file.split('.') + # seps.insert(-1, 'detok') + # de_tok_file = '.'.join(seps) + de_tok_file = tok_file.replace('.tok.', '.') + cmd = 'perl {detok_cmd} -l {lang} < {tok_file} > {de_tok_file}'.format( + detok_cmd=detok_cmd, tok_file=tok_file, + de_tok_file=de_tok_file, lang=lang[:2]) + call(cmd) + +def extra_bitex( + ted_data_path, + lsrc_lang, + ltrg_lang, + target_token, + output_data_path, +): + def get_ted_lang(lang): + long_langs = ['pt-br', 'zh-cn', 'zh-tw', 'fr-ca'] + if lang[:5] in long_langs: + return lang[:5] + elif lang[:4] =='calv': + return lang[:5] + elif lang in ['pt_BR', 'zh_CN', 'zh_TW', 'fr_CA']: + return lang.lower().replace('_', '-') + return lang[:2] + src_lang = get_ted_lang(lsrc_lang) + trg_lang = get_ted_lang(ltrg_lang) + train_lang_dict={'source': [src_lang], 'target': [trg_lang]} + eval_lang_dict = {'source': [src_lang], 'target': [trg_lang]} + + obj = MultiLingualAlignedCorpusReader(corpus_path=ted_data_path, + lang_dict=train_lang_dict, + target_token=target_token, + corpus_type='file', + eval_lang_dict=eval_lang_dict, + zero_shot=False, + bilingual=True) + + os.makedirs(output_data_path, exist_ok=True) + lsrc_lang = lsrc_lang.replace('-', '_') + ltrg_lang = ltrg_lang.replace('-', '_') + obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}", + split_type='train', data_type='source', lang=src_lang) + obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}", + split_type='train', data_type='target', lang=trg_lang) + + obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}", + split_type='test', data_type='source', lang=src_lang) + obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}", + split_type='test', data_type='target', lang=trg_lang) + + obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}", + split_type='dev', data_type='source', lang=src_lang) + obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}", + split_type='dev', data_type='target', lang=trg_lang) + + +def bar_custom(current, total, width=80): + print("Downloading: %d%% [%d / %d] Ks" % 
(current / total * 100, current / 1000, total / 1000), end='\r') + + +def download_and_extract(download_to, extract_to): + url = 'http://phontron.com/data/ted_talks.tar.gz' + filename = f"{download_to}/ted_talks.tar.gz" + if os.path.exists(filename): + print(f'{filename} has already been downloaded so skip') + else: + filename = wget.download(url, filename, bar=bar_custom) + if os.path.exists(f'{extract_to}/all_talks_train.tsv'): + print(f'Already extracted so skip') + else: + extract_cmd = f'tar xzfv "{filename}" -C "{extract_to}"' + call(extract_cmd) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--ted_data_path', type=str, default=WORKDIR_ROOT, required=False) + parser.add_argument( + '--direction-list', + type=str, + # default=None, + #for ML50 + default=( + "bn_IN-en_XX,he_IL-en_XX,fa_IR-en_XX,id_ID-en_XX,sv_SE-en_XX,pt_XX-en_XX,ka_GE-en_XX,ka_GE-en_XX,th_TH-en_XX," + "mr_IN-en_XX,hr_HR-en_XX,uk_UA-en_XX,az_AZ-en_XX,mk_MK-en_XX,gl_ES-en_XX,sl_SI-en_XX,mn_MN-en_XX," + #non-english directions + # "fr_XX-de_DE," # replaced with wmt20 + # "ja_XX-ko_KR,es_XX-pt_XX,ru_RU-sv_SE,hi_IN-bn_IN,id_ID-ar_AR,cs_CZ-pl_PL,ar_AR-tr_TR" + ), + required=False) + parser.add_argument('--target-token', action='store_true', default=False) + parser.add_argument('--extract-all-english', action='store_true', default=False) + + args = parser.parse_args() + + import sys + import json + + # TED Talks data directory + ted_data_path = args.ted_data_path + + download_to = f'{ted_data_path}/downloads' + extract_to = f'{ted_data_path}/extracted' + + #DESTDIR=${WORKDIR_ROOT}/ML50/raw/ + output_path = f'{ted_data_path}/ML50/raw' + os.makedirs(download_to, exist_ok=True) + os.makedirs(extract_to, exist_ok=True) + os.makedirs(output_path, exist_ok=True) + download_and_extract(download_to, extract_to) + + + if args.extract_all_english: + for split in ['train', 'dev', 'test']: + extra_english(ted_data_path, split) + exit(0) + if args.direction_list is not None: + directions = args.direction_list.strip().split(',') + directions = [tuple(d.strip().split('-', 1)) for d in directions if d] + else: + langs = read_langs(ted_data_path) + # directions = [ + # '{}.{}'.format(src, tgt) + # for src in langs + # for tgt in langs + # if src < tgt + # ] + directions = [('en', tgt) for tgt in langs if tgt != 'en'] + print(f'num directions={len(directions)}: {directions}') + + for src_lang, trg_lang in directions: + print('--working on {}-{}'.format(src_lang, trg_lang)) + extra_bitex( + extract_to, + src_lang, + trg_lang, + target_token=args.target_token, + output_data_path=output_path + ) diff --git a/examples/multilingual/data_scripts/download_wat19_my.sh b/examples/multilingual/data_scripts/download_wat19_my.sh new file mode 100644 index 0000000000..c1e2d47287 --- /dev/null +++ b/examples/multilingual/data_scripts/download_wat19_my.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." 
+ exit +fi + + +SRCDIR=$WORKDIR_ROOT/indic_languages_corpus +DESTDIR=$WORKDIR_ROOT/ML50/raw +mkdir -p $SRCDIR +mkdir -p $DESTDIR + +WAT_MY_EN=wat2020.my-en.zip +cd $SRCDIR +# please refer to http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/ for latest URL if the following url expired +#- The data used for WAT2020 are identical to those used in WAT2019. +wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/$WAT_MY_EN +unzip $WAT_MY_EN + + +SRC_EXTRACT_DIR=$SRCDIR/wat2020.my-en/alt + +cp $SRC_EXTRACT_DIR/train.alt.en $DESTDIR/train.my_MM-en_XX.en_XX +cp $SRC_EXTRACT_DIR/train.alt.my $DESTDIR/train.my_MM-en_XX.my_MM +cp $SRC_EXTRACT_DIR/dev.alt.en $DESTDIR/valid.my_MM-en_XX.en_XX +cp $SRC_EXTRACT_DIR/dev.alt.my $DESTDIR/valid.my_MM-en_XX.my_MM +cp $SRC_EXTRACT_DIR/test.alt.en $DESTDIR/test.my_MM-en_XX.en_XX +cp $SRC_EXTRACT_DIR/test.alt.my $DESTDIR/test.my_MM-en_XX.my_MM diff --git a/examples/multilingual/data_scripts/download_wmt19_and_before.py b/examples/multilingual/data_scripts/download_wmt19_and_before.py new file mode 100644 index 0000000000..3465731eb3 --- /dev/null +++ b/examples/multilingual/data_scripts/download_wmt19_and_before.py @@ -0,0 +1,899 @@ +from typing import NamedTuple, List +from urllib.parse import urlparse +import os, sys +import subprocess +from subprocess import check_call, check_output +import glob +import wget +import re +import multiprocessing as mp +from functools import partial +import pathlib +from collections import OrderedDict + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."') + sys.exit(-1) + +# scripts and data locations +CWD = os.getcwd() +UTILS = f"{CWD}/utils" + +MOSES = f"{UTILS}/mosesdecoder" +SGM_TOOL = f'{MOSES}/scripts/ems/support/input-from-sgm.perl' + +TMX2CORPUS = f"{UTILS}/tmx2corpus" +TMX_TOOL = f'python {TMX2CORPUS}/tmx2corpus.py' + +to_data_path = f'{WORKDIR_ROOT}/wmt' +download_to = f'{to_data_path}/downloads' +manually_downloads = f'{to_data_path}/downloads' +extract_to = f'{to_data_path}/extracted' +#DESTDIR=${WORKDIR_ROOT}/ML50/raw/ +raw_data = f'{WORKDIR_ROOT}/ML50/raw' +#### + +class DLDataset(NamedTuple): + name: str + train_urls: List[str] + valid_urls: List[str] + test_urls: List[str] + train_files_patterns: List[str] = [] + valid_files_patterns: List[str] = [] + test_files_patterns: List[str] = [] + + + +def bar_custom(current, total, width=80): + print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r') + +def get_downloaded_file(dl_folder, url): + if isinstance(url, tuple): + url, f = url + else: + url_f = urlparse(url) + # f = os.path.split(url_f.path)[-1] + f = '_'.join(url_f.path.split('/')[1:]) + return url, f"{dl_folder}/{f}" + +def download_parts_and_combine(dl_folder, urls, filename): + parts = [] + for url_record in urls: + url, part_file = get_downloaded_file(dl_folder, url_record) + if os.path.exists(part_file): + print(f'{part_file} has already been downloaded so skip') + else: + part_file = wget.download(url, part_file, bar=bar_custom) + parts.append(part_file) + + def get_combine_cmd(parts): + #default as tar.gz.?? 
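+        # multi-part downloads are simply concatenated back into a single archive before extraction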
+ return f'cat {" ".join(parts)} > {filename}' + + combine_cmd = get_combine_cmd(parts) + call(combine_cmd, debug=True) + return filename + +def download_a_url(dl_folder, url): + url, filename = get_downloaded_file(dl_folder, url) + if os.path.exists(filename): + print(f'{filename} has already been downloaded so skip') + return filename + + print(f'downloading {url} to {filename}') + if isinstance(url, list) or isinstance(url, tuple): + download_parts_and_combine(dl_folder, url, filename) + else: + wget.download(url, filename, bar=bar_custom) + print(f'dowloaded: {filename}') + return filename + +def download_files(dl_folder, urls, completed_urls={}): + for url_record in urls: + url, _ = get_downloaded_file(dl_folder, url_record) + filename = download_a_url(dl_folder, url_record) + completed_urls[str(url)] = filename + return completed_urls + +def check_need_manual_downalod(dl_folder, to_manually_download_urls): + to_be_manually_dowloaded = [] + manually_completed_urls = {} + for url_record, instruction in to_manually_download_urls: + url, filename = get_downloaded_file(dl_folder, url_record) + if not os.path.exists(filename): + print(f'{url} need to be download manually, please download it manually following {instruction}; and copy it to {filename}') + to_be_manually_dowloaded.append((url, filename)) + else: + manually_completed_urls[url] = filename + # if len(to_be_manually_dowloaded) > 0: + # raise ValueError('Missing files that need to be downloaded manually; stop the process now.') + return to_be_manually_dowloaded + +def download_dataset(to_folder, dl_dataset, completed_urls={}): + download_files(to_folder, dl_dataset.train_urls, completed_urls) + download_files(to_folder, dl_dataset.valid_urls, completed_urls) + download_files(to_folder, dl_dataset.test_urls, completed_urls) + print('completed downloading') + return completed_urls + +def call(cmd, debug=False): + if debug: + print(cmd) + check_call(cmd, shell=True) + + +def get_extract_name(file_path): + path = os.path.split(file_path) + return path[-1] + '_extract' #.split('.')[0] + +def extract_file(downloaded_file, extract_folder, get_extract_name=get_extract_name, debug=False): + extract_name = get_extract_name(downloaded_file) + extract_to = f'{extract_folder}/{extract_name}' + os.makedirs(extract_to, exist_ok=True) + if os.path.exists(f'{extract_to}/DONE'): + print(f'{downloaded_file} has already been extracted to {extract_to} so skip') + return extract_to + def get_extract_cmd(filename): + if filename.endswith('.tgz') or filename.endswith('tar.gz'): + return f'tar xzfv {filename} -C {extract_to}' + elif filename.endswith('.gz.tar'): + return f'tar xfv {filename} -C {extract_to}; (cd {extract_to}; gzip -d *.gz; [ $? 
-eq 0 ] || gzip -d */*.gz)' + elif filename.endswith('.tar'): + return f'tar xfv {filename} -C {extract_to}' + elif filename.endswith('.gz'): + return f'cp {filename} {extract_to}; (cd {extract_to}; gzip -d *.gz)' + elif filename.endswith('.zip'): + return f'unzip {filename} -d {extract_to}' + extract_cmd = get_extract_cmd(downloaded_file) + print(f'extracting {downloaded_file}') + if isinstance(extract_cmd, list): + for c in extract_cmd: + call(c, debug=debug) + else: + call(extract_cmd, debug=debug) + call(f'echo DONE > {extract_to}/DONE') + return extract_to + + +def extract_all_files( + completed_urls, extract_folder, + get_extract_name=get_extract_name, + completed_extraction={}, + debug=False): + extracted_folders = OrderedDict() + for url, downloaded_file in set(completed_urls.items()): + if downloaded_file in completed_extraction: + print(f'{downloaded_file} is already extracted; so skip') + continue + folder = extract_file(downloaded_file, extract_folder, get_extract_name, debug) + extracted_folders[url] = folder + return extracted_folders + + +def my_glob(folder): + for p in [f'{folder}/*', f'{folder}/*/*', f'{folder}/*/*/*']: + for f in glob.glob(p): + yield f + + +def sgm2raw(sgm, debug): + to_file = sgm[0:len(sgm) - len('.sgm')] + if os.path.exists(to_file): + debug and print(f'{sgm} already converted to {to_file}; so skip') + return to_file + cmd = f'{SGM_TOOL} < {sgm} > {to_file}' + call(cmd, debug) + return to_file + +def tmx2raw(tmx, debug): + to_file = tmx[0:len(tmx) - len('.tmx')] + to_folder = os.path.join(*os.path.split(tmx)[:-1]) + if os.path.exists(f'{to_folder}/bitext.en'): + debug and print(f'{tmx} already extracted to {to_file}; so skip') + return to_file + cmd = f'(cd {to_folder}; {TMX_TOOL} {tmx})' + call(cmd, debug) + return to_file + +CZENG16_REGEX = re.compile(r'.*?data.plaintext-format/0[0-9]train$') +WMT19_WIKITITLES_REGEX = re.compile(r'.*?wikititles-v1.(\w\w)-en.tsv.gz') +TSV_REGEX = re.compile(r'.*?(\w\w)-(\w\w).tsv$') + + + +def cut_wikitles(wiki_file, debug): + # different languages have different file names: + if wiki_file.endswith('wiki/fi-en/titles.fi-en'): + to_file1 = f'{wiki_file}.fi' + to_file2 = f'{wiki_file}.en' + BACKSLASH = '\\' + cmd1 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f1 |awk '{{$1=$1}};1' > {to_file1}" + cmd2 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f2 |awk '{{$1=$1}};1' > {to_file2}" +# elif WMT19_WIKITITLES_REGEX.match(wiki_file): +# src = WMT19_WIKITITLES_REGEX.match(wiki_file).groups()[0] +# to_file1 = f'{wiki_file}.{src}' +# to_file2 = f'{wiki_file}.en' +# cmd1 = f"cat {wiki_file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}" +# cmd2 = f"cat {wiki_file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}" + else: + return None + if os.path.exists(to_file1) and os.path.exists(to_file2): + debug and print(f'{wiki_file} already processed to {to_file1} and {to_file2}; so skip') + return wiki_file + + call(cmd1, debug=debug) + call(cmd2, debug=debug) + return wiki_file + +def cut_tsv(file, debug): + m = TSV_REGEX.match(file) + if m is None: + raise ValueError(f'{file} is not matching tsv pattern') + src = m.groups()[0] + tgt = m.groups()[1] + + to_file1 = f'{file}.{src}' + to_file2 = f'{file}.{tgt}' + cmd1 = f"cat {file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}" + cmd2 = f"cat {file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}" + if os.path.exists(to_file1) and os.path.exists(to_file2): + debug and print(f'{file} already processed to {to_file1} and {to_file2}; so skip') + return file + + call(cmd1, debug=debug) + 
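+    # the second cut writes the target-language column to {file}.{tgt}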
call(cmd2, debug=debug) + return file + + +def convert_file_if_needed(file, debug): + if file.endswith('.sgm'): + return sgm2raw(file, debug) + elif file.endswith('.tmx'): + return tmx2raw(file, debug) + elif file.endswith('wiki/fi-en/titles.fi-en'): + return cut_wikitles(file, debug) +# elif WMT19_WIKITITLES_REGEX.match(file): +# return cut_wikitles(file, debug) + elif file.endswith('.tsv'): + return cut_tsv(file, debug) + elif CZENG16_REGEX.match(file): + return convert2czeng17(file, debug) + else: + return file + + +def convert_files_if_needed(extracted_foldrs, my_glob=my_glob, debug=False): + return { + url: list(sorted(set(convert_file_if_needed(f, debug)) for f in sorted(set(my_glob(folder))))) + for url, folder in extracted_foldrs.items() + } + +def match_patt(file_path, file_pattern, src, tgt, lang): + return file_pattern.format(src=src, tgt=tgt, lang=lang) in file_path + +def match_patts(file_path, file_patterns, src, tgt, lang): + for file_pattern in file_patterns: + params = { k: v for k, v in [('src', src), ('tgt', tgt), ('lang', lang)] if k in file_pattern} + matching = file_pattern.format(**params) + + if isinstance(file_pattern, tuple): + pattern, directions = file_pattern + if f'{src}-{tgt}' in directions and matching in file_path: + return True + else: + if matching in file_path: + return True + return False + +def extracted_glob(extracted_folder, file_patterns, src, tgt, lang): + def get_matching_pattern(file_pattern): + params = { + k: v + for k, v in [('src', src), ('tgt', tgt), ('lang', lang)] + if '{' + k + '}' in file_pattern + } + file_pattern = re.sub(r'{src:(.*?)}', r'\1' if lang == src else '', file_pattern) + file_pattern = re.sub(r'{tgt:(.*?)}', r'\1' if lang == tgt else '', file_pattern) + file_pattern = file_pattern.format(**params) + return file_pattern + for file_pattern in file_patterns: + if isinstance(file_pattern, tuple): + file_pattern, lang_pairs = file_pattern + if f'{src}-{tgt}' not in lang_pairs: + continue +# print('working on pattern: ', file_pattern, lang_pairs ) + matching_pattern = get_matching_pattern(file_pattern) + if matching_pattern is None: + continue + glob_patterns = f'{extracted_folder}/{matching_pattern}' +# print('glob_patterns: ', glob_patterns) + for f in glob.glob(glob_patterns): + yield f + +# for debug usage +def all_extracted_files(split, src, tgt, extracted_folders, split_urls): + def get_url(url): + if isinstance(url, tuple): + url, downloaded_file = url + return url + return [ + f + for url in split_urls + for f in my_glob(extracted_folders[str(get_url(url))]) + ] + +def concat_files(split, src, tgt, extracted_folders, split_urls, path_patterns, to_folder, debug=False): +# if debug: +# print('extracted files to be filtered by patterns: ', +# '\n\t'.join(sorted(all_extracted_files(split, src, tgt, extracted_folders, split_urls)))) + for lang in [src, tgt]: + to_file = f'{to_folder}/{split}.{src}-{tgt}.{lang}' + s_src, s_tgt, s_lang = src.split('_')[0], tgt.split('_')[0], lang.split('_')[0] + files = [] + for url in split_urls: + if isinstance(url, tuple): + url, downloaded_file = url + if str(url) not in extracted_folders: + print(f'warning: {url} not in extracted files') + for extracted_file in set( + extracted_glob( + extracted_folders[str(url)], path_patterns, + s_src, s_tgt, s_lang)): + files.append(extracted_file) + if len(files) == 0: + print('warning: ', f'No files found for split {to_file}') + continue + files = sorted(set(files)) + print(f'concating {len(files)} files into {to_file}') + cmd = ['cat'] + [f'"{f}"' for 
f in files] + [f'>{to_file}'] + cmd = " ".join(cmd) + call(cmd, debug=debug) + +UTILS = os.path.join(pathlib.Path(__file__).parent, 'utils') +LID_MODEL = f'{download_to}/lid.176.bin' +LID_MULTI = f'{UTILS}/fasttext_multi_filter.py' + +def lid_filter(split, src, tgt, from_folder, to_folder, debug=False): + if not os.path.exists(LID_MODEL): + call(f'wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O {LID_MODEL}') + from_prefix = f'{from_folder}/{split}.{src}-{tgt}' + to_prefix = f'{to_folder}/{split}.{src}-{tgt}' + if os.path.exists(f'{from_prefix}.{src}') and os.path.exists(f'{from_prefix}.{tgt}'): + s_src, s_tgt = src.split('_')[0], tgt.split('_')[0] + cmd = ( + f'python {LID_MULTI} --model {LID_MODEL} --inputs {from_prefix}.{src} {from_prefix}.{tgt} ' + f'--langs {s_src} {s_tgt} --outputs {to_prefix}.{src} {to_prefix}.{tgt}' + ) + print(f'filtering {from_prefix}') + call(cmd, debug=debug) + +def concat_into_splits(dl_dataset, src, tgt, extracted_folders, to_folder, debug): + to_folder_tmp = f"{to_folder}_tmp" + os.makedirs(to_folder_tmp, exist_ok=True) + concat_files('train', src, tgt, + extracted_folders, + split_urls=dl_dataset.train_urls, + path_patterns=dl_dataset.train_files_patterns, + to_folder=to_folder_tmp, debug=debug) + lid_filter('train', src, tgt, to_folder_tmp, to_folder, debug) + + concat_files('valid', src, tgt, + extracted_folders, + split_urls=dl_dataset.valid_urls, + path_patterns=dl_dataset.valid_files_patterns, + to_folder=to_folder, debug=debug) + concat_files('test', src, tgt, + extracted_folders, + split_urls=dl_dataset.test_urls, + path_patterns=dl_dataset.test_files_patterns, + to_folder=to_folder, debug=debug) + + +def download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=False): + pool = mp.Pool(processes=num_processes) + download_f = partial(download_a_url, dl_folder) + downloaded_files = pool.imap_unordered(download_f, urls) + pool.close() + pool.join() + +BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ") +def run_eval_bleu(cmd): + output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip() + print(output) + bleu = -1.0 + for line in output.strip().split('\n'): + m = BLEU_REGEX.search(line) + if m is not None: + bleu = m.groups()[0] + bleu = float(bleu) + break + return bleu + +def check_wmt_test_bleu(raw_folder, wmt_lang_pairs): + not_matchings = [] + for wmt, src_tgts in wmt_lang_pairs: + for src_tgt in src_tgts: + print(f'checking test bleus for: {src_tgt} at {wmt}') + src, tgt = src_tgt.split('-') + ssrc, stgt = src[:2], tgt[:2] + if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'): + # reversed direction may have different test set + test_src = f'{raw_folder}/test.{tgt}-{src}.{src}' + else: + test_src = f'{raw_folder}/test.{src}-{tgt}.{src}' + cmd1 = f'cat {test_src} | sacrebleu -t "{wmt}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""' + test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}' + cmd2 = f'cat {test_tgt} | sacrebleu -t "{wmt}" -l {ssrc}-{stgt}; [ $? 
-eq 0 ] || echo ""' + bleu1 = run_eval_bleu(cmd1) + if bleu1 != 100.0: + not_matchings.append(f'{wmt}:{src_tgt} source side not matching: {test_src}') + bleu2 = run_eval_bleu(cmd2) + if bleu2 != 100.0: + not_matchings.append(f'{wmt}:{src_tgt} target side not matching: {test_tgt}') + return not_matchings + +def download_and_extract( + to_folder, lang_pairs, dl_dataset, + to_manually_download_urls, + completed_urls={}, completed_extraction={}, + debug=False): + + dl_folder = f'{to_folder}/downloads' + extract_folder = f'{to_folder}/extracted' + raw_folder = f'{to_folder}/raw' + lid_filtered = f'{to_folder}/lid_filtered' + + os.makedirs(extract_folder, exist_ok=True) + os.makedirs(raw_folder, exist_ok=True) + os.makedirs(lid_filtered, exist_ok=True) + + + to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls) + + completed_urls = download_dataset( + dl_folder, dl_dataset, completed_urls) + if debug: + print('completed urls: ', completed_urls) + + + extracted_folders = extract_all_files( + completed_urls, + extract_folder=extract_folder, + completed_extraction=completed_extraction, + debug=debug) + if debug: + print('download files have been extracted to folders: ', extracted_folders) + + converted_files = convert_files_if_needed(extracted_folders, debug=False) + for src_tgt in lang_pairs: + print(f'working on {dl_dataset.name}: {src_tgt}') + src, tgt = src_tgt.split('-') + concat_into_splits(dl_dataset, + src=src, tgt=tgt, + extracted_folders=extracted_folders, + to_folder=raw_folder, debug=debug) + print('completed data into: ', raw_folder) + +def download_czang16(download_to, username=None): + wgets = [ + f'wget --user={username} --password=czeng -P {download_to} http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar' + for i in range(10)] + cmds = [] + for i, cmd in enumerate(wgets): + filename = f'{download_to}/data-plaintext-format.{i}.tar' + if os.path.exists(filename): + print(f'{filename} has already been downloaded; so skip') + continue + cmds.append(cmd) + if cmds and username is None: + raise ValueError('No czeng username is given; please register at http://ufal.mff.cuni.cz/czeng/czeng16 to obtain username to download') + for cmd in cmds: + call(cmd) + print('done with downloading czeng1.6') + +def download_czeng17_script(download_to, extract_folder, debug=False): + url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip' + filename = f'{download_to}/convert_czeng16_to_17.pl.zip' + extract_to = f'{extract_folder}/{get_extract_name(filename)}' + script_path = f'{extract_to}/convert_czeng16_to_17.pl' + + if not os.path.exists(script_path): + wget.download(url, filename, bar=bar_custom) + extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug) + return script_path + +czeng17_script_path = "" +def convert2czeng17(file, debug): + en_file = f'{file}.en' + cs_file = f'{file}.cs' + + if not os.path.exists(en_file) or not os.path.exists(cs_file): + cs_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f3 > {cs_file}' + en_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f4 > {en_file}' + call(cs_cmd, debug) + call(en_cmd, debug) + else: + print(f'already extracted: {en_file} and {cs_file}') + return file + +def extract_czeng17(extract_folder, debug=False): + url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip' + filename = f'{download_to}/convert_czeng16_to_17.pl.zip' + extract_to = 
f'{extract_folder}/{get_extract_name(filename)}' + script_path = f'{extract_to}/convert_czeng16_to_17.pl' + + if not os.path.exists(script_path): + wget.download(url, filename, bar=bar_custom) + extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug) + return script_path + +######### +# definitions of wmt data sources +# for es-en +# Punctuation in the official test sets will be encoded with ASCII characters (not complex Unicode characters) as much as possible. You may want to normalize your system's output before submission. You are able able to use a rawer version of the test sets that does not have this normalization. +# script to normalize punctuation: http://www.statmt.org/wmt11/normalize-punctuation.perl +wmt13_es_en = DLDataset( + name='wmt13_es-en', + train_urls=[ + 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', + 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', + 'http://www.statmt.org/wmt13/training-parallel-un.tgz', + 'http://www.statmt.org/wmt13/training-parallel-nc-v8.tgz', + ], + valid_urls=[ + ('http://www.statmt.org/wmt13/dev.tgz', 'wmt13_dev.tgz') + ], + test_urls=[ + ('http://www.statmt.org/wmt13/test.tgz', 'wmt13_test.tgz') + ], + train_files_patterns=[ + ('*/europarl-v7.{src}-{tgt}.{lang}', ['es-en']), + ('*commoncrawl.{src}-{tgt}.{lang}', ['es-en']), + ('*/news-commentary-v8.{src}-{tgt}.{lang}', ['es-en']), + ('un/*undoc.2000.{src}-{tgt}.{lang}', ['es-en']), + ] , + valid_files_patterns=[ + ('dev/newstest2012.{lang}', ['es-en']) + ], + test_files_patterns=[ + ('test/newstest*.{lang}', ['es-en']) + ], +) + +wmt14_de_fr_en = DLDataset( + name='wmt14_de_fr_en', + train_urls=[ + 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', + 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', + 'http://www.statmt.org/wmt13/training-parallel-un.tgz', + 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz', + ('http://www.statmt.org/wmt10/training-giga-fren.tar', 'training-giga-fren.gz.tar'), #it is actuall a gz.tar + ], + valid_urls=[ + ('http://www.statmt.org/wmt14/dev.tgz', 'wmt14_dev.tgz'), + ], + test_urls=[ + ('http://www.statmt.org/wmt14/test-full.tgz', 'wmt14_test_full.tgz'), # cleaned test sets + ], + train_files_patterns=[ + ('*/europarl-v7.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), + ('*commoncrawl.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), + ('*/*news-commentary-v9.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), + ('un/undoc.2000.{src}-{tgt}.{lang}', ['fr-en']), + ('*giga-{src}{tgt}*{lang}', ['fr-en']) + ], + valid_files_patterns=[ + ('dev/newstest2013.{lang}', ['fr-en', 'de-en']) + ], + test_files_patterns=[ + ('test-full/newstest*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['en-de', 'de-en', 'fr-en', 'en-fr']), + ], +) + +# pip install git+https://github.com/amake/tmx2corpus.git +wmt16_ro_en = DLDataset( + name='wmt16_ro-en', + train_urls=[ + ('http://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz', 'wmt16_training-parallel-ep-v8.tgz'), + ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-ro.tmx.gz', 'en-ro.tmx.gz'), + ], + valid_urls=[ + ('http://data.statmt.org/wmt16/translation-task/dev-romanian-updated.tgz', 'wmt16_dev.tgz') + ], + test_urls=[ + ('http://data.statmt.org/wmt16/translation-task/test.tgz', 'wmt16_test.tgz') + ], + train_files_patterns=[ + ('*/*europarl-v8.{src}-{tgt}.{lang}', ['ro-en']), + ('bitext.{lang}', ['ro-en']) #setimes from tmux + ] , + valid_files_patterns=[ + 
('dev/newsdev2016*{src}{tgt}*.{lang}', ['ro-en', 'ro-en']) + ], + test_files_patterns=[ + ('test/newstest*{src}{tgt}*.{lang}', ['ro-en', 'en-ro']) + ], +) + +cwmt_wmt_instruction = 'cwmt download instruction at: http://nlp.nju.edu.cn/cwmt-wmt' +wmt17_fi_lv_tr_zh_en_manual_downloads = [ + # fake urls to have unique keys for the data + ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), cwmt_wmt_instruction), + ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), cwmt_wmt_instruction), + ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), cwmt_wmt_instruction), + ( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), cwmt_wmt_instruction), + ( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), cwmt_wmt_instruction), + ( ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), cwmt_wmt_instruction), +] +wmt17_fi_lv_tr_zh_en = DLDataset( + name='wmt17_fi_lv_tr_zh_en', + train_urls=[ + ('http://data.statmt.org/wmt17/translation-task/training-parallel-ep-v8.tgz', 'wmt17_training-parallel-ep-v8.tgz'), + 'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz', + 'http://www.statmt.org/wmt15/wiki-titles.tgz', + ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-tr.tmx.gz', 'en-tr.tmx.gz'), + ('http://data.statmt.org/wmt17/translation-task/rapid2016.tgz', 'wmt17_rapid2016.tgz'), + 'http://data.statmt.org/wmt17/translation-task/leta.v1.tgz', + 'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz', + 'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz', + (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00', + 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01',), 'UNv1.0.en-zh.tar.gz'), + #manually download files: + ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), + ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), + ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), + ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), + ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), + ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), + ], + valid_urls=[ + ('http://data.statmt.org/wmt17/translation-task/dev.tgz', 'wmt17_dev.tgz'), + ], + test_urls=[ + #NEW: Improved translations for zh test sets + ('http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', 'wmt17_test_zh_en.tgz'), + ('http://data.statmt.org/wmt17/translation-task/test.tgz', 'wmt17_test_others.tgz') + ], + train_files_patterns=[ + ('casict*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ), + ('casia*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ), + ('dataum*/Book*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']), + ('neu*/NEU*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en'] ), + ('*/*UNv1.0.en-zh.{src:zh}{tgt:en}', ['zh-en']), + ('training/*news-commentary-v12.{src}-{tgt}.{lang}', ['zh-en', ]), + + ('*/*europarl-v8.{src}-{tgt}.{lang}', ['fi-en', 'lv-en']), + ('wiki/fi-en/titles.{src}-{tgt}.{lang}', ['fi-en', ]), + ('rapid2016.{tgt}-{src}.{lang}', ['fi-en', 'lv-en']), + ('*/leta.{lang}', ['lv-en']), + ('*/dcep.{lang}', ['lv-en']), + ('*/farewell.{lang}', ['lv-en']), + ('bitext.{lang}', ['tr-en']), + ] , + valid_files_patterns=[ + ('dev/newsdev2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', + [ + 'fi-en', 'lv-en', 'tr-en', 'zh-en', + 'en-fi', 'en-lv', 'en-tr', 'en-zh' + ]), + ('dev/newstest2016*{src}{tgt}-{src:src}{tgt:ref}.{lang}', + [ + 'fi-en', 'tr-en', + 
'en-fi', 'en-tr', + ]), + ], + test_files_patterns=[ + ('test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}', + [ + 'fi-en', 'lv-en', 'tr-en', + 'en-fi', 'en-lv', 'en-tr', + ]), + ('newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}', + [ + 'zh-en', + 'en-zh' + ]), + ], +) + +czeng_instruction = 'download instruction at: http://ufal.mff.cuni.cz/czeng/czeng16' +#alternative: use the prepared data but detokenize it? +wmt18_cs_et_en_manual_downloads = [ +#for cs, need to register and download; Register and download CzEng 1.6. +#Better results can be obtained by using a subset of sentences, released under a new version name CzEng 1.7. + # ((f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar', + # f'data-plaintext-format.{i}.tar'), czeng_instruction) + # for i in range(10) +] + +wmt18_cs_et_en = DLDataset( + name='wmt18_cs_et_en', + train_urls=[ + 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', + 'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz', + 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-cs.zipporah0-dedup-clean.tgz', + 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-et.zipporah0-dedup-clean.tgz', + 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', + 'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz', + ('http://data.statmt.org/wmt18/translation-task/rapid2016.tgz', 'wmt18_rapid2016.tgz'), + # (tuple( + # (f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar', + # f'data-plaintext-format.{i}.tar') + # for i in range(10) + # ), + # 'czeng16_data_plaintext.gz.tar'), + ], + valid_urls=[ + ('http://data.statmt.org/wmt18/translation-task/dev.tgz', 'wmt18_dev.tgz'), + ], + test_urls=[ + ('http://data.statmt.org/wmt18/translation-task/test.tgz', 'wmt18_test.tgz'), + ], + train_files_patterns=[ + # ('*/*europarl-v7.{src}-{tgt}.{lang}', ['cs-en']), + ('*/*europarl-v8.{src}-{tgt}.{lang}', ['et-en']), + # ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['cs-en', 'et-en']), + ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['et-en']), + # ('*commoncrawl.{src}-{tgt}.{lang}', ['cs-en']), + # ('*/news-commentary-v13.{src}-{tgt}.{lang}', ['cs-en']), + # ('data.plaintext-format/*train.{lang}', ['cs-en']), + ('rapid2016.{tgt}-{src}.{lang}', ['et-en']), + ] , + valid_files_patterns=[ + ('dev/newsdev2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']), + # ('dev/newstest2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['cs-en']) + ], + test_files_patterns=[ + ('test/newstest2018-{src}{tgt}-{src:src}{tgt:ref}.{lang}', + # ['cs-en', 'et-en']), + ['et-en']), + ] +) + +ru_en_yandex_instruction = 'Yandex Corpus download instruction at: https://translate.yandex.ru/corpus?lang=en' +wmt19_ru_gu_kk_lt_manual_downloads = [ + (('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ru_en_yandex_instruction) +] +wmt19_ru_gu_kk_lt = DLDataset( + name='wmt19_ru_gu_kk_lt', + train_urls=[ + 'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz', + 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz', + 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz', + 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', + 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14-wmt19.en-kk.tsv.gz', + 
'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz', + 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz', + 'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz', + 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz', + 'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz', + 'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz', + (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00', + 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01', + 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02',), + 'wmt19_UNv1.0.en-ru.tar.gz'), + 'https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-lt.tmx.zip', + ('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), + ], + valid_urls=[ + ('http://data.statmt.org/wmt19/translation-task/dev.tgz', 'wmt19_dev.tgz'), + ], + test_urls=[ + ('http://data.statmt.org/wmt19/translation-task/test.tgz', 'wmt19_test.tgz'), + ], + train_files_patterns=[ + ('*europarl-v9.{src}-{tgt}.tsv.{lang}', ['lt-en']), + #paracrawl + ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['ru-en']), + ('bitext.{lang}', ['lt-en',]), + ('*commoncrawl.{src}-{tgt}.{lang}', ['ru-en',]), + ('*news-commentary-v14-wmt19.{tgt}-{src}.tsv.{lang}', ['kk-en', ]), + ('*news-commentary-v14.{tgt}-{src}.tsv.{lang}', ['ru-en']), + #yandex + ('corpus.{tgt}_{src}.1m.{lang}', ['ru-en']), + ('wikititles_v1_wikititles-v1.{src}-{tgt}.tsv.{lang}', ['ru-en', 'kk-en', 'lt-en', 'gu-en']), + ('*/UNv1.0.{tgt}-{src}.{lang}', ['ru-en']), + #rapid + ('bitext.{lang}', ['lt-en']) + ], + valid_files_patterns=[ + ('dev/newsdev2019*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['gu-en', 'kk-en', 'lt-en']), + ('dev/newstest2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['ru-en']), + ], + test_files_patterns=[ + ('sgm/newstest2019-{src}{tgt}-{src:src}{tgt:ref}.{lang}', + ['ru-en', 'gu-en', 'kk-en', 'lt-en', 'en-ru', 'en-gu', 'en-kk', 'en-lt']), + ] +) + + +######### + +if __name__ == "__main__": + # speed up the downloads with multiple processing + dl_folder = f'{to_data_path}/downloads' + extract_folder = f'{to_data_path}/extracted' + + urls = [ + url + for dataset in [wmt13_es_en, wmt14_de_fr_en, wmt16_ro_en, wmt18_cs_et_en, wmt19_ru_gu_kk_lt] + for urls in [dataset.train_urls, dataset.valid_urls, dataset.test_urls] + for url in urls + ] + urls = set(urls) + download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=True) + + # check manually downlaods + to_manually_download_urls = ( + wmt17_fi_lv_tr_zh_en_manual_downloads + wmt18_cs_et_en_manual_downloads + wmt19_ru_gu_kk_lt_manual_downloads + ) + to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls) + if len(to_be_manually_dowloaded) > 0: + print('Missing files that need to be downloaded manually; stop the process now.') + exit(-1) + + completed_urls = {} + completed_extraction = {} + def work_on_wmt(directions, wmt_data): + download_and_extract( + to_data_path, + directions, + wmt_data, + to_manually_download_urls=to_manually_download_urls, + completed_urls=completed_urls, completed_extraction=completed_extraction, debug=True) + + work_on_wmt( + ['es_XX-en_XX'], + wmt13_es_en,) + work_on_wmt( + [ + 'fr_XX-en_XX', 'en_XX-fr_XX', + # 'en_XX-de_DE', 'de_DE-en_XX', + ], + wmt14_de_fr_en,) + work_on_wmt( + ['ro_RO-en_XX', 'en_XX-ro_XX'], + wmt16_ro_en,) + work_on_wmt( + [ + # 'zh_CN-en_XX', + 'lv_LV-en_XX', 
'fi_FI-en_XX', 'tr_TR-en_XX', + #in case the reversed directions have different train/valid/test data + # 'en_XX-zh_CN', + 'en_XX-lv_LV', 'en_XX-fi_FI', 'en_XX-tr_TR', + ], + wmt17_fi_lv_tr_zh_en, ) + # czeng17_script_path = download_czeng17_script(download_to, extract_to, debug=False) + # cz_username = None + work_on_wmt( + [ + # 'cs_CZ-en_XX', + 'et_EE-en_XX'], + wmt18_cs_et_en,) + work_on_wmt( + [ + # 'ru_RU-en_XX', 'en_XX-ru_RU', + 'gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX', + #in case the reversed directions have different train/valid/test data + 'en_XX-gu_IN', 'en_XX-kk_KZ', 'en_XX-lt_LT' + ], + wmt19_ru_gu_kk_lt,) + + not_matching = check_wmt_test_bleu( + f'{to_data_path}/raw', + [ + ('wmt13', ['es_XX-en_XX']), + ('wmt14/full', ['fr_XX-en_XX',]), + ('wmt16', ['ro_RO-en_XX',]), + # ('wmt17/improved', ['zh_CN-en_XX']), + ('wmt17', [ 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX']), + ('wmt18', ['cs_CZ-en_XX', 'et_EE-en_XX']), + ('wmt19', ['gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX']), + #'ru_RU-en_XX', + ] + ) + if len(not_matching) > 0: + print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching)) + diff --git a/examples/multilingual/data_scripts/download_wmt20.sh b/examples/multilingual/data_scripts/download_wmt20.sh new file mode 100644 index 0000000000..31cd5c76b7 --- /dev/null +++ b/examples/multilingual/data_scripts/download_wmt20.sh @@ -0,0 +1,547 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." + exit +fi + + + +set -x -e + +# TODO update the workdir and dest dir name +# put fasttext model +WORKDIR=$WORKDIR_ROOT +# put intermediate files +TMP_DIR=$WORKDIR_ROOT/tmp/tmp_wmt20_lowres_download +# output {train,valid,test} files to dest +DEST=$WORKDIR_ROOT/ML50/raw + +UTILS=$PWD/utils + +# per dataset locations +COMMONCRAWL_DIR=$TMP_DIR/commoncrawl +YANDEX_CORPUS=$WORKDIR_ROOT/wmt20/official/ru/yandex/1mcorpus.zip +# unzipped +CZENG_CORPUS=$WORKDIR_ROOT/wmt20/official/cs/czeng/czeng20-train +CCMT_DIR=$WORKDIR_ROOT/wmt20/official/zh/ccmt/parallel + +download_and_select() { + SUBFOLDER=$1 + URL=$2 + UNCOMPRESS_CMD=$3 + LANG=$4 + INPUT_FILEPATH=$5 + if [[ $# -gt 5 ]]; then + LANG_COL=$6 + EN_COL=$7 + fi + + mkdir -p $SUBFOLDER + cd $SUBFOLDER + wget -nc --content-disposition $URL + $UNCOMPRESS_CMD + + if [[ $# -gt 5 ]]; then + cut -f$LANG_COL $INPUT_FILEPATH > $INPUT_FILEPATH.$LANG + cut -f$EN_COL $INPUT_FILEPATH > $INPUT_FILEPATH.en + fi + cd .. + + ln -sf $SUBFOLDER/$INPUT_FILEPATH.$LANG $SUBFOLDER.$LANG + ln -sf $SUBFOLDER/$INPUT_FILEPATH.en $SUBFOLDER.en +} + +prepare_lid() { + pip install fasttext + + # TODO specify global workdir + MODEL=$WORKDIR/fasttext/lid.176.bin + LID_MULTI=$UTILS/fasttext_multi_filter.py + + if [ ! -f "$MODEL" ]; then + echo "downloading fasttext lid model..." + mkdir -p $WORKDIR/fasttext + wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $MODEL + fi +} + +prepare_moses() { + pushd $UTILS + echo 'Cloning Moses github repository (for tokenization scripts)...' 
+ git clone https://github.com/moses-smt/mosesdecoder.git + popd +} + +lid_filter() { + # TODO specify global workdir + MODEL=$WORKDIR/fasttext/lid.176.bin + LID_MULTI=$UTILS/fasttext_multi_filter.py + + prepare_lid + + SRC=$1 + SRC_FILE=$2 + SRC_OUTPUT=$3 + TGT=$4 + TGT_FILE=$5 + TGT_OUTPUT=$6 + python $LID_MULTI --model $MODEL --inputs $SRC_FILE $TGT_FILE --langs $SRC $TGT --outputs $SRC_OUTPUT $TGT_OUTPUT +} + +prepare_ja_ted() { + mkdir -p ted + cd ted + + wget -nc https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz + tar -zxvf en-ja.tgz + cat en-ja/train.tags.en-ja.en | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.en + cat en-ja/train.tags.en-ja.ja | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.ja + + cd .. + ln -sf ted/en-ja/train.en-ja.ja ted.ja + ln -sf ted/en-ja/train.en-ja.en ted.en +} + +prepare_ja() { + OUTPUT_DIR=$TMP_DIR/ja + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select paracrawl "http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/en-ja.tar.gz" "tar -zxvf en-ja.tar.gz" ja en-ja/en-ja.bicleaner05.txt 4 3 & + download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ja.tsv.gz" "gunzip -f news-commentary-v15.en-ja.tsv.gz" ja news-commentary-v15.en-ja.tsv 2 1 & + download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ja-en.tsv.gz" "gunzip -f wikititles-v2.ja-en.tsv.gz" ja wikititles-v2.ja-en.tsv 1 2 & + download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ja.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ja.langid.tsv.gz" ja WikiMatrix.v1.en-ja.langid.tsv 3 2 & + download_and_select subtitle "https://nlp.stanford.edu/projects/jesc/data/split.tar.gz" "tar -zxvf split.tar.gz" ja split/train 2 1 & + download_and_select kftt "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" "tar -zxvf kftt-data-1.0.tar.gz" ja kftt-data-1.0/data/orig/kyoto-train & + + prepare_ja_ted & + + # ted data needs to + + wait + + # remove previous results + rm -f all.?? 
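+ # every corpus prepared above leaves a *.ja / *.en pair in this folder (directly or via
+ # symlink); sort -V keeps the two file lists aligned so all.ja and all.en stay parallel,
+ # and lid_filter then keeps only pairs whose fastText language ID matches ja/en before
+ # writing the train split to $DEST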
+ find ./ -maxdepth 1 -name "*.ja" | sort -V | xargs cat > all.ja + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + lid_filter ja all.ja $DEST/train.ja_XX-en_XX.ja_XX en all.en $DEST/train.ja_XX-en_XX.en_XX +} + +prepare_ta() { + OUTPUT_DIR=$TMP_DIR/ta + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ta-en.tsv.gz" "gunzip -f wikititles-v2.ta-en.tsv.gz" ta wikititles-v2.ta-en.tsv 1 2 & + download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ta.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ta.langid.tsv.gz" ta WikiMatrix.v1.en-ta.langid.tsv 3 2 & + download_and_select pmindia "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.ta-en.tsv" "" ta pmindia.v1.ta-en.tsv 2 1 & + download_and_select tanzil "https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-ta.txt.zip" "unzip en-ta.txt.zip" ta Tanzil.en-ta & + download_and_select pib "http://preon.iiit.ac.in/~jerin/resources/datasets/pib-v0.tar" "tar -xvf pib-v0.tar" ta pib/en-ta/train & + download_and_select mkb "http://preon.iiit.ac.in/~jerin/resources/datasets/mkb-v0.tar" "tar -xvf mkb-v0.tar" ta mkb/en-ta/mkb & + download_and_select ufal "http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz" "tar -zxvf en-ta-parallel-v2.tar.gz" ta en-ta-parallel-v2/corpus.bcn.train & + + wait + + # need special handling for nlpc + mkdir -p nlpc + cd nlpc + wget -nc https://raw.githubusercontent.com/nlpc-uom/English-Tamil-Parallel-Corpus/master/En-Ta%20Corpus/En-Ta%20English.txt + wget -nc https://github.com/nlpc-uom/English-Tamil-Parallel-Corpus/raw/master/En-Ta%20Corpus/En-Ta%20Tamil.txt + tail -n +4 "En-Ta English.txt" > en-ta.en + tail -n +4 "En-Ta Tamil.txt" > en-ta.ta + cd .. + ln -sf nlpc/en-ta.en nlpc.en + ln -sf nlpc/en-ta.ta nlpc.ta + + # remove previous results + rm -f all.?? + find ./ -maxdepth 1 -name "*.ta" | sort -V | xargs cat > all.ta + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + lid_filter ta all.ta $DEST/train.ta_IN-en_XX.ta_IN en all.en $DEST/train.ta_IN-en_XX.en_XX +} + +prepare_iu() { + OUTPUT_DIR=$TMP_DIR/iu + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select nh "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" "tar -zxvf Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0.1.tgz" iu Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/NunavutHansard > /dev/null & + download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.iu-en.tsv.gz" "gunzip -f wikititles-v2.iu-en.tsv.gz" iu wikititles-v2.iu-en.tsv 1 2 & + + wait + + # remove previous results + rm -f all.?? 
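+ # note: prepare_iu does not call lid_filter; the Inuktitut side is piped through the
+ # Nunavut Hansard spelling normalizer instead, and paste + awk drop any line pair where
+ # either side is empty before the two columns are written out as the train split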
+ find ./ -maxdepth 1 -name "*.iu" | sort -V | xargs cat | nh/Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/scripts/normalize-iu-spelling.pl > all.iu + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + paste all.iu all.en | awk -F $'\t' '$1!=""&&$2!=""' > all.iuen + cut -f1 all.iuen > $DEST/train.iu_CA-en_XX.iu_CA + cut -f2 all.iuen > $DEST/train.iu_CA-en_XX.en_XX +} + +prepare_km() { + OUTPUT_DIR=$TMP_DIR/km + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-km.xz" "unxz wmt20-sent.en-km.zx" km wmt20-sent.en-km 2 1 & + + # km-parallel has multiple sets, concat all of them together + mkdir -p opus + cd opus + wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/km-parallel.tgz" + tar -zxvf km-parallel.tgz + find ./km-parallel -maxdepth 1 -name "*.km" | sort -V | xargs cat > opus.km + find ./km-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en + cd .. + ln -sf opus/opus.km . + ln -sf opus/opus.en . + + wait + + # remove previous results + rm -f all.?? + find ./ -maxdepth 1 -name "*.km" | sort -V | xargs cat > all.km + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + lid_filter km all.km $DEST/train.km_KH-en_XX.km_KH en all.en $DEST/train.km_KH-en_XX.en_XX +} + +prepare_ps() { + OUTPUT_DIR=$TMP_DIR/ps + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz" "unxz wmt20-sent.en-ps.xz" ps wmt20-sent.en-ps 2 1 & + download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ps-en.tsv.gz" "gunzip -f wikititles-v2.ps-en.tsv.gz" ps wikititles-v2.ps-en.tsv 1 2 & + # ps-parallel has multiple sets, concat all of them together + mkdir -p opus + cd opus + wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz" + tar -zxvf ps-parallel.tgz + find ./ps-parallel -maxdepth 1 -name "*.ps" | sort -V | xargs cat > opus.ps + find ./ps-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en + cd .. + ln -sf opus/opus.ps opus.ps + ln -sf opus/opus.en opus.en + + wait + + # remove previous results + rm -f all.?? + find ./ -maxdepth 1 -name "*.ps" | sort -V | xargs cat > all.ps + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + lid_filter ps all.ps $DEST/train.ps_AF-en_XX.ps_AF en all.en $DEST/train.ps_AF-en_XX.en_XX +} + +download_commoncrawl() { + mkdir -p $COMMONCRAWL_DIR + cd $COMMONCRAWL_DIR + + wget -nc "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz" + tar -zxvf training-parallel-commoncrawl.tgz +} +link_commoncrawl() { + LANG=$1 + ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.en commoncrawl.en + ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.$LANG commoncrawl.$LANG +} + +strip_xlf() { + INPUT_FILE=$1 + SRC=$2 + TGT=$3 + grep '<source xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$SRC + grep '<target xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$TGT +} + +download_and_process_tilde() { + URL=$1 + UNCOMPRESS_CMD=$2 + FILENAME=$3 + LANG=$4 + PROCESS_CMD=$5 + + mkdir -p tilde + cd tilde + wget -nc $URL + $UNCOMPRESS_CMD + echo "executing cmd" + echo $PROCESS_CMD + $PROCESS_CMD + cd .. 
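+ # expose the processed files under stable tilde.$LANG / tilde.en names so the
+ # find -maxdepth 1 aggregation in the per-language prepare_* functions picks them up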
+ ln -sf tilde/$FILENAME.$LANG tilde.$LANG + ln -sf tilde/$FILENAME.en tilde.en +} + +prepare_cs() { + OUTPUT_DIR=$TMP_DIR/cs + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + #download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz" "gunzip europarl-v10.cs-en.tsv.gz" cs europarl-v10.cs-en.tsv 1 2 & + #download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-cs.txt.gz" "gunzip en-cs.txt.gz" cs en-cs.txt 2 1 & + #link_commoncrawl cs + #download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.cs-en.tsv.gz" "gunzip news-commentary-v15.cs-en.tsv.gz" cs news-commentary-v15.cs-en.tsv 1 2 & + #download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.cs-en.tsv.gz" "gunzip wikititles-v2.cs-en.tsv.gz" cs wikititles-v2.cs-en.tsv 1 2 & + #download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.cs-en.xlf.gz" "gunzip RAPID_2019.cs-en.xlf.gz" RAPID_2019.cs-en.xlf cs "strip_xlf RAPID_2019.cs-en.xlf cs en" & + #download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.cs-en.langid.tsv.gz" "gunzip WikiMatrix.v1.cs-en.langid.tsv.gz" cs WikiMatrix.v1.cs-en.langid.tsv 2 3 & + + #wait + + # remove previous results + #rm -f all.?? + #find ./ -maxdepth 1 -name "*.cs" | sort -V | xargs cat > all.cs + #find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + if [ -z $CZENG_CORPUS ] ; + then + echo "Please download CZENG_CORPUS manually and place them at $CZENG_CORPUS. Exitting..." + exit + fi + cat $CZENG_CORPUS | sed '/^$/d' | cut -f5 > all.cs + cat $CZENG_CORPUS | sed '/^$/d' | cut -f6 > all.en + + lid_filter cs all.cs $DEST/train.cs_CZ-en_XX.cs_CZ en all.en $DEST/train.cs_CZ-en_XX.en_XX +} + +prepare_de() { + OUTPUT_DIR=$TMP_DIR/de + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz" "gunzip europarl-v10.de-en.tsv.gz" de europarl-v10.de-en.tsv 1 2 & + download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-de.txt.gz" "gunzip en-de.txt.gz" de en-de.txt 2 1 & + link_commoncrawl de + download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.de-en.tsv.gz" "gunzip news-commentary-v15.de-en.tsv.gz" de news-commentary-v15.de-en.tsv 1 2 & + download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.de-en.tsv.gz" "gunzip wikititles-v2.de-en.tsv.gz" de wikititles-v2.de-en.tsv 1 2 & + download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.de-en.xlf.gz" "gunzip RAPID_2019.de-en.xlf.gz" RAPID_2019.de-en.xlf de "strip_xlf RAPID_2019.de-en.xlf de en" & + download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.de-en.langid.tsv.gz" "gunzip WikiMatrix.v1.de-en.langid.tsv.gz" de WikiMatrix.v1.de-en.langid.tsv 2 3 & + + wait + + # remove previous results + rm -f all.?? 
+ find ./ -maxdepth 1 -name "*.de" | sort -V | xargs cat > all.de
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
+ lid_filter de all.de $DEST/train.de_DE-en_XX.de_DE en all.en $DEST/train.de_DE-en_XX.en_XX
+}
+
+prepare_tmx() {
+ TMX_FILE=$1
+ git clone https://github.com/amake/TMX2Corpus $UTILS/tmx2corpus
+ pip install tinysegmenter
+
+ python $UTILS/tmx2corpus/tmx2corpus.py $TMX_FILE
+}
+
+prepare_pl() {
+ OUTPUT_DIR=$TMP_DIR/pl
+ mkdir -p $OUTPUT_DIR
+ cd $OUTPUT_DIR
+
+ # download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.pl-en.tsv.gz" "gunzip europarl-v10.pl-en.tsv.gz" pl europarl-v10.pl-en.tsv 1 2 &
+ # download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-pl.txt.gz" "gunzip en-pl.txt.gz" pl en-pl.txt 2 1 &
+ # download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.pl-en.tsv.gz" "gunzip wikititles-v2.pl-en.tsv.gz" pl wikititles-v2.pl-en.tsv 1 2 &
+ download_and_select tilde "https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2019.en-pl.tmx.zip" "unzip rapid2019.en-pl.tmx.zip" bitext pl "prepare_tmx RAPID_2019.UNIQUE.en-pl.tmx" &
+ # download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-pl.langid.tsv.gz" "gunzip WikiMatrix.v1.en-pl.langid.tsv.gz" pl WikiMatrix.v1.en-pl.langid.tsv 3 2 &
+
+ wait
+
+ # remove previous results
+ rm -f all.??
+ find ./ -maxdepth 1 -name "*.pl" | sort -V | xargs cat > all.pl
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
+ lid_filter pl all.pl $DEST/train.pl_PL-en_XX.pl_PL en all.en $DEST/train.pl_PL-en_XX.en_XX
+}
+
+prepare_uncorpus() {
+ URLS=$1
+ FILES=$2
+
+ mkdir -p uncorpus
+ cd uncorpus
+
+ for URL in $URLS; do
+ wget -nc $URL
+ done
+ cat $FILES > uncorpus.tar.gz
+ tar -zxvf uncorpus.tar.gz
+
+ cd ..
+ ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.$LANG uncorpus.$LANG
+ ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.en uncorpus.en
+}
+
+prepare_yandex() {
+ mkdir -p yandex
+ cd yandex
+ unzip $YANDEX_CORPUS ./
+ cd ..
+ ln -s yandex/corpus.en_ru.1m.en yandex.en
+ ln -s yandex/corpus.en_ru.1m.ru yandex.ru
+}
+
+prepare_ru() {
+ OUTPUT_DIR=$TMP_DIR/ru
+ mkdir -p $OUTPUT_DIR
+ cd $OUTPUT_DIR
+
+ download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" "tar -zxvf paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" ru paracrawl-release1.en-ru.zipporah0-dedup-clean &
+ link_commoncrawl ru
+ download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ru.tsv.gz" "gunzip news-commentary-v15.en-ru.tsv.gz" ru news-commentary-v15.en-ru.tsv 2 1 &
+ prepare_yandex &
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ru-en.tsv.gz" "gunzip wikititles-v2.ru-en.tsv.gz" ru wikititles-v2.ru-en.tsv 1 2 &
+ prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" "UNv1.0.en-ru.tar.gz.00 UNv1.0.en-ru.tar.gz.01 UNv1.0.en-ru.tar.gz.02" &
+ download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ru.langid.tsv.gz" "gunzip WikiMatrix.v1.en-ru.langid.tsv.gz" ru WikiMatrix.v1.en-ru.langid.tsv 3 2 &
+
+ wait
+
+ # remove previous results
+ rm -f all.??
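+ # all.ru also includes the manually supplied Yandex corpus (unzipped from $YANDEX_CORPUS,
+ # see the top of this script); everything is LID-filtered together below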
+ find ./ -maxdepth 1 -name "*.ru" | sort -V | xargs cat > all.ru + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + lid_filter ru all.ru $DEST/train.ru_RU-en_XX.ru_RU en all.en $DEST/train.ru_RU-en_XX.en_XX +} + +prepare_ccmt() { + mkdir -p ccmt + cd ccmt + # assume ccmt data is already unzipped under CCMT_DIR folder + cat $CCMT_DIR/datum2017/Book*_cn.txt | sed 's/ //g' > datum2017.detok.zh + cat $CCMT_DIR/datum2017/Book*_en.txt > datum2017.detok.en + cat $CCMT_DIR/casict2011/casict-A_ch.txt $CCMT_DIR/casict2011/casict-B_ch.txt $CCMT_DIR/casict2015/casict2015_ch.txt $CCMT_DIR/datum2015/datum_ch.txt $CCMT_DIR/neu2017/NEU_cn.txt datum2017.detok.zh > ccmt.zh + cat $CCMT_DIR/casict2011/casict-A_en.txt $CCMT_DIR/casict2011/casict-B_en.txt $CCMT_DIR/casict2015/casict2015_en.txt $CCMT_DIR/datum2015/datum_en.txt $CCMT_DIR/neu2017/NEU_en.txt datum2017.detok.en > ccmt.en + cd .. + ln -sf ccmt/ccmt.zh ccmt.zh + ln -sf ccmt/ccmt.en ccmt.en +} + +prepare_zh() { + OUTPUT_DIR=$TMP_DIR/zh + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + + download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-zh.tsv.gz" "gunzip news-commentary-v15.en-zh.tsv.gz" zh news-commentary-v15.en-zh.tsv 2 1 & + download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.zh-en.tsv.gz" "gunzip wikititles-v2.zh-en.tsv.gz" zh wikititles-v2.zh-en.tsv 1 2 & + prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" "UNv1.0.en-zh.tar.gz.00 UNv1.0.en-zh.tar.gz.01" & + prepare_ccmt & + download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-zh.langid.tsv.gz" "gunzip WikiMatrix.v1.en-zh.langid.tsv.gz" zh WikiMatrix.v1.en-zh.langid.tsv 3 2 & + + wait + + # remove previous results + rm -f all.?? 
+ find ./ -maxdepth 1 -name "*.zh" | sort -V | xargs cat > all.zh + find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en + lid_filter zh all.zh $DEST/train.zh_CN-en_XX.zh_CN en all.en $DEST/train.zh_CN-en_XX.en_XX +} + +prepare_tests() { + OUTPUT_DIR=$TMP_DIR + mkdir -p $OUTPUT_DIR + cd $OUTPUT_DIR + wget -nc http://data.statmt.org/wmt20/translation-task/dev.tgz + tar -zxvf dev.tgz + cd dev + + cat newsdev2020-jaen-src.ja.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.ja + cat newsdev2020-jaen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.en + split newsdev2020-jaen.ja -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.ja_XX + split newsdev2020-jaen.en -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.en_XX + split newsdev2020-jaen.ja -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.ja_XX + split newsdev2020-jaen.en -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.en_XX + + cat newsdev2020-iuen-src.iu.sgm | strip_sgm.sh > newsdev2020-iuen.iu + cat newsdev2020-iuen-ref.en.sgm | strip_sgm.sh > newsdev2020-iuen.en + split newsdev2020-iuen.iu -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.iu_CA + split newsdev2020-iuen.en -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.en_XX + split newsdev2020-iuen.iu -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.iu_CA + split newsdev2020-iuen.en -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.en_XX + + cat newsdev2020-taen-src.ta.sgm | strip_sgm.sh > newsdev2020-taen.ta + cat newsdev2020-taen-ref.en.sgm | strip_sgm.sh > newsdev2020-taen.en + split newsdev2020-taen.ta -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.ta_IN + split newsdev2020-taen.en -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.en_XX + split newsdev2020-taen.ta -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.ta_IN + split newsdev2020-taen.en -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.en_XX + + cp wikipedia.dev.km-en.km $DEST/valid.km_KH-en_XX.km_KH + cp wikipedia.dev.km-en.en $DEST/valid.km_KH-en_XX.en_XX + cp wikipedia.devtest.km-en.km $DEST/test.km_KH-en_XX.km_KH + cp wikipedia.devtest.km-en.en $DEST/test.km_KH-en_XX.en_XX + + cp wikipedia.dev.ps-en.ps $DEST/valid.ps_AF-en_XX.ps_AF + cp wikipedia.dev.ps-en.en $DEST/valid.ps_AF-en_XX.en_XX + cp wikipedia.devtest.ps-en.ps $DEST/test.ps_AF-en_XX.ps_AF + cp wikipedia.devtest.ps-en.en $DEST/test.ps_AF-en_XX.en_XX + + cat newsdev2020-plen-src.pl.sgm | strip_sgm.sh > newsdev2020-plen.pl + cat newsdev2020-plen-ref.en.sgm | strip_sgm.sh > newsdev2020-plen.en + split newsdev2020-plen.pl -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.pl_PL + split newsdev2020-plen.en -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.en_XX + split newsdev2020-plen.pl -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.pl_PL + split newsdev2020-plen.en -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.en_XX + + cat newstest2018-encs-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.en_XX + cat newstest2018-encs-ref.cs.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.cs_CZ + cat newstest2019-encs-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.en_XX + cat newstest2019-encs-ref.cs.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.cs_CZ + + cat newstest2018-deen-src.de.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.de_DE + cat newstest2018-deen-ref.en.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.en_XX + cat newstest2018-ende-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.en_XX + cat newstest2018-ende-ref.de.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.de_DE + cat newstest2019-deen-src.de.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.de_DE + cat newstest2019-deen-ref.en.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.en_XX + cat newstest2019-ende-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.en_XX + 
cat newstest2019-ende-ref.de.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.de_DE + + cat newstest2018-ruen-src.ru.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.ru_RU + cat newstest2018-ruen-ref.en.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.en_XX + cat newstest2018-enru-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.en_XX + cat newstest2018-enru-ref.ru.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.ru_RU + cat newstest2019-ruen-src.ru.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.ru_RU + cat newstest2019-ruen-ref.en.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.en_XX + cat newstest2019-enru-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.en_XX + cat newstest2019-enru-ref.ru.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.ru_RU + + cat newstest2018-zhen-src.zh.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.zh_CN + cat newstest2018-zhen-ref.en.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.en_XX + cat newstest2018-enzh-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.en_XX + cat newstest2018-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.zh_CN + cat newstest2019-zhen-src.zh.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.zh_CN + cat newstest2019-zhen-ref.en.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.en_XX + cat newstest2019-enzh-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.en_XX + cat newstest2019-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.zh_CN +} + +mkdir -p $DEST + +prepare_lid +prepare_moses +download_commoncrawl + +prepare_ja & +prepare_ta & +prepare_km & +prepare_ps & +prepare_iu & +prepare_cs & +prepare_de & +prepare_pl & +prepare_ru & +prepare_zh & + +# prepare valid/test set +prepare_tests & + +# wait + +# TODO remove intermediate files +# rm -rf $TMP_DIR diff --git a/examples/multilingual/data_scripts/preprocess_ML50_v1.sh b/examples/multilingual/data_scripts/preprocess_ML50_v1.sh new file mode 100644 index 0000000000..4655936149 --- /dev/null +++ b/examples/multilingual/data_scripts/preprocess_ML50_v1.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +if [ -z $WORKDIR_ROOT ] ; +then + echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..." + exit +fi + +if [ -z $SPM_PATH ] ; +then + echo "Please install sentence piecence from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. Exitting..." + exit +fi + +ML50=${WORKDIR_ROOT}/ML50 + +mkdir -p $ML50/dedup +mkdir -p $ML50/cleaned_dedup + +python ./dedup_all.py --from-folder $ML50/raw --to-folder $ML50/dedup +python ./remove_valid_test_in_train.py --from-folder $ML50/dedup --to-folder $ML50/clean +python ./binarize.py --raw-folder $ML50/clean \ No newline at end of file diff --git a/examples/multilingual/data_scripts/remove_valid_test_in_train.py b/examples/multilingual/data_scripts/remove_valid_test_in_train.py new file mode 100755 index 0000000000..ef618adef7 --- /dev/null +++ b/examples/multilingual/data_scripts/remove_valid_test_in_train.py @@ -0,0 +1,290 @@ +import os, sys +import glob, itertools +import pandas as pd + +WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) + +if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): + print('please specify your working directory root in OS environment variable WORKDIR_ROOT. 
Exitting..."') + sys.exit(-1) + + +def load_langs(path): + with open(path) as fr: + langs = [l.strip() for l in fr] + return langs + + + +def load_sentences(raw_data, split, direction): + src, tgt = direction.split('-') + src_path = f"{raw_data}/{split}.{direction}.{src}" + tgt_path = f"{raw_data}/{split}.{direction}.{tgt}" + if os.path.exists(src_path) and os.path.exists(tgt_path): + return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())] + else: + return [] + +def swap_direction(d): + src, tgt = d.split('-') + return f'{tgt}-{src}' + +def get_all_test_data(raw_data, directions, split='test'): + test_data = [ + x + for dd in directions + for d in [dd, swap_direction(dd)] + for x in load_sentences(raw_data, split, d) + ] + # all_test_data = {s for _, d in test_data for s in d} + all_test_data = {} + for lang, d in test_data: + for s in d: + s = s.strip() + lgs = all_test_data.get(s, set()) + lgs.add(lang) + all_test_data[s] = lgs + return all_test_data, test_data + +def check_train_sentences(raw_data, direction, all_test_data, mess_up_train={}): + src, tgt = direction.split('-') + tgt_path = f"{raw_data}/train.{direction}.{tgt}" + src_path = f"{raw_data}/train.{direction}.{src}" + print(f'check training data in {raw_data}/train.{direction}') + size = 0 + if not os.path.exists(tgt_path) or not os.path.exists(src_path): + return mess_up_train, size + with open(src_path) as f, open(tgt_path) as g: + for src_line, tgt_line in zip(f, g): + s = src_line.strip() + t = tgt_line.strip() + size += 1 + if s in all_test_data: + langs = mess_up_train.get(s, set()) + langs.add(direction) + mess_up_train[s] = langs + if t in all_test_data: + langs = mess_up_train.get(t, set()) + langs.add(direction) + mess_up_train[t] = langs + return mess_up_train, size + +def check_train_all(raw_data, directions, all_test_data): + mess_up_train = {} + data_sizes = {} + for direction in directions: + _, size = check_train_sentences(raw_data, direction, all_test_data, mess_up_train) + data_sizes[direction] = size + return mess_up_train, data_sizes + +def count_train_in_other_set(mess_up_train): + train_in_others = [(direction, s) for s, directions in mess_up_train.items() for direction in directions] + counts = {} + for direction, s in train_in_others: + counts[direction] = counts.get(direction, 0) + 1 + return counts + +def train_size_if_remove_in_otherset(data_sizes, mess_up_train): + counts_in_other = count_train_in_other_set(mess_up_train) + remain_sizes = [] + for direction, count in counts_in_other.items(): + remain_sizes.append((direction, data_sizes[direction] - count, data_sizes[direction], count, 100 * count / data_sizes[direction] )) + return remain_sizes + + +def remove_messed_up_sentences(raw_data, direction, mess_up_train, mess_up_train_pairs, corrected_langs): + split = 'train' + src_lang, tgt_lang = direction.split('-') + + tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}" + src = f"{raw_data}/{split}.{direction}.{src_lang}" + print(f'working on {direction}: ', src, tgt) + if not os.path.exists(tgt) or not os.path.exists(src) : + return + + corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}" + corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}" + line_num = 0 + keep_num = 0 + with open(src, encoding='utf8',) as fsrc, \ + open(tgt, encoding='utf8',) as ftgt, \ + open(corrected_src, 'w', encoding='utf8') as fsrc_corrected, \ + open(corrected_tgt, 'w', encoding='utf8') as ftgt_corrected: + for s, t in zip(fsrc, ftgt): + s = s.strip() + t = t.strip() 
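+ # keep a pair only if neither sentence appears on its own in any valid/test set and the
+ # pair itself (in either order) was not seen there either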
+ if t not in mess_up_train \ + and s not in mess_up_train \ + and (s, t) not in mess_up_train_pairs \ + and (t, s) not in mess_up_train_pairs: + corrected_langs.add(direction) + print(s, file=fsrc_corrected) + print(t, file=ftgt_corrected) + keep_num += 1 + line_num += 1 + if line_num % 1000 == 0: + print(f'completed {line_num} lines', end='\r') + return line_num, keep_num + +########## + + +def merge_valid_test_messup(mess_up_train_valid, mess_up_train_test): + merged_mess = [] + for s in set(list(mess_up_train_valid.keys()) + list(mess_up_train_test.keys())): + if not s: + continue + valid = mess_up_train_valid.get(s, set()) + test = mess_up_train_test.get(s, set()) + merged_mess.append((s, valid | test)) + return dict(merged_mess) + + + +######### +def check_train_pairs(raw_data, direction, all_test_data, mess_up_train={}): + src, tgt = direction.split('-') + #a hack; TODO: check the reversed directions + path1 = f"{raw_data}/train.{src}-{tgt}.{src}" + path2 = f"{raw_data}/train.{src}-{tgt}.{tgt}" + if not os.path.exists(path1) or not os.path.exists(path2) : + return + + with open(path1) as f1, open(path2) as f2: + for src_line, tgt_line in zip(f1, f2): + s = src_line.strip() + t = tgt_line.strip() + if (s, t) in all_test_data or (t, s) in all_test_data: + langs = mess_up_train.get( (s, t), set()) + langs.add(src) + langs.add(tgt) + mess_up_train[(s, t)] = langs + + +def load_pairs(raw_data, split, direction): + src, tgt = direction.split('-') + src_f = f"{raw_data}/{split}.{direction}.{src}" + tgt_f = f"{raw_data}/{split}.{direction}.{tgt}" + if tgt != 'en_XX': + src_f, tgt_f = tgt_f, src_f + if os.path.exists(src_f) and os.path.exists(tgt_f): + return list(zip(open(src_f).read().splitlines(), + open(tgt_f).read().splitlines(), + )) + else: + return [] + +# skip_langs = ['cs_CZ', 'en_XX', 'tl_XX', 'tr_TR'] +def get_messed_up_test_pairs(split, directions): + test_pairs = [ + (d, load_pairs(raw_data, split, d)) + for d in directions + ] + # all_test_data = {s for _, d in test_data for s in d} + all_test_pairs = {} + for direction, d in test_pairs: + src, tgt = direction.split('-') + for s in d: + langs = all_test_pairs.get(s, set()) + langs.add(src) + langs.add(tgt) + all_test_pairs[s] = langs + mess_up_train_pairs = {} + for direction in directions: + check_train_pairs(raw_data, direction, all_test_pairs, mess_up_train_pairs) + return all_test_pairs, mess_up_train_pairs + + + +if __name__ == "__main__": + ####### + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + '--from-folder', + required=True, + type=str) + parser.add_argument( + '--to-folder', + required=True, + type=str) + parser.add_argument( + '--directions', + default=None, + type=str) + + + args = parser.parse_args() + raw_data = args.from_folder + to_folder = args.to_folder + os.makedirs(to_folder, exist_ok=True) + + if args.directions: + directions = args.directions.split(',') + else: + raw_files = itertools.chain( + glob.glob(f'{raw_data}/train*'), + glob.glob(f'{raw_data}/valid*'), + glob.glob(f'{raw_data}/test*'), + ) + directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files] + print('working on directions: ', directions) + + ########## + + + + all_test_data, test_data = get_all_test_data(raw_data, directions, 'test') + print('==loaded test data==') + all_valid_data, valid_data = get_all_test_data(raw_data, directions, 'valid') + print('==loaded valid data==') + all_valid_test_data = merge_valid_test_messup(all_test_data, all_valid_data) + mess_up_train, data_sizes 
= check_train_all(raw_data, directions, all_valid_test_data) + print('training messing up with valid, test data:', len(mess_up_train)) + data_situation = train_size_if_remove_in_otherset(data_sizes, mess_up_train) + df = pd.DataFrame(data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent']) + df.sort_values('remove_percent', ascending=False) + df.to_csv(f'{raw_data}/clean_summary.tsv', sep='\t') + print(f'projected data clean summary in: {raw_data}/clean_summary.tsv') + + # correct the dataset: + all_test_pairs, mess_up_test_train_pairs = get_messed_up_test_pairs('test', directions) + all_valid_pairs, mess_up_valid_train_pairs = get_messed_up_test_pairs('valid', directions) + + all_messed_pairs = set(mess_up_test_train_pairs.keys()).union(set(mess_up_valid_train_pairs.keys())) + corrected_directions = set() + + real_data_situation = [] + for direction in directions: + org_size, new_size = remove_messed_up_sentences(raw_data, direction, mess_up_train, all_messed_pairs, corrected_directions) + if org_size == 0: + print(f"{direction} has size 0") + continue + real_data_situation.append( + (direction, new_size, org_size, org_size - new_size, (org_size - new_size) / org_size * 100) + ) + print('corrected directions: ', corrected_directions) + df = pd.DataFrame(real_data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent']) + df.sort_values('remove_percent', ascending=False) + df.to_csv(f'{raw_data}/actual_clean_summary.tsv', sep='\t') + print(f'actual data clean summary (which can be different from the projected one because of duplications) in: {raw_data}/actual_clean_summary.tsv') + + import shutil + for direction in directions: + src_lang, tgt_lang = direction.split('-') + for split in ['train', 'valid', 'test']: + # copying valid, test and uncorrected train + if direction in corrected_directions and split == 'train': + continue + tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}" + src = f"{raw_data}/{split}.{direction}.{src_lang}" + if not (os.path.exists(src) and os.path.exists(tgt)): + continue + corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}" + corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}" + print(f'copying {src} to {corrected_src}') + shutil.copyfile(src, corrected_src) + print(f'copying {tgt} to {corrected_tgt}') + shutil.copyfile(tgt, corrected_tgt) + + print('completed') \ No newline at end of file diff --git a/examples/multilingual/data_scripts/requirement.txt b/examples/multilingual/data_scripts/requirement.txt new file mode 100644 index 0000000000..e85d7d540e --- /dev/null +++ b/examples/multilingual/data_scripts/requirement.txt @@ -0,0 +1,2 @@ +wget +pandas \ No newline at end of file diff --git a/examples/multilingual/data_scripts/utils/dedup.py b/examples/multilingual/data_scripts/utils/dedup.py new file mode 100644 index 0000000000..d6fed8c695 --- /dev/null +++ b/examples/multilingual/data_scripts/utils/dedup.py @@ -0,0 +1,41 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import argparse + +def deup(src_file, tgt_file, src_file_out, tgt_file_out): + seen = set() + dup_count = 0 + with open(src_file, encoding='utf-8') as fsrc, \ + open(tgt_file, encoding='utf-8') as ftgt, \ + open(src_file_out, 'w', encoding='utf-8') as fsrc_out, \ + open(tgt_file_out, 'w', encoding='utf-8') as ftgt_out: + for s, t in zip(fsrc, ftgt): + if (s, t) not in seen: + fsrc_out.write(s) + ftgt_out.write(t) + seen.add((s, t)) + else: + dup_count += 1 + print(f'number of duplication: {dup_count}') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--src-file", type=str, required=True, + help="src file") + parser.add_argument("--tgt-file", type=str, required=True, + help="tgt file") + parser.add_argument("--src-file-out", type=str, required=True, + help="src ouptut file") + parser.add_argument("--tgt-file-out", type=str, required=True, + help="tgt ouput file") + args = parser.parse_args() + deup(args.src_file, args.tgt_file, args.src_file_out, args.tgt_file_out) + + +if __name__ == "__main__": + main() diff --git a/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py b/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py new file mode 100644 index 0000000000..41b38ba5be --- /dev/null +++ b/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +#!/bin/python + +import fasttext +from multiprocessing import Pool +import contextlib +import sys +import argparse +from functools import partial +import io + +model = None +def init(model_path): + global model + model = fasttext.load_model(model_path) + +def pred(lines): + return lines, [model.predict(line.strip())[0][0][9:] for line in lines] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True, + help="model to load") + parser.add_argument("--inputs", nargs="+", default=['-'], + help="input files to filter") + parser.add_argument("--langs", nargs="+", required=True, + help="lang ids of each input file") + parser.add_argument("--outputs", nargs="+", default=['-'], + help="path to save lid filtered outputs") + parser.add_argument("--num-workers", type=int, metavar="N", default=10, + help="number of processes in parallel") + args = parser.parse_args() + + assert len(args.inputs) == len(args.langs) and len(args.inputs) == len(args.outputs) + + with contextlib.ExitStack() as stack: + inputs = [ + stack.enter_context(open(input, "r", encoding="utf-8", newline="\n", errors="replace")) + if input != "-" else io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors="replace") + for input in args.inputs + ] + outputs = [ + stack.enter_context(open(output, "w", encoding="utf-8", newline="\n")) + if output != "-" else sys.stdout + for output in args.outputs + ] + with Pool(args.num_workers, initializer=partial(init, args.model)) as p: + skip_cnt = 0 + for lines, preds in p.imap(pred, list(zip(*inputs)), chunksize=500): + if not all(a == b for a, b in zip(preds, args.langs)): + skip_cnt += 1 + continue + for line, output_h in zip(lines, outputs): + print(line.strip(), file=output_h) + print(f"Skipped {skip_cnt} lines.") + +if __name__ == "__main__": + main() diff --git a/examples/multilingual/data_scripts/utils/strip_sgm.sh b/examples/multilingual/data_scripts/utils/strip_sgm.sh new file mode 100755 index 0000000000..7f4f61d7b1 --- 
/dev/null +++ b/examples/multilingual/data_scripts/utils/strip_sgm.sh @@ -0,0 +1 @@ +grep "seg id" | sed 's/<seg id="[0-9]\+">//g' | sed 's/<\/seg>//g' diff --git a/examples/multilingual/finetune_multilingual_model.sh b/examples/multilingual/finetune_multilingual_model.sh index cfa9a86113..25960c5dc8 100644 --- a/examples/multilingual/finetune_multilingual_model.sh +++ b/examples/multilingual/finetune_multilingual_model.sh @@ -1,4 +1,9 @@ #!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. path_2_data=$1 # <path to data> which contains binarized data for each directions lang_list=$2 # <path to a file which contains a list of languages separted by new lines> @@ -20,7 +25,7 @@ fairseq-train "$path_2_data" \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \ + --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ diff --git a/examples/multilingual/multilingual_fairseq_gen.sh b/examples/multilingual/multilingual_fairseq_gen.sh index 8c2c7703b2..65aa322d7d 100644 --- a/examples/multilingual/multilingual_fairseq_gen.sh +++ b/examples/multilingual/multilingual_fairseq_gen.sh @@ -1,4 +1,9 @@ #!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. lang_pairs="en-fr,en-cs,fr-en,cs-en" path_2_data=$1 # <path to data> diff --git a/examples/multilingual/train_multilingual_model.sh b/examples/multilingual/train_multilingual_model.sh index 09014c8217..cc050bd3f0 100644 --- a/examples/multilingual/train_multilingual_model.sh +++ b/examples/multilingual/train_multilingual_model.sh @@ -1,4 +1,9 @@ #!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. path_2_data=$1 # <path to data> which contains binarized data for each directions lang_list=$2 # <path to a file which contains a list of languages separted by new lines> @@ -16,7 +21,7 @@ fairseq-train "$path_2_data" \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \ + --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ diff --git a/examples/noisychannel/rerank.py b/examples/noisychannel/rerank.py index b5ffd1ca34..bb80d11a67 100644 --- a/examples/noisychannel/rerank.py +++ b/examples/noisychannel/rerank.py @@ -11,7 +11,7 @@ from fairseq.data import dictionary from fairseq.scoring import bleu -from . 
import ( +from examples.noisychannel import ( rerank_generate, rerank_options, rerank_score_bw, diff --git a/examples/noisychannel/rerank_generate.py b/examples/noisychannel/rerank_generate.py index d512088de8..daeeae059a 100644 --- a/examples/noisychannel/rerank_generate.py +++ b/examples/noisychannel/rerank_generate.py @@ -15,7 +15,7 @@ from fairseq import options from fairseq_cli import generate, preprocess -from . import rerank_options, rerank_utils +from examples.noisychannel import rerank_options, rerank_utils def gen_and_reprocess_nbest(args): diff --git a/examples/noisychannel/rerank_score_bw.py b/examples/noisychannel/rerank_score_bw.py index 895673b1cc..b0bc913651 100644 --- a/examples/noisychannel/rerank_score_bw.py +++ b/examples/noisychannel/rerank_score_bw.py @@ -9,7 +9,7 @@ from fairseq import options from fairseq_cli import generate -from . import rerank_options, rerank_utils +from examples.noisychannel import rerank_options, rerank_utils def score_bw(args): diff --git a/examples/noisychannel/rerank_score_lm.py b/examples/noisychannel/rerank_score_lm.py index 89ebf61cce..e80948d78b 100644 --- a/examples/noisychannel/rerank_score_lm.py +++ b/examples/noisychannel/rerank_score_lm.py @@ -7,7 +7,7 @@ from fairseq import options -from . import rerank_options, rerank_utils +from examples.noisychannel import rerank_options, rerank_utils def score_lm(args): diff --git a/examples/noisychannel/rerank_tune.py b/examples/noisychannel/rerank_tune.py index 1be71744a3..b2e8b7594a 100644 --- a/examples/noisychannel/rerank_tune.py +++ b/examples/noisychannel/rerank_tune.py @@ -9,7 +9,7 @@ import numpy as np from fairseq import options -from . import rerank, rerank_options +from examples.noisychannel import rerank, rerank_options def random_search(args): diff --git a/examples/nonautoregressive_translation/README.md b/examples/nonautoregressive_translation/README.md index dfc592f0a0..8793e225c9 100644 --- a/examples/nonautoregressive_translation/README.md +++ b/examples/nonautoregressive_translation/README.md @@ -36,7 +36,7 @@ The following command will train a *Levenshtein Transformer* on the binarized da fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ @@ -44,7 +44,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ diff --git a/examples/nonautoregressive_translation/scripts.md b/examples/nonautoregressive_translation/scripts.md index 63b945c1d3..9d3d7b67dc 100644 --- a/examples/nonautoregressive_translation/scripts.md +++ b/examples/nonautoregressive_translation/scripts.md @@ -6,7 +6,7 @@ Note that we need to have an additional module to perform "length prediction" (` fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nonautoregressive_transformer \ @@ -14,7 +14,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 
0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ @@ -35,7 +35,7 @@ Note that we implemented a low-rank appromixated CRF model by setting `--crf-low fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nacrf_transformer \ @@ -43,7 +43,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ @@ -68,7 +68,7 @@ Note that `--train-step` means how many iterations of refinement we used during fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch iterative_nonautoregressive_transformer \ @@ -76,7 +76,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ @@ -101,7 +101,7 @@ Note that we need to specify the "slot-loss" (uniform or balanced tree) describe fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch insertion_transformer \ @@ -109,7 +109,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ @@ -128,7 +128,7 @@ fairseq-train \ fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch cmlm_transformer \ @@ -136,7 +136,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ @@ -157,7 +157,7 @@ fairseq-train \ fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ @@ -165,7 +165,7 @@ fairseq-train \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ - --min-lr '1e-09' --warmup-updates 10000 \ + --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ diff --git a/examples/normformer/README.md b/examples/normformer/README.md new file mode 100644 index 0000000000..037b453ff1 --- /dev/null +++ b/examples/normformer/README.md @@ -0,0 +1,70 @@ +### NormFormer +This is the code for the ["NormFormer: Improved Transformer Pretraining with Extra 
Normalization"](https://arxiv.org/abs/2110.09456) +- 2021-10-19: Commands for CLM Experiments +- Coming soon: Commands for MLM experiments + +If you have any issues or questions please post a github issue and tag `@sshleifer`. + + +### Data +- To preprocess language modeling data, see [here](https://github.com/pytorch/fairseq/blob/d0fbcb0baef6f6ff3425ded62d8daea0e8b12114/examples/language_model/README.md#1-preprocess-the-data). +- The replication commands below expect `$DATA` to be the path to the binarized data directory. +- Note that NormFormer results in Table 2 use a much larger private dataset, and to get good results you should adapt the pre-processing instructions to your dataset and compare to a baseline on the same data, rather than Table 2. +- The code uses `FSDP`, which requires `pip install fairscale>=0.4.0`. + + +### Modify existing Command +To modify an existing `fairseq-train` command to use NormFormer, simply add the following flags: +```bash +fairseq-train ... \ + --scale-attn --scale-fc --scale-heads +``` +- you probably also want to increase your learning rate +- if your model is small, you may want to add `--scale-resids` + +### Exact Training Commands + +- Note that NormFormer results in Table 2 use a much larger private dataset, and to get good results you should adapt the pre-processing instructions to your dataset. +The full commands are functions defined here, so to run them you must `source examples/normformer/train_lm.sh`. +- We default `--distributed-world-size 8`. You should adjust `--update-freq` and `--batch-size` and such that the effective batch size is (1024x1024x0.5) tokens for 125M and 355M, + and (1024x1024) for 1.3B parameter and above. For small models, `--update-freq`=256/`global_bs`. For large models, `--update-freq`=512/`global_bs`, where `global_bs` = `--batch-size` * `--distributed-world-size` +- The small models will all train on as few as 8 GPUs. 
+ +```bash +train_125M --lr 6e-4 # GPT-3 Replicated +train_125M --lr 1e-3 # stronger high-lr baseline +train_125M --lr 3e-3 --scale-attn --scale-fc --scale-heads # No scale-resids +train_125M --lr 3e-3 --scale-attn --scale-fc --scale-heads --scale-resids # Best command +``` + +```bash +train_355M --lr 6e-4 # GPT-3 Replicated +train_355M --lr 1e-3 # stronger high-lr baseline +train_355M --lr 1e-3 --scale-attn --scale-fc --scale-heads # No scale-resids +train_355M --lr 1e-3 --scale-attn --scale-fc --scale-heads --scale-resids # Slightly better +``` + +```bash +train_1.3B --lr 2e-4 # GPT-3 Replicated +train_1.3B --lr 6e-4 # stronger high-lr baseline +train_1.3B --lr 6e-4 --scale-attn --scale-fc --scale-heads # NormFormer +``` + +```bash +train_2.7B --lr 1.6e-4 # GPT-3 Replicated +train_2.7B --lr 1.6e-4 --activation-fn relu_squared # stronger Relu^2 baseline +train_2.7B --lr 6e-4 --activation-fn relu_squared --scale-attn --scale-fc --scale-heads # NormFormer 2.7B +``` + + +### Citation +```bibtex +@misc{shleifer2021normformer, + title={NormFormer: Improved Transformer Pretraining with Extra Normalization}, + author={Sam Shleifer and Jason Weston and Myle Ott}, + year={2021}, + eprint={2110.09456}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/examples/normformer/train_lm.sh b/examples/normformer/train_lm.sh new file mode 100644 index 0000000000..b081f2ddd3 --- /dev/null +++ b/examples/normformer/train_lm.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +train_common () { + fairseq-train "$DATA" \ + --combine-val \ + --train-subset train \ + --num-workers 2 \ + --validate-interval-updates 1000 \ + --save-interval-updates 1000 \ + --no-epoch-checkpoints \ + --ddp-backend fully_sharded \ + --memory-efficient-fp16 \ + --fp16-init-scale 4 \ + --checkpoint-activations \ + --arch transformer_lm_gpt \ + --activation-fn gelu \ + --share-decoder-input-output-embed \ + --task language_modeling \ + --sample-break-mode none \ + --tokens-per-sample 2048 \ + --optimizer adam --adam-betas "(0.9, 0.98)" \ + --adam-eps 1e-08 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay \ + --warmup-updates 750 \ + --dropout 0.1 \ + --attention-dropout 0.1 \ + --weight-decay 0.01 \ + --batch-size 16 \ + --update-freq 2 \ + --required-batch-size-multiple 1 \ + --total-num-update 572204 \ + --max-update 572204 \ + --seed 1 \ + --log-format json --log-interval 1 \ + --distributed-world-size 8 --distributed-port 13177 \ + "$@" +} + +train_125M () { + train_common --decoder-layers 12 \ + --decoder-embed-dim 768 \ + --decoder-ffn-embed-dim 3072 \ + --decoder-attention-heads 12 "$@" +} + +train_355M () { + train_common --decoder-layers 24 \ + --decoder-embed-dim 1024\ + --decoder-ffn-embed-dim 4096 \ + --decoder-attention-heads 16 \ + --dropout 0.0 \ + --attention-dropout 0.0 \ + "$@" +} + +train_1.3B () { + train_common --decoder-layers 24 \ + --decoder-embed-dim 2048 \ + --decoder-ffn-embed-dim 8192 \ + --decoder-attention-heads 32 \ + --batch-size 4 \ + --update-freq 16 \ + --total-num-update 286102 \ + --max-update 286102 \ + "$@" +} + +train_2.7B () { + train_common --decoder-layers 32 \ + --decoder-embed-dim 2560 \ + --decoder-ffn-embed-dim 10240 \ + --decoder-attention-heads 32 \ + --batch-size 4 \ + --update-freq 16 \ + --total-num-update 286102 \ + --max-update 286102 \ + "$@" +} diff --git a/examples/operators/alignment_train_cpu.cpp b/examples/operators/alignment_train_cpu.cpp new file mode 100644 index 0000000000..13c015308e --- /dev/null +++ b/examples/operators/alignment_train_cpu.cpp @@ -0,0 
+1,166 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <torch/extension.h> // @manual=//caffe2:torch_extension +#include <algorithm> + +namespace { + +template <typename T> +void exclusiveCumprod( + const T* p_choose, + T* cumprod_1mp, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len) { + // cumprod_1mp = 1 - p_choose + for (uint32_t b = 0; b < bsz; b++) { + for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { + for (uint32_t src = 0; src < src_len; src++) { + uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; + cumprod_1mp[idx] = 1 - p_choose[idx]; + } + } + } + + // Implementing exclusive cumprod in the innermost dimension + // cumprod_1mp = cumprod(1 - p_choose) + // There is cumprod in pytorch, however there is no exclusive mode. + // cumprod(x) = [x1, x1x2, x2x3x4, ..., prod_{i=1}^n x_i] + // exclusive means + // cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] + for (uint32_t b = 0; b < bsz; b++) { + for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { + uint32_t idx_offset = b * tgt_len * src_len + tgt * src_len; + T prev = cumprod_1mp[idx_offset]; + // index [b][tgt][0] + cumprod_1mp[idx_offset] = (T)1.0; + T curr; + for (uint32_t src = 1; src < src_len; src++) { + uint32_t idx = idx_offset + src; + curr = cumprod_1mp[idx]; + cumprod_1mp[idx] = cumprod_1mp[idx - 1] * prev; + prev = curr; + } + } + } +} + +template <typename T> +void clamp( + const T* cumprod_1mp, + T* cumprod_1mp_clamp, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len, + T min_val, + T max_val) { + for (uint32_t b = 0; b < bsz; b++) { + for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { + for (uint32_t src = 0; src < src_len; src++) { + uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; + if (cumprod_1mp[idx] < min_val) { + cumprod_1mp_clamp[idx] = min_val; + } else if (cumprod_1mp[idx] > max_val) { + cumprod_1mp_clamp[idx] = max_val; + } else { + cumprod_1mp_clamp[idx] = cumprod_1mp[idx]; + } + } + } + } +} + +template <typename T> +void alignmentTrainCPUImpl( + const T* p_choose, + T* alpha, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len, + float eps) { + // p_choose: bsz , tgt_len, src_len + // cumprod_1mp: bsz , tgt_len, src_len + // cumprod_1mp_clamp : bsz, tgt_len, src_len + // alpha: bsz + 1, tgt_len, src_len + + uint32_t elements = bsz * tgt_len * src_len; + T* cumprod_1mp = new T[elements]; + T* cumprod_1mp_clamp = new T[elements]; + + exclusiveCumprod<T>(p_choose, cumprod_1mp, bsz, tgt_len, src_len); + clamp<T>( + cumprod_1mp, cumprod_1mp_clamp, bsz, tgt_len, src_len, (T)eps, (T)1.0); + + // ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) + + // Initialize alpha [:, 0, 0] + for (uint32_t b = 0; b < bsz; b++) { + alpha[b * tgt_len * src_len] = 1.0; + } + + for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { + for (uint32_t b = 0; b < bsz; b++) { + uint32_t alpha_idx, inout_idx; + T prev_scan = 0, curr_scan, out; + for (uint32_t src = 0; src < src_len; src++) { + // Apply scan/cumsum + if (tgt == 0) { + // alpha index is [b][tgt][src] + alpha_idx = b * tgt_len * src_len + src; + } else { + // alpha index is [b][tgt-1][src] + alpha_idx = b * tgt_len * src_len + (tgt - 1) * src_len + src; + } + // input index is [b][tgt][src] + inout_idx = b * tgt_len * src_len + tgt * src_len + src; + curr_scan = prev_scan + alpha[alpha_idx] / cumprod_1mp_clamp[inout_idx]; + + out = curr_scan * p_choose[inout_idx] 
* cumprod_1mp[inout_idx]; + alpha[inout_idx] = std::min<T>(std::max<T>(out, 0), 1.0); + prev_scan = curr_scan; + } + } + } + + free(cumprod_1mp); + free(cumprod_1mp_clamp); +} + +void alignmentTrainCPU( + const torch::Tensor& p_choose, + torch::Tensor& alpha, + float eps) { + uint32_t bsz = p_choose.size(0); + uint32_t tgt_len = p_choose.size(1); + uint32_t src_len = p_choose.size(2); + + AT_DISPATCH_FLOATING_TYPES_AND2( + torch::ScalarType::Half, + torch::ScalarType::BFloat16, + p_choose.scalar_type(), + "alignmentCPUImpl", + [&]() { + alignmentTrainCPUImpl<scalar_t>( + p_choose.data_ptr<scalar_t>(), + alpha.data_ptr<scalar_t>(), + bsz, + tgt_len, + src_len, + eps); + }); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def( + "alignment_train_cpu", + &alignmentTrainCPU, + "expected_alignment_from_p_choose (CPU)"); +} + +} // namespace diff --git a/examples/operators/alignment_train_cuda.cpp b/examples/operators/alignment_train_cuda.cpp new file mode 100644 index 0000000000..430e048139 --- /dev/null +++ b/examples/operators/alignment_train_cuda.cpp @@ -0,0 +1,31 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "alignment_train_cuda.h" +#include "utils.h" + +namespace { + +void alignmentTrainCUDA( + const torch::Tensor& p_choose, + torch::Tensor& alpha, + float eps) { + CHECK_INPUT(p_choose); + CHECK_INPUT(alpha); + + alignmentTrainCUDAWrapper(p_choose, alpha, eps); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def( + "alignment_train_cuda", + &alignmentTrainCUDA, + "expected_alignment_from_p_choose (CUDA)"); +} + +} // namespace diff --git a/examples/operators/alignment_train_cuda.h b/examples/operators/alignment_train_cuda.h new file mode 100644 index 0000000000..8289d1a690 --- /dev/null +++ b/examples/operators/alignment_train_cuda.h @@ -0,0 +1,16 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include <torch/extension.h> // @manual=//caffe2:torch_extension + +void alignmentTrainCUDAWrapper( + const torch::Tensor& p_choose, + torch::Tensor& alpha, + float eps); diff --git a/examples/operators/alignment_train_kernel.cu b/examples/operators/alignment_train_kernel.cu new file mode 100644 index 0000000000..efae7cc76f --- /dev/null +++ b/examples/operators/alignment_train_kernel.cu @@ -0,0 +1,354 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include <ATen/ATen.h> +#include <ATen/cuda/CUDAContext.h> // @manual=//caffe2/aten:ATen-cu +#include <cuda_runtime.h> +#include <algorithm> // std::min/max +#include <cub/cub.cuh> + +#include "alignment_train_cuda.h" +#include "utils.h" + +namespace { + +// The thread block length in threads along the X dimension +constexpr int BLOCK_DIM_X = 128; +// The thread block length in threads along the Y dimension +constexpr int BLOCK_DIM_Y = 8; +// The thread block length in threads for scan operation +constexpr int SCAN_BLOCK = 512; + +#define gpuErrchk(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } + +inline void +gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) { + if (code != cudaSuccess) { + fprintf( + stderr, + "\nGPUassert: %s %s %d\n", + cudaGetErrorString(code), + file, + line); + if (abort) + exit(code); + } +} + +template <typename T> +struct Prod { + /// prod operator, returns <tt>a * b</tt> + __host__ __device__ __forceinline__ T + operator()(const T& a, const T& b) const { + return a * b; + } +}; + +template <typename T> +struct BlockPrefixProdCallbackOp { + // Running prefix + T running_total; + + // Constructor + __device__ BlockPrefixProdCallbackOp(T running_total) + : running_total(running_total) {} + + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. + __device__ T operator()(const T block_aggregate) { + T old_prefix = running_total; + running_total *= block_aggregate; + return old_prefix; + } +}; + +template <typename T> +struct BlockPrefixSumCallbackOp { + // Running prefix + T running_total; + + // Constructor + __device__ BlockPrefixSumCallbackOp(T running_total) + : running_total(running_total) {} + + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. + __device__ T operator()(const T block_aggregate) { + T old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +template <typename T> +__global__ void oneMinusPKernel( + const T* __restrict__ p_choose, + T* __restrict__ cumprod_1mp, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len) { + for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { + for (uint32_t tgt = threadIdx.y; tgt < tgt_len; tgt += blockDim.y) { + for (uint32_t src = threadIdx.x; src < src_len; src += blockDim.x) { + uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; + cumprod_1mp[idx] = 1 - p_choose[idx]; + } + } + } +} + +template <typename T, int TPB> +__global__ void innermostScanKernel( + T* __restrict__ cumprod_1mp, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len) { + for (uint32_t b = blockIdx.y; b < bsz; b += gridDim.y) { + for (uint32_t tgt = blockIdx.x; tgt < tgt_len; tgt += gridDim.x) { + // Specialize BlockScan for a 1D block of TPB threads on type T + typedef cub::BlockScan<T, TPB> BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + // Initialize running total + BlockPrefixProdCallbackOp<T> prefix_op(1); + + const uint32_t tid = threadIdx.x; + for (uint32_t block_src = 0; block_src < src_len; + block_src += blockDim.x) { + uint32_t src = block_src + tid; + uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; + T thread_data = (src < src_len) ? 
cumprod_1mp[idx] : (T)0; + + // Collectively compute the block-wide inclusive prefix sum + BlockScan(temp_storage) + .ExclusiveScan(thread_data, thread_data, Prod<T>(), prefix_op); + __syncthreads(); + + // write the scanned value to output + if (src < src_len) { + cumprod_1mp[idx] = thread_data; + } + } + } + } +} + +template <typename T> +__global__ void clampKernel( + const T* __restrict__ cumprod_1mp, + T* __restrict__ cumprod_1mp_clamp, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len, + T min_val, + T max_val) { + for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { + for (uint32_t tgt = threadIdx.y; tgt < tgt_len; tgt += blockDim.y) { + for (uint32_t src = threadIdx.x; src < src_len; src += blockDim.x) { + uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; + if (cumprod_1mp[idx] < min_val) { + cumprod_1mp_clamp[idx] = min_val; + } else if (cumprod_1mp[idx] > max_val) { + cumprod_1mp_clamp[idx] = max_val; + } else { + cumprod_1mp_clamp[idx] = cumprod_1mp[idx]; + } + } + } + } +} + +template <typename T> +__global__ void initAlphaCUDAKernel( + T* alpha, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len) { + // alpha[:, 0, 0] = 1.0 + for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { + alpha[b * tgt_len * src_len] = (T)1.0; + } +} + +template <typename T, int TPB> +__global__ void alignmentTrainCUDAKernel( + const T* __restrict__ p_choose, + const T* __restrict__ cumprod_1mp, + const T* __restrict__ cumprod_1mp_clamp, + T* __restrict__ alpha, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len, + uint32_t tgt) { + for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { + // Specialize BlockScan for a 1D block of TPB threads on type T + typedef cub::BlockScan<T, TPB> BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + // Initialize running total + BlockPrefixSumCallbackOp<T> prefix_op(0); + + uint32_t b_offset = b * tgt_len * src_len; + const uint32_t tid = threadIdx.x; + for (uint32_t block_src = 0; block_src < src_len; block_src += blockDim.x) { + uint32_t src = block_src + tid; + // Obtain a segment of consecutive items that are blocked across threads + uint32_t inout_idx, alpha_idx; + if (tgt == 0) { + // both alpha and other input index is [b][0][src] + alpha_idx = b_offset + src; + } else { + // alpha index is [b][tgt-1][src] + alpha_idx = b_offset + (tgt - 1) * src_len + src; + } + inout_idx = b_offset + tgt * src_len + src; + T thread_data = (T)0; + if (src < src_len) { + thread_data = alpha[alpha_idx] / cumprod_1mp_clamp[inout_idx]; + } + + // Collectively compute the block-wide inclusive prefix sum + BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, prefix_op); + __syncthreads(); + + if (src < src_len) { + T out = thread_data * p_choose[inout_idx] * cumprod_1mp[inout_idx]; + // Clamps all elements into the range [ 0, 1.0 ] + alpha[inout_idx] = std::min<T>(std::max<T>(out, 0), (T)1.0); + } + } + } +} + +template <typename T> +void exclusiveCumprod( + const T* p_choose, + T* cumprod_1mp, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len, + uint32_t max_grid_x, + uint32_t max_grid_y, + cudaStream_t& stream) { + // cumprod_1mp = 1 - p_choose + dim3 grid(std::min<T>(max_grid_x, bsz), 1, 1); + dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + oneMinusPKernel<T><<<grid, block, 0, stream>>>( + p_choose, cumprod_1mp, bsz, tgt_len, src_len); + gpuErrchk(cudaGetLastError()); + + // scan on the innermost dimension of cumprod_1mp + // cumprod_1mp = cumprod(cumprod_1mp) + dim3 
grid_scan( + std::min<T>(max_grid_x, tgt_len), std::min<T>(max_grid_y, bsz), 1); + innermostScanKernel<T, SCAN_BLOCK><<<grid_scan, SCAN_BLOCK, 0, stream>>>( + cumprod_1mp, bsz, tgt_len, src_len); + gpuErrchk(cudaGetLastError()); +} + +template <typename T> +void alignmentTrainCUDAImpl( + const T* p_choose, + T* alpha, + uint32_t bsz, + uint32_t tgt_len, + uint32_t src_len, + float eps) { + // p_choose: bsz , tgt_len, src_len + // cumprod_1mp: bsz , tgt_len, src_len + // cumprod_1mp_clamp : bsz, tgt_len, src_len + // alpha: bsz, tgt_len, src_len + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + uint32_t max_grid_x = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + uint32_t max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + + // Implementing exclusive cumprod. + // cumprod_1mp = cumprod(1 - p_choose) + // There is cumprod in pytorch, however there is no exclusive mode. + // cumprod(x) = [x1, x1x2, x2x3x4, ..., prod_{i=1}^n x_i] + // exclusive means + // cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] + uint32_t elements = bsz * tgt_len * src_len; + T* cumprod_1mp; + gpuErrchk(cudaMalloc(&cumprod_1mp, elements * sizeof(T))); + exclusiveCumprod<T>( + p_choose, + cumprod_1mp, + bsz, + tgt_len, + src_len, + max_grid_x, + max_grid_y, + stream); + + // clamp cumprod_1mp to the range [eps, 1.0] + T* cumprod_1mp_clamp; + gpuErrchk(cudaMalloc(&cumprod_1mp_clamp, elements * sizeof(T))); + dim3 grid_clamp(std::min<T>(max_grid_x, bsz), 1, 1); + dim3 block_clamp(BLOCK_DIM_X, BLOCK_DIM_Y, 1); + clampKernel<T><<<grid_clamp, block_clamp, 0, stream>>>( + cumprod_1mp, cumprod_1mp_clamp, bsz, tgt_len, src_len, (T)eps, (T)1.0); + gpuErrchk(cudaGetLastError()); + + // ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) + dim3 grid_init(std::min<int>(max_grid_x, bsz), 1, 1); + initAlphaCUDAKernel<T> + <<<grid_init, 1, 0, stream>>>(alpha, bsz, tgt_len, src_len); + gpuErrchk(cudaGetLastError()); + + const int grid = std::min(bsz, max_grid_x); + + for (uint32_t i = 0; i < tgt_len; i++) { + alignmentTrainCUDAKernel<T, SCAN_BLOCK><<<grid, SCAN_BLOCK, 0, stream>>>( + p_choose, + cumprod_1mp, + cumprod_1mp_clamp, + alpha, + bsz, + tgt_len, + src_len, + i); + gpuErrchk(cudaGetLastError()); + } + + gpuErrchk(cudaFree(cumprod_1mp)); + gpuErrchk(cudaFree(cumprod_1mp_clamp)); +} + +} // namespace + +void alignmentTrainCUDAWrapper( + const torch::Tensor& p_choose, + torch::Tensor& alpha, + float eps) { + // p_choose dimension: bsz, tgt_len, src_len + uint32_t bsz = p_choose.size(0); + uint32_t tgt_len = p_choose.size(1); + uint32_t src_len = p_choose.size(2); + + cudaSetDevice(p_choose.get_device()); + + AT_DISPATCH_FLOATING_TYPES_AND2( + torch::ScalarType::Half, + torch::ScalarType::BFloat16, + p_choose.scalar_type(), + "alignmentTrainCUDAImpl", + [&]() { + alignmentTrainCUDAImpl<scalar_t>( + p_choose.data_ptr<scalar_t>(), + alpha.data_ptr<scalar_t>(), + bsz, + tgt_len, + src_len, + eps); + }); +} diff --git a/examples/operators/utils.h b/examples/operators/utils.h new file mode 100644 index 0000000000..0ef5b4383f --- /dev/null +++ b/examples/operators/utils.h @@ -0,0 +1,19 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include <torch/extension.h> // @manual=//caffe2:torch_extension + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) diff --git a/examples/pay_less_attention_paper/README.md b/examples/pay_less_attention_paper/README.md index 3fb93b23d1..5adab11f4d 100644 --- a/examples/pay_less_attention_paper/README.md +++ b/examples/pay_less_attention_paper/README.md @@ -110,10 +110,10 @@ mkdir -p $SAVE CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \ --clip-norm 0 --optimizer adam --lr 0.0005 \ --source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \ - --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \ + --log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler inverse_sqrt \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \ --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \ -a lightconv_iwslt_de_en --save-dir $SAVE \ @@ -137,10 +137,10 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ --max-update 30000 --share-all-embeddings --optimizer adam \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ - --ddp-backend=no_c10d --max-tokens 3584 \ + --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ + --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ - --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \ + --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 20000 \ --arch lightconv_wmt_en_de_big --save-dir $SAVE \ --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \ @@ -162,10 +162,10 @@ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ --max-update 30000 --share-all-embeddings --optimizer adam \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ - --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ - --ddp-backend=no_c10d --max-tokens 3584 \ + --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ + --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ - --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \ + --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 70000 \ --arch lightconv_wmt_en_fr_big --save-dir $SAVE \ --dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \ diff --git a/examples/pointer_generator/README.xsum.md b/examples/pointer_generator/README.xsum.md index ab288afc0c..ac3a8c3ddc 100644 --- a/examples/pointer_generator/README.xsum.md +++ b/examples/pointer_generator/README.xsum.md @@ -77,7 +77,7 @@ update_freq=4 pointer_layer=-2 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train bin \ - --user-dir examples/pointer_generator/src \ + --user-dir examples/pointer_generator/pointer_generator_src \ --max-tokens 
"$max_tokens" \ --task translation \ --source-lang src --target-lang tgt \ @@ -125,7 +125,7 @@ max_length=60 length_penalty=1.0 fairseq-interactive bin \ - --user-dir examples/pointer_generator/src \ + --user-dir examples/pointer_generator/pointer_generator_src \ --batch-size "$batch_size" \ --task translation \ --source-lang src --target-lang tgt \ diff --git a/examples/pointer_generator/src/__init__.py b/examples/pointer_generator/pointer_generator_src/__init__.py similarity index 100% rename from examples/pointer_generator/src/__init__.py rename to examples/pointer_generator/pointer_generator_src/__init__.py diff --git a/examples/pointer_generator/src/transformer_pg.py b/examples/pointer_generator/pointer_generator_src/transformer_pg.py similarity index 85% rename from examples/pointer_generator/src/transformer_pg.py rename to examples/pointer_generator/pointer_generator_src/transformer_pg.py index 079fdda581..4ccf30f4eb 100644 --- a/examples/pointer_generator/src/transformer_pg.py +++ b/examples/pointer_generator/pointer_generator_src/transformer_pg.py @@ -4,13 +4,12 @@ # LICENSE file in the root directory of this source tree. import logging -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List, Tuple import torch import torch.nn as nn -from fairseq import metrics, utils +from fairseq import utils from fairseq.models import register_model, register_model_architecture -from fairseq.models.fairseq_encoder import EncoderOut from fairseq.models.transformer import ( DEFAULT_MAX_SOURCE_POSITIONS, DEFAULT_MAX_TARGET_POSITIONS, @@ -155,7 +154,13 @@ class TransformerPointerGeneratorEncoder(TransformerEncoder): to the decoder. """ - def forward(self, src_tokens, src_lengths, **kwargs): + def forward( + self, + src_tokens, + src_lengths: Optional[Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[Tensor] = None + ): """ Runs the `forward()` method of the parent Transformer class. Then adds the source tokens into the encoder output tuple. @@ -169,6 +174,10 @@ def forward(self, src_tokens, src_lengths, **kwargs): shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings Returns: namedtuple: @@ -184,15 +193,23 @@ def forward(self, src_tokens, src_lengths, **kwargs): - **src_tokens** (Tensor): input token ids of shape `(batch, src_len)` """ - encoder_out = super().forward(src_tokens, src_lengths, **kwargs) - return EncoderOut( - encoder_out=encoder_out.encoder_out, # T x B x C - encoder_padding_mask=encoder_out.encoder_padding_mask, # B x T - encoder_embedding=encoder_out.encoder_embedding, # B x T x C - encoder_states=encoder_out.encoder_states, # List[T x B x C] - src_tokens=src_tokens, # B x T - src_lengths=None, - ) + encoder_out = self.forward_scriptable(src_tokens, + src_lengths, + return_all_hiddens, + token_embeddings) + + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `forward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. 
+ return { + "encoder_out": encoder_out["encoder_out"], # T x B x C + "encoder_padding_mask": encoder_out["encoder_padding_mask"], # B x T + "encoder_embedding": encoder_out["encoder_embedding"], # B x T x C + "encoder_states": encoder_out["encoder_states"], # List[T x B x C] + "src_tokens": [src_tokens], # B x T + "src_lengths": [], + } class TransformerPointerGeneratorDecoder(TransformerDecoder): @@ -236,7 +253,7 @@ def __init__(self, args, dictionary, embed_tokens): def forward( self, prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, features_only: bool = False, alignment_layer: Optional[int] = 0, @@ -248,8 +265,8 @@ def forward( Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing - encoder_out (EncoderOut, optional): output from the encoder, used - for encoder-side attention + encoder_out (optional): output from the encoder, used for + encoder-side attention incremental_state (dict, optional): dictionary used for storing state during :ref:`Incremental decoding` features_only (bool, optional): only return features without @@ -283,11 +300,22 @@ def forward( prev_output_embed *= self.embed_scale predictors = torch.cat((prev_output_embed, x), 2) p_gens = self.project_p_gens(predictors) - p_gens = torch.sigmoid(p_gens) - x = self.output_layer(x, extra["attn"][0], encoder_out.src_tokens, p_gens) + p_gens = torch.sigmoid(p_gens.float()) + # Torchscript complains if encoder_out or attn are None because + # `output_layer()` signature expects tensors instead + attn: Optional[Tensor] = extra["attn"][0] + assert encoder_out is not None + assert attn is not None + x = self.output_layer(x, attn, encoder_out["src_tokens"][0], p_gens) return x, extra - def output_layer(self, features, attn, src_tokens, p_gens, **kwargs): + def output_layer( + self, + features: Tensor, + attn: Tensor, + src_tokens: Tensor, + p_gens: Tensor + ) -> Tensor: """ Project features to the vocabulary size and mix with the attention distributions. @@ -296,7 +324,10 @@ def output_layer(self, features, attn, src_tokens, p_gens, **kwargs): p_gens = self.force_p_gen # project back to size of vocabulary - logits = super().output_layer(features, **kwargs) + if self.adaptive_softmax is None: + logits = self.output_projection(features) + else: + logits = features batch_size = logits.shape[0] output_length = logits.shape[1] @@ -306,7 +337,7 @@ def output_layer(self, features, attn, src_tokens, p_gens, **kwargs): # The final output distribution will be a mixture of the normal output # distribution (softmax of logits) and attention weights. - gen_dists = super().get_normalized_probs( + gen_dists = self.get_normalized_probs_scriptable( (logits, None), log_probs=False, sample=None ) gen_dists = torch.mul(gen_dists, p_gens) @@ -320,17 +351,22 @@ def output_layer(self, features, attn, src_tokens, p_gens, **kwargs): # vocab_size]. Each attention weight will be written into a location # that is for other dimensions the same as in the index tensor, but for # the third dimension it's the value of the index tensor (the token ID). 
- attn = torch.mul(attn, 1 - p_gens) + attn = torch.mul(attn.float(), 1 - p_gens) index = src_tokens[:, None, :] index = index.expand(batch_size, output_length, src_length) attn_dists_size = (batch_size, output_length, self.num_types) attn_dists = attn.new_zeros(attn_dists_size) - attn_dists.scatter_add_(2, index, attn) + attn_dists.scatter_add_(2, index, attn.float()) # Final distributions, [batch_size, output_length, num_types]. return gen_dists + attn_dists - def get_normalized_probs(self, net_output, log_probs, sample): + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): """ Get normalized probabilities (or log probs) from a net's output. Pointer-generator network output is already normalized. @@ -375,8 +411,19 @@ class Embedding(nn.Embedding): """ __constants__ = ["unk_idx"] - def __init__(self, num_embeddings, embedding_dim, padding_idx, unk_idx): - super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + # Torchscript: Inheriting from Embedding class produces an error when exporting to Torchscript + # -> RuntimeError: Unable to cast Python instance to C++ type (compile in debug mode for details + # It's happening because max_norm attribute from nn.Embedding is None by default and it cannot be + # cast to a C++ type + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int], + unk_idx: int, + max_norm: Optional[float] = float("inf"), + ): + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, max_norm=max_norm) self.unk_idx = unk_idx nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5) nn.init.constant_(self.weight[padding_idx], 0) @@ -385,7 +432,10 @@ def forward(self, input): input = torch.where( input >= self.num_embeddings, torch.ones_like(input) * self.unk_idx, input ) - return super().forward(input) + return nn.functional.embedding( + input, self.weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse + ) @register_model_architecture( diff --git a/examples/quant_noise/README.md b/examples/quant_noise/README.md index 057ea620ab..a04d7e4e8a 100644 --- a/examples/quant_noise/README.md +++ b/examples/quant_noise/README.md @@ -33,7 +33,7 @@ Unlike the section [Iterative Product Quantization](#iterative-product-quantizat #### Training -Scalar quantization with Quant-Noise consists in randomly quantizing a proportion `p` of the weights during training. Scalar quantization is implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar) under the form of Fake Quantization, meaning that we emulate int8 on GPU by quantizing and de-quantizing both the weights and the activations. We rely on PyTorch's [quantization primitives](https://github.com/pytorch/pytorch/tree/master/torch/quantization). +Scalar quantization with Quant-Noise consists in randomly quantizing a proportion `p` of the weights during training. Scalar quantization is implemented [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quantization/scalar) under the form of Fake Quantization, meaning that we emulate int8 on GPU by quantizing and de-quantizing both the weights and the activations. We rely on PyTorch's [quantization primitives](https://github.com/pytorch/pytorch/tree/master/torch/quantization). 
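As a rough illustration of the fake-quantization idea described above, the sketch below quantizes and de-quantizes a random proportion `p` of a weight tensor to emulate int8. It is a hypothetical, minimal example assuming symmetric per-tensor quantization, not the fairseq `quantization/scalar` module linked above (which also quantizes activations and handles gradients with a straight-through estimator).

```python
# Minimal sketch of scalar Quant-Noise (fake quantization) -- an illustration only,
# not the fairseq implementation linked above.
import torch

def fake_quant_noise(weight: torch.Tensor, p: float = 0.1, bits: int = 8) -> torch.Tensor:
    """Quantize-dequantize a random proportion p of the weights to emulate int8."""
    qmax = 2 ** (bits - 1) - 1
    scale = weight.abs().max().clamp(min=1e-8) / qmax      # symmetric per-tensor scale
    qweight = torch.round(weight / scale).clamp(-qmax - 1, qmax) * scale
    mask = torch.rand_like(weight) < p                     # only a proportion p gets the quantization noise
    return torch.where(mask, qweight, weight)

w = torch.randn(4, 4)
print(fake_quant_noise(w, p=0.5))
```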
To train a model with Quant-Noise, add the following flag: ``` @@ -49,7 +49,7 @@ When evaluating a network, all quantized modules and activation hooks automatica #### Integration with your own code Looking to quantize your own models with Quant-Noise + Scalar Quantization? -- Use the function `quantize_model_` implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar/utils.py) to (1) replace all your modules by their quantized counterparts and (2) add hooks to those modules to quantize the activations. +- Use the function `quantize_model_` implemented [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quantization/scalar/utils.py) to (1) replace all your modules by their quantized counterparts and (2) add hooks to those modules to quantize the activations. - Then, perform your training as usual. Note that in `eval()` mode, the network is always fully quantized (weights and activations) by default (`p=1`). @@ -66,12 +66,12 @@ To train a model with Quant-Noise, add the following flags: --quant-noise-pq 0.1 --quant-noise-pq-block-size 8 ``` `quant-noise-pq` controls how much dropout is applied to the blocks of the weight matrix. `quant-noise-pq-block-size` controls the size of the weight matrix blocks. -We recommend training with 0.05 to 0.2 Quant-Noise, a value that worked well in our experiments. For the block-size, we recommend training with block-size of 8. Note that the block size must be a multiple of `input_features`, see the size checks [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py). Large block sizes result in higher compression ratio but may induce a loss in accuracy. +We recommend training with 0.05 to 0.2 Quant-Noise, a value that worked well in our experiments. For the block-size, we recommend training with block-size of 8. Note that the block size must be a multiple of `input_features`, see the size checks [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quant_noise.py). Large block sizes result in higher compression ratio but may induce a loss in accuracy. -We currently support training Transformer based models, such as sequence-to-sequence, language models, and BERT architectures. The `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py) wraps a module. It splits a weight matrix into blocks and applies random dropout to these blocks. +We currently support training Transformer based models, such as sequence-to-sequence, language models, and BERT architectures. The `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quant_noise.py) wraps a module. It splits a weight matrix into blocks and applies random dropout to these blocks. In the Transformer architectures, quant-noise is applied to the input and output embeddings, the attention, and the FFN. -Quant-Noise can also be combined with **LayerDrop** (see [here](https://github.com/pytorch/fairseq/tree/master/examples/layerdrop)) to add its pruning effect to the quantized model and make the model even smaller. We recommend training with LayerDrop 0.1 or 0.2. +Quant-Noise can also be combined with **LayerDrop** (see [here](https://github.com/pytorch/fairseq/tree/main/examples/layerdrop)) to add its pruning effect to the quantized model and make the model even smaller. We recommend training with LayerDrop 0.1 or 0.2. #### Quantization @@ -84,8 +84,8 @@ For the particular case of PQ, quantization is made sequentially. 
We recommend f #### Integration with your own code Looking to quantize your own models with Quant-Noise + iPQ? -- First wrap your modules with the `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py), which is module-agnostic and train your favorite model. -- Then, quantize your trained model using the code [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/pq). This can be done *without any changes to your training loop*. Below is an example code for integration. +- First wrap your modules with the `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quant_noise.py), which is module-agnostic and train your favorite model. +- Then, quantize your trained model using the code [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quantization/pq). This can be done *without any changes to your training loop*. Below is an example code for integration. Note that we tried our approach only on Transformers and various Convolutional Models such as EfficientNets. ```python @@ -128,7 +128,7 @@ We detail below how to reproduce the state-of-the-art results in reported in the ### Training with Quant-Noise -To **train** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta). +To **train** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/roberta). The following command can be used to train a RoBERTa Base + QuantNoise model: ```bash @@ -154,11 +154,11 @@ fairseq-train $DATA_DIR \ --batch-size $MAX_SENTENCES \ --update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \ --save-dir checkpoint/roberta \ - --ddp-backend no_c10d --encoder-layerdrop 0.2 \ + --ddp-backend legacy_ddp --encoder-layerdrop 0.2 \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta ``` -To **finetune** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md). +To **finetune** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/blob/main/examples/roberta/README.glue.md). The following command can be used to finetune a RoBERTa Base + QuantNoise model on the RTE dataset: ```bash @@ -189,11 +189,11 @@ fairseq-train /path/to/rte/data/ \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 ``` -To **train** Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/master/examples/language_model). +To **train** Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model). 
The following command can be used to train a Transformer + QuantNoise model on Wikitext-103: ```bash @@ -205,14 +205,14 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \ --arch transformer_lm_gbw \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --clip-norm 0.1 --criterion adaptive_loss \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \ --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ - --lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 1.0 --t-mult 2.0 \ + --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \ --max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \ --sample-break-mode none --update-freq 3 \ --warmup-init-lr 1e-07 --warmup-updates 16000 \ - --weight-decay 0 --seed 1 --min-lr 1e-09 \ + --weight-decay 0 --seed 1 --stop-min-lr 1e-09 \ --quant-noise-pq 0.05 --quant-noise-pq-block-size 8 ``` @@ -252,7 +252,7 @@ fairseq-train --task sentence_prediction /path/to/data/ \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 --lr-scheduler polynomial_decay \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ - --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \ + --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \ --quantization-config-path /path/to/config/yaml ``` @@ -266,10 +266,10 @@ fairseq-train --task language_modeling /path/to/wikitext-103/data \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \ --clip-norm 0.1 --criterion adaptive_loss \ - --ddp-backend no_c10d \ + --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --fp16 --keep-last-epochs -1 \ - --lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 0.05 --min-lr 1e-09 \ + --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \ --max-tokens 2944 --tokens-per-sample 2944\ --momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \ --sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \ diff --git a/examples/roberta/README.glue.md b/examples/roberta/README.glue.md index 77015d2e2f..4f596d55af 100644 --- a/examples/roberta/README.glue.md +++ b/examples/roberta/README.glue.md @@ -17,54 +17,19 @@ Use `ALL` for preprocessing all the glue tasks. ### 3) Fine-tuning on GLUE task: Example fine-tuning cmd for `RTE` task ```bash -TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16 -WARMUP_UPDATES=122 # 6 percent of the number of updates -LR=2e-05 # Peak LR for polynomial LR scheduler. -NUM_CLASSES=2 -MAX_SENTENCES=16 # Batch size. 
ROBERTA_PATH=/path/to/roberta/model.pt -CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \ - --restore-file $ROBERTA_PATH \ - --max-positions 512 \ - --batch-size $MAX_SENTENCES \ - --max-tokens 4400 \ - --task sentence_prediction \ - --reset-optimizer --reset-dataloader --reset-meters \ - --required-batch-size-multiple 1 \ - --init-token 0 --separator-token 2 \ - --arch roberta_large \ - --criterion sentence_prediction \ - --num-classes $NUM_CLASSES \ - --dropout 0.1 --attention-dropout 0.1 \ - --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ - --clip-norm 0.0 \ - --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ - --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ - --max-epoch 10 \ - --find-unused-parameters \ - --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +CUDA_VISIBLE_DEVICES=0 fairseq-hydra-train --config-dir examples/roberta/config/finetuning --config-name rte \ +task.data=RTE-bin checkpoint.restore_file=$ROBERTA_PATH ``` -For each of the GLUE task, you will need to use following cmd-line arguments: - -Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ---|---|---|---|---|---|---|---|--- -`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1 -`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5 -`--batch-size` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16 -`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598 -`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214 - -For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`. +There are additional config files for each of the GLUE tasks in the `examples/roberta/config/finetuning` directory. **Note:** -a) `--total-num-updates` is used by `--polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=16/32` depending on the task. - -b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--batch-size`. +a) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. -c) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. +b) All the settings in the above config files are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with a wider hyperparam search. ### Inference on GLUE task After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using following python code snippet: diff --git a/examples/roberta/README.md b/examples/roberta/README.md index fdddd5b8d2..ed4d5df52c 100644 --- a/examples/roberta/README.md +++ b/examples/roberta/README.md @@ -8,12 +8,13 @@ RoBERTa iterates on BERT's pretraining procedure, including training the model l ### What's New: +- December 2020: German model (GottBERT) is available: [GottBERT](https://github.com/pytorch/fairseq/tree/main/examples/gottbert).
- January 2020: Italian model (UmBERTo) is available from Musixmatch Research: [UmBERTo](https://github.com/musixmatchresearch/umberto). -- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert). -- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr). +- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/main/examples/camembert). +- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/main/examples/xlmr). - September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers). - August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers). -- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset). +- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/main/examples/roberta/wsc#roberta-training-on-winogrande-dataset). - August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). ## Pre-trained models @@ -276,7 +277,6 @@ print('| Accuracy: ', float(ncorrect)/float(nsamples)) - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md) - [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md) - [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md) -- Finetuning on SQuAD: coming soon ## Pretraining using your own data diff --git a/examples/roberta/README.pretraining.md b/examples/roberta/README.pretraining.md index 8b6e10c08c..a4e7453529 100644 --- a/examples/roberta/README.pretraining.md +++ b/examples/roberta/README.pretraining.md @@ -48,35 +48,21 @@ fairseq-preprocess \ ### 2) Train RoBERTa base ```bash -TOTAL_UPDATES=125000 # Total number of training steps -WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates -PEAK_LR=0.0005 # Peak learning rate, adjust as needed -TOKENS_PER_SAMPLE=512 # Max sequence length -MAX_POSITIONS=512 # Num. positional embeddings (usually same as above) -MAX_SENTENCES=16 # Number of sequences per batch (batch size) -UPDATE_FREQ=16 # Increase the batch size 16x - DATA_DIR=data-bin/wikitext-103 -fairseq-train --fp16 $DATA_DIR \ - --task masked_lm --criterion masked_lm \ - --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \ - --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \ - --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \ - --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ - --batch-size $MAX_SENTENCES --update-freq $UPDATE_FREQ \ - --max-update $TOTAL_UPDATES --log-format simple --log-interval 1 +fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \ +--config-name base task.data=$DATA_DIR ``` **Note:** You can optionally resume training the released RoBERTa base model by -adding `--restore-file /path/to/roberta.base/model.pt`. +adding `checkpoint.restore_file=/path/to/roberta.base/model.pt`. **Note:** The above command assumes training on 8x32GB V100 GPUs. 
Each GPU uses -a batch size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to -further increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size +a batch size of 16 sequences (`dataset.batch_size`) and accumulates gradients to +further increase the batch size by 16x (`optimization.update_freq`), for a total batch size of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need -to reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate. -Alternatively if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly +to reduce `dataset.batch_size` and increase `optimization.update_freq` to compensate. +Alternatively if you have more GPUs you can decrease `optimization.update_freq` accordingly to increase training speed. **Note:** The learning rate and batch size are tightly connected and need to be diff --git a/examples/roberta/README.race.md b/examples/roberta/README.race.md index 527a0bce14..13c917e8ec 100644 --- a/examples/roberta/README.race.md +++ b/examples/roberta/README.race.md @@ -19,7 +19,7 @@ UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. DATA_DIR=/path/to/race-output-dir ROBERTA_PATH=/path/to/roberta/model.pt -CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=no_c10d \ +CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ diff --git a/examples/roberta/commonsense_qa/README.md b/examples/roberta/commonsense_qa/README.md index 4f371f8b30..7f386decd8 100644 --- a/examples/roberta/commonsense_qa/README.md +++ b/examples/roberta/commonsense_qa/README.md @@ -39,7 +39,7 @@ DATA_DIR=data/CommonsenseQA FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa -CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \ +CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \ $DATA_DIR \ --user-dir $FAIRSEQ_USER_DIR \ --restore-file $ROBERTA_PATH \ @@ -96,4 +96,4 @@ print('Accuracy: ' + str(ncorrect / float(nsamples))) ``` The above snippet is not batched, which makes it quite slow. See [instructions -for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta#batched-prediction). +for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta#batched-prediction). diff --git a/examples/roberta/commonsense_qa/commonsense_qa_task.py b/examples/roberta/commonsense_qa/commonsense_qa_task.py index 216093f708..7d8f8131b3 100644 --- a/examples/roberta/commonsense_qa/commonsense_qa_task.py +++ b/examples/roberta/commonsense_qa/commonsense_qa_task.py @@ -169,7 +169,7 @@ def binarize(s, append_bos=False): self.datasets[split] = dataset return self.datasets[split] - def build_model(self, args): + def build_model(self, args, from_checkpoint=False): from fairseq import models model = models.build_model(args, self) diff --git a/examples/roberta/config/finetuning/cola.yaml b/examples/roberta/config/finetuning/cola.yaml new file mode 100644 index 0000000000..ac76611201 --- /dev/null +++ b/examples/roberta/config/finetuning/cola.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ???
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/mnli.yaml b/examples/roberta/config/finetuning/mnli.yaml new file mode 100644 index 0000000000..5be10c362f --- /dev/null +++ b/examples/roberta/config/finetuning/mnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/mrpc.yaml b/examples/roberta/config/finetuning/mrpc.yaml new file mode 100644 index 0000000000..aa8b7db393 --- /dev/null +++ b/examples/roberta/config/finetuning/mrpc.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 137 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 2296 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/qnli.yaml b/examples/roberta/config/finetuning/qnli.yaml new file mode 100644 index 0000000000..b4595b090e --- /dev/null +++ b/examples/roberta/config/finetuning/qnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/qqp.yaml b/examples/roberta/config/finetuning/qqp.yaml new file mode 100644 index 0000000000..5a2b2ed743 --- /dev/null +++ b/examples/roberta/config/finetuning/qqp.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/rte.yaml b/examples/roberta/config/finetuning/rte.yaml new file mode 100644 index 0000000000..7318465011 --- /dev/null +++ b/examples/roberta/config/finetuning/rte.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/run_config/local.yaml b/examples/roberta/config/finetuning/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/roberta/config/finetuning/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/roberta/config/finetuning/run_config/slurm_1g.yaml b/examples/roberta/config/finetuning/run_config/slurm_1g.yaml new file mode 100644 index 0000000000..8bc21854d4 --- /dev/null +++ b/examples/roberta/config/finetuning/run_config/slurm_1g.yaml @@ -0,0 +1,28 @@ + +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/roberta_ft/${env:PREFIX}/${hydra.job.config_name}/${env:SUFFIX} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 1000 + cpus_per_task: 8 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 60 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 + exclude: learnfair1381,learnfair5192,learnfair2304 diff --git a/examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml b/examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml new file mode 100644 index 
0000000000..085391cffa --- /dev/null +++ b/examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml @@ -0,0 +1,25 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /fsx-wav2vec/${env:USER}/roberta_ft/${env:PREFIX}/${hydra.job.config_name}/${env:SUFFIX} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 1000 + cpus_per_task: 8 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: learnfair,wav2vec + max_num_timeout: 30 diff --git a/examples/roberta/config/finetuning/sst_2.yaml b/examples/roberta/config/finetuning/sst_2.yaml new file mode 100644 index 0000000000..a93ad2f22c --- /dev/null +++ b/examples/roberta/config/finetuning/sst_2.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/finetuning/sts_b.yaml b/examples/roberta/config/finetuning/sts_b.yaml new file mode 100644 index 0000000000..2d495221ad --- /dev/null +++ b/examples/roberta/config/finetuning/sts_b.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + regression_target: true + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 214 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/pretraining/base.yaml b/examples/roberta/config/pretraining/base.yaml new file mode 100644 index 0000000000..97829908f7 --- /dev/null +++ b/examples/roberta/config/pretraining/base.yaml @@ -0,0 +1,42 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + +task: + _name: masked_lm + data: ??? 
+ sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + +dataset: + batch_size: 16 + ignore_unused_valid_subsets: true + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 0 + lr: [0.0005] + max_update: 125000 + update_freq: [16] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 diff --git a/examples/roberta/config/pretraining/run_config/local.yaml b/examples/roberta/config/pretraining/run_config/local.yaml new file mode 100644 index 0000000000..45595f9eea --- /dev/null +++ b/examples/roberta/config/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/examples/roberta/config/pretraining/run_config/slurm_2.yaml b/examples/roberta/config/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000..006a0f2116 --- /dev/null +++ b/examples/roberta/config/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml b/examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..a5937ea5a8 --- /dev/null +++ b/examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,39 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - model.model_path + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 diff --git a/examples/roberta/config/pretraining/run_config/slurm_3.yaml 
b/examples/roberta/config/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000..0e1555d20f --- /dev/null +++ b/examples/roberta/config/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/roberta/config/pretraining/run_config/slurm_4.yaml b/examples/roberta/config/pretraining/run_config/slurm_4.yaml new file mode 100644 index 0000000000..c54d735fb2 --- /dev/null +++ b/examples/roberta/config/pretraining/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/examples/roberta/fb_multilingual/README.multilingual.pretraining.md b/examples/roberta/fb_multilingual/README.multilingual.pretraining.md new file mode 100644 index 0000000000..234fd74708 --- /dev/null +++ b/examples/roberta/fb_multilingual/README.multilingual.pretraining.md @@ -0,0 +1,26 @@ +# Multilingual pretraining RoBERTa + +This tutorial will walk you through pretraining multilingual RoBERTa. + +### 1) Preprocess the data + +```bash +DICTIONARY="/private/home/namangoyal/dataset/XLM/wiki/17/175k/vocab" +DATA_LOCATION="/private/home/namangoyal/dataset/XLM/wiki/17/175k" + +for LANG in en es it +do + fairseq-preprocess \ + --only-source \ + --srcdict $DICTIONARY \ + --trainpref "$DATA_LOCATION/train.$LANG" \ + --validpref "$DATA_LOCATION/valid.$LANG" \ + --testpref "$DATA_LOCATION/test.$LANG" \ + --destdir "wiki_17-bin/$LANG" \ + --workers 60; +done +``` + +### 2) Train RoBERTa base + +[COMING UP...] 
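The finetuning and pretraining YAML files added above all follow the same Hydra layout (task, checkpoint, dataset, optimization, model, and so on), with `???` marking mandatory values that must be supplied on the command line. As a minimal sketch (not part of the fairseq CLI itself), assuming `omegaconf` is available, which fairseq's Hydra support depends on, and that the script runs from the repository root, one could preview a resolved config before launching `fairseq-hydra-train`:

```python
# Illustrative sketch: load one of the finetuning configs added above and
# preview its resolved values. The paths and filled-in values below mirror the
# RTE example from the GLUE README and are placeholders, not fixed defaults.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/roberta/config/finetuning/rte.yaml")

# '???' entries are mandatory and are normally supplied on the command line
# (task.data=RTE-bin checkpoint.restore_file=...); fill them here to inspect.
cfg.task.data = "RTE-bin"
cfg.checkpoint.restore_file = "/path/to/roberta/model.pt"

print(OmegaConf.to_yaml(cfg))  # the full config as it will be consumed
print(cfg.optimization.lr, cfg.dataset.batch_size, cfg.optimization.max_update)
```

The same pattern applies to the other task configs (`cola.yaml`, `mnli.yaml`, `sst_2.yaml`, ...), which differ mainly in task-specific values such as number of classes, batch size, learning rate, warmup, and update counts.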
diff --git a/examples/roberta/wsc/README.md b/examples/roberta/wsc/README.md index d40da6a5fd..21a045d999 100644 --- a/examples/roberta/wsc/README.md +++ b/examples/roberta/wsc/README.md @@ -51,7 +51,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ - --fp16 --ddp-backend no_c10d \ + --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task wsc --criterion wsc --wsc-cross-entropy \ --arch roberta_large --bpe gpt2 --max-positions 512 \ @@ -110,7 +110,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ - --fp16 --ddp-backend no_c10d \ + --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task winogrande --criterion winogrande \ --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \ diff --git a/examples/rxf/README.md b/examples/rxf/README.md index a09de63d33..22a1cc47df 100644 --- a/examples/rxf/README.md +++ b/examples/rxf/README.md @@ -38,7 +38,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --noise-type uniform --r3f-lambda 0.7 \ - --user-dir examples/rxf; + --user-dir examples/rxf/rxf_src ``` ## Citation diff --git a/examples/rxf/__init__.py b/examples/rxf/__init__.py index 63453f9333..b24cb6b797 100644 --- a/examples/rxf/__init__.py +++ b/examples/rxf/__init__.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import src # noqa +from . 
import rxf_src # noqa diff --git a/examples/rxf/src/__init__.py b/examples/rxf/rxf_src/__init__.py similarity index 100% rename from examples/rxf/src/__init__.py rename to examples/rxf/rxf_src/__init__.py diff --git a/examples/rxf/src/label_smoothed_cross_entropy_r3f.py b/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py similarity index 98% rename from examples/rxf/src/label_smoothed_cross_entropy_r3f.py rename to examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py index 079db13e61..6191fd55ac 100644 --- a/examples/rxf/src/label_smoothed_cross_entropy_r3f.py +++ b/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py @@ -7,7 +7,8 @@ import torch import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss diff --git a/examples/rxf/src/sentence_prediction_r3f.py b/examples/rxf/rxf_src/sentence_prediction_r3f.py similarity index 98% rename from examples/rxf/src/sentence_prediction_r3f.py rename to examples/rxf/rxf_src/sentence_prediction_r3f.py index 62dd63390c..6ecffd6b14 100644 --- a/examples/rxf/src/sentence_prediction_r3f.py +++ b/examples/rxf/rxf_src/sentence_prediction_r3f.py @@ -52,6 +52,7 @@ def add_args(parser): parser.add_argument('--classification-head-name', default='sentence_classification_head', help='name of the classification head to use') + parser.add_argument('--regression-target', action='store_true') # fmt: on def _get_symm_kl(self, noised_logits, input_logits): diff --git a/examples/shuffled_word_order/README.finetuning.md b/examples/shuffled_word_order/README.finetuning.md new file mode 100644 index 0000000000..ecbcb65884 --- /dev/null +++ b/examples/shuffled_word_order/README.finetuning.md @@ -0,0 +1,135 @@ +# Fine-tuning details + +For each task (GLUE and PAWS), we perform hyperparam search for each model, and report the mean and standard deviation across 5 seeds of the best model. First, get the datasets following the instructions in [RoBERTa fine-tuning README](../roberta/README.glue.md). Alternatively, you can use [huggingface datasets](https://huggingface.co/docs/datasets/) to get the task data: + +```python +from datasets import load_dataset +import pandas as pd +from pathlib import Path + +key2file = { +"paws": { + "loc": "paws_data", + "columns": ["id", "sentence1", "sentence2", "label"], + "train": "train.tsv", + "validation": "dev.tsv", + "test": "test.tsv" + } +} + +task_data = load_dataset("paws", "labeled_final") +task_config = key2file["paws"] +save_path = Path(task_config["loc"]) +save_path.mkdir(exist_ok=True, parents=True) +for key, fl in task_config.items(): + if key in ["loc", "columns"]: + continue + print(f"Reading {key}") + columns = task_config["columns"] + df = pd.DataFrame(task_data[key]) + print(df.columns) + df = df[columns] + print(f"Got {len(df)} records") + save_loc = save_path / fl + print(f"Saving to : {save_loc}") + df.to_csv(save_loc, sep="\t", header=None, index=None) + +``` + +- Preprocess using RoBERTa GLUE preprocessing script, while keeping in mind the column numbers for `sentence1`, `sentence2` and `label` (which is 0,1,2 if you save the data according to the above example.) 
+- Then, fine-tuning is performed similarly to RoBERTa (for example, in case of RTE): + +```bash +TOTAL_NUM_UPDATES=30875 # 10 epochs through RTE for bsz 16 +WARMUP_UPDATES=1852 # 6 percent of the number of updates +LR=2e-05 # Peak LR for polynomial LR scheduler. +NUM_CLASSES=2 +MAX_SENTENCES=16 # Batch size. +SHUFFLED_ROBERTA_PATH=/path/to/shuffled_roberta/model.pt + +CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \ + --restore-file $SHUFFLED_ROBERTA_PATH \ + --max-positions 512 \ + --batch-size $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_prediction \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --find-unused-parameters \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +``` + +- `TOTAL_NUM_UPDATES` is computed based on the `--batch_size` value and the dataset size. +- `WARMUP_UPDATES` is computed as 6% of `TOTAL_NUM_UPDATES` +- Best hyperparam of `--lr` and `--batch_size` is reported below: + +## `--lr` + +| | name | RTE | MRPC | SST-2 | CoLA | QQP | QNLI | MNLI | PAWS | +| --: | :----------- | ----: | ----: | ----: | ----: | ----: | ----: | ----: | ----: | +| 0 | original | 2e-05 | 2e-05 | 1e-05 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 2e-05 | +| 1 | n_1 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | +| 2 | n_2 | 2e-05 | 2e-05 | 1e-05 | 1e-05 | 2e-05 | 1e-05 | 1e-05 | 3e-05 | +| 3 | n_3 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | 3e-05 | 1e-05 | 1e-05 | 2e-05 | +| 4 | n_4 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | 2e-05 | 1e-05 | 1e-05 | 2e-05 | +| 5 | r512 | 1e-05 | 3e-05 | 2e-05 | 2e-05 | 3e-05 | 2e-05 | 3e-05 | 2e-05 | +| 6 | rand_corpus | 2e-05 | 1e-05 | 3e-05 | 1e-05 | 3e-05 | 3e-05 | 3e-05 | 2e-05 | +| 7 | rand_uniform | 2e-05 | 1e-05 | 3e-05 | 2e-05 | 3e-05 | 3e-05 | 3e-05 | 1e-05 | +| 8 | rand_init | 1e-05 | 1e-05 | 3e-05 | 1e-05 | 1e-05 | 1e-05 | 2e-05 | 1e-05 | +| 9 | no_pos | 1e-05 | 3e-05 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 1e-05 | 1e-05 | + +## `--batch_size` + +| | name | RTE | MRPC | SST-2 | CoLA | QQP | QNLI | MNLI | PAWS | +| --: | :----------- | --: | ---: | ----: | ---: | --: | ---: | ---: | ---: | +| 0 | orig | 16 | 16 | 32 | 16 | 16 | 32 | 32 | 16 | +| 1 | n_1 | 32 | 32 | 16 | 32 | 32 | 16 | 32 | 16 | +| 2 | n_2 | 32 | 16 | 32 | 16 | 32 | 32 | 16 | 32 | +| 3 | n_3 | 32 | 32 | 16 | 32 | 32 | 16 | 32 | 32 | +| 4 | n_4 | 32 | 16 | 32 | 16 | 32 | 32 | 32 | 32 | +| 5 | r512 | 32 | 16 | 16 | 32 | 32 | 16 | 16 | 16 | +| 6 | rand_corpus | 16 | 16 | 16 | 16 | 32 | 16 | 16 | 32 | +| 7 | rand_uniform | 16 | 32 | 16 | 16 | 32 | 16 | 16 | 16 | +| 8 | rand_init | 16 | 16 | 32 | 16 | 16 | 16 | 32 | 16 | +| 9 | no_pos | 16 | 32 | 16 | 16 | 32 | 16 | 16 | 16 | + +- Perform inference similar to RoBERTa as well: + +```python +from fairseq.models.roberta import RobertaModel + +roberta = RobertaModel.from_pretrained( + 'checkpoints/', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='PAWS-bin' +) + +label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] +) 
+ncorrect, nsamples = 0, 0 +roberta.cuda() +roberta.eval() +with open('paws_data/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[0], tokens[1], tokens[2] + tokens = roberta.encode(sent1, sent2) + prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) + +``` diff --git a/examples/shuffled_word_order/README.md b/examples/shuffled_word_order/README.md new file mode 100644 index 0000000000..6ce0b3927d --- /dev/null +++ b/examples/shuffled_word_order/README.md @@ -0,0 +1,94 @@ +# Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little + +[https://arxiv.org/abs/2104.06644](https://arxiv.org/abs/2104.06644) + +## Introduction + +In this work, we pre-train [RoBERTa](../roberta) base on various word shuffled variants of BookWiki corpus (16GB). We observe that a word shuffled pre-trained model achieves surprisingly good scores on GLUE, PAWS and several parametric probing tasks. Please read our paper for more details on the experiments. + +## Pre-trained models + +| Model | Description | Download | +| ------------------------------------- | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| `roberta.base.orig` | RoBERTa (base) trained on natural corpus | [roberta.base.orig.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.orig.tar.gz) | +| `roberta.base.shuffle.n1` | RoBERTa (base) trained on n=1 gram sentence word shuffled data | [roberta.base.shuffle.n1.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.tar.gz) | +| `roberta.base.shuffle.n2` | RoBERTa (base) trained on n=2 gram sentence word shuffled data | [roberta.base.shuffle.n2.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n2.tar.gz) | +| `roberta.base.shuffle.n3` | RoBERTa (base) trained on n=3 gram sentence word shuffled data | [roberta.base.shuffle.n3.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n3.tar.gz) | +| `roberta.base.shuffle.n4` | RoBERTa (base) trained on n=4 gram sentence word shuffled data | [roberta.base.shuffle.n4.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n4.tar.gz) | +| `roberta.base.shuffle.512` | RoBERTa (base) trained on unigram 512 word block shuffled data | [roberta.base.shuffle.512.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.512.tar.gz) | +| `roberta.base.shuffle.corpus` | RoBERTa (base) trained on unigram corpus word shuffled data | [roberta.base.shuffle.corpus.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus.tar.gz) | +| `roberta.base.shuffle.corpus_uniform` | RoBERTa (base) trained on unigram corpus word shuffled data, where all words are uniformly sampled | [roberta.base.shuffle.corpus_uniform.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus_uniform.tar.gz) | +| `roberta.base.nopos` | RoBERTa (base) without positional embeddings, trained on natural corpus | 
[roberta.base.nopos.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.nopos.tar.gz) | + +## Results + +[GLUE (Wang et al, 2019)](https://gluebenchmark.com/) & [PAWS (Zhang et al, 2019)](https://github.com/google-research-datasets/paws) _(dev set, single model, single-task fine-tuning, median of 5 seeds)_ + +| name | CoLA | MNLI | MRPC | PAWS | QNLI | QQP | RTE | SST-2 | +| :----------------------------------- | ----: | ----: | ----: | ----: | ----: | ----: | ----: | ----: | +| `roberta.base.orig` | 61.4 | 86.11 | 89.19 | 94.46 | 92.53 | 91.26 | 74.64 | 93.92 | +| `roberta.base.shuffle.n1` | 35.15 | 82.64 | 86 | 89.97 | 89.02 | 91.01 | 69.02 | 90.47 | +| `roberta.base.shuffle.n2` | 54.37 | 83.43 | 86.24 | 93.46 | 90.44 | 91.36 | 70.83 | 91.79 | +| `roberta.base.shuffle.n3` | 48.72 | 83.85 | 86.36 | 94.05 | 91.69 | 91.24 | 70.65 | 92.02 | +| `roberta.base.shuffle.n4` | 58.64 | 83.77 | 86.98 | 94.32 | 91.69 | 91.4 | 70.83 | 92.48 | +| `roberta.base.shuffle.512` | 12.76 | 77.52 | 79.61 | 84.77 | 85.19 | 90.2 | 56.52 | 86.34 | +| `roberta.base.shuffle.corpus` | 0 | 71.9 | 70.52 | 58.52 | 71.11 | 85.52 | 53.99 | 83.35 | +| `roberta.base.shuffle.corpus_random` | 9.19 | 72.33 | 70.76 | 58.42 | 77.76 | 85.93 | 53.99 | 84.04 | +| `roberta.base.nopos` | 0 | 63.5 | 72.73 | 57.08 | 77.72 | 87.87 | 54.35 | 83.24 | + +For more results on probing tasks, please refer to [our paper](https://arxiv.org/abs/2104.06644). + +## Example Usage + +Follow the same usage as in [RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta) to load and test your models: + +```python +# Download roberta.base.shuffle.n1 model +wget https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.tar.gz +tar -xzvf roberta.base.shuffle.n1.tar.gz +# Copy the dictionary files +cd roberta.base.shuffle.n1 +wget -O dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt && wget -O encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json && wget -O vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe +cd .. + +# Load the model in fairseq +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained('/path/to/roberta.base.shuffle.n1', checkpoint_file='model.pt') +roberta.eval() # disable dropout (or leave in train mode to finetune) +``` + +We have also provided a [Google Colab](https://colab.research.google.com/drive/1IJDVfNVWdvRfLjphQKBGzmob84t-OXpm) notebook to demonstrate the loading of the model. The models were trained on top of Fairseq from the following commit: [62cff008ebeeed855093837507d5e6bf52065ee6](https://github.com/pytorch/fairseq/commit/62cff008ebeeed855093837507d5e6bf52065ee6). + +**Note**: The model trained without positional embeddings (`roberta.base.nopos`) is a modified `RoBERTa` model, where the positional embeddings are not used. Thus, the typical `from_pretrained` method on the fairseq version of RoBERTa will not be able to load the above model weights. To do so, construct a new `RoBERTaModel` object by setting the flag `use_positional_embeddings` to `False` (or [in the latest code](https://github.com/pytorch/fairseq/blob/main/fairseq/models/roberta/model.py#L543), set `no_token_positional_embeddings` to `True`), and then load the individual weights. + +## Fine-tuning Evaluation + +We provide the trained fine-tuned models on MNLI here for each model above for quick evaluation (1 seed for each model).
Please refer to [finetuning details](README.finetuning.md) for the parameters of these models. Follow [RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta) instructions to evaluate these models. + +| Model | MNLI M Dev Accuracy | Link | +| :----------------------------------------- | :------------------ | :--------------------------------------------------------------------------------------------------------------- | +| `roberta.base.orig.mnli` | 86.14 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.orig.mnli.tar.gz) | +| `roberta.base.shuffle.n1.mnli` | 82.55 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.mnli.tar.gz) | +| `roberta.base.shuffle.n2.mnli` | 83.21 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n2.mnli.tar.gz) | +| `roberta.base.shuffle.n3.mnli` | 83.89 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n3.mnli.tar.gz) | +| `roberta.base.shuffle.n4.mnli` | 84.00 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n4.mnli.tar.gz) | +| `roberta.base.shuffle.512.mnli` | 77.22 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.512.mnli.tar.gz) | +| `roberta.base.shuffle.corpus.mnli` | 71.88 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus.mnli.tar.gz) | +| `roberta.base.shuffle.corpus_uniform.mnli` | 72.46 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus_uniform.mnli.tar.gz) | + +## Citation + +```bibtex +@misc{sinha2021masked, + title={Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little}, + author={Koustuv Sinha and Robin Jia and Dieuwke Hupkes and Joelle Pineau and Adina Williams and Douwe Kiela}, + year={2021}, + eprint={2104.06644}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +## Contact + +For questions and comments, please reach out to Koustuv Sinha (koustuv.sinha@mail.mcgill.ca). 
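As a quick, hypothetical sanity check (not part of the paper's evaluation suite), one can contrast the original checkpoint with one of the shuffled variants on a masked-token query, reusing the `from_pretrained` loading shown in the Example Usage section; the paths below are placeholders for wherever the archives from the table above were extracted:

```python
# Sketch: compare masked-token predictions of the original and n=1 shuffled models.
# Assumes both tarballs have been downloaded, extracted, and given the GPT-2 BPE
# dict.txt/encoder.json/vocab.bpe files as described in "Example Usage" above.
from fairseq.models.roberta import RobertaModel

orig = RobertaModel.from_pretrained("/path/to/roberta.base.orig", checkpoint_file="model.pt")
shuf = RobertaModel.from_pretrained("/path/to/roberta.base.shuffle.n1", checkpoint_file="model.pt")

for name, model in [("orig", orig), ("shuffle.n1", shuf)]:
    model.eval()  # disable dropout
    # fill_mask returns the top-k completions with their scores
    print(name, model.fill_mask("The capital of France is <mask>.", topk=3))
```

For benchmark numbers, use the fine-tuned MNLI checkpoints listed above rather than this interactive check.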
diff --git a/examples/simultaneous_translation/README.md b/examples/simultaneous_translation/README.md index e27b65280e..62a005e0ec 100644 --- a/examples/simultaneous_translation/README.md +++ b/examples/simultaneous_translation/README.md @@ -1,106 +1,5 @@ -# Simultaneous Machine Translation - -This directory contains the code for the paper [Monotonic Multihead Attention](https://openreview.net/forum?id=Hyg96gBKPS) - -## Prepare Data - -[Please follow the instructions to download and preprocess the WMT'15 En-De dataset.](https://github.com/pytorch/fairseq/tree/simulastsharedtask/examples/translation#prepare-wmt14en2desh) - -## Training - -- MMA-IL - -```shell -fairseq-train \ - data-bin/wmt15_en_de_32k \ - --simul-type infinite_lookback \ - --user-dir $FAIRSEQ/example/simultaneous_translation \ - --mass-preservation \ - --criterion latency_augmented_label_smoothed_cross_entropy \ - --latency-weight-avg 0.1 \ - --max-update 50000 \ - --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ - --optimizer adam --adam-betas '(0.9, 0.98)' \ - --lr-scheduler 'inverse_sqrt' \ - --warmup-init-lr 1e-7 --warmup-updates 4000 \ - --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ - --dropout 0.3 \ - --label-smoothing 0.1\ - --max-tokens 3584 -``` - -- MMA-H - -```shell -fairseq-train \ - data-bin/wmt15_en_de_32k \ - --simul-type hard_aligned \ - --user-dir $FAIRSEQ/example/simultaneous_translation \ - --mass-preservation \ - --criterion latency_augmented_label_smoothed_cross_entropy \ - --latency-weight-var 0.1 \ - --max-update 50000 \ - --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ - --optimizer adam --adam-betas '(0.9, 0.98)' \ - --lr-scheduler 'inverse_sqrt' \ - --warmup-init-lr 1e-7 --warmup-updates 4000 \ - --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ - --dropout 0.3 \ - --label-smoothing 0.1\ - --max-tokens 3584 -``` - -- wait-k - -```shell -fairseq-train \ - data-bin/wmt15_en_de_32k \ - --simul-type wait-k \ - --waitk-lagging 3 \ - --user-dir $FAIRSEQ/example/simultaneous_translation \ - --mass-preservation \ - --criterion latency_augmented_label_smoothed_cross_entropy \ - --max-update 50000 \ - --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ - --optimizer adam --adam-betas '(0.9, 0.98)' \ - --lr-scheduler 'inverse_sqrt' \ - --warmup-init-lr 1e-7 --warmup-updates 4000 \ - --lr 5e-4 --min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ - --dropout 0.3 \ - --label-smoothing 0.1\ - --max-tokens 3584 -``` - - -## Evaluation - -More details on evaluation can be found [here](https://github.com/pytorch/fairseq/blob/simulastsharedtask/examples/simultaneous_translation/docs/evaluation.md) - -### Start the server - -```shell -python ./eval/server.py \ - --src-file $SRC_FILE \ - --ref-file $TGT_FILE -``` - -### Run the client - -```shell -python ./evaluate.py \ - --data-bin data-bin/wmt15_en_de_32k \ - --model-path ./checkpoints/checkpoint_best.pt - --scores --output $RESULT_DIR -``` - -### Run evaluation locally without server - -```shell -python ./eval/evaluate.py - --local \ - --src-file $SRC_FILE \ - --tgt-file $TGT_FILE \ - --data-bin data-bin/wmt15_en_de_32k \ - --model-path ./checkpoints/checkpoint_best.pt \ - --scores --output $RESULT_DIR -``` +# Simultaneous Translation +Examples of simultaneous translation in fairseq +- [English-to-Japanese text-to-text wait-k model](docs/enja-waitk.md) +- [English-to-German text-to-text monotonic multihead attention model](docs/ende-mma.md) +- [English-to-German speech-to-text simultaneous
translation model](../speech_to_text/docs/simulst_mustc_example.md) diff --git a/examples/simultaneous_translation/__init__.py b/examples/simultaneous_translation/__init__.py index 446fc86c8a..5835316ba9 100644 --- a/examples/simultaneous_translation/__init__.py +++ b/examples/simultaneous_translation/__init__.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import criterions, eval, models # noqa +from . import models # noqa diff --git a/examples/simultaneous_translation/criterions/label_smoothed_cross_entropy_latency_augmented.py b/examples/simultaneous_translation/criterions/label_smoothed_cross_entropy_latency_augmented.py deleted file mode 100644 index b3c8f6d53f..0000000000 --- a/examples/simultaneous_translation/criterions/label_smoothed_cross_entropy_latency_augmented.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from examples.simultaneous_translation.utils.latency import LatencyTraining -from fairseq.criterions import register_criterion -from fairseq.criterions.label_smoothed_cross_entropy import ( - LabelSmoothedCrossEntropyCriterion, -) - - -@register_criterion("latency_augmented_label_smoothed_cross_entropy") -class LatencyAugmentedLabelSmoothedCrossEntropyCriterion( - LabelSmoothedCrossEntropyCriterion -): - def __init__(self, args, task): - super().__init__(args, task) - self.eps = args.label_smoothing - self.latency_weight_avg = args.latency_weight_avg - self.latency_weight_avg_type = args.latency_weight_avg_type - self.latency_weight_var = args.latency_weight_var - self.latency_weight_var_type = args.latency_weight_var_type - self.mass_preservation = args.mass_preservation - self.average_method = args.average_method - self.latency_train = LatencyTraining( - self.latency_weight_avg, - self.latency_weight_var, - self.latency_weight_avg_type, - self.latency_weight_var_type, - self.mass_preservation, - self.average_method, - ) - - @staticmethod - def add_args(parser): - super( - LatencyAugmentedLabelSmoothedCrossEntropyCriterion, - LatencyAugmentedLabelSmoothedCrossEntropyCriterion, - ).add_args(parser) - """Add criterion-specific arguments to the parser.""" - # fmt: off - parser.add_argument("--latency-weight-avg", default=0., type=float, metavar='D', - help="Average loss weight") - parser.add_argument("--latency-weight-var", default=0., type=float, metavar='D', - help="Variance loss weight") - parser.add_argument("--latency-weight-avg-type", default="differentiable_average_lagging", - help="Statistics for Average loss type") - parser.add_argument("--latency-weight-var-type", default="variance_delay", - help="Statistics for variance loss type") - parser.add_argument("--average-method", default="weighted_average", - help="Average loss type") - # fmt: on - - def compute_loss(self, model, net_output, sample, reduce=True): - # Compute cross entropy loss first - loss, nll_loss = super().compute_loss(model, net_output, sample, reduce) - - # Obtain the expected alignment - attn_list = [item["alpha"] for item in net_output[-1]["attn_list"]] - - target_padding_mask = model.get_targets(sample, net_output).eq(self.padding_idx) - - source_padding_mask = net_output[-1].get("encoder_padding_mask", None) - - # Get latency loss - latency_loss = self.latency_train.loss( - attn_list, source_padding_mask, target_padding_mask - 
) - - loss += latency_loss - - return loss, nll_loss diff --git a/examples/simultaneous_translation/docs/baseline.md b/examples/simultaneous_translation/docs/baseline.md deleted file mode 100644 index d9bf1a1117..0000000000 --- a/examples/simultaneous_translation/docs/baseline.md +++ /dev/null @@ -1,178 +0,0 @@ -# **Baseline Simultaneous Translation** ---- - -This is an instruction of training and evaluating a *wait-k* simultanoes LSTM model on MUST-C English-Gernam Dataset. - -[STACL: Simultaneous Translation with Implicit Anticipation and Controllable Latency using Prefix-to-Prefix Framework](https://https://www.aclweb.org/anthology/P19-1289/) - - -## **Requirements** -Install fairseq (make sure to use the correct branch): -``` -git clone --branch simulastsharedtask git@github.com:pytorch/fairseq.git -cd fairseq -pip install -e . -``` - -Assuming that fairseq is installed in a directory called `FAIRSEQ`. - -Install SentencePiece. One easy way is to use anaconda: - -``` -conda install -c powerai sentencepiece -``` - -Download the MuST-C data for English-German available at https://ict.fbk.eu/must-c/. -We will assume that the data is downloaded in a directory called `DATA_ROOT`. - - -## **Text-to-text Model** ---- -### Data Preparation -Train a SentencePiece model: -```shell -for lang in en de; do - python $FAIRSEQ/examples/simultaneous_translation/data/train_spm.py \ - --data-path $DATA_ROOT/data \ - --vocab-size 10000 \ - --max-frame 3000 \ - --model-type unigram \ - --lang $lang \ - --out-path . -``` - -Process the data with the SentencePiece model: -```shell -proc_dir=proc -mkdir -p $proc_dir -for split in train dev tst-COMMON tst-HE; do - for lang in en de; do - spm_encode \ - --model unigram-$lang-10000-3000/spm.model \ - < $DATA_ROOT/data/$split/txt/$split.$lang \ - > $proc_dir/$split.spm.$lang - done -done -``` - -Binarize the data: - -```shell -proc_dir=proc -fairseq-preprocess \ - --source-lang en --target-lang de \ - --trainpref $proc_dir/train.spm \ - --validpref $proc_dir/dev.spm \ - --testpref $proc_dir/tst-COMMON.spm \ - --thresholdtgt 0 \ - --thresholdsrc 0 \ - --workers 20 \ - --destdir ./data-bin/mustc_en_de \ -``` - -### Training - - -```shell -mkdir -p checkpoints -CUDA_VISIBLE_DEVICES=1 python $FAIRSEQ/train.py data-bin/mustc_en_de \ - --save-dir checkpoints \ - --arch berard_simul_text_iwslt \ - --simul-type waitk \ - --waitk-lagging 2 \ - --optimizer adam \ - --max-epoch 100 \ - --lr 0.001 \ - --clip-norm 5.0 \ - --batch-size 128 \ - --log-format json \ - --log-interval 10 \ - --criterion cross_entropy_acc \ - --user-dir $FAIRSEQ/examples/simultaneous_translation -``` - -## **Speech-to-text Model** ---- -### Data Preparation -First, segment wav files. -```shell -python $FAIRSEQ/examples/simultaneous_translation/data/segment_wav.py \ - --datapath $DATA_ROOT -``` -Similar to text-to-text model, train a Sentencepiecemodel, but only train on German -```Shell -python $FAIRSEQ/examples/simultaneous_translation/data/train_spm.py \ - --data-path $DATA_ROOT/data \ - --vocab-size 10000 \ - --max-frame 3000 \ - --model-type unigram \ - --lang $lang \ - --out-path . 
-
-```
-## Training
-```shell
-mkdir -p checkpoints
-CUDA_VISIBLE_DEVICES=1 python $FAIRSEQ/train.py data-bin/mustc_en_de \
-    --save-dir checkpoints \
-    --arch berard_simul_text_iwslt \
-    --waitk-lagging 2 \
-    --waitk-stride 10 \
-    --input-feat-per-channel 40 \
-    --encoder-hidden-size 512 \
-    --output-layer-dim 128 \
-    --decoder-num-layers 3 \
-    --task speech_translation \
-    --user-dir $FAIRSEQ/examples/simultaneous_translation
-    --optimizer adam \
-    --max-epoch 100 \
-    --lr 0.001 \
-    --clip-norm 5.0 \
-    --batch-size 128 \
-    --log-format json \
-    --log-interval 10 \
-    --criterion cross_entropy_acc \
-    --user-dir $FAIRSEQ/examples/simultaneous_translation
-```
-
-## Evaluation
----
-### Evaluation Server
-For text translation models, the server is set up as follow give input file and reference file.
-
-``` shell
-python ./eval/server.py \
-    --hostname localhost \
-    --port 12321 \
-    --src-file $DATA_ROOT/data/dev/txt/dev.en \
-    --ref-file $DATA_ROOT/data/dev/txt/dev.de
-```
-For speech translation models, the input is the data direcrory.
-``` shell
-python ./eval/server.py \
-    --hostname localhost \
-    --port 12321 \
-    --ref-file $DATA_ROOT \
-    --data-type speech
-```
-
-### Decode and Evaluate with Client
-Once the server is set up, run client to evaluate translation quality and latency.
-```shell
-# TEXT
-python $fairseq_dir/examples/simultaneous_translation/evaluate.py \
-    data-bin/mustc_en_de \
-    --user-dir $FAIRSEQ/examples/simultaneous_translation \
-    --src-spm unigram-en-10000-3000/spm.model\
-    --tgt-spm unigram-de-10000-3000/spm.model\
-    -s en -t de \
-    --path checkpoints/checkpoint_best.pt
-
-# SPEECH
-python $fairseq_dir/examples/simultaneous_translation/evaluate.py \
-    data-bin/mustc_en_de \
-    --user-dir $FAIRSEQ/examples/simultaneous_translation \
-    --data-type speech \
-    --tgt-spm unigram-de-10000-3000/spm.model\
-    -s en -t de \
-    --path checkpoints/checkpoint_best.pt
-```
diff --git a/examples/simultaneous_translation/docs/ende-mma.md b/examples/simultaneous_translation/docs/ende-mma.md
new file mode 100644
index 0000000000..241d604a3b
--- /dev/null
+++ b/examples/simultaneous_translation/docs/ende-mma.md
@@ -0,0 +1,74 @@
+# Simultaneous Machine Translation
+
+This directory contains the code for the paper [Monotonic Multihead Attention](https://openreview.net/forum?id=Hyg96gBKPS).
+
+## Prepare Data
+
+[Please follow the instructions to download and preprocess the WMT'15 En-De dataset.](https://github.com/pytorch/fairseq/tree/simulastsharedtask/examples/translation#prepare-wmt14en2desh)
+
+Another example of training an English to Japanese model can be found [here](enja-waitk.md).
+
+## Training
+
+- MMA-IL
+
+```shell
+fairseq-train \
+    data-bin/wmt15_en_de_32k \
+    --simul-type infinite_lookback \
+    --user-dir $FAIRSEQ/examples/simultaneous_translation \
+    --mass-preservation \
+    --criterion latency_augmented_label_smoothed_cross_entropy \
+    --latency-weight-avg 0.1 \
+    --max-update 50000 \
+    --arch transformer_monotonic_iwslt_de_en \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler 'inverse_sqrt' \
+    --warmup-init-lr 1e-7 --warmup-updates 4000 \
+    --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001 \
+    --dropout 0.3 \
+    --label-smoothing 0.1 \
+    --max-tokens 3584
+```
+
+- MMA-H
+
+```shell
+fairseq-train \
+    data-bin/wmt15_en_de_32k \
+    --simul-type hard_aligned \
+    --user-dir $FAIRSEQ/examples/simultaneous_translation \
+    --mass-preservation \
+    --criterion latency_augmented_label_smoothed_cross_entropy \
+    --latency-weight-var 0.1 \
+    --max-update 50000 \
+    --arch transformer_monotonic_iwslt_de_en \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler 'inverse_sqrt' \
+    --warmup-init-lr 1e-7 --warmup-updates 4000 \
+    --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001 \
+    --dropout 0.3 \
+    --label-smoothing 0.1 \
+    --max-tokens 3584
+```
+
+- wait-k
+
+```shell
+fairseq-train \
+    data-bin/wmt15_en_de_32k \
+    --simul-type waitk \
+    --waitk-lagging 3 \
+    --user-dir $FAIRSEQ/examples/simultaneous_translation \
+    --mass-preservation \
+    --criterion latency_augmented_label_smoothed_cross_entropy \
+    --max-update 50000 \
+    --arch transformer_monotonic_iwslt_de_en \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler 'inverse_sqrt' \
+    --warmup-init-lr 1e-7 --warmup-updates 4000 \
+    --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001 \
+    --dropout 0.3 \
+    --label-smoothing 0.1 \
+    --max-tokens 3584
+```
diff --git a/examples/simultaneous_translation/docs/enja-waitk.md b/examples/simultaneous_translation/docs/enja-waitk.md
new file mode 100644
index 0000000000..fb9d82576f
--- /dev/null
+++ b/examples/simultaneous_translation/docs/enja-waitk.md
@@ -0,0 +1,106 @@
+# An example of an English to Japanese Simultaneous Translation System
+
+This is an example of training and evaluating a transformer *wait-k* English to Japanese simultaneous text-to-text translation model.
+
+## Data Preparation
+This section introduces the data preparation for training and evaluation.
+If you only want to evaluate the model, please jump to [Inference & Evaluation](#inference--evaluation).
+
+For illustration, we only use the following subsets of the available data from the [WMT20 news translation task](http://www.statmt.org/wmt20/translation-task.html), which results in 7,815,391 sentence pairs.
+- News Commentary v16
+- Wiki Titles v3
+- WikiMatrix V1
+- Japanese-English Subtitle Corpus
+- The Kyoto Free Translation Task Corpus
+
+We use the WMT20 development data as the development set. Training a `transformer_vaswani_wmt_en_de_big` model on this amount of data results in 17.3 BLEU with greedy search and 19.7 with beam search (beam size 10). Note that better performance can be achieved with the full WMT training data.
+
+We use the [sentencepiece](https://github.com/google/sentencepiece) toolkit to tokenize the data with a vocabulary size of 32000.
+Additionally, we filter out sentences longer than 200 words after tokenization.
+Assuming the tokenized text data is saved at `${DATA_DIR}`,
+we prepare the data binary with the following command.
+
+```bash
+fairseq-preprocess \
+    --source-lang en --target-lang ja \
+    --trainpref ${DATA_DIR}/train \
+    --validpref ${DATA_DIR}/dev \
+    --testpref ${DATA_DIR}/test \
+    --destdir ${WMT20_ENJA_DATA_BIN} \
+    --nwordstgt 32000 --nwordssrc 32000 \
+    --workers 20
+```
+
+## Simultaneous Translation Model Training
+To train a wait-k (k=10) model:
+```bash
+fairseq-train ${WMT20_ENJA_DATA_BIN} \
+    --save-dir ${SAVE_DIR} \
+    --simul-type waitk \
+    --waitk-lagging 10 \
+    --max-epoch 70 \
+    --arch transformer_monotonic_vaswani_wmt_en_de_big \
+    --optimizer adam \
+    --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler inverse_sqrt \
+    --warmup-init-lr 1e-07 \
+    --warmup-updates 4000 \
+    --lr 0.0005 \
+    --stop-min-lr 1e-09 \
+    --clip-norm 10.0 \
+    --dropout 0.3 \
+    --weight-decay 0.0 \
+    --criterion label_smoothed_cross_entropy \
+    --label-smoothing 0.1 \
+    --max-tokens 3584
+```
+This command is for training on 8 GPUs. Equivalently, the model can be trained on one GPU with `--update-freq 8`.
+
+## Inference & Evaluation
+First, install [SimulEval](https://github.com/facebookresearch/SimulEval) for evaluation.
+
+```bash
+git clone https://github.com/facebookresearch/SimulEval.git
+cd SimulEval
+pip install -e .
+```
+
+The following command runs the evaluation.
+It assumes the source and target reference files are `${SRC_FILE}` and `${TGT_FILE}`, and the sentencepiece model file for English is saved at `${SRC_SPM_PATH}`.
+
+```bash
+simuleval \
+    --source ${SRC_FILE} \
+    --target ${TGT_FILE} \
+    --data-bin ${WMT20_ENJA_DATA_BIN} \
+    --sacrebleu-tokenizer ja-mecab \
+    --eval-latency-unit char \
+    --no-space \
+    --src-splitter-type sentencepiecemodel \
+    --src-splitter-path ${SRC_SPM_PATH} \
+    --agent ${FAIRSEQ}/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py \
+    --model-path ${SAVE_DIR}/${CHECKPOINT_FILENAME} \
+    --output ${OUTPUT} \
+    --scores
+```
+
+The `--data-bin` should be the same as in the previous sections if you prepared the data from scratch.
+For evaluation only, a prepared data directory can be found [here](https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_databin.tgz) and a pretrained checkpoint (wait-k=10 model) can be downloaded from [here](https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_wait10_ckpt.pt).
+
+The output should look like this:
+```bash
+{
+    "Quality": {
+        "BLEU": 11.442253287568398
+    },
+    "Latency": {
+        "AL": 8.6587861866951,
+        "AP": 0.7863304776251316,
+        "DAL": 9.477850951194764
+    }
+}
+```
+Latency is measured in characters on the target side (`--eval-latency-unit char`). Quality is evaluated with `sacrebleu` using the `MeCab` tokenizer (`--sacrebleu-tokenizer ja-mecab`). `--no-space` indicates that no space is added when merging the predicted subwords into words.
+
+If the `--output ${OUTPUT}` option is used, detailed logs and scores will be stored under the `${OUTPUT}` directory.
diff --git a/examples/simultaneous_translation/docs/evaluation.md b/examples/simultaneous_translation/docs/evaluation.md
deleted file mode 100644
index c53407354e..0000000000
--- a/examples/simultaneous_translation/docs/evaluation.md
+++ /dev/null
@@ -1,115 +0,0 @@
-# Introduction to evaluation interface
-The simultaneous translation models from sharedtask participents are evaluated under a server-client protocol. The participents are requisted to plug in their own model API in the protocol, and submit a docker file.
-
-## Server-Client Protocol
-An server-client protocol that will be used in evaluation. For example, when a *wait-k* model (k=3) translate the English sentence "Alice and Bob are good friends" to Genman sentence "Alice und Bob sind gute Freunde." , the evaluation process is shown as following figure.
-
-While every time client needs to read a new state (word or speech utterence), a "GET" request is supposed to sent over to server. Whenever a new token is generated, a "SEND" request with the word predicted (untokenized word) will be sent to server immediately. The server can hence calculate both latency and BLEU score of the sentence.
-
-### Server
-The server code is provided and can be set up directly locally for development purpose.
For example, to evaluate a text simultaneous test set, - -```shell - - python fairseq/examples/simultaneous_translation/eval/server.py \ - --hostname local_host \ - --port 1234 \ - --src-file SRC_FILE \ - --ref-file REF_FILE \ - --data-type text \ -``` -The state that server sent to client is has the following format -```json -{ - 'sent_id': Int, - 'segment_id': Int, - 'segment': String -} -``` - -### Client -The client will handle the evaluation process mentioned above. It should be out-of-box as well. The client's protocol is as following table - -|Action|Content| -|:---:|:---:| -|Request new word / utterence| ```{key: "Get", value: None}```| -|Predict word "W"| ```{key: "SEND", value: "W"}```| - - - -The core of the client module is the agent, which needs to be modified to different models accordingly. The abstract class of agent is as follow, the evaluation process happens in the `decode()` function. -```python -class Agent(object): - "an agent needs to follow this pattern" - def __init__(self, *args, **kwargs): - ... - - def init_states(self): - # Initializing states - ... - - def update_states(self, states, new_state): - # Update states with given new state from server - # TODO (describe the states) - ... - - def finish_eval(self, states, new_state): - # Check if evaluation is finished - ... - - def policy(self, state: list) -> dict: - # Provide a action given current states - # The action can only be either - # {key: "GET", value: NONE} - # or - # {key: "SEND", value: W} - ... - - def reset(self): - # Reset agent - ... - - def decode(self, session): - - states = self.init_states() - self.reset() - - # Evaluataion protocol happens here - while True: - # Get action from the current states according to self.policy() - action = self.policy(states) - - if action['key'] == GET: - # Read a new state from server - new_state = session.get_src() - states = self.update_states(states, new_state) - - if self.finish_eval(states, new_state): - # End of document - break - - elif action['key'] == SEND: - # Send a new prediction to server - session.send_hypo(action['value']) - - # Clean the history, wait for next sentence - if action['value'] == DEFAULT_EOS: - states = self.init_states() - self.reset() - else: - raise NotImplementedError - - -``` -Here an implementation of agent of text [*wait-k* model](somelink). Notice that the tokenization is not considered. - -## Quality -The quality is measured by detokenized BLEU. So make sure that the predicted words sent to server are detokenized. An implementation is can be find [here](some link) - -## Latency -The latency metrics are -* Average Proportion -* Average Lagging -* Differentiable Average Lagging -Again Thery will also be evaluated on detokenized text. - diff --git a/examples/simultaneous_translation/eval/agents/__init__.py b/examples/simultaneous_translation/eval/agents/__init__.py deleted file mode 100644 index 511e7b2474..0000000000 --- a/examples/simultaneous_translation/eval/agents/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import importlib -import os - -from fairseq import registry - - -build_agent, register_agent, MONOTONIC_AGENT, _ = registry.setup_registry( - "--agent-type" -) - - -DEFAULT_EOS = "</s>" -GET = 0 -SEND = 1 - -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - module = file[: file.find(".py")] - importlib.import_module("agents." + module) diff --git a/examples/simultaneous_translation/eval/agents/agent.py b/examples/simultaneous_translation/eval/agents/agent.py deleted file mode 100644 index 997392cf9b..0000000000 --- a/examples/simultaneous_translation/eval/agents/agent.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import time -from functools import partial -from multiprocessing.pool import ThreadPool as Pool - -from . import DEFAULT_EOS, GET, SEND - - -class Agent(object): - "an agent needs to follow this pattern" - - def __init__(self, *args, **kwargs): - pass - - def init_states(self, *args, **kwargs): - raise NotImplementedError - - def update_states(self, states, new_state): - raise NotImplementedError - - def finish_eval(self, states, new_state): - raise NotImplementedError - - def policy(self, state): - raise NotImplementedError - - def reset(self): - raise NotImplementedError - - def decode(self, session, low=0, high=100000, num_thread=10): - corpus_info = session.corpus_info() - high = min(corpus_info["num_sentences"] - 1, high) - if low >= high: - return - - t0 = time.time() - if num_thread > 1: - with Pool(10) as p: - p.map( - partial(self._decode_one, session), - [sent_id for sent_id in range(low, high + 1)], - ) - else: - for sent_id in range(low, high + 1): - self._decode_one(session, sent_id) - - print(f"Finished {low} to {high} in {time.time() - t0}s") - - def _decode_one(self, session, sent_id): - action = {} - self.reset() - states = self.init_states() - while action.get("value", None) != DEFAULT_EOS: - # take an action - action = self.policy(states) - - if action["key"] == GET: - new_states = session.get_src(sent_id, action["value"]) - states = self.update_states(states, new_states) - - elif action["key"] == SEND: - session.send_hypo(sent_id, action["value"]) - print(" ".join(states["tokens"]["tgt"])) diff --git a/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py b/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py new file mode 100644 index 0000000000..8f3c8703ca --- /dev/null +++ b/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py @@ -0,0 +1,226 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
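+#
+# SimulEval text agent for English-to-Japanese simultaneous translation.
+# It loads a fairseq checkpoint, segments incoming source words into
+# SentencePiece subwords, runs the monotonic decoder to decide between
+# READ and WRITE actions, and merges predicted subwords back into full
+# words before sending them to the SimulEval server.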
+ +import os + +from fairseq import checkpoint_utils, tasks +import sentencepiece as spm +import torch + +try: + from simuleval import READ_ACTION, WRITE_ACTION, DEFAULT_EOS + from simuleval.agents import TextAgent +except ImportError: + print("Please install simuleval 'pip install simuleval'") + + +BOS_PREFIX = "\u2581" + + +class SimulTransTextAgentJA(TextAgent): + """ + Simultaneous Translation + Text agent for Japanese + """ + def __init__(self, args): + + # Whether use gpu + self.gpu = getattr(args, "gpu", False) + + # Max len + self.max_len = args.max_len + + # Load Model + self.load_model_vocab(args) + + # build word splitter + self.build_word_splitter(args) + + self.eos = DEFAULT_EOS + + def initialize_states(self, states): + states.incremental_states = dict() + states.incremental_states["online"] = dict() + + def to_device(self, tensor): + if self.gpu: + return tensor.cuda() + else: + return tensor.cpu() + + def load_model_vocab(self, args): + + filename = args.model_path + if not os.path.exists(filename): + raise IOError("Model file not found: {}".format(filename)) + + state = checkpoint_utils.load_checkpoint_to_cpu(filename) + + task_args = state["cfg"]["task"] + task_args.data = args.data_bin + + task = tasks.setup_task(task_args) + + # build model for ensemble + state["cfg"]["model"].load_pretrained_encoder_from = None + state["cfg"]["model"].load_pretrained_decoder_from = None + + self.model = task.build_model(state["cfg"]["model"]) + self.model.load_state_dict(state["model"], strict=True) + self.model.eval() + self.model.share_memory() + + if self.gpu: + self.model.cuda() + + # Set dictionary + self.dict = {} + self.dict["tgt"] = task.target_dictionary + self.dict["src"] = task.source_dictionary + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--model-path', type=str, required=True, + help='path to your pretrained model.') + parser.add_argument("--data-bin", type=str, required=True, + help="Path of data binary") + parser.add_argument("--max-len", type=int, default=100, + help="Max length of translation") + parser.add_argument("--tgt-splitter-type", type=str, default="SentencePiece", + help="Subword splitter type for target text.") + parser.add_argument("--tgt-splitter-path", type=str, default=None, + help="Subword splitter model path for target text.") + parser.add_argument("--src-splitter-type", type=str, default="SentencePiece", + help="Subword splitter type for source text.") + parser.add_argument("--src-splitter-path", type=str, default=None, + help="Subword splitter model path for source text.") + # fmt: on + return parser + + def build_word_splitter(self, args): + self.spm = {} + for lang in ['src', 'tgt']: + if getattr(args, f'{lang}_splitter_type', None): + path = getattr(args, f'{lang}_splitter_path', None) + if path: + self.spm[lang] = spm.SentencePieceProcessor() + self.spm[lang].Load(path) + + def segment_to_units(self, segment, states): + # Split a full word (segment) into subwords (units) + return self.spm['src'].EncodeAsPieces(segment) + + def update_model_encoder(self, states): + if len(states.units.source) == 0: + return + + src_indices = [ + self.dict['src'].index(x) + for x in states.units.source.value + ] + + if states.finish_read(): + # Append the eos index when the prediction is over + src_indices += [self.dict["tgt"].eos_index] + + src_indices = self.to_device( + torch.LongTensor(src_indices).unsqueeze(0) + ) + src_lengths = self.to_device( + torch.LongTensor([src_indices.size(1)]) + ) + + states.encoder_states = 
self.model.encoder(src_indices, src_lengths) + + torch.cuda.empty_cache() + + def update_states_read(self, states): + # Happens after a read action. + self.update_model_encoder(states) + + def units_to_segment(self, units, states): + # Merge sub words (units) to full word (segment). + # For Japanese, we can directly send + # the untokenized token to server except the BOS token + # with following option + # --sacrebleu-tokenizer MeCab + # --eval-latency-unit char + # --no-space + token = units.value.pop() + + if ( + token == self.dict["tgt"].eos_word + or len(states.segments.target) > self.max_len + ): + return DEFAULT_EOS + + if BOS_PREFIX == token: + return None + if token[0] == BOS_PREFIX: + return token[1:] + else: + return token + + def policy(self, states): + + if not getattr(states, "encoder_states", None): + # No encoder states, read a token first + return READ_ACTION + + # encode previous predicted target tokens + tgt_indices = self.to_device( + torch.LongTensor( + [self.model.decoder.dictionary.eos()] + + [ + self.dict['tgt'].index(x) + for x in states.units.target.value + if x is not None + ] + ).unsqueeze(0) + ) + + # Current steps + states.incremental_states["steps"] = { + "src": states.encoder_states["encoder_out"][0].size(0), + "tgt": 1 + len(states.units.target), + } + + # Online only means the reading is not finished + states.incremental_states["online"]["only"] = ( + torch.BoolTensor([not states.finish_read()]) + ) + + x, outputs = self.model.decoder.forward( + prev_output_tokens=tgt_indices, + encoder_out=states.encoder_states, + incremental_state=states.incremental_states, + ) + + states.decoder_out = x + + torch.cuda.empty_cache() + + if outputs.action == 0: + return READ_ACTION + else: + return WRITE_ACTION + + def predict(self, states): + # Predict target token from decoder states + decoder_states = states.decoder_out + + lprobs = self.model.get_normalized_probs( + [decoder_states[:, -1:]], log_probs=True + ) + + index = lprobs.argmax(dim=-1)[0, 0].item() + + if index != self.dict['tgt'].eos_index: + token = self.dict['tgt'].string([index]) + else: + token = self.dict['tgt'].eos_word + + return token diff --git a/examples/simultaneous_translation/eval/agents/simul_trans_agent.py b/examples/simultaneous_translation/eval/agents/simul_trans_agent.py deleted file mode 100644 index 071b9e89ce..0000000000 --- a/examples/simultaneous_translation/eval/agents/simul_trans_agent.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import json -import os - -from fairseq import checkpoint_utils, tasks, utils - -from . 
import DEFAULT_EOS, GET, SEND -from .agent import Agent - - -class SimulTransAgent(Agent): - def __init__(self, args): - # Load Model - self.load_model(args) - - # build word spliter - self.build_word_splitter(args) - - self.max_len = args.max_len - - self.eos = DEFAULT_EOS - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--model-path', type=str, required=True, - help='path to your pretrained model.') - parser.add_argument("--data-bin", type=str, required=True, - help="Path of data binary") - parser.add_argument("--user-dir", type=str, default="example/simultaneous_translation", - help="User directory for simultaneous translation") - parser.add_argument("--src-splitter-type", type=str, default=None, - help="Subword splitter type for source text") - parser.add_argument("--tgt-splitter-type", type=str, default=None, - help="Subword splitter type for target text") - parser.add_argument("--src-splitter-path", type=str, default=None, - help="Subword splitter model path for source text") - parser.add_argument("--tgt-splitter-path", type=str, default=None, - help="Subword splitter model path for target text") - parser.add_argument("--max-len", type=int, default=150, - help="Maximum length difference between source and target prediction") - parser.add_argument('--model-overrides', default="{}", type=str, metavar='DICT', - help='A dictionary used to override model args at generation ' - 'that were used during model training') - # fmt: on - return parser - - def load_dictionary(self, task): - raise NotImplementedError - - def load_model(self, args): - args.user_dir = os.path.join(os.path.dirname(__file__), "..", "..") - utils.import_user_module(args) - filename = args.model_path - if not os.path.exists(filename): - raise IOError("Model file not found: {}".format(filename)) - - state = checkpoint_utils.load_checkpoint_to_cpu( - filename, json.loads(args.model_overrides) - ) - - saved_args = state["args"] - saved_args.data = args.data_bin - - task = tasks.setup_task(saved_args) - - # build model for ensemble - self.model = task.build_model(saved_args) - self.model.load_state_dict(state["model"], strict=True) - - # Set dictionary - self.load_dictionary(task) - - def init_states(self): - return { - "indices": {"src": [], "tgt": []}, - "tokens": {"src": [], "tgt": []}, - "segments": {"src": [], "tgt": []}, - "steps": {"src": 0, "tgt": 0}, - "finished": False, - "finish_read": False, - "model_states": {}, - } - - def update_states(self, states, new_state): - raise NotImplementedError - - def policy(self, states): - # Read and Write policy - action = None - - while action is None: - if states["finished"]: - # Finish the hypo by sending eos to server - return self.finish_action() - - # Model make decision given current states - decision = self.model.decision_from_states(states) - - if decision == 0 and not self.finish_read(states): - # READ - action = self.read_action(states) - else: - # WRITE - action = self.write_action(states) - - # None means we make decision again but not sending server anything - # This happened when read a bufffered token - # Or predict a subword - return action - - def finish_read(self, states): - raise NotImplementedError - - def write_action(self, states): - token, index = self.model.predict_from_states(states) - - if ( - index == self.dict["tgt"].eos() - or len(states["tokens"]["tgt"]) > self.max_len - ): - # Finish this sentence is predict EOS - states["finished"] = True - end_idx_last_full_word = self._target_length(states) - - else: - 
states["tokens"]["tgt"] += [token] - end_idx_last_full_word = self.word_splitter["tgt"].end_idx_last_full_word( - states["tokens"]["tgt"] - ) - self._append_indices(states, [index], "tgt") - - if end_idx_last_full_word > states["steps"]["tgt"]: - # Only sent detokenized full words to the server - word = self.word_splitter["tgt"].merge( - states["tokens"]["tgt"][states["steps"]["tgt"] : end_idx_last_full_word] - ) - states["steps"]["tgt"] = end_idx_last_full_word - states["segments"]["tgt"] += [word] - - return {"key": SEND, "value": word} - else: - return None - - def read_action(self, states): - return {"key": GET, "value": None} - - def finish_action(self): - return {"key": SEND, "value": DEFAULT_EOS} - - def reset(self): - pass - - def finish_eval(self, states, new_state): - if len(new_state) == 0 and len(states["indices"]["src"]) == 0: - return True - return False - - def _append_indices(self, states, new_indices, key): - states["indices"][key] += new_indices - - def _target_length(self, states): - return len(states["tokens"]["tgt"]) diff --git a/examples/simultaneous_translation/eval/agents/simul_trans_text_agent.py b/examples/simultaneous_translation/eval/agents/simul_trans_text_agent.py deleted file mode 100644 index 7c34817bf6..0000000000 --- a/examples/simultaneous_translation/eval/agents/simul_trans_text_agent.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . import DEFAULT_EOS, GET, register_agent -from .simul_trans_agent import SimulTransAgent -from .word_splitter import SPLITTER_DICT - - -@register_agent("simul_trans_text") -class SimulTransTextAgent(SimulTransAgent): - def build_word_splitter(self, args): - self.word_splitter = {} - - self.word_splitter["src"] = SPLITTER_DICT[args.src_splitter_type]( - getattr(args, f"src_splitter_path") - ) - self.word_splitter["tgt"] = SPLITTER_DICT[args.tgt_splitter_type]( - getattr(args, f"tgt_splitter_path") - ) - - def load_dictionary(self, task): - self.dict = {} - self.dict["tgt"] = task.target_dictionary - self.dict["src"] = task.source_dictionary - - def update_states(self, states, new_state): - if states["finish_read"]: - return states - - new_word = new_state["segment"] - - # Split words and index the token - if new_word not in [DEFAULT_EOS]: - tokens = self.word_splitter["src"].split(new_word) - # Get indices from dictionary - # You can change to you own dictionary - indices = ( - self.dict["src"] - .encode_line( - tokens, - line_tokenizer=lambda x: x, - add_if_not_exist=False, - append_eos=False, - ) - .tolist() - ) - else: - tokens = [new_word] - indices = [self.dict["src"].eos()] - states["finish_read"] = True - - # Update states - states["segments"]["src"] += [new_word] - states["tokens"]["src"] += tokens - self._append_indices(states, indices, "src") - - return states - - def read_action(self, states): - # Increase source step by one - states["steps"]["src"] += 1 - - # At leat one word is read - if len(states["tokens"]["src"]) == 0: - return {"key": GET, "value": None} - - # Only request new word if there is no buffered tokens - if len(states["tokens"]["src"]) <= states["steps"]["src"]: - return {"key": GET, "value": None} - - return None - - def finish_read(self, states): - # The first means all segments (full words) has been read from server - # The second means all tokens (subwords) has been read locally - return ( - states["finish_read"] - and 
len(states["tokens"]["src"]) == states["steps"]["src"] - ) diff --git a/examples/simultaneous_translation/eval/agents/word_splitter.py b/examples/simultaneous_translation/eval/agents/word_splitter.py deleted file mode 100644 index c3f71200a5..0000000000 --- a/examples/simultaneous_translation/eval/agents/word_splitter.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -class SubwordSplitter(object): - def process_line(self, string): - raise NotImplementedError - - def split(self, string): - raise NotImplementedError - - -class NoneWordSplitter(object): - def __init__(self, model): - pass - - def split(self, string): - return [string] - - def process_line(self, string): - return [string] - - def finished_word(self, string): - return True - - def merge(self, list_of_string): - return "".join(list_of_string) - - def last_full_word_step(self, tokens, step): - return len(tokens) - - def end_idx_last_full_word(self, tokens): - return len(tokens) - - -class BPEWordSplitter(object): - # TODO: lock back here - def __init__(self, model_path): - super().__init__() - from subword_nmt.apply_bpe import BPE - - with open(model_path) as f: - self.model = BPE(f) - - def split(self, string): - return self.model.process_line(string).split() - - def end_idx_last_full_word(self, tokens): - # Begin of word indices - bow_indices = [0] + [i + 1 for i, t in enumerate(tokens[1:]) if t[-2:] != "@@"] - - if len(bow_indices) < 2: - return 0 - else: - return bow_indices[-1] - - def merge(self, list_of_string): - return " ".join([item.replace("@@", "") for item in list_of_string]) - - -class SentencePieceModelWordSplitter(object): - def __init__(self, model_path): - super().__init__() - import sentencepiece as spm - - self.model = spm.SentencePieceProcessor() - self.model.Load(model_path) - - def split(self, string): - return self.model.EncodeAsPieces(string) - - def end_idx_last_full_word(self, tokens): - # Begin of word indices - bow_indices = [i for i, t in enumerate(tokens) if t[0] == "\u2581"] - - if len(bow_indices) < 2: - return 0 - else: - return bow_indices[-1] - - def merge(self, list_of_string): - return self.model.DecodePieces(list_of_string) - - -SPLITTER_DICT = { - None: NoneWordSplitter, - "BPE": BPEWordSplitter, - "SentencePieceModel": SentencePieceModelWordSplitter, -} diff --git a/examples/simultaneous_translation/eval/client.py b/examples/simultaneous_translation/eval/client.py deleted file mode 100644 index 3ca4ea73b8..0000000000 --- a/examples/simultaneous_translation/eval/client.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from typing import Optional - -import requests -from scorers import build_scorer - - -class SimulSTEvaluationService(object): - DEFAULT_HOSTNAME = "localhost" - DEFAULT_PORT = 12321 - - def __init__(self, hostname=DEFAULT_HOSTNAME, port=DEFAULT_PORT): - self.hostname = hostname - self.port = port - self.base_url = f"http://{self.hostname}:{self.port}" - - def __enter__(self): - self.new_session() - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - def new_session(self): - # start eval session - url = f"{self.base_url}" - - try: - _ = requests.post(url) - except Exception as e: - print(f"Failed to start an evaluation session: {e}") - - print("Evaluation session started.") - return self - - def get_scores(self): - # end eval session - url = f"{self.base_url}/result" - try: - r = requests.get(url) - print("Scores: {}".format(r.json())) - print("Evaluation session finished.") - except Exception as e: - print(f"Failed to end an evaluation session: {e}") - - def get_src(self, sent_id: int, extra_params: Optional[dict] = None) -> str: - url = f"{self.base_url}/src" - params = {"sent_id": sent_id} - if extra_params is not None: - for key in extra_params.keys(): - params[key] = extra_params[key] - try: - r = requests.get(url, params=params) - except Exception as e: - print(f"Failed to request a source segment: {e}") - return r.json() - - def send_hypo(self, sent_id: int, hypo: str) -> None: - url = f"{self.base_url}/hypo" - params = {"sent_id": sent_id} - - try: - requests.put(url, params=params, data=hypo.encode("utf-8")) - except Exception as e: - print(f"Failed to send a translated segment: {e}") - - def corpus_info(self): - url = f"{self.base_url}" - try: - r = requests.get(url) - except Exception as e: - print(f"Failed to request corpus information: {e}") - - return r.json() - - -class SimulSTLocalEvaluationService(object): - def __init__(self, args): - self.scorer = build_scorer(args) - - def get_scores(self): - return self.scorer.score() - - def get_src(self, sent_id: int, extra_params: Optional[dict] = None) -> str: - if extra_params is not None: - segment_size = extra_params.get("segment_size", None) - else: - segment_size = None - - return self.scorer.send_src(int(sent_id), segment_size) - - def send_hypo(self, sent_id: int, hypo: str) -> None: - list_of_tokens = hypo.strip().split() - self.scorer.recv_hyp(sent_id, list_of_tokens) - - def corpus_info(self): - return self.scorer.get_info() diff --git a/examples/simultaneous_translation/eval/eval_latency.py b/examples/simultaneous_translation/eval/eval_latency.py deleted file mode 100644 index 50021de47c..0000000000 --- a/examples/simultaneous_translation/eval/eval_latency.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import json - -import torch -from examples.simultaneous_translation.utils.latency import LatencyInference - - -LATENCY_METRICS = [ - "differentiable_average_lagging", - "average_lagging", - "average_proportion", -] - - -class LatencyScorer: - def __init__(self, start_from_zero=True): - self.recorder = [] - self.scores = {} - self.scorer = LatencyInference() - self.start_from_zero = start_from_zero - - def update_reorder(self, list_of_dict): - self.recorder = [] - for info in list_of_dict: - delays = [int(x) - int(not self.start_from_zero) for x in info["delays"]] - delays = torch.LongTensor(delays).unsqueeze(0) - src_len = torch.LongTensor([info["src_len"]]).unsqueeze(0) - - self.recorder.append(self.scorer(delays, src_len)) - - def cal_latency(self): - self.scores = {} - for metric in LATENCY_METRICS: - self.scores[metric] = sum( - [x[metric][0, 0].item() for x in self.recorder] - ) / len(self.recorder) - return self.scores - - @classmethod - def score(cls, list_of_dict, start_from_zero=True): - scorer_to_return = cls(start_from_zero) - scorer_to_return.update_reorder(list_of_dict) - scorer_to_return.cal_latency() - return scorer_to_return.scores - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True) - parser.add_argument("--start-from-zero", action="store_true") - args = parser.parse_args() - - scorer = LatencyInference() - recorder = [] - with open(args.input, "r") as f: - for line in f: - info = json.loads(line) - - delays = [int(x) - int(not args.start_from_zero) for x in info["delays"]] - - delays = torch.LongTensor(delays).unsqueeze(0) - - src_len = torch.LongTensor([info["src_len"]]).unsqueeze(0) - - recorder.append(scorer(delays, src_len)) - - average_results = {} - - for metric in LATENCY_METRICS: - average_results[metric] = sum([x[metric][0, 0].item() for x in recorder]) / len( - recorder - ) - print(f"{metric}: {average_results[metric]}") diff --git a/examples/simultaneous_translation/eval/evaluate.py b/examples/simultaneous_translation/eval/evaluate.py deleted file mode 100644 index 2f7474621a..0000000000 --- a/examples/simultaneous_translation/eval/evaluate.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse - -from agents import build_agent -from client import SimulSTEvaluationService, SimulSTLocalEvaluationService -from fairseq.registry import REGISTRIES - - -DEFAULT_HOSTNAME = "localhost" -DEFAULT_PORT = 12321 - - -def get_args(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--hostname", type=str, default=DEFAULT_HOSTNAME, help="server hostname" - ) - parser.add_argument( - "--port", type=int, default=DEFAULT_PORT, help="server port number" - ) - parser.add_argument("--agent-type", default="simul_trans_text", help="Agent type") - parser.add_argument("--scorer-type", default="text", help="Scorer type") - parser.add_argument( - "--start-idx", - type=int, - default=0, - help="Start index of the sentence to evaluate", - ) - parser.add_argument( - "--end-idx", - type=int, - default=float("inf"), - help="End index of the sentence to evaluate", - ) - parser.add_argument( - "--scores", action="store_true", help="Request scores from server" - ) - parser.add_argument("--reset-server", action="store_true", help="Reset the server") - parser.add_argument( - "--num-threads", type=int, default=10, help="Number of threads used by agent" - ) - parser.add_argument( - "--local", action="store_true", default=False, help="Local evaluation" - ) - - args, _ = parser.parse_known_args() - - for registry_name, REGISTRY in REGISTRIES.items(): - choice = getattr(args, registry_name, None) - if choice is not None: - cls = REGISTRY["registry"][choice] - if hasattr(cls, "add_args"): - cls.add_args(parser) - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - args = get_args() - - if args.local: - session = SimulSTLocalEvaluationService(args) - else: - session = SimulSTEvaluationService(args.hostname, args.port) - - if args.reset_server: - session.new_session() - - if args.agent_type is not None: - agent = build_agent(args) - agent.decode(session, args.start_idx, args.end_idx, args.num_threads) - - if args.scores: - session.get_scores() - print(session.get_scores()) diff --git a/examples/simultaneous_translation/eval/scorers/__init__.py b/examples/simultaneous_translation/eval/scorers/__init__.py deleted file mode 100644 index 0a0e0a0518..0000000000 --- a/examples/simultaneous_translation/eval/scorers/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -import os - -from fairseq import registry - - -(build_scorer, register_scorer, SCORER_REGISTRIES, _) = registry.setup_registry( - "--scorer-type" -) - -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - module = file[: file.find(".py")] - importlib.import_module("scorers." + module) diff --git a/examples/simultaneous_translation/eval/scorers/scorer.py b/examples/simultaneous_translation/eval/scorers/scorer.py deleted file mode 100644 index d6d3e30aef..0000000000 --- a/examples/simultaneous_translation/eval/scorers/scorer.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import json -import os -from collections import defaultdict - -from examples.simultaneous_translation.eval.eval_latency import LatencyScorer -from vizseq.scorers.bleu import BLEUScorer -from vizseq.scorers.meteor import METEORScorer -from vizseq.scorers.ter import TERScorer - - -DEFAULT_EOS = "</s>" - - -class SimulScorer(object): - def __init__(self, args): - self.tokenizer = args.tokenizer - self.output_dir = args.output - if args.output is not None: - self.output_files = { - "text": os.path.join(args.output, "text"), - "delay": os.path.join(args.output, "delay"), - "scores": os.path.join(args.output, "scores"), - } - else: - self.output_files = None - self.eos = DEFAULT_EOS - self.data = {"tgt": []} - self.reset() - - def get_info(self): - return {"num_sentences": len(self)} - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--src-file', type=str, required=True, - help='Source input file') - parser.add_argument('--tgt-file', type=str, required=True, - help='Target reference file') - parser.add_argument('--tokenizer', default="13a", choices=["none", "13a"], - help='Tokenizer used for sacrebleu') - parser.add_argument('--output', type=str, default=None, - help='Path for output directory') - # fmt: on - - def send_src(self, sent_id, *args): - raise NotImplementedError - - def recv_hyp(self, sent_id, list_of_tokens): - for token in list_of_tokens: - self.translations[sent_id].append((token, self.steps[sent_id])) - - def reset(self): - self.steps = defaultdict(int) - self.translations = defaultdict(list) - - def src_lengths(self): - raise NotImplementedError - - def score(self): - translations = [] - delays = [] - for i in range(1 + max(self.translations.keys())): - translations += [" ".join(t[0] for t in self.translations[i][:-1])] - delays += [[t[1] for t in self.translations[i]]] - - bleu_score = BLEUScorer( - sent_level=False, - corpus_level=True, - extra_args={"bleu_tokenizer": self.tokenizer}, - ).score(translations, [self.data["tgt"]]) - - ter_score = TERScorer(sent_level=False, corpus_level=True).score( - translations, [self.data["tgt"]] - ) - meteor_score = METEORScorer(sent_level=False, corpus_level=True).score( - translations, [self.data["tgt"]] - ) - - latency_score = LatencyScorer().score( - [ - {"src_len": src_len, "delays": delay} - for src_len, delay in zip(self.src_lengths(), delays) - ], - start_from_zero=False, - ) - - scores = { - "BLEU": bleu_score[0], - "TER": ter_score[0], - "METEOR": meteor_score[0], - "DAL": latency_score["differentiable_average_lagging"], - "AL": latency_score["average_lagging"], - "AP": latency_score["average_proportion"], - } - - if self.output_files is not None: - try: - os.makedirs(self.output_dir, exist_ok=True) - self.write_results_to_file(translations, delays, scores) - except BaseException as be: - print(f"Failed to write results to {self.output_dir}.") - print(be) - print("Skip writing predictions") - - return scores - - def write_results_to_file(self, translations, delays, scores): - if self.output_files["text"] is not None: - with open(self.output_files["text"], "w") as f: - for line in translations: - f.write(line + "\n") - - if self.output_files["delay"] is not None: - with open(self.output_files["delay"], "w") as f: - for i, delay in enumerate(delays): - f.write( - json.dumps({"src_len": self.src_lengths()[i], "delays": delay}) - + "\n" - ) - - with open(self.output_files["scores"], "w") as f: - for key, value in scores.items(): - f.write(f"{key}, {value}\n") - - @classmethod - def _load_text_file(cls, 
file, split=False): - with open(file) as f: - if split: - return [r.strip().split() for r in f] - else: - return [r.strip() for r in f] - - @classmethod - def _load_text_from_json(cls, file): - list_to_return = [] - with open(file) as f: - content = json.load(f) - for item in content["utts"].values(): - list_to_return.append(item["output"]["text"].strip()) - return list_to_return - - @classmethod - def _load_wav_info_from_json(cls, file): - list_to_return = [] - with open(file) as f: - content = json.load(f) - for item in content["utts"].values(): - list_to_return.append( - { - "path": item["input"]["path"].strip(), - "length": item["input"]["length_ms"], - } - ) - return list_to_return - - @classmethod - def _load_wav_info_from_list(cls, file): - list_to_return = [] - with open(file) as f: - for line in f: - list_to_return.append( - { - "path": line.strip(), - } - ) - return list_to_return - - def __len__(self): - return len(self.data["tgt"]) diff --git a/examples/simultaneous_translation/eval/scorers/text_scorer.py b/examples/simultaneous_translation/eval/scorers/text_scorer.py deleted file mode 100644 index 649a2c7e5c..0000000000 --- a/examples/simultaneous_translation/eval/scorers/text_scorer.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . import register_scorer -from .scorer import SimulScorer - - -@register_scorer("text") -class SimulTextScorer(SimulScorer): - def __init__(self, args): - super().__init__(args) - self.data = { - "src": self._load_text_file(args.src_file, split=True), - "tgt": self._load_text_file(args.tgt_file, split=False), - } - - def send_src(self, sent_id, *args): - if self.steps[sent_id] >= len(self.data["src"][sent_id]): - dict_to_return = { - "sent_id": sent_id, - "segment_id": self.steps[sent_id], - "segment": self.eos, - } - # Consider EOS - self.steps[sent_id] = len(self.data["src"][sent_id]) + 1 - else: - dict_to_return = { - "sent_id": sent_id, - "segment_id": self.steps[sent_id], - "segment": self.data["src"][sent_id][self.steps[sent_id]], - } - - self.steps[sent_id] += 1 - - return dict_to_return - - def src_lengths(self): - # +1 for eos - return [len(sent) + 1 for sent in self.data["src"]] diff --git a/examples/simultaneous_translation/eval/server.py b/examples/simultaneous_translation/eval/server.py deleted file mode 100644 index e44ceaff85..0000000000 --- a/examples/simultaneous_translation/eval/server.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-import argparse -import json -import sys - -from scorers import build_scorer -from tornado import ioloop, web - - -DEFAULT_HOSTNAME = "localhost" -DEFAULT_PORT = 12321 - - -class ScorerHandler(web.RequestHandler): - def initialize(self, scorer): - self.scorer = scorer - - -class EvalSessionHandler(ScorerHandler): - def post(self): - self.scorer.reset() - - def get(self): - r = json.dumps(self.scorer.get_info()) - self.write(r) - - -class ResultHandler(ScorerHandler): - def get(self): - r = json.dumps(self.scorer.score()) - self.write(r) - - -class SourceHandler(ScorerHandler): - def get(self): - sent_id = int(self.get_argument("sent_id")) - segment_size = None - if "segment_size" in self.request.arguments: - string = self.get_argument("segment_size") - if len(string) > 0: - segment_size = int(string) - - r = json.dumps(self.scorer.send_src(int(sent_id), segment_size)) - - self.write(r) - - -class HypothesisHandler(ScorerHandler): - def put(self): - sent_id = int(self.get_argument("sent_id")) - list_of_tokens = self.request.body.decode("utf-8").strip().split() - self.scorer.recv_hyp(sent_id, list_of_tokens) - - -def add_args(): - parser = argparse.ArgumentParser() - # fmt: off - parser.add_argument('--hostname', type=str, default=DEFAULT_HOSTNAME, - help='Server hostname') - parser.add_argument('--port', type=int, default=DEFAULT_PORT, - help='Server port number') - - args, _ = parser.parse_known_args() - # fmt: on - return args - - -def start_server(scorer, hostname=DEFAULT_HOSTNAME, port=DEFAULT_PORT, debug=False): - app = web.Application( - [ - (r"/result", ResultHandler, dict(scorer=scorer)), - (r"/src", SourceHandler, dict(scorer=scorer)), - (r"/hypo", HypothesisHandler, dict(scorer=scorer)), - (r"/", EvalSessionHandler, dict(scorer=scorer)), - ], - debug=debug, - ) - app.listen(port, max_buffer_size=1024 ** 3) - sys.stdout.write(f"Evaluation Server Started. Listening to port {port}\n") - ioloop.IOLoop.current().start() - - -if __name__ == "__main__": - args = add_args() - scorer = build_scorer(args) - start_server(scorer, args.hostname, args.port, args.debug) diff --git a/examples/simultaneous_translation/models/__init__.py b/examples/simultaneous_translation/models/__init__.py index 083da43732..257a96593f 100644 --- a/examples/simultaneous_translation/models/__init__.py +++ b/examples/simultaneous_translation/models/__init__.py @@ -7,7 +7,7 @@ import os -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): model_name = file[: file.find(".py")] importlib.import_module( diff --git a/examples/simultaneous_translation/models/convtransformer_simul_trans.py b/examples/simultaneous_translation/models/convtransformer_simul_trans.py new file mode 100644 index 0000000000..4a26422f65 --- /dev/null +++ b/examples/simultaneous_translation/models/convtransformer_simul_trans.py @@ -0,0 +1,204 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
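+#
+# Simultaneous speech-to-text translation models built on ConvTransformer:
+# a variant with a monotonic-attention decoder, a variant with an
+# augmented-memory encoder, and a streaming variant with an Emformer encoder.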
+ +from fairseq import checkpoint_utils +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_text import ( + ConvTransformerModel, + convtransformer_espnet, + ConvTransformerEncoder, +) +from fairseq.models.speech_to_text.modules.augmented_memory_attention import ( + augmented_memory, + SequenceEncoder, + AugmentedMemoryConvTransformerEncoder, +) + +from torch import nn, Tensor +from typing import Dict, List +from fairseq.models.speech_to_text.modules.emformer import NoSegAugmentedMemoryTransformerEncoderLayer + +@register_model("convtransformer_simul_trans") +class SimulConvTransformerModel(ConvTransformerModel): + """ + Implementation of the paper: + + SimulMT to SimulST: Adapting Simultaneous Text Translation to + End-to-End Simultaneous Speech Translation + + https://www.aclweb.org/anthology/2020.aacl-main.58.pdf + """ + + @staticmethod + def add_args(parser): + super(SimulConvTransformerModel, SimulConvTransformerModel).add_args(parser) + parser.add_argument( + "--train-monotonic-only", + action="store_true", + default=False, + help="Only train monotonic attention", + ) + + @classmethod + def build_decoder(cls, args, task, embed_tokens): + tgt_dict = task.tgt_dict + + from examples.simultaneous_translation.models.transformer_monotonic_attention import ( + TransformerMonotonicDecoder, + ) + + decoder = TransformerMonotonicDecoder(args, tgt_dict, embed_tokens) + + if getattr(args, "load_pretrained_decoder_from", None): + decoder = checkpoint_utils.load_pretrained_component_from_model( + component=decoder, checkpoint=args.load_pretrained_decoder_from + ) + return decoder + + +@register_model_architecture( + "convtransformer_simul_trans", "convtransformer_simul_trans_espnet" +) +def convtransformer_simul_trans_espnet(args): + convtransformer_espnet(args) + + +@register_model("convtransformer_augmented_memory") +@augmented_memory +class AugmentedMemoryConvTransformerModel(SimulConvTransformerModel): + @classmethod + def build_encoder(cls, args): + encoder = SequenceEncoder(args, AugmentedMemoryConvTransformerEncoder(args)) + + if getattr(args, "load_pretrained_encoder_from", None) is not None: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=args.load_pretrained_encoder_from + ) + + return encoder + + +@register_model_architecture( + "convtransformer_augmented_memory", "convtransformer_augmented_memory" +) +def augmented_memory_convtransformer_espnet(args): + convtransformer_espnet(args) + + +# ============================================================================ # +# Convtransformer +# with monotonic attention decoder +# with emformer encoder +# ============================================================================ # + + +class ConvTransformerEmformerEncoder(ConvTransformerEncoder): + def __init__(self, args): + super().__init__(args) + stride = self.conv_layer_stride(args) + trf_left_context = args.segment_left_context // stride + trf_right_context = args.segment_right_context // stride + context_config = [trf_left_context, trf_right_context] + self.transformer_layers = nn.ModuleList( + [ + NoSegAugmentedMemoryTransformerEncoderLayer( + input_dim=args.encoder_embed_dim, + num_heads=args.encoder_attention_heads, + ffn_dim=args.encoder_ffn_embed_dim, + num_layers=args.encoder_layers, + dropout_in_attn=args.dropout, + dropout_on_attn=args.dropout, + dropout_on_fc1=args.dropout, + dropout_on_fc2=args.dropout, + activation_fn=args.activation_fn, + 
context_config=context_config, + segment_size=args.segment_length, + max_memory_size=args.max_memory_size, + scaled_init=True, # TODO: use constant for now. + tanh_on_mem=args.amtrf_tanh_on_mem, + ) + ] + ) + self.conv_transformer_encoder = ConvTransformerEncoder(args) + + def forward(self, src_tokens, src_lengths): + encoder_out: Dict[str, List[Tensor]] = self.conv_transformer_encoder(src_tokens, src_lengths.to(src_tokens.device)) + output = encoder_out["encoder_out"][0] + encoder_padding_masks = encoder_out["encoder_padding_mask"] + + return { + "encoder_out": [output], + # This is because that in the original implementation + # the output didn't consider the last segment as right context. + "encoder_padding_mask": [encoder_padding_masks[0][:, : output.size(0)]] if len(encoder_padding_masks) > 0 + else [], + "encoder_embedding": [], + "encoder_states": [], + "src_tokens": [], + "src_lengths": [], + } + + @staticmethod + def conv_layer_stride(args): + # TODO: make it configurable from the args + return 4 + + +@register_model("convtransformer_emformer") +class ConvtransformerEmformer(SimulConvTransformerModel): + @staticmethod + def add_args(parser): + super(ConvtransformerEmformer, ConvtransformerEmformer).add_args(parser) + + parser.add_argument( + "--segment-length", + type=int, + metavar="N", + help="length of each segment (not including left context / right context)", + ) + parser.add_argument( + "--segment-left-context", + type=int, + help="length of left context in a segment", + ) + parser.add_argument( + "--segment-right-context", + type=int, + help="length of right context in a segment", + ) + parser.add_argument( + "--max-memory-size", + type=int, + default=-1, + help="Right context for the segment.", + ) + parser.add_argument( + "--amtrf-tanh-on-mem", + default=False, + action="store_true", + help="whether to use tanh on memory vector", + ) + + @classmethod + def build_encoder(cls, args): + encoder = ConvTransformerEmformerEncoder(args) + if getattr(args, "load_pretrained_encoder_from", None): + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=args.load_pretrained_encoder_from + ) + return encoder + + +@register_model_architecture( + "convtransformer_emformer", + "convtransformer_emformer", +) +def convtransformer_emformer_base(args): + convtransformer_espnet(args) diff --git a/examples/simultaneous_translation/models/transformer_monotonic_attention.py b/examples/simultaneous_translation/models/transformer_monotonic_attention.py index ab8adf3aab..7b9414b0eb 100644 --- a/examples/simultaneous_translation/models/transformer_monotonic_attention.py +++ b/examples/simultaneous_translation/models/transformer_monotonic_attention.py @@ -3,26 +3,44 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
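# A small sketch of the context bookkeeping in ConvTransformerEmformerEncoder above:
# the segment left/right contexts are specified in input frames and are divided by
# the convolutional subsampling stride (hard-coded to 4 in conv_layer_stride) before
# being passed to NoSegAugmentedMemoryTransformerEncoderLayer. The example values
# below are made up for illustration.
def emformer_context_config(segment_left_context: int,
                            segment_right_context: int,
                            stride: int = 4) -> list:
    trf_left_context = segment_left_context // stride
    trf_right_context = segment_right_context // stride
    return [trf_left_context, trf_right_context]


# e.g. 128 / 64 frames of acoustic context become 32 / 16 encoder steps
assert emformer_context_config(128, 64) == [32, 16]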
+from typing import Dict, List, NamedTuple, Optional + import torch import torch.nn as nn -import torch.nn.functional as F from examples.simultaneous_translation.modules.monotonic_transformer_layer import ( TransformerMonotonicDecoderLayer, TransformerMonotonicEncoderLayer, ) -from fairseq.models import register_model, register_model_architecture +from fairseq.models import ( + register_model, + register_model_architecture, +) from fairseq.models.transformer import ( - TransformerDecoder, - TransformerEncoder, TransformerModel, + TransformerEncoder, + TransformerDecoder, base_architecture, transformer_iwslt_de_en, transformer_vaswani_wmt_en_de_big, + tiny_architecture ) - +from torch import Tensor DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 +READ_ACTION = 0 +WRITE_ACTION = 1 + +TransformerMonotonicDecoderOut = NamedTuple( + "TransformerMonotonicDecoderOut", + [ + ("action", int), + ("p_choose", Optional[Tensor]), + ("attn_list", Optional[List[Optional[Dict[str, Tensor]]]]), + ("encoder_out", Optional[Dict[str, List[Tensor]]]), + ("encoder_padding_mask", Optional[Tensor]), + ], +) @register_model("transformer_unidirectional") @@ -33,7 +51,7 @@ def build_encoder(cls, args, src_dict, embed_tokens): @register_model("transformer_monotonic") -class TransformerMonotonicModel(TransformerModel): +class TransformerModelSimulTrans(TransformerModel): @classmethod def build_encoder(cls, args, src_dict, embed_tokens): return TransformerMonotonicEncoder(args, src_dict, embed_tokens) @@ -42,80 +60,6 @@ def build_encoder(cls, args, src_dict, embed_tokens): def build_decoder(cls, args, tgt_dict, embed_tokens): return TransformerMonotonicDecoder(args, tgt_dict, embed_tokens) - def _indices_from_states(self, states): - if type(states["indices"]["src"]) == list: - if next(self.parameters()).is_cuda: - tensor = torch.cuda.LongTensor - else: - tensor = torch.LongTensor - - src_indices = tensor( - [states["indices"]["src"][: 1 + states["steps"]["src"]]] - ) - - tgt_indices = tensor( - [[self.decoder.dictionary.eos()] + states["indices"]["tgt"]] - ) - else: - src_indices = states["indices"]["src"][: 1 + states["steps"]["src"]] - tgt_indices = states["indices"]["tgt"] - - return src_indices, None, tgt_indices - - def predict_from_states(self, states): - decoder_states = self.decoder.output_layer(states["decoder_features"]) - lprobs = self.get_normalized_probs([decoder_states[:, -1:]], log_probs=True) - - index = lprobs.argmax(dim=-1) - - token = self.decoder.dictionary.string(index) - - return token, index[0, 0].item() - - def decision_from_states(self, states): - """ - This funcion take states dictionary as input, and gives the agent - a decision of whether read a token from server. 
Moreover, the decoder - states are also calculated here so we can directly generate a target - token without recompute every thing - """ - - self.eval() - - if len(states["tokens"]["src"]) == 0: - return 0 - - src_indices, src_lengths, tgt_indices = self._indices_from_states(states) - - # Update encoder states if needed - if ( - "encoder_states" not in states - or states["encoder_states"][0].size(1) <= states["steps"]["src"] - ): - encoder_out_dict = self.encoder(src_indices, src_lengths) - states["encoder_states"] = encoder_out_dict - else: - encoder_out_dict = states["encoder_states"] - - # online means we still need tokens to feed the model - states["model_states"]["online"] = not ( - states["finish_read"] - and len(states["tokens"]["src"]) == states["steps"]["src"] - ) - - states["model_states"]["steps"] = states["steps"] - - x, outputs = self.decoder.forward( - prev_output_tokens=tgt_indices, - encoder_out=encoder_out_dict, - incremental_state=states["model_states"], - features_only=True, - ) - - states["decoder_features"] = x - - return outputs["action"] - class TransformerMonotonicEncoder(TransformerEncoder): def __init__(self, args, dictionary, embed_tokens): @@ -124,7 +68,10 @@ def __init__(self, args, dictionary, embed_tokens): self.dictionary = dictionary self.layers = nn.ModuleList([]) self.layers.extend( - [TransformerMonotonicEncoderLayer(args) for i in range(args.encoder_layers)] + [ + TransformerMonotonicEncoderLayer(args) + for i in range(args.encoder_layers) + ] ) @@ -148,13 +95,21 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.layers = nn.ModuleList([]) self.layers.extend( [ - TransformerMonotonicDecoderLayer(args, no_encoder_attn) + TransformerMonotonicDecoderLayer(args) for _ in range(args.decoder_layers) ] ) + self.policy_criterion = getattr(args, "policy_criterion", "any") + self.num_updates = None + + def set_num_updates(self, num_updates): + self.num_updates = num_updates def pre_attention( - self, prev_output_tokens, encoder_out_dict, incremental_state=None + self, + prev_output_tokens, + encoder_out_dict: Dict[str, List[Tensor]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, ): positions = ( self.embed_positions( @@ -169,7 +124,6 @@ def pre_attention( prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] - # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) @@ -178,18 +132,28 @@ def pre_attention( if positions is not None: x += positions + x = self.dropout_module(x) # B x T x C -> T x B x C x = x.transpose(0, 1) - encoder_out = encoder_out_dict.encoder_out - encoder_padding_mask = encoder_out_dict.encoder_padding_mask + encoder_out = encoder_out_dict["encoder_out"][0] + + if "encoder_padding_mask" in encoder_out_dict: + encoder_padding_mask = ( + encoder_out_dict["encoder_padding_mask"][0] + if encoder_out_dict["encoder_padding_mask"] + and len(encoder_out_dict["encoder_padding_mask"]) > 0 + else None + ) + else: + encoder_padding_mask = None return x, encoder_out, encoder_padding_mask def post_attention(self, x): - if self.layer_norm: + if self.layer_norm is not None: x = self.layer_norm(x) # T x B x C -> B x T x C @@ -200,8 +164,32 @@ def post_attention(self, x): return x + def clean_cache( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + end_id: Optional[int] = None, + ): + """ + Clean cache in the monotonic layers. 
+ The cache is generated because of a forward pass of decoder has run but no prediction, + so that the self attention key value in decoder is written in the incremental state. + end_id is the last idx of the layers + """ + if end_id is None: + end_id = len(self.layers) + + for index, layer in enumerate(self.layers): + if index < end_id: + layer.prune_incremental_state(incremental_state) + def extract_features( - self, prev_output_tokens, encoder_out, incremental_state=None, **unused + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, # unused + alignment_layer: Optional[int] = None, # unused + alignment_heads: Optional[int] = None, # unsed ): """ Similar to *forward* but only return features. @@ -212,13 +200,15 @@ def extract_features( - a dictionary with any model-specific outputs """ # incremental_state = None + assert encoder_out is not None (x, encoder_outs, encoder_padding_mask) = self.pre_attention( prev_output_tokens, encoder_out, incremental_state ) attn = None inner_states = [x] - attn_list = [] - step_list = [] + attn_list: List[Optional[Dict[str, Tensor]]] = [] + + p_choose = torch.tensor([1.0]) for i, layer in enumerate(self.layers): @@ -236,58 +226,42 @@ def extract_features( attn_list.append(attn) if incremental_state is not None: - curr_steps = layer.get_steps(incremental_state) - step_list.append(curr_steps) - - if incremental_state.get("online", False): - p_choose = ( - attn["p_choose"].squeeze(0).squeeze(1).gather(1, curr_steps.t()) - ) - - new_steps = curr_steps + (p_choose < 0.5).t().type_as(curr_steps) - - if (new_steps >= incremental_state["steps"]["src"]).any(): - # We need to prune the last self_attn saved_state - # if model decide not to read - # otherwise there will be duplicated saved_state - for j in range(i + 1): - self.layers[j].prune_incremental_state(incremental_state) - - return x, {"action": 0} - - if incremental_state is not None and not incremental_state.get("online", False): - # Here is for fast evaluation - fastest_step = ( - torch.max(torch.cat(step_list, dim=1), dim=1, keepdim=True)[0] + 1 - ) - - if "fastest_step" in incremental_state: - incremental_state["fastest_step"] = torch.cat( - [incremental_state["fastest_step"], fastest_step], dim=1 - ) - else: - incremental_state["fastest_step"] = fastest_step + if_online = incremental_state["online"]["only"] + assert if_online is not None + if if_online.to(torch.bool): + # Online indicates that the encoder states are still changing + assert attn is not None + if self.policy_criterion == "any": + # Any head decide to read than read + head_read = layer.encoder_attn._get_monotonic_buffer(incremental_state)["head_read"] + assert head_read is not None + if head_read.any(): + # We need to prune the last self_attn saved_state + # if model decide not to read + # otherwise there will be duplicated saved_state + self.clean_cache(incremental_state, i + 1) + + return x, TransformerMonotonicDecoderOut( + action=0, + p_choose=p_choose, + attn_list=None, + encoder_out=None, + encoder_padding_mask=None, + ) x = self.post_attention(x) - return x, { - "action": 1, - "attn_list": attn_list, - "step_list": step_list, - "encoder_out": encoder_out, - "encoder_padding_mask": encoder_padding_mask, - } - - def reorder_incremental_state(self, incremental_state, new_order): - super().reorder_incremental_state(incremental_state, new_order) - if "fastest_step" in incremental_state: - 
incremental_state["fastest_step"] = incremental_state[ - "fastest_step" - ].index_select(0, new_order) + return x, TransformerMonotonicDecoderOut( + action=1, + p_choose=p_choose, + attn_list=attn_list, + encoder_out=encoder_out, + encoder_padding_mask=encoder_padding_mask, + ) @register_model_architecture("transformer_monotonic", "transformer_monotonic") -def base_monotonic_rchitecture(args): +def base_monotonic_architecture(args): base_architecture(args) args.encoder_unidirectional = getattr(args, "encoder_unidirectional", False) @@ -297,7 +271,7 @@ def base_monotonic_rchitecture(args): ) def transformer_monotonic_iwslt_de_en(args): transformer_iwslt_de_en(args) - base_monotonic_rchitecture(args) + base_monotonic_architecture(args) # parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) @@ -320,3 +294,9 @@ def transformer_monotonic_vaswani_wmt_en_fr_big(args): ) def transformer_unidirectional_iwslt_de_en(args): transformer_iwslt_de_en(args) + + +@register_model_architecture("transformer_monotonic", "transformer_monotonic_tiny") +def monotonic_tiny_architecture(args): + tiny_architecture(args) + base_monotonic_architecture(args) diff --git a/examples/simultaneous_translation/modules/__init__.py b/examples/simultaneous_translation/modules/__init__.py index ad64774de4..f5ea180f9b 100644 --- a/examples/simultaneous_translation/modules/__init__.py +++ b/examples/simultaneous_translation/modules/__init__.py @@ -3,12 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import importlib -import os +import os +import importlib from fairseq import registry - ( build_monotonic_attention, register_monotonic_attention, @@ -16,7 +15,7 @@ _, ) = registry.setup_registry("--simul-type") -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): model_name = file[: file.find(".py")] importlib.import_module( diff --git a/examples/simultaneous_translation/modules/fixed_pre_decision.py b/examples/simultaneous_translation/modules/fixed_pre_decision.py new file mode 100644 index 0000000000..3991414aed --- /dev/null +++ b/examples/simultaneous_translation/modules/fixed_pre_decision.py @@ -0,0 +1,190 @@ +from functools import partial + +import torch +from torch import Tensor +import math +import torch.nn.functional as F + +from . 
import register_monotonic_attention +from .monotonic_multihead_attention import ( + MonotonicAttention, + MonotonicInfiniteLookbackAttention, + WaitKAttention +) +from typing import Dict, Optional + + +def fixed_pooling_monotonic_attention(monotonic_attention): + def create_model(monotonic_attention, klass): + class FixedStrideMonotonicAttention(monotonic_attention): + def __init__(self, args): + self.waitk_lagging = 0 + self.num_heads = 0 + self.noise_mean = 0.0 + self.noise_var = 0.0 + super().__init__(args) + self.pre_decision_type = args.fixed_pre_decision_type + self.pre_decision_ratio = args.fixed_pre_decision_ratio + self.pre_decision_pad_threshold = args.fixed_pre_decision_pad_threshold + assert self.pre_decision_ratio > 1 + + if args.fixed_pre_decision_type == "average": + self.pooling_layer = torch.nn.AvgPool1d( + kernel_size=self.pre_decision_ratio, + stride=self.pre_decision_ratio, + ceil_mode=True, + ) + elif args.fixed_pre_decision_type == "last": + + def last(key): + if key.size(2) < self.pre_decision_ratio: + return key + else: + k = key[ + :, + :, + self.pre_decision_ratio - 1:: self.pre_decision_ratio, + ].contiguous() + if key.size(-1) % self.pre_decision_ratio != 0: + k = torch.cat([k, key[:, :, -1:]], dim=-1).contiguous() + return k + + self.pooling_layer = last + else: + raise NotImplementedError + + @staticmethod + def add_args(parser): + super( + FixedStrideMonotonicAttention, FixedStrideMonotonicAttention + ).add_args(parser) + parser.add_argument( + "--fixed-pre-decision-ratio", + type=int, + required=True, + help=( + "Ratio for the fixed pre-decision," + "indicating how many encoder steps will start" + "simultaneous decision making process." + ), + ) + parser.add_argument( + "--fixed-pre-decision-type", + default="average", + choices=["average", "last"], + help="Pooling type", + ) + parser.add_argument( + "--fixed-pre-decision-pad-threshold", + type=float, + default=0.3, + help="If a part of the sequence has pad" + ",the threshold the pooled part is a pad.", + ) + + def insert_zeros(self, x): + bsz_num_heads, tgt_len, src_len = x.size() + stride = self.pre_decision_ratio + weight = F.pad(torch.ones(1, 1, 1).to(x), (stride - 1, 0)) + x_upsample = F.conv_transpose1d( + x.view(-1, src_len).unsqueeze(1), + weight, + stride=stride, + padding=0, + ) + return x_upsample.squeeze(1).view(bsz_num_heads, tgt_len, -1) + + def p_choose( + self, + query: Optional[Tensor], + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + ): + assert key is not None + assert query is not None + src_len = key.size(0) + tgt_len = query.size(0) + batch_size = query.size(1) + + key_pool = self.pooling_layer(key.transpose(0, 2)).transpose(0, 2) + + if key_padding_mask is not None: + key_padding_mask_pool = ( + self.pooling_layer(key_padding_mask.unsqueeze(0).float()) + .squeeze(0) + .gt(self.pre_decision_pad_threshold) + ) + # Make sure at least one element is not pad + key_padding_mask_pool[:, 0] = 0 + else: + key_padding_mask_pool = None + + if incremental_state is not None: + # The floor instead of ceil is used for inference + # But make sure the length key_pool at least 1 + if ( + max(1, math.floor(key.size(0) / self.pre_decision_ratio)) + ) < key_pool.size(0): + key_pool = key_pool[:-1] + if key_padding_mask_pool is not None: + key_padding_mask_pool = key_padding_mask_pool[:-1] + + p_choose_pooled = self.p_choose_from_qk( + query, + key_pool, + key_padding_mask_pool, + 
incremental_state=incremental_state, + ) + + # Upsample, interpolate zeros + p_choose = self.insert_zeros(p_choose_pooled) + + if p_choose.size(-1) < src_len: + # Append zeros if the upsampled p_choose is shorter than src_len + p_choose = torch.cat( + [ + p_choose, + torch.zeros( + p_choose.size(0), + tgt_len, + src_len - p_choose.size(-1) + ).to(p_choose) + ], + dim=2 + ) + else: + # can be larger than src_len because we used ceil before + p_choose = p_choose[:, :, :src_len] + p_choose[:, :, -1] = p_choose_pooled[:, :, -1] + + assert list(p_choose.size()) == [ + batch_size * self.num_heads, + tgt_len, + src_len, + ] + + return p_choose + + FixedStrideMonotonicAttention.__name__ = klass.__name__ + return FixedStrideMonotonicAttention + + return partial(create_model, monotonic_attention) + + +@register_monotonic_attention("waitk_fixed_pre_decision") +@fixed_pooling_monotonic_attention(WaitKAttention) +class WaitKAttentionFixedStride: + pass + + +@register_monotonic_attention("hard_aligned_fixed_pre_decision") +@fixed_pooling_monotonic_attention(MonotonicAttention) +class MonotonicAttentionFixedStride: + pass + + +@register_monotonic_attention("infinite_lookback_fixed_pre_decision") +@fixed_pooling_monotonic_attention(MonotonicInfiniteLookbackAttention) +class MonotonicInfiniteLookbackAttentionFixedStride: + pass diff --git a/examples/simultaneous_translation/modules/monotonic_multihead_attention.py b/examples/simultaneous_translation/modules/monotonic_multihead_attention.py index c09725ac9a..06d20d8d4a 100644 --- a/examples/simultaneous_translation/modules/monotonic_multihead_attention.py +++ b/examples/simultaneous_translation/modules/monotonic_multihead_attention.py @@ -6,30 +6,49 @@ import math import torch +from torch import Tensor import torch.nn as nn -import torch.nn.functional as F -from examples.simultaneous_translation.utils.functions import ( - exclusive_cumprod, - lengths_to_mask, + +from examples.simultaneous_translation.utils.p_choose_strategy import ( + learnable_p_choose, + waitk_p_choose +) + +from examples.simultaneous_translation.utils.monotonic_attention import ( + expected_alignment_from_p_choose, + expected_soft_attention, + mass_preservation, ) -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state from fairseq.modules import MultiheadAttention -from fairseq.utils import convert_padding_direction from . 
import register_monotonic_attention +from typing import Dict, Optional -@with_incremental_state -class MonotonicAttention(nn.Module): +@register_monotonic_attention("hard_aligned") +class MonotonicAttention(MultiheadAttention): """ Abstract class of monotonic attentions """ + k_in_proj: Dict[str, nn.Linear] + q_in_proj: Dict[str, nn.Linear] def __init__(self, args): - self.eps = args.attention_eps - self.mass_preservation = args.mass_preservation + super().__init__( + embed_dim=args.decoder_embed_dim, + num_heads=args.decoder_attention_heads, + kdim=getattr(args, "encoder_embed_dim", None), + vdim=getattr(args, "encoder_embed_dim", None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + ) + + self.soft_attention = False + self.eps = getattr(args, "attention_eps", True) + self.mass_preservation = getattr(args, "mass_preservation", True) + + self.noise_type = args.noise_type self.noise_mean = args.noise_mean self.noise_var = args.noise_var @@ -40,442 +59,372 @@ def __init__(self, args): else 0 ) + self.k_in_proj = {"monotonic": self.k_proj} + self.q_in_proj = {"monotonic": self.q_proj} + self.chunk_size = None + @staticmethod def add_args(parser): # fmt: off - parser.add_argument('--no-mass-preservation', action="store_false", dest="mass_preservation", + parser.add_argument('--no-mass-preservation', action="store_false", + dest="mass_preservation", help='Do not stay on the last token when decoding') - parser.add_argument('--mass-preservation', action="store_true", dest="mass_preservation", + parser.add_argument('--mass-preservation', action="store_true", + dest="mass_preservation", help='Stay on the last token when decoding') parser.set_defaults(mass_preservation=True) - parser.add_argument('--noise-var', type=float, default=1.0, help='Variance of discretness noise') parser.add_argument('--noise-mean', type=float, default=0.0, help='Mean of discretness noise') - parser.add_argument('--energy-bias', action="store_true", default=False, + parser.add_argument('--noise-type', type=str, default="flat", + help='Type of discretness noise') + parser.add_argument('--energy-bias', action="store_true", + default=False, help='Bias for energy') parser.add_argument('--energy-bias-init', type=float, default=-2.0, help='Initial value of the bias for energy') parser.add_argument('--attention-eps', type=float, default=1e-6, help='Epsilon when calculating expected attention') - # fmt: on - - def p_choose(self, *args): - raise NotImplementedError - def input_projections(self, *args): - raise NotImplementedError - - def attn_energy(self, q_proj, k_proj, key_padding_mask=None): + def energy_from_qk( + self, + query: Tensor, + key: Tensor, + energy_type: str, + key_padding_mask: Optional[Tensor] = None, + bias: int = 0 + ): """ - Calculating monotonic energies - - ============================================================ - Expected input size - q_proj: bsz * num_heads, tgt_len, self.head_dim - k_proj: bsz * num_heads, src_len, self.head_dim - key_padding_mask: bsz, src_len - attn_mask: tgt_len, src_len + Compute energy from query and key + q_func_value is a tuple looks like + (q_proj_func, q_tensor) + q_tensor size: bsz, tgt_len, emb_dim + k_tensor size: bsz, src_len, emb_dim + key_padding_mask size: bsz, src_len + attn_mask: bsz, src_len """ - bsz, tgt_len, embed_dim = q_proj.size() - bsz = bsz // self.num_heads - src_len = k_proj.size(1) - attn_energy = torch.bmm(q_proj, k_proj.transpose(1, 2)) + self.energy_bias + length, bsz, _ = query.size() + q = 
self.q_in_proj[energy_type].forward(query) + q = ( + q.contiguous() + .view(length, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + q = q * self.scaling + length, bsz, _ = key.size() + k = self.k_in_proj[energy_type].forward(key) + k = ( + k.contiguous() + .view(length, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) - attn_energy = attn_energy.view(bsz, self.num_heads, tgt_len, src_len) + energy = torch.bmm(q, k.transpose(1, 2)) + bias if key_padding_mask is not None: - attn_energy = attn_energy.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2).bool(), - float("-inf"), + energy = energy.masked_fill( + key_padding_mask.unsqueeze(1).to(torch.bool), + - float("inf") ) - return attn_energy + return energy - def expected_alignment_train(self, p_choose, key_padding_mask): - """ - Calculating expected alignment for MMA - Mask is not need because p_choose will be 0 if masked - - q_ij = (1 − p_{ij−1})q_{ij−1} + a+{i−1j} - a_ij = p_ij q_ij - - parellel solution: - ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) - - ============================================================ - Expected input size - p_choose: bsz * num_heads, tgt_len, src_len - """ - - # p_choose: bsz * num_heads, tgt_len, src_len - bsz_num_heads, tgt_len, src_len = p_choose.size() - - # cumprod_1mp : bsz * num_heads, tgt_len, src_len - cumprod_1mp = exclusive_cumprod(1 - p_choose, dim=2, eps=self.eps) - cumprod_1mp_clamp = torch.clamp(cumprod_1mp, self.eps, 1.0) - - init_attention = p_choose.new_zeros([bsz_num_heads, 1, src_len]) - init_attention[:, :, 0] = 1.0 - - previous_attn = [init_attention] - - for i in range(tgt_len): - # p_choose: bsz * num_heads, tgt_len, src_len - # cumprod_1mp_clamp : bsz * num_heads, tgt_len, src_len - # previous_attn[i]: bsz * num_heads, 1, src_len - # alpha_i: bsz * num_heads, src_len - alpha_i = ( - p_choose[:, i] - * cumprod_1mp[:, i] - * torch.cumsum(previous_attn[i][:, 0] / cumprod_1mp_clamp[:, i], dim=1) - ).clamp(0, 1.0) - previous_attn.append(alpha_i.unsqueeze(1)) - - # alpha: bsz * num_heads, tgt_len, src_len - alpha = torch.cat(previous_attn[1:], dim=1) - - if self.mass_preservation: - # Last token has the residual probabilities - alpha[:, :, -1] = 1 - alpha[:, :, :-1].sum(dim=-1).clamp(0.0, 1.0) + def p_choose_from_qk(self, query, key, key_padding_mask, incremental_states=None): + monotonic_energy = self.energy_from_qk( + query, + key, + "monotonic", + key_padding_mask=key_padding_mask, + bias=self.energy_bias, + ) - assert not torch.isnan(alpha).any(), "NaN detected in alpha." 
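# A standalone sketch of the expected-alignment computation that the removed
# expected_alignment_train above implements, and that the imported
# expected_alignment_from_p_choose helper is assumed to provide. It follows the
# recurrence from the old docstring,
#     q_{i,j} = (1 - p_{i,j-1}) * q_{i,j-1} + alpha_{i-1,j},    alpha_{i,j} = p_{i,j} * q_{i,j},
# via its parallel form alpha_i = p_i * cumprod(1 - p_i) * cumsum(alpha_{i-1} / cumprod(1 - p_i)).
import torch


def expected_alignment(p_choose: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # p_choose: (bsz * num_heads, tgt_len, src_len), stepwise probabilities in [0, 1]
    bsz, tgt_len, src_len = p_choose.size()

    # exclusive cumulative product of (1 - p) along the source dimension
    cumprod_1mp = torch.cumprod(
        torch.cat([p_choose.new_ones(bsz, tgt_len, 1), 1 - p_choose[:, :, :-1]], dim=2),
        dim=2,
    )
    cumprod_1mp_clamp = cumprod_1mp.clamp(eps, 1.0)

    alpha_prev = p_choose.new_zeros(bsz, src_len)
    alpha_prev[:, 0] = 1.0  # all attention mass starts on the first source token
    alphas = []
    for i in range(tgt_len):
        alpha_i = (
            p_choose[:, i]
            * cumprod_1mp[:, i]
            * torch.cumsum(alpha_prev / cumprod_1mp_clamp[:, i], dim=1)
        ).clamp(0, 1.0)
        alphas.append(alpha_i)
        alpha_prev = alpha_i

    # alpha: (bsz * num_heads, tgt_len, src_len)
    return torch.stack(alphas, dim=1)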
+ p_choose = learnable_p_choose( + monotonic_energy, + self.noise_mean, + self.noise_var, + self.training + ) + return p_choose - return alpha + def p_choose(self, query, key, key_padding_mask, incremental_states=None): + return self.p_choose_from_qk(self, query, key, key_padding_mask) - def expected_alignment_infer(self, p_choose, key_padding_mask, incremental_state): + def monotonic_attention_process_infer( + self, + query: Optional[Tensor], + key: Optional[Tensor], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + ): """ - Calculating mo alignment for MMA during inference time - - ============================================================ - Expected input size - p_choose: bsz * num_heads, tgt_len, src_len - key_padding_mask: bsz * src_len - incremental_state: dict + Monotonic attention at inference time + Notice that this function is designed for simuleval not sequence_generator """ - # p_choose: bsz * self.num_heads, src_len - bsz_num_heads, tgt_len, src_len = p_choose.size() - # One token at a time - assert tgt_len == 1 - p_choose = p_choose[:, 0, :] + assert query is not None + assert key is not None + if query.size(1) != 1: + raise RuntimeError( + "Simultaneous translation models don't support batch decoding." + ) + # 1. compute stepwise probability + p_choose = self.p_choose( + query, key, None, incremental_state + ).squeeze(1) + + # 2. Compute the alpha + src_len = key.size(0) + # Maximum steps allows in this iteration + max_steps = src_len - 1 if self.mass_preservation else src_len monotonic_cache = self._get_monotonic_buffer(incremental_state) - - # prev_monotonic_step: bsz, num_heads - bsz = bsz_num_heads // self.num_heads - prev_monotonic_step = monotonic_cache.get( - "step", p_choose.new_zeros([bsz, self.num_heads]).long() + # Step for each head + monotonic_step = monotonic_cache.get( + 'head_step', + p_choose.new_zeros(1, self.num_heads).long() ) - bsz, num_heads = prev_monotonic_step.size() - assert num_heads == self.num_heads - assert bsz * num_heads == bsz_num_heads - - # p_choose: bsz, num_heads, src_len - p_choose = p_choose.view(bsz, num_heads, src_len) + assert monotonic_step is not None + finish_read = monotonic_step.eq(max_steps) + p_choose_i = torch.tensor(1) - if key_padding_mask is not None: - src_lengths = src_len - key_padding_mask.sum(dim=1, keepdim=True).long() - else: - src_lengths = prev_monotonic_step.new_ones(bsz, 1) * src_len - - # src_lengths: bsz, num_heads - src_lengths = src_lengths.expand_as(prev_monotonic_step) - # new_monotonic_step: bsz, num_heads - new_monotonic_step = prev_monotonic_step - - step_offset = 0 - if key_padding_mask is not None: - if key_padding_mask[:, 0].any(): - # left_pad_source = True: - step_offset = key_padding_mask.sum(dim=-1, keepdim=True) - - max_steps = src_lengths - 1 if self.mass_preservation else src_lengths - - # finish_read: bsz, num_heads - finish_read = new_monotonic_step.eq(max_steps) - - while finish_read.sum().item() < bsz * self.num_heads: - # p_choose: bsz * self.num_heads, src_len + while finish_read.sum().item() < self.num_heads: + # p_choose: self.num_heads, src_len # only choose the p at monotonic steps - # p_choose_i: bsz , self.num_heads + # p_choose_i: 1, self.num_heads p_choose_i = ( p_choose.gather( - 2, - (step_offset + new_monotonic_step) - .unsqueeze(2) + 1, + monotonic_step .clamp(0, src_len - 1), ) - ).squeeze(2) + ) - action = ( + read_one_step = ( (p_choose_i < 0.5) - .type_as(prev_monotonic_step) + .type_as(monotonic_step) .masked_fill(finish_read, 0) ) # 1 x bsz # 
sample actions on unfinished seq - # 1 means stay, finish reading - # 0 means leave, continue reading - # dist = torch.distributions.bernoulli.Bernoulli(p_choose) - # action = dist.sample().type_as(finish_read) * (1 - finish_read) - - new_monotonic_step += action - - finish_read = new_monotonic_step.eq(max_steps) | (action == 0) - # finish_read = (~ (finish_read.sum(dim=1, keepdim=True) < self.num_heads / 2)) | finish_read - - monotonic_cache["step"] = new_monotonic_step - - # alpha: bsz * num_heads, 1, src_len - # new_monotonic_step: bsz, num_heads - alpha = p_choose.new_zeros([bsz * self.num_heads, src_len]).scatter( - 1, - (step_offset + new_monotonic_step) - .view(bsz * self.num_heads, 1) - .clamp(0, src_len - 1), - 1, - ) - - if not self.mass_preservation: - alpha = alpha.masked_fill( - (new_monotonic_step == max_steps).view(bsz * self.num_heads, 1), 0 - ) - - alpha = alpha.unsqueeze(1) - - self._set_monotonic_buffer(incremental_state, monotonic_cache) - - return alpha - - def v_proj_output(self, value): - raise NotImplementedError + # 0 means stay, finish reading + # 1 means leave, continue reading - def forward( - self, - query, - key, - value, - key_padding_mask=None, - incremental_state=None, - *args, - **kwargs, - ): - - tgt_len, bsz, embed_dim = query.size() - src_len = value.size(0) + monotonic_step += read_one_step - # stepwise prob - # p_choose: bsz * self.num_heads, tgt_len, src_len - p_choose = self.p_choose(query, key, key_padding_mask) + finish_read = monotonic_step.eq(max_steps) | (read_one_step == 0) - # expected alignment alpha - # bsz * self.num_heads, tgt_len, src_len - if incremental_state is not None: - alpha = self.expected_alignment_infer( - p_choose, key_padding_mask, incremental_state + # p_choose at last steps + p_choose_i = ( + p_choose.gather( + 1, + monotonic_step + .clamp(0, src_len - 1), ) - else: - alpha = self.expected_alignment_train(p_choose, key_padding_mask) - - # expected attention beta - # bsz * self.num_heads, tgt_len, src_len - beta = self.expected_attention( - alpha, query, key, value, key_padding_mask, incremental_state ) - attn_weights = beta - - v_proj = self.v_proj_output(value) - attn = torch.bmm(attn_weights.type_as(v_proj), v_proj) - - attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) - - attn = self.out_proj(attn) - - beta = beta.view(bsz, self.num_heads, tgt_len, src_len) - alpha = alpha.view(bsz, self.num_heads, tgt_len, src_len) - p_choose = p_choose.view(bsz, self.num_heads, tgt_len, src_len) + monotonic_cache["head_step"] = monotonic_step + # Whether a head is looking for new input + monotonic_cache["head_read"] = ( + monotonic_step.eq(max_steps) & (p_choose_i < 0.5) + ) + self._set_monotonic_buffer(incremental_state, monotonic_cache) - return attn, {"alpha": alpha, "beta": beta, "p_choose": p_choose} - - def reorder_incremental_state(self, incremental_state, new_order): - """Reorder buffered internal state (for incremental generation).""" - super().reorder_incremental_state(incremental_state, new_order) - input_buffer = self._get_monotonic_buffer(incremental_state) - if input_buffer is not None: - for k in input_buffer.keys(): - input_buffer[k] = input_buffer[k].index_select(0, new_order) - self._set_monotonic_buffer(incremental_state, input_buffer) - - def _get_monotonic_buffer(self, incremental_state): - return ( - utils.get_incremental_state( - self, - incremental_state, - "monotonic", + # 2. 
Update alpha + alpha = ( + p_choose + .new_zeros([self.num_heads, src_len]) + .scatter( + 1, + (monotonic_step) + .view(self.num_heads, 1).clamp(0, src_len - 1), + 1 ) - or {} ) - def _set_monotonic_buffer(self, incremental_state, buffer): - utils.set_incremental_state( - self, - incremental_state, - "monotonic", - buffer, - ) + if not self.mass_preservation: + alpha = alpha.masked_fill( + (monotonic_step == max_steps) + .view(self.num_heads, 1), + 0 + ) - def get_pointer(self, incremental_state): - return ( - utils.get_incremental_state( - self, - incremental_state, - "monotonic", + # 4. Compute Beta + if self.soft_attention: + monotonic_step = monotonic_step.t() + beta_mask = torch.arange(src_len).expand_as(alpha).gt(monotonic_step).unsqueeze(1) + # If it's soft attention just do softmax on current context + soft_energy = self.energy_from_qk( + query, + key, + "soft" ) - or {} - ) + beta = torch.nn.functional.softmax( + soft_energy.masked_fill(beta_mask, -float("inf")), dim=-1 + ) + # It could happen that a head doesn't move at all + beta = beta.masked_fill(monotonic_step.eq(0).unsqueeze(1), 0) + else: + # If it's hard attention just select the last state + beta = alpha - def get_fastest_pointer(self, incremental_state): - return self.get_pointer(incremental_state)["step"].max(0)[0] + return p_choose, alpha, beta - def set_pointer(self, incremental_state, p_choose): - curr_pointer = self.get_pointer(incremental_state) - if len(curr_pointer) == 0: - buffer = torch.zeros_like(p_choose) - else: - buffer = self.get_pointer(incremental_state)["step"] + def monotonic_attention_process_train( + self, + query: Optional[Tensor], + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + ): + """ + Calculating monotonic attention process for training + Including: + stepwise probability: p_choose + expected hard alignment: alpha + expected soft attention: beta + """ + assert query is not None + assert key is not None - buffer += (p_choose < 0.5).type_as(buffer) + # 1. compute stepwise probability + p_choose = self.p_choose_from_qk(query, key, key_padding_mask) - utils.set_incremental_state( - self, - incremental_state, - "monotonic", - {"step": buffer}, + # 2. compute expected_alignment + alpha = expected_alignment_from_p_choose( + p_choose, + key_padding_mask, + eps=self.eps, ) + if self.mass_preservation: + alpha = mass_preservation( + alpha, key_padding_mask + ) -@register_monotonic_attention("hard_aligned") -class MonotonicMultiheadAttentionHard(MonotonicAttention, MultiheadAttention): - def __init__(self, args): - MultiheadAttention.__init__( - self, - embed_dim=args.decoder_embed_dim, - num_heads=args.decoder_attention_heads, - kdim=getattr(args, "encoder_embed_dim", None), - vdim=getattr(args, "encoder_embed_dim", None), - dropout=args.attention_dropout, - encoder_decoder_attention=True, - ) + # 3. 
compute expected soft attention (soft aligned model only) + if self.soft_attention: + soft_energy = self.energy_from_qk( + query, + key, + "soft", + key_padding_mask=None, + ) - MonotonicAttention.__init__(self, args) + beta = expected_soft_attention( + alpha, + soft_energy, + padding_mask=key_padding_mask, + chunk_size=self.chunk_size, + eps=self.eps, + ) + else: + beta = alpha + soft_energy = alpha - self.k_in_proj = {"monotonic": self.k_proj} - self.q_in_proj = {"monotonic": self.q_proj} - self.v_in_proj = {"output": self.v_proj} + return p_choose, alpha, beta, soft_energy - def input_projections(self, query, key, value, name): + def forward( + self, + query: Optional[Tensor], + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + attn_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + need_weights: bool = True, static_kv: bool = False, need_head_weights: bool = False, + ): """ - Prepare inputs for multihead attention - - ============================================================ - Expected input size query: tgt_len, bsz, embed_dim key: src_len, bsz, embed_dim value: src_len, bsz, embed_dim - name: monotonic or soft """ - if query is not None: - bsz = query.size(1) - q = self.q_in_proj[name](query) - q *= self.scaling - q = ( - q.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) - .transpose(0, 1) + assert attn_mask is None + assert query is not None + assert key is not None + assert value is not None + + tgt_len, bsz, embed_dim = query.size() + src_len = value.size(0) + + if key_padding_mask is not None: + assert not key_padding_mask[:, 0].any(), ( + "Only right padding is supported." ) - else: - q = None - - if key is not None: - bsz = key.size(1) - k = self.k_in_proj[name](key) - k = ( - k.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) - .transpose(0, 1) + key_padding_mask = ( + key_padding_mask + .unsqueeze(1) + .expand([bsz, self.num_heads, src_len]) + .contiguous() + .view(-1, src_len) ) - else: - k = None - - if value is not None: - bsz = value.size(1) - v = self.v_in_proj[name](value) - v = ( - v.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) - .transpose(0, 1) + + if incremental_state is not None: + # Inference + ( + p_choose, alpha, beta + ) = self.monotonic_attention_process_infer( + query, key, incremental_state ) + soft_energy = beta else: - v = None - - return q, k, v - - def p_choose(self, query, key, key_padding_mask=None): - """ - Calculating step wise prob for reading and writing - 1 to read, 0 to write - - ============================================================ - Expected input size - query: bsz, tgt_len, embed_dim - key: bsz, src_len, embed_dim - value: bsz, src_len, embed_dim - key_padding_mask: bsz, src_len - attn_mask: bsz, src_len - query: bsz, tgt_len, embed_dim - """ + # Train + ( + p_choose, alpha, beta, soft_energy + ) = self.monotonic_attention_process_train( + query, key, key_padding_mask + ) - # prepare inputs - q_proj, k_proj, _ = self.input_projections(query, key, None, "monotonic") + v = self.v_proj(value) + length, bsz, _ = v.size() + v = ( + v.contiguous() + .view(length, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) - # attention energy - attn_energy = self.attn_energy(q_proj, k_proj, key_padding_mask) + attn = torch.bmm(beta.type_as(v), v) - noise = 0 + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) - if self.training: - # add noise here to encourage discretness 
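# Sketch of the stepwise read/write probability used above: during training,
# Gaussian noise (noise_mean, noise_var) is added to the monotonic attention energy
# before the sigmoid to encourage near-discrete decisions; at inference the plain
# sigmoid is used. This mirrors the removed in-class p_choose; the imported
# learnable_p_choose helper is assumed to behave the same way. In the inference
# loop above, the model keeps reading source tokens while p_choose < 0.5.
import torch


def stepwise_p_choose(attn_energy: torch.Tensor,
                      noise_mean: float = 0.0,
                      noise_var: float = 1.0,
                      training: bool = True) -> torch.Tensor:
    if training:
        noise = torch.normal(
            noise_mean, noise_var, size=attn_energy.size()
        ).type_as(attn_energy)
        attn_energy = attn_energy + noise
    return torch.sigmoid(attn_energy)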
- noise = ( - torch.normal(self.noise_mean, self.noise_var, attn_energy.size()) - .type_as(attn_energy) - .to(attn_energy.device) - ) + attn = self.out_proj(attn) - p_choose = torch.sigmoid(attn_energy + noise) - _, _, tgt_len, src_len = p_choose.size() + p_choose = p_choose.view(bsz, self.num_heads, tgt_len, src_len) + alpha = alpha.view(bsz, self.num_heads, tgt_len, src_len) + beta = beta.view(bsz, self.num_heads, tgt_len, src_len) - # p_choose: bsz * self.num_heads, tgt_len, src_len - return p_choose.view(-1, tgt_len, src_len) + return attn, { + "p_choose": p_choose, + "alpha": alpha, + "beta": beta, + "soft_energy": soft_energy, + } - def expected_attention(self, alpha, *args): - """ - For MMA-H, beta = alpha - """ - return alpha + def _get_monotonic_buffer(self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]): + maybe_incremental_state = self.get_incremental_state( + incremental_state, + 'monotonic', + ) + if maybe_incremental_state is None: + typed_empty_dict: Dict[str, Optional[Tensor]] = {} + return typed_empty_dict + else: + return maybe_incremental_state - def v_proj_output(self, value): - _, _, v_proj = self.input_projections(None, None, value, "output") - return v_proj + def _set_monotonic_buffer(self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], buffer: Dict[str, Optional[Tensor]]): + self.set_incremental_state( + incremental_state, + 'monotonic', + buffer, + ) @register_monotonic_attention("infinite_lookback") -class MonotonicMultiheadAttentionInfiniteLookback(MonotonicMultiheadAttentionHard): +class MonotonicInfiniteLookbackAttention( + MonotonicAttention +): def __init__(self, args): super().__init__(args) + self.soft_attention = True self.init_soft_attention() def init_soft_attention(self): @@ -497,126 +446,75 @@ def init_soft_attention(self): nn.init.xavier_uniform_(self.k_in_proj["soft"].weight) nn.init.xavier_uniform_(self.q_in_proj["soft"].weight) - def expected_attention( - self, alpha, query, key, value, key_padding_mask, incremental_state - ): - # monotonic attention, we will calculate milk here - bsz_x_num_heads, tgt_len, src_len = alpha.size() - bsz = int(bsz_x_num_heads / self.num_heads) - - q, k, _ = self.input_projections(query, key, None, "soft") - soft_energy = self.attn_energy(q, k, key_padding_mask) - - assert list(soft_energy.size()) == [bsz, self.num_heads, tgt_len, src_len] - - soft_energy = soft_energy.view(bsz * self.num_heads, tgt_len, src_len) - - if incremental_state is not None: - monotonic_cache = self._get_monotonic_buffer(incremental_state) - monotonic_step = monotonic_cache["step"] + 1 - step_offset = 0 - if key_padding_mask is not None: - if key_padding_mask[:, 0].any(): - # left_pad_source = True: - step_offset = key_padding_mask.sum(dim=-1, keepdim=True) - monotonic_step += step_offset - mask = lengths_to_mask( - monotonic_step.view(-1), soft_energy.size(2), 1 - ).unsqueeze(1) - - soft_energy = soft_energy.masked_fill(~mask.bool(), float("-inf")) - soft_energy = soft_energy - soft_energy.max(dim=2, keepdim=True)[0] - exp_soft_energy = torch.exp(soft_energy) - exp_soft_energy_sum = exp_soft_energy.sum(dim=2) - beta = exp_soft_energy / exp_soft_energy_sum.unsqueeze(2) - - else: - # bsz * num_heads, tgt_len, src_len - soft_energy = soft_energy - soft_energy.max(dim=2, keepdim=True)[0] - exp_soft_energy = torch.exp(soft_energy) - exp_soft_energy_cumsum = torch.cumsum(exp_soft_energy, dim=2) - - if key_padding_mask is not None: - if key_padding_mask.any(): - exp_soft_energy_cumsum = ( - 
exp_soft_energy_cumsum.view( - -1, self.num_heads, tgt_len, src_len - ) - .masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(1), self.eps - ) - .view(-1, tgt_len, src_len) - ) - - inner_items = alpha / exp_soft_energy_cumsum - - beta = exp_soft_energy * torch.cumsum( - inner_items.flip(dims=[2]), dim=2 - ).flip(dims=[2]) - - beta = self.dropout_module(beta) - - assert not torch.isnan(beta).any(), "NaN detected in beta." - - return beta - @register_monotonic_attention("waitk") -class MonotonicMultiheadAttentionWaitk(MonotonicMultiheadAttentionInfiniteLookback): +class WaitKAttention( + MonotonicInfiniteLookbackAttention +): + """ + STACL: Simultaneous Translation with Implicit Anticipation and + Controllable Latency using Prefix-to-Prefix Framework + https://www.aclweb.org/anthology/P19-1289/ + """ def __init__(self, args): super().__init__(args) self.q_in_proj["soft"] = self.q_in_proj["monotonic"] self.k_in_proj["soft"] = self.k_in_proj["monotonic"] + self.waitk_lagging = args.waitk_lagging - assert ( - self.waitk_lagging > 0 - ), f"Lagging has to been larger than 0, get {self.waitk_lagging}." + assert self.waitk_lagging > 0, ( + f"Lagging has to been larger than 0, get {self.waitk_lagging}." + ) @staticmethod def add_args(parser): super( - MonotonicMultiheadAttentionWaitk, - MonotonicMultiheadAttentionWaitk, + MonotonicInfiniteLookbackAttention, + MonotonicInfiniteLookbackAttention ).add_args(parser) parser.add_argument( - "--waitk-lagging", type=int, required=True, help="Wait k lagging" + "--waitk-lagging", type=int, required=True, help="Wait K lagging" ) - def p_choose( - self, query, key, key_padding_mask=None, attn_mask=None, incremental_state=None + def p_choose_from_qk( + self, + query: Optional[Tensor], + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, ): - """ - query: bsz, tgt_len - key: bsz, src_len - key_padding_mask: bsz, src_len - """ - src_len, bsz, _ = key.size() - tgt_len, bsz, _ = query.size() - p_choose = query.new_ones(bsz, tgt_len, src_len) - p_choose = torch.tril(p_choose, diagonal=self.waitk_lagging - 1) - p_choose = torch.triu(p_choose, diagonal=self.waitk_lagging - 1) - - if key_padding_mask is not None and key_padding_mask[:, 0].eq(1).any(): - # Left pad source - # add -1 to the end - p_choose = p_choose.masked_fill( - key_padding_mask.float().flip(1).unsqueeze(1).bool(), -1 - ) - p_choose = convert_padding_direction( - p_choose.view(-1, src_len).long(), padding_idx=-1, right_to_left=True - ) - p_choose = p_choose.view(bsz, tgt_len, src_len).type_as(query) - # remove -1 - p_choose[p_choose.eq(-1)] = 0 - - # Extend to each head - p_choose = ( - p_choose.contiguous() - .unsqueeze(1) - .expand(-1, self.num_heads, -1, -1) - .contiguous() - .view(-1, tgt_len, src_len) + assert query is not None + assert key is not None + + p_choose = waitk_p_choose( + tgt_len=query.size(0), + src_len=key.size(0), + bsz=query.size(1) * self.num_heads, + waitk_lagging=self.waitk_lagging, + key_padding_mask=key_padding_mask, + incremental_state=incremental_state, ) - return p_choose + return p_choose.to(query) + + +@register_monotonic_attention("chunkwise") +class ChunkwiseAttention( + MonotonicInfiniteLookbackAttention +): + def __init__(self, args): + super().__init__(args) + self.chunk_size = args.mocha_chunk_size + assert self.chunk_size > 1 + + @staticmethod + def add_args(parser): + super( + MonotonicInfiniteLookbackAttention + ).add_args(parser) + + parser.add_argument( + 
"--mocha-chunk-size", type=int, + required=True, help="Mocha chunk size" + ) diff --git a/examples/simultaneous_translation/modules/monotonic_transformer_layer.py b/examples/simultaneous_translation/modules/monotonic_transformer_layer.py index 442b7d487d..94bd71fb9c 100644 --- a/examples/simultaneous_translation/modules/monotonic_transformer_layer.py +++ b/examples/simultaneous_translation/modules/monotonic_transformer_layer.py @@ -3,10 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from fairseq.modules import LayerNorm, TransformerDecoderLayer, TransformerEncoderLayer +from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer from . import build_monotonic_attention +from typing import Dict, Optional, List + +from torch import Tensor +import torch + class TransformerMonotonicEncoderLayer(TransformerEncoderLayer): def forward(self, x, encoder_padding_mask): @@ -17,32 +22,161 @@ def forward(self, x, encoder_padding_mask): class TransformerMonotonicDecoderLayer(TransformerDecoderLayer): - def __init__( - self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False + def __init__(self, args): + super().__init__(args) + + assert args.simul_type is not None, "A --simul-type is needed." + self.encoder_attn = build_monotonic_attention(args) + + def prune_incremental_state( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + ): + input_buffer = self.self_attn._get_input_buffer(incremental_state) + for key in ["prev_key", "prev_value"]: + input_buffer_key = input_buffer[key] + assert input_buffer_key is not None + if input_buffer_key.size(2) > 1: + input_buffer[key] = input_buffer_key[:, :, :-1, :] + else: + typed_empty_dict: Dict[str, Optional[Tensor]] = {} + input_buffer = typed_empty_dict + break + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state, input_buffer) + + def forward( + self, + x, + encoder_out: Optional[Tensor] = None, + encoder_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + prev_self_attn_state: Optional[List[Tensor]] = None, + prev_attn_state: Optional[List[Tensor]] = None, + self_attn_mask: Optional[Tensor] = None, + self_attn_padding_mask: Optional[Tensor] = None, + need_attn: bool = False, + need_head_weights: bool = False, ): - super().__init__( - args, - no_encoder_attn=True, - add_bias_kv=add_bias_kv, - add_zero_attn=add_zero_attn, + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). 
+ + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + if need_head_weights: + need_attn = True + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state, saved_state) + _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) + if self.cross_self_attention and not ( + incremental_state is not None + and _self_attn_input_buffer is not None + and "prev_key" in _self_attn_input_buffer + ): + if self_attn_mask is not None: + assert encoder_out is not None + self_attn_mask = torch.cat( + (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 + ) + if self_attn_padding_mask is not None: + if encoder_padding_mask is None: + assert encoder_out is not None + encoder_padding_mask = self_attn_padding_mask.new_zeros( + encoder_out.size(1), encoder_out.size(0) + ) + self_attn_padding_mask = torch.cat( + (encoder_padding_mask, self_attn_padding_mask), dim=1 + ) + assert encoder_out is not None + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + + x, attn = self.self_attn( + query=x, + key=y, + value=y, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, ) - self.encoder_attn = build_monotonic_attention(args) - self.encoder_attn_layer_norm = LayerNorm( - self.embed_dim, export=getattr(args, "char_inputs", False) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + assert self.encoder_attn is not None + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) - def prune_incremental_state(self, incremental_state): - def prune(module): - input_buffer = module._get_input_buffer(incremental_state) - for key in ["prev_key", "prev_value"]: - if input_buffer[key].size(2) > 1: - input_buffer[key] = input_buffer[key][:, :, :-1, :] - else: - input_buffer = {} - break - module._set_input_buffer(incremental_state, input_buffer) - - prune(self.self_attn) - - def get_steps(self, incremental_state): - return self.encoder_attn._get_monotonic_buffer(incremental_state).get("step", 0) + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = 
self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + if self.onnx_trace and incremental_state is not None: + saved_state = self.self_attn._get_input_buffer(incremental_state) + assert saved_state is not None + if self_attn_padding_mask is not None: + self_attn_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + saved_state["prev_key_padding_mask"], + ] + else: + self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] + return x, attn, self_attn_state + return x, attn, None diff --git a/examples/simultaneous_translation/tests/test_alignment_train.py b/examples/simultaneous_translation/tests/test_alignment_train.py new file mode 100644 index 0000000000..2ad4ef1f6d --- /dev/null +++ b/examples/simultaneous_translation/tests/test_alignment_train.py @@ -0,0 +1,88 @@ +import unittest + +import numpy as np +import torch + +import hypothesis.strategies as st +from hypothesis import assume, given, settings +from torch.testing._internal.common_utils import TestCase +from examples.simultaneous_translation.utils.functions import exclusive_cumprod + + +TEST_CUDA = torch.cuda.is_available() + + +class AlignmentTrainTest(TestCase): + def _test_custom_alignment_train_ref(self, p_choose, eps): + cumprod_1mp = exclusive_cumprod(1 - p_choose, dim=2, eps=eps) + cumprod_1mp_clamp = torch.clamp(cumprod_1mp, eps, 1.0) + + bsz = p_choose.size(0) + tgt_len = p_choose.size(1) + src_len = p_choose.size(2) + + alpha_0 = p_choose.new_zeros([bsz, 1, src_len]) + alpha_0[:, :, 0] = 1.0 + + previous_alpha = [alpha_0] + + for i in range(tgt_len): + # p_choose: bsz , tgt_len, src_len + # cumprod_1mp_clamp : bsz, tgt_len, src_len + # previous_alpha[i]: bsz, 1, src_len + # alpha_i: bsz, src_len + alpha_i = ( + p_choose[:, i] + * cumprod_1mp[:, i] + * torch.cumsum( + previous_alpha[i][:, 0] / cumprod_1mp_clamp[:, i], dim=1 + ) + ).clamp(0, 1.0) + + previous_alpha.append(alpha_i.unsqueeze(1)) + + # alpha: bsz * num_heads, tgt_len, src_len + alpha = torch.cat(previous_alpha[1:], dim=1) + return alpha + + def _test_custom_alignment_train_impl(self, p_choose, alpha, eps): + if p_choose.is_cuda: + from alignment_train_cuda_binding import alignment_train_cuda # @manual=//deeplearning/projects/fairseq-py:alignment_train_cuda_binding + alignment_train_cuda(p_choose, alpha, eps) + else: + from alignment_train_cpu_binding import alignment_train_cpu # @manual=//deeplearning/projects/fairseq-py:alignment_train_cpu_binding + alignment_train_cpu(p_choose, alpha, eps) + + @settings(deadline=None) + @given( + bsz=st.integers(1, 100), + tgt_len=st.integers(1, 100), + src_len=st.integers(1, 550), + device=st.sampled_from(["cpu", "cuda"]), + ) + def test_alignment_train(self, bsz, tgt_len, src_len, device): + eps = 1e-6 + + assume(device == "cpu" or TEST_CUDA) + p_choose = torch.rand(bsz, tgt_len, src_len, device=device) + + # run the alignment with the custom operator + alpha_act = p_choose.new_zeros([bsz, tgt_len, src_len]) + self._test_custom_alignment_train_impl(p_choose, alpha_act, eps) + + # runu the alignment with the ref implementation + alpha_ref = self._test_custom_alignment_train_ref(p_choose, eps) + + # verify the results + alpha_act = alpha_act.cpu().detach().numpy() + alpha_ref = alpha_ref.cpu().detach().numpy() + np.testing.assert_allclose( + alpha_act, + alpha_ref, + atol=1e-3, + rtol=1e-3, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/simultaneous_translation/tests/test_text_models.py 
b/examples/simultaneous_translation/tests/test_text_models.py new file mode 100644 index 0000000000..19d6356304 --- /dev/null +++ b/examples/simultaneous_translation/tests/test_text_models.py @@ -0,0 +1,407 @@ +import argparse +import unittest +from typing import Any, Dict + +import torch +from examples.simultaneous_translation.models import ( + transformer_monotonic_attention +) + + +from tests.test_roberta import FakeTask + + +DEFAULT_CONFIG = { + "attention_eps": 1e-6, + "mass_preservation": True, + "noise_type": "flat", + "noise_mean": 0.0, + "noise_var": 1.0, + "energy_bias_init": -2, + "energy_bias": True +} + + +PAD_INDEX = 1 + + +def generate_config(overrides_kv): + new_dict = {key: value for key, value in DEFAULT_CONFIG.items()} + for key, value in overrides_kv.items(): + new_dict[key] = value + return new_dict + + +def make_sample_with_padding(longer_src=False) -> Dict[str, Any]: + tokens_1 = torch.LongTensor( + [ + [2, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 2], + [ + 2, 11, 12, 14, 15, 10, 11, 12, 13, 14, 15, 2, + PAD_INDEX, PAD_INDEX + ], + ] + ) + tokens_2 = torch.LongTensor( + [ + [2, 11, 12, 13, 14, 2, PAD_INDEX, PAD_INDEX], + [2, 11, 22, 33, 2, PAD_INDEX, PAD_INDEX, PAD_INDEX] + ] + ) + if longer_src: + src_tokens = tokens_1[:, 1:] + prev_output_tokens = tokens_2 + else: + src_tokens = tokens_2[:, 1:8] + prev_output_tokens = tokens_1 + + src_lengths = src_tokens.ne(PAD_INDEX).sum(dim=1).long() + + sample = { + "net_input": { + "src_tokens": src_tokens, + "prev_output_tokens": prev_output_tokens, + "src_lengths": src_lengths, + }, + "target": prev_output_tokens[:, 1:], + } + return sample + + +def build_transformer_monotonic_attention(**extra_args: Any): + overrides = { + # Use characteristics dimensions + "encoder_embed_dim": 12, + "encoder_ffn_embed_dim": 14, + "decoder_embed_dim": 12, + "decoder_ffn_embed_dim": 14, + # Disable dropout so we have comparable tests. 
+ "dropout": 0, + "attention_dropout": 0, + "activation_dropout": 0, + "encoder_layerdrop": 0, + } + overrides.update(extra_args) + # Overrides the defaults from the parser + args = argparse.Namespace(**overrides) + transformer_monotonic_attention.monotonic_tiny_architecture(args) + + torch.manual_seed(0) + task = FakeTask(args) + return ( + transformer_monotonic_attention + .TransformerModelSimulTrans + .build_model(args, task) + ) + + +def expected_alignment_formula( + p_choose, + mass_perservation=True, + padding_mask=None +): + # Online and Linear-Time Attention by Enforcing Monotonic Alignments + # https://arxiv.org/pdf/1704.00784.pdf + # Eq 18, 19 + bsz, tgt_len, src_len = p_choose.size() + alpha = torch.zeros_like(p_choose) + + if padding_mask is not None: + bsz_pad = padding_mask.size(0) + num_heads = int(bsz / bsz_pad) + padding_mask = ( + padding_mask + .unsqueeze(1) + .expand([bsz_pad, num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + + p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0) + + for bsz_i in range(bsz): + for i in range(tgt_len): + for j in range(src_len): + if i == 0: + if j == 0: + # First source token + alpha[bsz_i, i, j] = p_choose[bsz_i, i, j] + else: + # First target token + alpha[bsz_i, i, j] = ( + p_choose[bsz_i, i, j] + * torch.prod( + 1 - p_choose[bsz_i, i, :j] + ) + ) + else: + alpha[bsz_i, i, j] = alpha[bsz_i, i - 1, j] + for k in range(j): + alpha[bsz_i, i, j] += ( + alpha[bsz_i, i - 1, k] + * torch.prod( + 1 - p_choose[bsz_i, i, k:j] + ) + ) + alpha[bsz_i, i, j] *= p_choose[bsz_i, i, j] + + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0) + + if mass_perservation: + alpha = mass_perservation_formula(alpha, False, padding_mask) + + return alpha + + +def mass_perservation_formula(alpha, left_padding=False, padding_mask=None): + if padding_mask is None or alpha.size(-1) == 1: + if alpha.size(-1) > 1: + alpha[:, :, -1] = 1 - alpha[:, :, :-1].sum(dim=-1) + return alpha + + src_lens = (padding_mask.logical_not()).sum(dim=1).long() + + bsz, tgt_len, src_len = alpha.size() + + assert ( + not left_padding + or (left_padding and (not padding_mask[:, 0].any())) + ) + + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0) + + for bsz_i in range(bsz): + if left_padding: + alpha[bsz_i, :, -1] = ( + 1 - alpha[bsz_i, :, :-1].sum(dim=-1) + ) + else: + alpha[bsz_i, :, src_lens[bsz_i] - 1] = ( + 1 - alpha[bsz_i, :, :src_lens[bsz_i] - 1].sum(dim=-1) + ) + + return alpha + + +def expected_soft_attention_formula( + alpha, + soft_energy, + padding_mask=None, + chunksize=1e10, +): + # Monotonic Infinite Lookback Attention for Simultaneous Machine Translation + # https://arxiv.org/pdf/1906.05218.pdf + # Eq 14 + + # Monotonic Chunkwise Attention + # https://arxiv.org/abs/1712.05382 + # Eq 17 + bsz, tgt_len, src_len = alpha.size() + beta = torch.zeros_like(alpha) + + if padding_mask is not None: + bsz_pad = padding_mask.size(0) + num_heads = int(bsz / bsz_pad) + # Expanding for potential head dimension + padding_mask = ( + padding_mask + .unsqueeze(1) + .expand([bsz_pad, num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + soft_energy = soft_energy.masked_fill(padding_mask.unsqueeze(1), float('-inf')) + + for bsz_i in range(bsz): + for i in range(tgt_len): + for j in range(src_len): + for k in range(j, min([src_len, j + chunksize])): + if not padding_mask[bsz_i, j]: + beta[bsz_i, i, j] += ( + alpha[bsz_i, i, k] * torch.exp(soft_energy[bsz_i, i, j]) + / torch.sum(torch.exp(soft_energy[bsz_i, i, max([0, k - chunksize + 1]):k + 1])) + ) 
+ return beta + + +class MonotonicAttentionTestAbstractClass(object): + def test_forward(self): + sample = make_sample_with_padding() + out, _ = self.model.forward(**sample["net_input"]) + loss = out.sum() + loss.backward() + + def test_p_choose(self): + sample = make_sample_with_padding() + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + self.assertTrue(p_choose.le(1.0).all()) + self.assertTrue(p_choose.ge(0.0).all()) + + def test_expected_alignment(self): + for longer_src in [True, False]: + sample = make_sample_with_padding(longer_src) + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + alpha_system = item["alpha"] + self.assertTrue(p_choose.size() == alpha_system.size()) + bsz, num_head, tgt_len, src_len = alpha_system.size() + alpha_system = alpha_system.view(-1, tgt_len, src_len) + p_choose = p_choose.view(-1, tgt_len, src_len) + + alpha_real = expected_alignment_formula( + p_choose, + self.model.decoder.layers[0].encoder_attn.mass_preservation, + sample["net_input"]["src_tokens"].eq(PAD_INDEX) + ) + + self.assertTrue( + torch.abs(alpha_system - alpha_real).le(5e-5).all(), + ) + + +class HardMonotonicAttentionTestCase( + unittest.TestCase, + MonotonicAttentionTestAbstractClass +): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config({"simul_type": "hard_aligned"}) + ) + + +class InfiniteLookbackTestCase( + unittest.TestCase, + MonotonicAttentionTestAbstractClass +): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "infinite_lookback" + } + ) + ) + self.model.train() + + def test_fp16_for_long_input(self): + sample = { + "net_input": { + "src_tokens": torch.LongTensor([7] * 1000 + [2]).cuda().unsqueeze(0), + "prev_output_tokens": torch.LongTensor([7] * 1000 + [2]).cuda().unsqueeze(0), + "src_lengths": torch.LongTensor([1000]).cuda(), + }, + "target": torch.LongTensor([2] + [7] * 1000).unsqueeze(0).cuda() + } + self.model.cuda().half() + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + for key in ["p_choose", "alpha", "beta", "soft_energy"]: + self.assertFalse(torch.isnan(item[key]).any()) + + def test_expected_attention(self): + for longer_src in [True, False]: + sample = make_sample_with_padding(longer_src) + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + alpha_system = item["alpha"] + beta_system = item["beta"] + soft_energy_system = item["soft_energy"] + self.assertTrue(beta_system.size() == alpha_system.size()) + self.assertTrue(p_choose.size() == alpha_system.size()) + + bsz, num_head, tgt_len, src_len = alpha_system.size() + + alpha_system = alpha_system.view(-1, tgt_len, src_len) + beta_system = beta_system.view(-1, tgt_len, src_len) + p_choose = p_choose.view(-1, tgt_len, src_len) + soft_energy_system = soft_energy_system.view(-1, tgt_len, src_len) + + alpha_real = expected_alignment_formula( + p_choose, + self.model.decoder.layers[0].encoder_attn.mass_preservation, + sample["net_input"]["src_tokens"].eq(PAD_INDEX) + ) + + beta_real = expected_soft_attention_formula( + alpha_real, + soft_energy_system, + sample["net_input"]["src_tokens"].eq(PAD_INDEX), + chunksize=getattr( + self.model.decoder.layers[0].encoder_attn, + "chunk_size", + int(1e10) + ) or int(1e10) + ) + + self.assertTrue( + 
torch.abs(beta_system - beta_real).le(1e-5).all(), + ) + + +class ChunkwiswTestCase( + InfiniteLookbackTestCase +): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "chunkwise", + "mocha_chunk_size": 3 + } + ) + ) + + +class WaitkTestCase(InfiniteLookbackTestCase): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "waitk", + "waitk_lagging": 3, + } + ) + ) + + def check_waitk(self, p_choose, lagging, padding_mask): + bsz, tgt_len, src_len = p_choose.size() + for bsz_i in range(bsz): + for i in range(tgt_len): + for j in range(src_len): + if not padding_mask[bsz_i, j]: + if j - i == lagging - 1: + self.assertTrue(p_choose[bsz_i, i, j] == 1) + else: + self.assertTrue(p_choose[bsz_i, i, j] == 0) + + def test_waitk_p_choose(self): + for longer_src in [True, False]: + for k in [1, 3, 10, 20, 100]: + sample = make_sample_with_padding(longer_src) + model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "waitk", + "waitk_lagging": k, + } + ) + ) + model.train() + _, extra_out = model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + bsz, num_heads, tgt_len, src_len = p_choose.size() + padding_mask = sample["net_input"]["src_tokens"].eq(PAD_INDEX) + padding_mask = ( + padding_mask + .unsqueeze(1) + .expand([bsz, num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + p_choose = p_choose.view(bsz * num_heads, tgt_len, src_len) + self.check_waitk(p_choose, k, padding_mask) diff --git a/examples/simultaneous_translation/utils/__init__.py b/examples/simultaneous_translation/utils/__init__.py index be0ba4d99a..1e9ce844f5 100644 --- a/examples/simultaneous_translation/utils/__init__.py +++ b/examples/simultaneous_translation/utils/__init__.py @@ -8,7 +8,7 @@ # automatically import any Python files in the criterions/ directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): module = file[: file.find(".py")] importlib.import_module("examples.simultaneous_translation.utils." + module) diff --git a/examples/simultaneous_translation/utils/functions.py b/examples/simultaneous_translation/utils/functions.py index f795b5f31c..590a6c11ce 100644 --- a/examples/simultaneous_translation/utils/functions.py +++ b/examples/simultaneous_translation/utils/functions.py @@ -6,12 +6,24 @@ import torch +def prob_check(tensor, eps=1e-10): + assert not torch.isnan(tensor).any(), ( + "Nan in a probability tensor." + ) + # Add the eps here to prevent errors introduced by precision + assert tensor.le(1.0 + eps).all() and tensor.ge(0.0 - eps).all(), ( + "Incorrect values in a probability tensor" + ", 0.0 <= tensor <= 1.0" + ) + + def exclusive_cumprod(tensor, dim: int, eps: float = 1e-10): """ Implementing exclusive cumprod. There is cumprod in pytorch, however there is no exclusive mode. 
cumprod(x) = [x1, x1x2, x2x3x4, ..., prod_{i=1}^n x_i] - exclusive means cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] + exclusive means + cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] """ tensor_size = list(tensor.size()) tensor_size[dim] = 1 @@ -28,7 +40,9 @@ def exclusive_cumprod(tensor, dim: int, eps: float = 1e-10): elif dim == 2: return return_tensor[:, :, :-1] else: - raise RuntimeError("Cumprod on dimension 3 and more is not implemented") + raise RuntimeError( + "Cumprod on dimension 3 and more is not implemented" + ) def safe_cumprod(tensor, dim: int, eps: float = 1e-10): @@ -52,42 +66,6 @@ def safe_cumprod(tensor, dim: int, eps: float = 1e-10): return exp_cumsum_log_tensor -def lengths_to_mask(lengths, max_len: int, dim: int = 0, negative_mask: bool = False): - """ - Convert a tensor of lengths to mask - For example, lengths = [[2, 3, 4]], max_len = 5 - mask = - [[1, 1, 1], - [1, 1, 1], - [0, 1, 1], - [0, 0, 1], - [0, 0, 0]] - """ - assert len(lengths.size()) <= 2 - if len(lengths) == 2: - if dim == 1: - lengths = lengths.t() - lengths = lengths - else: - lengths = lengths.unsqueeze(1) - - # lengths : batch_size, 1 - lengths = lengths.view(-1, 1) - - batch_size = lengths.size(0) - # batch_size, max_len - mask = torch.arange(max_len).expand(batch_size, max_len).type_as(lengths) < lengths - - if negative_mask: - mask = ~mask - - if dim == 0: - # max_len, batch_size - mask = mask.t() - - return mask - - def moving_sum(x, start_idx: int, end_idx: int): """ From MONOTONIC CHUNKWISE ATTENTION @@ -126,24 +104,22 @@ def moving_sum(x, start_idx: int, end_idx: int): [ 7, 17, 27], [ 4, 9, 14]] """ + # TODO: Make dimension configurable assert start_idx > 0 and end_idx > 0 - assert len(x.size()) == 2 - src_len, batch_size = x.size() + batch_size, tgt_len, src_len = x.size() + x = x.view(-1, src_len).unsqueeze(1) # batch_size, 1, src_len - x = x.t().unsqueeze(1) - # batch_size, 1, src_len - moving_sum_weight = x.new_ones([1, 1, end_idx + start_idx - 1]) + moving_sum_weight = torch.ones([1, 1, end_idx + start_idx - 1]).type_as(x) - moving_sum = ( - torch.nn.functional.conv1d( - x, moving_sum_weight, padding=start_idx + end_idx - 1 - ) - .squeeze(1) - .t() - ) - moving_sum = moving_sum[end_idx:-start_idx] + moving_sum = torch.nn.functional.conv1d( + x, moving_sum_weight, padding=start_idx + end_idx - 1 + ).squeeze(1) + + moving_sum = moving_sum[:, end_idx:-start_idx] + + assert src_len == moving_sum.size(1) + assert batch_size * tgt_len == moving_sum.size(0) - assert src_len == moving_sum.size(0) - assert batch_size == moving_sum.size(1) + moving_sum = moving_sum.view(batch_size, tgt_len, src_len) return moving_sum diff --git a/examples/simultaneous_translation/utils/latency.py b/examples/simultaneous_translation/utils/latency.py deleted file mode 100644 index 5d800a5d9e..0000000000 --- a/examples/simultaneous_translation/utils/latency.py +++ /dev/null @@ -1,451 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
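The `exclusive_cumprod` helper above is the building block of the closed-form expected-alignment recursion (Eq. 18/19 of the monotonic attention paper cited in the tests) that both the `alignment_train_*` kernels and the test reference in this diff implement. A minimal sketch, assuming a `(bsz, tgt_len, src_len)` `p_choose` tensor and no padding (the function name is illustrative):

```
import torch
from examples.simultaneous_translation.utils.functions import exclusive_cumprod

def expected_alignment_sketch(p_choose, eps=1e-10):
    # p_choose: (bsz, tgt_len, src_len) stepwise selection probabilities
    bsz, tgt_len, src_len = p_choose.size()
    cumprod_1mp = exclusive_cumprod(1 - p_choose, dim=2, eps=eps)
    cumprod_1mp_clamp = torch.clamp(cumprod_1mp, eps, 1.0)
    alpha = p_choose.new_zeros(bsz, 1, src_len)
    alpha[:, :, 0] = 1.0  # all alignment mass starts on the first source position
    outputs = []
    for i in range(tgt_len):
        # alpha_i = p_i * cumprod(1 - p_i) * cumsum(alpha_{i-1} / clamp(cumprod(1 - p_i)))
        alpha = (
            p_choose[:, i]
            * cumprod_1mp[:, i]
            * torch.cumsum(alpha[:, 0] / cumprod_1mp_clamp[:, i], dim=1)
        ).clamp(0, 1.0).unsqueeze(1)
        outputs.append(alpha)
    return torch.cat(outputs, dim=1)  # (bsz, tgt_len, src_len)
```

This mirrors the reference implementation in `test_alignment_train.py` above; the `alignment_train_cpu`/`alignment_train_cuda` bindings compute the same `alpha` without the per-step Python loop.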
- -import torch - - -class LatencyMetric(object): - @staticmethod - def length_from_padding_mask(padding_mask, batch_first: bool = False): - dim = 1 if batch_first else 0 - return padding_mask.size(dim) - padding_mask.sum(dim=dim, keepdim=True) - - def prepare_latency_metric( - self, - delays, - src_lens, - target_padding_mask=None, - batch_first: bool = False, - start_from_zero: bool = True, - ): - assert len(delays.size()) == 2 - assert len(src_lens.size()) == 2 - - if start_from_zero: - delays = delays + 1 - - if batch_first: - # convert to batch_last - delays = delays.t() - src_lens = src_lens.t() - tgt_len, bsz = delays.size() - _, bsz_1 = src_lens.size() - - if target_padding_mask is not None: - target_padding_mask = target_padding_mask.t() - tgt_len_1, bsz_2 = target_padding_mask.size() - assert tgt_len == tgt_len_1 - assert bsz == bsz_2 - - assert bsz == bsz_1 - - if target_padding_mask is None: - tgt_lens = tgt_len * delays.new_ones([1, bsz]).float() - else: - # 1, batch_size - tgt_lens = self.length_from_padding_mask(target_padding_mask, False).float() - delays = delays.masked_fill(target_padding_mask, 0) - - return delays, src_lens, tgt_lens, target_padding_mask - - def __call__( - self, - delays, - src_lens, - target_padding_mask=None, - batch_first: bool = False, - start_from_zero: bool = True, - ): - delays, src_lens, tgt_lens, target_padding_mask = self.prepare_latency_metric( - delays, src_lens, target_padding_mask, batch_first, start_from_zero - ) - return self.cal_metric(delays, src_lens, tgt_lens, target_padding_mask) - - @staticmethod - def cal_metric(delays, src_lens, tgt_lens, target_padding_mask): - """ - Expected sizes: - delays: tgt_len, batch_size - src_lens: 1, batch_size - target_padding_mask: tgt_len, batch_size - """ - raise NotImplementedError - - -class AverageProportion(LatencyMetric): - """ - Function to calculate Average Proportion from - Can neural machine translation do simultaneous translation? - (https://arxiv.org/abs/1606.02012) - - Delays are monotonic steps, range from 1 to src_len. - Give src x tgt y, AP is calculated as: - - AP = 1 / (|x||y]) sum_i^|Y| deleys_i - """ - - @staticmethod - def cal_metric(delays, src_lens, tgt_lens, target_padding_mask): - if target_padding_mask is not None: - AP = torch.sum( - delays.masked_fill(target_padding_mask, 0), dim=0, keepdim=True - ) - else: - AP = torch.sum(delays, dim=0, keepdim=True) - - AP = AP / (src_lens * tgt_lens) - return AP - - -class AverageLagging(LatencyMetric): - """ - Function to calculate Average Lagging from - STACL: Simultaneous Translation with Implicit Anticipation - and Controllable Latency using Prefix-to-Prefix Framework - (https://arxiv.org/abs/1810.08398) - - Delays are monotonic steps, range from 1 to src_len. 
- Give src x tgt y, AP is calculated as: - - AL = 1 / tau sum_i^tau delays_i - (i - 1) / gamma - - Where - gamma = |y| / |x| - tau = argmin_i(delays_i = |x|) - """ - - @staticmethod - def cal_metric(delays, src_lens, tgt_lens, target_padding_mask): - # tau = argmin_i(delays_i = |x|) - tgt_len, bsz = delays.size() - lagging_padding_mask = delays >= src_lens - lagging_padding_mask = torch.nn.functional.pad( - lagging_padding_mask.t(), (1, 0) - ).t()[:-1, :] - gamma = tgt_lens / src_lens - lagging = ( - delays - - torch.arange(delays.size(0)) - .unsqueeze(1) - .type_as(delays) - .expand_as(delays) - / gamma - ) - lagging.masked_fill_(lagging_padding_mask, 0) - tau = (1 - lagging_padding_mask.type_as(lagging)).sum(dim=0, keepdim=True) - AL = lagging.sum(dim=0, keepdim=True) / tau - - return AL - - -class DifferentiableAverageLagging(LatencyMetric): - """ - Function to calculate Differentiable Average Lagging from - Monotonic Infinite Lookback Attention for Simultaneous Machine Translation - (https://arxiv.org/abs/1906.05218) - - Delays are monotonic steps, range from 0 to src_len-1. - (In the original paper thery are from 1 to src_len) - Give src x tgt y, AP is calculated as: - - DAL = 1 / |Y| sum_i^|Y| delays'_i - (i - 1) / gamma - - Where - delays'_i = - 1. delays_i if i == 1 - 2. max(delays_i, delays'_{i-1} + 1 / gamma) - - """ - - @staticmethod - def cal_metric(delays, src_lens, tgt_lens, target_padding_mask): - tgt_len, bsz = delays.size() - - gamma = tgt_lens / src_lens - new_delays = torch.zeros_like(delays) - - for i in range(delays.size(0)): - if i == 0: - new_delays[i] = delays[i] - else: - new_delays[i] = torch.cat( - [ - new_delays[i - 1].unsqueeze(0) + 1 / gamma, - delays[i].unsqueeze(0), - ], - dim=0, - ).max(dim=0)[0] - - DAL = ( - new_delays - - torch.arange(delays.size(0)) - .unsqueeze(1) - .type_as(delays) - .expand_as(delays) - / gamma - ) - if target_padding_mask is not None: - DAL = DAL.masked_fill(target_padding_mask, 0) - - DAL = DAL.sum(dim=0, keepdim=True) / tgt_lens - - return DAL - - -class LatencyMetricVariance(LatencyMetric): - def prepare_latency_metric( - self, - delays, - src_lens, - target_padding_mask=None, - batch_first: bool = True, - start_from_zero: bool = True, - ): - assert batch_first - assert len(delays.size()) == 3 - assert len(src_lens.size()) == 2 - - if start_from_zero: - delays = delays + 1 - - # convert to batch_last - bsz, num_heads_x_layers, tgt_len = delays.size() - bsz_1, _ = src_lens.size() - assert bsz == bsz_1 - - if target_padding_mask is not None: - bsz_2, tgt_len_1 = target_padding_mask.size() - assert tgt_len == tgt_len_1 - assert bsz == bsz_2 - - if target_padding_mask is None: - tgt_lens = tgt_len * delays.new_ones([bsz, tgt_len]).float() - else: - # batch_size, 1 - tgt_lens = self.length_from_padding_mask(target_padding_mask, True).float() - delays = delays.masked_fill(target_padding_mask.unsqueeze(1), 0) - - return delays, src_lens, tgt_lens, target_padding_mask - - -class VarianceDelay(LatencyMetricVariance): - @staticmethod - def cal_metric(delays, src_lens, tgt_lens, target_padding_mask): - """ - delays : bsz, num_heads_x_layers, tgt_len - src_lens : bsz, 1 - target_lens : bsz, 1 - target_padding_mask: bsz, tgt_len or None - """ - if delays.size(1) == 1: - return delays.new_zeros([1]) - - variance_delays = delays.var(dim=1) - - if target_padding_mask is not None: - variance_delays.masked_fill_(target_padding_mask, 0) - - return variance_delays.sum(dim=1, keepdim=True) / tgt_lens - - -class LatencyInference(object): - def 
__init__(self, start_from_zero=True): - self.metric_calculator = { - "differentiable_average_lagging": DifferentiableAverageLagging(), - "average_lagging": AverageLagging(), - "average_proportion": AverageProportion(), - } - - self.start_from_zero = start_from_zero - - def __call__(self, monotonic_step, src_lens): - """ - monotonic_step range from 0 to src_len. src_len means eos - delays: bsz, tgt_len - src_lens: bsz, 1 - """ - if not self.start_from_zero: - monotonic_step -= 1 - - src_lens = src_lens - - delays = monotonic_step.view( - monotonic_step.size(0), -1, monotonic_step.size(-1) - ).max(dim=1)[0] - - delays = delays.masked_fill(delays >= src_lens, 0) + (src_lens - 1).expand_as( - delays - ).masked_fill(delays < src_lens, 0) - return_dict = {} - for key, func in self.metric_calculator.items(): - return_dict[key] = func( - delays.float(), - src_lens.float(), - target_padding_mask=None, - batch_first=True, - start_from_zero=True, - ).t() - - return return_dict - - -class LatencyTraining(object): - def __init__( - self, - avg_weight, - var_weight, - avg_type, - var_type, - stay_on_last_token, - average_method, - ): - self.avg_weight = avg_weight - self.var_weight = var_weight - self.avg_type = avg_type - self.var_type = var_type - self.stay_on_last_token = stay_on_last_token - self.average_method = average_method - - self.metric_calculator = { - "differentiable_average_lagging": DifferentiableAverageLagging(), - "average_lagging": AverageLagging(), - "average_proportion": AverageProportion(), - } - - self.variance_calculator = { - "variance_delay": VarianceDelay(), - } - - def expected_delays_from_attention( - self, attention, source_padding_mask=None, target_padding_mask=None - ): - if type(attention) == list: - # bsz, num_heads, tgt_len, src_len - bsz, num_heads, tgt_len, src_len = attention[0].size() - attention = torch.cat(attention, dim=1) - bsz, num_heads_x_layers, tgt_len, src_len = attention.size() - # bsz * num_heads * num_layers, tgt_len, src_len - attention = attention.view(-1, tgt_len, src_len) - else: - # bsz * num_heads * num_layers, tgt_len, src_len - bsz, tgt_len, src_len = attention.size() - num_heads_x_layers = 1 - attention = attention.view(-1, tgt_len, src_len) - - if not self.stay_on_last_token: - residual_attention = 1 - attention[:, :, :-1].sum(dim=2, keepdim=True) - attention = torch.cat([attention[:, :, :-1], residual_attention], dim=2) - - # bsz * num_heads_x_num_layers, tgt_len, src_len for MMA - steps = ( - torch.arange(1, 1 + src_len) - .unsqueeze(0) - .unsqueeze(1) - .expand_as(attention) - .type_as(attention) - ) - - if source_padding_mask is not None: - src_offset = ( - source_padding_mask.type_as(attention) - .sum(dim=1, keepdim=True) - .expand(bsz, num_heads_x_layers) - .contiguous() - .view(-1, 1) - ) - src_lens = src_len - src_offset - if source_padding_mask[:, 0].any(): - # Pad left - src_offset = src_offset.view(-1, 1, 1) - steps = steps - src_offset - steps = steps.masked_fill(steps <= 0, 0) - else: - src_lens = attention.new_ones([bsz, num_heads_x_layers]) * src_len - src_lens = src_lens.view(-1, 1) - - # bsz * num_heads_num_layers, tgt_len, src_len - expected_delays = ( - (steps * attention).sum(dim=2).view(bsz, num_heads_x_layers, tgt_len) - ) - - if target_padding_mask is not None: - expected_delays.masked_fill_(target_padding_mask.unsqueeze(1), 0) - - return expected_delays, src_lens - - def avg_loss(self, expected_delays, src_lens, target_padding_mask): - - bsz, num_heads_x_layers, tgt_len = expected_delays.size() - target_padding_mask = ( 
- target_padding_mask.unsqueeze(1) - .expand_as(expected_delays) - .contiguous() - .view(-1, tgt_len) - ) - - if self.average_method == "average": - # bsz * tgt_len - expected_delays = expected_delays.mean(dim=1) - elif self.average_method == "weighted_average": - weights = torch.nn.functional.softmax(expected_delays, dim=1) - expected_delays = torch.sum(expected_delays * weights, dim=1) - elif self.average_method == "max": - # bsz * num_heads_x_num_layers, tgt_len - expected_delays = expected_delays.max(dim=1)[0] - else: - raise RuntimeError(f"{self.average_method} is not supported") - - src_lens = src_lens.view(bsz, -1)[:, :1] - target_padding_mask = target_padding_mask.view(bsz, -1, tgt_len)[:, 0] - - if self.avg_weight > 0.0: - if self.avg_type in self.metric_calculator: - average_delays = self.metric_calculator[self.avg_type]( - expected_delays, - src_lens, - target_padding_mask, - batch_first=True, - start_from_zero=False, - ) - else: - raise RuntimeError(f"{self.avg_type} is not supported.") - - # bsz * num_heads_x_num_layers, 1 - return self.avg_weight * average_delays.sum() - else: - return 0.0 - - def var_loss(self, expected_delays, src_lens, target_padding_mask): - src_lens = src_lens.view(expected_delays.size(0), expected_delays.size(1))[ - :, :1 - ] - if self.var_weight > 0.0: - if self.var_type in self.variance_calculator: - variance_delays = self.variance_calculator[self.var_type]( - expected_delays, - src_lens, - target_padding_mask, - batch_first=True, - start_from_zero=False, - ) - else: - raise RuntimeError(f"{self.var_type} is not supported.") - - return self.var_weight * variance_delays.sum() - else: - return 0.0 - - def loss(self, attention, source_padding_mask=None, target_padding_mask=None): - expected_delays, src_lens = self.expected_delays_from_attention( - attention, source_padding_mask, target_padding_mask - ) - - latency_loss = 0 - - latency_loss += self.avg_loss(expected_delays, src_lens, target_padding_mask) - - latency_loss += self.var_loss(expected_delays, src_lens, target_padding_mask) - - return latency_loss diff --git a/examples/simultaneous_translation/utils/monotonic_attention.py b/examples/simultaneous_translation/utils/monotonic_attention.py new file mode 100644 index 0000000000..3b8e0a858c --- /dev/null +++ b/examples/simultaneous_translation/utils/monotonic_attention.py @@ -0,0 +1,180 @@ +from typing import Optional +import torch +from torch import Tensor + +from examples.simultaneous_translation.utils.functions import ( + exclusive_cumprod, + prob_check, + moving_sum, +) + + +def expected_alignment_from_p_choose( + p_choose: Tensor, + padding_mask: Optional[Tensor] = None, + eps: float = 1e-6 +): + """ + Calculating expected alignment for from stepwise probability + + Reference: + Online and Linear-Time Attention by Enforcing Monotonic Alignments + https://arxiv.org/pdf/1704.00784.pdf + + q_ij = (1 − p_{ij−1})q_{ij−1} + a+{i−1j} + a_ij = p_ij q_ij + + Parallel solution: + ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) + + ============================================================ + Expected input size + p_choose: bsz, tgt_len, src_len + """ + prob_check(p_choose) + + # p_choose: bsz, tgt_len, src_len + bsz, tgt_len, src_len = p_choose.size() + dtype = p_choose.dtype + + p_choose = p_choose.float() + + if padding_mask is not None: + p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0.0) + + if p_choose.is_cuda: + p_choose = p_choose.contiguous() + from alignment_train_cuda_binding import alignment_train_cuda as 
alignment_train + else: + from alignment_train_cpu_binding import alignment_train_cpu as alignment_train + + alpha = p_choose.new_zeros([bsz, tgt_len, src_len]) + alignment_train(p_choose, alpha, eps) + + # Mix precision to prevent overflow for fp16 + alpha = alpha.type(dtype) + + prob_check(alpha) + + return alpha + + +def expected_soft_attention( + alpha: Tensor, + soft_energy: Tensor, + padding_mask: Optional[Tensor] = None, + chunk_size: Optional[int] = None, + eps: float = 1e-10 +): + """ + Function to compute expected soft attention for + monotonic infinite lookback attention from + expected alignment and soft energy. + + Reference: + Monotonic Chunkwise Attention + https://arxiv.org/abs/1712.05382 + + Monotonic Infinite Lookback Attention for Simultaneous Machine Translation + https://arxiv.org/abs/1906.05218 + + alpha: bsz, tgt_len, src_len + soft_energy: bsz, tgt_len, src_len + padding_mask: bsz, src_len + left_padding: bool + """ + if padding_mask is not None: + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0) + soft_energy = soft_energy.masked_fill( + padding_mask.unsqueeze(1), -float("inf") + ) + + prob_check(alpha) + + dtype = alpha.dtype + + alpha = alpha.float() + soft_energy = soft_energy.float() + + soft_energy = soft_energy - soft_energy.max(dim=2, keepdim=True)[0] + exp_soft_energy = torch.exp(soft_energy) + eps + + if chunk_size is not None: + # Chunkwise + beta = ( + exp_soft_energy + * moving_sum( + alpha / (eps + moving_sum(exp_soft_energy, chunk_size, 1)), + 1, chunk_size + ) + ) + else: + # Infinite lookback + # Notice that infinite lookback is a special case of chunkwise + # where chunksize = inf + inner_items = alpha / (eps + torch.cumsum(exp_soft_energy, dim=2)) + + beta = ( + exp_soft_energy + * torch.cumsum(inner_items.flip(dims=[2]), dim=2) + .flip(dims=[2]) + ) + + if padding_mask is not None: + beta = beta.masked_fill( + padding_mask.unsqueeze(1).to(torch.bool), 0.0) + + # Mix precision to prevent overflow for fp16 + beta = beta.type(dtype) + + beta = beta.clamp(0, 1) + + prob_check(beta) + + return beta + + +def mass_preservation( + alpha: Tensor, + padding_mask: Optional[Tensor] = None, + left_padding: bool = False +): + """ + Function to compute the mass perservation for alpha. + This means that the residual weights of alpha will be assigned + to the last token. + + Reference: + Monotonic Infinite Lookback Attention for Simultaneous Machine Translation + https://arxiv.org/abs/1906.05218 + + alpha: bsz, tgt_len, src_len + padding_mask: bsz, src_len + left_padding: bool + """ + + prob_check(alpha) + + if padding_mask is not None: + if not left_padding: + assert not padding_mask[:, 0].any(), ( + "Find padding on the beginning of the sequence." 
+ ) + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0) + + if left_padding or padding_mask is None: + residuals = 1 - alpha[:, :, :-1].sum(dim=-1).clamp(0, 1) + alpha[:, :, -1] = residuals + else: + # right padding + _, tgt_len, src_len = alpha.size() + residuals = 1 - alpha.sum(dim=-1, keepdim=True).clamp(0, 1) + src_lens = src_len - padding_mask.sum(dim=1, keepdim=True) + src_lens = src_lens.expand(-1, tgt_len).contiguous() + # add back the last value + residuals += alpha.gather(2, src_lens.unsqueeze(2) - 1) + alpha = alpha.scatter(2, src_lens.unsqueeze(2) - 1, residuals) + + prob_check(alpha) + + return alpha diff --git a/examples/simultaneous_translation/utils/p_choose_strategy.py b/examples/simultaneous_translation/utils/p_choose_strategy.py new file mode 100644 index 0000000000..724c6912a6 --- /dev/null +++ b/examples/simultaneous_translation/utils/p_choose_strategy.py @@ -0,0 +1,126 @@ +from typing import Optional, Dict +from torch import Tensor +import torch + + +def waitk_p_choose( + tgt_len: int, + src_len: int, + bsz: int, + waitk_lagging: int, + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None +): + + max_src_len = src_len + if incremental_state is not None: + # Retrieve target length from incremental states + # For inference the length of query is always 1 + max_tgt_len = incremental_state["steps"]["tgt"] + assert max_tgt_len is not None + max_tgt_len = int(max_tgt_len) + else: + max_tgt_len = tgt_len + + if max_src_len < waitk_lagging: + if incremental_state is not None: + max_tgt_len = 1 + return torch.zeros( + bsz, max_tgt_len, max_src_len + ) + + # Assuming the p_choose looks like this for wait k=3 + # src_len = 6, max_tgt_len = 5 + # [0, 0, 1, 0, 0, 0, 0] + # [0, 0, 0, 1, 0, 0, 0] + # [0, 0, 0, 0, 1, 0, 0] + # [0, 0, 0, 0, 0, 1, 0] + # [0, 0, 0, 0, 0, 0, 1] + # linearize the p_choose matrix: + # [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...] + # The indices of linearized matrix that equals 1 is + # 2 + 6 * 0 + # 3 + 6 * 1 + # ... 
+ # n + src_len * n + k - 1 = n * (src_len + 1) + k - 1 + # n from 0 to max_tgt_len - 1 + # + # First, generate the indices (activate_indices_offset: bsz, max_tgt_len) + # Second, scatter a zeros tensor (bsz, max_tgt_len * src_len) + # with activate_indices_offset + # Third, resize the tensor to (bsz, max_tgt_len, src_len) + + activate_indices_offset = ( + ( + torch.arange(max_tgt_len) * (max_src_len + 1) + + waitk_lagging - 1 + ) + .unsqueeze(0) + .expand(bsz, max_tgt_len) + .long() + ) + + if key_padding_mask is not None: + if key_padding_mask[:, 0].any(): + # Left padding + activate_indices_offset += ( + key_padding_mask.sum(dim=1, keepdim=True) + ) + + # Need to clamp the indices that are too large + activate_indices_offset = ( + activate_indices_offset + .clamp( + 0, + min( + [ + max_tgt_len, + max_src_len - waitk_lagging + 1 + ] + ) * max_src_len - 1 + ) + ) + + p_choose = torch.zeros(bsz, max_tgt_len * max_src_len) + + p_choose = p_choose.scatter( + 1, + activate_indices_offset, + 1.0 + ).view(bsz, max_tgt_len, max_src_len) + + if key_padding_mask is not None: + p_choose = p_choose.to(key_padding_mask) + p_choose = p_choose.masked_fill(key_padding_mask.unsqueeze(1), 0) + + if incremental_state is not None: + p_choose = p_choose[:, -1:] + + return p_choose.float() + + +def learnable_p_choose( + energy, + noise_mean: float = 0.0, + noise_var: float = 0.0, + training: bool = True +): + """ + Calculating step wise prob for reading and writing + 1 to read, 0 to write + energy: bsz, tgt_len, src_len + """ + + noise = 0 + if training: + # add noise here to encourage discretness + noise = ( + torch.normal(noise_mean, noise_var, energy.size()) + .type_as(energy) + .to(energy.device) + ) + + p_choose = torch.sigmoid(energy + noise) + + # p_choose: bsz * self.num_heads, tgt_len, src_len + return p_choose diff --git a/examples/speech_recognition/README.md b/examples/speech_recognition/README.md index 19f7cc563e..5f9b27880e 100644 --- a/examples/speech_recognition/README.md +++ b/examples/speech_recognition/README.md @@ -1,3 +1,5 @@ +### 2021 Update: We are merging this example into the [S2T framework](../speech_to_text), which supports more generic speech-to-text tasks (e.g. speech translation) and more flexible data processing pipelines. Please stay tuned. + # Speech Recognition `examples/speech_recognition` is implementing ASR task in Fairseq, along with needed features, datasets, models and loss functions to train and infer model described in [Transformers with convolutional context for ASR (Abdelrahman Mohamed et al., 2019)](https://arxiv.org/abs/1904.11660). @@ -32,41 +34,20 @@ sclite -r ${RES_DIR}/ref.word-checkpoint_last.pt-${SET}.txt -h ${RES_DIR}/hypo.w ``` `Sum/Avg` row from first table of the report has WER -## Using wav2letter components -[wav2letter](https://github.com/facebookresearch/wav2letter) now has integration with fairseq. Currently this includes: +## Using flashlight (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) components +[flashlight](https://github.com/facebookresearch/flashlight) now has integration with fairseq. Currently this includes: * AutoSegmentationCriterion (ASG) -* wav2letter-style Conv/GLU model -* wav2letter's beam search decoder - -To use these, follow the instructions on [this page](https://github.com/facebookresearch/wav2letter/tree/master/bindings/python) to install python bindings. 
Please note that python bindings are for a *subset* of wav2letter and don't require its full dependencies (notably, `flashlight` and `ArrayFire` are *not* required). - -To quickly summarize the instructions: first, install [CUDA](https://developer.nvidia.com/cuda-downloads). Then follow these steps: -``` -# additional prerequisites - use equivalents for your distro -sudo apt-get install build-essential cmake libatlas-base-dev libfftw3-dev liblzma-dev libbz2-dev libzstd-dev -# install KenLM from source -git clone https://github.com/kpu/kenlm.git -cd kenlm -mkdir -p build && cd build -cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=ON -make -j16 -cd .. -export KENLM_ROOT_DIR=$(pwd) -cd .. -# install wav2letter python bindings -git clone https://github.com/facebookresearch/wav2letter.git -cd wav2letter/bindings/python -# make sure your python environment is active at this point -pip install torch packaging -pip install -e . -# try some examples to verify installation succeeded -python ./examples/criterion_example.py -python ./examples/decoder_example.py ../../src/decoder/test -python ./examples/feature_example.py ../../src/feature/test/data -``` - -## Training librispeech data (wav2letter style, Conv/GLU + ASG loss) +* flashlight-style Conv/GLU model +* flashlight's beam search decoder + +To use these, follow the instructions on [this page](https://github.com/flashlight/flashlight/tree/e16682fa32df30cbf675c8fe010f929c61e3b833/bindings/python) to install python bindings. **Flashlight v0.3.2** must be used to install the bindings. Running: +``` +git clone --branch v0.3.2 https://github.com/flashlight/flashlight +``` +will properly clone and check out this version. + +## Training librispeech data (flashlight style, Conv/GLU + ASG loss) Training command: ``` python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 100 --task speech_recognition --arch w2l_conv_glu_enc --batch-size 4 --optimizer sgd --lr 0.3,0.8 --momentum 0.8 --clip-norm 0.2 --max-tokens 50000 --log-format json --log-interval 100 --num-workers 0 --sentence-avg --criterion asg_loss --asg-transitions-init 5 --max-replabel 2 --linseg-updates 8789 --user-dir examples/speech_recognition @@ -74,13 +55,13 @@ python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 10 Note that ASG loss currently doesn't do well with word-pieces. You should prepare a dataset with character targets by setting `nbpe=31` in `prepare-librispeech.sh`. -## Inference for librispeech (wav2letter decoder, n-gram LM) +## Inference for librispeech (flashlight decoder, n-gram LM) Inference command: ``` python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder kenlm --kenlm-model $KENLM_MODEL_PATH --lexicon $LEXICON_PATH --beam 200 --beam-threshold 15 --lm-weight 1.5 --word-score 1.5 --sil-weight -0.3 --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition ``` -`$KENLM_MODEL_PATH` should be a standard n-gram language model file. `$LEXICON_PATH` should be a wav2letter-style lexicon (list of known words and their spellings). For ASG inference, a lexicon line should look like this (note the repetition labels): +`$KENLM_MODEL_PATH` should be a standard n-gram language model file. `$LEXICON_PATH` should be a flashlight-style lexicon (list of known words and their spellings). 
For ASG inference, a lexicon line should look like this (note the repetition labels): ``` doorbell D O 1 R B E L 1 ▁ ``` @@ -99,7 +80,7 @@ doorbell ▁DO OR BE L L ``` Lowercase vs. uppercase matters: the *word* should match the case of the n-gram language model (i.e. `$KENLM_MODEL_PATH`), while the *spelling* should match the case of the token dictionary (i.e. `$DIR_FOR_PREPROCESSED_DATA/dict.txt`). -## Inference for librispeech (wav2letter decoder, viterbi only) +## Inference for librispeech (flashlight decoder, viterbi only) Inference command: ``` python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder viterbi --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition diff --git a/examples/speech_recognition/criterions/ASG_loss.py b/examples/speech_recognition/criterions/ASG_loss.py index 7493654afc..41f50bbd70 100644 --- a/examples/speech_recognition/criterions/ASG_loss.py +++ b/examples/speech_recognition/criterions/ASG_loss.py @@ -46,7 +46,7 @@ def __init__( linseg_updates, hide_linseg_messages, ): - from wav2letter.criterion import ASGLoss, CriterionScaleMode + from flashlight.lib.sequence.criterion import ASGLoss, CriterionScaleMode super().__init__(task) self.tgt_dict = task.target_dictionary diff --git a/examples/speech_recognition/criterions/__init__.py b/examples/speech_recognition/criterions/__init__.py index 88af9f340f..579abd2ace 100644 --- a/examples/speech_recognition/criterions/__init__.py +++ b/examples/speech_recognition/criterions/__init__.py @@ -2,14 +2,14 @@ import os -# ASG loss requires wav2letter +# ASG loss requires flashlight bindings files_to_skip = set() try: - import wav2letter + import flashlight.lib.sequence.criterion except ImportError: files_to_skip.add("ASG_loss.py") -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_") and file not in files_to_skip: criterion_name = file[: file.find(".py")] importlib.import_module( diff --git a/examples/speech_recognition/data/replabels.py b/examples/speech_recognition/data/replabels.py index d76bda7aef..441f1bd432 100644 --- a/examples/speech_recognition/data/replabels.py +++ b/examples/speech_recognition/data/replabels.py @@ -6,13 +6,13 @@ # LICENSE file in the root directory of this source tree. """ -Replabel transforms for use with wav2letter's ASG criterion. +Replabel transforms for use with flashlight's ASG criterion. """ def replabel_symbol(i): """ - Replabel symbols used in wav2letter, currently just "1", "2", ... + Replabel symbols used in flashlight, currently just "1", "2", ... This prevents training with numeral tokens, so this might change in the future """ return str(i) diff --git a/examples/speech_recognition/infer.py b/examples/speech_recognition/infer.py index 1570177cc6..ce16bf47cf 100644 --- a/examples/speech_recognition/infer.py +++ b/examples/speech_recognition/infer.py @@ -8,6 +8,7 @@ Run inference for pre-processed data with a trained model. 
""" +import ast import logging import math import os @@ -18,7 +19,6 @@ import torch from fairseq import checkpoint_utils, options, progress_bar, tasks, utils from fairseq.data.data_utils import post_process -from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging.meters import StopwatchMeter, TimeMeter @@ -144,11 +144,11 @@ def process_predictions( print( "{} ({}-{})".format(tgt_words, speaker, id), file=res_files["ref.words"] ) - # only score top hypothesis - if not args.quiet: - logger.debug("HYPO:" + hyp_words) - logger.debug("TARGET:" + tgt_words) - logger.debug("___________________") + + if not args.quiet: + logger.info("HYPO:" + hyp_words) + logger.info("TARGET:" + tgt_words) + logger.info("___________________") hyp_words = hyp_words.split() tgt_words = tgt_words.split() @@ -178,53 +178,6 @@ def get_res_file(file_prefix): } -def load_models_and_criterions( - filenames, data_path, arg_overrides=None, task=None, model_state=None -): - models = [] - criterions = [] - - if arg_overrides is None: - arg_overrides = {} - - arg_overrides["wer_args"] = None - arg_overrides["data"] = data_path - - if filenames is None: - assert model_state is not None - filenames = [0] - else: - filenames = filenames.split(":") - - for filename in filenames: - if model_state is None: - if not os.path.exists(filename): - raise IOError("Model file not found: {}".format(filename)) - state = checkpoint_utils.load_checkpoint_to_cpu(filename, arg_overrides) - else: - state = model_state - - if "cfg" in state: - cfg = state["cfg"] - else: - cfg = convert_namespace_to_omegaconf(state["args"]) - - if task is None: - if hasattr(cfg.task, 'data'): - cfg.task.data = data_path - task = tasks.setup_task(cfg.task) - - model = task.build_model(cfg.model) - model.load_state_dict(state["model"], strict=True) - models.append(model) - - criterion = task.build_criterion(cfg.criterion) - if "criterion" in state: - criterion.load_state_dict(state["criterion"], strict=True) - criterions.append(criterion) - return models, criterions, task - - def optimize_models(args, use_cuda, models): """Optimize ensemble for generation""" for model in models: @@ -238,6 +191,12 @@ def optimize_models(args, use_cuda, models): model.cuda() +def apply_half(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.half) + return t + + class ExistingEmissionsDecoder(object): def __init__(self, decoder, emissions): self.decoder = decoder @@ -257,31 +216,36 @@ def generate(self, models, sample, **unused): def main(args, task=None, model_state=None): check_args(args) + use_fp16 = args.fp16 if args.max_tokens is None and args.batch_size is None: args.max_tokens = 4000000 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu - logger.info("| decoding with criterion {}".format(args.criterion)) + task = tasks.setup_task(args) + # Load ensemble if args.load_emissions: models, criterions = [], [] + task.load_dataset(args.gen_subset) else: logger.info("| loading model(s) from {}".format(args.path)) - models, criterions, task = load_models_and_criterions( - args.path, - data_path=args.data, - arg_overrides=eval(args.model_overrides), # noqa + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + utils.split_paths(args.path, separator="\\"), + arg_overrides=ast.literal_eval(args.model_overrides), task=task, - model_state=model_state, + suffix=args.checkpoint_suffix, + strict=(args.checkpoint_shard_count == 1), + num_shards=args.checkpoint_shard_count, + state=model_state, ) 
optimize_models(args, use_cuda, models) + task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task) + - # Load dataset splits - task.load_dataset(args.gen_subset) # Set dictionary tgt_dict = task.target_dictionary @@ -293,8 +257,9 @@ def main(args, task=None, model_state=None): # hack to pass transitions to W2lDecoder if args.criterion == "asg_loss": - trans = criterions[0].asg.trans.data - args.asg_transitions = torch.flatten(trans).tolist() + raise NotImplementedError("asg_loss is currently not supported") + # trans = criterions[0].asg.trans.data + # args.asg_transitions = torch.flatten(trans).tolist() # Load dataset (possibly sharded) itr = get_dataset_itr(args, task, models) @@ -318,7 +283,7 @@ def build_generator(args): return W2lFairseqLMDecoder(args, task.target_dictionary) else: print( - "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment" + "only flashlight decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment" ) # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task @@ -359,6 +324,8 @@ def build_generator(args): wps_meter = TimeMeter() for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample + if use_fp16: + sample = utils.apply_to_sample(apply_half, sample) if "net_input" not in sample: continue diff --git a/examples/speech_recognition/kaldi/__init__.py b/examples/speech_recognition/kaldi/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/speech_recognition/kaldi/add-self-loop-simple.cc b/examples/speech_recognition/kaldi/add-self-loop-simple.cc new file mode 100644 index 0000000000..e18fb62df5 --- /dev/null +++ b/examples/speech_recognition/kaldi/add-self-loop-simple.cc @@ -0,0 +1,94 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <iostream> +#include "fstext/fstext-lib.h" // @manual +#include "util/common-utils.h" // @manual + +/* + * This program is to modify a FST without self-loop by: + * for each incoming arc with non-eps input symbol, add a self-loop arc + * with that non-eps symbol as input and eps as output. 
+ * + * This is to make sure the resultant FST can do deduplication for repeated + * symbols, which is very common in acoustic model + * + */ +namespace { +int32 AddSelfLoopsSimple(fst::StdVectorFst* fst) { + typedef fst::MutableArcIterator<fst::StdVectorFst> IterType; + + int32 num_states_before = fst->NumStates(); + fst::MakePrecedingInputSymbolsSame(false, fst); + int32 num_states_after = fst->NumStates(); + KALDI_LOG << "There are " << num_states_before + << " states in the original FST; " + << " after MakePrecedingInputSymbolsSame, there are " + << num_states_after << " states " << std::endl; + + auto weight_one = fst::StdArc::Weight::One(); + + int32 num_arc_added = 0; + + fst::StdArc self_loop_arc; + self_loop_arc.weight = weight_one; + + int32 num_states = fst->NumStates(); + std::vector<std::set<int32>> incoming_non_eps_label_per_state(num_states); + + for (int32 state = 0; state < num_states; state++) { + for (IterType aiter(fst, state); !aiter.Done(); aiter.Next()) { + fst::StdArc arc(aiter.Value()); + if (arc.ilabel != 0) { + incoming_non_eps_label_per_state[arc.nextstate].insert(arc.ilabel); + } + } + } + + for (int32 state = 0; state < num_states; state++) { + if (!incoming_non_eps_label_per_state[state].empty()) { + auto& ilabel_set = incoming_non_eps_label_per_state[state]; + for (auto it = ilabel_set.begin(); it != ilabel_set.end(); it++) { + self_loop_arc.ilabel = *it; + self_loop_arc.olabel = 0; + self_loop_arc.nextstate = state; + fst->AddArc(state, self_loop_arc); + num_arc_added++; + } + } + } + return num_arc_added; +} + +void print_usage() { + std::cout << "add-self-loop-simple usage:\n" + "\tadd-self-loop-simple <in-fst> <out-fst> \n"; +} +} // namespace + +int main(int argc, char** argv) { + if (argc != 3) { + print_usage(); + exit(1); + } + + auto input = argv[1]; + auto output = argv[2]; + + auto fst = fst::ReadFstKaldi(input); + auto num_states = fst->NumStates(); + KALDI_LOG << "Loading FST from " << input << " with " << num_states + << " states." << std::endl; + + int32 num_arc_added = AddSelfLoopsSimple(fst); + KALDI_LOG << "Adding " << num_arc_added << " self-loop arcs " << std::endl; + + fst::WriteFstKaldi(*fst, std::string(output)); + KALDI_LOG << "Writing FST to " << output << std::endl; + + delete fst; +} diff --git a/examples/speech_recognition/kaldi/config/kaldi_initializer.yaml b/examples/speech_recognition/kaldi/config/kaldi_initializer.yaml new file mode 100644 index 0000000000..be9ba98f55 --- /dev/null +++ b/examples/speech_recognition/kaldi/config/kaldi_initializer.yaml @@ -0,0 +1,8 @@ +# @package _group_ + +data_dir: ??? +fst_dir: ??? +in_labels: ??? +kaldi_root: ??? +lm_arpa: ??? +blank_symbol: <s> diff --git a/examples/speech_recognition/kaldi/kaldi_decoder.py b/examples/speech_recognition/kaldi/kaldi_decoder.py new file mode 100644 index 0000000000..5f62cc58ae --- /dev/null +++ b/examples/speech_recognition/kaldi/kaldi_decoder.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
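`KaldiDecoder` below wraps pykaldi's `FasterRecognizer`/`LatticeFasterRecognizer` behind a fairseq-style `generate()` interface. A minimal construction sketch, assuming pykaldi is installed, the `examples` package is importable, and a compiled HLG graph plus an output dictionary already exist (all paths are illustrative):

```
# Sketch only; paths are illustrative and pykaldi must be installed.
from examples.speech_recognition.kaldi.kaldi_decoder import (
    KaldiDecoder,
    KaldiDecoderConfig,
)

cfg = KaldiDecoderConfig(
    hlg_graph_path="/path/to/HLG.fst",      # skip kaldi_initializer if a graph already exists
    output_dict="/path/to/kaldi_dict.txt",  # "<symbol> <id>" per line, as read below
    acoustic_scale=0.5,
    is_lattice=False,                       # FasterDecoder; nbest > 1 requires is_lattice=True
)
decoder = KaldiDecoder(cfg, beam=15, nbest=1)
# decoder.generate(models, sample) submits one decode per utterance to a thread
# pool and returns a list of futures holding {"tokens", "words", "score", ...}.
```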
+ +from concurrent.futures import ThreadPoolExecutor +import logging +from omegaconf import MISSING +import os +import torch +from typing import Optional +import warnings + + +from dataclasses import dataclass +from fairseq.dataclass import FairseqDataclass +from .kaldi_initializer import KaldiInitializerConfig, initalize_kaldi + + +logger = logging.getLogger(__name__) + + +@dataclass +class KaldiDecoderConfig(FairseqDataclass): + hlg_graph_path: Optional[str] = None + output_dict: str = MISSING + + kaldi_initializer_config: Optional[KaldiInitializerConfig] = None + + acoustic_scale: float = 0.5 + max_active: int = 10000 + beam_delta: float = 0.5 + hash_ratio: float = 2.0 + + is_lattice: bool = False + lattice_beam: float = 10.0 + prune_interval: int = 25 + determinize_lattice: bool = True + prune_scale: float = 0.1 + max_mem: int = 0 + phone_determinize: bool = True + word_determinize: bool = True + minimize: bool = True + + num_threads: int = 1 + + +class KaldiDecoder(object): + def __init__( + self, + cfg: KaldiDecoderConfig, + beam: int, + nbest: int = 1, + ): + try: + from kaldi.asr import FasterRecognizer, LatticeFasterRecognizer + from kaldi.base import set_verbose_level + from kaldi.decoder import ( + FasterDecoder, + FasterDecoderOptions, + LatticeFasterDecoder, + LatticeFasterDecoderOptions, + ) + from kaldi.lat.functions import DeterminizeLatticePhonePrunedOptions + from kaldi.fstext import read_fst_kaldi, SymbolTable + except: + warnings.warn( + "pykaldi is required for this functionality. Please install from https://github.com/pykaldi/pykaldi" + ) + + # set_verbose_level(2) + + self.acoustic_scale = cfg.acoustic_scale + self.nbest = nbest + + if cfg.hlg_graph_path is None: + assert ( + cfg.kaldi_initializer_config is not None + ), "Must provide hlg graph path or kaldi initializer config" + cfg.hlg_graph_path = initalize_kaldi(cfg.kaldi_initializer_config) + + assert os.path.exists(cfg.hlg_graph_path), cfg.hlg_graph_path + + if cfg.is_lattice: + self.dec_cls = LatticeFasterDecoder + opt_cls = LatticeFasterDecoderOptions + self.rec_cls = LatticeFasterRecognizer + else: + assert self.nbest == 1, "nbest > 1 requires lattice decoder" + self.dec_cls = FasterDecoder + opt_cls = FasterDecoderOptions + self.rec_cls = FasterRecognizer + + self.decoder_options = opt_cls() + self.decoder_options.beam = beam + self.decoder_options.max_active = cfg.max_active + self.decoder_options.beam_delta = cfg.beam_delta + self.decoder_options.hash_ratio = cfg.hash_ratio + + if cfg.is_lattice: + self.decoder_options.lattice_beam = cfg.lattice_beam + self.decoder_options.prune_interval = cfg.prune_interval + self.decoder_options.determinize_lattice = cfg.determinize_lattice + self.decoder_options.prune_scale = cfg.prune_scale + det_opts = DeterminizeLatticePhonePrunedOptions() + det_opts.max_mem = cfg.max_mem + det_opts.phone_determinize = cfg.phone_determinize + det_opts.word_determinize = cfg.word_determinize + det_opts.minimize = cfg.minimize + self.decoder_options.det_opts = det_opts + + self.output_symbols = {} + with open(cfg.output_dict, "r") as f: + for line in f: + items = line.rstrip().split() + assert len(items) == 2 + self.output_symbols[int(items[1])] = items[0] + + logger.info(f"Loading FST from {cfg.hlg_graph_path}") + self.fst = read_fst_kaldi(cfg.hlg_graph_path) + self.symbol_table = SymbolTable.read_text(cfg.output_dict) + + self.executor = ThreadPoolExecutor(max_workers=cfg.num_threads) + + def generate(self, models, sample, **unused): + """Generate a batch of inferences.""" + # 
model.forward normally channels prev_output_tokens into the decoder + # separately, but SequenceGenerator directly calls model.encoder + encoder_input = { + k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens" + } + emissions, padding = self.get_emissions(models, encoder_input) + return self.decode(emissions, padding) + + def get_emissions(self, models, encoder_input): + """Run encoder and normalize emissions""" + model = models[0] + + all_encoder_out = [m(**encoder_input) for m in models] + + if len(all_encoder_out) > 1: + + if "encoder_out" in all_encoder_out[0]: + encoder_out = { + "encoder_out": sum(e["encoder_out"] for e in all_encoder_out) + / len(all_encoder_out), + "encoder_padding_mask": all_encoder_out[0]["encoder_padding_mask"], + } + padding = encoder_out["encoder_padding_mask"] + else: + encoder_out = { + "logits": sum(e["logits"] for e in all_encoder_out) + / len(all_encoder_out), + "padding_mask": all_encoder_out[0]["padding_mask"], + } + padding = encoder_out["padding_mask"] + else: + encoder_out = all_encoder_out[0] + padding = ( + encoder_out["padding_mask"] + if "padding_mask" in encoder_out + else encoder_out["encoder_padding_mask"] + ) + + if hasattr(model, "get_logits"): + emissions = model.get_logits(encoder_out, normalize=True) + else: + emissions = model.get_normalized_probs(encoder_out, log_probs=True) + + return ( + emissions.cpu().float().transpose(0, 1), + padding.cpu() if padding is not None and padding.any() else None, + ) + + def decode_one(self, logits, padding): + from kaldi.matrix import Matrix + + decoder = self.dec_cls(self.fst, self.decoder_options) + asr = self.rec_cls( + decoder, self.symbol_table, acoustic_scale=self.acoustic_scale + ) + + if padding is not None: + logits = logits[~padding] + + mat = Matrix(logits.numpy()) + + out = asr.decode(mat) + + if self.nbest > 1: + from kaldi.fstext import shortestpath + from kaldi.fstext.utils import ( + convert_compact_lattice_to_lattice, + convert_lattice_to_std, + convert_nbest_to_list, + get_linear_symbol_sequence, + ) + + lat = out["lattice"] + + sp = shortestpath(lat, nshortest=self.nbest) + + sp = convert_compact_lattice_to_lattice(sp) + sp = convert_lattice_to_std(sp) + seq = convert_nbest_to_list(sp) + + results = [] + for s in seq: + _, o, w = get_linear_symbol_sequence(s) + words = list(self.output_symbols[z] for z in o) + results.append( + { + "tokens": words, + "words": words, + "score": w.value, + "emissions": logits, + } + ) + return results + else: + words = out["text"].split() + return [ + { + "tokens": words, + "words": words, + "score": out["likelihood"], + "emissions": logits, + } + ] + + def decode(self, emissions, padding): + if padding is None: + padding = [None] * len(emissions) + + ret = list( + map( + lambda e, p: self.executor.submit(self.decode_one, e, p), + emissions, + padding, + ) + ) + return ret diff --git a/examples/speech_recognition/kaldi/kaldi_initializer.py b/examples/speech_recognition/kaldi/kaldi_initializer.py new file mode 100644 index 0000000000..6d2a2a4b6b --- /dev/null +++ b/examples/speech_recognition/kaldi/kaldi_initializer.py @@ -0,0 +1,698 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
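`KaldiInitializerConfig` below mirrors `config/kaldi_initializer.yaml` and drives compilation of the HLG graph that `KaldiDecoder` consumes. A minimal programmatic sketch (paths and the label-set name are illustrative; `initalize_kaldi` is spelled as it is imported in `kaldi_decoder.py`):

```
# Sketch only; paths and label names are illustrative.
from examples.speech_recognition.kaldi.kaldi_initializer import (
    KaldiInitializerConfig,
    initalize_kaldi,  # spelling as used by kaldi_decoder.py
)

cfg = KaldiInitializerConfig(
    data_dir="/path/to/prepared/data",
    fst_dir="/path/to/fst/workdir",
    in_labels="ltr",                  # illustrative label-set name
    kaldi_root="/path/to/kaldi",
    lm_arpa="/path/to/lm.arpa",
    blank_symbol="<s>",
)
hlg_graph_path = initalize_kaldi(cfg)  # returns the path of the compiled HLG graph
```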
+ +from dataclasses import dataclass +import hydra +from hydra.core.config_store import ConfigStore +import logging +from omegaconf import MISSING, OmegaConf +import os +import os.path as osp +from pathlib import Path +import subprocess +from typing import Optional + +from fairseq.data.dictionary import Dictionary +from fairseq.dataclass import FairseqDataclass + +script_dir = Path(__file__).resolve().parent +config_path = script_dir / "config" + + +logger = logging.getLogger(__name__) + + +@dataclass +class KaldiInitializerConfig(FairseqDataclass): + data_dir: str = MISSING + fst_dir: Optional[str] = None + in_labels: str = MISSING + out_labels: Optional[str] = None + wav2letter_lexicon: Optional[str] = None + lm_arpa: str = MISSING + kaldi_root: str = MISSING + blank_symbol: str = "<s>" + silence_symbol: Optional[str] = None + + +def create_units(fst_dir: Path, in_labels: str, vocab: Dictionary) -> Path: + in_units_file = fst_dir / f"kaldi_dict.{in_labels}.txt" + if not in_units_file.exists(): + + logger.info(f"Creating {in_units_file}") + + with open(in_units_file, "w") as f: + print("<eps> 0", file=f) + i = 1 + for symb in vocab.symbols[vocab.nspecial :]: + if not symb.startswith("madeupword"): + print(f"{symb} {i}", file=f) + i += 1 + return in_units_file + + +def create_lexicon( + cfg: KaldiInitializerConfig, + fst_dir: Path, + unique_label: str, + in_units_file: Path, + out_words_file: Path, +) -> (Path, Path): + + disambig_in_units_file = fst_dir / f"kaldi_dict.{cfg.in_labels}_disambig.txt" + lexicon_file = fst_dir / f"kaldi_lexicon.{unique_label}.txt" + disambig_lexicon_file = fst_dir / f"kaldi_lexicon.{unique_label}_disambig.txt" + if ( + not lexicon_file.exists() + or not disambig_lexicon_file.exists() + or not disambig_in_units_file.exists() + ): + logger.info(f"Creating {lexicon_file} (in units file: {in_units_file})") + + assert cfg.wav2letter_lexicon is not None or cfg.in_labels == cfg.out_labels + + if cfg.wav2letter_lexicon is not None: + lm_words = set() + with open(out_words_file, "r") as lm_dict_f: + for line in lm_dict_f: + lm_words.add(line.split()[0]) + + num_skipped = 0 + total = 0 + with open(cfg.wav2letter_lexicon, "r") as w2l_lex_f, open( + lexicon_file, "w" + ) as out_f: + for line in w2l_lex_f: + items = line.rstrip().split("\t") + assert len(items) == 2, items + if items[0] in lm_words: + print(items[0], items[1], file=out_f) + else: + num_skipped += 1 + logger.debug( + f"Skipping word {items[0]} as it was not found in LM" + ) + total += 1 + if num_skipped > 0: + logger.warning( + f"Skipped {num_skipped} out of {total} words as they were not found in LM" + ) + else: + with open(in_units_file, "r") as in_f, open(lexicon_file, "w") as out_f: + for line in in_f: + symb = line.split()[0] + if symb != "<eps>" and symb != "<ctc_blank>" and symb != "<SIL>": + print(symb, symb, file=out_f) + + lex_disambig_path = ( + Path(cfg.kaldi_root) / "egs/wsj/s5/utils/add_lex_disambig.pl" + ) + res = subprocess.run( + [lex_disambig_path, lexicon_file, disambig_lexicon_file], + check=True, + capture_output=True, + ) + ndisambig = int(res.stdout) + disamib_path = Path(cfg.kaldi_root) / "egs/wsj/s5/utils/add_disambig.pl" + res = subprocess.run( + [disamib_path, "--include-zero", in_units_file, str(ndisambig)], + check=True, + capture_output=True, + ) + with open(disambig_in_units_file, "wb") as f: + f.write(res.stdout) + + return disambig_lexicon_file, disambig_in_units_file + + +def create_G( + kaldi_root: Path, fst_dir: Path, lm_arpa: Path, arpa_base: str +) -> (Path, Path): + + 
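+    """Compile the ARPA language model into a grammar WFST G.fst with Kaldi's
+    arpa2fst, writing its word symbol table next to it; both artifacts are
+    reused if they already exist."""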
out_words_file = fst_dir / f"kaldi_dict.{arpa_base}.txt" + grammar_graph = fst_dir / f"G_{arpa_base}.fst" + if not grammar_graph.exists() or not out_words_file.exists(): + logger.info(f"Creating {grammar_graph}") + arpa2fst = kaldi_root / "src/lmbin/arpa2fst" + subprocess.run( + [ + arpa2fst, + "--disambig-symbol=#0", + f"--write-symbol-table={out_words_file}", + lm_arpa, + grammar_graph, + ], + check=True, + ) + return grammar_graph, out_words_file + + +def create_L( + kaldi_root: Path, + fst_dir: Path, + unique_label: str, + lexicon_file: Path, + in_units_file: Path, + out_words_file: Path, +) -> Path: + lexicon_graph = fst_dir / f"L.{unique_label}.fst" + + if not lexicon_graph.exists(): + logger.info(f"Creating {lexicon_graph} (in units: {in_units_file})") + make_lex = kaldi_root / "egs/wsj/s5/utils/make_lexicon_fst.pl" + fstcompile = kaldi_root / "tools/openfst-1.6.7/bin/fstcompile" + fstaddselfloops = kaldi_root / "src/fstbin/fstaddselfloops" + fstarcsort = kaldi_root / "tools/openfst-1.6.7/bin/fstarcsort" + + def write_disambig_symbol(file): + with open(file, "r") as f: + for line in f: + items = line.rstrip().split() + if items[0] == "#0": + out_path = str(file) + "_disamig" + with open(out_path, "w") as out_f: + print(items[1], file=out_f) + return out_path + + return None + + in_disambig_sym = write_disambig_symbol(in_units_file) + assert in_disambig_sym is not None + out_disambig_sym = write_disambig_symbol(out_words_file) + assert out_disambig_sym is not None + + try: + with open(lexicon_graph, "wb") as out_f: + res = subprocess.run( + [make_lex, lexicon_file], capture_output=True, check=True + ) + assert len(res.stderr) == 0, res.stderr.decode("utf-8") + res = subprocess.run( + [ + fstcompile, + f"--isymbols={in_units_file}", + f"--osymbols={out_words_file}", + "--keep_isymbols=false", + "--keep_osymbols=false", + ], + input=res.stdout, + capture_output=True, + ) + assert len(res.stderr) == 0, res.stderr.decode("utf-8") + res = subprocess.run( + [fstaddselfloops, in_disambig_sym, out_disambig_sym], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstarcsort, "--sort_type=olabel"], + input=res.stdout, + capture_output=True, + check=True, + ) + out_f.write(res.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}") + os.remove(lexicon_graph) + raise + except AssertionError: + os.remove(lexicon_graph) + raise + + return lexicon_graph + + +def create_LG( + kaldi_root: Path, + fst_dir: Path, + unique_label: str, + lexicon_graph: Path, + grammar_graph: Path, +) -> Path: + lg_graph = fst_dir / f"LG.{unique_label}.fst" + + if not lg_graph.exists(): + logger.info(f"Creating {lg_graph}") + + fsttablecompose = kaldi_root / "src/fstbin/fsttablecompose" + fstdeterminizestar = kaldi_root / "src/fstbin/fstdeterminizestar" + fstminimizeencoded = kaldi_root / "src/fstbin/fstminimizeencoded" + fstpushspecial = kaldi_root / "src/fstbin/fstpushspecial" + fstarcsort = kaldi_root / "tools/openfst-1.6.7/bin/fstarcsort" + + try: + with open(lg_graph, "wb") as out_f: + res = subprocess.run( + [fsttablecompose, lexicon_graph, grammar_graph], + capture_output=True, + check=True, + ) + res = subprocess.run( + [ + fstdeterminizestar, + "--use-log=true", + ], + input=res.stdout, + capture_output=True, + ) + res = subprocess.run( + [fstminimizeencoded], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstpushspecial], + input=res.stdout, + capture_output=True, + 
check=True, + ) + res = subprocess.run( + [fstarcsort, "--sort_type=ilabel"], + input=res.stdout, + capture_output=True, + check=True, + ) + out_f.write(res.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}") + os.remove(lg_graph) + raise + + return lg_graph + + +def create_H( + kaldi_root: Path, + fst_dir: Path, + disambig_out_units_file: Path, + in_labels: str, + vocab: Dictionary, + blk_sym: str, + silence_symbol: Optional[str], +) -> (Path, Path, Path): + h_graph = ( + fst_dir / f"H.{in_labels}{'_' + silence_symbol if silence_symbol else ''}.fst" + ) + h_out_units_file = fst_dir / f"kaldi_dict.h_out.{in_labels}.txt" + disambig_in_units_file_int = Path(str(h_graph) + "isym_disambig.int") + disambig_out_units_file_int = Path(str(disambig_out_units_file) + ".int") + if ( + not h_graph.exists() + or not h_out_units_file.exists() + or not disambig_in_units_file_int.exists() + ): + logger.info(f"Creating {h_graph}") + eps_sym = "<eps>" + + num_disambig = 0 + osymbols = [] + + with open(disambig_out_units_file, "r") as f, open( + disambig_out_units_file_int, "w" + ) as out_f: + for line in f: + symb, id = line.rstrip().split() + if line.startswith("#"): + num_disambig += 1 + print(id, file=out_f) + else: + if len(osymbols) == 0: + assert symb == eps_sym, symb + osymbols.append((symb, id)) + + i_idx = 0 + isymbols = [(eps_sym, 0)] + + imap = {} + + for i, s in enumerate(vocab.symbols): + i_idx += 1 + isymbols.append((s, i_idx)) + imap[s] = i_idx + + fst_str = [] + + node_idx = 0 + root_node = node_idx + + special_symbols = [blk_sym] + if silence_symbol is not None: + special_symbols.append(silence_symbol) + + for ss in special_symbols: + fst_str.append("{} {} {} {}".format(root_node, root_node, ss, eps_sym)) + + for symbol, _ in osymbols: + if symbol == eps_sym or symbol.startswith("#"): + continue + + node_idx += 1 + # 1. from root to emitting state + fst_str.append("{} {} {} {}".format(root_node, node_idx, symbol, symbol)) + # 2. from emitting state back to root + fst_str.append("{} {} {} {}".format(node_idx, root_node, eps_sym, eps_sym)) + # 3. from emitting state to optional blank state + pre_node = node_idx + node_idx += 1 + for ss in special_symbols: + fst_str.append("{} {} {} {}".format(pre_node, node_idx, ss, eps_sym)) + # 4. 
from blank state back to root + fst_str.append("{} {} {} {}".format(node_idx, root_node, eps_sym, eps_sym)) + + fst_str.append("{}".format(root_node)) + + fst_str = "\n".join(fst_str) + h_str = str(h_graph) + isym_file = h_str + ".isym" + + with open(isym_file, "w") as f: + for sym, id in isymbols: + f.write("{} {}\n".format(sym, id)) + + with open(h_out_units_file, "w") as f: + for sym, id in osymbols: + f.write("{} {}\n".format(sym, id)) + + with open(disambig_in_units_file_int, "w") as f: + disam_sym_id = len(isymbols) + for _ in range(num_disambig): + f.write("{}\n".format(disam_sym_id)) + disam_sym_id += 1 + + fstcompile = kaldi_root / "tools/openfst-1.6.7/bin/fstcompile" + fstaddselfloops = kaldi_root / "src/fstbin/fstaddselfloops" + fstarcsort = kaldi_root / "tools/openfst-1.6.7/bin/fstarcsort" + + try: + with open(h_graph, "wb") as out_f: + res = subprocess.run( + [ + fstcompile, + f"--isymbols={isym_file}", + f"--osymbols={h_out_units_file}", + "--keep_isymbols=false", + "--keep_osymbols=false", + ], + input=str.encode(fst_str), + capture_output=True, + check=True, + ) + res = subprocess.run( + [ + fstaddselfloops, + disambig_in_units_file_int, + disambig_out_units_file_int, + ], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstarcsort, "--sort_type=olabel"], + input=res.stdout, + capture_output=True, + check=True, + ) + out_f.write(res.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}") + os.remove(h_graph) + raise + return h_graph, h_out_units_file, disambig_in_units_file_int + + +def create_HLGa( + kaldi_root: Path, + fst_dir: Path, + unique_label: str, + h_graph: Path, + lg_graph: Path, + disambig_in_words_file_int: Path, +) -> Path: + hlga_graph = fst_dir / f"HLGa.{unique_label}.fst" + + if not hlga_graph.exists(): + logger.info(f"Creating {hlga_graph}") + + fsttablecompose = kaldi_root / "src/fstbin/fsttablecompose" + fstdeterminizestar = kaldi_root / "src/fstbin/fstdeterminizestar" + fstrmsymbols = kaldi_root / "src/fstbin/fstrmsymbols" + fstrmepslocal = kaldi_root / "src/fstbin/fstrmepslocal" + fstminimizeencoded = kaldi_root / "src/fstbin/fstminimizeencoded" + + try: + with open(hlga_graph, "wb") as out_f: + res = subprocess.run( + [ + fsttablecompose, + h_graph, + lg_graph, + ], + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstdeterminizestar, "--use-log=true"], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstrmsymbols, disambig_in_words_file_int], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstrmepslocal], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstminimizeencoded], + input=res.stdout, + capture_output=True, + check=True, + ) + out_f.write(res.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}") + os.remove(hlga_graph) + raise + + return hlga_graph + + +def create_HLa( + kaldi_root: Path, + fst_dir: Path, + unique_label: str, + h_graph: Path, + l_graph: Path, + disambig_in_words_file_int: Path, +) -> Path: + hla_graph = fst_dir / f"HLa.{unique_label}.fst" + + if not hla_graph.exists(): + logger.info(f"Creating {hla_graph}") + + fsttablecompose = kaldi_root / "src/fstbin/fsttablecompose" + fstdeterminizestar = kaldi_root / "src/fstbin/fstdeterminizestar" + fstrmsymbols = kaldi_root / "src/fstbin/fstrmsymbols" + fstrmepslocal = kaldi_root / 
"src/fstbin/fstrmepslocal" + fstminimizeencoded = kaldi_root / "src/fstbin/fstminimizeencoded" + + try: + with open(hla_graph, "wb") as out_f: + res = subprocess.run( + [ + fsttablecompose, + h_graph, + l_graph, + ], + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstdeterminizestar, "--use-log=true"], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstrmsymbols, disambig_in_words_file_int], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstrmepslocal], + input=res.stdout, + capture_output=True, + check=True, + ) + res = subprocess.run( + [fstminimizeencoded], + input=res.stdout, + capture_output=True, + check=True, + ) + out_f.write(res.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}") + os.remove(hla_graph) + raise + + return hla_graph + + +def create_HLG( + kaldi_root: Path, + fst_dir: Path, + unique_label: str, + hlga_graph: Path, + prefix: str = "HLG", +) -> Path: + hlg_graph = fst_dir / f"{prefix}.{unique_label}.fst" + + if not hlg_graph.exists(): + logger.info(f"Creating {hlg_graph}") + + add_self_loop = script_dir / "add-self-loop-simple" + kaldi_src = kaldi_root / "src" + kaldi_lib = kaldi_src / "lib" + + try: + if not add_self_loop.exists(): + fst_include = kaldi_root / "tools/openfst-1.6.7/include" + add_self_loop_src = script_dir / "add-self-loop-simple.cc" + + subprocess.run( + [ + "c++", + f"-I{kaldi_src}", + f"-I{fst_include}", + f"-L{kaldi_lib}", + add_self_loop_src, + "-lkaldi-base", + "-lkaldi-fstext", + "-o", + add_self_loop, + ], + check=True, + ) + + my_env = os.environ.copy() + my_env["LD_LIBRARY_PATH"] = f"{kaldi_lib}:{my_env['LD_LIBRARY_PATH']}" + + subprocess.run( + [ + add_self_loop, + hlga_graph, + hlg_graph, + ], + check=True, + capture_output=True, + env=my_env, + ) + except subprocess.CalledProcessError as e: + logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}") + raise + + return hlg_graph + + +def initalize_kaldi(cfg: KaldiInitializerConfig) -> Path: + if cfg.fst_dir is None: + cfg.fst_dir = osp.join(cfg.data_dir, "kaldi") + if cfg.out_labels is None: + cfg.out_labels = cfg.in_labels + + kaldi_root = Path(cfg.kaldi_root) + data_dir = Path(cfg.data_dir) + fst_dir = Path(cfg.fst_dir) + fst_dir.mkdir(parents=True, exist_ok=True) + + arpa_base = osp.splitext(osp.basename(cfg.lm_arpa))[0] + unique_label = f"{cfg.in_labels}.{arpa_base}" + + with open(data_dir / f"dict.{cfg.in_labels}.txt", "r") as f: + vocab = Dictionary.load(f) + + in_units_file = create_units(fst_dir, cfg.in_labels, vocab) + + grammar_graph, out_words_file = create_G( + kaldi_root, fst_dir, Path(cfg.lm_arpa), arpa_base + ) + + disambig_lexicon_file, disambig_L_in_units_file = create_lexicon( + cfg, fst_dir, unique_label, in_units_file, out_words_file + ) + + h_graph, h_out_units_file, disambig_in_units_file_int = create_H( + kaldi_root, + fst_dir, + disambig_L_in_units_file, + cfg.in_labels, + vocab, + cfg.blank_symbol, + cfg.silence_symbol, + ) + lexicon_graph = create_L( + kaldi_root, + fst_dir, + unique_label, + disambig_lexicon_file, + disambig_L_in_units_file, + out_words_file, + ) + lg_graph = create_LG( + kaldi_root, fst_dir, unique_label, lexicon_graph, grammar_graph + ) + hlga_graph = create_HLGa( + kaldi_root, fst_dir, unique_label, h_graph, lg_graph, disambig_in_units_file_int + ) + hlg_graph = create_HLG(kaldi_root, fst_dir, unique_label, hlga_graph) + + # for debugging + # hla_graph = 
create_HLa(kaldi_root, fst_dir, unique_label, h_graph, lexicon_graph, disambig_in_units_file_int) + # hl_graph = create_HLG(kaldi_root, fst_dir, unique_label, hla_graph, prefix="HL_looped") + # create_HLG(kaldi_root, fst_dir, "phnc", h_graph, prefix="H_looped") + + return hlg_graph + + +@hydra.main(config_path=config_path, config_name="kaldi_initializer") +def cli_main(cfg: KaldiInitializerConfig) -> None: + container = OmegaConf.to_container(cfg, resolve=True, enum_to_str=True) + cfg = OmegaConf.create(container) + OmegaConf.set_struct(cfg, True) + initalize_kaldi(cfg) + + +if __name__ == "__main__": + + logging.root.setLevel(logging.INFO) + logging.basicConfig(level=logging.INFO) + + try: + from hydra._internal.utils import ( + get_args, + ) # pylint: disable=import-outside-toplevel + + cfg_name = get_args().config_name or "kaldi_initializer" + except ImportError: + logger.warning("Failed to get config name from hydra args") + cfg_name = "kaldi_initializer" + + cs = ConfigStore.instance() + cs.store(name=cfg_name, node=KaldiInitializerConfig) + + cli_main() diff --git a/examples/speech_recognition/models/__init__.py b/examples/speech_recognition/models/__init__.py index 0ad9663f11..54b5a1c312 100644 --- a/examples/speech_recognition/models/__init__.py +++ b/examples/speech_recognition/models/__init__.py @@ -2,7 +2,7 @@ import os -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): model_name = file[: file.find(".py")] importlib.import_module("examples.speech_recognition.models." + model_name) diff --git a/examples/speech_recognition/models/vggtransformer.py b/examples/speech_recognition/models/vggtransformer.py index 97974360a4..bca0ae59a8 100644 --- a/examples/speech_recognition/models/vggtransformer.py +++ b/examples/speech_recognition/models/vggtransformer.py @@ -203,6 +203,7 @@ def prepare_transformer_decoder_params( relu_dropout, ): args = argparse.Namespace() + args.encoder_embed_dim = None args.decoder_embed_dim = input_dim args.decoder_attention_heads = num_heads args.attention_dropout = attention_dropout diff --git a/examples/speech_recognition/new/README.md b/examples/speech_recognition/new/README.md new file mode 100644 index 0000000000..5fa0e97245 --- /dev/null +++ b/examples/speech_recognition/new/README.md @@ -0,0 +1,43 @@ +# Flashlight Decoder + +This script runs decoding for pre-trained speech recognition models. 
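+
+Three decoder types are available through `decoding.type`: `viterbi` (no language model), `kenlm` and `fairseqlm`. The Usage section below shows the KenLM case; a minimal no-LM sketch (with placeholder paths) looks like:
+
+```bash
+python $FAIRSEQ_ROOT/examples/speech_recognition/new/infer.py \
+    task=audio_pretraining \
+    task.data=/path/to/data \
+    task.labels=ltr \
+    common_eval.path=/path/to/checkpoint.pt \
+    decoding.type=viterbi \
+    dataset.gen_subset=dev_other
+```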
+ +## Usage + +Assuming a few variables: + +```bash +checkpoint=<path-to-checkpoint> +data=<path-to-data-directory> +lm_model=<path-to-language-model> +lexicon=<path-to-lexicon> +``` + +Example usage for decoding a fine-tuned Wav2Vec model: + +```bash +python $FAIRSEQ_ROOT/examples/speech_recognition/new/infer.py --multirun \ + task=audio_pretraining \ + task.data=$data \ + task.labels=ltr \ + common_eval.path=$checkpoint \ + decoding.type=kenlm \ + decoding.lexicon=$lexicon \ + decoding.lmpath=$lm_model \ + dataset.gen_subset=dev_clean,dev_other,test_clean,test_other +``` + +Example usage for using Ax to sweep WER parameters (requires `pip install hydra-ax-sweeper`): + +```bash +python $FAIRSEQ_ROOT/examples/speech_recognition/new/infer.py --multirun \ + hydra/sweeper=ax \ + task=audio_pretraining \ + task.data=$data \ + task.labels=ltr \ + common_eval.path=$checkpoint \ + decoding.type=kenlm \ + decoding.lexicon=$lexicon \ + decoding.lmpath=$lm_model \ + dataset.gen_subset=dev_other +``` diff --git a/examples/speech_recognition/new/__init__.py b/examples/speech_recognition/new/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/speech_recognition/new/conf/hydra/sweeper/ax.yaml b/examples/speech_recognition/new/conf/hydra/sweeper/ax.yaml new file mode 100644 index 0000000000..38e9c221db --- /dev/null +++ b/examples/speech_recognition/new/conf/hydra/sweeper/ax.yaml @@ -0,0 +1,29 @@ +# @package hydra.sweeper +_target_: hydra_plugins.hydra_ax_sweeper.ax_sweeper.AxSweeper +max_batch_size: null +ax_config: + max_trials: 128 + early_stop: + minimize: true + max_epochs_without_improvement: 10 + epsilon: 0.025 + experiment: + name: ${dataset.gen_subset} + objective_name: wer + minimize: true + parameter_constraints: null + outcome_constraints: null + status_quo: null + client: + verbose_logging: false + random_seed: null + params: + decoding.lmweight: + type: range + bounds: [0.0, 5.0] + decoding.wordscore: + type: range + bounds: [-5.0, 5.0] + decoding.silweight: + type: range + bounds: [ -8.0, 0.0 ] diff --git a/examples/speech_recognition/new/conf/hydra/sweeper/ax_sil.yaml b/examples/speech_recognition/new/conf/hydra/sweeper/ax_sil.yaml new file mode 100644 index 0000000000..eaaebcf5f6 --- /dev/null +++ b/examples/speech_recognition/new/conf/hydra/sweeper/ax_sil.yaml @@ -0,0 +1,29 @@ +# @package hydra.sweeper +_target_: hydra_plugins.hydra_ax_sweeper.ax_sweeper.AxSweeper +max_batch_size: null +ax_config: + max_trials: 64 + early_stop: + minimize: true + max_epochs_without_improvement: 10 + epsilon: 0.025 + experiment: + name: ${dataset.gen_subset} + objective_name: wer + minimize: true + parameter_constraints: null + outcome_constraints: null + status_quo: null + client: + verbose_logging: false + random_seed: null + params: + decoding.lmweight: + type: range + bounds: [0.0, 10.0] + decoding.wordscore: + type: range + bounds: [-10.0, 10.0] + decoding.silweight: + type: range + bounds: [ -10.0, 0.0 ] diff --git a/examples/speech_recognition/new/conf/infer.yaml b/examples/speech_recognition/new/conf/infer.yaml new file mode 100644 index 0000000000..2d168d06af --- /dev/null +++ b/examples/speech_recognition/new/conf/infer.yaml @@ -0,0 +1,27 @@ +# @package _group_ + +defaults: + - task: null + - model: null + +hydra: + run: + dir: ${common_eval.results_path}/${dataset.gen_subset} + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${common_eval.results_path} + subdir: ${dataset.gen_subset} +common: + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec 
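+  # NOTE: the user_dir above is an internal example path; override
+  # common.user_dir (or set it to null) for your own setup.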
+common_eval: + results_path: null + path: null + post_process: letter + quiet: true +dataset: + max_tokens: 3000000 + gen_subset: test +distributed_training: + distributed_world_size: 1 +decoding: + beam: 5 + type: viterbi diff --git a/examples/speech_recognition/new/conf/run_config/fb_slurm_1.yaml b/examples/speech_recognition/new/conf/run_config/fb_slurm_1.yaml new file mode 100644 index 0000000000..d0a9b0e586 --- /dev/null +++ b/examples/speech_recognition/new/conf/run_config/fb_slurm_1.yaml @@ -0,0 +1,28 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - common_eval.path + sweep: + dir: /checkpoint/abaevski/asr/d2v2/decoding/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} +# subdir: ${hydra.job.override_dirname} + launcher: + cpus_per_task: 16 + gpus_per_node: 1 + tasks_per_node: 1 + nodes: 1 + partition: devlab,learnlab + mem_gb: 100 + timeout_min: 2000 + max_num_timeout: 10 + name: ${env:PREFIX}_${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir}/%j + constraint: volta32gb + exclude: learnfair7598 \ No newline at end of file diff --git a/examples/speech_recognition/new/conf/run_config/fb_slurm_2g.yaml b/examples/speech_recognition/new/conf/run_config/fb_slurm_2g.yaml new file mode 100644 index 0000000000..c0c442f76d --- /dev/null +++ b/examples/speech_recognition/new/conf/run_config/fb_slurm_2g.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - common_eval.path + sweep: + dir: /checkpoint/abaevski/asr/d2v2/decoding/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} +# subdir: ${hydra.job.override_dirname} + launcher: + cpus_per_task: 16 + gpus_per_node: 2 + tasks_per_node: 2 + nodes: 1 + partition: devlab,learnlab + mem_gb: 100 + timeout_min: 2000 + max_num_timeout: 10 + name: ${env:PREFIX}_${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir}/%j + constraint: volta32gb \ No newline at end of file diff --git a/examples/speech_recognition/new/decoders/__init__.py b/examples/speech_recognition/new/decoders/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/speech_recognition/new/decoders/base_decoder.py b/examples/speech_recognition/new/decoders/base_decoder.py new file mode 100644 index 0000000000..a097969b3c --- /dev/null +++ b/examples/speech_recognition/new/decoders/base_decoder.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
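+
+# BaseDecoder implements the shared generate()/get_emissions()/get_tokens()
+# plumbing; subclasses only implement decode(emissions) and return, for each
+# batch element, a list of hypothesis dicts. An illustrative sketch of that
+# contract (a greedy decoder, not part of this module):
+#
+#   class GreedyDecoder(BaseDecoder):
+#       def decode(self, emissions):
+#           return [
+#               [{"tokens": self.get_tokens(e.argmax(dim=-1).tolist()), "score": 0.0}]
+#               for e in emissions
+#           ]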
+ +import itertools as it +from typing import Any, Dict, List + +import torch +from fairseq.data.dictionary import Dictionary +from fairseq.models.fairseq_model import FairseqModel + + +class BaseDecoder: + def __init__(self, tgt_dict: Dictionary) -> None: + self.tgt_dict = tgt_dict + self.vocab_size = len(tgt_dict) + + self.blank = ( + tgt_dict.index("<ctc_blank>") + if "<ctc_blank>" in tgt_dict.indices + else tgt_dict.bos() + ) + if "<sep>" in tgt_dict.indices: + self.silence = tgt_dict.index("<sep>") + elif "|" in tgt_dict.indices: + self.silence = tgt_dict.index("|") + else: + self.silence = tgt_dict.eos() + + def generate( + self, models: List[FairseqModel], sample: Dict[str, Any], **unused + ) -> List[List[Dict[str, torch.LongTensor]]]: + encoder_input = { + k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens" + } + emissions = self.get_emissions(models, encoder_input) + return self.decode(emissions) + + def get_emissions( + self, + models: List[FairseqModel], + encoder_input: Dict[str, Any], + ) -> torch.FloatTensor: + model = models[0] + encoder_out = model(**encoder_input) + if hasattr(model, "get_logits"): + emissions = model.get_logits(encoder_out) + else: + emissions = model.get_normalized_probs(encoder_out, log_probs=True) + return emissions.transpose(0, 1).float().cpu().contiguous() + + def get_tokens(self, idxs: torch.IntTensor) -> torch.LongTensor: + idxs = (g[0] for g in it.groupby(idxs)) + idxs = filter(lambda x: x != self.blank, idxs) + return torch.LongTensor(list(idxs)) + + def decode( + self, + emissions: torch.FloatTensor, + ) -> List[List[Dict[str, torch.LongTensor]]]: + raise NotImplementedError diff --git a/examples/speech_recognition/new/decoders/decoder.py b/examples/speech_recognition/new/decoders/decoder.py new file mode 100644 index 0000000000..b5bec8cf70 --- /dev/null +++ b/examples/speech_recognition/new/decoders/decoder.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Union + +from fairseq.data.dictionary import Dictionary + +from .decoder_config import DecoderConfig, FlashlightDecoderConfig +from .base_decoder import BaseDecoder + + +def Decoder( + cfg: Union[DecoderConfig, FlashlightDecoderConfig], tgt_dict: Dictionary +) -> BaseDecoder: + + if cfg.type == "viterbi": + from .viterbi_decoder import ViterbiDecoder + + return ViterbiDecoder(tgt_dict) + if cfg.type == "kenlm": + from .flashlight_decoder import KenLMDecoder + + return KenLMDecoder(cfg, tgt_dict) + if cfg.type == "fairseqlm": + from .flashlight_decoder import FairseqLMDecoder + + return FairseqLMDecoder(cfg, tgt_dict) + raise NotImplementedError(f"Invalid decoder name: {cfg.name}") diff --git a/examples/speech_recognition/new/decoders/decoder_config.py b/examples/speech_recognition/new/decoders/decoder_config.py new file mode 100644 index 0000000000..659eb94a9b --- /dev/null +++ b/examples/speech_recognition/new/decoders/decoder_config.py @@ -0,0 +1,70 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
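+
+# These dataclasses are consumed by Decoder() in decoder.py: DecoderConfig.type
+# picks the decoder ("viterbi" needs nothing else), while the
+# FlashlightDecoderConfig fields drive the "kenlm" / "fairseqlm" beam-search
+# decoders. infer.py combines both into a single DecodingConfig, e.g.
+#
+#   decoder = Decoder(cfg.decoding, task.target_dictionary)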
+ +import math +from dataclasses import dataclass, field +from typing import Optional + +from fairseq.dataclass.configs import FairseqDataclass +from fairseq.dataclass.constants import ChoiceEnum +from omegaconf import MISSING + + +DECODER_CHOICES = ChoiceEnum(["viterbi", "kenlm", "fairseqlm"]) + + +@dataclass +class DecoderConfig(FairseqDataclass): + type: DECODER_CHOICES = field( + default="viterbi", + metadata={"help": "The type of decoder to use"}, + ) + + +@dataclass +class FlashlightDecoderConfig(FairseqDataclass): + nbest: int = field( + default=1, + metadata={"help": "Number of decodings to return"}, + ) + unitlm: bool = field( + default=False, + metadata={"help": "If set, use unit language model"}, + ) + lmpath: str = field( + default=MISSING, + metadata={"help": "Language model for KenLM decoder"}, + ) + lexicon: Optional[str] = field( + default=None, + metadata={"help": "Lexicon for Flashlight decoder"}, + ) + beam: int = field( + default=50, + metadata={"help": "Number of beams to use for decoding"}, + ) + beamthreshold: float = field( + default=50.0, + metadata={"help": "Threshold for beam search decoding"}, + ) + beamsizetoken: Optional[int] = field( + default=None, metadata={"help": "Beam size to use"} + ) + wordscore: float = field( + default=-1, + metadata={"help": "Word score for KenLM decoder"}, + ) + unkweight: float = field( + default=-math.inf, + metadata={"help": "Unknown weight for KenLM decoder"}, + ) + silweight: float = field( + default=0, + metadata={"help": "Silence weight for KenLM decoder"}, + ) + lmweight: float = field( + default=2, + metadata={"help": "Weight for LM while interpolating score"}, + ) diff --git a/examples/speech_recognition/new/decoders/flashlight_decoder.py b/examples/speech_recognition/new/decoders/flashlight_decoder.py new file mode 100644 index 0000000000..7790fcdb82 --- /dev/null +++ b/examples/speech_recognition/new/decoders/flashlight_decoder.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import gc +import os.path as osp +import warnings +from collections import deque, namedtuple +from typing import Any, Dict, Tuple + +import numpy as np +import torch +from fairseq import tasks +from fairseq.data.dictionary import Dictionary +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models.fairseq_model import FairseqModel +from fairseq.utils import apply_to_sample +from omegaconf import open_dict, OmegaConf + +from typing import List + +from .decoder_config import FlashlightDecoderConfig +from .base_decoder import BaseDecoder + +try: + from flashlight.lib.text.decoder import ( + LM, + CriterionType, + DecodeResult, + KenLM, + LexiconDecoder, + LexiconDecoderOptions, + LexiconFreeDecoder, + LexiconFreeDecoderOptions, + LMState, + SmearingMode, + Trie, + ) + from flashlight.lib.text.dictionary import create_word_dict, load_words + from flashlight.lib.text.dictionary import Dictionary as flDictionary +except ImportError: + warnings.warn( + "flashlight python bindings are required to use this functionality. 
" + "Please install from " + "https://github.com/facebookresearch/flashlight/tree/master/bindings/python" + ) + LM = object + LMState = object + + +class KenLMDecoder(BaseDecoder): + def __init__(self, cfg: FlashlightDecoderConfig, tgt_dict: Dictionary) -> None: + super().__init__(tgt_dict) + + self.nbest = cfg.nbest + self.unitlm = cfg.unitlm + + if cfg.lexicon: + self.lexicon = load_words(cfg.lexicon) + self.word_dict = create_word_dict(self.lexicon) + self.unk_word = self.word_dict.get_index("<unk>") + + self.lm = KenLM(cfg.lmpath, self.word_dict) + self.trie = Trie(self.vocab_size, self.silence) + + start_state = self.lm.start(False) + for word, spellings in self.lexicon.items(): + word_idx = self.word_dict.get_index(word) + _, score = self.lm.score(start_state, word_idx) + for spelling in spellings: + spelling_idxs = [tgt_dict.index(token) for token in spelling] + assert ( + tgt_dict.unk() not in spelling_idxs + ), f"{word} {spelling} {spelling_idxs}" + self.trie.insert(spelling_idxs, word_idx, score) + self.trie.smear(SmearingMode.MAX) + + self.decoder_opts = LexiconDecoderOptions( + beam_size=cfg.beam, + beam_size_token=cfg.beamsizetoken or len(tgt_dict), + beam_threshold=cfg.beamthreshold, + lm_weight=cfg.lmweight, + word_score=cfg.wordscore, + unk_score=cfg.unkweight, + sil_score=cfg.silweight, + log_add=False, + criterion_type=CriterionType.CTC, + ) + + self.decoder = LexiconDecoder( + self.decoder_opts, + self.trie, + self.lm, + self.silence, + self.blank, + self.unk_word, + [], + self.unitlm, + ) + else: + assert self.unitlm, "Lexicon-free decoding requires unit LM" + + self.word_dict = flDictionary() + for sym in tgt_dict.symbols: + self.word_dict.add_entry(sym, tgt_dict.index(sym)) + self.lm = KenLM(cfg.lmpath, self.word_dict) + self.decoder_opts = LexiconFreeDecoderOptions( + beam_size=cfg.beam, + beam_size_token=cfg.beamsizetoken or len(tgt_dict), + beam_threshold=cfg.beamthreshold, + lm_weight=cfg.lmweight, + sil_score=cfg.silweight, + log_add=False, + criterion_type=CriterionType.CTC, + ) + self.decoder = LexiconFreeDecoder( + self.decoder_opts, self.lm, self.silence, self.blank, [] + ) + + def get_timesteps(self, token_idxs: List[int]) -> List[int]: + """Returns frame numbers corresponding to every non-blank token. + + Parameters + ---------- + token_idxs : List[int] + IDs of decoded tokens. + + Returns + ------- + List[int] + Frame numbers corresponding to every non-blank token. 
+ """ + timesteps = [] + for i, token_idx in enumerate(token_idxs): + if token_idx == self.blank: + continue + if i == 0 or token_idx != token_idxs[i-1]: + timesteps.append(i) + return timesteps + + def decode( + self, + emissions: torch.FloatTensor, + ) -> List[List[Dict[str, torch.LongTensor]]]: + B, T, N = emissions.size() + hypos = [] + for b in range(B): + emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) + results = self.decoder.decode(emissions_ptr, T, N) + + nbest_results = results[: self.nbest] + hypos.append( + [ + { + "tokens": self.get_tokens(result.tokens), + "score": result.score, + "timesteps": self.get_timesteps(result.tokens), + "words": [ + self.word_dict.get_entry(x) for x in result.words if x >= 0 + ], + } + for result in nbest_results + ] + ) + return hypos + + +FairseqLMState = namedtuple( + "FairseqLMState", + [ + "prefix", + "incremental_state", + "probs", + ], +) + + +class FairseqLM(LM): + def __init__(self, dictionary: Dictionary, model: FairseqModel) -> None: + super().__init__() + + self.dictionary = dictionary + self.model = model + self.unk = self.dictionary.unk() + + self.save_incremental = False # this currently does not work properly + self.max_cache = 20_000 + + if torch.cuda.is_available(): + model.cuda() + model.eval() + model.make_generation_fast_() + + self.states = {} + self.stateq = deque() + + def start(self, start_with_nothing: bool) -> LMState: + state = LMState() + prefix = torch.LongTensor([[self.dictionary.eos()]]) + incremental_state = {} if self.save_incremental else None + with torch.no_grad(): + res = self.model(prefix.cuda(), incremental_state=incremental_state) + probs = self.model.get_normalized_probs(res, log_probs=True, sample=None) + + if incremental_state is not None: + incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state) + self.states[state] = FairseqLMState( + prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy() + ) + self.stateq.append(state) + + return state + + def score( + self, + state: LMState, + token_index: int, + no_cache: bool = False, + ) -> Tuple[LMState, int]: + """ + Evaluate language model based on the current lm state and new word + Parameters: + ----------- + state: current lm state + token_index: index of the word + (can be lexicon index then you should store inside LM the + mapping between indices of lexicon and lm, or lm index of a word) + Returns: + -------- + (LMState, float): pair of (new state, score for the current word) + """ + curr_state = self.states[state] + + def trim_cache(targ_size: int) -> None: + while len(self.stateq) > targ_size: + rem_k = self.stateq.popleft() + rem_st = self.states[rem_k] + rem_st = FairseqLMState(rem_st.prefix, None, None) + self.states[rem_k] = rem_st + + if curr_state.probs is None: + new_incremental_state = ( + curr_state.incremental_state.copy() + if curr_state.incremental_state is not None + else None + ) + with torch.no_grad(): + if new_incremental_state is not None: + new_incremental_state = apply_to_sample( + lambda x: x.cuda(), new_incremental_state + ) + elif self.save_incremental: + new_incremental_state = {} + + res = self.model( + torch.from_numpy(curr_state.prefix).cuda(), + incremental_state=new_incremental_state, + ) + probs = self.model.get_normalized_probs( + res, log_probs=True, sample=None + ) + + if new_incremental_state is not None: + new_incremental_state = apply_to_sample( + lambda x: x.cpu(), new_incremental_state + ) + + curr_state = FairseqLMState( + curr_state.prefix, new_incremental_state, probs[0, 
-1].cpu().numpy() + ) + + if not no_cache: + self.states[state] = curr_state + self.stateq.append(state) + + score = curr_state.probs[token_index].item() + + trim_cache(self.max_cache) + + outstate = state.child(token_index) + if outstate not in self.states and not no_cache: + prefix = np.concatenate( + [curr_state.prefix, torch.LongTensor([[token_index]])], -1 + ) + incr_state = curr_state.incremental_state + + self.states[outstate] = FairseqLMState(prefix, incr_state, None) + + if token_index == self.unk: + score = float("-inf") + + return outstate, score + + def finish(self, state: LMState) -> Tuple[LMState, int]: + """ + Evaluate eos for language model based on the current lm state + Returns: + -------- + (LMState, float): pair of (new state, score for the current word) + """ + return self.score(state, self.dictionary.eos()) + + def empty_cache(self) -> None: + self.states = {} + self.stateq = deque() + gc.collect() + + +class FairseqLMDecoder(BaseDecoder): + def __init__(self, cfg: FlashlightDecoderConfig, tgt_dict: Dictionary) -> None: + super().__init__(tgt_dict) + + self.nbest = cfg.nbest + self.unitlm = cfg.unitlm + + self.lexicon = load_words(cfg.lexicon) if cfg.lexicon else None + self.idx_to_wrd = {} + + checkpoint = torch.load(cfg.lmpath, map_location="cpu") + + if "cfg" in checkpoint and checkpoint["cfg"] is not None: + lm_args = checkpoint["cfg"] + else: + lm_args = convert_namespace_to_omegaconf(checkpoint["args"]) + + if not OmegaConf.is_dict(lm_args): + lm_args = OmegaConf.create(lm_args) + + with open_dict(lm_args.task): + lm_args.task.data = osp.dirname(cfg.lmpath) + + task = tasks.setup_task(lm_args.task) + model = task.build_model(lm_args.model) + model.load_state_dict(checkpoint["model"], strict=False) + + self.trie = Trie(self.vocab_size, self.silence) + + self.word_dict = task.dictionary + self.unk_word = self.word_dict.unk() + self.lm = FairseqLM(self.word_dict, model) + + if self.lexicon: + start_state = self.lm.start(False) + for i, (word, spellings) in enumerate(self.lexicon.items()): + if self.unitlm: + word_idx = i + self.idx_to_wrd[i] = word + score = 0 + else: + word_idx = self.word_dict.index(word) + _, score = self.lm.score(start_state, word_idx, no_cache=True) + + for spelling in spellings: + spelling_idxs = [tgt_dict.index(token) for token in spelling] + assert ( + tgt_dict.unk() not in spelling_idxs + ), f"{spelling} {spelling_idxs}" + self.trie.insert(spelling_idxs, word_idx, score) + self.trie.smear(SmearingMode.MAX) + + self.decoder_opts = LexiconDecoderOptions( + beam_size=cfg.beam, + beam_size_token=cfg.beamsizetoken or len(tgt_dict), + beam_threshold=cfg.beamthreshold, + lm_weight=cfg.lmweight, + word_score=cfg.wordscore, + unk_score=cfg.unkweight, + sil_score=cfg.silweight, + log_add=False, + criterion_type=CriterionType.CTC, + ) + + self.decoder = LexiconDecoder( + self.decoder_opts, + self.trie, + self.lm, + self.silence, + self.blank, + self.unk_word, + [], + self.unitlm, + ) + else: + assert self.unitlm, "Lexicon-free decoding requires unit LM" + + d = {w: [[w]] for w in tgt_dict.symbols} + self.word_dict = create_word_dict(d) + self.lm = KenLM(cfg.lmpath, self.word_dict) + self.decoder_opts = LexiconFreeDecoderOptions( + beam_size=cfg.beam, + beam_size_token=cfg.beamsizetoken or len(tgt_dict), + beam_threshold=cfg.beamthreshold, + lm_weight=cfg.lmweight, + sil_score=cfg.silweight, + log_add=False, + criterion_type=CriterionType.CTC, + ) + self.decoder = LexiconFreeDecoder( + self.decoder_opts, self.lm, self.silence, self.blank, [] + ) + + 
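+    # NOTE: decode() below hands flashlight a raw float32 pointer
+    # (data_ptr() + 4 * b * stride), so emissions must be CPU, float32 and
+    # contiguous, which is exactly what BaseDecoder.get_emissions() returns.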
def decode( + self, + emissions: torch.FloatTensor, + ) -> List[List[Dict[str, torch.LongTensor]]]: + B, T, N = emissions.size() + hypos = [] + + def make_hypo(result: DecodeResult) -> Dict[str, Any]: + hypo = { + "tokens": self.get_tokens(result.tokens), + "score": result.score, + } + if self.lexicon: + hypo["words"] = [ + self.idx_to_wrd[x] if self.unitlm else self.word_dict[x] + for x in result.words + if x >= 0 + ] + return hypo + + for b in range(B): + emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) + results = self.decoder.decode(emissions_ptr, T, N) + + nbest_results = results[: self.nbest] + hypos.append([make_hypo(result) for result in nbest_results]) + self.lm.empty_cache() + + return hypos diff --git a/examples/speech_recognition/new/decoders/viterbi_decoder.py b/examples/speech_recognition/new/decoders/viterbi_decoder.py new file mode 100644 index 0000000000..a35d95e146 --- /dev/null +++ b/examples/speech_recognition/new/decoders/viterbi_decoder.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from typing import List, Dict + +from .base_decoder import BaseDecoder + + +class ViterbiDecoder(BaseDecoder): + def decode( + self, + emissions: torch.FloatTensor, + ) -> List[List[Dict[str, torch.LongTensor]]]: + def get_pred(e): + score = e.log_softmax(dim=-1).max(dim=-1)[0].sum() + toks = e.argmax(dim=-1).unique_consecutive() + return {"tokens":toks[toks != self.blank], "score":score} + return [[get_pred(x)] for x in emissions] diff --git a/examples/speech_recognition/new/infer.py b/examples/speech_recognition/new/infer.py new file mode 100644 index 0000000000..ca5cea4a7c --- /dev/null +++ b/examples/speech_recognition/new/infer.py @@ -0,0 +1,502 @@ +#!/usr/bin/env python -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
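+
+"""Hydra-driven decoding entry point: loads a (possibly sharded) model
+ensemble, decodes the requested gen_subset with the configured decoder
+(viterbi, kenlm or fairseqlm) and reports WER. See the README in this
+directory for example invocations."""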
+ +import ast +import hashlib +import logging +import os +import shutil +import sys +import re +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import editdistance +import torch +import torch.distributed as dist +from examples.speech_recognition.new.decoders.decoder_config import ( + DecoderConfig, + FlashlightDecoderConfig, +) +from examples.speech_recognition.new.decoders.decoder import Decoder +from fairseq import checkpoint_utils, distributed_utils, progress_bar, tasks, utils +from fairseq.data.data_utils import post_process +from fairseq.dataclass.configs import ( + CheckpointConfig, + CommonConfig, + CommonEvalConfig, + DatasetConfig, + DistributedTrainingConfig, + FairseqDataclass, +) +from fairseq.logging.meters import StopwatchMeter, TimeMeter +from fairseq.logging.progress_bar import BaseProgressBar +from fairseq.models.fairseq_model import FairseqModel +from omegaconf import OmegaConf + +import hydra +from hydra.core.config_store import ConfigStore + +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +config_path = Path(__file__).resolve().parent / "conf" + + +@dataclass +class DecodingConfig(DecoderConfig, FlashlightDecoderConfig): + unique_wer_file: bool = field( + default=False, + metadata={"help": "If set, use a unique file for storing WER"}, + ) + results_path: Optional[str] = field( + default=None, + metadata={ + "help": "If set, write hypothesis and reference sentences into this directory" + }, + ) + + +@dataclass +class InferConfig(FairseqDataclass): + task: Any = None + decoding: DecodingConfig = DecodingConfig() + common: CommonConfig = CommonConfig() + common_eval: CommonEvalConfig = CommonEvalConfig() + checkpoint: CheckpointConfig = CheckpointConfig() + distributed_training: DistributedTrainingConfig = DistributedTrainingConfig() + dataset: DatasetConfig = DatasetConfig() + is_ax: bool = field( + default=False, + metadata={ + "help": "if true, assumes we are using ax for tuning and returns a tuple for ax to consume" + }, + ) + + +def reset_logging(): + root = logging.getLogger() + for handler in root.handlers: + root.removeHandler(handler) + root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper()) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter( + logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + root.addHandler(handler) + + +class InferenceProcessor: + cfg: InferConfig + + def __init__(self, cfg: InferConfig) -> None: + self.cfg = cfg + self.task = tasks.setup_task(cfg.task) + + models, saved_cfg = self.load_model_ensemble() + + ### LOAD ADAPTER #### + ckpt_obj = checkpoint_utils.load_checkpoint_to_cpu(self.cfg.common_eval.path) + if "adapter" in ckpt_obj: + target_lang = self.cfg.dataset.gen_subset.split(":")[0] + assert target_lang in ckpt_obj["adapter"] + + logger.info(f">>> LOADING ADAPTER: {target_lang}") + ft_obj = ckpt_obj["adapter"][target_lang] + ft_model = ft_obj["model"] + cdevice = models[0].w2v_encoder.proj.weight.device + cdtype = models[0].w2v_encoder.proj.weight.dtype + ft_proj_out, ft_proj_in = ft_model["w2v_encoder.proj.weight"].shape + ft_proj = torch.nn.Linear(ft_proj_in, ft_proj_out, bias=True) + ft_proj.to(device=cdevice, dtype=cdtype) + models[0].w2v_encoder.proj = ft_proj + with torch.no_grad(): + for kk, vv in models[0].named_parameters(): + if kk in ft_model: + vv.copy_(ft_model[kk]) + 
self.task.load_state_dict(ft_obj["task_state"]) + # overwrite gen_subset with master config + self.cfg.dataset.gen_subset = re.sub('^[\w-]+:', saved_cfg['task']['multi_corpus_keys']+":", self.cfg.dataset.gen_subset) + self.models = models + self.saved_cfg = saved_cfg + self.tgt_dict = self.task.target_dictionary + + self.task.load_dataset( + self.cfg.dataset.gen_subset, + task_cfg=saved_cfg.task, + ) + self.generator = Decoder(cfg.decoding, self.tgt_dict) + self.gen_timer = StopwatchMeter() + self.wps_meter = TimeMeter() + self.num_sentences = 0 + self.total_errors = 0 + self.total_length = 0 + + self.hypo_words_file = None + self.hypo_units_file = None + self.ref_words_file = None + self.ref_units_file = None + self.score_file = None + + self.progress_bar = self.build_progress_bar() + + def __enter__(self) -> "InferenceProcessor": + if self.cfg.decoding.results_path is not None: + self.hypo_words_file = self.get_res_file("hypo.word") + self.hypo_units_file = self.get_res_file("hypo.units") + self.ref_words_file = self.get_res_file("ref.word") + self.ref_units_file = self.get_res_file("ref.units") + self.score_file = self.get_res_file("asr_score") + return self + + def __exit__(self, *exc) -> bool: + if self.cfg.decoding.results_path is not None: + self.hypo_words_file.close() + self.hypo_units_file.close() + self.ref_words_file.close() + self.ref_units_file.close() + self.score_file.close() + return False + + def __iter__(self) -> Any: + for sample in self.progress_bar: + if not self.cfg.common.cpu: + sample = utils.move_to_cuda(sample) + + # Happens on the last batch. + if "net_input" not in sample: + continue + yield sample + + def log(self, *args, **kwargs): + self.progress_bar.log(*args, **kwargs) + + def print(self, *args, **kwargs): + self.progress_bar.print(*args, **kwargs) + + def get_res_file(self, fname: str) -> None: + fname = os.path.join(self.cfg.decoding.results_path, fname) + if self.data_parallel_world_size > 1: + fname = f"{fname}.{self.data_parallel_rank}" + return open(fname, "w", buffering=1) + + def merge_shards(self) -> None: + """Merges all shard files into shard 0, then removes shard suffix.""" + + shard_id = self.data_parallel_rank + num_shards = self.data_parallel_world_size + + if self.data_parallel_world_size > 1: + + def merge_shards_with_root(fname: str) -> None: + fname = os.path.join(self.cfg.decoding.results_path, fname) + logger.info("Merging %s on shard %d", fname, shard_id) + base_fpath = Path(f"{fname}.0") + with open(base_fpath, "a") as out_file: + for s in range(1, num_shards): + shard_fpath = Path(f"{fname}.{s}") + with open(shard_fpath, "r") as in_file: + for line in in_file: + out_file.write(line) + shard_fpath.unlink() + shutil.move(f"{fname}.0", fname) + + dist.barrier() # ensure all shards finished writing + if shard_id == (0 % num_shards): + merge_shards_with_root("hypo.word") + if shard_id == (1 % num_shards): + merge_shards_with_root("hypo.units") + if shard_id == (2 % num_shards): + merge_shards_with_root("ref.word") + if shard_id == (3 % num_shards): + merge_shards_with_root("ref.units") + dist.barrier() + + def optimize_model(self, model: FairseqModel) -> None: + model.make_generation_fast_() + if self.cfg.common.fp16: + model.half() + if not self.cfg.common.cpu: + model.cuda() + + def load_model_ensemble(self) -> Tuple[List[FairseqModel], FairseqDataclass]: + arg_overrides = ast.literal_eval(self.cfg.common_eval.model_overrides) + models, saved_cfg = checkpoint_utils.load_model_ensemble( + utils.split_paths(self.cfg.common_eval.path, 
separator="\\"), + arg_overrides=arg_overrides, + task=self.task, + suffix=self.cfg.checkpoint.checkpoint_suffix, + strict=(self.cfg.checkpoint.checkpoint_shard_count == 1), + num_shards=self.cfg.checkpoint.checkpoint_shard_count, + ) + for model in models: + self.optimize_model(model) + return models, saved_cfg + + def get_dataset_itr(self, disable_iterator_cache: bool = False) -> None: + return self.task.get_batch_iterator( + dataset=self.task.dataset(self.cfg.dataset.gen_subset), + max_tokens=self.cfg.dataset.max_tokens, + max_sentences=self.cfg.dataset.batch_size, + max_positions=(sys.maxsize, sys.maxsize), + ignore_invalid_inputs=self.cfg.dataset.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=self.cfg.dataset.required_batch_size_multiple, + seed=self.cfg.common.seed, + num_shards=self.data_parallel_world_size, + shard_id=self.data_parallel_rank, + num_workers=self.cfg.dataset.num_workers, + data_buffer_size=self.cfg.dataset.data_buffer_size, + disable_iterator_cache=disable_iterator_cache, + ).next_epoch_itr(shuffle=False) + + def build_progress_bar( + self, + epoch: Optional[int] = None, + prefix: Optional[str] = None, + default_log_format: str = "tqdm", + ) -> BaseProgressBar: + return progress_bar.progress_bar( + iterator=self.get_dataset_itr(), + log_format=self.cfg.common.log_format, + log_interval=self.cfg.common.log_interval, + epoch=epoch, + prefix=prefix, + tensorboard_logdir=self.cfg.common.tensorboard_logdir, + default_log_format=default_log_format, + ) + + @property + def data_parallel_world_size(self): + if self.cfg.distributed_training.distributed_world_size == 1: + return 1 + return distributed_utils.get_data_parallel_world_size() + + @property + def data_parallel_rank(self): + if self.cfg.distributed_training.distributed_world_size == 1: + return 0 + return distributed_utils.get_data_parallel_rank() + + def process_sentence( + self, + sample: Dict[str, Any], + hypo: Dict[str, Any], + sid: int, + batch_id: int, + ) -> Tuple[int, int]: + speaker = None # Speaker can't be parsed from dataset. + if "target_label" in sample: + toks = sample["target_label"] + else: + toks = sample["target"] + toks = toks[batch_id, :] + + # Processes hypothesis. + hyp_pieces = self.tgt_dict.string(hypo["tokens"].int().cpu()) + if "words" in hypo: + hyp_words = " ".join(hypo["words"]) + else: + hyp_words = post_process(hyp_pieces, self.cfg.common_eval.post_process) + + # Processes target. 
+ target_tokens = utils.strip_pad(toks, self.tgt_dict.pad()) + tgt_pieces = self.tgt_dict.string(target_tokens.int().cpu()) + tgt_words = post_process(tgt_pieces, self.cfg.common_eval.post_process) + + if self.cfg.decoding.results_path is not None: + print(f"{hyp_pieces} ({speaker}-{sid})", file=self.hypo_units_file) + print(f"{hyp_words} ({speaker}-{sid})", file=self.hypo_words_file) + print(f"{tgt_pieces} ({speaker}-{sid})", file=self.ref_units_file) + print(f"{tgt_words} ({speaker}-{sid})", file=self.ref_words_file) + print(f"{hypo['score'].item()} ({speaker}-{sid})", file=self.score_file) + + if not self.cfg.common_eval.quiet: + logger.info(f"HYPO: {hyp_words}") + logger.info(f"REF: {tgt_words}") + logger.info("---------------------") + + hyp_words, tgt_words = hyp_words.split(), tgt_words.split() + + return editdistance.eval(hyp_words, tgt_words), len(tgt_words) + + def process_sample(self, sample: Dict[str, Any]) -> None: + self.gen_timer.start() + hypos = self.task.inference_step( + generator=self.generator, + models=self.models, + sample=sample, + ) + num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) + self.gen_timer.stop(num_generated_tokens) + self.wps_meter.update(num_generated_tokens) + + for batch_id, sample_id in enumerate(sample["id"].tolist()): + errs, length = self.process_sentence( + sample=sample, + sid=sample_id, + batch_id=batch_id, + hypo=hypos[batch_id][0], + ) + self.total_errors += errs + self.total_length += length + + self.log({"wps": round(self.wps_meter.avg)}) + if "nsentences" in sample: + self.num_sentences += sample["nsentences"] + else: + self.num_sentences += sample["id"].numel() + + def log_generation_time(self) -> None: + logger.info( + "Processed %d sentences (%d tokens) in %.1fs %.2f " + "sentences per second, %.2f tokens per second)", + self.num_sentences, + self.gen_timer.n, + self.gen_timer.sum, + self.num_sentences / (self.gen_timer.sum + 1e-6), + 1.0 / (self.gen_timer.avg + 1e-6), + ) + + +def parse_wer(wer_file: Path) -> float: + with open(wer_file, "r") as f: + return float(f.readline().strip().split(" ")[1]) + + +def get_wer_file(cfg: InferConfig) -> Path: + """Hashes the decoding parameters to a unique file ID.""" + base_path = "wer" + if cfg.decoding.results_path is not None: + base_path = os.path.join(cfg.decoding.results_path, base_path) + + if cfg.decoding.unique_wer_file: + yaml_str = OmegaConf.to_yaml(cfg.decoding) + fid = int(hashlib.md5(yaml_str.encode("utf-8")).hexdigest(), 16) + return Path(f"{base_path}.{fid % 1000000}") + else: + return Path(base_path) + + +def main(cfg: InferConfig) -> float: + """Entry point for main processing logic. + + Args: + cfg: The inferance configuration to use. + wer: Optional shared memory pointer for returning the WER. If not None, + the final WER value will be written here instead of being returned. + + Returns: + The final WER if `wer` is None, otherwise None. + """ + + yaml_str, wer_file = OmegaConf.to_yaml(cfg.decoding), get_wer_file(cfg) + + # Validates the provided configuration. 
+ if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: + cfg.dataset.max_tokens = 4000000 + if not cfg.common.cpu and not torch.cuda.is_available(): + raise ValueError("CUDA not found; set `cpu=True` to run without CUDA") + + logger.info(cfg.common_eval.path) + + with InferenceProcessor(cfg) as processor: + for sample in processor: + processor.process_sample(sample) + + processor.log_generation_time() + + if cfg.decoding.results_path is not None: + processor.merge_shards() + + errs_t, leng_t = processor.total_errors, processor.total_length + + if cfg.common.cpu: + logger.warning("Merging WER requires CUDA.") + elif processor.data_parallel_world_size > 1: + stats = torch.LongTensor([errs_t, leng_t]).cuda() + dist.all_reduce(stats, op=dist.ReduceOp.SUM) + errs_t, leng_t = stats[0].item(), stats[1].item() + + wer = errs_t * 100.0 / leng_t + + if distributed_utils.is_master(cfg.distributed_training): + with open(wer_file, "w") as f: + f.write( + ( + f"WER: {wer}\n" + f"err / num_ref_words = {errs_t} / {leng_t}\n\n" + f"{yaml_str}" + ) + ) + + return wer + + +@hydra.main(config_path=config_path, config_name="infer") +def hydra_main(cfg: InferConfig) -> Union[float, Tuple[float, Optional[float]]]: + container = OmegaConf.to_container(cfg, resolve=True, enum_to_str=True) + cfg = OmegaConf.create(container) + OmegaConf.set_struct(cfg, True) + + if cfg.common.reset_logging: + reset_logging() + + utils.import_user_module(cfg.common) + + # logger.info("Config:\n%s", OmegaConf.to_yaml(cfg)) + wer = float("inf") + + try: + if cfg.common.profile: + with torch.cuda.profiler.profile(): + with torch.autograd.profiler.emit_nvtx(): + distributed_utils.call_main(cfg, main) + else: + distributed_utils.call_main(cfg, main) + + wer = parse_wer(get_wer_file(cfg)) + except BaseException as e: # pylint: disable=broad-except + if not cfg.common.suppress_crashes: + raise + else: + logger.error("Crashed! %s", str(e)) + + logger.info("Word error rate: %.4f", wer) + if cfg.is_ax: + return wer, None + + return wer + + +def cli_main() -> None: + try: + from hydra._internal.utils import ( + get_args, + ) # pylint: disable=import-outside-toplevel + + cfg_name = get_args().config_name or "infer" + except ImportError: + logger.warning("Failed to get config name from hydra args") + cfg_name = "infer" + + cs = ConfigStore.instance() + cs.store(name=cfg_name, node=InferConfig) + + for k in InferConfig.__dataclass_fields__: + if is_dataclass(InferConfig.__dataclass_fields__[k].type): + v = InferConfig.__dataclass_fields__[k].default + cs.store(name=k, node=v) + + hydra_main() # pylint: disable=no-value-for-parameter + + +if __name__ == "__main__": + cli_main() diff --git a/examples/speech_recognition/tasks/__init__.py b/examples/speech_recognition/tasks/__init__.py index ffa5f3bd8c..7ac3b8dc69 100644 --- a/examples/speech_recognition/tasks/__init__.py +++ b/examples/speech_recognition/tasks/__init__.py @@ -2,7 +2,7 @@ import os -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): task_name = file[: file.find(".py")] importlib.import_module("examples.speech_recognition.tasks." + task_name) diff --git a/examples/speech_recognition/w2l_decoder.py b/examples/speech_recognition/w2l_decoder.py index 2a1d8a779d..fbf2d3524e 100644 --- a/examples/speech_recognition/w2l_decoder.py +++ b/examples/speech_recognition/w2l_decoder.py @@ -6,12 +6,13 @@ # LICENSE file in the root directory of this source tree. 
""" -Wav2letter decoders. +Flashlight decoders. """ import gc import itertools as it import os.path as osp +from typing import List import warnings from collections import deque, namedtuple @@ -20,25 +21,26 @@ from examples.speech_recognition.data.replabels import unpack_replabels from fairseq import tasks from fairseq.utils import apply_to_sample +from omegaconf import open_dict +from fairseq.dataclass.utils import convert_namespace_to_omegaconf try: - from wav2letter.common import create_word_dict, load_words - from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes - from wav2letter.decoder import ( + from flashlight.lib.text.dictionary import create_word_dict, load_words + from flashlight.lib.sequence.criterion import CpuViterbiPath, get_data_ptr_as_bytes + from flashlight.lib.text.decoder import ( CriterionType, - DecoderOptions, + LexiconDecoderOptions, KenLM, LM, LMState, SmearingMode, Trie, LexiconDecoder, - LexiconFreeDecoder, ) except: warnings.warn( - "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings" + "flashlight python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/flashlight/tree/master/bindings/python" ) LM = object LMState = object @@ -51,22 +53,19 @@ def __init__(self, args, tgt_dict): self.nbest = args.nbest # criterion-specific init - if args.criterion == "ctc": - self.criterion_type = CriterionType.CTC - self.blank = ( - tgt_dict.index("<ctc_blank>") - if "<ctc_blank>" in tgt_dict.indices - else tgt_dict.bos() - ) - self.asg_transitions = None - elif args.criterion == "asg_loss": - self.criterion_type = CriterionType.ASG - self.blank = -1 - self.asg_transitions = args.asg_transitions - self.max_replabel = args.max_replabel - assert len(self.asg_transitions) == self.vocab_size ** 2 + self.criterion_type = CriterionType.CTC + self.blank = ( + tgt_dict.index("<ctc_blank>") + if "<ctc_blank>" in tgt_dict.indices + else tgt_dict.bos() + ) + if "<sep>" in tgt_dict.indices: + self.silence = tgt_dict.index("<sep>") + elif "|" in tgt_dict.indices: + self.silence = tgt_dict.index("|") else: - raise RuntimeError(f"unknown criterion: {args.criterion}") + self.silence = tgt_dict.eos() + self.asg_transitions = None def generate(self, models, sample, **unused): """Generate a batch of inferences.""" @@ -80,22 +79,18 @@ def generate(self, models, sample, **unused): def get_emissions(self, models, encoder_input): """Run encoder and normalize emissions""" - # encoder_out = models[0].encoder(**encoder_input) - encoder_out = models[0](**encoder_input) - if self.criterion_type == CriterionType.CTC: - emissions = models[0].get_normalized_probs(encoder_out, log_probs=True) - elif self.criterion_type == CriterionType.ASG: - emissions = encoder_out["encoder_out"] + model = models[0] + encoder_out = model(**encoder_input) + if hasattr(model, "get_logits"): + emissions = model.get_logits(encoder_out) # no need to normalize emissions + else: + emissions = model.get_normalized_probs(encoder_out, log_probs=True) return emissions.transpose(0, 1).float().cpu().contiguous() def get_tokens(self, idxs): """Normalize tokens by handling CTC blank, ASG replabels, etc.""" idxs = (g[0] for g in it.groupby(idxs)) - if self.criterion_type == CriterionType.CTC: - idxs = filter(lambda x: x != self.blank, idxs) - elif self.criterion_type == CriterionType.ASG: - idxs = filter(lambda x: x >= 0, idxs) - idxs = unpack_replabels(list(idxs), 
self.tgt_dict, self.max_replabel) + idxs = filter(lambda x: x != self.blank, idxs) return torch.LongTensor(list(idxs)) @@ -131,58 +126,95 @@ class W2lKenLMDecoder(W2lDecoder): def __init__(self, args, tgt_dict): super().__init__(args, tgt_dict) - self.silence = ( - tgt_dict.index("<ctc_blank>") - if "<ctc_blank>" in tgt_dict.indices - else tgt_dict.bos() - ) - self.lexicon = load_words(args.lexicon) - self.word_dict = create_word_dict(self.lexicon) - self.unk_word = self.word_dict.get_index("<unk>") + self.unit_lm = getattr(args, "unit_lm", False) - self.lm = KenLM(args.kenlm_model, self.word_dict) - self.trie = Trie(self.vocab_size, self.silence) + if args.lexicon: + self.lexicon = load_words(args.lexicon) + self.word_dict = create_word_dict(self.lexicon) + self.unk_word = self.word_dict.get_index("<unk>") - start_state = self.lm.start(False) - for i, (word, spellings) in enumerate(self.lexicon.items()): - word_idx = self.word_dict.get_index(word) - _, score = self.lm.score(start_state, word_idx) - for spelling in spellings: - spelling_idxs = [tgt_dict.index(token) for token in spelling] - assert ( - tgt_dict.unk() not in spelling_idxs - ), f"{spelling} {spelling_idxs}" - self.trie.insert(spelling_idxs, word_idx, score) - self.trie.smear(SmearingMode.MAX) - - self.decoder_opts = DecoderOptions( - args.beam, - int(getattr(args, "beam_size_token", len(tgt_dict))), - args.beam_threshold, - args.lm_weight, - args.word_score, - args.unk_weight, - args.sil_weight, - 0, - False, - self.criterion_type, - ) + self.lm = KenLM(args.kenlm_model, self.word_dict) + self.trie = Trie(self.vocab_size, self.silence) - if self.asg_transitions is None: - N = 768 - # self.asg_transitions = torch.FloatTensor(N, N).zero_() - self.asg_transitions = [] - - self.decoder = LexiconDecoder( - self.decoder_opts, - self.trie, - self.lm, - self.silence, - self.blank, - self.unk_word, - self.asg_transitions, - False, - ) + start_state = self.lm.start(False) + for i, (word, spellings) in enumerate(self.lexicon.items()): + word_idx = self.word_dict.get_index(word) + _, score = self.lm.score(start_state, word_idx) + for spelling in spellings: + spelling_idxs = [tgt_dict.index(token) for token in spelling] + assert ( + tgt_dict.unk() not in spelling_idxs + ), f"{spelling} {spelling_idxs}" + self.trie.insert(spelling_idxs, word_idx, score) + self.trie.smear(SmearingMode.MAX) + + self.decoder_opts = LexiconDecoderOptions( + beam_size=args.beam, + beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))), + beam_threshold=args.beam_threshold, + lm_weight=args.lm_weight, + word_score=args.word_score, + unk_score=args.unk_weight, + sil_score=args.sil_weight, + log_add=False, + criterion_type=self.criterion_type, + ) + + if self.asg_transitions is None: + N = 768 + # self.asg_transitions = torch.FloatTensor(N, N).zero_() + self.asg_transitions = [] + + self.decoder = LexiconDecoder( + self.decoder_opts, + self.trie, + self.lm, + self.silence, + self.blank, + self.unk_word, + self.asg_transitions, + self.unit_lm, + ) + else: + assert args.unit_lm, "lexicon free decoding can only be done with a unit language model" + from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions + + d = {w: [[w]] for w in tgt_dict.symbols} + self.word_dict = create_word_dict(d) + self.lm = KenLM(args.kenlm_model, self.word_dict) + self.decoder_opts = LexiconFreeDecoderOptions( + beam_size=args.beam, + beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))), + beam_threshold=args.beam_threshold, + 
lm_weight=args.lm_weight, + sil_score=args.sil_weight, + log_add=False, + criterion_type=self.criterion_type, + ) + self.decoder = LexiconFreeDecoder( + self.decoder_opts, self.lm, self.silence, self.blank, [] + ) + + def get_timesteps(self, token_idxs: List[int]) -> List[int]: + """Returns frame numbers corresponding to every non-blank token. + + Parameters + ---------- + token_idxs : List[int] + IDs of decoded tokens. + + Returns + ------- + List[int] + Frame numbers corresponding to every non-blank token. + """ + timesteps = [] + for i, token_idx in enumerate(token_idxs): + if token_idx == self.blank: + continue + if i == 0 or token_idx != token_idxs[i-1]: + timesteps.append(i) + return timesteps def decode(self, emissions): B, T, N = emissions.size() @@ -197,6 +229,7 @@ def decode(self, emissions): { "tokens": self.get_tokens(result.tokens), "score": result.score, + "timesteps": self.get_timesteps(result.tokens), "words": [ self.word_dict.get_entry(x) for x in result.words if x >= 0 ], @@ -340,19 +373,23 @@ class W2lFairseqLMDecoder(W2lDecoder): def __init__(self, args, tgt_dict): super().__init__(args, tgt_dict) - self.silence = tgt_dict.bos() - self.unit_lm = getattr(args, "unit_lm", False) self.lexicon = load_words(args.lexicon) if args.lexicon else None self.idx_to_wrd = {} checkpoint = torch.load(args.kenlm_model, map_location="cpu") - lm_args = checkpoint["args"] - lm_args.data = osp.dirname(args.kenlm_model) - print(lm_args) - task = tasks.setup_task(lm_args) - model = task.build_model(lm_args) + + if "cfg" in checkpoint and checkpoint["cfg"] is not None: + lm_args = checkpoint["cfg"] + else: + lm_args = convert_namespace_to_omegaconf(checkpoint["args"]) + + with open_dict(lm_args.task): + lm_args.task.data = osp.dirname(args.kenlm_model) + + task = tasks.setup_task(lm_args.task) + model = task.build_model(lm_args.model) model.load_state_dict(checkpoint["model"], strict=False) self.trie = Trie(self.vocab_size, self.silence) @@ -361,19 +398,6 @@ def __init__(self, args, tgt_dict): self.unk_word = self.word_dict.unk() self.lm = FairseqLM(self.word_dict, model) - self.decoder_opts = DecoderOptions( - args.beam, - int(getattr(args, "beam_size_token", len(tgt_dict))), - args.beam_threshold, - args.lm_weight, - args.word_score, - args.unk_weight, - args.sil_weight, - 0, - False, - self.criterion_type, - ) - if self.lexicon: start_state = self.lm.start(False) for i, (word, spellings) in enumerate(self.lexicon.items()): @@ -393,6 +417,18 @@ def __init__(self, args, tgt_dict): self.trie.insert(spelling_idxs, word_idx, score) self.trie.smear(SmearingMode.MAX) + self.decoder_opts = LexiconDecoderOptions( + beam_size=args.beam, + beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))), + beam_threshold=args.beam_threshold, + lm_weight=args.lm_weight, + word_score=args.word_score, + unk_score=args.unk_weight, + sil_score=args.sil_weight, + log_add=False, + criterion_type=self.criterion_type, + ) + self.decoder = LexiconDecoder( self.decoder_opts, self.trie, @@ -404,6 +440,21 @@ def __init__(self, args, tgt_dict): self.unit_lm, ) else: + assert args.unit_lm, "lexicon free decoding can only be done with a unit language model" + from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions + + d = {w: [[w]] for w in tgt_dict.symbols} + self.word_dict = create_word_dict(d) + self.lm = KenLM(args.kenlm_model, self.word_dict) + self.decoder_opts = LexiconFreeDecoderOptions( + beam_size=args.beam, + beam_size_token=int(getattr(args, "beam_size_token", 
len(tgt_dict))), + beam_threshold=args.beam_threshold, + lm_weight=args.lm_weight, + sil_score=args.sil_weight, + log_add=False, + criterion_type=self.criterion_type, + ) self.decoder = LexiconFreeDecoder( self.decoder_opts, self.lm, self.silence, self.blank, [] ) diff --git a/examples/speech_synthesis/README.md b/examples/speech_synthesis/README.md new file mode 100644 index 0000000000..a31e7f68bd --- /dev/null +++ b/examples/speech_synthesis/README.md @@ -0,0 +1,38 @@ +Speech Synthesis (S^2) +=== +[https://arxiv.org/abs/2109.06912](https://arxiv.org/abs/2109.06912) + +Speech synthesis with fairseq. + +## Features + +- Autoregressive and non-autoregressive models +- Multi-speaker synthesis +- Audio preprocessing (denoising, VAD, etc.) for less curated data +- Automatic metrics for model development +- Similar data configuration as [S2T](../speech_to_text/README.md) + + +## Examples +- [Single-speaker synthesis on LJSpeech](docs/ljspeech_example.md) +- [Multi-speaker synthesis on VCTK](docs/vctk_example.md) +- [Multi-speaker synthesis on Common Voice](docs/common_voice_example.md) + + +## Citation +Please cite as: +``` +@article{wang2021fairseqs2, + title={fairseq S\^{} 2: A Scalable and Integrable Speech Synthesis Toolkit}, + author={Wang, Changhan and Hsu, Wei-Ning and Adi, Yossi and Polyak, Adam and Lee, Ann and Chen, Peng-Jen and Gu, Jiatao and Pino, Juan}, + journal={arXiv preprint arXiv:2109.06912}, + year={2021} +} + +@inproceedings{ott2019fairseq, + title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, + author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, + booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, + year = {2019}, +} +``` diff --git a/examples/simultaneous_translation/eval/__init__.py b/examples/speech_synthesis/__init__.py similarity index 100% rename from examples/simultaneous_translation/eval/__init__.py rename to examples/speech_synthesis/__init__.py diff --git a/examples/speech_synthesis/data_utils.py b/examples/speech_synthesis/data_utils.py new file mode 100644 index 0000000000..3b2d079a9a --- /dev/null +++ b/examples/speech_synthesis/data_utils.py @@ -0,0 +1,344 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
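+
+"""Data preparation utilities for speech synthesis: log-Mel spectrogram, pitch
+and energy extraction, global CMVN statistics, phonemization (espeak IPA or
+g2p_en) and forced-alignment loading from MFA TextGrids or pseudo-text units."""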
+ +import io +import os +from pathlib import Path +from typing import Optional, List, Dict +import zipfile +import tempfile +from dataclasses import dataclass +from itertools import groupby + +import torch +import torch.nn.functional as F +import numpy as np +from tqdm import tqdm + +from examples.speech_to_text.data_utils import load_tsv_to_dicts +from fairseq.data.audio.audio_utils import ( + TTSSpectrogram, TTSMelScale, parse_path, read_from_stored_zip, is_npy_data +) + + +def trim_or_pad_to_target_length( + data_1d_or_2d: np.ndarray, target_length: int +) -> np.ndarray: + assert len(data_1d_or_2d.shape) in {1, 2} + delta = data_1d_or_2d.shape[0] - target_length + if delta >= 0: # trim if being longer + data_1d_or_2d = data_1d_or_2d[: target_length] + else: # pad if being shorter + if len(data_1d_or_2d.shape) == 1: + data_1d_or_2d = np.concatenate( + [data_1d_or_2d, np.zeros(-delta)], axis=0 + ) + else: + data_1d_or_2d = np.concatenate( + [data_1d_or_2d, np.zeros((-delta, data_1d_or_2d.shape[1]))], + axis=0 + ) + return data_1d_or_2d + + +def extract_logmel_spectrogram( + waveform: torch.Tensor, sample_rate: int, + output_path: Optional[Path] = None, win_length: int = 1024, + hop_length: int = 256, n_fft: int = 1024, + win_fn: callable = torch.hann_window, n_mels: int = 80, + f_min: float = 0., f_max: float = 8000, eps: float = 1e-5, + overwrite: bool = False, target_length: Optional[int] = None +): + if output_path is not None and output_path.is_file() and not overwrite: + return + + spectrogram_transform = TTSSpectrogram( + n_fft=n_fft, win_length=win_length, hop_length=hop_length, + window_fn=win_fn + ) + mel_scale_transform = TTSMelScale( + n_mels=n_mels, sample_rate=sample_rate, f_min=f_min, f_max=f_max, + n_stft=n_fft // 2 + 1 + ) + spectrogram = spectrogram_transform(waveform) + mel_spec = mel_scale_transform(spectrogram) + logmel_spec = torch.clamp(mel_spec, min=eps).log() + assert len(logmel_spec.shape) == 3 and logmel_spec.shape[0] == 1 + logmel_spec = logmel_spec.squeeze().t() # D x T -> T x D + if target_length is not None: + logmel_spec = trim_or_pad_to_target_length(logmel_spec, target_length) + + if output_path is not None: + np.save(output_path.as_posix(), logmel_spec) + else: + return logmel_spec + + +def extract_pitch( + waveform: torch.Tensor, sample_rate: int, + output_path: Optional[Path] = None, hop_length: int = 256, + log_scale: bool = True, phoneme_durations: Optional[List[int]] = None +): + if output_path is not None and output_path.is_file(): + return + + try: + import pyworld + except ImportError: + raise ImportError("Please install PyWORLD: pip install pyworld") + + _waveform = waveform.squeeze(0).double().numpy() + pitch, t = pyworld.dio( + _waveform, sample_rate, frame_period=hop_length / sample_rate * 1000 + ) + pitch = pyworld.stonemask(_waveform, pitch, t, sample_rate) + + if phoneme_durations is not None: + pitch = trim_or_pad_to_target_length(pitch, sum(phoneme_durations)) + try: + from scipy.interpolate import interp1d + except ImportError: + raise ImportError("Please install SciPy: pip install scipy") + nonzero_ids = np.where(pitch != 0)[0] + if len(nonzero_ids) == 0: + print((f"{output_path} has all empty values in the pitch contour")) + return + elif len(nonzero_ids) == 1: + print((f"{output_path} has only one non-zero values in the pitch contour")) + return + else: + interp_fn = interp1d( + nonzero_ids, + pitch[nonzero_ids], + fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]), + bounds_error=False, + ) + pitch = interp_fn(np.arange(0, 
len(pitch))) + d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations])) + pitch = np.array( + [ + np.mean(pitch[d_cumsum[i-1]: d_cumsum[i]]) + for i in range(1, len(d_cumsum)) + ] + ) + assert len(pitch) == len(phoneme_durations) + + if log_scale: + pitch = np.log(pitch + 1) + + if output_path is not None: + np.save(output_path.as_posix(), pitch) + else: + return pitch + + +def extract_energy( + waveform: torch.Tensor, output_path: Optional[Path] = None, + hop_length: int = 256, n_fft: int = 1024, log_scale: bool = True, + phoneme_durations: Optional[List[int]] = None +): + if output_path is not None and output_path.is_file(): + return + + assert len(waveform.shape) == 2 and waveform.shape[0] == 1 + waveform = waveform.view(1, 1, waveform.shape[1]) + waveform = F.pad( + waveform.unsqueeze(1), [n_fft // 2, n_fft // 2, 0, 0], + mode="reflect" + ) + waveform = waveform.squeeze(1) + + fourier_basis = np.fft.fft(np.eye(n_fft)) + cutoff = int((n_fft / 2 + 1)) + fourier_basis = np.vstack( + [np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])] + ) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + forward_transform = F.conv1d( + waveform, forward_basis, stride=hop_length, padding=0 + ) + + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) + energy = torch.norm(magnitude, dim=1).squeeze(0).numpy() + + if phoneme_durations is not None: + energy = trim_or_pad_to_target_length(energy, sum(phoneme_durations)) + d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations])) + energy = np.array( + [ + np.mean(energy[d_cumsum[i - 1]: d_cumsum[i]]) + for i in range(1, len(d_cumsum)) + ] + ) + assert len(energy) == len(phoneme_durations) + + if log_scale: + energy = np.log(energy + 1) + + if output_path is not None: + np.save(output_path.as_posix(), energy) + else: + return energy + + +def get_global_cmvn(feature_root: Path, output_path: Optional[Path] = None): + mean_x, mean_x2, n_frames = None, None, 0 + feature_paths = feature_root.glob("*.npy") + for p in tqdm(feature_paths): + with open(p, 'rb') as f: + frames = np.load(f).squeeze() + + n_frames += frames.shape[0] + + cur_mean_x = frames.sum(axis=0) + if mean_x is None: + mean_x = cur_mean_x + else: + mean_x += cur_mean_x + + cur_mean_x2 = (frames ** 2).sum(axis=0) + if mean_x2 is None: + mean_x2 = cur_mean_x2 + else: + mean_x2 += cur_mean_x2 + + mean_x /= n_frames + mean_x2 /= n_frames + var_x = mean_x2 - mean_x ** 2 + std_x = np.sqrt(np.maximum(var_x, 1e-10)) + + if output_path is not None: + with open(output_path, 'wb') as f: + np.savez(f, mean=mean_x, std=std_x) + else: + return {"mean": mean_x, "std": std_x} + + +def ipa_phonemize(text, lang="en-us", use_g2p=False): + if use_g2p: + assert lang == "en-us", "g2pE phonemizer only works for en-us" + try: + from g2p_en import G2p + g2p = G2p() + return " ".join("|" if p == " " else p for p in g2p(text)) + except ImportError: + raise ImportError( + "Please install phonemizer: pip install g2p_en" + ) + else: + try: + from phonemizer import phonemize + from phonemizer.separator import Separator + return phonemize( + text, backend='espeak', language=lang, + separator=Separator(word="| ", phone=" ") + ) + except ImportError: + raise ImportError( + "Please install phonemizer: pip install phonemizer" + ) + + +@dataclass +class ForceAlignmentInfo(object): + tokens: List[str] + frame_durations: List[int] + start_sec: Optional[float] + end_sec: 
Optional[float] + + +def get_mfa_alignment_by_sample_id( + textgrid_zip_path: str, sample_id: str, sample_rate: int, + hop_length: int, silence_phones: List[str] = ("sil", "sp", "spn") +) -> ForceAlignmentInfo: + try: + import tgt + except ImportError: + raise ImportError("Please install TextGridTools: pip install tgt") + + filename = f"{sample_id}.TextGrid" + out_root = Path(tempfile.gettempdir()) + tgt_path = out_root / filename + with zipfile.ZipFile(textgrid_zip_path) as f_zip: + f_zip.extract(filename, path=out_root) + textgrid = tgt.io.read_textgrid(tgt_path.as_posix()) + os.remove(tgt_path) + + phones, frame_durations = [], [] + start_sec, end_sec, end_idx = 0, 0, 0 + for t in textgrid.get_tier_by_name("phones")._objects: + s, e, p = t.start_time, t.end_time, t.text + # Trim leading silences + if len(phones) == 0: + if p in silence_phones: + continue + else: + start_sec = s + phones.append(p) + if p not in silence_phones: + end_sec = e + end_idx = len(phones) + r = sample_rate / hop_length + frame_durations.append(int(np.round(e * r) - np.round(s * r))) + # Trim tailing silences + phones = phones[:end_idx] + frame_durations = frame_durations[:end_idx] + + return ForceAlignmentInfo( + tokens=phones, frame_durations=frame_durations, start_sec=start_sec, + end_sec=end_sec + ) + + +def get_mfa_alignment( + textgrid_zip_path: str, sample_ids: List[str], sample_rate: int, + hop_length: int +) -> Dict[str, ForceAlignmentInfo]: + return { + i: get_mfa_alignment_by_sample_id( + textgrid_zip_path, i, sample_rate, hop_length + ) for i in tqdm(sample_ids) + } + + +def get_unit_alignment( + id_to_unit_tsv_path: str, sample_ids: List[str] +) -> Dict[str, ForceAlignmentInfo]: + id_to_units = { + e["id"]: e["units"] for e in load_tsv_to_dicts(id_to_unit_tsv_path) + } + id_to_units = {i: id_to_units[i].split() for i in sample_ids} + id_to_units_collapsed = { + i: [uu for uu, _ in groupby(u)] for i, u in id_to_units.items() + } + id_to_durations = { + i: [len(list(g)) for _, g in groupby(u)] for i, u in id_to_units.items() + } + + return { + i: ForceAlignmentInfo( + tokens=id_to_units_collapsed[i], frame_durations=id_to_durations[i], + start_sec=None, end_sec=None + ) + for i in sample_ids + } + + +def get_feature_value_min_max(feature_paths: List[str]): + v_min, v_max = 1e-8, -1e-8 + for p in tqdm(feature_paths): + _path, slice_ptr = parse_path(p) + assert len(slice_ptr) == 2 + byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1]) + assert is_npy_data(byte_data) + path_or_fp = io.BytesIO(byte_data) + features = np.load(path_or_fp).squeeze() + v_min = min(v_min, features.min().item()) + v_max = max(v_max, features.max().item()) + return v_min, v_max diff --git a/examples/speech_synthesis/docs/common_voice_example.md b/examples/speech_synthesis/docs/common_voice_example.md new file mode 100644 index 0000000000..1c0eef69a0 --- /dev/null +++ b/examples/speech_synthesis/docs/common_voice_example.md @@ -0,0 +1,67 @@ +[[Back]](..) + +# Common Voice + +[Common Voice](https://commonvoice.mozilla.org/en/datasets) is a public domain speech corpus with 11.2K hours of read +speech in 76 languages (the latest version 7.0). We provide examples for building +[Transformer](https://arxiv.org/abs/1809.08895) models on this dataset. + + +## Data preparation +[Download](https://commonvoice.mozilla.org/en/datasets) and unpack Common Voice v4 to a path `${DATA_ROOT}/${LANG_ID}`. 
+Create splits and generate audio manifests with +```bash +python -m examples.speech_synthesis.preprocessing.get_common_voice_audio_manifest \ + --data-root ${DATA_ROOT} \ + --lang ${LANG_ID} \ + --output-manifest-root ${AUDIO_MANIFEST_ROOT} --convert-to-wav +``` + +To denoise audio and trim leading/trailing silence using signal processing based VAD, run +```bash +for SPLIT in dev test train; do + python -m examples.speech_synthesis.preprocessing.denoise_and_vad_audio \ + --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ + --output-dir ${PROCESSED_DATA_ROOT} \ + --denoise --vad --vad-agg-level 2 +done +``` + +which generates a new audio TSV manifest under `${PROCESSED_DATA_ROOT}` with updated path to the processed audio and +a new column for SNR. + +To do filtering by CER, follow the [Automatic Evaluation](../docs/ljspeech_example.md#automatic-evaluation) section to +run ASR model (add `--eval-target` to `get_eval_manifest` for evaluation on the reference audio; add `--err-unit char` +to `eval_asr` to compute CER instead of WER). The example-level CER is saved to +`${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv`. + +Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with +```bash +python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ + --audio-manifest-root ${AUDIO_MANIFEST_ROOT} \ + --output-root ${FEATURE_MANIFEST_ROOT} \ + --ipa-vocab --lang ${LANG_ID} \ + --snr-threshold 15 \ + --cer-threshold 0.1 --cer-tsv-path ${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv +``` +where we use phoneme inputs (`--ipa-vocab`) as example. For sample filtering, we set the SNR and CER threshold +to 15 and 10%, respectively. + + +## Training +(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#transformer).) + + +## Inference +(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#inference).) + +## Automatic Evaluation +(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#automatic-evaluation).) + +## Results + +| Language | Speakers | --arch | Params | Test MCD | Model | +|---|---|---|---|---|---| +| English | 200 | tts_transformer | 54M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/cv4_en200_transformer_phn.tar) | + +[[Back]](..) diff --git a/examples/speech_synthesis/docs/ljspeech_example.md b/examples/speech_synthesis/docs/ljspeech_example.md new file mode 100644 index 0000000000..836c30d6d5 --- /dev/null +++ b/examples/speech_synthesis/docs/ljspeech_example.md @@ -0,0 +1,137 @@ +[[Back]](..) + +# LJSpeech + +[LJSpeech](https://keithito.com/LJ-Speech-Dataset) is a public domain TTS +corpus with around 24 hours of English speech sampled at 22.05kHz. We provide examples for building +[Transformer](https://arxiv.org/abs/1809.08895) and [FastSpeech 2](https://arxiv.org/abs/2006.04558) +models on this dataset. + + +## Data preparation + +Download data, create splits and generate audio manifests with +```bash +python -m examples.speech_synthesis.preprocessing.get_ljspeech_audio_manifest \ + --output-data-root ${AUDIO_DATA_ROOT} \ + --output-manifest-root ${AUDIO_MANIFEST_ROOT} +``` + +Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with +```bash +python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ + --audio-manifest-root ${AUDIO_MANIFEST_ROOT} \ + --output-root ${FEATURE_MANIFEST_ROOT} \ + --ipa-vocab --use-g2p +``` +where we use phoneme inputs (`--ipa-vocab --use-g2p`) as example. 
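+
+If you want to sanity-check what the phonemized inputs look like, you can call the
+`ipa_phonemize` helper from `examples/speech_synthesis/data_utils.py` directly. The snippet
+below is only an illustration: it assumes `g2p_en` is installed, and the sample sentence and
+the phone sequence shown in the comment are examples of the expected ARPAbet-style output,
+not guaranteed values.
+```python
+from examples.speech_synthesis.data_utils import ipa_phonemize
+
+# With use_g2p=True, g2p_en produces ARPAbet phonemes; word boundaries become "|".
+print(ipa_phonemize("Hello world", lang="en-us", use_g2p=True))
+# e.g. HH AH0 L OW1 | W ER1 L D
+```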
+ +FastSpeech 2 additionally requires frame durations, pitch and energy as auxiliary training targets. +Add `--add-fastspeech-targets` to include these fields in the feature manifests. We get frame durations either from +phoneme-level force-alignment or frame-level pseudo-text unit sequence. They should be pre-computed and specified via: +- `--textgrid-zip ${TEXT_GRID_ZIP_PATH}` for a ZIP file, inside which there is one + [TextGrid](https://www.fon.hum.uva.nl/praat/manual/TextGrid.html) file per sample to provide force-alignment info. +- `--id-to-units-tsv ${ID_TO_UNIT_TSV}` for a TSV file, where there are 2 columns for sample ID and + space-delimited pseudo-text unit sequence, respectively. + +For your convenience, we provide pre-computed +[force-alignment](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_mfa.zip) from +[Montreal Forced Aligner](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) and +[pseudo-text units](s3://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_hubert.tsv) from +[HuBERT](https://github.com/pytorch/fairseq/tree/main/examples/hubert). You can also generate them by yourself using +a different software or model. + + +## Training +#### Transformer +```bash +fairseq-train ${FEATURE_MANIFEST_ROOT} --save-dir ${SAVE_DIR} \ + --config-yaml config.yaml --train-subset train --valid-subset dev \ + --num-workers 4 --max-tokens 30000 --max-update 200000 \ + --task text_to_speech --criterion tacotron2 --arch tts_transformer \ + --clip-norm 5.0 --n-frames-per-step 4 --bce-pos-weight 5.0 \ + --dropout 0.1 --attention-dropout 0.1 --activation-dropout 0.1 \ + --encoder-normalize-before --decoder-normalize-before \ + --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ + --seed 1 --update-freq 8 --eval-inference --best-checkpoint-metric mcd_loss +``` +where `SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to +update it accordingly when using more than 1 GPU. + +#### FastSpeech2 +```bash +fairseq-train ${FEATURE_MANIFEST_ROOT} --save-dir ${SAVE_DIR} \ + --config-yaml config.yaml --train-subset train --valid-subset dev \ + --num-workers 4 --max-sentences 6 --max-update 200000 \ + --task text_to_speech --criterion fastspeech2 --arch fastspeech2 \ + --clip-norm 5.0 --n-frames-per-step 1 \ + --dropout 0.1 --attention-dropout 0.1 \ + --optimizer adam --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ + --seed 1 --update-freq 8 --eval-inference --best-checkpoint-metric mcd_loss +``` + + +## Inference +Average the last 5 checkpoints, generate the test split spectrogram and waveform using the default Griffin-Lim vocoder: +```bash +SPLIT=test +CHECKPOINT_NAME=avg_last_5 +CHECKPOINT_PATH=${SAVE_DIR}/checkpoint_${CHECKPOINT_NAME}.pt +python scripts/average_checkpoints.py --inputs ${SAVE_DIR} \ + --num-epoch-checkpoints 5 \ + --output ${CHECKPOINT_PATH} + +python -m examples.speech_synthesis.generate_waveform ${FEATURE_MANIFEST_ROOT} \ + --config-yaml config.yaml --gen-subset ${SPLIT} --task text_to_speech \ + --path ${CHECKPOINT_PATH} --max-tokens 50000 --spec-bwd-max-iter 32 \ + --dump-waveforms +``` +which dumps files (waveform, feature, attention plot, etc.) to `${SAVE_DIR}/generate-${CHECKPOINT_NAME}-${SPLIT}`. To +re-synthesize target waveforms for automatic evaluation, add `--dump-target`. + +## Automatic Evaluation +To start with, generate the manifest for synthetic speech, which will be taken as inputs by evaluation scripts. 
+```bash +python -m examples.speech_synthesis.evaluation.get_eval_manifest \ + --generation-root ${SAVE_DIR}/generate-${CHECKPOINT_NAME}-${SPLIT} \ + --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ + --output-path ${EVAL_OUTPUT_ROOT}/eval.tsv \ + --vocoder griffin_lim --sample-rate 22050 --audio-format flac \ + --use-resynthesized-target +``` +Speech recognition (ASR) models usually operate at lower sample rates (e.g. 16kHz). For the WER/CER metric, +you may need to resample the audios accordingly --- add `--output-sample-rate 16000` for `generate_waveform.py` and +use `--sample-rate 16000` for `get_eval_manifest.py`. + + +#### WER/CER metric +We use wav2vec 2.0 ASR model as example. [Download](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) +the model checkpoint and dictionary, then compute WER/CER with +```bash +python -m examples.speech_synthesis.evaluation.eval_asr \ + --audio-header syn --text-header text --err-unit char --split ${SPLIT} \ + --w2v-ckpt ${WAV2VEC2_CHECKPOINT_PATH} --w2v-dict-dir ${WAV2VEC2_DICT_DIR} \ + --raw-manifest ${EVAL_OUTPUT_ROOT}/eval_16khz.tsv --asr-dir ${EVAL_OUTPUT_ROOT}/asr +``` + +#### MCD/MSD metric +```bash +python -m examples.speech_synthesis.evaluation.eval_sp \ + ${EVAL_OUTPUT_ROOT}/eval.tsv --mcd --msd +``` + +#### F0 metrics +```bash +python -m examples.speech_synthesis.evaluation.eval_f0 \ + ${EVAL_OUTPUT_ROOT}/eval.tsv --gpe --vde --ffe +``` + + +## Results + +| --arch | Params | Test MCD | Model | +|---|---|---|---| +| tts_transformer | 54M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_transformer_phn.tar) | +| fastspeech2 | 41M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_fastspeech2_phn.tar) | + +[[Back]](..) diff --git a/examples/speech_synthesis/docs/vctk_example.md b/examples/speech_synthesis/docs/vctk_example.md new file mode 100644 index 0000000000..6808256d44 --- /dev/null +++ b/examples/speech_synthesis/docs/vctk_example.md @@ -0,0 +1,61 @@ +[[Back]](..) + +# VCTK + +[VCTK](https://datashare.ed.ac.uk/handle/10283/3443) is an open English speech corpus. We provide examples +for building [Transformer](https://arxiv.org/abs/1809.08895) models on this dataset. + + +## Data preparation +Download data, create splits and generate audio manifests with +```bash +python -m examples.speech_synthesis.preprocessing.get_vctk_audio_manifest \ + --output-data-root ${AUDIO_DATA_ROOT} \ + --output-manifest-root ${AUDIO_MANIFEST_ROOT} +``` + +To denoise audio and trim leading/trailing silence using signal processing based VAD, run +```bash +for SPLIT in dev test train; do + python -m examples.speech_synthesis.preprocessing.denoise_and_vad_audio \ + --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ + --output-dir ${PROCESSED_DATA_ROOT} \ + --denoise --vad --vad-agg-level 3 +done +``` +which generates a new audio TSV manifest under `${PROCESSED_DATA_ROOT}` with updated path to the processed audio and +a new column for SNR. + +To do filtering by CER, follow the [Automatic Evaluation](../docs/ljspeech_example.md#automatic-evaluation) section to +run ASR model (add `--eval-target` to `get_eval_manifest` for evaluation on the reference audio; add `--err-unit char` +to `eval_asr` to compute CER instead of WER). The example-level CER is saved to +`${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv`. 
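+
+Before moving on, it can be useful to check how many samples the CER filter would keep. The
+sketch below is only illustrative: it assumes the `id`/`audio`/`uer` columns written by
+`eval_asr.py`, uses a hypothetical `train` split filename, and mirrors the `--cer-threshold 0.1`
+setting used in the next step (the preprocessing script may apply the comparison slightly
+differently).
+```python
+import csv
+
+with open("uer_cer.train.tsv") as f:
+    rows = list(csv.DictReader(f, delimiter="\t"))
+
+# Keep samples whose character error rate is at most 10%.
+kept = [r for r in rows if float(r["uer"]) <= 0.1]
+print(f"kept {len(kept)} / {len(rows)} samples")
+```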
+ +Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with +```bash +python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ + --audio-manifest-root ${PROCESSED_DATA_ROOT} \ + --output-root ${FEATURE_MANIFEST_ROOT} \ + --ipa-vocab --use-g2p \ + --snr-threshold 15 \ + --cer-threshold 0.1 --cer-tsv-path ${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv +``` +where we use phoneme inputs (`--ipa-vocab --use-g2p`) as example. For sample filtering, we set the SNR and CER threshold +to 15 and 10%, respectively. + +## Training +(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#transformer).) + +## Inference +(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#inference).) + +## Automatic Evaluation +(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#automatic-evaluation).) + +## Results + +| --arch | Params | Test MCD | Model | +|---|---|---|---| +| tts_transformer | 54M | 3.4 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/vctk_transformer_phn.tar) | + +[[Back]](..) diff --git a/examples/speech_synthesis/evaluation/__init__.py b/examples/speech_synthesis/evaluation/__init__.py new file mode 100644 index 0000000000..6264236915 --- /dev/null +++ b/examples/speech_synthesis/evaluation/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/examples/speech_synthesis/evaluation/eval_asr.py b/examples/speech_synthesis/evaluation/eval_asr.py new file mode 100644 index 0000000000..005a11bfb3 --- /dev/null +++ b/examples/speech_synthesis/evaluation/eval_asr.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
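+
+"""ASR-based evaluation of synthesized speech: prepares a wav2vec 2.0-style data
+directory, transcribes the audio via examples.speech_recognition.infer with a
+Viterbi decoder, and writes per-sample WER/CER to a TSV file."""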
+ +import argparse +import editdistance +import re +import shutil +import soundfile as sf +import subprocess +from pathlib import Path + +from examples.speech_to_text.data_utils import load_tsv_to_dicts + + +def preprocess_text(text): + text = "|".join(re.sub(r"[^A-Z' ]", " ", text.upper()).split()) + text = " ".join(text) + return text + + +def prepare_w2v_data( + dict_dir, sample_rate, label, audio_paths, texts, split, data_dir +): + data_dir.mkdir(parents=True, exist_ok=True) + shutil.copyfile( + dict_dir / f"dict.{label}.txt", + data_dir / f"dict.{label}.txt" + ) + with open(data_dir / f"{split}.tsv", "w") as f: + f.write("/\n") + for audio_path in audio_paths: + wav, sr = sf.read(audio_path) + assert sr == sample_rate, f"{sr} != sample_rate" + nsample = len(wav) + f.write(f"{audio_path}\t{nsample}\n") + with open(data_dir / f"{split}.{label}", "w") as f: + for text in texts: + text = preprocess_text(text) + f.write(f"{text}\n") + + +def run_asr(asr_dir, split, w2v_ckpt, w2v_label, res_dir): + """ + results will be saved at + {res_dir}/{ref,hypo}.word-{w2v_ckpt.filename}-{split}.txt + """ + cmd = ["python", "-m", "examples.speech_recognition.infer"] + cmd += [str(asr_dir.resolve())] + cmd += ["--task", "audio_finetuning", "--nbest", "1", "--quiet"] + cmd += ["--w2l-decoder", "viterbi", "--criterion", "ctc"] + cmd += ["--post-process", "letter", "--max-tokens", "4000000"] + cmd += ["--path", str(w2v_ckpt.resolve()), "--labels", w2v_label] + cmd += ["--gen-subset", split, "--results-path", str(res_dir.resolve())] + + print(f"running cmd:\n{' '.join(cmd)}") + subprocess.run(cmd, check=True) + + +def compute_error_rate(hyp_wrd_path, ref_wrd_path, unit="word"): + """each line is "<text> (None-<index>)" """ + tokenize_line = { + "word": lambda x: re.sub(r" \(.*\)$", "", x.rstrip()).split(), + "char": lambda x: list(re.sub(r" \(.*\)$", "", x.rstrip())) + }.get(unit) + if tokenize_line is None: + raise ValueError(f"{unit} not supported") + + inds = [int(re.sub(r"\D*(\d*)\D*", r"\1", line)) + for line in open(hyp_wrd_path)] + hyps = [tokenize_line(line) for line in open(hyp_wrd_path)] + refs = [tokenize_line(line) for line in open(ref_wrd_path)] + assert(len(hyps) == len(refs)) + err_rates = [ + editdistance.eval(hyp, ref) / len(ref) for hyp, ref in zip(hyps, refs) + ] + ind_to_err_rates = {i: e for i, e in zip(inds, err_rates)} + return ind_to_err_rates + + +def main(args): + samples = load_tsv_to_dicts(args.raw_manifest) + ids = [ + sample[args.id_header] if args.id_header else "" for sample in samples + ] + audio_paths = [sample[args.audio_header] for sample in samples] + texts = [sample[args.text_header] for sample in samples] + + prepare_w2v_data( + args.w2v_dict_dir, + args.w2v_sample_rate, + args.w2v_label, + audio_paths, + texts, + args.split, + args.asr_dir + ) + run_asr(args.asr_dir, args.split, args.w2v_ckpt, args.w2v_label, args.asr_dir) + ind_to_err_rates = compute_error_rate( + args.asr_dir / f"hypo.word-{args.w2v_ckpt.name}-{args.split}.txt", + args.asr_dir / f"ref.word-{args.w2v_ckpt.name}-{args.split}.txt", + args.err_unit, + ) + + uer_path = args.asr_dir / f"uer_{args.err_unit}.{args.split}.tsv" + with open(uer_path, "w") as f: + f.write("id\taudio\tuer\n") + for ind, (id_, audio_path) in enumerate(zip(ids, audio_paths)): + f.write(f"{id_}\t{audio_path}\t{ind_to_err_rates[ind]:.4f}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--raw-manifest", required=True, type=Path) + parser.add_argument("--asr-dir", required=True, 
type=Path) + parser.add_argument("--id-header", default="id", type=str) + parser.add_argument("--audio-header", default="audio", type=str) + parser.add_argument("--text-header", default="src_text", type=str) + parser.add_argument("--split", default="raw", type=str) + parser.add_argument("--w2v-ckpt", required=True, type=Path) + parser.add_argument("--w2v-dict-dir", required=True, type=Path) + parser.add_argument("--w2v-sample-rate", default=16000, type=int) + parser.add_argument("--w2v-label", default="ltr", type=str) + parser.add_argument("--err-unit", default="word", type=str) + args = parser.parse_args() + + main(args) diff --git a/examples/speech_synthesis/evaluation/eval_f0.py b/examples/speech_synthesis/evaluation/eval_f0.py new file mode 100644 index 0000000000..df721d6831 --- /dev/null +++ b/examples/speech_synthesis/evaluation/eval_f0.py @@ -0,0 +1,266 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Signal processing-based evaluation using waveforms +""" +import numpy as np +import os.path as op + +import torchaudio +import tqdm +from tabulate import tabulate + +from examples.speech_synthesis.utils import ( + gross_pitch_error, voicing_decision_error, f0_frame_error +) +from examples.speech_synthesis.evaluation.eval_sp import load_eval_spec + + +def difference_function(x, n, tau_max): + """ + Compute difference function of data x. This solution is implemented directly + with Numpy fft. + + + :param x: audio data + :param n: length of data + :param tau_max: integration window size + :return: difference function + :rtype: list + """ + + x = np.array(x, np.float64) + w = x.size + tau_max = min(tau_max, w) + x_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum())) + size = w + tau_max + p2 = (size // 32).bit_length() + nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32) + size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size) + fc = np.fft.rfft(x, size_pad) + conv = np.fft.irfft(fc * fc.conjugate())[:tau_max] + return x_cumsum[w:w - tau_max:-1] + x_cumsum[w] - x_cumsum[:tau_max] - \ + 2 * conv + + +def cumulative_mean_normalized_difference_function(df, n): + """ + Compute cumulative mean normalized difference function (CMND). + + :param df: Difference function + :param n: length of data + :return: cumulative mean normalized difference function + :rtype: list + """ + + # scipy method + cmn_df = df[1:] * range(1, n) / np.cumsum(df[1:]).astype(float) + return np.insert(cmn_df, 0, 1) + + +def get_pitch(cmdf, tau_min, tau_max, harmo_th=0.1): + """ + Return fundamental period of a frame based on CMND function. + + :param cmdf: Cumulative Mean Normalized Difference function + :param tau_min: minimum period for speech + :param tau_max: maximum period for speech + :param harmo_th: harmonicity threshold to determine if it is necessary to + compute pitch frequency + :return: fundamental period if there is values under threshold, 0 otherwise + :rtype: float + """ + tau = tau_min + while tau < tau_max: + if cmdf[tau] < harmo_th: + while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]: + tau += 1 + return tau + tau += 1 + + return 0 # if unvoiced + + +def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, + harmo_thresh=0.1): + """ + + Compute the Yin Algorithm. Return fundamental frequency and harmonic rate. 
+
+    https://github.com/NVIDIA/mellotron adaptation of
+    https://github.com/patriceguyot/Yin
+
+    :param sig: Audio signal (list of float)
+    :param sr: sampling rate (int)
+    :param w_len: size of the analysis window (samples)
+    :param w_step: size of the lag between two consecutive windows (samples)
+    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
+    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
+    :param harmo_thresh: Threshold of detection. The algorithm returns the
+    first minimum of the CMND function below this threshold.
+
+    :returns:
+
+    * pitches: list of fundamental frequencies,
+    * harmonic_rates: list of harmonic rate values for each fundamental
+    frequency value (= confidence value)
+    * argmins: minimums of the Cumulative Mean Normalized Difference Function
+    * times: list of time of each estimation
+    :rtype: tuple
+    """
+
+    tau_min = int(sr / f0_max)
+    tau_max = int(sr / f0_min)
+
+    # time values for each analysis window
+    time_scale = range(0, len(sig) - w_len, w_step)
+    times = [t/float(sr) for t in time_scale]
+    frames = [sig[t:t + w_len] for t in time_scale]
+
+    pitches = [0.0] * len(time_scale)
+    harmonic_rates = [0.0] * len(time_scale)
+    argmins = [0.0] * len(time_scale)
+
+    for i, frame in enumerate(frames):
+        # Compute YIN
+        df = difference_function(frame, w_len, tau_max)
+        cm_df = cumulative_mean_normalized_difference_function(df, tau_max)
+        p = get_pitch(cm_df, tau_min, tau_max, harmo_thresh)
+
+        # Get results
+        if np.argmin(cm_df) > tau_min:
+            argmins[i] = float(sr / np.argmin(cm_df))
+        if p != 0:  # A pitch was found
+            pitches[i] = float(sr / p)
+            harmonic_rates[i] = cm_df[p]
+        else:  # No pitch, but we compute a value of the harmonic rate
+            harmonic_rates[i] = min(cm_df)
+
+    return pitches, harmonic_rates, argmins, times
+
+
+def extract_f0(samples):
+    f0_samples = []
+    for sample in tqdm.tqdm(samples):
+        if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]):
+            f0_samples.append(None)
+            continue
+
+        # assume single channel
+        yref, sr = torchaudio.load(sample["ref"])
+        ysyn, _sr = torchaudio.load(sample["syn"])
+        yref, ysyn = yref[0], ysyn[0]
+        assert sr == _sr, f"{sr} != {_sr}"
+
+        yref_f0 = compute_yin(yref, sr)
+        ysyn_f0 = compute_yin(ysyn, sr)
+
+        f0_samples += [
+            {
+                "ref": yref_f0,
+                "syn": ysyn_f0
+            }
+        ]
+
+    return f0_samples
+
+
+def eval_f0_error(samples, distortion_fn):
+    results = []
+    for sample in tqdm.tqdm(samples):
+        if sample is None:
+            results.append(None)
+            continue
+        # assume single channel
+        yref_f, _, _, yref_t = sample["ref"]
+        ysyn_f, _, _, ysyn_t = sample["syn"]
+
+        yref_f = np.array(yref_f)
+        yref_t = np.array(yref_t)
+        ysyn_f = np.array(ysyn_f)
+        ysyn_t = np.array(ysyn_t)
+
+        distortion = distortion_fn(yref_t, yref_f, ysyn_t, ysyn_f)
+        results.append((distortion.item(),
+                        len(yref_f),
+                        len(ysyn_f)
+                        ))
+    return results
+
+
+def eval_gross_pitch_error(samples):
+    return eval_f0_error(samples, gross_pitch_error)
+
+
+def eval_voicing_decision_error(samples):
+    return eval_f0_error(samples, voicing_decision_error)
+
+
+def eval_f0_frame_error(samples):
+    return eval_f0_error(samples, f0_frame_error)
+
+
+def print_results(results, show_bin):
+    results = np.array(list(filter(lambda x: x is not None, results)))
+
+    np.set_printoptions(precision=3)
+
+    def _print_result(results):
+        res = {
+            "nutt": len(results),
+            "error": results[:, 0].mean(),
+            "std": results[:, 0].std(),
+            "dur_ref": int(results[:, 1].sum()),
+            "dur_syn": int(results[:, 2].sum()),
+        }
+
print(tabulate([res.values()], res.keys(), floatfmt=".4f")) + + print(">>>> ALL") + _print_result(results) + + if show_bin: + edges = [0, 200, 400, 600, 800, 1000, 2000, 4000] + for i in range(1, len(edges)): + mask = np.logical_and(results[:, 1] >= edges[i-1], + results[:, 1] < edges[i]) + if not mask.any(): + continue + bin_results = results[mask] + print(f">>>> ({edges[i-1]}, {edges[i]})") + _print_result(bin_results) + + +def main(eval_f0, gpe, vde, ffe, show_bin): + samples = load_eval_spec(eval_f0) + if gpe or vde or ffe: + f0_samples = extract_f0(samples) + + if gpe: + print("===== Evaluate Gross Pitch Error =====") + results = eval_gross_pitch_error(f0_samples) + print_results(results, show_bin) + if vde: + print("===== Evaluate Voicing Decision Error =====") + results = eval_voicing_decision_error(f0_samples) + print_results(results, show_bin) + if ffe: + print("===== Evaluate F0 Frame Error =====") + results = eval_f0_frame_error(f0_samples) + print_results(results, show_bin) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("eval_f0") + parser.add_argument("--gpe", action="store_true") + parser.add_argument("--vde", action="store_true") + parser.add_argument("--ffe", action="store_true") + parser.add_argument("--show-bin", action="store_true") + args = parser.parse_args() + + main(args.eval_f0, args.gpe, args.vde, args.ffe, args.show_bin) diff --git a/examples/speech_synthesis/evaluation/eval_sp.py b/examples/speech_synthesis/evaluation/eval_sp.py new file mode 100644 index 0000000000..702c498038 --- /dev/null +++ b/examples/speech_synthesis/evaluation/eval_sp.py @@ -0,0 +1,131 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +""" +Signal processing-based evaluation using waveforms +""" + +import csv +import numpy as np +import os.path as op + +import torch +import tqdm +from tabulate import tabulate +import torchaudio + +from examples.speech_synthesis.utils import batch_mel_spectral_distortion +from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion + + +def load_eval_spec(path): + with open(path) as f: + reader = csv.DictReader(f, delimiter='\t') + samples = list(reader) + return samples + + +def eval_distortion(samples, distortion_fn, device="cuda"): + nmiss = 0 + results = [] + for sample in tqdm.tqdm(samples): + if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]): + nmiss += 1 + results.append(None) + continue + # assume single channel + yref, sr = torchaudio.load(sample["ref"]) + ysyn, _sr = torchaudio.load(sample["syn"]) + yref, ysyn = yref[0].to(device), ysyn[0].to(device) + assert sr == _sr, f"{sr} != {_sr}" + + distortion, extra = distortion_fn([yref], [ysyn], sr, None)[0] + _, _, _, _, _, pathmap = extra + nins = torch.sum(pathmap.sum(dim=1) - 1) # extra frames in syn + ndel = torch.sum(pathmap.sum(dim=0) - 1) # missing frames from syn + results.append( + (distortion.item(), # path distortion + pathmap.size(0), # yref num frames + pathmap.size(1), # ysyn num frames + pathmap.sum().item(), # path length + nins.item(), # insertion + ndel.item(), # deletion + ) + ) + return results + + +def eval_mel_cepstral_distortion(samples, device="cuda"): + return eval_distortion(samples, batch_mel_cepstral_distortion, device) + + +def eval_mel_spectral_distortion(samples, device="cuda"): + return eval_distortion(samples, batch_mel_spectral_distortion, device) + + +def print_results(results, show_bin): + results = np.array(list(filter(lambda x: x is not None, results))) + + np.set_printoptions(precision=3) + + def _print_result(results): + dist, dur_ref, dur_syn, dur_ali, nins, ndel = results.sum(axis=0) + res = { + "nutt": len(results), + "dist": dist, + "dur_ref": int(dur_ref), + "dur_syn": int(dur_syn), + "dur_ali": int(dur_ali), + "dist_per_ref_frm": dist/dur_ref, + "dist_per_syn_frm": dist/dur_syn, + "dist_per_ali_frm": dist/dur_ali, + "ins": nins/dur_ref, + "del": ndel/dur_ref, + } + print(tabulate( + [res.values()], + res.keys(), + floatfmt=".4f" + )) + + print(">>>> ALL") + _print_result(results) + + if show_bin: + edges = [0, 200, 400, 600, 800, 1000, 2000, 4000] + for i in range(1, len(edges)): + mask = np.logical_and(results[:, 1] >= edges[i-1], + results[:, 1] < edges[i]) + if not mask.any(): + continue + bin_results = results[mask] + print(f">>>> ({edges[i-1]}, {edges[i]})") + _print_result(bin_results) + + +def main(eval_spec, mcd, msd, show_bin): + samples = load_eval_spec(eval_spec) + device = "cpu" + if mcd: + print("===== Evaluate Mean Cepstral Distortion =====") + results = eval_mel_cepstral_distortion(samples, device) + print_results(results, show_bin) + if msd: + print("===== Evaluate Mean Spectral Distortion =====") + results = eval_mel_spectral_distortion(samples, device) + print_results(results, show_bin) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("eval_spec") + parser.add_argument("--mcd", action="store_true") + parser.add_argument("--msd", action="store_true") + parser.add_argument("--show-bin", action="store_true") + args = parser.parse_args() + + main(args.eval_spec, args.mcd, args.msd, args.show_bin) diff --git a/examples/speech_synthesis/evaluation/get_eval_manifest.py 
b/examples/speech_synthesis/evaluation/get_eval_manifest.py new file mode 100644 index 0000000000..44b3685bb2 --- /dev/null +++ b/examples/speech_synthesis/evaluation/get_eval_manifest.py @@ -0,0 +1,64 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import csv +from pathlib import Path + + +def main(args): + """ + `uid syn ref text` + """ + in_root = Path(args.generation_root).resolve() + ext = args.audio_format + with open(args.audio_manifest) as f, open(args.output_path, "w") as f_out: + reader = csv.DictReader( + f, delimiter="\t", quotechar=None, doublequote=False, + lineterminator="\n", quoting=csv.QUOTE_NONE + ) + header = ["id", "syn", "ref", "text", "speaker"] + f_out.write("\t".join(header) + "\n") + for row in reader: + dir_name = f"{ext}_{args.sample_rate}hz_{args.vocoder}" + id_ = row["id"] + syn = (in_root / dir_name / f"{id_}.{ext}").as_posix() + ref = row["audio"] + if args.use_resynthesized_target: + ref = (in_root / f"{dir_name}_tgt" / f"{id_}.{ext}").as_posix() + if args.eval_target: + syn = row["audio"] + sample = [id_, syn, ref, row["tgt_text"], row["speaker"]] + f_out.write("\t".join(sample) + "\n") + print(f"wrote evaluation file to {args.output_path}") + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + "--generation-root", help="output directory for generate_waveform.py" + ) + parser.add_argument( + "--audio-manifest", + help="used to determine the original utterance ID and text" + ) + parser.add_argument( + "--output-path", help="path to output evaluation spec file" + ) + parser.add_argument( + "--use-resynthesized-target", action="store_true", + help="use resynthesized reference instead of the original audio" + ) + parser.add_argument( + "--eval-target", action="store_true", + help="evaluate reference instead of model prediction" + ) + parser.add_argument("--vocoder", type=str, default="griffin_lim") + parser.add_argument("--sample-rate", type=int, default=22_050) + parser.add_argument("--audio-format", type=str, default="wav") + args = parser.parse_args() + + main(args) diff --git a/examples/speech_synthesis/generate_waveform.py b/examples/speech_synthesis/generate_waveform.py new file mode 100644 index 0000000000..3b56190dbe --- /dev/null +++ b/examples/speech_synthesis/generate_waveform.py @@ -0,0 +1,192 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
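+
+"""Generate outputs from a trained text-to-speech model: predicted features,
+vocoded waveforms, attention/EOS-probability plots and, optionally, the
+(re-synthesized) target audio for evaluation."""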
+ +import ast +import logging +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path +import soundfile as sf +import sys +import torch +import torchaudio + +from fairseq import checkpoint_utils, options, tasks, utils +from fairseq.logging import progress_bar +from fairseq.tasks.text_to_speech import plot_tts_output +from fairseq.data.audio.text_to_speech_dataset import TextToSpeechDataset + + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def make_parser(): + parser = options.get_speech_generation_parser() + parser.add_argument("--dump-features", action="store_true") + parser.add_argument("--dump-waveforms", action="store_true") + parser.add_argument("--dump-attentions", action="store_true") + parser.add_argument("--dump-eos-probs", action="store_true") + parser.add_argument("--dump-plots", action="store_true") + parser.add_argument("--dump-target", action="store_true") + parser.add_argument("--output-sample-rate", default=22050, type=int) + parser.add_argument("--teacher-forcing", action="store_true") + parser.add_argument( + "--audio-format", type=str, default="wav", choices=["wav", "flac"] + ) + return parser + + +def postprocess_results( + dataset: TextToSpeechDataset, sample, hypos, resample_fn, dump_target +): + def to_np(x): + return None if x is None else x.detach().cpu().numpy() + + sample_ids = [dataset.ids[i] for i in sample["id"].tolist()] + texts = sample["src_texts"] if "src_texts" in sample else [""] * len(hypos) + attns = [to_np(hypo["attn"]) for hypo in hypos] + eos_probs = [to_np(hypo.get("eos_prob", None)) for hypo in hypos] + feat_preds = [to_np(hypo["feature"]) for hypo in hypos] + wave_preds = [to_np(resample_fn(h["waveform"])) for h in hypos] + if dump_target: + feat_targs = [to_np(hypo["targ_feature"]) for hypo in hypos] + wave_targs = [to_np(resample_fn(h["targ_waveform"])) for h in hypos] + else: + feat_targs = [None for _ in hypos] + wave_targs = [None for _ in hypos] + + return zip(sample_ids, texts, attns, eos_probs, feat_preds, wave_preds, + feat_targs, wave_targs) + + +def dump_result( + is_na_model, + args, + vocoder, + sample_id, + text, + attn, + eos_prob, + feat_pred, + wave_pred, + feat_targ, + wave_targ, +): + sample_rate = args.output_sample_rate + out_root = Path(args.results_path) + if args.dump_features: + feat_dir = out_root / "feat" + feat_dir.mkdir(exist_ok=True, parents=True) + np.save(feat_dir / f"{sample_id}.npy", feat_pred) + if args.dump_target: + feat_tgt_dir = out_root / "feat_tgt" + feat_tgt_dir.mkdir(exist_ok=True, parents=True) + np.save(feat_tgt_dir / f"{sample_id}.npy", feat_targ) + if args.dump_attentions: + attn_dir = out_root / "attn" + attn_dir.mkdir(exist_ok=True, parents=True) + np.save(attn_dir / f"{sample_id}.npy", attn.numpy()) + if args.dump_eos_probs and not is_na_model: + eos_dir = out_root / "eos" + eos_dir.mkdir(exist_ok=True, parents=True) + np.save(eos_dir / f"{sample_id}.npy", eos_prob) + + if args.dump_plots: + images = [feat_pred.T] if is_na_model else [feat_pred.T, attn] + names = ["output"] if is_na_model else ["output", "alignment"] + if feat_targ is not None: + images = [feat_targ.T] + images + names = [f"target (idx={sample_id})"] + names + if is_na_model: + plot_tts_output(images, names, attn, "alignment", suptitle=text) + else: + plot_tts_output(images, names, eos_prob, "eos prob", suptitle=text) + plot_dir = out_root / "plot" + plot_dir.mkdir(exist_ok=True, parents=True) + 
plt.savefig(plot_dir / f"{sample_id}.png") + plt.close() + + if args.dump_waveforms: + ext = args.audio_format + if wave_pred is not None: + wav_dir = out_root / f"{ext}_{sample_rate}hz_{vocoder}" + wav_dir.mkdir(exist_ok=True, parents=True) + sf.write(wav_dir / f"{sample_id}.{ext}", wave_pred, sample_rate) + if args.dump_target and wave_targ is not None: + wav_tgt_dir = out_root / f"{ext}_{sample_rate}hz_{vocoder}_tgt" + wav_tgt_dir.mkdir(exist_ok=True, parents=True) + sf.write(wav_tgt_dir / f"{sample_id}.{ext}", wave_targ, sample_rate) + + +def main(args): + assert(args.dump_features or args.dump_waveforms or args.dump_attentions + or args.dump_eos_probs or args.dump_plots) + if args.max_tokens is None and args.batch_size is None: + args.max_tokens = 8000 + logger.info(args) + + use_cuda = torch.cuda.is_available() and not args.cpu + task = tasks.setup_task(args) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [args.path], + task=task, + arg_overrides=ast.literal_eval(args.model_overrides), + ) + model = models[0].cuda() if use_cuda else models[0] + # use the original n_frames_per_step + task.args.n_frames_per_step = saved_cfg.task.n_frames_per_step + task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task) + + data_cfg = task.data_cfg + sample_rate = data_cfg.config.get("features", {}).get("sample_rate", 22050) + resample_fn = { + False: lambda x: x, + True: lambda x: torchaudio.sox_effects.apply_effects_tensor( + x.detach().cpu().unsqueeze(0), sample_rate, + [['rate', str(args.output_sample_rate)]] + )[0].squeeze(0) + }.get(args.output_sample_rate != sample_rate) + if args.output_sample_rate != sample_rate: + logger.info(f"resampling to {args.output_sample_rate}Hz") + + generator = task.build_generator([model], args) + itr = task.get_batch_iterator( + dataset=task.dataset(args.gen_subset), + max_tokens=args.max_tokens, + max_sentences=args.batch_size, + max_positions=(sys.maxsize, sys.maxsize), + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=args.required_batch_size_multiple, + num_shards=args.num_shards, + shard_id=args.shard_id, + num_workers=args.num_workers, + data_buffer_size=args.data_buffer_size, + ).next_epoch_itr(shuffle=False) + + Path(args.results_path).mkdir(exist_ok=True, parents=True) + is_na_model = getattr(model, "NON_AUTOREGRESSIVE", False) + dataset = task.dataset(args.gen_subset) + vocoder = task.args.vocoder + with progress_bar.build_progress_bar(args, itr) as t: + for sample in t: + sample = utils.move_to_cuda(sample) if use_cuda else sample + hypos = generator.generate(model, sample, has_targ=args.dump_target) + for result in postprocess_results( + dataset, sample, hypos, resample_fn, args.dump_target + ): + dump_result(is_na_model, args, vocoder, *result) + + +def cli_main(): + parser = make_parser() + args = options.parse_args_and_arch(parser) + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/speech_synthesis/preprocessing/__init__.py b/examples/speech_synthesis/preprocessing/__init__.py new file mode 100644 index 0000000000..6264236915 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
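Editorial note, not part of the patch: a rough sketch of how generate_waveform.py above might be invoked once a TTS checkpoint and a data directory with config.yaml and the {split}.tsv manifests exist. The flag names follow common fairseq conventions plus the options added in make_parser() above, but the authoritative command is the one in the example's README; all paths and the checkpoint name are placeholders.

import sys

from examples.speech_synthesis.generate_waveform import cli_main

# Hypothetical invocation: dump vocoded waveforms for the test split.
sys.argv = [
    "generate_waveform.py",
    "data/ljspeech_features",                 # holds config.yaml and test.tsv
    "--config-yaml", "config.yaml",
    "--task", "text_to_speech",
    "--gen-subset", "test",
    "--path", "checkpoints/tts_transformer_ljspeech.pt",
    "--max-tokens", "30000",
    "--dump-waveforms",                       # writes <ext>_<rate>hz_<vocoder>/<id>.<ext>
    "--results-path", "outputs/ljspeech_generation",
]
cli_main()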
diff --git a/examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py b/examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py new file mode 100644 index 0000000000..4e13b38a5d --- /dev/null +++ b/examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py @@ -0,0 +1,204 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os +import csv +import tempfile +from collections import defaultdict +from pathlib import Path + +import torchaudio +try: + import webrtcvad +except ImportError: + raise ImportError("Please install py-webrtcvad: pip install webrtcvad") +import pandas as pd +from tqdm import tqdm + +from examples.speech_synthesis.preprocessing.denoiser.pretrained import master64 +import examples.speech_synthesis.preprocessing.denoiser.utils as utils +from examples.speech_synthesis.preprocessing.vad import ( + frame_generator, vad_collector, read_wave, write_wave, FS_MS, THRESHOLD, + SCALE +) +from examples.speech_to_text.data_utils import save_df_to_tsv + + +log = logging.getLogger(__name__) + +PATHS = ["after_denoise", "after_vad"] +MIN_T = 0.05 + + +def generate_tmp_filename(extension="txt"): + return tempfile._get_default_tempdir() + "/" + \ + next(tempfile._get_candidate_names()) + "." + extension + + +def convert_sr(inpath, sr, output_path=None): + if not output_path: + output_path = generate_tmp_filename("wav") + cmd = f"sox {inpath} -r {sr} {output_path}" + os.system(cmd) + return output_path + + +def apply_vad(vad, inpath): + audio, sample_rate = read_wave(inpath) + frames = frame_generator(FS_MS, audio, sample_rate) + frames = list(frames) + segments = vad_collector(sample_rate, FS_MS, 300, vad, frames) + merge_segments = list() + timestamp_start = 0.0 + timestamp_end = 0.0 + # removing start, end, and long sequences of sils + for i, segment in enumerate(segments): + merge_segments.append(segment[0]) + if i and timestamp_start: + sil_duration = segment[1] - timestamp_end + if sil_duration > THRESHOLD: + merge_segments.append(int(THRESHOLD / SCALE) * (b'\x00')) + else: + merge_segments.append(int((sil_duration / SCALE)) * (b'\x00')) + timestamp_start = segment[1] + timestamp_end = segment[2] + segment = b''.join(merge_segments) + return segment, sample_rate + + +def write(wav, filename, sr=16_000): + # Normalize audio if it prevents clipping + wav = wav / max(wav.abs().max().item(), 1) + torchaudio.save(filename, wav.cpu(), sr, encoding="PCM_S", + bits_per_sample=16) + + +def process(args): + # making sure we are requested either denoise or vad + if not args.denoise and not args.vad: + log.error("No denoise or vad is requested.") + return + + log.info("Creating out directories...") + if args.denoise: + out_denoise = Path(args.output_dir).absolute().joinpath(PATHS[0]) + out_denoise.mkdir(parents=True, exist_ok=True) + if args.vad: + out_vad = Path(args.output_dir).absolute().joinpath(PATHS[1]) + out_vad.mkdir(parents=True, exist_ok=True) + + log.info("Loading pre-trained speech enhancement model...") + model = master64().to(args.device) + + log.info("Building the VAD model...") + vad = webrtcvad.Vad(int(args.vad_agg_level)) + + # preparing the output dict + output_dict = defaultdict(list) + + log.info(f"Parsing input manifest: {args.audio_manifest}") + with open(args.audio_manifest, "r") as f: + manifest_dict = csv.DictReader(f, delimiter="\t") + for row in tqdm(manifest_dict): + filename 
= str(row["audio"]) + + final_output = filename + keep_sample = True + n_frames = row["n_frames"] + snr = -1 + if args.denoise: + output_path_denoise = out_denoise.joinpath(Path(filename).name) + # convert to 16khz in case we use a differet sr + tmp_path = convert_sr(final_output, 16000) + + # loading audio file and generating the enhanced version + out, sr = torchaudio.load(tmp_path) + out = out.to(args.device) + estimate = model(out) + estimate = (1 - args.dry_wet) * estimate + args.dry_wet * out + write(estimate[0], str(output_path_denoise), sr) + + snr = utils.cal_snr(out, estimate) + snr = snr.cpu().detach().numpy()[0][0] + final_output = str(output_path_denoise) + + if args.vad: + output_path_vad = out_vad.joinpath(Path(filename).name) + sr = torchaudio.info(final_output).sample_rate + if sr in [16000, 32000, 48000]: + tmp_path = final_output + elif sr < 16000: + tmp_path = convert_sr(final_output, 16000) + elif sr < 32000: + tmp_path = convert_sr(final_output, 32000) + else: + tmp_path = convert_sr(final_output, 48000) + # apply VAD + segment, sample_rate = apply_vad(vad, tmp_path) + if len(segment) < sample_rate * MIN_T: + keep_sample = False + print(( + f"WARNING: skip {filename} because it is too short " + f"after VAD ({len(segment) / sample_rate} < {MIN_T})" + )) + else: + if sample_rate != sr: + tmp_path = generate_tmp_filename("wav") + write_wave(tmp_path, segment, sample_rate) + convert_sr(tmp_path, sr, + output_path=str(output_path_vad)) + else: + write_wave(str(output_path_vad), segment, sample_rate) + final_output = str(output_path_vad) + segment, _ = torchaudio.load(final_output) + n_frames = segment.size(1) + + if keep_sample: + output_dict["id"].append(row["id"]) + output_dict["audio"].append(final_output) + output_dict["n_frames"].append(n_frames) + output_dict["tgt_text"].append(row["tgt_text"]) + output_dict["speaker"].append(row["speaker"]) + output_dict["src_text"].append(row["src_text"]) + output_dict["snr"].append(snr) + + out_tsv_path = Path(args.output_dir) / Path(args.audio_manifest).name + log.info(f"Saving manifest to {out_tsv_path.as_posix()}") + save_df_to_tsv(pd.DataFrame.from_dict(output_dict), out_tsv_path) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--audio-manifest", "-i", required=True, + type=str, help="path to the input manifest.") + parser.add_argument( + "--output-dir", "-o", required=True, type=str, + help="path to the output dir. it will contain files after denoising and" + " vad" + ) + parser.add_argument("--vad-agg-level", "-a", type=int, default=2, + help="the aggresive level of the vad [0-3].") + parser.add_argument( + "--dry-wet", "-dw", type=float, default=0.01, + help="the level of linear interpolation between noisy and enhanced " + "files." + ) + parser.add_argument( + "--device", "-d", type=str, default="cpu", + help="the device to be used for the speech enhancement model: " + "cpu | cuda." + ) + parser.add_argument("--denoise", action="store_true", + help="apply a denoising") + parser.add_argument("--vad", action="store_true", help="apply a VAD") + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_synthesis/preprocessing/denoiser/__init__.py b/examples/speech_synthesis/preprocessing/denoiser/__init__.py new file mode 100644 index 0000000000..6264236915 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/denoiser/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/examples/speech_synthesis/preprocessing/denoiser/demucs.py b/examples/speech_synthesis/preprocessing/denoiser/demucs.py new file mode 100644 index 0000000000..3f70e73d6a --- /dev/null +++ b/examples/speech_synthesis/preprocessing/denoiser/demucs.py @@ -0,0 +1,473 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# author: adefossez + +import math +import time + +import torch as th +from torch import nn +from torch.nn import functional as F + +from .resample import downsample2, upsample2 +from .utils import capture_init + + +class BLSTM(nn.Module): + def __init__(self, dim, layers=2, bi=True): + super().__init__() + klass = nn.LSTM + self.lstm = klass( + bidirectional=bi, num_layers=layers, hidden_size=dim, input_size=dim + ) + self.linear = None + if bi: + self.linear = nn.Linear(2 * dim, dim) + + def forward(self, x, hidden=None): + x, hidden = self.lstm(x, hidden) + if self.linear: + x = self.linear(x) + return x, hidden + + +def rescale_conv(conv, reference): + std = conv.weight.std().detach() + scale = (std / reference)**0.5 + conv.weight.data /= scale + if conv.bias is not None: + conv.bias.data /= scale + + +def rescale_module(module, reference): + for sub in module.modules(): + if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): + rescale_conv(sub, reference) + + +class Demucs(nn.Module): + """ + Demucs speech enhancement model. + Args: + - chin (int): number of input channels. + - chout (int): number of output channels. + - hidden (int): number of initial hidden channels. + - depth (int): number of layers. + - kernel_size (int): kernel size for each layer. + - stride (int): stride for each layer. + - causal (bool): if false, uses BiLSTM instead of LSTM. + - resample (int): amount of resampling to apply to the input/output. + Can be one of 1, 2 or 4. + - growth (float): number of channels is multiplied by this for every layer. + - max_hidden (int): maximum number of channels. Can be useful to + control the size/speed of the model. + - normalize (bool): if true, normalize the input. + - glu (bool): if true uses GLU instead of ReLU in 1x1 convolutions. + - rescale (float): controls custom weight initialization. + See https://arxiv.org/abs/1911.13254. + - floor (float): stability flooring when normalizing. 
+ + """ + @capture_init + def __init__(self, + chin=1, + chout=1, + hidden=48, + depth=5, + kernel_size=8, + stride=4, + causal=True, + resample=4, + growth=2, + max_hidden=10_000, + normalize=True, + glu=True, + rescale=0.1, + floor=1e-3): + + super().__init__() + if resample not in [1, 2, 4]: + raise ValueError("Resample should be 1, 2 or 4.") + + self.chin = chin + self.chout = chout + self.hidden = hidden + self.depth = depth + self.kernel_size = kernel_size + self.stride = stride + self.causal = causal + self.floor = floor + self.resample = resample + self.normalize = normalize + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + activation = nn.GLU(1) if glu else nn.ReLU() + ch_scale = 2 if glu else 1 + + for index in range(depth): + encode = [] + encode += [ + nn.Conv1d(chin, hidden, kernel_size, stride), + nn.ReLU(), + nn.Conv1d(hidden, hidden * ch_scale, 1), activation, + ] + self.encoder.append(nn.Sequential(*encode)) + + decode = [] + decode += [ + nn.Conv1d(hidden, ch_scale * hidden, 1), activation, + nn.ConvTranspose1d(hidden, chout, kernel_size, stride), + ] + if index > 0: + decode.append(nn.ReLU()) + self.decoder.insert(0, nn.Sequential(*decode)) + chout = hidden + chin = hidden + hidden = min(int(growth * hidden), max_hidden) + + self.lstm = BLSTM(chin, bi=not causal) + if rescale: + rescale_module(self, reference=rescale) + + def valid_length(self, length): + """ + Return the nearest valid length to use with the model so that + there is no time steps left over in a convolutions, e.g. for all + layers, size of the input - kernel_size % stride = 0. + + If the mixture has a valid length, the estimated sources + will have exactly the same length. + """ + length = math.ceil(length * self.resample) + for _ in range(self.depth): + length = math.ceil((length - self.kernel_size) / self.stride) + 1 + length = max(length, 1) + for _ in range(self.depth): + length = (length - 1) * self.stride + self.kernel_size + length = int(math.ceil(length / self.resample)) + return int(length) + + @property + def total_stride(self): + return self.stride ** self.depth // self.resample + + def forward(self, mix): + if mix.dim() == 2: + mix = mix.unsqueeze(1) + + if self.normalize: + mono = mix.mean(dim=1, keepdim=True) + std = mono.std(dim=-1, keepdim=True) + mix = mix / (self.floor + std) + else: + std = 1 + length = mix.shape[-1] + x = mix + x = F.pad(x, (0, self.valid_length(length) - length)) + if self.resample == 2: + x = upsample2(x) + elif self.resample == 4: + x = upsample2(x) + x = upsample2(x) + skips = [] + for encode in self.encoder: + x = encode(x) + skips.append(x) + x = x.permute(2, 0, 1) + x, _ = self.lstm(x) + x = x.permute(1, 2, 0) + for decode in self.decoder: + skip = skips.pop(-1) + x = x + skip[..., :x.shape[-1]] + x = decode(x) + if self.resample == 2: + x = downsample2(x) + elif self.resample == 4: + x = downsample2(x) + x = downsample2(x) + + x = x[..., :length] + return std * x + + +def fast_conv(conv, x): + """ + Faster convolution evaluation if either kernel size is 1 + or length of sequence is 1. 
+ """ + batch, chin, length = x.shape + chout, chin, kernel = conv.weight.shape + assert batch == 1 + if kernel == 1: + x = x.view(chin, length) + out = th.addmm(conv.bias.view(-1, 1), + conv.weight.view(chout, chin), x) + elif length == kernel: + x = x.view(chin * kernel, 1) + out = th.addmm(conv.bias.view(-1, 1), + conv.weight.view(chout, chin * kernel), x) + else: + out = conv(x) + return out.view(batch, chout, -1) + + +class DemucsStreamer: + """ + Streaming implementation for Demucs. It supports being fed with any amount + of audio at a time. You will get back as much audio as possible at that + point. + + Args: + - demucs (Demucs): Demucs model. + - dry (float): amount of dry (e.g. input) signal to keep. 0 is maximum + noise removal, 1 just returns the input signal. Small values > 0 + allows to limit distortions. + - num_frames (int): number of frames to process at once. Higher values + will increase overall latency but improve the real time factor. + - resample_lookahead (int): extra lookahead used for the resampling. + - resample_buffer (int): size of the buffer of previous inputs/outputs + kept for resampling. + """ + def __init__(self, demucs, + dry=0, + num_frames=1, + resample_lookahead=64, + resample_buffer=256): + device = next(iter(demucs.parameters())).device + self.demucs = demucs + self.lstm_state = None + self.conv_state = None + self.dry = dry + self.resample_lookahead = resample_lookahead + resample_buffer = min(demucs.total_stride, resample_buffer) + self.resample_buffer = resample_buffer + self.frame_length = demucs.valid_length(1) + \ + demucs.total_stride * (num_frames - 1) + self.total_length = self.frame_length + self.resample_lookahead + self.stride = demucs.total_stride * num_frames + self.resample_in = th.zeros(demucs.chin, resample_buffer, device=device) + self.resample_out = th.zeros( + demucs.chin, resample_buffer, device=device + ) + + self.frames = 0 + self.total_time = 0 + self.variance = 0 + self.pending = th.zeros(demucs.chin, 0, device=device) + + bias = demucs.decoder[0][2].bias + weight = demucs.decoder[0][2].weight + chin, chout, kernel = weight.shape + self._bias = bias.view(-1, 1).repeat(1, kernel).view(-1, 1) + self._weight = weight.permute(1, 2, 0).contiguous() + + def reset_time_per_frame(self): + self.total_time = 0 + self.frames = 0 + + @property + def time_per_frame(self): + return self.total_time / self.frames + + def flush(self): + """ + Flush remaining audio by padding it with zero. Call this + when you have no more input and want to get back the last chunk of audio. + """ + pending_length = self.pending.shape[1] + padding = th.zeros( + self.demucs.chin, self.total_length, device=self.pending.device + ) + out = self.feed(padding) + return out[:, :pending_length] + + def feed(self, wav): + """ + Apply the model to mix using true real time evaluation. + Normalization is done online as is the resampling. 
+ """ + begin = time.time() + demucs = self.demucs + resample_buffer = self.resample_buffer + stride = self.stride + resample = demucs.resample + + if wav.dim() != 2: + raise ValueError("input wav should be two dimensional.") + chin, _ = wav.shape + if chin != demucs.chin: + raise ValueError(f"Expected {demucs.chin} channels, got {chin}") + + self.pending = th.cat([self.pending, wav], dim=1) + outs = [] + while self.pending.shape[1] >= self.total_length: + self.frames += 1 + frame = self.pending[:, :self.total_length] + dry_signal = frame[:, :stride] + if demucs.normalize: + mono = frame.mean(0) + variance = (mono**2).mean() + self.variance = variance / self.frames + \ + (1 - 1 / self.frames) * self.variance + frame = frame / (demucs.floor + math.sqrt(self.variance)) + frame = th.cat([self.resample_in, frame], dim=-1) + self.resample_in[:] = frame[:, stride - resample_buffer:stride] + + if resample == 4: + frame = upsample2(upsample2(frame)) + elif resample == 2: + frame = upsample2(frame) + # remove pre sampling buffer + frame = frame[:, resample * resample_buffer:] + # remove extra samples after window + frame = frame[:, :resample * self.frame_length] + + out, extra = self._separate_frame(frame) + padded_out = th.cat([self.resample_out, out, extra], 1) + self.resample_out[:] = out[:, -resample_buffer:] + if resample == 4: + out = downsample2(downsample2(padded_out)) + elif resample == 2: + out = downsample2(padded_out) + else: + out = padded_out + + out = out[:, resample_buffer // resample:] + out = out[:, :stride] + + if demucs.normalize: + out *= math.sqrt(self.variance) + out = self.dry * dry_signal + (1 - self.dry) * out + outs.append(out) + self.pending = self.pending[:, stride:] + + self.total_time += time.time() - begin + if outs: + out = th.cat(outs, 1) + else: + out = th.zeros(chin, 0, device=wav.device) + return out + + def _separate_frame(self, frame): + demucs = self.demucs + skips = [] + next_state = [] + first = self.conv_state is None + stride = self.stride * demucs.resample + x = frame[None] + for idx, encode in enumerate(demucs.encoder): + stride //= demucs.stride + length = x.shape[2] + if idx == demucs.depth - 1: + # This is sligthly faster for the last conv + x = fast_conv(encode[0], x) + x = encode[1](x) + x = fast_conv(encode[2], x) + x = encode[3](x) + else: + if not first: + prev = self.conv_state.pop(0) + prev = prev[..., stride:] + tgt = (length - demucs.kernel_size) // demucs.stride + 1 + missing = tgt - prev.shape[-1] + offset = length - demucs.kernel_size - \ + demucs.stride * (missing - 1) + x = x[..., offset:] + x = encode[1](encode[0](x)) + x = fast_conv(encode[2], x) + x = encode[3](x) + if not first: + x = th.cat([prev, x], -1) + next_state.append(x) + skips.append(x) + + x = x.permute(2, 0, 1) + x, self.lstm_state = demucs.lstm(x, self.lstm_state) + x = x.permute(1, 2, 0) + # In the following, x contains only correct samples, i.e. the one + # for which each time position is covered by two window of the upper + # layer. extra contains extra samples to the right, and is used only as + # a better padding for the online resampling. 
+ extra = None + for idx, decode in enumerate(demucs.decoder): + skip = skips.pop(-1) + x += skip[..., :x.shape[-1]] + x = fast_conv(decode[0], x) + x = decode[1](x) + + if extra is not None: + skip = skip[..., x.shape[-1]:] + extra += skip[..., :extra.shape[-1]] + extra = decode[2](decode[1](decode[0](extra))) + x = decode[2](x) + next_state.append( + x[..., -demucs.stride:] - decode[2].bias.view(-1, 1) + ) + if extra is None: + extra = x[..., -demucs.stride:] + else: + extra[..., :demucs.stride] += next_state[-1] + x = x[..., :-demucs.stride] + + if not first: + prev = self.conv_state.pop(0) + x[..., :demucs.stride] += prev + if idx != demucs.depth - 1: + x = decode[3](x) + extra = decode[3](extra) + self.conv_state = next_state + return x[0], extra[0] + + +def test(): + import argparse + parser = argparse.ArgumentParser( + "denoiser.demucs", + description="Benchmark the streaming Demucs implementation, as well as " + "checking the delta with the offline implementation.") + parser.add_argument("--depth", default=5, type=int) + parser.add_argument("--resample", default=4, type=int) + parser.add_argument("--hidden", default=48, type=int) + parser.add_argument("--sample_rate", default=16000, type=float) + parser.add_argument("--device", default="cpu") + parser.add_argument("-t", "--num_threads", type=int) + parser.add_argument("-f", "--num_frames", type=int, default=1) + args = parser.parse_args() + if args.num_threads: + th.set_num_threads(args.num_threads) + sr = args.sample_rate + sr_ms = sr / 1000 + demucs = Demucs( + depth=args.depth, hidden=args.hidden, resample=args.resample + ).to(args.device) + x = th.randn(1, int(sr * 4)).to(args.device) + out = demucs(x[None])[0] + streamer = DemucsStreamer(demucs, num_frames=args.num_frames) + out_rt = [] + frame_size = streamer.total_length + with th.no_grad(): + while x.shape[1] > 0: + out_rt.append(streamer.feed(x[:, :frame_size])) + x = x[:, frame_size:] + frame_size = streamer.demucs.total_stride + out_rt.append(streamer.flush()) + out_rt = th.cat(out_rt, 1) + model_size = sum(p.numel() for p in demucs.parameters()) * 4 / 2**20 + initial_lag = streamer.total_length / sr_ms + tpf = 1000 * streamer.time_per_frame + print(f"model size: {model_size:.1f}MB, ", end='') + print(f"delta batch/streaming: {th.norm(out - out_rt) / th.norm(out):.2%}") + print(f"initial lag: {initial_lag:.1f}ms, ", end='') + print(f"stride: {streamer.stride * args.num_frames / sr_ms:.1f}ms") + print(f"time per frame: {tpf:.1f}ms, ", end='') + rtf = (1000 * streamer.time_per_frame) / (streamer.stride / sr_ms) + print(f"RTF: {rtf:.2f}") + print(f"Total lag with computation: {initial_lag + tpf:.1f}ms") + + +if __name__ == "__main__": + test() diff --git a/examples/speech_synthesis/preprocessing/denoiser/pretrained.py b/examples/speech_synthesis/preprocessing/denoiser/pretrained.py new file mode 100644 index 0000000000..2fa846075b --- /dev/null +++ b/examples/speech_synthesis/preprocessing/denoiser/pretrained.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# author: adefossez + +import logging + +import torch.hub + +from .demucs import Demucs +from .utils import deserialize_model + +logger = logging.getLogger(__name__) +ROOT = "https://dl.fbaipublicfiles.com/adiyoss/denoiser/" +DNS_48_URL = ROOT + "dns48-11decc9d8e3f0998.th" +DNS_64_URL = ROOT + "dns64-a7761ff99a7d5bb6.th" +MASTER_64_URL = ROOT + "master64-8a5dfb4bb92753dd.th" + + +def _demucs(pretrained, url, **kwargs): + model = Demucs(**kwargs) + if pretrained: + state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu') + model.load_state_dict(state_dict) + return model + + +def dns48(pretrained=True): + return _demucs(pretrained, DNS_48_URL, hidden=48) + + +def dns64(pretrained=True): + return _demucs(pretrained, DNS_64_URL, hidden=64) + + +def master64(pretrained=True): + return _demucs(pretrained, MASTER_64_URL, hidden=64) + + +def add_model_flags(parser): + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument( + "-m", "--model_path", help="Path to local trained model." + ) + group.add_argument( + "--dns48", action="store_true", + help="Use pre-trained real time H=48 model trained on DNS." + ) + group.add_argument( + "--dns64", action="store_true", + help="Use pre-trained real time H=64 model trained on DNS." + ) + group.add_argument( + "--master64", action="store_true", + help="Use pre-trained real time H=64 model trained on DNS and Valentini." + ) + + +def get_model(args): + """ + Load local model package or torchhub pre-trained model. + """ + if args.model_path: + logger.info("Loading model from %s", args.model_path) + pkg = torch.load(args.model_path) + model = deserialize_model(pkg) + elif args.dns64: + logger.info("Loading pre-trained real time H=64 model trained on DNS.") + model = dns64() + elif args.master64: + logger.info( + "Loading pre-trained real time H=64 model trained on DNS and Valentini." + ) + model = master64() + else: + logger.info("Loading pre-trained real time H=48 model trained on DNS.") + model = dns48() + logger.debug(model) + return model diff --git a/examples/speech_synthesis/preprocessing/denoiser/resample.py b/examples/speech_synthesis/preprocessing/denoiser/resample.py new file mode 100644 index 0000000000..1222addc42 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/denoiser/resample.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# author: adefossez + +import math + +import torch as th +from torch.nn import functional as F + + +def sinc(t): + """sinc. + + :param t: the input tensor + """ + return th.where(t == 0, th.tensor(1., device=t.device, dtype=t.dtype), + th.sin(t) / t) + + +def kernel_upsample2(zeros=56): + """kernel_upsample2. + + """ + win = th.hann_window(4 * zeros + 1, periodic=False) + winodd = win[1::2] + t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros) + t *= math.pi + kernel = (sinc(t) * winodd).view(1, 1, -1) + return kernel + + +def upsample2(x, zeros=56): + """ + Upsampling the input by 2 using sinc interpolation. + Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method." + ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing. + Vol. 9. IEEE, 1984. 
+ """ + *other, time = x.shape + kernel = kernel_upsample2(zeros).to(x) + out = F.conv1d(x.view(-1, 1, time), kernel, padding=zeros)[..., 1:].view( + *other, time + ) + y = th.stack([x, out], dim=-1) + return y.view(*other, -1) + + +def kernel_downsample2(zeros=56): + """kernel_downsample2. + + """ + win = th.hann_window(4 * zeros + 1, periodic=False) + winodd = win[1::2] + t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros) + t.mul_(math.pi) + kernel = (sinc(t) * winodd).view(1, 1, -1) + return kernel + + +def downsample2(x, zeros=56): + """ + Downsampling the input by 2 using sinc interpolation. + Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method." + ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing. + Vol. 9. IEEE, 1984. + """ + if x.shape[-1] % 2 != 0: + x = F.pad(x, (0, 1)) + xeven = x[..., ::2] + xodd = x[..., 1::2] + *other, time = xodd.shape + kernel = kernel_downsample2(zeros).to(x) + out = xeven + F.conv1d( + xodd.view(-1, 1, time), kernel, padding=zeros + )[..., :-1].view(*other, time) + return out.view(*other, -1).mul(0.5) diff --git a/examples/speech_synthesis/preprocessing/denoiser/utils.py b/examples/speech_synthesis/preprocessing/denoiser/utils.py new file mode 100644 index 0000000000..734d047f1b --- /dev/null +++ b/examples/speech_synthesis/preprocessing/denoiser/utils.py @@ -0,0 +1,176 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# author: adefossez + +import functools +import logging +from contextlib import contextmanager +import inspect +import time + +logger = logging.getLogger(__name__) + +EPS = 1e-8 + + +def capture_init(init): + """capture_init. + + Decorate `__init__` with this, and you can then + recover the *args and **kwargs passed to it in `self._init_args_kwargs` + """ + @functools.wraps(init) + def __init__(self, *args, **kwargs): + self._init_args_kwargs = (args, kwargs) + init(self, *args, **kwargs) + + return __init__ + + +def deserialize_model(package, strict=False): + """deserialize_model. + + """ + klass = package['class'] + if strict: + model = klass(*package['args'], **package['kwargs']) + else: + sig = inspect.signature(klass) + kw = package['kwargs'] + for key in list(kw): + if key not in sig.parameters: + logger.warning("Dropping inexistant parameter %s", key) + del kw[key] + model = klass(*package['args'], **kw) + model.load_state_dict(package['state']) + return model + + +def copy_state(state): + return {k: v.cpu().clone() for k, v in state.items()} + + +def serialize_model(model): + args, kwargs = model._init_args_kwargs + state = copy_state(model.state_dict()) + return {"class": model.__class__, "args": args, "kwargs": kwargs, "state": state} + + +@contextmanager +def swap_state(model, state): + """ + Context manager that swaps the state of a model, e.g: + + # model is in old state + with swap_state(model, new_state): + # model in new state + # model back to old state + """ + old_state = copy_state(model.state_dict()) + model.load_state_dict(state) + try: + yield + finally: + model.load_state_dict(old_state) + + +def pull_metric(history, name): + out = [] + for metrics in history: + if name in metrics: + out.append(metrics[name]) + return out + + +class LogProgress: + """ + Sort of like tqdm but using log lines and not as real time. 
+ Args: + - logger: logger obtained from `logging.getLogger`, + - iterable: iterable object to wrap + - updates (int): number of lines that will be printed, e.g. + if `updates=5`, log every 1/5th of the total length. + - total (int): length of the iterable, in case it does not support + `len`. + - name (str): prefix to use in the log. + - level: logging level (like `logging.INFO`). + """ + def __init__(self, + logger, + iterable, + updates=5, + total=None, + name="LogProgress", + level=logging.INFO): + self.iterable = iterable + self.total = total or len(iterable) + self.updates = updates + self.name = name + self.logger = logger + self.level = level + + def update(self, **infos): + self._infos = infos + + def __iter__(self): + self._iterator = iter(self.iterable) + self._index = -1 + self._infos = {} + self._begin = time.time() + return self + + def __next__(self): + self._index += 1 + try: + value = next(self._iterator) + except StopIteration: + raise + else: + return value + finally: + log_every = max(1, self.total // self.updates) + # logging is delayed by 1 it, in order to have the metrics from update + if self._index >= 1 and self._index % log_every == 0: + self._log() + + def _log(self): + self._speed = (1 + self._index) / (time.time() - self._begin) + infos = " | ".join(f"{k.capitalize()} {v}" for k, v in self._infos.items()) + if self._speed < 1e-4: + speed = "oo sec/it" + elif self._speed < 0.1: + speed = f"{1/self._speed:.1f} sec/it" + else: + speed = f"{self._speed:.1f} it/sec" + out = f"{self.name} | {self._index}/{self.total} | {speed}" + if infos: + out += " | " + infos + self.logger.log(self.level, out) + + +def colorize(text, color): + """ + Display text with some ANSI color in the terminal. + """ + code = f"\033[{color}m" + restore = "\033[0m" + return "".join([code, text, restore]) + + +def bold(text): + """ + Display text in bold in the terminal. + """ + return colorize(text, "1") + + +def cal_snr(lbl, est): + import torch + y = 10.0 * torch.log10( + torch.sum(lbl**2, dim=-1) / (torch.sum((est-lbl)**2, dim=-1) + EPS) + + EPS + ) + return y diff --git a/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py b/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py new file mode 100644 index 0000000000..a302546043 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py @@ -0,0 +1,140 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
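Editorial note, not part of the patch: the denoiser package above is self-contained, so the pretrained master64 Demucs model can be used outside of denoise_and_vad_audio.py. A minimal sketch, assuming a 16 kHz mono input file (placeholder path) and the default dry/wet ratio of 0.01 used by the preprocessing script:

import torch
import torchaudio

from examples.speech_synthesis.preprocessing.denoiser.pretrained import master64
from examples.speech_synthesis.preprocessing.denoiser.utils import cal_snr

model = master64().eval()                     # downloads MASTER_64_URL on first use
wav, sr = torchaudio.load("noisy_16khz.wav")  # (1, T), must already be 16 kHz mono
with torch.no_grad():
    enhanced = model(wav)[0]                  # (batch, chin, T) -> (1, T)
    estimate = 0.99 * enhanced + 0.01 * wav   # dry/wet mix, as in denoise_and_vad_audio.py
torchaudio.save("denoised_16khz.wav", estimate.cpu(), sr)
print(f"SNR of the estimate: {float(cal_snr(wav, estimate).mean()):.1f} dB")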
+ +import argparse +import logging +from pathlib import Path +from collections import defaultdict +from typing import List, Dict, Tuple + +import pandas as pd +import numpy as np +import torchaudio +from tqdm import tqdm + +from examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv + + +log = logging.getLogger(__name__) + +SPLITS = ["train", "dev", "test"] + + +def get_top_n( + root: Path, n_speakers: int = 10, min_n_tokens: int = 5 +) -> pd.DataFrame: + df = load_df_from_tsv(root / "validated.tsv") + df["n_tokens"] = [len(s.split()) for s in df["sentence"]] + df = df[df["n_tokens"] >= min_n_tokens] + df["n_frames"] = [ + torchaudio.info((root / "clips" / p).as_posix()).num_frames + for p in tqdm(df["path"]) + ] + df["id"] = [Path(p).stem for p in df["path"]] + total_duration_ms = df.groupby("client_id")["n_frames"].agg(["sum"]) + total_duration_ms = total_duration_ms.sort_values("sum", ascending=False) + + top_n_total_duration_ms = total_duration_ms.head(n_speakers) + top_n_client_ids = set(top_n_total_duration_ms.index.tolist()) + df_top_n = df[df["client_id"].isin(top_n_client_ids)] + return df_top_n + + +def get_splits( + df, train_split_ratio=0.99, speaker_in_all_splits=False, rand_seed=0 +) -> Tuple[Dict[str, str], List[str]]: + np.random.seed(rand_seed) + dev_split_ratio = (1. - train_split_ratio) / 3 + grouped = list(df.groupby("client_id")) + id_to_split = {} + for _, cur_df in tqdm(grouped): + cur_n_examples = len(cur_df) + if speaker_in_all_splits and cur_n_examples < 3: + continue + cur_n_train = int(cur_n_examples * train_split_ratio) + cur_n_dev = int(cur_n_examples * dev_split_ratio) + cur_n_test = cur_n_examples - cur_n_dev - cur_n_train + if speaker_in_all_splits and cur_n_dev * cur_n_test == 0: + cur_n_dev, cur_n_test = 1, 1 + cur_n_train = cur_n_examples - cur_n_dev - cur_n_test + cur_indices = cur_df.index.tolist() + cur_shuffled_indices = np.random.permutation(cur_n_examples) + cur_shuffled_indices = [cur_indices[i] for i in cur_shuffled_indices] + cur_indices_by_split = { + "train": cur_shuffled_indices[:cur_n_train], + "dev": cur_shuffled_indices[cur_n_train: cur_n_train + cur_n_dev], + "test": cur_shuffled_indices[cur_n_train + cur_n_dev:] + } + for split in SPLITS: + for i in cur_indices_by_split[split]: + id_ = df["id"].loc[i] + id_to_split[id_] = split + return id_to_split, sorted(df["client_id"].unique()) + + +def convert_to_wav(root: Path, filenames: List[str], target_sr=16_000): + out_root = root / "wav" + out_root.mkdir(exist_ok=True, parents=True) + print("Converting to WAV...") + for n in tqdm(filenames): + in_path = (root / "clips" / n).as_posix() + waveform, sr = torchaudio.load(in_path) + converted, converted_sr = torchaudio.sox_effects.apply_effects_tensor( + waveform, sr, [["rate", str(target_sr)], ["channels", "1"]] + ) + out_path = (out_root / Path(n).with_suffix(".wav").name).as_posix() + torchaudio.save(out_path, converted, converted_sr, encoding="PCM_S", + bits_per_sample=16) + + +def process(args): + data_root = Path(args.data_root).absolute() / args.lang + + # Generate TSV manifest + print("Generating manifest...") + + df_top_n = get_top_n(data_root) + id_to_split, speakers = get_splits(df_top_n) + + if args.convert_to_wav: + convert_to_wav(data_root, df_top_n["path"].tolist()) + + manifest_by_split = {split: defaultdict(list) for split in SPLITS} + for sample in tqdm(df_top_n.to_dict(orient="index").values()): + sample_id = sample["id"] + split = id_to_split[sample_id] + manifest_by_split[split]["id"].append(sample_id) + if 
args.convert_to_wav: + audio_path = data_root / "wav" / f"{sample_id}.wav" + else: + audio_path = data_root / "clips" / f"{sample_id}.mp3" + manifest_by_split[split]["audio"].append(audio_path.as_posix()) + manifest_by_split[split]["n_frames"].append(sample["n_frames"]) + manifest_by_split[split]["tgt_text"].append(sample["sentence"]) + manifest_by_split[split]["speaker"].append(sample["client_id"]) + manifest_by_split[split]["src_text"].append(sample["sentence"]) + + output_root = Path(args.output_manifest_root).absolute() + output_root.mkdir(parents=True, exist_ok=True) + for split in SPLITS: + save_df_to_tsv( + pd.DataFrame.from_dict(manifest_by_split[split]), + output_root / f"{split}.audio.tsv" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data-root", "-d", required=True, type=str) + parser.add_argument("--output-manifest-root", "-m", required=True, type=str) + parser.add_argument("--lang", "-l", required=True, type=str) + parser.add_argument("--convert-to-wav", action="store_true") + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_synthesis/preprocessing/get_feature_manifest.py b/examples/speech_synthesis/preprocessing/get_feature_manifest.py new file mode 100644 index 0000000000..4a1e119b32 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/get_feature_manifest.py @@ -0,0 +1,262 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +from pathlib import Path +import shutil +from tempfile import NamedTemporaryFile +from collections import Counter, defaultdict + +import pandas as pd +import torchaudio +from tqdm import tqdm + +from fairseq.data.audio.audio_utils import convert_waveform +from examples.speech_to_text.data_utils import ( + create_zip, + gen_config_yaml, + gen_vocab, + get_zip_manifest, + load_tsv_to_dicts, + save_df_to_tsv +) +from examples.speech_synthesis.data_utils import ( + extract_logmel_spectrogram, extract_pitch, extract_energy, get_global_cmvn, + ipa_phonemize, get_mfa_alignment, get_unit_alignment, + get_feature_value_min_max +) + + +log = logging.getLogger(__name__) + + +def process(args): + assert "train" in args.splits + out_root = Path(args.output_root).absolute() + out_root.mkdir(exist_ok=True) + + print("Fetching data...") + audio_manifest_root = Path(args.audio_manifest_root).absolute() + samples = [] + for s in args.splits: + for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"): + e["split"] = s + samples.append(e) + sample_ids = [s["id"] for s in samples] + + # Get alignment info + id_to_alignment = None + if args.textgrid_zip is not None: + assert args.id_to_units_tsv is None + id_to_alignment = get_mfa_alignment( + args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length + ) + elif args.id_to_units_tsv is not None: + # assume identical hop length on the unit sequence + id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids) + + # Extract features and pack features into ZIP + feature_name = "logmelspec80" + zip_path = out_root / f"{feature_name}.zip" + pitch_zip_path = out_root / "pitch.zip" + energy_zip_path = out_root / "energy.zip" + gcmvn_npz_path = out_root / "gcmvn_stats.npz" + if zip_path.exists() and gcmvn_npz_path.exists(): + print(f"{zip_path} and {gcmvn_npz_path} exist.") + else: + feature_root = out_root / feature_name + 
        feature_root.mkdir(exist_ok=True)
+        pitch_root = out_root / "pitch"
+        energy_root = out_root / "energy"
+        if args.add_fastspeech_targets:
+            pitch_root.mkdir(exist_ok=True)
+            energy_root.mkdir(exist_ok=True)
+        print("Extracting Mel spectrogram features...")
+        for sample in tqdm(samples):
+            waveform, sample_rate = torchaudio.load(sample["audio"])
+            waveform, sample_rate = convert_waveform(
+                waveform, sample_rate, normalize_volume=args.normalize_volume,
+                to_sample_rate=args.sample_rate
+            )
+            sample_id = sample["id"]
+            target_length = None
+            if id_to_alignment is not None:
+                a = id_to_alignment[sample_id]
+                target_length = sum(a.frame_durations)
+                if a.start_sec is not None and a.end_sec is not None:
+                    start_frame = int(a.start_sec * sample_rate)
+                    end_frame = int(a.end_sec * sample_rate)
+                    waveform = waveform[:, start_frame: end_frame]
+            extract_logmel_spectrogram(
+                waveform, sample_rate, feature_root / f"{sample_id}.npy",
+                win_length=args.win_length, hop_length=args.hop_length,
+                n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
+                f_max=args.f_max, target_length=target_length
+            )
+            if args.add_fastspeech_targets:
+                assert id_to_alignment is not None
+                extract_pitch(
+                    waveform, sample_rate, pitch_root / f"{sample_id}.npy",
+                    hop_length=args.hop_length, log_scale=True,
+                    phoneme_durations=id_to_alignment[sample_id].frame_durations
+                )
+                extract_energy(
+                    waveform, energy_root / f"{sample_id}.npy",
+                    hop_length=args.hop_length, n_fft=args.n_fft,
+                    log_scale=True,
+                    phoneme_durations=id_to_alignment[sample_id].frame_durations
+                )
+        print("ZIPing features...")
+        create_zip(feature_root, zip_path)
+        get_global_cmvn(feature_root, gcmvn_npz_path)
+        shutil.rmtree(feature_root)
+        if args.add_fastspeech_targets:
+            create_zip(pitch_root, pitch_zip_path)
+            shutil.rmtree(pitch_root)
+            create_zip(energy_root, energy_zip_path)
+            shutil.rmtree(energy_root)
+
+    print("Fetching ZIP manifest...")
+    audio_paths, audio_lengths = get_zip_manifest(zip_path)
+    pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4
+    if args.add_fastspeech_targets:
+        pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path)
+        energy_paths, energy_lengths = get_zip_manifest(energy_zip_path)
+    # Generate TSV manifest
+    print("Generating manifest...")
+    id_to_cer = None
+    if args.cer_threshold is not None:
+        assert Path(args.cer_tsv_path).is_file()
+        id_to_cer = {
+            x["id"]: float(x["uer"])
+            for x in load_tsv_to_dicts(args.cer_tsv_path)
+        }
+    manifest_by_split = {split: defaultdict(list) for split in args.splits}
+    for sample in tqdm(samples):
+        sample_id, split = sample["id"], sample["split"]
+
+        if args.snr_threshold is not None and "snr" in sample \
+                and float(sample["snr"]) < args.snr_threshold:
+            continue
+        if args.cer_threshold is not None \
+                and id_to_cer[sample_id] > args.cer_threshold:
+            continue
+
+        normalized_utt = sample["tgt_text"]
+        if id_to_alignment is not None:
+            normalized_utt = " ".join(id_to_alignment[sample_id].tokens)
+        elif args.ipa_vocab:
+            normalized_utt = ipa_phonemize(
+                normalized_utt, lang=args.lang, use_g2p=args.use_g2p
+            )
+        manifest_by_split[split]["id"].append(sample_id)
+        manifest_by_split[split]["audio"].append(audio_paths[sample_id])
+        manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id])
+        manifest_by_split[split]["tgt_text"].append(normalized_utt)
+        manifest_by_split[split]["speaker"].append(sample["speaker"])
+        manifest_by_split[split]["src_text"].append(sample["src_text"])
+        if args.add_fastspeech_targets:
+            assert id_to_alignment is not None
+            duration = " ".join(
+ str(d) for d in id_to_alignment[sample_id].frame_durations + ) + manifest_by_split[split]["duration"].append(duration) + manifest_by_split[split]["pitch"].append(pitch_paths[sample_id]) + manifest_by_split[split]["energy"].append(energy_paths[sample_id]) + for split in args.splits: + save_df_to_tsv( + pd.DataFrame.from_dict(manifest_by_split[split]), + out_root / f"{split}.tsv" + ) + # Generate vocab + vocab_name, spm_filename = None, None + if id_to_alignment is not None or args.ipa_vocab: + vocab = Counter() + for t in manifest_by_split["train"]["tgt_text"]: + vocab.update(t.split(" ")) + vocab_name = "vocab.txt" + with open(out_root / vocab_name, "w") as f: + for s, c in vocab.most_common(): + f.write(f"{s} {c}\n") + else: + spm_filename_prefix = "spm_char" + spm_filename = f"{spm_filename_prefix}.model" + with NamedTemporaryFile(mode="w") as f: + for t in manifest_by_split["train"]["tgt_text"]: + f.write(t + "\n") + f.flush() # needed to ensure gen_vocab sees dumped text + gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char") + # Generate speaker list + speakers = sorted({sample["speaker"] for sample in samples}) + speakers_path = out_root / "speakers.txt" + with open(speakers_path, "w") as f: + for speaker in speakers: + f.write(f"{speaker}\n") + # Generate config YAML + win_len_t = args.win_length / args.sample_rate + hop_len_t = args.hop_length / args.sample_rate + extra = { + "sample_rate": args.sample_rate, + "features": { + "type": "spectrogram+melscale+log", + "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft, + "window_fn": "hann", "win_length": args.win_length, + "hop_length": args.hop_length, "sample_rate": args.sample_rate, + "win_len_t": win_len_t, "hop_len_t": hop_len_t, + "f_min": args.f_min, "f_max": args.f_max, + "n_stft": args.n_fft // 2 + 1 + } + } + if len(speakers) > 1: + extra["speaker_set_filename"] = "speakers.txt" + if args.add_fastspeech_targets: + pitch_min, pitch_max = get_feature_value_min_max( + [(out_root / n).as_posix() for n in pitch_paths.values()] + ) + energy_min, energy_max = get_feature_value_min_max( + [(out_root / n).as_posix() for n in energy_paths.values()] + ) + extra["features"]["pitch_min"] = pitch_min + extra["features"]["pitch_max"] = pitch_max + extra["features"]["energy_min"] = energy_min + extra["features"]["energy_max"] = energy_max + gen_config_yaml( + out_root, spm_filename=spm_filename, vocab_name=vocab_name, + audio_root=out_root.as_posix(), input_channels=None, + input_feat_per_channel=None, specaugment_policy=None, + cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--audio-manifest-root", "-m", required=True, type=str) + parser.add_argument("--output-root", "-o", required=True, type=str) + parser.add_argument("--splits", "-s", type=str, nargs="+", + default=["train", "dev", "test"]) + parser.add_argument("--ipa-vocab", action="store_true") + parser.add_argument("--use-g2p", action="store_true") + parser.add_argument("--lang", type=str, default="en-us") + parser.add_argument("--win-length", type=int, default=1024) + parser.add_argument("--hop-length", type=int, default=256) + parser.add_argument("--n-fft", type=int, default=1024) + parser.add_argument("--n-mels", type=int, default=80) + parser.add_argument("--f-min", type=int, default=20) + parser.add_argument("--f-max", type=int, default=8000) + parser.add_argument("--sample-rate", type=int, default=22050) + parser.add_argument("--normalize-volume", "-n", 
action="store_true") + parser.add_argument("--textgrid-zip", type=str, default=None) + parser.add_argument("--id-to-units-tsv", type=str, default=None) + parser.add_argument("--add-fastspeech-targets", action="store_true") + parser.add_argument("--snr-threshold", type=float, default=None) + parser.add_argument("--cer-threshold", type=float, default=None) + parser.add_argument("--cer-tsv-path", type=str, default="") + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py b/examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py new file mode 100644 index 0000000000..7ec1fb7521 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py @@ -0,0 +1,70 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +from pathlib import Path +from collections import defaultdict + +import pandas as pd +from torchaudio.datasets import LJSPEECH +from tqdm import tqdm + +from examples.speech_to_text.data_utils import save_df_to_tsv + + +log = logging.getLogger(__name__) + +SPLITS = ["train", "dev", "test"] + + +def process(args): + out_root = Path(args.output_data_root).absolute() + out_root.mkdir(parents=True, exist_ok=True) + + # Generate TSV manifest + print("Generating manifest...") + # following FastSpeech's splits + dataset = LJSPEECH(out_root.as_posix(), download=True) + id_to_split = {} + for x in dataset._flist: + id_ = x[0] + speaker = id_.split("-")[0] + id_to_split[id_] = { + "LJ001": "test", "LJ002": "test", "LJ003": "dev" + }.get(speaker, "train") + manifest_by_split = {split: defaultdict(list) for split in SPLITS} + progress = tqdm(enumerate(dataset), total=len(dataset)) + for i, (waveform, _, utt, normalized_utt) in progress: + sample_id = dataset._flist[i][0] + split = id_to_split[sample_id] + manifest_by_split[split]["id"].append(sample_id) + audio_path = f"{dataset._path}/{sample_id}.wav" + manifest_by_split[split]["audio"].append(audio_path) + manifest_by_split[split]["n_frames"].append(len(waveform[0])) + manifest_by_split[split]["tgt_text"].append(normalized_utt) + manifest_by_split[split]["speaker"].append("ljspeech") + manifest_by_split[split]["src_text"].append(utt) + + manifest_root = Path(args.output_manifest_root).absolute() + manifest_root.mkdir(parents=True, exist_ok=True) + for split in SPLITS: + save_df_to_tsv( + pd.DataFrame.from_dict(manifest_by_split[split]), + manifest_root / f"{split}.audio.tsv" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--output-data-root", "-d", required=True, type=str) + parser.add_argument("--output-manifest-root", "-m", required=True, type=str) + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_synthesis/preprocessing/get_speaker_embedding.py b/examples/speech_synthesis/preprocessing/get_speaker_embedding.py new file mode 100644 index 0000000000..0e3e4c5cd7 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/get_speaker_embedding.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import argparse +from collections import defaultdict +from itertools import chain +from pathlib import Path + +import numpy as np +import torchaudio +import torchaudio.sox_effects as ta_sox +import yaml +from tqdm import tqdm + +from examples.speech_to_text.data_utils import load_tsv_to_dicts +from examples.speech_synthesis.preprocessing.speaker_embedder import SpkrEmbedder + + +def extract_embedding(audio_path, embedder): + wav, sr = torchaudio.load(audio_path) # 2D + if sr != embedder.RATE: + wav, sr = ta_sox.apply_effects_tensor( + wav, sr, [["rate", str(embedder.RATE)]] + ) + try: + emb = embedder([wav[0].cuda().float()]).cpu().numpy() + except RuntimeError: + emb = None + return emb + + +def process(args): + print("Fetching data...") + raw_manifest_root = Path(args.raw_manifest_root).absolute() + samples = [load_tsv_to_dicts(raw_manifest_root / (s + ".tsv")) + for s in args.splits] + samples = list(chain(*samples)) + with open(args.config, "r") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + with open(f"{config['audio_root']}/{config['speaker_set_filename']}") as f: + speaker_to_id = {r.strip(): i for i, r in enumerate(f)} + + embedder = SpkrEmbedder(args.ckpt).cuda() + speaker_to_cnt = defaultdict(float) + speaker_to_emb = defaultdict(float) + for sample in tqdm(samples, desc="extract emb"): + emb = extract_embedding(sample["audio"], embedder) + if emb is not None: + speaker_to_cnt[sample["speaker"]] += 1 + speaker_to_emb[sample["speaker"]] += emb + if len(speaker_to_emb) != len(speaker_to_id): + missed = set(speaker_to_id) - set(speaker_to_emb.keys()) + print( + f"WARNING: missing embeddings for {len(missed)} speaker:\n{missed}" + ) + speaker_emb_mat = np.zeros((len(speaker_to_id), len(emb)), float) + for speaker in speaker_to_emb: + idx = speaker_to_id[speaker] + emb = speaker_to_emb[speaker] + cnt = speaker_to_cnt[speaker] + speaker_emb_mat[idx, :] = emb / cnt + speaker_emb_name = "speaker_emb.npy" + speaker_emb_path = f"{config['audio_root']}/{speaker_emb_name}" + np.save(speaker_emb_path, speaker_emb_mat) + config["speaker_emb_filename"] = speaker_emb_name + + with open(args.new_config, "w") as f: + yaml.dump(config, f) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--raw-manifest-root", "-m", required=True, type=str) + parser.add_argument("--splits", "-s", type=str, nargs="+", + default=["train"]) + parser.add_argument("--config", "-c", required=True, type=str) + parser.add_argument("--new-config", "-n", required=True, type=str) + parser.add_argument("--ckpt", required=True, type=str, + help="speaker embedder checkpoint") + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py b/examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py new file mode 100644 index 0000000000..7afa40fcd1 --- /dev/null +++ b/examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
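Editorial note, not part of the patch: get_speaker_embedding.py above is only needed for multi-speaker setups. The sketch below assumes a GPU (the embedder is moved to CUDA unconditionally), a speaker-embedder checkpoint, manifests whose audio column points at raw audio files, and a config.yaml that already defines audio_root and speaker_set_filename; all paths are placeholders.

from argparse import Namespace

from examples.speech_synthesis.preprocessing.get_speaker_embedding import process

process(Namespace(
    raw_manifest_root="data/vctk_manifests",       # train.tsv with an "audio" column of wav paths
    splits=["train"],
    config="data/vctk_features/config.yaml",       # must define audio_root and speaker_set_filename
    new_config="data/vctk_features/config_spkemb.yaml",
    ckpt="checkpoints/speaker_embedder.pt",
))
# The script averages one d-vector per speaker, saves the matrix as speaker_emb.npy
# under audio_root, and writes a copy of the config that references it.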
+ +import argparse +import logging +import numpy as np +import re +from pathlib import Path +from collections import defaultdict + +import pandas as pd +from torchaudio.datasets import VCTK +from tqdm import tqdm + +from examples.speech_to_text.data_utils import save_df_to_tsv + + +log = logging.getLogger(__name__) + +SPLITS = ["train", "dev", "test"] + + +def normalize_text(text): + return re.sub(r"[^a-zA-Z.?!,'\- ]", '', text) + + +def process(args): + out_root = Path(args.output_data_root).absolute() + out_root.mkdir(parents=True, exist_ok=True) + + # Generate TSV manifest + print("Generating manifest...") + dataset = VCTK(out_root.as_posix(), download=False) + ids = list(dataset._walker) + np.random.seed(args.seed) + np.random.shuffle(ids) + n_train = len(ids) - args.n_dev - args.n_test + _split = ["train"] * n_train + ["dev"] * args.n_dev + ["test"] * args.n_test + id_to_split = dict(zip(ids, _split)) + manifest_by_split = {split: defaultdict(list) for split in SPLITS} + progress = tqdm(enumerate(dataset), total=len(dataset)) + for i, (waveform, _, text, speaker_id, _) in progress: + sample_id = dataset._walker[i] + _split = id_to_split[sample_id] + audio_dir = Path(dataset._path) / dataset._folder_audio / speaker_id + audio_path = audio_dir / f"{sample_id}.wav" + text = normalize_text(text) + manifest_by_split[_split]["id"].append(sample_id) + manifest_by_split[_split]["audio"].append(audio_path.as_posix()) + manifest_by_split[_split]["n_frames"].append(len(waveform[0])) + manifest_by_split[_split]["tgt_text"].append(text) + manifest_by_split[_split]["speaker"].append(speaker_id) + manifest_by_split[_split]["src_text"].append(text) + + manifest_root = Path(args.output_manifest_root).absolute() + manifest_root.mkdir(parents=True, exist_ok=True) + for _split in SPLITS: + save_df_to_tsv( + pd.DataFrame.from_dict(manifest_by_split[_split]), + manifest_root / f"{_split}.audio.tsv" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--output-data-root", "-d", required=True, type=str) + parser.add_argument("--output-manifest-root", "-m", required=True, type=str) + parser.add_argument("--n-dev", default=50, type=int) + parser.add_argument("--n-test", default=100, type=int) + parser.add_argument("--seed", "-s", default=1234, type=int) + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_synthesis/preprocessing/speaker_embedder/__init__.py b/examples/speech_synthesis/preprocessing/speaker_embedder/__init__.py new file mode 100644 index 0000000000..3b178676ba --- /dev/null +++ b/examples/speech_synthesis/preprocessing/speaker_embedder/__init__.py @@ -0,0 +1,135 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
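Editorial note, not part of the patch: a small sketch of the VCTK manifest script above. It assumes the VCTK corpus is already downloaded under the data root (the script passes download=False) and that fairseq is installed from source; paths are placeholders.

from argparse import Namespace

from examples.speech_synthesis.preprocessing.get_vctk_audio_manifest import (
    normalize_text,
    process,
)

# Transcripts are stripped of characters outside [a-zA-Z.?!,'- ] before being written:
assert normalize_text('He said: "hello!"') == "He said hello!"

process(Namespace(
    output_data_root="data/vctk_raw",          # VCTK corpus must already live here
    output_manifest_root="data/vctk_manifests",
    n_dev=50,
    n_test=100,
    seed=1234,
))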
+ + +import librosa +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.data +import torchaudio + + +EMBEDDER_PARAMS = { + 'num_mels': 40, + 'n_fft': 512, + 'emb_dim': 256, + 'lstm_hidden': 768, + 'lstm_layers': 3, + 'window': 80, + 'stride': 40, +} + + +def set_requires_grad(nets, requires_grad=False): + """Set requies_grad=Fasle for all the networks to avoid unnecessary + computations + Parameters: + nets (network list) -- a list of networks + requires_grad (bool) -- whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + +class LinearNorm(nn.Module): + def __init__(self, hp): + super(LinearNorm, self).__init__() + self.linear_layer = nn.Linear(hp["lstm_hidden"], hp["emb_dim"]) + + def forward(self, x): + return self.linear_layer(x) + + +class SpeechEmbedder(nn.Module): + def __init__(self, hp): + super(SpeechEmbedder, self).__init__() + self.lstm = nn.LSTM(hp["num_mels"], + hp["lstm_hidden"], + num_layers=hp["lstm_layers"], + batch_first=True) + self.proj = LinearNorm(hp) + self.hp = hp + + def forward(self, mel): + # (num_mels, T) -> (num_mels, T', window) + mels = mel.unfold(1, self.hp["window"], self.hp["stride"]) + mels = mels.permute(1, 2, 0) # (T', window, num_mels) + x, _ = self.lstm(mels) # (T', window, lstm_hidden) + x = x[:, -1, :] # (T', lstm_hidden), use last frame only + x = self.proj(x) # (T', emb_dim) + x = x / torch.norm(x, p=2, dim=1, keepdim=True) # (T', emb_dim) + + x = x.mean(dim=0) + if x.norm(p=2) != 0: + x = x / x.norm(p=2) + return x + + +class SpkrEmbedder(nn.Module): + RATE = 16000 + + def __init__( + self, + embedder_path, + embedder_params=EMBEDDER_PARAMS, + rate=16000, + hop_length=160, + win_length=400, + pad=False, + ): + super(SpkrEmbedder, self).__init__() + embedder_pt = torch.load(embedder_path, map_location="cpu") + self.embedder = SpeechEmbedder(embedder_params) + self.embedder.load_state_dict(embedder_pt) + self.embedder.eval() + set_requires_grad(self.embedder, requires_grad=False) + self.embedder_params = embedder_params + + self.register_buffer('mel_basis', torch.from_numpy( + librosa.filters.mel( + sr=self.RATE, + n_fft=self.embedder_params["n_fft"], + n_mels=self.embedder_params["num_mels"]) + ) + ) + + self.resample = None + if rate != self.RATE: + self.resample = torchaudio.transforms.Resample(rate, self.RATE) + self.hop_length = hop_length + self.win_length = win_length + self.pad = pad + + def get_mel(self, y): + if self.pad and y.shape[-1] < 14000: + y = F.pad(y, (0, 14000 - y.shape[-1])) + + window = torch.hann_window(self.win_length).to(y) + y = torch.stft(y, n_fft=self.embedder_params["n_fft"], + hop_length=self.hop_length, + win_length=self.win_length, + window=window) + magnitudes = torch.norm(y, dim=-1, p=2) ** 2 + mel = torch.log10(self.mel_basis @ magnitudes + 1e-6) + return mel + + def forward(self, inputs): + dvecs = [] + for wav in inputs: + mel = self.get_mel(wav) + if mel.dim() == 3: + mel = mel.squeeze(0) + dvecs += [self.embedder(mel)] + dvecs = torch.stack(dvecs) + + dvec = torch.mean(dvecs, dim=0) + dvec = dvec / torch.norm(dvec) + + return dvec diff --git a/examples/speech_synthesis/preprocessing/vad/__init__.py b/examples/speech_synthesis/preprocessing/vad/__init__.py new file mode 100644 index 0000000000..9cf121081f --- /dev/null +++ b/examples/speech_synthesis/preprocessing/vad/__init__.py @@ -0,0 +1,192 @@ +# Copyright (c) 
Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import collections +import contextlib +import wave + +try: + import webrtcvad +except ImportError: + raise ImportError("Please install py-webrtcvad: pip install webrtcvad") +import argparse +import os +import logging +from tqdm import tqdm + +AUDIO_SUFFIX = '.wav' +FS_MS = 30 +SCALE = 6e-5 +THRESHOLD = 0.3 + + +def read_wave(path): + """Reads a .wav file. + Takes the path, and returns (PCM audio data, sample rate). + """ + with contextlib.closing(wave.open(path, 'rb')) as wf: + num_channels = wf.getnchannels() + assert num_channels == 1 + sample_width = wf.getsampwidth() + assert sample_width == 2 + sample_rate = wf.getframerate() + assert sample_rate in (8000, 16000, 32000, 48000) + pcm_data = wf.readframes(wf.getnframes()) + return pcm_data, sample_rate + + +def write_wave(path, audio, sample_rate): + """Writes a .wav file. + Takes path, PCM audio data, and sample rate. + """ + with contextlib.closing(wave.open(path, 'wb')) as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio) + + +class Frame(object): + """Represents a "frame" of audio data.""" + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +def frame_generator(frame_duration_ms, audio, sample_rate): + """Generates audio frames from PCM audio data. + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + Yields Frames of the requested duration. + """ + n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) + offset = 0 + timestamp = 0.0 + duration = (float(n) / sample_rate) / 2.0 + while offset + n < len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += duration + offset += n + + +def vad_collector(sample_rate, frame_duration_ms, + padding_duration_ms, vad, frames): + """Filters out non-voiced audio frames. + Given a webrtcvad.Vad and a source of audio frames, yields only + the voiced audio. + Uses a padded, sliding window algorithm over the audio frames. + When more than 90% of the frames in the window are voiced (as + reported by the VAD), the collector triggers and begins yielding + audio frames. Then the collector waits until 90% of the frames in + the window are unvoiced to detrigger. + The window is padded at the front and back to provide a small + amount of silence or the beginnings/endings of speech around the + voiced frames. + Arguments: + sample_rate - The audio sample rate, in Hz. + frame_duration_ms - The frame duration in milliseconds. + padding_duration_ms - The amount to pad the window, in milliseconds. + vad - An instance of webrtcvad.Vad. + frames - a source of audio frames (sequence or generator). + Returns: A generator that yields PCM audio data. + """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. 
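+    # ring_buffer holds the most recent frames together with their VAD
+    # decisions; voiced_frames accumulates the frames of the segment that is
+    # currently being collected.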
+ triggered = False + + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + # sys.stdout.write('1' if is_speech else '0') + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. + if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + # We want to yield all the audio we see from now until + # we are NOTTRIGGERED, but we have to start with the + # audio that's already in the ring buffer. + for f, _ in ring_buffer: + voiced_frames.append(f) + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + triggered = False + yield [b''.join([f.bytes for f in voiced_frames]), + voiced_frames[0].timestamp, voiced_frames[-1].timestamp] + ring_buffer.clear() + voiced_frames = [] + # If we have any leftover voiced audio when we run out of input, + # yield it. + if voiced_frames: + yield [b''.join([f.bytes for f in voiced_frames]), + voiced_frames[0].timestamp, voiced_frames[-1].timestamp] + + +def main(args): + # create output folder + try: + cmd = f"mkdir -p {args.out_path}" + os.system(cmd) + except Exception: + logging.error("Can not create output folder") + exit(-1) + + # build vad object + vad = webrtcvad.Vad(int(args.agg)) + # iterating over wavs in dir + for file in tqdm(os.listdir(args.in_path)): + if file.endswith(AUDIO_SUFFIX): + audio_inpath = os.path.join(args.in_path, file) + audio_outpath = os.path.join(args.out_path, file) + audio, sample_rate = read_wave(audio_inpath) + frames = frame_generator(FS_MS, audio, sample_rate) + frames = list(frames) + segments = vad_collector(sample_rate, FS_MS, 300, vad, frames) + merge_segments = list() + timestamp_start = 0.0 + timestamp_end = 0.0 + # removing start, end, and long sequences of sils + for i, segment in enumerate(segments): + merge_segments.append(segment[0]) + if i and timestamp_start: + sil_duration = segment[1] - timestamp_end + if sil_duration > THRESHOLD: + merge_segments.append(int(THRESHOLD / SCALE)*(b'\x00')) + else: + merge_segments.append(int((sil_duration / SCALE))*(b'\x00')) + timestamp_start = segment[1] + timestamp_end = segment[2] + segment = b''.join(merge_segments) + write_wave(audio_outpath, segment, sample_rate) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Apply vad to a file of fils.') + parser.add_argument('in_path', type=str, help='Path to the input files') + parser.add_argument('out_path', type=str, + help='Path to save the processed files') + parser.add_argument('--agg', type=int, default=3, + help='The level of aggressiveness of the VAD: [0-3]') + args = parser.parse_args() + + main(args) diff --git a/examples/speech_synthesis/utils.py b/examples/speech_synthesis/utils.py new file mode 100644 index 0000000000..2c7b03733d --- /dev/null +++ b/examples/speech_synthesis/utils.py @@ -0,0 +1,101 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +from scipy.interpolate import interp1d +import torchaudio + +from fairseq.tasks.text_to_speech import ( + batch_compute_distortion, compute_rms_dist +) + + +def batch_mel_spectral_distortion( + y1, y2, sr, normalize_type="path", mel_fn=None +): + """ + https://arxiv.org/pdf/2011.03568.pdf + + Same as Mel Cepstral Distortion, but computed on log-mel spectrograms. + """ + if mel_fn is None or mel_fn.sample_rate != sr: + mel_fn = torchaudio.transforms.MelSpectrogram( + sr, n_fft=int(0.05 * sr), win_length=int(0.05 * sr), + hop_length=int(0.0125 * sr), f_min=20, n_mels=80, + window_fn=torch.hann_window + ).to(y1[0].device) + offset = 1e-6 + return batch_compute_distortion( + y1, y2, sr, lambda y: torch.log(mel_fn(y) + offset).transpose(-1, -2), + compute_rms_dist, normalize_type + ) + + +# This code is based on +# "https://github.com/bastibe/MAPS-Scripts/blob/master/helper.py" +def _same_t_in_true_and_est(func): + def new_func(true_t, true_f, est_t, est_f): + assert type(true_t) is np.ndarray + assert type(true_f) is np.ndarray + assert type(est_t) is np.ndarray + assert type(est_f) is np.ndarray + + interpolated_f = interp1d( + est_t, est_f, bounds_error=False, kind='nearest', fill_value=0 + )(true_t) + return func(true_t, true_f, true_t, interpolated_f) + + return new_func + + +@_same_t_in_true_and_est +def gross_pitch_error(true_t, true_f, est_t, est_f): + """The relative frequency in percent of pitch estimates that are + outside a threshold around the true pitch. Only frames that are + considered pitched by both the ground truth and the estimator (if + applicable) are considered. + """ + + correct_frames = _true_voiced_frames(true_t, true_f, est_t, est_f) + gross_pitch_error_frames = _gross_pitch_error_frames( + true_t, true_f, est_t, est_f + ) + return np.sum(gross_pitch_error_frames) / np.sum(correct_frames) + + +def _gross_pitch_error_frames(true_t, true_f, est_t, est_f, eps=1e-8): + voiced_frames = _true_voiced_frames(true_t, true_f, est_t, est_f) + true_f_p_eps = [x + eps for x in true_f] + pitch_error_frames = np.abs(est_f / true_f_p_eps - 1) > 0.2 + return voiced_frames & pitch_error_frames + + +def _true_voiced_frames(true_t, true_f, est_t, est_f): + return (est_f != 0) & (true_f != 0) + + +def _voicing_decision_error_frames(true_t, true_f, est_t, est_f): + return (est_f != 0) != (true_f != 0) + + +@_same_t_in_true_and_est +def f0_frame_error(true_t, true_f, est_t, est_f): + gross_pitch_error_frames = _gross_pitch_error_frames( + true_t, true_f, est_t, est_f + ) + voicing_decision_error_frames = _voicing_decision_error_frames( + true_t, true_f, est_t, est_f + ) + return (np.sum(gross_pitch_error_frames) + + np.sum(voicing_decision_error_frames)) / (len(true_t)) + + +@_same_t_in_true_and_est +def voicing_decision_error(true_t, true_f, est_t, est_f): + voicing_decision_error_frames = _voicing_decision_error_frames( + true_t, true_f, est_t, est_f + ) + return np.sum(voicing_decision_error_frames) / (len(true_t)) diff --git a/examples/speech_text_joint_to_text/README.md b/examples/speech_text_joint_to_text/README.md new file mode 100644 index 0000000000..c1aa11929a --- /dev/null +++ b/examples/speech_text_joint_to_text/README.md @@ -0,0 +1,51 @@ +# Joint Speech Text training in Fairseq +An extension of Fairseq s2t project with the speech to text task enhanced by the co-trained text to text mapping task. 
More details about Fairseq s2t can be found [here](../speech_to_text/README.md) + +## Examples +Examples of speech text joint training in fairseq +- [English-to-German MuST-C model](docs/ende-mustc.md) +- [IWSLT 2021 Multilingual Speech Translation](docs/iwslt2021.md) +- [Speech Text Joint Pre-training ](docs/pre-training.md) +## Citation +Please cite as: +``` +@inproceedings{Tang2022UnifiedSP, + title={Unified Speech-Text Pre-training for Speech Translation and Recognition}, + author={Yun Tang and Hongyu Gong and Ning Dong and Changhan Wang and Wei-Ning Hsu and Jiatao Gu and Alexei Baevski and Xian Li and Abdelrahman Mohamed and Michael Auli and Juan Miguel Pino}, + booktitle={ACL}, + year={2022} +} +@inproceedings{Tang2021IST, + title = {Improving Speech Translation by Understanding and Learning from the Auxiliary Text Translation Task}, + author = {Yun Tang and Juan Pino and Xian Li and Changhan Wang and Dmitriy Genzel}, + booktitle = {ACL}, + year = {2021}, +} + +@inproceedings{Tang2021FST, + title = {FST: the FAIR Speech Translation System for the IWSLT21 Multilingual Shared Task}, + author = {Yun Tang and Hongyu Gong and Xian Li and Changhan Wang and Juan Pino and Holger Schwenk and Naman Goyal}, + booktitle = {IWSLT}, + year = {2021}, +} +@inproceedings{Tang2021AGM, + title={A General Multi-Task Learning Framework to Leverage Text Data for Speech to Text Tasks}, + author={Yun Tang and J. Pino and Changhan Wang and Xutai Ma and Dmitriy Genzel}, + booktitle={ICASSP}, + year={2021} +} + +@inproceedings{wang2020fairseqs2t, + title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq}, + author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino}, + booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations}, + year = {2020}, +} + +@inproceedings{ott2019fairseq, + title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, + author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, + booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, + year = {2019}, +} +``` diff --git a/examples/speech_text_joint_to_text/__init__.py b/examples/speech_text_joint_to_text/__init__.py new file mode 100644 index 0000000000..239d2e69f9 --- /dev/null +++ b/examples/speech_text_joint_to_text/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import tasks, criterions, models # noqa diff --git a/examples/speech_text_joint_to_text/configs/mustc_noise.list b/examples/speech_text_joint_to_text/configs/mustc_noise.list new file mode 100644 index 0000000000..02eeac4e00 --- /dev/null +++ b/examples/speech_text_joint_to_text/configs/mustc_noise.list @@ -0,0 +1,49 @@ +"(Applause) NOISE +"(Laughter) VOICE +"(Laughter)" VOICE +(Applause) NOISE +(Applause). NOISE +(Audience) VOICE +(Audio) NOISE +(Beat) NOISE +(Beatboxing) VOICE +(Beep) NOISE +(Beeps) NOISE +(Cheering) VOICE +(Cheers) VOICE +(Claps) NOISE +(Clicking) NOISE +(Clunk) NOISE +(Coughs) NOISE +(Drums) NOISE +(Explosion) NOISE +(Gasps) VOICE +(Guitar) NOISE +(Honk) NOISE +(Laugher) VOICE +(Laughing) VOICE +(Laughs) VOICE +(Laughter) VOICE +(Laughter). VOICE +(Laughter)... 
VOICE +(Mumbling) VOICE +(Music) NOISE +(Noise) NOISE +(Recording) VOICE +(Ringing) NOISE +(Shouts) VOICE +(Sigh) VOICE +(Sighs) VOICE +(Silence) NOISE +(Singing) VOICE +(Sings) VOICE +(Spanish) VOICE +(Static) NOISE +(Tones) NOISE +(Trumpet) NOISE +(Video) NOISE +(Video): NOISE +(Voice-over) NOISE +(Whistle) NOISE +(Whistling) NOISE +(video): NOISE diff --git a/examples/simultaneous_translation/criterions/__init__.py b/examples/speech_text_joint_to_text/criterions/__init__.py similarity index 83% rename from examples/simultaneous_translation/criterions/__init__.py rename to examples/speech_text_joint_to_text/criterions/__init__.py index 08791bfff3..7faae73119 100644 --- a/examples/simultaneous_translation/criterions/__init__.py +++ b/examples/speech_text_joint_to_text/criterions/__init__.py @@ -11,5 +11,5 @@ if file.endswith(".py") and not file.startswith("_"): criterion_name = file[: file.find(".py")] importlib.import_module( - "examples.simultaneous_translation.criterions." + criterion_name + "examples.speech_text_joint_to_text.criterions." + criterion_name ) diff --git a/examples/speech_text_joint_to_text/criterions/multi_modality_compound.py b/examples/speech_text_joint_to_text/criterions/multi_modality_compound.py new file mode 100644 index 0000000000..b3a5506a2d --- /dev/null +++ b/examples/speech_text_joint_to_text/criterions/multi_modality_compound.py @@ -0,0 +1,181 @@ +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import logging +import math +from dataclasses import dataclass, field + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.criterions.ctc import CtcCriterion, CtcCriterionConfig +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterionConfig, +) +from fairseq.logging.meters import safe_round + +from .multi_modality_cross_entropy import SpeechTextPreTrainCrossEntCriterion + +logger = logging.getLogger(__name__) + + +@dataclass +class SpeechTextPreTrainCompoundCriterionConfig( + LabelSmoothedCrossEntropyCriterionConfig +): + zero_infinity: bool = field( + default=False, + metadata={"help": "zero inf loss when source length <= target length"}, + ) + post_process: str = field( + default="none", + metadata={ + "help": "how to post process predictions into words. can be letter, " + "wordpiece, BPE symbols, etc. 
" + "See fairseq.data.data_utils.post_process() for full list of options" + }, + ) + + +@register_criterion( + "speech_text_pretrain_compound", dataclass=SpeechTextPreTrainCompoundCriterionConfig +) +class SpeechTextPreTrainCompoundCriterion(FairseqCriterion): + def __init__( + self, + task, + sentence_avg, + label_smoothing, + report_accuracy=False, + zero_infinity=False, + post_process=None, + ): + super().__init__(task) + self.xent = SpeechTextPreTrainCrossEntCriterion( + task, sentence_avg, label_smoothing, report_accuracy + ) + cfg_dict = { + "zero_infinity": zero_infinity, + "sentence_avg": sentence_avg, + "post_process": post_process, + } + cfg_ctc = CtcCriterionConfig(**cfg_dict) + self.ctc = CtcCriterion(cfg_ctc, task) + + def forward(self, model, sample, reduce=True): + mode = sample["net_input"]["mode"] + if mode == "sup_speech_ctc": # CTC + sample["net_input"][ + "src_lengths" + ] = None # get downsampled src_lengths from padding_mask + loss, sample_size, logging_output = self.ctc(model, sample, reduce) + logging_output["mode"] = SpeechTextPreTrainCompoundCriterion.mode2value( + "CTC" + ) + else: + loss, sample_size, logging_output = self.xent(model, sample, reduce) + logging_output["mode"] = SpeechTextPreTrainCompoundCriterion.mode2value( + "xent" + ) + + return loss, sample_size, logging_output + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True + + @staticmethod + def mode2value(mode): # make the logging_outputs_can_be_summed = True + if mode == "CTC": + return 907 # prime number + if mode == "xent": + return 887 # prime number + return 0 + + @staticmethod + def value2mode(value): + if value % 907 == 0: + return "CTC" + if value % 887 == 0: + return "xent" + raise ValueError("Unknow mode") + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + + def _get_mode(logging_outputs): + mds = [ + SpeechTextPreTrainCompoundCriterion.value2mode(log["mode"]) + for log in logging_outputs + ] + if sum([1 if l != mds[0] else 0 for l in mds]) > 0: + raise ValueError("mode in one mini-batch is expected to be the same!") + return mds[0] + + log_mode = _get_mode(logging_outputs) + if log_mode == "xent": + return SpeechTextPreTrainCrossEntCriterion.reduce_metrics(logging_outputs) + + # ctc loss + loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) + ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) + nsentences = utils.item( + sum(log.get("nsentences", 0) for log in logging_outputs) + ) + sample_size = utils.item( + sum(log.get("sample_size", 0) for log in logging_outputs) + ) + + metrics.log_scalar( + "ctc_loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + metrics.log_scalar("ctc_ntokens", ntokens) + metrics.log_scalar("ctc_nsentences", nsentences) + if sample_size != ntokens: + metrics.log_scalar( + "ctc_nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + + c_errors = sum(log.get("c_errors", 0) for log in logging_outputs) + metrics.log_scalar("_c_errors", c_errors) + c_total = sum(log.get("c_total", 0) for log in logging_outputs) + metrics.log_scalar("_c_total", c_total) + w_errors = sum(log.get("w_errors", 0) for log in logging_outputs) + metrics.log_scalar("_w_errors", w_errors) + wv_errors = 
sum(log.get("wv_errors", 0) for log in logging_outputs) + metrics.log_scalar("_wv_errors", wv_errors) + w_total = sum(log.get("w_total", 0) for log in logging_outputs) + metrics.log_scalar("_w_total", w_total) + + if c_total > 0: + metrics.log_derived( + "uer", + lambda meters: safe_round( + meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3 + ) + if meters["_c_total"].sum > 0 + else float("nan"), + ) + if w_total > 0: + metrics.log_derived( + "wer", + lambda meters: safe_round( + meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3 + ) + if meters["_w_total"].sum > 0 + else float("nan"), + ) + metrics.log_derived( + "raw_wer", + lambda meters: safe_round( + meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3 + ) + if meters["_w_total"].sum > 0 + else float("nan"), + ) diff --git a/examples/speech_text_joint_to_text/criterions/multi_modality_cross_entropy.py b/examples/speech_text_joint_to_text/criterions/multi_modality_cross_entropy.py new file mode 100644 index 0000000000..6c9cb0f20d --- /dev/null +++ b/examples/speech_text_joint_to_text/criterions/multi_modality_cross_entropy.py @@ -0,0 +1,101 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from fairseq import utils +from fairseq.criterions import register_criterion +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, + LabelSmoothedCrossEntropyCriterionConfig, + label_smoothed_nll_loss, +) + + +@register_criterion( + "speech_text_pretrain_cross_entropy", + dataclass=LabelSmoothedCrossEntropyCriterionConfig, +) +class SpeechTextPreTrainCrossEntCriterion(LabelSmoothedCrossEntropyCriterion): + def __init__(self, task, sentence_avg, label_smoothing, report_accuracy=False): + super().__init__( + task, sentence_avg, label_smoothing, report_accuracy=report_accuracy + ) + + def forward(self, model, sample, reduce=True): + net_output = model(**sample["net_input"]) + loss, nll_loss, nsentences, ntokens, n_correct = self.compute_loss( + model, net_output, sample, reduce=reduce + ) + sample_size = nsentences if self.sentence_avg else ntokens + logging_output = { + "loss": loss.data, + "nll_loss": nll_loss.data, + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + if self.report_accuracy: + logging_output["n_correct"] = utils.item(n_correct) + logging_output["total"] = utils.item(ntokens) + return loss, sample_size, logging_output + + def get_lprobs_and_target(self, model, net_output, sample): + lprobs = model.get_normalized_probs(net_output, log_probs=True) + target = model.get_targets(sample, net_output) + assert self.ignore_prefix_size == 0 + if self.ignore_prefix_size > 0: + if getattr(lprobs, "batch_first", False): + lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous() + target = target[:, self.ignore_prefix_size :].contiguous() + else: + lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous() + target = target[self.ignore_prefix_size :, :].contiguous() + return lprobs, target + + def compute_loss(self, model, net_output, sample, reduce=True): + lprobs, target = self.get_lprobs_and_target(model, net_output, sample) + n_correct = 0 + if isinstance(target, dict): + t_lprobs = target["target_logprobs"] + + if not lprobs.batch_first: + lprobs = lprobs.transpose(0, 1) + t_lprobs = t_lprobs.transpose(0, 1) + nsentences, seq_len = lprobs.size()[:2] + ntokens = nsentences * seq_len 
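+            # the target carries teacher log-probs: the loss below is
+            # KL(teacher || student) over the output distributions, restricted
+            # to masked frames when mask_indices is available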
+ t_probs = t_lprobs.exp() + mask_indices = ( + net_output[1]["mask_indices"][0] + if len(net_output[1]["mask_indices"]) > 0 + else None + ) + + # mask_indices is True for those masking frames + if mask_indices is not None: # B X T + t_probs = t_probs.masked_fill(mask_indices.eq(False).unsqueeze(-1), 0) + ntokens = mask_indices.int().sum() + t_probs = t_probs.detach() + t_lprobs = t_lprobs.detach() + loss = ( + -(t_probs * (lprobs - t_lprobs)).sum() + if reduce + else -(t_probs * (lprobs - t_lprobs)).sum(-1, keepdim=True) + ) + nll_loss = loss + else: + nsentences = target.size(0) + mask = target.ne(self.padding_idx) + loss, nll_loss = label_smoothed_nll_loss( + lprobs.view(-1, lprobs.size(-1)), + target.view(-1), + self.eps, + ignore_index=self.padding_idx, + reduce=reduce, + ) + n_correct = torch.sum( + lprobs.argmax(-1).masked_select(mask).eq(target.masked_select(mask)) + ) + ntokens = torch.sum(mask) + return loss, nll_loss, nsentences, ntokens, n_correct diff --git a/examples/speech_text_joint_to_text/criterions/text_guide_cross_entropy_acc.py b/examples/speech_text_joint_to_text/criterions/text_guide_cross_entropy_acc.py new file mode 100644 index 0000000000..fd6ff155c9 --- /dev/null +++ b/examples/speech_text_joint_to_text/criterions/text_guide_cross_entropy_acc.py @@ -0,0 +1,224 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import math + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss +from fairseq.logging import metrics + + +@register_criterion("guided_label_smoothed_cross_entropy_with_accuracy") +class GuidedCrossEntAccCriterion(FairseqCriterion): + def __init__( + self, + task, + sentence_avg, + guide_alpha, + text_input_cost_ratio, + label_smoothing, + disable_text_guide_update_num=0, + attentive_cost_regularization=0, + ): + """ + guide_alpha: alpha to inteplate nll and kd loss + text_input_cost_ratio: loss ratio for text only input data + label_smoothing: label smoothing ratio + disable_text_guide_update_num: only use nll loss for the first N updates + attentive_cost_regularization: ratio fo attentive cost + """ + super().__init__(task) + self.alpha = guide_alpha + self.attn_beta = attentive_cost_regularization + self.sentence_avg = sentence_avg + self.eps = label_smoothing + self.text_input_cost_ratio = text_input_cost_ratio + self.disable_update_num = disable_text_guide_update_num + assert self.alpha >= 0 and self.alpha <= 1.0 + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', + help='epsilon for label smoothing, 0 means no label smoothing') + # fmt: off + parser.add_argument('--guide-alpha', default=0., type=float, metavar='D', + help='alpha to merge kd cost from text to speech input with ce loss') + # fmt: off + parser.add_argument('--disable-text-guide-update-num', default=0, type=int, metavar='D', + help='disable guided target from text for the first N updates.') + parser.add_argument("--attentive-cost-regularization", default=0.0, type=float, metavar='D', + help="use encoder attentive loss regularization with cost ratio D") + parser.add_argument("--attentive-cost-without-normalize", action='store_true', + help="Don't 
do normalization during attentive cost computation") + + def forward(self, model, sample, reduce=True): + reduction = 'sum' if reduce else 'none' + net_input = sample["net_input"] + net_output = model(**net_input) + attn_cost = None + lprobs = model.get_normalized_probs(net_output, log_probs=True) + is_dual_input = True if net_input['src_tokens'] is not None and net_input.get('src_txt_tokens') is not None else False + target = model.get_targets(sample, net_output) + src_token_num = 0 + if is_dual_input: + # lprobs_spch from speech encoder and lprobs_text from text encoder + lprobs_spch, lprobs_text = torch.chunk(lprobs, 2) + lprobs_spch.batch_first = lprobs.batch_first + lprobs_text.batch_first = lprobs.batch_first + + speech_loss, speech_nll_loss, speech_correct, speech_total = \ + self.guide_loss_and_acc(model, lprobs_spch, lprobs_text, target, reduce=(reduction == 'sum')) + text_loss, text_nll_loss, text_correct, text_total = self.compute_loss_and_acc(model, lprobs_text, target, reduction=reduction) + loss = (speech_loss + text_loss) + nll_loss = (speech_nll_loss + text_nll_loss) + correct = speech_correct + text_correct + total = speech_total + text_total + + attn_cost = net_output[1].get('attn_cost') + if attn_cost is not None: + # attn_cost is batch_first and padding tokens have been masked already + src_token_num = attn_cost.ne(0).sum() + attn_cost = attn_cost.sum() + loss = loss + attn_cost * self.attn_beta + else: + attn_cost = 0 + else: + loss, nll_loss, correct, total = self.compute_loss_and_acc(model, lprobs, target, reduction=reduction) + if sample["net_input"]['src_tokens'] is None: # text input only + loss = loss * self.text_input_cost_ratio + speech_loss = None + speech_nll_loss = None + + sample_size, logging_output = self.get_logging_output( + sample, loss, nll_loss, correct, total, src_token_num, speech_loss, speech_nll_loss, attn_cost, is_dual_input + ) + return loss, sample_size, logging_output + + def compute_loss_and_acc(self, model, lprobs, target, reduction='sum'): + if not lprobs.batch_first: + lprobs = lprobs.transpose(0, 1) + lprobs = lprobs.view(-1, lprobs.size(-1)) # -> (B x T) x C + target = target.view(-1) + loss, nll_loss = label_smoothed_nll_loss( + lprobs, target, self.eps, ignore_index=self.padding_idx, reduce=(reduction == 'sum'), + ) + + mask = target.ne(self.padding_idx) + correct = torch.sum(lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))) + total = torch.sum(mask) + return loss, nll_loss, correct, total + + def guide_loss_and_acc(self, model, lprobs, lprobs_teacher, target, reduce=True): + """ lprobs_teacher is used as guide for lprobs """ + if self.alpha == 0.0 or model.num_updates < self.disable_update_num: + return self.compute_loss_and_acc(model, lprobs, target, reduction=('sum' if reduce else 'none')) + if not lprobs.batch_first: + lprobs = lprobs.transpose(0, 1) + lprobs_teacher = lprobs_teacher.transpose(0, 1) + + lprobs = lprobs.view(-1, lprobs.size(-1)).float() # -> (B x T) x C + lprobs_teacher = lprobs_teacher.view(-1, lprobs_teacher.size(-1)).float() # -> (B x T) x C + target = target.view(-1) + loss = F.nll_loss(lprobs, target, ignore_index=self.padding_idx, reduction='sum' if reduce else 'none') + nll_loss = loss + probs_teacher = lprobs_teacher.exp().masked_fill_(target.unsqueeze(-1).eq(self.padding_idx), 0) + probs_teacher = probs_teacher.detach() + guide_loss = -(probs_teacher*lprobs).sum() if reduce else -(probs_teacher*lprobs).sum(-1, keepdim=True) + loss = self.alpha*guide_loss + (1.0 - self.alpha)*loss + + 
mask = target.ne(self.padding_idx) + correct = torch.sum(lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))) + total = torch.sum(mask) + return loss, nll_loss, correct, total + + def get_logging_output( + self, + sample, + loss, + nll_loss, + correct, + total, + src_token_num=0, + speech_loss=None, + speech_nll_loss=None, + attn_cost=None, + is_dual_input=False, + ): + + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + mul_size = 2 if is_dual_input else 1 + + logging_output = { + "loss": utils.item(loss.data), # * sample['ntokens'], + "nll_loss": utils.item(nll_loss.data), # * sample['ntokens'], + "ntokens": sample["ntokens"]*mul_size, + "nsentences": sample["target"].size(0)*mul_size, + "sample_size": sample_size*mul_size, + "correct": utils.item(correct.data), + "total": utils.item(total.data), + "src_token_num": utils.item(src_token_num.data) if src_token_num > 0 else 0, + "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(), + } + + if speech_loss is not None: + logging_output["speech_loss"] = utils.item(speech_loss.data) + logging_output["speech_nll_loss"] = utils.item(speech_nll_loss.data) + logging_output["sample_size_speech_cost"] = sample_size + logging_output["speech_attn_loss"] = attn_cost + + return sample_size*mul_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + correct_sum = sum(log.get("correct", 0) for log in logging_outputs) + total_sum = sum(log.get("total", 0) for log in logging_outputs) + src_token_sum = sum(log.get("src_token_num", 0) for log in logging_outputs) + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + nframes = sum(log.get("nframes", 0) for log in logging_outputs) + speech_loss_sum = sum(log.get("speech_loss", 0) for log in logging_outputs) + speech_nll_loss_sum = sum(log.get("speech_nll_loss", 0) for log in logging_outputs) + speech_attn_loss_sum = sum(log.get("speech_attn_loss", 0) for log in logging_outputs) + sample_size_speech = sum(log.get("sample_size_speech_cost", 0) for log in logging_outputs) + + agg_output = { + "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0, + "nll_loss": nll_loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0, + # if args.sentence_avg, then sample_size is nsentences, and loss + # is per-sentence loss; else sample_size is ntokens, and the loss + # becomes per-output token loss + "speech_loss": speech_loss_sum / sample_size_speech / math.log(2) if sample_size_speech > 0 else 0.0, + "speech_nll_loss": speech_nll_loss_sum / sample_size_speech / math.log(2) if sample_size_speech > 0 else 0.0, + "speech_attn_loss": speech_attn_loss_sum / src_token_sum / math.log(2) if src_token_sum > 0 else 0.0, + "ntokens": ntokens, + "nsentences": nsentences, + "nframes": nframes, + "sample_size": sample_size, + "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0, + "correct": correct_sum, + "total": total_sum, + "src_token_num": src_token_sum, + # total is the number of validate tokens + } + return agg_output + + @classmethod + def reduce_metrics(cls, logging_outputs): + """Aggregate logging outputs from data parallel training.""" 
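+        # aggregate with the legacy helper above, then log every value except
+        # the raw size counters (nsentences / ntokens / sample_size)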
+ agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs) + for k, v in agg_logging_outputs.items(): + if k in {'nsentences', 'ntokens', 'sample_size'}: + continue + metrics.log_scalar(k, v, round=3) diff --git a/examples/speech_text_joint_to_text/data/pair_denoising_dataset.py b/examples/speech_text_joint_to_text/data/pair_denoising_dataset.py new file mode 100644 index 0000000000..fc94fbaf11 --- /dev/null +++ b/examples/speech_text_joint_to_text/data/pair_denoising_dataset.py @@ -0,0 +1,318 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import math +import re + +import torch + +from fairseq.data import data_utils +from fairseq.data.language_pair_dataset import LanguagePairDataset + + +# Part of the code is modified from DenoisingDataset +# compared with DenoisingDataset, no permute_sentences or documents (rotate_ratio, permute_sentence_ratio) +class LanguagePairDenoisingDataset(LanguagePairDataset): + def __init__( + self, + src, + src_sizes, + src_dict, + tgt, + tgt_sizes, + tgt_dict, + mask_idx, + mask_whole_words, + seed, + args, + left_pad_source=True, + left_pad_target=False, + shuffle=True, + input_feeding=True, + remove_eos_from_source=False, + append_eos_to_target=False, + align_dataset=None, + constraints=None, + append_bos=False, + eos=None, + num_buckets=0, + src_lang_id=None, + tgt_lang_id=None, + pad_to_multiple=1, + ): + super().__init__( + src, + src_sizes, + src_dict, + tgt, + tgt_sizes, + tgt_dict, + left_pad_source, + left_pad_target, + shuffle, + input_feeding, + remove_eos_from_source, + append_eos_to_target, + align_dataset, + constraints, + append_bos, + eos, + num_buckets, + src_lang_id, + tgt_lang_id, + pad_to_multiple, + ) + + self.mask_idx = mask_idx + self.mask_whole_word = mask_whole_words + self.mask_ratio = args.mask + self.random_ratio = args.mask_random + self.insert_ratio = args.insert + + self.replace_length = args.replace_length + + if self.replace_length not in [-1, 0, 1]: + raise ValueError(f"invalid arg: replace_length={self.replace_length}") + if args.mask_length not in ["subword", "word", "span-poisson"]: + raise ValueError(f"invalid arg: mask-length={args.mask_length}") + if args.mask_length == "subword" and args.replace_length not in [0, 1]: + raise ValueError("if using subwords, use replace-length=1 or 0") + + self.mask_span_distribution = None + if args.mask_length == "span-poisson": + # Text infilling: "A number of text spans are sampled, with span lengths drawn from a Poisson distribution (λ = 3). Each span is replaced with a single [MASK] token. 0-length spans correspond to the insertion of [MASK] tokens." 
+ _lambda = args.poisson_lambda + + lambda_to_the_k = 1 + e_to_the_minus_lambda = math.exp(-_lambda) + k_factorial = 1 + ps = [] + for k in range(0, 128): + ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial) + lambda_to_the_k *= _lambda + k_factorial *= k + 1 + if ps[-1] < 0.0000001: + break + ps = torch.FloatTensor(ps) + self.mask_span_distribution = torch.distributions.Categorical(ps) + + self.epoch = 0 + self.seed = seed + + def _is_phoneme(x): + if re.search("<lang:", x) or x in ( + "<mask>", + "<sil>", + "<pad>", + "<s>", + "</s>", + "<unk>", + ): + return False + return True + + self.voc_valid_ids = torch.LongTensor( + [i for i, x in enumerate(self.src_dict.symbols) if _is_phoneme(x)] + ) + self.voc_valid_size = self.voc_valid_ids.size(0) + + @property + def can_reuse_epoch_itr_across_epochs(self): + return False + + def set_epoch(self, epoch, **unused): + self.epoch = epoch + + def __getitem__(self, index): + tgt_item = self.tgt[index] if self.tgt is not None else None + src_item = copy.deepcopy(self.src[index]) + with data_utils.numpy_seed(self.seed, self.epoch, index): + source = src_item + assert source[-1] == self.eos + if self.mask_ratio > 0: + source = self.add_whole_word_mask(source, self.mask_ratio) + + if self.insert_ratio > 0: + source = self.add_insertion_noise(source, self.insert_ratio) + src_item = source + + if self.append_eos_to_target: + eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos() + if self.tgt and self.tgt[index][-1] != eos: + tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])]) + + if self.append_bos: + bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos() + if self.tgt and self.tgt[index][0] != bos: + tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]]) + + bos = self.src_dict.bos() + if src_item[0] != bos: + src_item = torch.cat([torch.LongTensor([bos]), src_item]) + + if self.remove_eos_from_source: + eos = self.src_dict.eos() + if src_item[-1] == eos: + src_item = src_item[:-1] + + example = { + "id": index, + "source": src_item, + "target": tgt_item, + } + if self.align_dataset is not None: + example["alignment"] = self.align_dataset[index] + if self.constraints is not None: + example["constraints"] = self.constraints[index] + if self.src_lang_id is not None: + example["src_lang_id"] = self.src_lang_id + if self.tgt_lang_id is not None: + example["tgt_lang_id"] = self.tgt_lang_id + return example + + # following functions are borrowed from denoising_dataset + def word_starts(self, source): + if self.mask_whole_word is not None: + is_word_start = self.mask_whole_word.gather(0, source) + else: + is_word_start = torch.ones(source.size()) + is_word_start[0] = 0 + is_word_start[-1] = 0 + return is_word_start + + def add_whole_word_mask(self, source, p): + is_word_start = self.word_starts(source) + num_to_mask = int(math.ceil(is_word_start.float().sum() * p)) + num_inserts = 0 + if num_to_mask == 0: + return source + + if self.mask_span_distribution is not None: + lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,)) + + # Make sure we have enough to mask + cum_length = torch.cumsum(lengths, 0) + while cum_length[-1] < num_to_mask: + lengths = torch.cat( + [ + lengths, + self.mask_span_distribution.sample(sample_shape=(num_to_mask,)), + ], + dim=0, + ) + cum_length = torch.cumsum(lengths, 0) + + # Trim to masking budget + i = 0 + while cum_length[i] < num_to_mask: + i += 1 + lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1]) + num_to_mask = i + 1 + lengths = 
lengths[:num_to_mask] + + # Handle 0-length mask (inserts) separately + lengths = lengths[lengths > 0] + num_inserts = num_to_mask - lengths.size(0) + num_to_mask -= num_inserts + if num_to_mask == 0: + return self.add_insertion_noise(source, num_inserts / source.size(0)) + + assert (lengths > 0).all() + else: + lengths = torch.ones((num_to_mask,)).long() + assert is_word_start[-1] == 0 + word_starts = is_word_start.nonzero(as_tuple=False) + indices = word_starts[ + torch.randperm(word_starts.size(0))[:num_to_mask] + ].squeeze(1) + mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio + + source_length = source.size(0) + assert source_length - 1 not in indices + to_keep = torch.ones(source_length, dtype=torch.bool) + is_word_start[ + -1 + ] = 255 # acts as a long length, so spans don't go over the end of doc + if self.replace_length == 0: + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_idx + source[indices[mask_random]] = self.voc_valid_ids[ + torch.randint(0, self.voc_valid_size - 1, size=(mask_random.sum(),)) + ] + + if self.mask_span_distribution is not None: + assert len(lengths.size()) == 1 + assert lengths.size() == indices.size() + lengths -= 1 + while indices.size(0) > 0: + assert lengths.size() == indices.size() + lengths -= is_word_start[indices + 1].long() + uncompleted = lengths >= 0 + indices = indices[uncompleted] + 1 + mask_random = mask_random[uncompleted] + lengths = lengths[uncompleted] + if self.replace_length != -1: + # delete token + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_idx + source[indices[mask_random]] = self.voc_valid_ids[ + torch.randint( + 0, self.voc_valid_size - 1, size=(mask_random.sum(),) + ) + ] + else: + # A bit faster when all lengths are 1 + while indices.size(0) > 0: + uncompleted = is_word_start[indices + 1] == 0 + indices = indices[uncompleted] + 1 + mask_random = mask_random[uncompleted] + if self.replace_length != -1: + # delete token + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_idx + source[indices[mask_random]] = self.voc_valid_ids[ + torch.randint( + 0, self.voc_valid_size - 1, size=(mask_random.sum(),) + ) + ] + + assert source_length - 1 not in indices + + source = source[to_keep] + + if num_inserts > 0: + source = self.add_insertion_noise(source, num_inserts / source.size(0)) + + return source + + def add_insertion_noise(self, tokens, p): + if p == 0.0: + return tokens + + num_tokens = len(tokens) + n = int(math.ceil(num_tokens * p)) + + noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1 + noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool) + noise_mask[noise_indices] = 1 + result = torch.LongTensor(n + len(tokens)).fill_(-1) + + num_random = int(math.ceil(n * self.random_ratio)) + result[noise_indices[num_random:]] = self.mask_idx + result[noise_indices[:num_random]] = self.voc_valid_ids[ + torch.randint(0, self.voc_valid_size - 1, size=(num_random,)) + ] + + result[~noise_mask] = tokens + + assert (result >= 0).all() + return result diff --git a/examples/speech_text_joint_to_text/docs/ende-mustc.md b/examples/speech_text_joint_to_text/docs/ende-mustc.md new file mode 100644 index 0000000000..1acf6e001b --- /dev/null +++ b/examples/speech_text_joint_to_text/docs/ende-mustc.md @@ -0,0 +1,118 @@ +[[Back]](..) 
+ +# Joint Speech Text Training for the MuST-C English to German Speech Translation task + +Joint Training Baseline: it is based on paper ["A general multi-task learning framework to leverage text data for speech to text tasks"](https://arxiv.org/pdf/2010.11338.pdf) + +Enhanced Joint Training: the joint training is enhanced with pre-trained models, cross attentive regularization and online knowledge distillation based on paper ["Improving Speech Translation by Understanding and Learning from the Auxiliary Text Translation Task"](https://research.fb.com/publications/improving-speech-translation-by-understanding-and-learning-from-the-auxiliary-text-translation-task) + +## Prepare Data +#### Download files +- Sentence piece model [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/spm.model) +- Dictionary [dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/dict.txt) +- config [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/config.yaml) +#### Prepare MuST-C data set +- Please follow the data preparation in the [S2T example](https://github.com/pytorch/fairseq/blob/main/examples/speech_to_text/docs/mustc_example.md) +- Convert source text under the "src_text" column in the tsv file into phoneme representation. +```bash + python examples/speech_text_joint_to_text/scripts/g2p_encode.py \ + --lower-case --do-filter --use-word-start --no-punc \ + --reserve-word examples/speech_text_joint_to_text/configs/mustc_noise.list \ + --data-path ${must_c_en_de_src_text} \ + --out-path ${must_c_en_de_src_text_pho} +``` +- Replace the source text under the "src_text" column in the tsv file with the corresponding phoneme reprentation generated in the step above. +Below is the snapshot for the MuST-C en-de dev tsv +``` +id audio n_frames tgt_text src_text speaker +ted_767_0 en-de/flac.zip:10071514743:48445 56160 Heute spreche ich zu Ihnen über Energie und Klima. ▁AY1 M ▁G OW1 IH0 NG ▁T UW1 ▁T AO1 K ▁T AH0 D EY1 ▁AH0 B AW1 T ▁EH1 N ER0 JH IY0 ▁AH0 N D ▁K L AY1 M AH0 T spk.767_ +ted_767_1 en-de/flac.zip:1214217978:205678 226080 Und das überrascht vielleicht etwas, weil sich meine Vollzeitbeschäftigung bei der Stiftung hauptsächlich um Impfstoffe und Saatgut dreht, um die Dinge, die wir erfinden und liefern müssen um den ärmsten 2 Milliarden ein besseres Leben zu ermöglichen. ▁AH0 N D ▁DH AE1 T ▁M AY1 T ▁S IY1 M ▁AH0 ▁B IH1 T ▁S ER0 P R AY1 Z IH0 NG ▁B IH0 K AO1 Z ▁M AY1 ▁F UH1 L ▁T AY1 M ▁W ER1 K ▁AE1 T ▁DH AH0 ▁F AW0 N D EY1 SH AH0 N ▁IH1 Z ▁M OW1 S T L IY0 ▁AH0 B AW1 T ▁V AE2 K S IY1 N Z ▁AH0 N D ▁S IY1 D Z ▁AH0 B AW1 T ▁DH AH0 ▁TH IH1 NG Z ▁DH AE1 T ▁W IY1 ▁N IY1 D ▁T UW1 ▁IH0 N V EH1 N T ▁AH0 N D ▁D IH0 L IH1 V ER0 ▁T UW1 ▁HH EH1 L P ▁DH AH0 ▁P UH1 R IH0 S T ▁T UW1 ▁B IH1 L Y AH0 N ▁L AY1 V ▁B EH1 T ER0 ▁L IH1 V Z spk.767_ +``` +- Prepare phoneme dictionary and save to $MANIFEST_ROOT as [src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/src_dict.txt) +#### Prepare WMT text data +- [Download wmt data](https://github.com/pytorch/fairseq/blob/main/examples/translation/prepare-wmt14en2de.sh) +- Convert source text (English) into phoneme representation as above +- Generate binary parallel files with "fairseq-preprocess" from fairseq for training and validation. The source input is English phoneme representation and the target input is German sentencepiece token . The output is saved under $parallel_text_data + +## Training +The model is trained with 8 v100 GPUs. 
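+If fewer than 8 GPUs are available, the effective batch size can be kept
+roughly the same by scaling `--update-freq` in the scripts below (an
+illustrative adjustment, not a tuned setting):
+```bash
+# batch-size bookkeeping only:
+# 8 GPUs x --max-tokens 10000 x --update-freq 4  ~=  4 GPUs x --max-tokens 10000 x --update-freq 8
+```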
+ +#### Download pretrained models +- [pretrain_encoder](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_transformer_m.pt) +- [pretrain_nmt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/checkpoint_mt.pt) + +#### Training scripts +- Jointly trained model from scratch +```bash +python train.py ${MANIFEST_ROOT} \ + --save-dir ${save_dir} \ + --num-workers 8 \ + --task speech_text_joint_to_text \ + --arch dualinputs2ttransformer_s \ + --user-dir examples/speech_text_joint_to_text \ + --max-epoch 100 --update-mix-data \ + --optimizer adam --lr-scheduler inverse_sqrt \ + --lr 0.001 --update-freq 4 --clip-norm 10.0 \ + --criterion guided_label_smoothed_cross_entropy_with_accuracy \ + --label-smoothing 0.1 --max-tokens 10000 --max-tokens-text 10000 \ + --max-positions-text 400 --seed 2 --speech-encoder-layers 12 \ + --text-encoder-layers 6 --encoder-shared-layers 6 --decoder-layers 6 \ + --dropout 0.1 --warmup-updates 20000 \ + --text-sample-ratio 0.25 --parallel-text-data ${parallel_text_data} \ + --text-input-cost-ratio 0.5 --enc-grad-mult 2.0 --add-speech-eos \ + --log-format json --langpairs en-de --noise-token '"'"'▁NOISE'"'"' \ + --mask-text-ratio 0.0 --max-tokens-valid 20000 --ddp-backend no_c10d \ + --log-interval 100 --data-buffer-size 50 --config-yaml config.yaml \ + --keep-last-epochs 10 +``` +- Jointly trained model with good initialization, cross attentive loss and online knowledge distillation +```bash +python train.py ${MANIFEST_ROOT} \ + --save-dir ${save_dir} \ + --num-workers 8 \ + --task speech_text_joint_to_text \ + --arch dualinputs2ttransformer_m \ + --user-dir examples/speech_text_joint_to_text \ + --max-epoch 100 --update-mix-data \ + --optimizer adam --lr-scheduler inverse_sqrt \ + --lr 0.002 --update-freq 4 --clip-norm 10.0 \ + --criterion guided_label_smoothed_cross_entropy_with_accuracy \ + --guide-alpha 0.8 --disable-text-guide-update-num 5000 \ + --label-smoothing 0.1 --max-tokens 10000 --max-tokens-text 10000 \ + --max-positions-text 400 --seed 2 --speech-encoder-layers 12 \ + --text-encoder-layers 6 --encoder-shared-layers 6 --decoder-layers 6 \ + --dropout 0.1 --warmup-updates 20000 --attentive-cost-regularization 0.02 \ + --text-sample-ratio 0.25 --parallel-text-data ${parallel_text_data} \ + --text-input-cost-ratio 0.5 --enc-grad-mult 2.0 --add-speech-eos \ + --log-format json --langpairs en-de --noise-token '"'"'▁NOISE'"'"' \ + --mask-text-ratio 0.0 --max-tokens-valid 20000 --ddp-backend no_c10d \ + --log-interval 100 --data-buffer-size 50 --config-yaml config.yaml \ + --load-pretrain-speech-encoder ${pretrain_encoder} \ + --load-pretrain-decoder ${pretrain_nmt} \ + --load-pretrain-text-encoder-last ${pretrain_nmt} \ + --keep-last-epochs 10 +``` + +## Evaluation +```bash +python ./fairseq_cli/generate.py \ + ${MANIFEST_ROOT} \ + --task speech_text_joint_to_text \ + --max-tokens 25000 \ + --nbest 1 \ + --results-path ${infer_results} \ + --batch-size 512 \ + --path ${model} \ + --gen-subset tst-COMMON_st \ + --config-yaml config.yaml \ + --scoring sacrebleu \ + --beam 5 --lenpen 1.0 \ + --user-dir examples/speech_text_joint_to_text \ + --load-speech-only +``` + +## Results (Joint training with initialization + CAR + online KD) +|Direction|En-De | En-Es | En-Fr | +|---|---|---|---| +|BLEU|27.4| 31.2 | 37.6 | +|checkpoint | [link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/checkpoint_ave_10.pt) 
|[link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_es/checkpoint_ave_10.pt)|[link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_fr/checkpoint_ave_10.pt)| diff --git a/examples/speech_text_joint_to_text/docs/iwslt2021.md b/examples/speech_text_joint_to_text/docs/iwslt2021.md new file mode 100644 index 0000000000..0af0fbff1b --- /dev/null +++ b/examples/speech_text_joint_to_text/docs/iwslt2021.md @@ -0,0 +1,76 @@ +[[Back]](..) + +# Joint Speech Text Training for the 2021 IWSLT multilingual speech translation + +This directory contains the code from paper ["FST: the FAIR Speech Translation System for the IWSLT21 Multilingual Shared Task"](https://arxiv.org/pdf/2107.06959.pdf). + +## Prepare Data +#### Download files +- Sentence piece model [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/spm.model) +- Dictionary [tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/dict.txt) +- Config [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/config.yaml) + +#### Prepare +- Please follow the data preparation in [speech-to-text](https://github.com/pytorch/fairseq/blob/main/examples/speech_to_text/docs/mtedx_example.md) with option "--use-audio-input" for raw audio tsv files. +- Prepare tsv files with phoneme based source text (under column 'src_text') as [MuST-C](ende-mustc.md) example. + + +## Training + +#### Download pretrained models +- [Pretrained mbart model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/mbart.pt) +- [Pretrained w2v model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/xlsr_53_56k.pt) + + +#### Training scripts + +```bash +python train.py ${MANIFEST_ROOT} \ + --save-dir ${save_dir} \ + --user-dir examples/speech_text_joint_to_text \ + --train-subset train_es_en_tedx,train_es_es_tedx,train_fr_en_tedx,train_fr_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_en_tedx,train_pt_pt_tedx \ + --valid-subset valid_es_en_tedx,valid_es_es_tedx,valid_es_fr_tedx,valid_es_it_tedx,valid_es_pt_tedx,valid_fr_en_tedx,valid_fr_es_tedx,valid_fr_fr_tedx,valid_fr_pt_tedx,valid_it_en_tedx,valid_it_es_tedx,valid_it_it_tedx,valid_pt_en_tedx,valid_pt_es_tedx,valid_pt_pt_tedx \ + --config-yaml config.yaml --ddp-backend no_c10d \ + --num-workers 2 --task speech_text_joint_to_text \ + --criterion guided_label_smoothed_cross_entropy_with_accuracy \ + --label-smoothing 0.3 --guide-alpha 0.8 \ + --disable-text-guide-update-num 5000 --arch dualinputxmtransformer_base \ + --max-tokens 500000 --max-sentences 3 --max-tokens-valid 800000 \ + --max-source-positions 800000 --enc-grad-mult 2.0 \ + --attentive-cost-regularization 0.02 --optimizer adam \ + --clip-norm 1.0 --log-format simple --log-interval 200 \ + --keep-last-epochs 5 --seed 1 \ + --w2v-path ${w2v_path} \ + --load-pretrained-mbart-from ${mbart_path} \ + --max-update 1000000 --update-freq 4 \ + --skip-invalid-size-inputs-valid-test \ + --skip-encoder-projection --save-interval 1 \ + --attention-dropout 0.3 --mbart-dropout 0.3 \ + --finetune-w2v-params all --finetune-mbart-decoder-params all \ + --finetune-mbart-encoder-params all --stack-w2v-mbart-encoder \ + --drop-w2v-layers 12 --normalize \ + --lr 5e-05 --lr-scheduler inverse_sqrt --warmup-updates 5000 +``` + +## Evaluation +```bash +python ./fairseq_cli/generate.py + ${MANIFEST_ROOT} \ + --task speech_text_joint_to_text \ + --user-dir ./examples/speech_text_joint_to_text \ + --load-speech-only --gen-subset 
test_es_en_tedx \ + --path ${model} \ + --max-source-positions 800000 \ + --skip-invalid-size-inputs-valid-test \ + --config-yaml config.yaml \ + --infer-target-lang en \ + --max-tokens 800000 \ + --beam 5 \ + --results-path ${RESULTS_DIR} \ + --scoring sacrebleu +``` +The trained model can be downloaded [here](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/checkpoint17.pt). + +|direction|es_en|fr_en|pt_en|it_en|fr_es|pt_es|it_es|es_es|fr_fr|pt_pt|it_it| +|---|---|---|---|---|---|---|---|---|---|---|---| +|BLEU|31.62|36.93|35.07|27.12|38.87|35.57|34.13|74.59|74.64|70.84|69.76| diff --git a/examples/speech_text_joint_to_text/docs/pre-training.md b/examples/speech_text_joint_to_text/docs/pre-training.md new file mode 100644 index 0000000000..6d9e2cb0bb --- /dev/null +++ b/examples/speech_text_joint_to_text/docs/pre-training.md @@ -0,0 +1,192 @@ +[[Back]](..) + +# Unified Speech-Text Pre-training for Speech Translation and Recognition + +This directory contains the pre-training recipes from the paper ["Unified Speech-Text Pre-training for Speech Translation and Recognition"](https://arxiv.org/abs/2204.05409). + +## Librispeech ASR Pre-training +### Prepare Data +#### Download files +#### Prepare pre-training data +- Text to text task (T2T): prepare the binary data following steps similar to those in [EN_DE Joint training](./ende-mustc.md). The source data is represented as a phoneme token sequence and the target data is encoded as subword tokens via SentencePiece. The text data is downloaded from [openslr](https://www.openslr.org/12). +- Self-supervised speech learning task (SSL): The data is prepared as in [wav2vec 2.0](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec/README.md). +- Speech to phoneme classification task (S2P): The tsv file contains 5 fields: "id", "audio", "n_frames", "tgt_text", and "align". The tgt_text field corresponds to the phoneme-based representation of the speech data, and the "align" field contains the alignment information. The phoneme-level forced alignment for the labelled speech data (i.e., Librispeech) can be obtained via [kaldi](http://kaldi-asr.org) or [MFA](https://montrealcorpustools.github.io/Montreal-Forced-Aligner/). The segmentation information is normalized to 0$\sim$1 over the whole utterance (a conversion sketch is given at the end of this Prepare Data section).
The snapshot of the tsv file is below: +``` +id audio n_frames tgt_text align +116-288045-0000 /librispeech/dev-other/116/288045/116-288045-0000.flac 170400 <sil> ▁AE1 Z AY1 ▁AH0 P R OW1 CH T ▁DH AH1 ▁S IH1 T IY0 <sil> AY1 ▁HH ER1 D ▁B EH1 L Z ▁R IH1 NG IH0 NG <sil> ▁AE1 N D AH0 ▁L IH1 T AH0 L ▁L EY1 T ER0 AY1 ▁F AW1 N D ▁DH AH0 ▁S T R IY1 T S ▁AH0 S T IH1 R ▁W IH0 TH ▁TH R AO1 NG Z ▁AH0 V ▁W EH1 L ▁D R EH1 S T ▁P IY1 P AH0 L ▁IH1 N ▁F AE1 M L IY0 ▁G R UW1 P S <sil> ▁W EH1 N D IH0 NG ▁DH EH1 R ▁W EY1 <sil> ▁HH IH1 DH ER0 ▁AH0 N D ▁TH IH1 DH ER0 <sil> 0.047977 0.056444 0.064911 0.075259 0.081844 0.089370 0.095014 0.104421 0.109125 0.111947 0.115710 0.120414 0.134525 0.141110 0.143932 0.174036 0.176858 0.190028 0.199436 0.207902 0.218250 0.224835 0.231421 0.242709 0.251176 0.257761 0.263405 0.268109 0.270931 0.290687 0.342427 0.349953 0.353716 0.356538 0.360301 0.363123 0.365945 0.368768 0.371590 0.376294 0.384760 0.394167 0.401693 0.409219 0.419567 0.430856 0.441204 0.444026 0.446849 0.449671 0.456256 0.463782 0.471308 0.477893 0.486359 0.491063 0.494826 0.501411 0.512700 0.517404 0.520226 0.534337 0.540922 0.545626 0.550329 0.559737 0.568203 0.583255 0.592662 0.600188 0.603951 0.611477 0.619003 0.624647 0.634055 0.639699 0.646284 0.653810 0.659454 0.664158 0.670743 0.682032 0.687676 0.692380 0.708373 0.713076 0.719661 0.729069 0.740357 0.744120 0.748824 0.752587 0.761994 0.770461 0.781750 0.790216 0.805268 0.808090 0.823142 0.832549 0.836312 0.840075 0.843838 0.851364 0.854186 0.857008 0.862653 0.878645 0.898401 0.901223 0.906867 0.913452 0.920038 0.926623 0.934149 0.939793 0.942615 0.945437 0.952023 0.957667 0.977422 1.000000 + +``` +- Speech to text task (S2T): The data preparation follow the steps in [EN_DE Joint training](./ende-mustc.md). + +#### Prepare fine-tuning data: +We re-use the data from T2T and S2T tasks in the fine-tuning stage. 
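+
+To make the "align" field concrete, here is a minimal sketch (illustration only, not the loader used by the task; the function name, variable names, and the frame count are made up) of how the normalized phoneme end boundaries of one utterance can be expanded into a frame-level label sequence, which is essentially the target of the S2P task:
+```python
+# Expand normalized "align" boundaries into one phoneme label per output frame.
+# Illustration only; the actual expansion happens inside the fairseq task/criterion.
+def expand_alignment(phonemes, boundaries, num_output_frames):
+    # phonemes: tokens from "tgt_text"; boundaries: floats from "align",
+    # one cumulative end position in [0, 1] per phoneme (the last one is 1.0).
+    assert len(phonemes) == len(boundaries)
+    labels, start = [], 0
+    for phoneme, end_frac in zip(phonemes, boundaries):
+        end = round(end_frac * num_output_frames)
+        labels.extend([phoneme] * (end - start))
+        start = end
+    return labels
+
+# Toy example with the first few tokens of the tsv snapshot above (frame count is made up):
+print(expand_alignment(["<sil>", "▁AE1", "Z"], [0.047977, 0.056444, 1.0], 100))
+```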
+ +### Model Build +#### Pre-training +``` +python train.py $T2T_DATA \ + --save-dir $SAVE_PRE_PATH --user-dir examples/speech_text_joint_to_text --task speech_text_joint_denoising \ + --criterion speech_text_pretrain_cross_entropy --optimizer adam --weight-decay 0.01 --config-yaml config_s2p.yaml --config-s2s-yaml config.yaml --ddp-backend no_c10d \ + --lang-pairs pho-wrd --num-workers 4 --log-interval 500 --save-interval-updates 5000 --keep-interval-updates 1 --no-emb-update-unsup --report-accuracy --lr 0.001 --end-learning-rate 1e-06 \ + --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 800000 --update-freq 6 --validate-interval-updates 10000 --train-subset train \ + --valid-subset valid,valid_sup_speech,valid_sup_speech_s2s,valid_unsup_speech --dataset-impl mmap \ + --sup-speech-data $S2P_DATA_PATH --sup-speech-train-subset train_960.ali --sup-speech-valid-subset dev-clean.ali --sup-speech-s2s-data $S2T_DATA_PATH \ + --sup-speech-s2s-train-subset train --sup-speech-s2s-valid-subset dev-clean --unsup-speech-train-data $SSL_DATA_PATH/train.tsv --unsup-speech-valid-data $SSL_DATA_PATH/valid.tsv \ + --batch-size 200 --batch-size-valid 150 --max-source-positions 1024 --max-target-positions 1024 --max-text-tokens 3072 --max-speech-positions 600000 \ + --max-sample-size 750000 --min-sample-size 64000 --max-speech-tokens 750000 --max-tokens-valid 750000 --skip-invalid-size-inputs-valid-test \ + --unsupervised-speech-sample-ratio 3.0 --supervised-speech-sample-ratio 5 --supervised-speech-s2s-sample-ratio 5 --text-sample-ratio 1.0 --mask 0.3 --mask-random 0.1 \ + --mask-length span-poisson --speech-sup-mask-prob 0.3 --speech-unsup-mask-prob 0.7 --use-mask-whole-words --arch speech_text_pretrain_bart_base_stack \ + --no-scale-feature --activation-fn gelu --speech-extractor-mode default --stacked-encoder all --encoder-normalize-before --decoder-normalize-before \ + --encoder-learned-pos --decoder-learned-pos --dropout 0.1 --load-pretrained-mbart-encoder-from $BART --load-pretrained-mbart-decoder-from $BART +``` +The current implementation also supports model pre-training without the forced-alignment supervised data. In this case, CTC is used to optimize the S2P task (a toy comparison of the two objectives is sketched at the end of this subsection). The following changes to the setting are needed: +1. Options to be added: +``` +--use-sup-speech-ctc --criterion speech_text_pretrain_compound +``` +2. Options to be deleted: +``` +--same-data-update --criterion speech_text_pretrain_cross_entropy +``` +However, we find that the CTC-based pre-training is still worse than the forced-alignment-based setting. This may be partly because we re-use the forced-alignment pre-training configuration for the CTC-based pre-training, which is likely suboptimal for it.
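+
+To make the distinction between the two S2P objectives explicit, the toy sketch below (illustrative shapes and random tensors only, not code from either criterion) contrasts frame-level cross-entropy, which needs a phoneme label per frame from the forced alignment, with CTC, which only needs the phoneme sequence and marginalizes over all monotonic alignments:
+```python
+import torch
+import torch.nn.functional as F
+
+torch.manual_seed(0)
+T, B, V = 50, 2, 40  # frames, batch size, phoneme vocabulary (made-up sizes)
+log_probs = F.log_softmax(torch.randn(T, B, V), dim=-1)
+
+# Forced-alignment setting: one phoneme label per frame -> frame-level cross-entropy.
+frame_labels = torch.randint(1, V, (T, B))
+ce_loss = F.nll_loss(log_probs.view(T * B, V), frame_labels.view(T * B))
+
+# No-alignment setting: only the phoneme sequence is known -> CTC (index 0 used as blank).
+targets = torch.randint(1, V, (B, 12))
+input_lengths = torch.full((B,), T, dtype=torch.long)
+target_lengths = torch.full((B,), 12, dtype=torch.long)
+ctc_loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0)
+
+print(ce_loss.item(), ctc_loss.item())
+```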
+ +#### Fine-tuning +``` +python train.py $S2T_DATA_PATH \ + --save-dir $SAVE_FT_PATH --num-workers 8 --task speech_text_joint_to_text --arch dualinputs2twavtransformer_base_stack \ + --user-dir examples/speech_text_joint_to_text --max-update 100000 --optimizer adam --lr-scheduler inverse_sqrt --lr 0.0003 --update-freq 3 --clip-norm 10.0 \ + --criterion guided_label_smoothed_cross_entropy_with_accuracy --guide-alpha 0.8 --label-smoothing 0.1 --warmup-updates 20000 --attentive-cost-regularization 0.02 \ + --enc-grad-mult 2.0 --max-tokens 800000 --max-source-positions 800000 --max-tokens-text 10000 --max-positions-text 1024 --max-target-positions 1024 --no-scale-feature \ + --activation-fn gelu --load-pretrained-speech-text-encoder $SAVE_PRE_PATH/checkpoint_last.pt --load-pretrained-speech-text-decoder $SAVE_PRE_PATH/checkpoint_last.pt \ + --encoder-normalize-before --decoder-normalize-before --speech-extractor-mode default --speech-mask-channel-length 64 --speech-mask-channel-prob 0.5 \ + --speech-mask-length 10 --speech-mask-prob 0.65 --text-sample-ratio 0.25 --mask-text-ratio 0.3 --mask-text-type random --parallel-text-data text_bin \ + --text-input-cost-ratio 0.5 --langpairs pho-wrd --update-mix-data --log-format json --max-tokens-valid 800000 --ddp-backend no_c10d --log-interval 500 \ + --config-yaml config.yaml --skip-invalid-size-inputs-valid-test --keep-last-epochs 50 --layernorm-embedding --encoder-learned-pos --decoder-learned-pos +``` + +### Evaluation +The last 10 epoch checkpoints from fine-tuning are averaged to obtain $FINAL_MODEL (see the averaging sketch at the end of this section). +``` +python ./fairseq_cli/generate.py \ + $S2T_DATA_PATH \ + --task speech_text_joint_to_text \ + --max-tokens 800000 \ + --max-source-positions 800000 \ + --nbest 1 \ + --results-path $RESULTS_LOG \ + --batch-size 512 \ + --path $FINAL_MODEL \ + --gen-subset $SUBSET \ + --config-yaml config.yaml \ + --scoring wer \ + --beam 10 --lenpen 1.0 \ + --user-dir examples/speech_text_joint_to_text --load-speech-only \ + --model-overrides "{'load_pretrained_speech_text_decoder':'','load_pretrained_speech_text_encoder':''}" +``` + +### Results and models +| | dev-clean | dev-other | test-clean | test-other | +|---|---|---|---|---| +| WER | 2.0 | 4.4 | 2.1 | 4.6 | + +**Model Links**: +- [config_s2p.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/config_s2p.yaml): Config for S2P +- [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/spm.model): SentencePiece model +- [src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/src_dict.txt): Source Phoneme Dictionary +- [tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/tgt_dict.txt): Target SentencePiece Dictionary +- [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/config.yaml): Config for S2T +- [BART](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/bart.pt): BART model trained on Librispeech text data +- [Joint Pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/checkpoint6.pt): model pre-trained with 960 hours of Librispeech data (S2P, S2T), Librispeech text training data (T2T) and Librilight data (SSL) +- [Fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/checkpoint_ave_10.pt): the pre-trained model fine-tuned on 960 hours of
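+
+For reference, the checkpoint averaging used in the evaluation above can be done with fairseq's `scripts/average_checkpoints.py`; the snippet below is only a rough sketch of the same idea (the checkpoint directory, epoch range, and output name are placeholders):
+```python
+import torch
+
+save_dir = "path/to/SAVE_FT_PATH"  # placeholder for the fine-tuning save directory
+paths = [f"{save_dir}/checkpoint{epoch}.pt" for epoch in range(16, 26)]  # last 10 epochs (placeholder range)
+
+avg, state = None, None
+for path in paths:
+    state = torch.load(path, map_location="cpu")
+    model = state["model"]
+    if avg is None:
+        # keep non-float entries as-is; accumulate float parameters in double precision
+        avg = {k: (v.double() if torch.is_floating_point(v) else v.clone()) for k, v in model.items()}
+    else:
+        for k, v in model.items():
+            if torch.is_floating_point(v):
+                avg[k] += v.double()
+
+for k, v in avg.items():
+    if torch.is_floating_point(v):
+        avg[k] = (v / len(paths)).float()
+
+state["model"] = avg  # reuse the metadata of the last loaded checkpoint
+torch.save(state, f"{save_dir}/checkpoint_ave_10.pt")
+```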
Librispeech speech and text data (S2T + T2T) + +## MuST-C +### Prepare Data +Compared with the Librispeech ASR recipe above, the differences are as follows: +- The speech data is replaced with the corresponding MuST-C data +- The Librispeech text data is replaced with parallel text data from WMT + +### Model Build +#### Pre-training +EN-FR is used as the example below: +``` +python train.py $TXT_DATA \ + --save-dir $SAVE_PRE_PATH --user-dir examples/speech_text_joint_to_text --task speech_text_joint_denoising --criterion speech_text_pretrain_cross_entropy --optimizer adam --weight-decay 0.01 \ + --config-yaml config_s2p.yaml --config-s2s-yaml config.yaml --ddp-backend no_c10d --lang-pairs-bitext en-fr --num-workers 4 --log-interval 500 --save-interval-updates 5000 --keep-interval-updates 1 \ + --no-emb-update-unsup --use-decoder-output-proj --report-accuracy --lr 0.001 --end-learning-rate 1e-06 --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 800000 \ + --update-freq 8 --validate-interval-updates 10000 --train-subset train --valid-subset valid_sup_speech,valid_sup_speech_s2s,valid_unsup_speech --dataset-impl mmap \ + --sup-speech-data $S2P_DATA_PATH --sup-speech-train-subset train --sup-speech-valid-subset dev --sup-speech-s2s-data $S2T_DATA_PATH --sup-speech-s2s-train-subset train \ + --sup-speech-s2s-valid-subset dev --unsup-speech-train-data $SSL_DATA_PATH/train.tsv --unsup-speech-valid-data $SSL_DATA_PATH/valid.tsv --batch-size 200 --batch-size-valid 100 \ + --max-source-positions 1024 --max-target-positions 1024 --max-text-tokens 2048 --max-speech-positions 600000 --max-sample-size 600000 --min-sample-size 64000 \ + --max-speech-tokens 600000 --max-tokens-valid 600000 --skip-invalid-size-inputs-valid-test --unsupervised-speech-sample-ratio 1.2 --supervised-speech-sample-ratio 10 \ + --supervised-speech-s2s-sample-ratio 10 --bitext-sample-ratio 0.5 --mask 0.3 --mask-random 0.1 --mask-length span-poisson --speech-sup-mask-prob 0.3 \ + --speech-unsup-mask-prob 0.7 --use-mask-whole-words --arch speech_text_pretrain_bart_base_stack --no-scale-feature --activation-fn gelu --speech-extractor-mode default \ + --stacked-encoder s2s --encoder-normalize-before --decoder-normalize-before --encoder-learned-pos --decoder-learned-pos --dropout 0.1 \ + --load-pretrained-mbart-encoder-from $EN_FR_NMT --load-pretrained-mbart-decoder-from $EN_FR_NMT +``` +#### Fine-tuning +``` +python train.py $S2T_DATA_PATH \ + --save-dir $SAVE_FT_PATH --num-workers 8 --task speech_text_joint_to_text --arch dualinputs2twavtransformer_base_stack --user-dir examples/speech_text_joint_to_text \ + --max-epoch 25 --update-mix-data --optimizer adam --lr-scheduler inverse_sqrt --lr 0.0003 --update-freq 4 --clip-norm 10.0 --warmup-updates 20000 \ + --criterion guided_label_smoothed_cross_entropy_with_accuracy --guide-alpha 0.8 --attentive-cost-regularization 0.02 --enc-grad-mult 2.0 --label-smoothing 0.1 \ + --max-tokens 800000 --max-source-positions 800000 --max-tokens-text 10000 --max-positions-text 1024 --load-pretrained-speech-text-encoder $SAVE_PRE_PATH/checkpoint_last.pt \ + --load-pretrained-speech-text-decoder $SAVE_PRE_PATH/checkpoint_last.pt --speech-mask-channel-length 64 --speech-mask-channel-prob 0.5 --speech-mask-length 10 \ + --speech-mask-prob 0.65 --text-sample-ratio 0.05 --mask-text-ratio 0.3 --mask-text-type random --parallel-text-data data-bin-wt --text-input-cost-ratio 0.5 \ + --langpairs en-fr --log-format json --max-tokens-valid 800000 --ddp-backend no_c10d --log-interval 100 --config-yaml config.yaml
--skip-invalid-size-inputs-valid-test \ + --noise-token '▁NOISE' --keep-last-epochs 40 --layernorm-embedding --encoder-learned-pos --decoder-learned-pos --activation-fn gelu \ + --speech-extractor-mode default --max-target-positions 1024 --encoder-normalize-before --decoder-normalize-before +``` + +### Evaluation +The last 10 epoch models from fine-tuning is conducted model average to get $FINAL_MODEL +``` +python fairseq_cli/generate.py \ + $S2T_DATA_PATH \ + --task speech_text_joint_to_text \ + --nbest 1 \ + --max-tokens 800000 \ + --max-source-positions 800000 \ + --results-path $RESULTS_LOG \ + --batch-size 512 \ + --path $FINAL_MODEL \ + --gen-subset $SUBSET \ + --config-yaml config.yaml \ + --scoring sacrebleu \ + --beam 10 --lenpen 1.0 examples/speech_text_joint_to_text \ + --user-dir examples/speech_text_joint_to_text --load-speech-only \ + --model-overrides {'load_pretrained_speech_text_decoder':'','load_pretrained_speech_text_encoder':''} +``` + + +### Results and models +| | en-fr | en-es | en-de | +|---|---|---|---| +| BLEU| 39.7 | 33.2 |29.2 | + + +**Model Links**: +1. DE + - [de config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/config.yaml) + - [de src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/src_dict.txt) + - [de tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/tgt_dict.txt) + - [de spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/spm.model) + - [de pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/nmt.pt) + - [de pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/checkpoint_pretraing.pt) + - [de fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/checkpoint_finetune_ave10.pt) +2. ES + - [es config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/config.yaml) + - [es src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/src_dict.txt) + - [es tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/tgt_dict.txt) + - [es spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/spm.model) + - [es pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/nmt.pt) + - [es pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/checkpoint_pretraing.pt) + - [es fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/checkpoint_finetune_ave10.pt) +3. FR + - [fr config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/config.yaml) + - [fr src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/src_dict.txt) + - [fr tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/tgt_dict.txt) + - [fr spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/spm.model) + - [fr pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/nmt.pt) + - [fr pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/checkpoint_pretraing.pt) + - [fr fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/checkpoint_finetune_ave10.pt) +4. 
[config_s2p.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/config_s2p.yaml) diff --git a/examples/speech_text_joint_to_text/models/__init__.py b/examples/speech_text_joint_to_text/models/__init__.py new file mode 100644 index 0000000000..5fc5d9e21b --- /dev/null +++ b/examples/speech_text_joint_to_text/models/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +import os + diff --git a/examples/speech_text_joint_to_text/models/joint_speech_text_pretrain_transformer.py b/examples/speech_text_joint_to_text/models/joint_speech_text_pretrain_transformer.py new file mode 100644 index 0000000000..6f917398a5 --- /dev/null +++ b/examples/speech_text_joint_to_text/models/joint_speech_text_pretrain_transformer.py @@ -0,0 +1,698 @@ +#!/usr/bin/env python3 + +import logging +from collections import OrderedDict, namedtuple +from typing import Dict, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from fairseq import checkpoint_utils, utils +from fairseq.file_io import PathManager +from fairseq.models import ( + FairseqDecoder, + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_text import ( + MultiInputDecoder, + MultiModalityEncoder, + SpeechWavTransformerEncoder, + StackedSpeechWavTransformerEncoder, +) +from fairseq.models.transformer import ( + TransformerDecoder, + TransformerEncoder, + TransformerModel, +) + +logger = logging.getLogger(__name__) + + +class SpeechTextPreTrainEncoder(MultiModalityEncoder): + def __init__( + self, + dictionary, + sup_speech_encoder, + sup_s2s_speech_encoder, + unsup_speech_encoder, + text_encoder, + ): + super().__init__(dictionary) + self.sup_speech_encoder = sup_speech_encoder + self.sup_s2s_speech_encoder = sup_s2s_speech_encoder + self.unsup_speech_encoder = unsup_speech_encoder + self.text_encoder = text_encoder + + @classmethod + def update_transformer_encoder_cfg(cls, args, update_dict): + cfg = dict(args._get_kwargs()) + for fkey in update_dict.keys(): + cfg[fkey] = update_dict[fkey] + cfg.pop("_name", None) # remove keys start with _ + model_args = namedtuple("args", cfg.keys())(*cfg.values()) + return model_args + + @classmethod + def build_text_encoder(cls, args, src_dictionary): + enc_emb = nn.Embedding( + len(src_dictionary), args.encoder_embed_dim, src_dictionary.pad() + ) + model_args = cls.update_transformer_encoder_cfg( + args, {"encoder_layers": args.text_encoder_layers} + ) + text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb) + return text_encoder + + @classmethod + def build_speech_encoder(cls, args): + model_args = cls.update_transformer_encoder_cfg( + args, + { + "encoder_layers": args.speech_encoder_layers, + "speech_mask_prob": args.speech_sup_mask_prob, + }, + ) + speech_encoder = SpeechWavTransformerEncoder(model_args) + return speech_encoder + + @classmethod + def share_layers(cls, src_layers, tgt_layers): # share layer but not dropout + # share parameters in src_layers with tgt_layers + assert len(src_layers) == len(tgt_layers) + for i, ly in enumerate(src_layers): + tly = tgt_layers[i] + tly.self_attn = ly.self_attn + tly.self_attn_layer_norm = ly.self_attn_layer_norm + tly.activation_fn = ly.activation_fn + tly.normalize_before = ly.normalize_before + tly.fc1 = ly.fc1 + tly.fc2 = ly.fc2 + 
tly.final_layer_norm = ly.final_layer_norm + if hasattr(tly, "encoder_attn"): + tly.encoder_attn = ly.encoder_attn + tly.encoder_attn_layer_norm = ly.encoder_attn_layer_norm + return tgt_layers + + @classmethod + def build_unsup_speech_encoder(cls, args, sup_speech_encoder): + model_args = cls.update_transformer_encoder_cfg( + args, + { + "encoder_layers": args.speech_encoder_layers, + "speech_mask_prob": args.speech_unsup_mask_prob, + "encoder_layerdrop": 0.0, + "decoder_layerdrop": 0.0, + "dropout": args.speech_unsup_dropout, + "activation_dropout": args.speech_unsup_dropout, + "attention_dropout": 0.0, + "dropout_features": args.speech_unsup_feature_dropout, + "dropout_input": args.speech_unsup_feature_dropout, + }, + ) + + unsup_speech_encoder = SpeechWavTransformerEncoder(model_args, alway_mask=True) + unsup_speech_encoder.layer_norm = sup_speech_encoder.layer_norm + unsup_speech_encoder.layers = cls.share_layers( + sup_speech_encoder.layers, unsup_speech_encoder.layers + ) + unsup_speech_encoder.mask_emb = sup_speech_encoder.mask_emb + unsup_speech_encoder.embed_positions = sup_speech_encoder.embed_positions + unsup_speech_encoder.feat_layer_norm = sup_speech_encoder.feat_layer_norm + unsup_speech_encoder.feat_proj = sup_speech_encoder.feat_proj + unsup_speech_encoder.subsample = sup_speech_encoder.subsample + return unsup_speech_encoder + + @classmethod + def build_encoder(cls, args, dictionary): + text_encoder = cls.build_text_encoder(args, dictionary) + if getattr(args, "load_pretrained_mbart_encoder_from", None): + text_encoder = checkpoint_utils.load_pretrained_component_from_model( + component=text_encoder, + checkpoint=args.load_pretrained_mbart_encoder_from, + ) + speech_encoder = cls.build_speech_encoder(args) + if getattr(args, "load_pretrained_feature_extractor_from", None): + + def load_feature_extractor(component, checkpoint): + if not PathManager.exists(checkpoint): + raise IOError("Model file not found: {}".format(checkpoint)) + state = checkpoint_utils.load_checkpoint_to_cpu(checkpoint) + component_state_dict = OrderedDict() + + component_prefix = "feature_extractor" + for key in state["model"].keys(): + if key.startswith(component_prefix): + component_subkey = key[len(component_prefix) + 1 :] + component_state_dict[component_subkey] = state["model"][key] + component.load_state_dict(component_state_dict, strict=True) + return component + + speech_encoder.subsample = load_feature_extractor( + speech_encoder.subsample, args.load_pretrained_feature_extractor_from + ) + speech_s2s_encoder = speech_encoder + unsup_speech_encoder = cls.build_unsup_speech_encoder(args, speech_encoder) + if getattr(args, "stacked_encoder", "none") != "none": + if args.encoder_shared_text_layers_from_begin > 0: + raise ValueError( + "We can not stack encoders and share encoders at the same time!" 
+ ) + speech_s2s_encoder = StackedSpeechWavTransformerEncoder( + speech_encoder, text_encoder.layers, text_encoder.layer_norm + ) + if args.stacked_encoder == "all": + speech_encoder = speech_s2s_encoder + unsup_speech_encoder = StackedSpeechWavTransformerEncoder( + unsup_speech_encoder, text_encoder.layers, text_encoder.layer_norm + ) + else: + cls.share_speech_text_encoder( + speech_encoder, text_encoder, args.encoder_shared_text_layers_from_begin + ) + return SpeechTextPreTrainEncoder( + dictionary, + speech_encoder, + speech_s2s_encoder, + unsup_speech_encoder, + text_encoder, + ) + + @classmethod + def share_speech_text_encoder( + cls, speech_encoder, text_encoder, shared_layers_from_begin + ): + if shared_layers_from_begin > 0: + num_text_encoder_layers = len(text_encoder.layers) + assert len(speech_encoder.layers) >= shared_layers_from_begin + assert num_text_encoder_layers >= shared_layers_from_begin + assert len(speech_encoder.layers) >= num_text_encoder_layers + for i, ly in enumerate( + speech_encoder.layers[ + -num_text_encoder_layers : -num_text_encoder_layers + + shared_layers_from_begin + ] + ): + assert isinstance(text_encoder.layers[i], type(ly)) + text_encoder.layers[i] = ly + + def select_encoder(self, mode, **kwargs): + if mode in ("speech", "sup_speech_ctc", "sup_speech_ali", "sup_speech_s2s"): + kwargs["features_only"] = True + if mode == "sup_speech_s2s": + return self.sup_s2s_speech_encoder, kwargs + return self.sup_speech_encoder, kwargs + elif mode == "unsup_speech": + kwargs["features_only"] = False + return self.unsup_speech_encoder, kwargs + elif mode in ("text", "bitext"): + return self.text_encoder, kwargs + else: + raise NotImplementedError(f"{mode} is not supported") + return None, kwargs + + def forward(self, src_tokens, src_lengths=None, mode="", alignment=None, **kwargs): + return super().forward(src_tokens, src_lengths, mode, **kwargs) + + +# SpeechDummyDecoder works as an extension of encoder, so we could fit encoder only training into seq2seq training +class SpeechDummyDecoder(FairseqDecoder): + def __init__( + self, + dictionary, + output_embedding, + no_emb_update_unsup=False, + use_output_proj=False, + ): + super().__init__(dictionary) + self.output_embedding = output_embedding + num_embedding, num_dim = self.output_embedding.weight.size() + self.out_proj = ( + None if use_output_proj is False else nn.Linear(num_dim, num_dim) + ) + self.no_emb_update_unsup = no_emb_update_unsup + + def extend_alignment(self, alignment, src_lengths, prev_output_tokens): + # alignment: B X N + # src_lengths: B X T + # prev_output_tokens: B X (N + 1) + tgt_tokens = prev_output_tokens[ + :, 1: + ] # remove the leading start of sentence token + ext_alignment = ( + torch.ones(len(src_lengths), src_lengths.max(), device=src_lengths.device) + .long() + .fill_(self.dictionary.pad()) + ) + for bs in range(src_lengths.size(0)): + tgt_length = tgt_tokens[bs].ne(self.dictionary.pad()).sum().item() + assert tgt_length == sum(alignment[bs].ne(1)) + 1 + src_st = 0 + for i in range(tgt_length): + tok = tgt_tokens[bs][i] + src_ed = (alignment[bs][i] * src_lengths[bs]).int().item() + ext_alignment[bs][src_st:src_ed].fill_(tok) + src_st = src_ed + return ext_alignment + + def forward( + self, + prev_output_tokens, + encoder_out, + incremental_state=None, + mode="speech", + alignment=None, + **kwargs, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used 
for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + sup_speech_ctc: + dictionary{"logits": logits, "padding_mask": padding_mask} + sup_speech_ali and unsup_speech: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + emb_weight = self.output_embedding.weight + if ( + mode == "unsup_speech" and self.no_emb_update_unsup + ): # no gradient for embedding here + emb_weight = emb_weight.detach() + enc_out = ( + encoder_out["encoder_out"][0] + if self.out_proj is None + else self.out_proj(encoder_out["encoder_out"][0]) + ) + logits = F.linear(enc_out, emb_weight, None).transpose(0, 1) # B X T X C + others = None + if mode in ( + "speech", + "sup_speech_ctc", + ): # speech data with label, do forcealignment + if len(encoder_out["encoder_padding_mask"]) > 0: + padding_mask = encoder_out["encoder_padding_mask"][0] + logits = logits.masked_fill(padding_mask, float("-inf")) + else: + seq_len, bsz = encoder_out["encoder_out"][0].size()[:2] + padding_mask = torch.zeros( + bsz, seq_len, device=encoder_out["encoder_out"][0].device + ).bool() + return {"x": logits, "padding_mask": padding_mask} + elif mode == "sup_speech_ali": + src_lengths = None + if len(encoder_out["encoder_padding_mask"]) > 0: + src_lengths = (1 - encoder_out["encoder_padding_mask"][0].long()).sum( + -1 + ) + else: + seq_len, bsz = encoder_out["encoder_out"][0].size()[:2] + src_lengths = ( + torch.ones(bsz, device=encoder_out["encoder_out"][0].device).long() + * seq_len + ) + assert alignment is not None + alignment = self.extend_alignment( + alignment, src_lengths, prev_output_tokens + ) + others = {"pseudo_target_tokens": alignment} + elif mode == "unsup_speech": + enc_out_ori = ( + encoder_out["encoder_unmasked_out"][0] + if self.out_proj is None + else self.out_proj(encoder_out["encoder_unmasked_out"][0]) + ) + logits_ori = F.linear(enc_out_ori, emb_weight, None).transpose(0, 1) + if len(encoder_out["encoder_padding_mask"]) > 0: + encoder_padding_mask = encoder_out["encoder_padding_mask"][0] + logits_ori = logits_ori.masked_fill(encoder_padding_mask, float("-inf")) + pseudo_labels = utils.log_softmax(logits_ori, dim=-1) + others = { + "pseudo_target_logprobs": pseudo_labels, + "padding_mask": encoder_out["encoder_padding_mask"], # B X T + "mask_indices": encoder_out[ + "mask_indices" + ], # True for masked frames B X T + } + return logits, others + + def get_normalized_probs( + self, + net_output: Dict[str, Tensor], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + return self.get_normalized_probs_scriptable( + (net_output["x"], None), log_probs, sample + ) + + +class SpeechTextPreTrainDecoder(MultiInputDecoder): + def __init__(self, dictionary, speech_decoder, text_decoder): + super().__init__(dictionary) + self.speech_decoder = speech_decoder + self.text_decoder = text_decoder + + def select_decoder(self, mode, **kwargs): + if mode == "unsup_speech": + kwargs["mode"] = mode + return self.speech_decoder, kwargs + if mode in ("text", "bitext"): + return self.text_decoder, kwargs + if mode in ("speech", "sup_speech_ctc", "sup_speech_ali"): + kwargs["mode"] = mode + return self.speech_decoder, kwargs + if mode in ("speech", 
"sup_speech_s2s"): + if "alignment" in kwargs: + del kwargs["alignment"] + return self.text_decoder, kwargs + + raise NotImplementedError(f"{mode} is not supported") + return None, kwargs + + def get_normalized_probs( + self, + net_output, + log_probs, + sample=None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" + if isinstance(net_output, dict): + return self.speech_decoder.get_normalized_probs( + net_output, log_probs, sample + ) + return self.text_decoder.get_normalized_probs(net_output, log_probs, sample) + + @classmethod + def build_text_decoder(cls, args, tgt_dictionary, dec_emb_share=None): + dec_emb = ( + nn.Embedding( + len(tgt_dictionary), args.decoder_embed_dim, tgt_dictionary.pad() + ) + if dec_emb_share is None + else dec_emb_share + ) + text_decoder = TransformerDecoder(args, tgt_dictionary, dec_emb) + return text_decoder + + @classmethod + def build_dummy_speech_decoder(cls, args, dictionary, dec_emb_share=None): + dec_emb = ( + nn.Embedding(len(dictionary), args.decoder_embed_dim, dictionary.pad()) + if dec_emb_share is None + else dec_emb_share + ) + speech_decoder = SpeechDummyDecoder( + dictionary, + dec_emb, + no_emb_update_unsup=getattr(args, "no_emb_update_unsup", False), + use_output_proj=getattr(args, "use_decoder_output_proj", False), + ) + return speech_decoder + + @classmethod + def build_decoder( + cls, args, text_dictionary, speech_dictionary, speech_output_embedding + ): + text_decoder = cls.build_text_decoder(args, text_dictionary) + speech_decoder = cls.build_dummy_speech_decoder( + args, speech_dictionary, speech_output_embedding + ) + if getattr(args, "load_pretrained_mbart_decoder_from", None): + text_decoder = checkpoint_utils.load_pretrained_component_from_model( + component=text_decoder, + checkpoint=args.load_pretrained_mbart_decoder_from, + ) + return SpeechTextPreTrainDecoder(text_dictionary, speech_decoder, text_decoder) + + +@register_model("speech_text_pretrain_bart") +class SpeechTextPreTrainModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + self.num_updates = 0 + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, src_lang_ids=None, **kwargs + ): + if src_lang_ids is not None: + encoder_out = self.encoder( + src_tokens, src_lengths=src_lengths, src_lang_ids=src_lang_ids, **kwargs + ) + else: + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + decoder_out = self.decoder( + prev_output_tokens, encoder_out=encoder_out, **kwargs + ) + return decoder_out + + def max_positions(self): + return None # it is provided in task + + def get_targets(self, sample, net_output): + mode = sample["net_input"]["mode"] + if mode == "unsup_speech": + return {"target_logprobs": net_output[1]["pseudo_target_logprobs"]} + if mode == "sup_speech_ali": + return net_output[1]["pseudo_target_tokens"] + return sample["target"] + + def get_normalized_probs( + self, + net_output, + log_probs, + sample=None, + ): + # net_output['encoder_out'] is a (B, T, D) tensor + lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample) + lprobs.batch_first = True + return lprobs + + @staticmethod + def add_args(parser): + TransformerModel.add_args(parser) + SpeechWavTransformerEncoder.add_args(parser) + parser.add_argument( + "--speech-sup-mask-prob", + type=float, + help="probability of replacing a token with mask (sup-speech)", + ) + parser.add_argument( + "--speech-unsup-mask-prob", + type=float, + help="probability of replacing a 
token with mask (unsup-speech)", + ) + parser.add_argument( + "--load-pretrained-mbart-encoder-from", + type=str, + metavar="STR", + help="model to take text encoder weights from (for initialization)", + ) + + parser.add_argument( + "--load-pretrained-mbart-decoder-from", + type=str, + metavar="STR", + help="model to take text decoder weights from (for initialization)", + ) + + parser.add_argument( + "--load-pretrained-feature-extractor-from", + type=str, + metavar="STR", + help="model to take feature extractor weights from (for initialization)", + ) + + parser.add_argument( + "--speech-unsup-dropout", + type=float, + default=0, + help="dropout for unsupervised speech encoder", + ) + + parser.add_argument( + "--speech-unsup-feature-dropout", + type=float, + default=0, + help="dropout for unsupervised speech feature encoder", + ) + + parser.add_argument( + "--encoder-shared-text-layers-from-begin", + type=int, + help="number of text encoder layers shared with speech encoder (from first layer)", + ) + + parser.add_argument( + "--stacked-encoder", + default="none", + choices=["none", "s2s", "all"], + help="stack speech and text encoders", + ) + + parser.add_argument("--use-decoder-output-proj", action="store_true") + + @classmethod + def build_model(cls, args, task): + encoder = SpeechTextPreTrainEncoder.build_encoder(args, task.src_dict) + decoder = SpeechTextPreTrainDecoder.build_decoder( + args, task.tgt_dict, task.src_dict, encoder.text_encoder.embed_tokens + ) + model = SpeechTextPreTrainModel(encoder, decoder) + return model + + def upgrade_state_dict(self, state_dict): + """Upgrade old state dicts to work with newer code.""" + if "decoder.speech_decoder.output_projection.weight" in state_dict: + del state_dict["decoder.speech_decoder.output_projection.weight"] + self.upgrade_state_dict_named(state_dict, "") + + +@register_model_architecture( + "speech_text_pretrain_bart", "speech_text_pretrain_bart_base" +) +def speech_text_pretrain_bart_base(args): + # speech masking + args.dropout_input = getattr(args, "dropout_input", 0) + args.dropout_features = getattr(args, "dropout_features", 0) + args.speech_mask_length = getattr(args, "speech_mask_length", 10) + args.speech_mask_prob = getattr(args, "speech_mask_prob", 0.65) + args.speech_sup_mask_prob = getattr(args, "speech_sup_mask_prob", 0.3) + args.speech_unsup_mask_prob = getattr( + args, "speech_unsup_mask_prob", args.speech_mask_prob + ) + args.speech_mask_selection = getattr(args, "speech_mask_selection", "static") + args.speech_mask_other = getattr(args, "speech_mask_other", 0) + args.speech_mask_min_space = getattr(args, "speech_mask_min_space", 1) + args.speech_no_mask_overlap = getattr(args, "speech_no_mask_overlap", False) + + args.speech_mask_channel_length = getattr(args, "speech_mask_channel_length", 10) + args.speech_mask_channel_prob = getattr(args, "speech_mask_channel_prob", 0.0) + args.speech_mask_channel_selection = getattr( + args, "speech_mask_channel_selection", "static" + ) + args.speech_mask_channel_other = getattr(args, "speech_mask_channel_other", 0) + args.speech_mask_channel_min_space = getattr( + args, "speech_mask_channel_min_space", 1 + ) + args.speech_no_mask_channel_overlap = getattr( + args, "speech_no_mask_channel_overlap", False + ) + args.no_scale_feature = getattr(args, "", False) + args.feature_grad_mult = getattr(args, "feature_grad_mult", 1.0) # 0.1 + + # Transformer + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) + args.encoder_ffn_embed_dim = getattr( + args, 
"encoder_ffn_embed_dim", args.encoder_embed_dim * 4 + ) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.speech_conv_bias = getattr(args, "speech_conv_bias", False) + + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_attention_heads = getattr( + args, "decoder_attention_heads", args.encoder_attention_heads + ) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", args.dropout) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") # gelu? + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + + args.speech_unsup_dropout = getattr(args, "speech_unsup_dropout", 0) + args.speech_unsup_feature_dropout = getattr(args, "speech_unsup_feature_dropout", 0) + + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 6 + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + + args.no_emb_update_unsup = getattr(args, "no_emb_update_unsup", False) + + +@register_model_architecture( + "speech_text_pretrain_bart", "speech_text_pretrain_bart_base_stack" +) +def speech_text_pretrain_bart_base_stack(args): + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 6) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 0 + ) + args.stacked_encoder = getattr(args, "stacked_encoder", "all") + args.layernorm_embedding = getattr(args, "layernorm_embedding", True) + speech_text_pretrain_bart_base(args) + + +@register_model_architecture( + "speech_text_pretrain_bart", "speech_text_pretrain_bart_large" +) +def speech_text_pretrain_bart_large(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 24) + args.text_encoder_layers = 
getattr(args, "text_encoder_layers", 12) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 12 + ) + args.decoder_layers = getattr(args, "decoder_layers", 12) + args.dropout = getattr(args, "dropout", 0.3) + speech_text_pretrain_bart_base(args) + + +@register_model_architecture( + "speech_text_pretrain_bart", "speech_text_pretrain_bart_large_stack" +) +def speech_text_pretrain_bart_large_stack(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 6) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 12) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 0 + ) + args.decoder_layers = getattr(args, "decoder_layers", 12) + args.stacked_encoder = getattr(args, "stacked_encoder", "s2s") + args.layernorm_embedding = getattr(args, "layernorm_embedding", True) + speech_text_pretrain_bart_base(args) diff --git a/examples/speech_text_joint_to_text/models/s2t_dualinputtransformer.py b/examples/speech_text_joint_to_text/models/s2t_dualinputtransformer.py new file mode 100644 index 0000000000..c4ec41bda1 --- /dev/null +++ b/examples/speech_text_joint_to_text/models/s2t_dualinputtransformer.py @@ -0,0 +1,1093 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from collections import namedtuple + +import torch +import torch.nn as nn +from fairseq import checkpoint_utils +from fairseq import utils +from fairseq.models import ( + FairseqEncoder, + FairseqDecoder, + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.fairseq_encoder import EncoderOut +from fairseq.models.speech_to_text import ( + TransformerDecoder, + S2TTransformerEncoder, +) +from fairseq.models.transformer import TransformerEncoder +from fairseq.modules import ( + TransformerEncoderLayer, + GradMultiply, + LayerNorm, +) + +logger = logging.getLogger(__name__) + + +class SpeechEoSEncoder(FairseqEncoder): + def __init__(self, encoder, eos_num, feat_dim, adapter_type="None", adapter_dim=0): + super().__init__(None) + self.encoder = encoder + self.eos_num = eos_num # downsampling rate for speech input feature + self.eos_emb = ( + nn.Parameter(torch.zeros(1, feat_dim), requires_grad=True) + if eos_num > 0 + else None + ) + self.adapter = self.add_adapter(adapter_type, adapter_dim) + + def add_adapter(self, adapter_type, adapter_dim): + def _make_identity(linear, eps=1e-5): + assert isinstance(linear, nn.Linear) + linear.weight.data.mul_(eps) + linear.weight.data.fill_diagonal_(1.0) + if linear.bias is not None: + linear.bias.data.mul_(eps) + + adapter = None + if adapter_type == "Linear": + assert adapter_dim > 0 + adapter = nn.Sequential( + nn.Linear(adapter_dim, adapter_dim), LayerNorm(adapter_dim) + ) + # initialize the adapter as identity matrix first + _make_identity(adapter[0]) + + elif adapter_type == "MLP": + assert adapter_dim > 0 + # assume the model is pre-norm model + adapter = nn.Sequential( + nn.Linear(adapter_dim, 2 * adapter_dim), + nn.ReLU(), + nn.Linear(2 * adapter_dim, adapter_dim), + LayerNorm(adapter_dim), + ) + _make_identity(adapter[0]) + _make_identity(adapter[2]) + return adapter + + def add_eos(self, src_tokens, src_lengths): + bsz, 
max_seq_len, fdim = src_tokens.size() + if self.eos_num > 0: + src_token_eos = torch.zeros( + [bsz, max_seq_len + self.eos_num, fdim], + dtype=src_tokens.dtype, + device=src_tokens.device, + ) + src_token_eos[:, :max_seq_len] = src_tokens + for bi in range(bsz): + src_token_eos[bi][ + src_lengths[bi] : src_lengths[bi] + self.eos_num + ] = self.eos_emb.expand(self.eos_num, fdim) + src_lengths = src_lengths + self.eos_num + src_tokens = src_token_eos + return src_tokens, src_lengths + + def apply_adapter(self, enc_out): + if self.adapter is None: + return enc_out + rst = self.adapter(enc_out.encoder_out) + if enc_out.encoder_padding_mask is not None: + rst.masked_fill_( + enc_out.encoder_padding_mask.transpose(0, 1).unsqueeze(-1), 0 + ) + return EncoderOut( + encoder_out=rst, + encoder_padding_mask=enc_out.encoder_padding_mask, + encoder_embedding=enc_out.encoder_embedding, + encoder_states=enc_out.encoder_states, + src_tokens=enc_out.src_tokens, + src_lengths=enc_out.src_lengths, + ) + + def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs): + """ + src_tokens: padded tensor (B, T, C * feat) + src_lengths: tensor of original lengths of input utterances (B,) + """ + src_tokens, src_lengths = self.add_eos(src_tokens, src_lengths) + enc_out = self.encoder(src_tokens, src_lengths, return_all_hiddens) + enc_out = self.apply_adapter(enc_out) + return enc_out + + def reorder_encoder_out(self, encoder_out, new_order): + return self.encoder.reorder_encoder_out(encoder_out, new_order) + + +class DualInputEncoder(FairseqEncoder): + def __init__( + self, + args, + spch_encoder, + text_encoder, + dictionary, + cross_attentive_loss_before_last_layer=-1, + ): + super().__init__(dictionary) + + self.spch_encoder = spch_encoder + self.text_encoder = text_encoder + self.enc_grad_mult = args.enc_grad_mult + self.cross_attentive_loss_before_last_layer = ( + cross_attentive_loss_before_last_layer + ) + self.use_cross_attentive_loss = ( + False if cross_attentive_loss_before_last_layer <= -1 else True + ) + self.enc2_along_grad_mult = args.enc2_along_grad_mult + + @classmethod + def set_shared_layer(cls, share_level, src_layer, tgt_layer): + """ + share parameters from tgt_layer to src_layer + share_level: + 0: share everything + 1: share everything but different model + 2: share weight but not bias, layernorm + """ + if share_level == 0: + return tgt_layer + if isinstance(src_layer, nn.Linear): + return tgt_layer + if isinstance(src_layer, TransformerEncoderLayer): + assert src_layer.embed_dim == tgt_layer.embed_dim + assert src_layer.normalize_before == tgt_layer.normalize_before + if share_level == 1: + src_layer.fc1 = tgt_layer.fc1 + src_layer.fc2 = tgt_layer.fc2 + src_layer.self_attn = tgt_layer.self_attn + src_layer.final_layer_norm = tgt_layer.final_layer_norm + src_layer.self_attn_layer_norm = tgt_layer.self_attn_layer_norm + src_layer.layernorm_embedding = tgt_layer.layernorm_embedding + else: + src_layer.fc1.weight = tgt_layer.fc1.weight + src_layer.fc2.weight = tgt_layer.fc2.weight + src_layer.self_attn.k_proj.weight = tgt_layer.self_attn.k_proj.weight + src_layer.self_attn.v_proj.weight = tgt_layer.self_attn.v_proj.weight + src_layer.self_attn.q_proj.weight = tgt_layer.self_attn.q_proj.weight + src_layer.self_attn.out_proj.weight = ( + tgt_layer.self_attn.out_proj.weight + ) + else: + if share_level == 1: + return tgt_layer + return src_layer + + @classmethod + def build_spch_encoder(cls, args): + cfg = { + "input_feat_per_channel": args.input_feat_per_channel, + 
"input_channels": args.input_channels, + "conv_kernel_sizes": args.conv_kernel_sizes, + "conv_channels": args.conv_channels, + "encoder_embed_dim": args.encoder_embed_dim, + "encoder_ffn_embed_dim": args.encoder_ffn_embed_dim, + "encoder_layers": args.speech_encoder_layers, + "encoder_layerdrop": args.encoder_layerdrop, + "encoder_attention_heads": args.encoder_attention_heads, + "max_source_positions": args.max_source_positions, + "dropout": args.dropout, + "encoder_normalize_before": args.encoder_normalize_before, + "activation_dropout": args.activation_dropout, + "attention_dropout": args.attention_dropout, + "activation_fn": args.activation_fn, + "layernorm_embedding": args.layernorm_embedding, + "no_token_positional_embeddings": args.no_token_positional_embeddings, + "no_scale_embedding": args.no_scale_embedding, + "quant_noise_pq": args.quant_noise_pq, + "encoder_freezing_updates": 0, + } + model_args = namedtuple("args", cfg.keys())(*cfg.values()) + spch_encoder = S2TTransformerEncoder(model_args) + if args.add_speech_eos: + spch_encoder = SpeechEoSEncoder( + spch_encoder, + 2 * len(args.conv_kernel_sizes.split(",")), + args.input_feat_per_channel, + adapter_type=getattr(args, "speech_encoder_adapter_type", "None"), + adapter_dim=args.encoder_embed_dim, + ) + return spch_encoder + + @classmethod + def build_text_encoder(cls, args, src_dictionary, spch_encoder): + if args.encoder_shared_layers > 0: + mx_shared_layers = ( + args.speech_encoder_layers + if args.speech_encoder_layers < args.text_encoder_layers + else args.text_encoder_layers + ) + args.encoder_shared_layers = ( + args.encoder_shared_layers + if args.encoder_shared_layers <= mx_shared_layers + else mx_shared_layers + ) + cfg = { + "encoder_embed_dim": args.encoder_text_embed_dim, + "encoder_ffn_embed_dim": args.encoder_ffn_embed_dim, + "encoder_layers": args.text_encoder_layers, + "encoder_layerdrop": args.encoder_layerdrop, + "encoder_attention_heads": args.encoder_attention_heads, + "encoder_learned_pos": args.encoder_learned_pos, + "max_source_positions": args.max_source_positions, + "dropout": args.dropout, + "encoder_normalize_before": args.encoder_normalize_before, + "activation_dropout": args.activation_dropout, + "attention_dropout": args.attention_dropout, + "activation_fn": args.activation_fn, + "adaptive_input": args.adaptive_input, + "no_token_positional_embeddings": args.no_token_positional_embeddings, + "no_scale_embedding": args.no_scale_embedding, + "quant_noise_pq": args.quant_noise_pq, + } + model_args = namedtuple("args", cfg.keys())(*cfg.values()) + enc_emb = nn.Embedding( + len(src_dictionary), model_args.encoder_embed_dim, src_dictionary.pad() + ) + text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb) + if args.add_speech_eos: + spch_encoder = spch_encoder.encoder + if args.encoder_shared_layers > 0: + text_encoder.layer_norm = cls.set_shared_layer( + args.encoder_shared_layer_level, + text_encoder.layer_norm, + spch_encoder.layer_norm, + ) + for i, ly in enumerate( + spch_encoder.transformer_layers[-args.encoder_shared_layers :] + ): + ly_id = i + args.text_encoder_layers - args.encoder_shared_layers + if not isinstance(text_encoder.layers[ly_id], type(ly)): + if text_encoder.layers[ly_id]._get_name() not in ('TransformerEncoderLayerBase', 'TransformerEncoderLayer'): + raise ValueError("The shared layers are expected from the same class") + text_encoder.layers[ly_id] = cls.set_shared_layer( + args.encoder_shared_layer_level, + text_encoder.layers[ly_id], + ly, + ) + return 
text_encoder + + def mult_rst_grad(self, rst, ratio): + assert isinstance(rst, dict) # instead of EncoderOut + assert len(rst["encoder_out"]) == 1 + rst["encoder_out"][0] = GradMultiply.apply(rst["encoder_out"][0], ratio) + return rst + + def process_attentive_loss_states(self, rst, interstates): + assert isinstance(rst, dict) # instead of EncoderOut + rst["encoder_states"] = interstates + return rst + + def forward( + self, + src_tokens, + src_lengths=None, + src_txt_tokens=None, + src_txt_lengths=None, + **kwargs + ): + """ + Args: + src_tokens: padded tensor (B, T, C * feat) + src_lengths: tensor of original lengths of input utterances (speech) (B,) + src_txt_tokens: padded tensor (B, T) + src_txt_lengths: tensor of original lengths of input utterances (text) (B,) + """ + # src_tokens only: inference + # src_tokens, src_lengths: speech only training + # src_txt_tokens, src_txt_lengths: text only training + # all valid: speech + text training + + if src_tokens is None and src_txt_tokens is None: + raise ValueError( + "src_tokens and src_txt_tokens cannot be None at the same time" + ) + ret1 = None + ret2 = None + return_all_hiddens = False + if src_tokens is not None: + if ( + self.use_cross_attentive_loss and src_txt_tokens is not None + ): # remove self.training so we can get attn score during validation step + return_all_hiddens = True + ret1 = self.spch_encoder( + src_tokens, src_lengths, return_all_hiddens=return_all_hiddens + ) + + if self.use_cross_attentive_loss and src_txt_tokens is not None: + assert self.cross_attentive_loss_before_last_layer < len( + ret1["encoder_states"] + ) + ret1 = self.process_attentive_loss_states( + ret1, + ret1["encoder_states"][ + -self.cross_attentive_loss_before_last_layer - 1 + ], + ) + + if src_txt_tokens is not None: + ret2 = self.text_encoder( + src_txt_tokens, src_txt_lengths, return_all_hiddens=return_all_hiddens + ) + if return_all_hiddens: + if self.cross_attentive_loss_before_last_layer == len( + self.text_encoder.layers + ): + text_embedding, _ = self.text_encoder.forward_embedding( + src_txt_tokens + ) + text_embedding = text_embedding.transpose(0, 1) + ret2 = self.process_attentive_loss_states(ret2, text_embedding) + else: + assert self.cross_attentive_loss_before_last_layer < len( + self.text_encoder.layers + ) + ret2 = self.process_attentive_loss_states( + ret2, + ret2["encoder_states"][ + -self.cross_attentive_loss_before_last_layer - 1 + ], + ) + + def merge_output(rst1, rst2): + if rst1 is None: + if not (self.enc2_along_grad_mult == 1.0 or self.training): + rst2 = self.mult_rst_grad(rst2, self.enc2_along_grad_mult) + return rst2 + if rst2 is None: + return rst1 + if self.enc_grad_mult != 1.0 and self.training: + rst1 = self.mult_rst_grad(rst1, self.enc_grad_mult) + rst2 = self.mult_rst_grad(rst2, self.enc_grad_mult) + rst = (rst1, rst2) + return rst + + return merge_output(ret1, ret2) + + def reorder_encoder_out(self, encoder_out, new_order): + assert self.training is False # used for inference only + return self.spch_encoder.reorder_encoder_out(encoder_out, new_order) + + +# TransformerMultiInputDecoder: take one or two encoder inputs +class TransformerMultiInputDecoder(FairseqDecoder): + def __init__( + self, + dictionary, + spch_decoder, + text_decoder, + compute_cross_attentive_loss=False, + cross_attentive_loss_with_norm=True, + cross_attentive_loss_reverse=False, + ): + + super().__init__(dictionary) + self.spch_decoder = spch_decoder + self.text_decoder = text_decoder + self.compute_cross_attentive_loss = 
compute_cross_attentive_loss + self.cross_attentive_loss_with_norm = cross_attentive_loss_with_norm + self.cross_attentive_loss_reverse = cross_attentive_loss_reverse + + @classmethod + def share_spchdecoder(cls, task_args, text_decoder, spch_decoder): + if task_args.decoder_shared_layer_level == 0: + return text_decoder + assert text_decoder.embed_tokens == spch_decoder.embed_tokens + spch_decoder.project_in_dim = text_decoder.project_in_dim + spch_decoder.embed_positions = text_decoder.embed_positions + spch_decoder.layernorm_embedding = text_decoder.layernorm_embedding + spch_decoder.project_out_dim = text_decoder.project_out_dim + spch_decoder.adaptive_softmax = text_decoder.adaptive_softmax + if task_args.decoder_shared_layer_level == 1: + spch_decoder.output_projection = text_decoder.output_projection + spch_decoder.layer_norm = text_decoder.layer_norm + else: # 2 + spch_decoder.output_projection.weight = ( + text_decoder.output_projection.weight + ) + for i, ly in enumerate(text_decoder.layers): + sly = spch_decoder.layers[i] + sly.self_attn = ly.self_attn + sly.self_attn_layer_norm = ly.self_attn_layer_norm + # sly.encoder_attn = ly.encoder_attn + if ( + task_args.decoder_shared_layer_level == 1 + ): # share everything, but under different models + sly.encoder_attn = ly.encoder_attn + sly.encoder_attn_layer_norm = ly.encoder_attn_layer_norm + sly.fc1 = ly.fc1 + sly.fc2 = ly.fc2 + sly.final_layer_norm = ly.final_layer_norm + else: # task_args.decoder_shared_layer_level == 2: #separated encoder_attn_layer_norm and bias + sly.encoder_attn.k_proj.weight = ly.encoder_attn.k_proj.weight + sly.encoder_attn.v_proj.weight = ly.encoder_attn.v_proj.weight + sly.encoder_attn.q_proj.weight = ly.encoder_attn.q_proj.weight + sly.encoder_attn.out_proj.weight = ly.encoder_attn.out_proj.weight + sly.fc1.weight = ly.fc1.weight + sly.fc2.weight = ly.fc2.weight + + return spch_decoder + + def cross_attentive_loss( + self, teacher_states, student_states, teacher_masking, student_masking, eps=1e-6 + ): + x = teacher_states.transpose(0, 1) # from T X B X D to B X T X D + y = student_states.transpose(0, 1) + if self.cross_attentive_loss_with_norm: + x = x / (x.norm(dim=2, keepdim=True) + eps) + y = y / (y.norm(dim=2, keepdim=True) + eps) + dim = x.size(-1) + # lengths: batch X seqLen + sim_scores_xy = torch.bmm(x, y.transpose(1, 2)) # batch X lenx X leny ] + if y.dtype == torch.float16: + sim_scores_xy = sim_scores_xy.float() + y = y.float() + x = x.float() + if teacher_masking != []: + assert len(teacher_masking) == 1 + sim_scores_xy = sim_scores_xy.masked_fill( + teacher_masking[0].unsqueeze(-1), float("-inf") + ) + if student_masking != []: + sim_scores_xy = sim_scores_xy.masked_fill( + student_masking[0].unsqueeze(1), float("-inf") + ) + # do masking + y_weights = utils.softmax(sim_scores_xy, dim=-1) + if teacher_masking != []: + y_weights = y_weights.masked_fill(teacher_masking[0].unsqueeze(-1), 0) + x_reconstruct_from_y = torch.bmm(y_weights, y) + + sim_scores_xx = torch.bmm(x, x.transpose(1, 2)) # batch X lenx X lenx ] + x_weights = utils.softmax(sim_scores_xx, dim=-1) + if teacher_masking != []: + x_weights = x_weights.masked_fill(teacher_masking[0].unsqueeze(-1), 0) + + # no gradient for teacher state + x_reconstruct_from_x = torch.bmm(x_weights, x).detach() + cost = (x_reconstruct_from_x - x_reconstruct_from_y).norm(dim=2) + if teacher_masking != []: + cost = cost.masked_fill(teacher_masking[0], 0) + + if not self.cross_attentive_loss_with_norm: + cost = cost / dim + return cost + + def forward( 
+        self,
+        prev_output_tokens,
+        encoder_out,
+        incremental_state=None,
+        has_txt_input=False,
+        **kwargs
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for input feeding/teacher forcing. If there are
+                two or more inputs during training, they share the same prev_output_tokens.
+            encoder_out (tuple[Tensor]): output from the encoder, used for
+                encoder-side attention. It is a tuple if there are multiple inputs
+                and a single tensor if there is only one input.
+            incremental_state ([dict]): dictionary used for storing state during
+                :ref:`Incremental decoding`. It is only valid for inference with a
+                single input.
+        Returns:
+            tuple:
+                - the last decoder layer's output of shape `(batch, tgt_len,
+                  vocab)`. If there are N inputs, the batch dimension is N times
+                  larger than for a single input
+                - the last decoder layer's attention weights of shape `(batch,
+                  tgt_len, src_len)`
+        """
+        assert not isinstance(encoder_out, EncoderOut)
+        if isinstance(encoder_out, tuple):  # training with multiple inputs
+            rst = []
+            assert len(encoder_out) == 2
+            for i, eo in enumerate(encoder_out):
+                assert incremental_state is None
+                if i == 0:
+                    rst.append(
+                        self.spch_decoder(prev_output_tokens, eo, incremental_state)
+                    )
+                else:
+                    rst.append(
+                        self.text_decoder(prev_output_tokens, eo, incremental_state)
+                    )
+            dec_out = torch.cat([r[0] for r in rst], dim=0)
+            attn_cost = None
+            if self.compute_cross_attentive_loss:
+                assert isinstance(encoder_out[0], dict)
+                if self.cross_attentive_loss_reverse:
+                    attn_cost = self.cross_attentive_loss(
+                        teacher_states=encoder_out[1]["encoder_states"],  # text_states
+                        student_states=encoder_out[0]["encoder_states"],  # spch_states
+                        teacher_masking=encoder_out[1]["encoder_padding_mask"],
+                        student_masking=encoder_out[0]["encoder_padding_mask"],
+                    )
+                else:
+                    attn_cost = self.cross_attentive_loss(
+                        teacher_states=encoder_out[0]["encoder_states"],  # spch_states
+                        student_states=encoder_out[1]["encoder_states"],  # text_states
+                        teacher_masking=encoder_out[0]["encoder_padding_mask"],
+                        student_masking=encoder_out[1]["encoder_padding_mask"],
+                    )
+
+            return (dec_out, {"attn_cost": attn_cost})
+        else:  # inference or training with one input
+            if has_txt_input:
+                return self.text_decoder(
+                    prev_output_tokens, encoder_out, incremental_state
+                )
+            return self.spch_decoder(prev_output_tokens, encoder_out, incremental_state)
+
+
+# Note:
+# dual input transformer:
+# encoder: S2TTransformerEncoder for speech + TransformerEncoder for text
+# decoder: TransformerDecoder for text
+@register_model("dual_input_s2t_transformer")
+class DualInputS2TTransformerModel(FairseqEncoderDecoderModel):
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+        self.num_updates = 0
+
+    def max_positions(self):
+        return None  # it is provided in the task
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # encoder 1: S2TTransformerEncoder for speech
+        parser.add_argument(
+            "--conv-kernel-sizes",
+            type=str,
+            metavar="N",
+            help="kernel sizes of Conv1d subsampling layers",
+        )
+        parser.add_argument(
+            "--conv-channels",
+            type=int,
+            metavar="N",
+            help="# of channels in Conv1d subsampling layers",
+        )
+        parser.add_argument(
+            "--enc-output-dim",
+            type=int,
+            metavar="N",
+            help="""
+                encoder output dimension, can be None.
If specified, projecting the + transformer output to the specified dimension""", + ) + # standard Transformer + parser.add_argument( + "--activation-fn", + type=str, + default="relu", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-text-embed-dim", + type=int, + metavar="N", + help="encoder text embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--layernorm-embedding", + action="store_true", + help="add layernorm to embedding", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + # non-standard transformer parameters + parser.add_argument( + "--speech-encoder-layers", + type=int, + metavar="N", + help="num speech encoder layers", + ) + parser.add_argument( + "--text-encoder-layers", + type=int, + metavar="N", + help="num text encoder layers", + ) + parser.add_argument( + "--encoder-shared-layers", + type=int, + metavar="N", + help="num shared encoder layers", + ) + parser.add_argument( + "--encoder-shared-layer-level", + type=int, + metavar="N", + default=0, + choices=[0, 1, 2], + help="share layer level 0: all share 1: all share with separate model 2: share weight but not bias and layernorm", + ) + + parser.add_argument( + "--decoder-shared-layer-level", + default=0, + choices=[0, 1, 2], + type=int, + metavar="N", + help="0: share everything; 1: share everything with different model 2: no share layer_norm and bias", + ) + ### + parser.add_argument( + "--text-input-cost-ratio", + type=float, + default=1.0, + metavar="V", + help="text input cost ratio relative to speech input cost", + ) + parser.add_argument( + "--init-scale", + type=float, + default=1.0, + metavar="V", + help="scale the initial weight by given factor", + ) + parser.add_argument( + "--enc-grad-mult", + type=float, + metavar="V", + default=1.0, + help="multiply enc1 and enc2 gradient by V", + ) + parser.add_argument( + "--enc2-along-grad-mult", + type=float, + metavar="V", + default=1.0, + help="multiply enc2 gradient by V if only enc2 is used", + ) + parser.add_argument( + "--load-pretrain-encoder", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained encoder """, + ) + parser.add_argument( + "--load-pretrain-speech-encoder", + type=str, + default="", + 
metavar="EXPR", + help=""" path to the pretrained speech encoder """, + ) + parser.add_argument( + "--load-pretrain-text-encoder", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained text encoder """, + ) + parser.add_argument( + "--load-pretrain-text-encoder-last", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained text encoder """, + ) + parser.add_argument( + "--load-pretrain-decoder", + type=str, + metavar="EXPR", + default="", + help=""" path to the pretrained encoder """, + ) + parser.add_argument( + "--add-speech-eos", + action="store_true", + help="add eos token at the end of input feature", + ) + parser.add_argument( + "--speech-encoder-adapter-type", + type=str, + metavar="EXPR", + default="None", + choices=["None", "Linear", "MLP"], + help="add speech encoder adapter", + ) + + @classmethod + def build_encoder(cls, args, task): + spch_encoder = DualInputEncoder.build_spch_encoder(args) + text_encoder = DualInputEncoder.build_text_encoder( + args, task.src_dict, spch_encoder + ) + cross_attentive_loss_before_last_layer = ( + 0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1 + ) + encoder = DualInputEncoder( + args, + spch_encoder, + text_encoder, + task.src_dict, + cross_attentive_loss_before_last_layer, + ) + if args.init_scale != 1.0: + with torch.no_grad(): + for param in encoder.parameters(): + param.data.mul_(args.init_scale) + if args.load_pretrain_text_encoder != "": + checkpoint_utils.load_pretrained_component_from_model( + text_encoder, args.load_pretrain_text_encoder + ) + if args.load_pretrain_speech_encoder != "": + if hasattr(spch_encoder, "encoder"): + checkpoint_utils.load_pretrained_component_from_model( + spch_encoder.encoder, args.load_pretrain_speech_encoder + ) + else: + checkpoint_utils.load_pretrained_component_from_model( + spch_encoder, args.load_pretrain_speech_encoder + ) + if ( + args.load_pretrain_text_encoder_last != "" + ): # if share encoder, speech encoder parameters will be used. 
+ # It provides a chance to use pre-trained mt encoder instead + checkpoint_utils.load_pretrained_component_from_model( + text_encoder, args.load_pretrain_text_encoder_last + ) + + if args.load_pretrain_encoder != "": + checkpoint_utils.load_pretrained_component_from_model( + encoder, args.load_pretrain_encoder + ) + return encoder + + @classmethod + def build_decoder(cls, args, task): + dec_cfg = { + "decoder_layerdrop": args.decoder_layerdrop, + "share_decoder_input_output_embed": args.share_decoder_input_output_embed, + "decoder_embed_dim": args.decoder_embed_dim, + "max_target_positions": args.max_target_positions, + "dropout": args.dropout, + "encoder_learned_pos": args.encoder_learned_pos, + "decoder_learned_pos": args.decoder_learned_pos, + "layernorm_embedding": args.layernorm_embedding, + "decoder_normalize_before": args.decoder_normalize_before, + "activation_dropout": args.activation_dropout, + "attention_dropout": args.attention_dropout, + "decoder_ffn_embed_dim": args.decoder_ffn_embed_dim, + "decoder_layers": args.decoder_layers, + "decoder_attention_heads": args.decoder_attention_heads, + "decoder_output_dim": args.decoder_embed_dim, + "no_scale_embedding": args.no_scale_embedding, + "adaptive_input": args.adaptive_input, + "quant_noise_pq": args.quant_noise_pq, + "adaptive_softmax_cutoff": args.adaptive_softmax_cutoff, + "tie_adaptive_weights": args.tie_adaptive_weights, + "no_token_positional_embeddings": args.no_token_positional_embeddings, + "encoder": {"embed_dim":args.encoder_embed_dim} + } + dec_cfg = namedtuple("args", dec_cfg.keys())(*dec_cfg.values()) + dec_emb = nn.Embedding( + len(task.target_dictionary), + args.decoder_embed_dim, + task.target_dictionary.pad(), + ) + compute_cross_attentive_loss = ( + True if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else False + ) + cross_attentive_loss_without_norm = getattr( + args, "attentive_cost_without_normalize", False + ) + cross_attentive_loss_reverse = ( + False # getattr(args, "attentive_cost_reverse", False) + ) + + text_decoder = TransformerDecoder(dec_cfg, task.target_dictionary, dec_emb) + spch_decoder = TransformerDecoder(dec_cfg, task.target_dictionary, dec_emb) + spch_decoder = TransformerMultiInputDecoder.share_spchdecoder( + args, text_decoder, spch_decoder + ) + decoder = TransformerMultiInputDecoder( + dictionary=task.target_dictionary, + spch_decoder=spch_decoder, + text_decoder=text_decoder, + compute_cross_attentive_loss=compute_cross_attentive_loss, + cross_attentive_loss_with_norm=True + if not cross_attentive_loss_without_norm + else False, + cross_attentive_loss_reverse=cross_attentive_loss_reverse, + ) + if args.init_scale != 1.0: + with torch.no_grad(): + for param in decoder.parameters(): + param.data.mul_(args.init_scale) + if args.load_pretrain_decoder != "": + try: + checkpoint_utils.load_pretrained_component_from_model( + decoder, args.load_pretrain_decoder + ) + except RuntimeError: + checkpoint_utils.load_pretrained_component_from_model( + decoder.text_decoder, args.load_pretrain_decoder + ) + if args.decoder_shared_layer_level > 0: + checkpoint_utils.load_pretrained_component_from_model( + decoder.spch_decoder, args.load_pretrain_decoder + ) + + return decoder + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + # make sure that all args are properly defaulted + # (in case there are any new ones) + dualinputs2ttransformer_base(args) + + encoder = cls.build_encoder(args, task) + decoder = cls.build_decoder(args, task) + return 
cls(encoder, decoder) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + # net_output['encoder_out'] is a (B, T, D) tensor + lprobs = super().get_normalized_probs(net_output, log_probs, sample) + lprobs.batch_first = True + return lprobs + + def set_num_updates(self, num_updates): + """Set the number of parameters updates.""" + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + use_encoder_outputs=False, + src_txt_tokens=None, + src_txt_lengths=None, + mode="sup_speech", + **kwargs + ): + """ + Run the forward pass for an encoder-decoder model. + + First feed a batch of source tokens through the encoder. Then, feed the + encoder output and previous decoder outputs (i.e., teacher forcing) to + the decoder to produce the next outputs:: + + encoder_out = self.encoder(src_tokens, src_lengths) + return self.decoder(prev_output_tokens, encoder_out) + + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + mode = 'sup_speech' or 'text' + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + if mode == "text": + assert src_txt_tokens is None + src_txt_tokens = src_tokens + src_txt_lengths = src_lengths + src_tokens = None + src_lengths = None + encoder_out = self.encoder( + src_tokens, + src_lengths=src_lengths, + src_txt_tokens=src_txt_tokens, + src_txt_lengths=src_txt_lengths, + **kwargs + ) + has_txt_input = True if src_txt_tokens is not None else False + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + has_txt_input=has_txt_input, + **kwargs + ) + if use_encoder_outputs: + return decoder_out, encoder_out + return decoder_out + + +@register_model_architecture( + "dual_input_s2t_transformer", "dualinputs2ttransformer_base" +) +def dualinputs2ttransformer_base(args): + args.encoder_freezing_updates = getattr(args, "encoder_freezing_updates", 0) + # Convolutional subsampler + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5") + args.conv_channels = getattr(args, "conv_channels", 1024) + # Transformer + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_text_embed_dim = getattr( + args, "encoder_text_embed_dim", args.encoder_embed_dim + ) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 
args.dropout) + args.activation_dropout = getattr(args, "activation_dropout", args.dropout) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 10) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.encoder_shared_layers = getattr(args, "encoder_shared_layers", 0) + args.decoder_layers = getattr(args, "decoder_layers", 6) + + args.add_speech_eos = getattr(args, "add_speech_eos", False) + + +@register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_s") +def dualinputs2ttransformer_s(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 4) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 7) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 7) + args.decoder_layers = getattr(args, "decoder_layers", 7) + dualinputs2ttransformer_base(args) + + +@register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_m") +def dualinputs2ttransformer_m(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512 * 4) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.dropout = getattr(args, "dropout", 0.15) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 10) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.decoder_layers = getattr(args, "decoder_layers", 6) + dualinputs2ttransformer_base(args) + + +@register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_b") +def dualinputs2ttransformer_b(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 768 * 4) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12) + args.dropout = getattr(args, "dropout", 0.15) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.decoder_layers = getattr(args, "decoder_layers", 6) + dualinputs2ttransformer_base(args) + + 
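+# Usage note: the "_s", "_m", "_b" and "_l" variants registered in this file
+# only differ in model size (embedding width, FFN width, attention heads,
+# layer counts and dropout). Assuming the standard fairseq-train entry point,
+# a variant is selected with --arch, e.g. "--arch dualinputs2ttransformer_m";
+# options not given on the command line fall back to the getattr defaults in
+# dualinputs2ttransformer_base.
+
+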
+@register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_l") +def dualinputs2ttransformer_l(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024 * 4) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.dropout = getattr(args, "dropout", 0.2) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.decoder_layers = getattr(args, "decoder_layers", 6) + dualinputs2ttransformer_base(args) diff --git a/examples/speech_text_joint_to_text/models/s2t_dualinputwavtransformer.py b/examples/speech_text_joint_to_text/models/s2t_dualinputwavtransformer.py new file mode 100644 index 0000000000..66e4b3f1ec --- /dev/null +++ b/examples/speech_text_joint_to_text/models/s2t_dualinputwavtransformer.py @@ -0,0 +1,526 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from collections import OrderedDict, namedtuple + +import torch.nn as nn + +from fairseq import checkpoint_utils, utils +from fairseq.checkpoint_utils import load_checkpoint_to_cpu +from fairseq.file_io import PathManager +from fairseq.models import register_model, register_model_architecture +from fairseq.models.speech_to_text import ( + SpeechWavTransformerEncoder, + StackedSpeechWavTransformerEncoder, + TransformerDecoder, +) +from fairseq.models.transformer import TransformerEncoder + +from .s2t_dualinputtransformer import ( + DualInputEncoder, + DualInputS2TTransformerModel, + TransformerMultiInputDecoder, +) + +logger = logging.getLogger(__name__) + + +@register_model("dual_input_wav_transformer") +class DualInputWavTransformerModel(DualInputS2TTransformerModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + def add_transformer_args(parser): + # We can't use TransformerModel.add_args(parser), since it defines max-source-positions which is duplicated with tasks/speech_to_text.py + # Transformer + parser.add_argument( + "--activation-fn", + type=str, + default="relu", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="num encoder layers" + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + 
help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--decoder-normalize-before", + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--share-decoder-input-output-embed", + action="store_true", + help="share decoder input and output embeddings", + ) + parser.add_argument( + "--layernorm-embedding", + action="store_true", + help="add layernorm to embedding", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + + parser.add_argument( + "--encoder-learned-pos", + action="store_true", + help="use learned positional embeddings", + ) + parser.add_argument( + "--decoder-learned-pos", + action="store_true", + help="use learned positional embeddings", + ) + + add_transformer_args(parser) + SpeechWavTransformerEncoder.add_args(parser) + parser.add_argument( + "--load-pretrained-speech-text-encoder", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained speech text encoder from SpeechTextPreTrainModel """, + ) + parser.add_argument( + "--load-pretrained-wav2vec-encoder", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained speech text encoder from wav2vec """, + ) + + parser.add_argument( + "--load-pretrained-speech-text-decoder", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained speech text decoder from SpeechTextPreTrainModel """, + ) + parser.add_argument( + "--load-pretrained-text-decoder", + type=str, + default="", + metavar="EXPR", + help=""" path to the pretrained text decoder """, + ) + parser.add_argument( + "--load-init-encoder", + type=str, + default="", + metavar="EXPR", + help=""" path to load seed encoder model """, + ) + parser.add_argument( + "--load-init-decoder", + type=str, + default="", + metavar="EXPR", + help=""" path to load seed decoder model """, + ) + + parser.add_argument( + "--text-input-cost-ratio", + type=float, + default=1.0, + metavar="V", + help="text input cost ratio relative to speech input cost", + ) + parser.add_argument( + "--enc-grad-mult", + type=float, + metavar="V", + default=1.0, + help="multiply enc1 and enc2 gradient by V", + ) + parser.add_argument( + "--enc2-along-grad-mult", + type=float, + metavar="V", + default=1.0, + help="multiply enc2 gradient by V if only enc2 is used", + ) + parser.add_argument( + "--no-strict-check-pretrain-model", + action="store_true", + help="Don't apply strict model check for the pretrained model", + ) + + parser.add_argument( + "--stacked-encoder", + action="store_true", + help="stack speech and text encoders", + ) + + @classmethod + def update_transformer_encoder_cfg(cls, args, update_dict): + cfg = dict(args._get_kwargs()) + for fkey in update_dict.keys(): + cfg[fkey] = update_dict[fkey] + cfg.pop("_name", None) # remove keys start with _ + model_args = namedtuple("args", cfg.keys())(*cfg.values()) + return model_args + + @classmethod + def build_text_encoder(cls, args, src_dictionary): + enc_emb = nn.Embedding( + len(src_dictionary), args.encoder_embed_dim, src_dictionary.pad() + ) + model_args = cls.update_transformer_encoder_cfg( + args, + { + "encoder_layers": 
args.text_encoder_layers, + "max_source_positions": args.max_positions_text, + }, + ) + text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb) + return text_encoder + + @classmethod + def build_speech_encoder(cls, args): + model_args = cls.update_transformer_encoder_cfg( + args, {"encoder_layers": args.speech_encoder_layers} + ) + speech_encoder = SpeechWavTransformerEncoder(model_args) + return speech_encoder + + @classmethod + def check_args(cls, condition, is_strict, msg): + if condition: + return + if is_strict: + raise ValueError(msg) + logger.warn(msg) + + @classmethod + def build_encoder(cls, args, task): + # text_encoder = cls.build_text_encoder(args, task.source_dictionary ) + text_encoder = cls.build_text_encoder(args, task.src_dict) + speech_encoder = cls.build_speech_encoder(args) + if args.load_pretrained_wav2vec_encoder: + component_pairs = ( + ("feature_extractor", speech_encoder.subsample), + ("post_extract_proj", speech_encoder.feat_proj), + ("layer_norm", speech_encoder.feat_layer_norm), + ("encoder.pos_conv", speech_encoder.embed_positions), + ("encoder.layers", speech_encoder.layers), + ("encoder.layer_norm", speech_encoder.layer_norm), + ("mask_emb", speech_encoder.mask_emb), + ) + state = cls.load_pretrained_speech_text_components( + args.load_pretrained_wav2vec_encoder, component_pairs + ) + cls.check_args( + args.encoder_normalize_before + == state["cfg"]["model"]["layer_norm_first"], + not args.no_strict_check_pretrain_model, + f"encoder_normalize_before {args.encoder_normalize_before} doesn't match with the pretrained model", + ) + cls.check_args( + args.activation_fn == state["cfg"]["model"]["activation_fn"], + not args.no_strict_check_pretrain_model, + f"activation_fn {args.activation_fn} doesn't match with the pretrained model", + ) + + if getattr(args, "stacked_encoder", False): + if args.encoder_shared_text_layers_from_begin > 0: + raise ValueError( + "We can not stack encoders and share encoders at the same time!" 
+ ) + speech_encoder = StackedSpeechWavTransformerEncoder( + speech_encoder, text_encoder.layers, text_encoder.layer_norm + ) + else: + cls.share_speech_text_encoder( + speech_encoder, text_encoder, args.encoder_shared_text_layers_from_begin + ) + + cross_attentive_loss_before_last_layer = ( + 0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1 + ) + encoder = DualInputEncoder( + args, + speech_encoder, + text_encoder, + task.src_dict, + cross_attentive_loss_before_last_layer, + ) + if args.load_pretrained_speech_text_encoder: + component_pairs = ( + ("encoder.sup_s2s_speech_encoder", encoder.spch_encoder), + ("encoder.text_encoder", encoder.text_encoder), + ) + cls.load_pretrained_speech_text_components( + args.load_pretrained_speech_text_encoder, component_pairs + ) + if getattr(args, "load_init_encoder", "") != "": + checkpoint_utils.load_pretrained_component_from_model( + encoder, args.load_init_encoder + ) + return encoder + + @classmethod + def build_text_decoder(cls, args, tgt_dictionary, dec_emb_share=None): + dec_emb = ( + nn.Embedding( + len(tgt_dictionary), args.decoder_embed_dim, tgt_dictionary.pad() + ) + if dec_emb_share is None + else dec_emb_share + ) + text_decoder = TransformerDecoder(args, tgt_dictionary, dec_emb) + return text_decoder + + @classmethod + def build_decoder(cls, args, task): + text_decoder = cls.build_text_decoder(args, task.target_dictionary) + compute_cross_attentive_loss = ( + True if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else False + ) + cross_attentive_loss_without_norm = getattr( + args, "attentive_cost_without_normalize", False + ) + cross_attentive_loss_reverse = ( + False # getattr(args, "attentive_cost_reverse", False) + ) + if getattr(args, "load_pretrained_text_decoder", "") != "": + checkpoint_utils.load_pretrained_component_from_model( + text_decoder, args.load_pretrained_text_decoder + ) + + if args.load_pretrained_speech_text_decoder: + component_pairs = (("decoder.text_decoder", text_decoder),) + cls.load_pretrained_speech_text_components( + args.load_pretrained_speech_text_decoder, component_pairs + ) + + decoder = TransformerMultiInputDecoder( + dictionary=task.target_dictionary, + spch_decoder=text_decoder, + text_decoder=text_decoder, + compute_cross_attentive_loss=compute_cross_attentive_loss, + cross_attentive_loss_with_norm=True + if not cross_attentive_loss_without_norm + else False, + cross_attentive_loss_reverse=cross_attentive_loss_reverse, + ) + if getattr(args, "load_init_decoder", "") != "": + checkpoint_utils.load_pretrained_component_from_model( + decoder, args.load_init_decoder + ) + return decoder + + @classmethod + def load_pretrained_speech_text_components(cls, checkpoint, component_pairs): + if not PathManager.exists(checkpoint): + raise IOError("Model file not found: {}".format(checkpoint)) + state = load_checkpoint_to_cpu(checkpoint) + for component_type, component in component_pairs: + if isinstance(component, nn.parameter.Parameter): + component.data.copy_(state["model"][component_type]) + else: + component_state_dict = OrderedDict() + for key in state["model"].keys(): + if key.startswith(component_type): + component_subkey = key[len(component_type) + 1 :] + component_state_dict[component_subkey] = state["model"][key] + component.load_state_dict(component_state_dict, strict=True) + return state + + @classmethod + def share_speech_text_encoder( + cls, speech_encoder, text_encoder, shared_layers_from_begin + ): + if shared_layers_from_begin > 0: + num_text_encoder_layers = 
len(text_encoder.layers)
+            assert len(speech_encoder.layers) >= shared_layers_from_begin
+            assert num_text_encoder_layers >= shared_layers_from_begin
+            assert len(speech_encoder.layers) >= num_text_encoder_layers
+            for i, ly in enumerate(
+                speech_encoder.layers[
+                    -num_text_encoder_layers : -num_text_encoder_layers
+                    + shared_layers_from_begin
+                ]
+            ):
+                assert isinstance(text_encoder.layers[i], type(ly))
+                text_encoder.layers[i] = ly
+
+
+@register_model_architecture(
+    "dual_input_wav_transformer", "dualinputs2twavtransformer_base"
+)
+def dualinputs2twavtransformer_base(args):
+    # speech masking
+    args.dropout_input = getattr(args, "dropout_input", 0)
+    args.dropout_features = getattr(args, "dropout_features", 0)
+    args.speech_mask_length = getattr(args, "speech_mask_length", 10)
+    args.speech_mask_prob = getattr(args, "speech_mask_prob", 0.65)
+    args.speech_mask_selection = getattr(args, "speech_mask_selection", "static")
+    args.speech_mask_other = getattr(args, "speech_mask_other", 0)
+    args.speech_mask_min_space = getattr(args, "speech_mask_min_space", 1)
+    args.speech_no_mask_overlap = getattr(args, "speech_no_mask_overlap", False)
+    args.speech_conv_bias = getattr(args, "speech_conv_bias", False)
+    args.speech_extractor_mode = getattr(args, "speech_extractor_mode", "default")
+    args.no_strict_check_pretrain_model = getattr(
+        args, "no_strict_check_pretrain_model", False
+    )
+
+    args.speech_mask_channel_length = getattr(args, "speech_mask_channel_length", 10)
+    args.speech_mask_channel_prob = getattr(args, "speech_mask_channel_prob", 0.0)
+    args.speech_mask_channel_selection = getattr(
+        args, "speech_mask_channel_selection", "static"
+    )
+    args.speech_mask_channel_other = getattr(args, "speech_mask_channel_other", 0)
+    args.speech_mask_channel_min_space = getattr(
+        args, "speech_mask_channel_min_space", 1
+    )
+    args.speech_no_mask_channel_overlap = getattr(
+        args, "speech_no_mask_channel_overlap", False
+    )
+    args.no_scale_feature = getattr(args, "no_scale_feature", False)
+    args.feature_grad_mult = getattr(args, "feature_grad_mult", 0.0)  # 0.1
+
+    # Transformer
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
+    args.encoder_ffn_embed_dim = getattr(
+        args, "encoder_ffn_embed_dim", args.encoder_embed_dim * 4
+    )
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
+    args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.1)
+    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
+
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_attention_heads = getattr(
+        args, "decoder_attention_heads", args.encoder_attention_heads
+    )
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0)
+    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
+    args.activation_fn = getattr(args, "activation_fn", "relu")  # gelu?
+ args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 6 + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + + +@register_model_architecture( + "dual_input_wav_transformer", "dualinputs2twavtransformer_base_stack" +) +def dualinputs2twavtransformer_base_stack(args): + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 6) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 0 + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.stacked_encoder = getattr(args, "stacked_encoder", True) + args.layernorm_embedding = getattr(args, "layernorm_embedding", True) + dualinputs2twavtransformer_base(args) + + +@register_model_architecture( + "dual_input_wav_transformer", "dualinputs2twavtransformer_large" +) +def dualinputs2twavtransformer_large(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 24) + args.text_encoder_layers = getattr(args, "text_encoder_layers", 12) + args.encoder_shared_text_layers_from_begin = getattr( + args, "encoder_shared_text_layers_from_begin", 12 + ) + args.decoder_layers = getattr(args, "decoder_layers", 12) + dualinputs2twavtransformer_base(args) diff --git a/examples/speech_text_joint_to_text/models/s2t_dualinputxmtransformer.py b/examples/speech_text_joint_to_text/models/s2t_dualinputxmtransformer.py new file mode 100644 index 0000000000..7b4cbb0aa6 --- /dev/null +++ b/examples/speech_text_joint_to_text/models/s2t_dualinputxmtransformer.py @@ -0,0 +1,584 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
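+
+# Module overview: this file builds the dual-input model from pretrained
+# components, a wav2vec 2.0 encoder (with an adaptor) for speech and an mBART
+# encoder/decoder for text. Pretrained parameters are frozen by default; only
+# parameters matched by the --finetune-mbart-encoder-params,
+# --finetune-mbart-decoder-params and finetune-w2v-params options stay
+# trainable.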
+ +import copy + +import torch.nn as nn +from fairseq import checkpoint_utils +from fairseq import utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import ( + register_model, + register_model_architecture, + FairseqEncoder, +) +from fairseq.models.speech_to_text import Wav2VecEncoderWithAdaptor +from fairseq.models.speech_to_text.xm_transformer import ( + set_default_adaptor_args, + set_default_w2v_encoder_args, + need_finetuning +) +from fairseq.models.transformer import TransformerEncoder, TransformerDecoder +from fairseq.models.wav2vec import TransformerSentenceEncoderLayer +from fairseq.utils import safe_hasattr + +from .s2t_dualinputtransformer import ( + DualInputS2TTransformerModel, + TransformerMultiInputDecoder, + DualInputEncoder, +) + + +class TransformerSentenceEncoderLayerStd(TransformerSentenceEncoderLayer): + def __init__(self, sent_enc_layer): + super(TransformerSentenceEncoderLayer, self).__init__() + self.embedding_dim = sent_enc_layer.embedding_dim + self.dropout = sent_enc_layer.dropout + self.activation_dropout = sent_enc_layer.activation_dropout + + # Initialize blocks + self.activation_fn = sent_enc_layer.activation_fn + self.self_attn = sent_enc_layer.self_attn + + self.dropout1 = sent_enc_layer.dropout1 + self.dropout2 = sent_enc_layer.dropout2 + self.dropout3 = sent_enc_layer.dropout3 + + self.layer_norm_first = sent_enc_layer.layer_norm_first + + # layer norm associated with the self attention layer + self.self_attn_layer_norm = sent_enc_layer.self_attn_layer_norm + self.fc1 = sent_enc_layer.fc1 + self.fc2 = sent_enc_layer.fc2 + + # layer norm associated with the position wise feed-forward NN + self.final_layer_norm = sent_enc_layer.final_layer_norm + + def forward( + self, + x, + self_attn_mask=None, + self_attn_padding_mask=None, + need_weights=None, + att_args=None, + ): + x, attn = super().forward( + x, self_attn_mask, self_attn_padding_mask, need_weights, att_args + ) + return x + + +# TODO retire SharedEncoder +class SharedEncoder(FairseqEncoder): + def __init__(self, wav2vec_enc, mbart_enc, adaptor, shared_layers): + super().__init__(None) + self.w2v_encoder = wav2vec_enc + self.shared_layers = self.w2v_encoder.w2v_model.encoder.layers[-shared_layers:] + self.w2v_encoder.w2v_model.encoder.layers = ( + self.w2v_encoder.w2v_model.encoder.layers[:-shared_layers] + ) + self.adaptor = adaptor + if self.shared_layers[-1].layer_norm_first: + self.final_layer_norm = mbart_enc.layer_norm + else: + mbart_enc.layer_norm = None + self.final_layer_norm = None + shared_layer_from = len(mbart_enc.layers) - shared_layers + if shared_layer_from < 0: + shared_layer_from = 0 + for layer_id, layer in enumerate(self.shared_layers): + mbart_enc.layers[ + shared_layer_from + layer_id + ] = TransformerSentenceEncoderLayerStd(layer) + + def forward(self, src_tokens, src_lengths=None, **kwargs): + padding_mask = lengths_to_padding_mask(src_lengths) + if not padding_mask.any(): + padding_mask = None + + out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True) + x = out["encoder_out"] + enc_padding_mask = None + if out["encoder_padding_mask"] is not None: + enc_padding_mask = out["encoder_padding_mask"].transpose( + 0, 1 + ) # T X B --> B X T + + x, enc_padding_mask = self.adaptor(x, enc_padding_mask) + for layer in self.shared_layers: + x, _ = layer(x, enc_padding_mask) + if self.final_layer_norm is not None: + x = self.final_layer_norm(x) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [enc_padding_mask] + if 
enc_padding_mask is not None + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": [], # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + +class StackedWav2VecEncoderWithAdaptor(FairseqEncoder): + def __init__( + self, + wav2vec_enc, + mbart_enc_layers, + mbart_layer_norm, + adaptor, + drop_w2v_layers=0, + ): + super().__init__(None) + self.w2v_encoder = wav2vec_enc + self.adaptor = adaptor + self.mbart_encoder_layers = mbart_enc_layers + self.final_layer_norm = mbart_layer_norm + if drop_w2v_layers > 0: + self.w2v_encoder.w2v_model.encoder.layers = ( + self.w2v_encoder.w2v_model.encoder.layers[:-drop_w2v_layers] + ) + + def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs): + padding_mask = lengths_to_padding_mask(src_lengths) + if not padding_mask.any(): + padding_mask = None + + out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True) + x = out["encoder_out"] + enc_padding_mask = None + if out["padding_mask"] is not None: + enc_padding_mask = out["padding_mask"] # B X T + + x, enc_padding_mask = self.adaptor(x, enc_padding_mask) + encoder_states = [] + for layer in self.mbart_encoder_layers: + x = layer(x, enc_padding_mask) + if return_all_hiddens: + encoder_states.append(x) + if self.final_layer_norm is not None: + x = self.final_layer_norm(x) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [enc_padding_mask] + if enc_padding_mask is not None + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + def reorder_encoder_out(self, encoder_out, new_order): + new_encoder_out = ( + [] + if len(encoder_out["encoder_out"]) == 0 + else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]] + ) + + new_encoder_padding_mask = ( + [] + if len(encoder_out["encoder_padding_mask"]) == 0 + else [ + x.index_select(0, new_order) + for x in encoder_out["encoder_padding_mask"] + ] + ) + + new_encoder_embedding = ( + [] + if len(encoder_out["encoder_embedding"]) == 0 + else [ + x.index_select(0, new_order) for x in encoder_out["encoder_embedding"] + ] + ) + + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], # B x T + "src_lengths": [], # B x 1 + } + + +# Note: +# dual input transformer: +# encoder: wav2vec for speech + mbart encoder for text +# decoder: mbart decoder for text +@register_model("dual_input_xm_transformer") +class DualInputXMTransformerModel(DualInputS2TTransformerModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # wav2vec encoder + Wav2VecEncoderWithAdaptor.add_args(parser) + # add_decoder_args(parser) + # mbart Transformer + parser.add_argument( + "--activation-fn", + type=str, + default="relu", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + + parser.add_argument( + "--mbart-dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--mbart-attention-dropout", + type=float, + metavar="D", + 
help="dropout probability for attention weights", + ) + parser.add_argument( + "--mbart-activation-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="num encoder layers" + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--decoder-normalize-before", + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--layernorm-embedding", + action="store_true", + help="add layernorm to embedding", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + parser.add_argument( + "--load-pretrained-mbart-from", + type=str, + metavar="STR", + help="model to take text encoder decoder weights from (for initialization)", + ) + # parser.add_argument("--finetune-w2v-params", type=str, metavar="STR", + # help="comma-separated param strings to finetune.") + parser.add_argument( + "--finetune-mbart-decoder-params", + type=str, + metavar="STR", + help="comma-separated param strings to finetune.", + ) + parser.add_argument( + "--finetune-mbart-encoder-params", + type=str, + metavar="STR", + help="comma-separated param strings to finetune.", + ) + parser.add_argument( + "--skip-encoder-projection", + action="store_true", + help="skip the projection layer in encoder", + ) + + parser.add_argument( + "--enc-grad-mult", + type=float, + metavar="V", + default=1.0, + help="multiply enc1 and enc2 gradient by V", + ) + parser.add_argument( + "--enc2-along-grad-mult", + type=float, + metavar="V", + default=1.0, + help="multiply enc2 gradient by V if only enc2 is used", + ) + parser.add_argument( + "--text-input-cost-ratio", + type=float, + default=1.0, + metavar="V", + help="text input cost ratio relative to speech input cost", + ) + parser.add_argument( + "--stack-w2v-mbart-encoder", + action="store_true", + help="stack w2v and mbart encoder", + ) + parser.add_argument( + "--stack-w2v-mbart-nonorm-encoder", + action="store_true", + help="stack w2v and mbart encoder", + ) + parser.add_argument( + "--no-final-norm-decoder", action="store_true", help="no layer norm" + ) + parser.add_argument( + "--drop-w2v-layers", + type=int, + default=0, + metavar="N", + help="drop w2v encoder layers", + ) + + parser.add_argument( + "--share-w2v-text-encoder", + action="store_true", + help="share w2v encoder layers with text encoder", + ) + parser.add_argument( + "--shared-w2v-layers", + type=int, + default=0, + metavar="N", + help="shared encoder layers from w2v encoder", + 
) + + @classmethod + def build_encoder(cls, args, task): + _args = copy.deepcopy(args) + _args.dropout = args.mbart_dropout + _args.attention_dropout = args.mbart_attention_dropout + _args.activation_dropout = args.mbart_activation_dropout + _args.max_source_positions = 1024 + enc_emb = nn.Embedding( + len(task.src_dict), _args.encoder_embed_dim, task.src_dict.pad() + ) + text_encoder = TransformerEncoder(_args, task.src_dict, enc_emb) + spch_encoder = Wav2VecEncoderWithAdaptor(args) + if getattr(args, "load_pretrained_mbart_from", None): + text_encoder = checkpoint_utils.load_pretrained_component_from_model( + component=text_encoder, checkpoint=args.load_pretrained_mbart_from + ) + if getattr(args, "stack_w2v_mbart_encoder", False): + assert getattr(args, "share_w2v_text_encoder", False) is False + spch_encoder = StackedWav2VecEncoderWithAdaptor( + spch_encoder.w2v_encoder, + text_encoder.layers, + text_encoder.layer_norm, + spch_encoder.adaptor, + args.drop_w2v_layers, + ) + elif getattr(args, "stack_w2v_mbart_nonorm_encoder", False): + text_encoder.layer_norm = None + spch_encoder = StackedWav2VecEncoderWithAdaptor( + spch_encoder.w2v_encoder, + text_encoder.layers, + text_encoder.layer_norm, + spch_encoder.adaptor, + args.drop_w2v_layers, + ) + elif getattr(args, "share_w2v_text_encoder", False): + spch_encoder = SharedEncoder( + spch_encoder.w2v_encoder, + text_encoder, + spch_encoder.adaptor, + args.shared_w2v_layers, + ) + + for k, p in spch_encoder.named_parameters(): + # Freeze pretrained models by default + if safe_hasattr( + args, "finetune_w2v_params" + ) and need_finetuning(args.finetune_w2v_params, k): + p.requires_grad = True + else: + p.requires_grad = False + for k, p in text_encoder.named_parameters(): + # Freeze pretrained models by default + if safe_hasattr( + args, "finetune_mbart_encoder_params" + ) and need_finetuning( + args.finetune_mbart_encoder_params, k + ): + p.requires_grad = True + else: + p.requires_grad = False + cross_attentive_loss_before_last_layer = ( + 0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1 + ) + encoder = DualInputEncoder( + args, + spch_encoder, + text_encoder, + task.src_dict, + cross_attentive_loss_before_last_layer, + ) + return encoder + + @classmethod + def build_decoder(cls, args, task): + _args = copy.deepcopy(args) + _args.dropout = args.mbart_dropout + _args.attention_dropout = args.mbart_attention_dropout + _args.activation_dropout = args.mbart_activation_dropout + _args.max_target_positions = 1024 + dec_emb = nn.Embedding( + len(task.tgt_dict), _args.encoder_embed_dim, task.tgt_dict.pad() + ) + decoder = TransformerDecoder(_args, task.tgt_dict, dec_emb) + if getattr(args, "load_pretrained_mbart_from", None): + decoder = checkpoint_utils.load_pretrained_component_from_model( + component=decoder, checkpoint=args.load_pretrained_mbart_from + ) + if getattr(args, "no_final_norm_decoder", False): + decoder.layer_norm = None + for k, p in decoder.named_parameters(): + # Freeze pretrained models by default + if safe_hasattr( + args, "finetune_mbart_decoder_params" + ) and need_finetuning( + args.finetune_mbart_decoder_params, k + ): + p.requires_grad = True + else: + p.requires_grad = False + + compute_cross_attentive_loss = ( + True if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else False + ) + cross_attentive_loss_without_norm = getattr( + args, "attentive_cost_without_normalize", False + ) + cross_attentive_loss_reverse = ( + False # getattr(args, "attentive_cost_reverse", False) + ) + decoder = 
TransformerMultiInputDecoder( + dictionary=task.target_dictionary, + spch_decoder=decoder, + text_decoder=decoder, + compute_cross_attentive_loss=compute_cross_attentive_loss, + cross_attentive_loss_with_norm=True + if not cross_attentive_loss_without_norm + else False, + cross_attentive_loss_reverse=cross_attentive_loss_reverse, + ) + return decoder + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + # make sure that all args are properly defaulted + # (in case there are any new ones) + dualinputxmtransformer_base(args) + + encoder = cls.build_encoder(args, task) + decoder = cls.build_decoder(args, task) + return cls(encoder, decoder) + + +@register_model_architecture("dual_input_xm_transformer", "dualinputxmtransformer_base") +def dualinputxmtransformer_base(args): + # wav2vec encoder + set_default_w2v_encoder_args(args) + set_default_adaptor_args(args) + + # mbart model + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr( + args, "encoder_ffn_embed_dim", 4 * args.encoder_embed_dim + ) + args.encoder_layers = getattr(args, "encoder_layers", 12) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) + + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4 * 1024) + args.decoder_layers = getattr(args, "decoder_layers", 12) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + + args.adaptive_input = getattr(args, "adaptive_input", False) + + args.mbart_attention_dropout = getattr(args, "mbart_attention_dropout", 0.0) + args.mbart_activation_dropout = getattr(args, "mbart_activation_dropout", 0.0) + args.mbart_dropout = getattr(args, "mbart_dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", True + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + args.layernorm_embedding = getattr(args, "layernorm_embedding", True) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) diff --git a/examples/speech_text_joint_to_text/scripts/convert_model.py b/examples/speech_text_joint_to_text/scripts/convert_model.py new file mode 100644 index 0000000000..4923af1312 --- /dev/null +++ b/examples/speech_text_joint_to_text/scripts/convert_model.py @@ -0,0 +1,71 @@ 
+#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import re +from collections import OrderedDict + +import torch + +from fairseq.file_io import PathManager + + +def is_update(param_name, module_name): + if module_name in param_name: + return True + return False + + +def load_checkpoint(src_cpt): + + with PathManager.open(src_cpt, "rb") as f: + state_src = torch.load( + f, + map_location=( + lambda s, _: torch.serialization.default_restore_location(s, "cpu") + ), + ) + + return state_src + + +def save_checkpoint(tgt_cpt, states): + + with PathManager.open(tgt_cpt, "wb") as f: + torch.save( + states, + f, + ) + + +# convert the pre-trained model into bart model +def main(): + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument('--input-model', required=True, + help='Input checkpoint file path.') + parser.add_argument('--output-model', required=True, + help='output checkpoint file path.') + # fmt: on + args = parser.parse_args() + print(args) + + states = load_checkpoint(args.input_model) + model = states["model"] + new_model = OrderedDict() + for key in model.keys(): + if re.search("^encoder.text_encoder", key): + new_key = re.sub("encoder.text_encoder", "encoder", key) + new_model[new_key] = model[key] + elif re.search("^decoder.text_decoder", key): + new_key = re.sub("decoder.text_decoder", "decoder", key) + new_model[new_key] = model[key] + states["model"] = new_model + save_checkpoint(args.output_model, states) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_text_joint_to_text/scripts/g2p_encode.py b/examples/speech_text_joint_to_text/scripts/g2p_encode.py new file mode 100644 index 0000000000..9db779396f --- /dev/null +++ b/examples/speech_text_joint_to_text/scripts/g2p_encode.py @@ -0,0 +1,191 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
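+
+# This script converts a text corpus into phoneme sequences with g2p_en; it is
+# intended to prepare phoneme-level text inputs for the joint speech/text
+# models in this example. Illustrative invocation (paths are placeholders):
+#   python g2p_encode.py --data-path input.txt --out-path input.pho \
+#       --lower-case --do-filter --use-word-start --no-punc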
+ +import argparse +import itertools +import logging +import re +import time + +from g2p_en import G2p + +logger = logging.getLogger(__name__) + +FAIL_SENT = "FAILED_SENTENCE" + + +def parse(): + parser = argparse.ArgumentParser() + parser.add_argument("--data-path", type=str, required=True) + parser.add_argument("--out-path", type=str, required=True) + parser.add_argument("--lower-case", action="store_true") + parser.add_argument("--do-filter", action="store_true") + parser.add_argument("--use-word-start", action="store_true") + parser.add_argument("--dup-vowel", default=1, type=int) + parser.add_argument("--dup-consonant", default=1, type=int) + parser.add_argument("--no-punc", action="store_true") + parser.add_argument("--reserve-word", type=str, default="") + parser.add_argument( + "--reserve-first-column", + action="store_true", + help="first column is sentence id", + ) + ### + parser.add_argument("--parallel-process-num", default=1, type=int) + parser.add_argument("--logdir", default="") + args = parser.parse_args() + return args + + +def process_sent(sent, g2p, res_wrds, args): + sents = pre_process_sent(sent, args.do_filter, args.lower_case, res_wrds) + pho_seqs = [do_g2p(g2p, s, res_wrds, i == 0) for i, s in enumerate(sents)] + pho_seq = ( + [FAIL_SENT] + if [FAIL_SENT] in pho_seqs + else list(itertools.chain.from_iterable(pho_seqs)) + ) + if args.no_punc: + pho_seq = remove_punc(pho_seq) + if args.dup_vowel > 1 or args.dup_consonant > 1: + pho_seq = dup_pho(pho_seq, args.dup_vowel, args.dup_consonant) + if args.use_word_start: + pho_seq = add_word_start(pho_seq) + return " ".join(pho_seq) + + +def remove_punc(sent): + ns = [] + regex = re.compile("[^a-zA-Z0-9 ]") + for p in sent: + if (not regex.search(p)) or p == FAIL_SENT: + if p == " " and (len(ns) == 0 or ns[-1] == " "): + continue + ns.append(p) + return ns + + +def do_g2p(g2p, sent, res_wrds, is_first_sent): + if sent in res_wrds: + pho_seq = [res_wrds[sent]] + else: + pho_seq = g2p(sent) + if not is_first_sent: + pho_seq = [" "] + pho_seq # add space to separate + return pho_seq + + +def pre_process_sent(sent, do_filter, lower_case, res_wrds): + if do_filter: + sent = re.sub("-", " ", sent) + sent = re.sub("—", " ", sent) + if len(res_wrds) > 0: + wrds = sent.split() + wrds = ["SPLIT_ME " + w + " SPLIT_ME" if w in res_wrds else w for w in wrds] + sents = [x.strip() for x in " ".join(wrds).split("SPLIT_ME") if x.strip() != ""] + else: + sents = [sent] + if lower_case: + sents = [s.lower() if s not in res_wrds else s for s in sents] + return sents + + +def dup_pho(sent, dup_v_num, dup_c_num): + """ + duplicate phoneme defined as cmudict + http://www.speech.cs.cmu.edu/cgi-bin/cmudict + """ + if dup_v_num == 1 and dup_c_num == 1: + return sent + ns = [] + for p in sent: + ns.append(p) + if re.search(r"\d$", p): + for i in range(1, dup_v_num): + ns.append(f"{p}-{i}P") + elif re.search(r"\w", p): + for i in range(1, dup_c_num): + ns.append(f"{p}-{i}P") + return ns + + +def add_word_start(sent): + ns = [] + do_add = True + ws = "▁" + for p in sent: + if do_add: + p = ws + p + do_add = False + if p == " ": + do_add = True + else: + ns.append(p) + return ns + + +def load_reserve_word(reserve_word): + if reserve_word == "": + return [] + with open(reserve_word, "r") as fp: + res_wrds = [x.strip().split() for x in fp.readlines() if x.strip() != ""] + assert sum([0 if len(x) == 2 else 1 for x in res_wrds]) == 0 + res_wrds = dict(res_wrds) + return res_wrds + + +def process_sents(sents, args): + g2p = G2p() + out_sents = [] + res_wrds 
= load_reserve_word(args.reserve_word) + for sent in sents: + col1 = "" + if args.reserve_first_column: + col1, sent = sent.split(None, 1) + sent = process_sent(sent, g2p, res_wrds, args) + if args.reserve_first_column and col1 != "": + sent = f"{col1} {sent}" + out_sents.append(sent) + return out_sents + + +def main(): + args = parse() + out_sents = [] + with open(args.data_path, "r") as fp: + sent_list = [x.strip() for x in fp.readlines()] + if args.parallel_process_num > 1: + try: + import submitit + except ImportError: + logger.warn( + "submitit is not found and only one job is used to process the data" + ) + submitit = None + + if args.parallel_process_num == 1 or submitit is None: + out_sents = process_sents(sent_list, args) + else: + # process sentences with parallel computation + lsize = len(sent_list) // args.parallel_process_num + 1 + executor = submitit.AutoExecutor(folder=args.logdir) + executor.update_parameters(timeout_min=1000, cpus_per_task=4) + jobs = [] + for i in range(args.parallel_process_num): + job = executor.submit( + process_sents, sent_list[lsize * i : lsize * (i + 1)], args + ) + jobs.append(job) + is_running = True + while is_running: + time.sleep(5) + is_running = sum([job.done() for job in jobs]) < len(jobs) + out_sents = list(itertools.chain.from_iterable([job.result() for job in jobs])) + with open(args.out_path, "w") as fp: + fp.write("\n".join(out_sents) + "\n") + + +if __name__ == "__main__": + main() diff --git a/examples/speech_text_joint_to_text/tasks/__init__.py b/examples/speech_text_joint_to_text/tasks/__init__.py new file mode 100644 index 0000000000..5fc5d9e21b --- /dev/null +++ b/examples/speech_text_joint_to_text/tasks/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +import os + diff --git a/examples/speech_text_joint_to_text/tasks/pair_denoising.py b/examples/speech_text_joint_to_text/tasks/pair_denoising.py new file mode 100644 index 0000000000..b13b1e5ae3 --- /dev/null +++ b/examples/speech_text_joint_to_text/tasks/pair_denoising.py @@ -0,0 +1,447 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import itertools +import logging +import os +import re + +import numpy as np +import torch + +from examples.speech_text_joint_to_text.data.pair_denoising_dataset import ( + LanguagePairDenoisingDataset, +) +from fairseq import utils +from fairseq.data import ( + ConcatDataset, + Dictionary, + LanguagePairDataset, + ResamplingDataset, + TransformEosConcatLangPairDataset, + TransformEosLangPairDataset, + data_utils, + indexed_dataset, +) +from fairseq.data.encoders.utils import get_whole_word_mask +from fairseq.tasks import register_task +from fairseq.tasks.translation import TranslationTask + +logger = logging.getLogger(__name__) + + +def gen_whole_word_mask(args, dictionary): + def is_beginning_of_word(i): + if i < dictionary.nspecial: + # special elements are always considered beginnings + return True + tok = dictionary[i] + if tok.startswith("madeupword"): + return True + + if tok in ["<unk>", "<s>", "</s>", "<pad>"]: + return True + return tok.startswith("\u2581") + + if args.use_mask_whole_words: + mask_whole_words = torch.ByteTensor( + list(map(is_beginning_of_word, range(len(dictionary)))) + ) + else: + # it will mask every token as word leading token, since no bpe model is loaded for phoneme tokens + return get_whole_word_mask(args, dictionary) + return mask_whole_words + + +@register_task("paired_denoising") +class PairedDenoisingTask(TranslationTask): + + LANG_TAG_TEMPLATE = "<lang:{}>" # Tag for language (target) + + @staticmethod + def add_args(parser): + TranslationTask.add_args(parser) + # bart setting + parser.add_argument( + "--mask", + default=0.0, + type=float, + help="fraction of words/subwords that will be masked", + ) + parser.add_argument( + "--mask-random", + default=0.0, + type=float, + help="instead of using [MASK], use random token this often", + ) + parser.add_argument( + "--insert", + default=0.0, + type=float, + help="insert this percentage of additional random tokens", + ) + parser.add_argument( + "--poisson-lambda", + default=3.0, + type=float, + help="randomly shuffle sentences for this proportion of inputs", + ) + parser.add_argument( + "--mask-length", + default="span-poisson", + type=str, + choices=["subword", "word", "span-poisson"], + help="mask length to choose", + ) + parser.add_argument( + "--replace-length", + default=1, + type=int, + help="when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)", + ) + + # multi-lingual + parser.add_argument( + "--multilang-sampling-alpha", + type=float, + default=1.0, + help="smoothing alpha for sample ratios across multiple datasets", + ) + parser.add_argument( + "--lang-pairs", + default="", + metavar="PAIRS", + help="comma-separated list of language pairs (in training order): phnen-en,phnfr-fr,phnit-it. Do masking", + ) + parser.add_argument( + "--lang-pairs-bitext", + default="", + metavar="PAIRS", + help="comma-separated list of language pairs (in training order): en-de,en-fr,de-fr. 
No masking", + ) + parser.add_argument("--add-src-lang-token", default=False, action="store_true") + parser.add_argument("--add-tgt-lang-token", default=False, action="store_true") + parser.add_argument( + "--no-whole-word-mask-langs", + type=str, + default="", + metavar="N", + help="languages without spacing between words dont support whole word masking", + ) + parser.add_argument( + "--use-mask-whole-words", default=False, action="store_true" + ) + + @classmethod + def setup_task(cls, args, **kwargs): + """Setup the task.""" + paths = args.data.split(":") + assert len(paths) > 0 + src_dict = Dictionary.load( + os.path.join(paths[0], "src_dict.txt") + ) # assume all languages share a source dictionary + tgt_dict = Dictionary.load( + os.path.join(paths[0], "tgt_dict.txt") + ) # assume all languages share a target dictionary + + lang_pairs = args.lang_pairs + "," + args.lang_pairs_bitext + lang_pairs = re.sub(",$", "", re.sub("^,", "", lang_pairs)) + src_langs = [lp.split("-")[0] for lp in lang_pairs.split(",")] + tgt_langs = [lp.split("-")[1] for lp in lang_pairs.split(",")] + + if args.add_src_lang_token: + for lang in src_langs: + assert ( + src_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang)) + != src_dict.unk() + ) + if args.add_tgt_lang_token: + for lang in tgt_langs: + assert ( + tgt_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang)) + != tgt_dict.unk() + ) + + logger.info("source dictionary: {} types".format(len(src_dict))) + logger.info("target dictionary: {} types".format(len(tgt_dict))) + if not hasattr(args, "shuffle_instance"): + args.shuffle_instance = False + return cls(args, src_dict, tgt_dict) + + def __init__(self, args, src_dict, tgt_dict): + super().__init__(args, src_dict, tgt_dict) + # check mask token + self.mask_idx = self.src_dict.index("<mask>") + assert self.mask_idx != self.src_dict.unk() + self.lang_pairs = args.lang_pairs + self.lang_pairs_bitext = args.lang_pairs_bitext + self.args = args + + @classmethod + def language_pair_denoising_dataset( + cls, + data_path, + do_mask, + split, + src, + src_dict, + tgt, + tgt_dict, + mask_idx, + mask_whole_words, + seed, + args, + dataset_impl, + combine=False, + left_pad_source=True, + left_pad_target=False, + max_source_positions=1024, + max_target_positions=1024, + shuffle=True, + src_lang_id=None, + tgt_lang_id=None, + ): + def split_exists(split, src, tgt, lang, data_path): + filename = os.path.join( + data_path, "{}.{}-{}.{}".format(split, src, tgt, lang) + ) + return indexed_dataset.dataset_exists(filename, impl=dataset_impl) + + src_datasets = [] + tgt_datasets = [] + + for k in itertools.count(): + split_k = split + (str(k) if k > 0 else "") + + # infer langcode + if split_exists(split_k, src, tgt, src, data_path): + prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) + elif split_exists(split_k, tgt, src, src, data_path): + prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) + else: + if k > 0: + break + else: + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, data_path) + ) + + src_dataset = data_utils.load_indexed_dataset( + prefix + src, src_dict, dataset_impl + ) + src_datasets.append(src_dataset) + + tgt_dataset = data_utils.load_indexed_dataset( + prefix + tgt, tgt_dict, dataset_impl + ) + if tgt_dataset is not None: + tgt_datasets.append(tgt_dataset) + + logger.info( + "{} {} {}-{} {} examples".format( + data_path, split_k, src, tgt, len(src_datasets[-1]) + ) + ) + + if not combine: + break + + assert len(src_datasets) 
== len(tgt_datasets) or len(tgt_datasets) == 0 + + if len(src_datasets) == 1: + src_dataset = src_datasets[0] + tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None + else: + sample_ratios = [1] * len(src_datasets) + src_dataset = ConcatDataset(src_datasets, sample_ratios) + if len(tgt_datasets) > 0: + tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) + else: + tgt_dataset = None + + eos = None + + tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None + if not do_mask: + return LanguagePairDataset( + src_dataset, + src_dataset.sizes, + src_dict, + tgt_dataset, + tgt_dataset_sizes, + tgt_dict, + left_pad_source=left_pad_source, + left_pad_target=left_pad_target, + eos=eos, + shuffle=shuffle, + src_lang_id=src_lang_id, + tgt_lang_id=tgt_lang_id, + ) + + return LanguagePairDenoisingDataset( + src_dataset, + src_dataset.sizes, + src_dict, + tgt_dataset, + tgt_dataset_sizes, + tgt_dict, + mask_idx, + mask_whole_words, + seed, + args, + left_pad_source=left_pad_source, + left_pad_target=left_pad_target, + eos=eos, + shuffle=shuffle, + src_lang_id=src_lang_id, + tgt_lang_id=tgt_lang_id, + ) + + def _get_sample_prob(self, dataset_lens): + """ + Get smoothed sampling porbability by languages. This helps low resource + languages by upsampling them. + """ + prob = dataset_lens / dataset_lens.sum() + smoothed_prob = prob ** self.args.multilang_sampling_alpha + smoothed_prob = smoothed_prob / smoothed_prob.sum() + return smoothed_prob + + def resample_datasets(self, lang_datasets, lang_pairs_all, epoch): + # For train subset, additionally up or down sample languages. + if self.args.multilang_sampling_alpha == 1.0: + return lang_datasets + + dataset_lengths = np.array( + [len(d) for d in lang_datasets], + dtype=float, + ) + sample_probs = self._get_sample_prob(dataset_lengths) + logger.info( + "Sample probability by language pair: {}".format( + { + lp: "{0:.4f}".format(sample_probs[id]) + for id, lp in enumerate(lang_pairs_all) + } + ) + ) + size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths + logger.info( + "Up/Down Sampling ratio by language: {}".format( + { + lp: "{0:.2f}".format(size_ratio[id]) + for id, lp in enumerate(lang_pairs_all) + } + ) + ) + + resampled_lang_datasets = [ + ResamplingDataset( + lang_datasets[i], + size_ratio=size_ratio[i], + seed=self.args.seed, + epoch=epoch, + replace=size_ratio[i] >= 1.0, + ) + for i, d in enumerate(lang_datasets) + ] + return resampled_lang_datasets + + def load_dataset_only( + self, split, lang_pairs, do_mask=True, epoch=1, combine=False + ): + paths = utils.split_paths(self.args.data) + assert len(paths) > 0 + data_path = paths[(epoch - 1) % len(paths)] + + # TODO unk token will be considered as first word too, though it might be an unknown phoneme within a word + # get_whole_word_mask returns a tensor (size V by 1 ) to indicate if a token is a word start token + mask_whole_src_words = gen_whole_word_mask(self.args, self.src_dict) + language_without_segmentations = self.args.no_whole_word_mask_langs.split(",") + lang_datasets = [] + eos_bos = [] + lang_pairs = lang_pairs.split(",") if lang_pairs != "" else [] + assert len(lang_pairs) > 0 + for lp in lang_pairs: + src, tgt = lp.split("-") + lang_mask_whole_src_words = ( + mask_whole_src_words + if src not in language_without_segmentations + else None + ) + + end_token = ( + self.source_dictionary.index( + PairedDenoisingTask.LANG_TAG_TEMPLATE.format(src) + ) + if self.args.add_src_lang_token + else None + ) + bos_token = ( + 
self.target_dictionary.index( + PairedDenoisingTask.LANG_TAG_TEMPLATE.format(tgt) + ) + if self.args.add_tgt_lang_token + else None + ) + src_lang_id = None + + if self.args.add_src_lang_token or self.args.add_tgt_lang_token: + eos_bos.append((end_token, bos_token)) + + dataset = PairedDenoisingTask.language_pair_denoising_dataset( + data_path, + do_mask, + split, + src, + self.source_dictionary, + tgt, + self.target_dictionary, + self.mask_idx, + lang_mask_whole_src_words, + self.args.seed, + self.args, + self.args.dataset_impl, + combine=combine, + left_pad_source=utils.eval_bool(self.args.left_pad_source), + left_pad_target=utils.eval_bool(self.args.left_pad_target), + max_source_positions=self.args.max_source_positions, + max_target_positions=self.args.max_target_positions, + src_lang_id=src_lang_id, + ) + + lang_datasets.append(dataset) + + if len(lang_datasets) == 0: + return + elif len(lang_datasets) == 1: + dataset = lang_datasets[0] + if self.args.add_src_lang_token or self.args.add_tgt_lang_token: + end_token, bos_token = eos_bos[0] + dataset = TransformEosLangPairDataset( + dataset, + src_eos=self.source_dictionary.eos(), + new_src_eos=end_token, + tgt_bos=self.target_dictionary.eos(), + new_tgt_bos=bos_token, + ) + else: + end_tokens = [item[0] for item in eos_bos if item[0] is not None] + bos_tokens = [item[1] for item in eos_bos if item[1] is not None] + lang_datasets = self.resample_datasets(lang_datasets, lang_pairs, epoch) + dataset = TransformEosConcatLangPairDataset( + lang_datasets, + self.source_dictionary.eos(), + self.target_dictionary.eos(), + new_src_eos=end_tokens, + new_tgt_bos=bos_tokens, + ) + return dataset + + # split in (train, valid, test, ...) + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + self.datasets[split] = self.load_dataset_only( + split, self.lang_pairs, epoch=epoch, combine=combine + ) diff --git a/examples/speech_text_joint_to_text/tasks/speech_text_denoise_pretrain.py b/examples/speech_text_joint_to_text/tasks/speech_text_denoise_pretrain.py new file mode 100644 index 0000000000..3ad8e1c906 --- /dev/null +++ b/examples/speech_text_joint_to_text/tasks/speech_text_denoise_pretrain.py @@ -0,0 +1,654 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import logging +import os +import re +from argparse import Namespace +from pathlib import Path + +from fairseq.data import ConcatDataset, Dictionary, encoders +from fairseq.data.audio.multi_modality_dataset import ( + FileAudioDatasetWrapper, + ModalityDatasetItem, + MultiModalityDataset, +) +from fairseq.data.audio.speech_to_text_joint_dataset import ( + S2TJointDataConfig, + SpeechToTextJointDatasetCreator, +) +from fairseq.data.iterators import GroupedEpochBatchIterator +from fairseq.tasks import register_task + +from .pair_denoising import PairedDenoisingTask + +logger = logging.getLogger(__name__) + + +@register_task("speech_text_joint_denoising") +class SpeechTextJointDenoisingPreTask(PairedDenoisingTask): + """ + Joint denoising training task for speech and text. 
+ """ + + SIL_TOKEN = "sil" + + @classmethod + def add_args(cls, parser): + PairedDenoisingTask.add_args(parser) + # set max tokens and position + parser.add_argument( + "--max-text-tokens", + type=int, + metavar="N", + default=1024, + help="maximum samples for encoder text input ", + ) + parser.add_argument( + "--max-speech-tokens", + type=int, + metavar="N", + default=50000, + help="maximum samples for encoder speech input ", + ) + parser.add_argument( + "--max-speech-positions", + type=int, + metavar="N", + default=400, + help="maximum tokens for per encoder text input ", + ) + + parser.add_argument( + "--max-sample-size", + type=int, + metavar="N", + default=32000, + help="max sample size to crop to for batching (unsupervised speech) ", + ) + parser.add_argument( + "--min-sample-size", + type=int, + metavar="N", + default=4000, + help="min sample size to crop to for batching (unsupervised speech) ", + ) + + # set mini-batch ratio for different modalities/subtasks + # s2p + parser.add_argument( + "--supervised-speech-sample-ratio", + default="1", + type=str, + metavar="N", + help="Multiple Ratio for speech dataset with transcripts ", + ) + # s2t + parser.add_argument( + "--supervised-speech-s2s-sample-ratio", + default="1", + type=str, + metavar="N", + help="Multiple Ratio for speech dataset with transcripts ", + ) + # ssl + parser.add_argument( + "--unsupervised-speech-sample-ratio", + default="1", + type=str, + metavar="N", + help="Multiple Ratio for speech dataset without transcripts ", + ) + # t2t with monolingual data (masking) + parser.add_argument( + "--text-sample-ratio", + default="1", + type=str, + metavar="N", + help="Multiple Ratio for text set ", + ) + # t2t with parallel data (no masking) + parser.add_argument( + "--bitext-sample-ratio", + default="1", + type=str, + metavar="N", + help="Multiple Ratio for text set (bitext) ", + ) + # train_subset = "train", 'valid' or so + # parallel data is loaded according to string lang_pairs and lang_pairs_no_mask from args.data + # (un)supervised speech is loaded from args.(un)sup_speech_{train,valid}_subset + parser.add_argument( + "--sup-speech-data", default="", help="path to supervised speech data" + ) + parser.add_argument( + "--sup-speech-train-subset", + default="", + help="supervised speech training subsets", + ) + parser.add_argument( + "--sup-speech-valid-subset", + default="", + help="supervised speech validation subsets", + ) + parser.add_argument( + "--config-yaml", + default="config.yaml", + help="supervised speech configuration yaml file", + ) + parser.add_argument( + "--sup-speech-s2s-data", default="", help="path to supervised speech data" + ) + parser.add_argument( + "--sup-speech-s2s-train-subset", + default="", + help="supervised speech training subsets", + ) + parser.add_argument( + "--sup-speech-s2s-valid-subset", + default="", + help="supervised speech validation subsets", + ) + parser.add_argument( + "--config-s2s-yaml", + default="config.yaml", + help="supervised speech configuration yaml file", + ) + parser.add_argument( + "--unsup-speech-train-data", + default="", + help="path to unsupervised speech training data (tsv)", + ) + parser.add_argument( + "--unsup-speech-valid-data", + default="", + help="path to unsupervised speech valid data (tsv)", + ) + parser.add_argument( + "--sample-rate", + type=int, + metavar="N", + default=16000, + help="input audio sampling rate", + ) + parser.add_argument( + "--no-emb-update-unsup", + default=False, + action="store_true", + help="no update for output embedding during 
unsupervised_speech mode", + ) + parser.add_argument("--same-data-update", default=False, action="store_true") + + # used for sup_speech_ali + parser.add_argument( + "--use-sup-speech-ctc", + default=False, + action="store_true", + help="use speech_sup_ctc instead of speech_sup_ali", + ) + + @classmethod + def setup_task(cls, args, **kwargs): + """Setup the task.""" + paths = args.data.split(":") + assert len(paths) > 0 + src_dict = Dictionary.load( + os.path.join(paths[0], "src_dict.txt") + ) # assume all languages share a source dictionary + tgt_dict = Dictionary.load( + os.path.join(paths[0], "tgt_dict.txt") + ) # assume all languages share a target dictionary + + lang_pairs = args.lang_pairs + "," + args.lang_pairs_bitext + lang_pairs = re.sub(",$", "", re.sub("^,", "", lang_pairs)) + if lang_pairs != "": + src_langs = [lp.split("-")[0] for lp in lang_pairs.split(",")] + tgt_langs = [lp.split("-")[1] for lp in lang_pairs.split(",")] + else: + src_langs = [] + tgt_langs = [] + + if args.add_src_lang_token: + for lang in src_langs: + assert ( + src_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang)) + != src_dict.unk() + ) + if args.add_tgt_lang_token: + for lang in tgt_langs: + assert ( + tgt_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang)) + != tgt_dict.unk() + ) + + logger.info("source dictionary: {} types".format(len(src_dict))) + logger.info("target dictionary: {} types".format(len(tgt_dict))) + if not hasattr(args, "shuffle_instance"): + args.shuffle_instance = False + return cls(args, src_dict, tgt_dict) + + def __init__(self, args, src_dict, tgt_dict): + super().__init__(args, src_dict, tgt_dict) + self.data_cfg = S2TJointDataConfig( + Path(args.sup_speech_data) / args.config_yaml + ) + logger.info( + f"load supervised speech data configure from {Path(args.sup_speech_data) / args.config_yaml}" + ) + self.data_s2s_cfg = ( + S2TJointDataConfig(Path(args.sup_speech_s2s_data) / args.config_s2s_yaml) + if args.sup_speech_s2s_train_subset != "" + else None + ) + if self.data_s2s_cfg is not None: + logger.info( + f"load supervised sequece to sequence speech data configure from {Path(args.sup_speech_s2s_data) / args.config_yaml}" + ) + + def parse_data_ratio(sample_ratio): + ratios = sample_ratio.split(",") + if len(ratios) == 1: + return [float(ratios[0])] + epoch_ratios = [] + for item in ratios: + ep, r = item.split(":") + ep = int(ep) + r = float(r) + assert ep > 0 # epoch is 1 based + assert ep >= len(epoch_ratios) + + if len(epoch_ratios) == 0: + epoch_ratios.append( + r + ) # epoch_ratios[0] is not used, but we still set it to the first value to make thing simple. 
+ while len(epoch_ratios) < ep: + epoch_ratios.append(epoch_ratios[-1]) + epoch_ratios.append(r) + return epoch_ratios + + self.sup_ratio = parse_data_ratio(args.supervised_speech_sample_ratio) + self.sup_s2s_ratio = parse_data_ratio(args.supervised_speech_s2s_sample_ratio) + self.text_ratio = parse_data_ratio(args.text_sample_ratio) + self.bitext_ratio = parse_data_ratio(args.bitext_sample_ratio) + self.unsup_ratio = parse_data_ratio(args.unsupervised_speech_sample_ratio) + self.sample_mode = None + + def build_model(self, args): + args.input_feat_per_channel = self.data_cfg.input_feat_per_channel + args.input_channels = self.data_cfg.input_channels + return super().build_model(args) + + def build_tokenizer(self, data_cfg, msg=""): + logger.info(f"pre-tokenizer {msg}: {data_cfg.pre_tokenizer}") + return encoders.build_tokenizer(Namespace(**data_cfg.pre_tokenizer)) + + def build_bpe(self, data_cfg, msg=""): + logger.info(f"tokenizer {msg}: {data_cfg.bpe_tokenizer}") + return encoders.build_bpe(Namespace(**data_cfg.bpe_tokenizer)) + + @classmethod + def resolve_data_type(cls, split, use_sup_speech_ctc): + if len(split.split("_")) == 1: + # default case, train or valid + is_train = split + dtype = "text" + else: + is_train, dtype = split.split("_", 1) + is_train = True if is_train == "train" else False + if dtype == "sup_speech": + dtype = "sup_speech_ctc" if use_sup_speech_ctc else "sup_speech_ali" + assert dtype in ( + "text", + "bitext", + "sup_speech_ali", + "sup_speech_s2s", + "unsup_speech", + "sup_speech_ctc", + ), f"failed resolving {split} (it resulted into: {dtype} ; is_train={is_train})" + return is_train, dtype + + def create_modalitydatasetitem(self, dtype, dataset): + dsitem = None + if dtype in ("text", "bitext"): + dsitem = ModalityDatasetItem( + dtype, + dataset, + (self.args.max_source_positions, self.args.max_target_positions), + self.args.max_text_tokens, + self.args.batch_size, + ) + elif dtype in ("sup_speech_ctc", "sup_speech_ali", "sup_speech_s2s"): + dsitem = ModalityDatasetItem( + dtype, + dataset, + (self.args.max_speech_positions, self.args.max_target_positions), + self.args.max_speech_tokens, + self.args.batch_size, + ) + elif dtype == "unsup_speech": + dsitem = ModalityDatasetItem( + dtype, dataset, 1e8, self.args.max_speech_tokens, self.args.batch_size + ) + else: + raise ValueError(f"{dtype} is not supported") + return dsitem + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + def _get_sup_src_tgt_dict(src_dict, tgt_dict, use_s2s_sup_decoder): + if use_s2s_sup_decoder: + return None, tgt_dict + # use src_dict as tgt_dict here, since we use source dictionary as target for forcealignment + return None, src_dict + + is_train, dtype = self.resolve_data_type(split, self.args.use_sup_speech_ctc) + + # Note we use --add-tgt-lang-token instead of data_cfg.prepend_tgt_lang_tag_no_change to set target language tag in the text dataset + # Verify add_tgt_lang_token and prepend_tgt_lang_tag_no_change are same + + # Note we use --multilang-sampling-alpha instead of data_cfg.sampling_text_alpha to set text data sampling + if is_train: + msets = [] + # train split, load everything into one + if self.lang_pairs != "": + text_dataset = self.load_dataset_only( + "train", self.lang_pairs, epoch=epoch, combine=combine + ) + dsitem = self.create_modalitydatasetitem("text", text_dataset) + msets.append(dsitem) + if self.lang_pairs_bitext != "": # load bitext + bitext_dataset = self.load_dataset_only( + "train_bitext", + self.lang_pairs_bitext, + do_mask=False, + 
epoch=epoch, + combine=combine, + ) + dsitem = self.create_modalitydatasetitem("bitext", bitext_dataset) + msets.append(dsitem) + if self.args.sup_speech_train_subset != "": + pre_tokenizer = self.build_tokenizer(self.data_cfg) + bpe_tokenizer = self.build_bpe(self.data_cfg) + + append_eos = True + sup_speech_type = "sup_speech_ali" + if self.args.use_sup_speech_ctc: + # CTC mode + sup_speech_type = "sup_speech_ctc" + append_eos = False # CTC doesn't need eos in the target + + src_dict, tgt_dict = _get_sup_src_tgt_dict( + self.src_dict, self.tgt_dict, False + ) + sup_speech_dataset = SpeechToTextJointDatasetCreator.from_tsv( + self.args.sup_speech_data, + self.data_cfg, + self.args.sup_speech_train_subset, + tgt_dict=tgt_dict, + src_dict=src_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + src_pre_tokenizer=None, + src_bpe_tokenizer=None, + is_train_split=is_train, + epoch=epoch, + seed=self.args.seed, + append_eos=append_eos, + ) + dsitem = self.create_modalitydatasetitem( + sup_speech_type, sup_speech_dataset + ) + msets.append(dsitem) + + if self.args.sup_speech_s2s_train_subset != "": + pre_tokenizer = self.build_tokenizer(self.data_s2s_cfg, msg="(s2s)") + bpe_tokenizer = self.build_bpe(self.data_s2s_cfg, msg="(s2s)") + + # make sure self.data_cfg.prepend_tgt_lang_tag_no_change == self.args.add_tgt_lang_token + src_dict, tgt_dict = _get_sup_src_tgt_dict( + self.src_dict, self.tgt_dict, True + ) + sup_speech_s2s_dataset = SpeechToTextJointDatasetCreator.from_tsv( + self.args.sup_speech_s2s_data, + self.data_s2s_cfg, + self.args.sup_speech_s2s_train_subset, + tgt_dict=tgt_dict, + src_dict=src_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + src_pre_tokenizer=None, + src_bpe_tokenizer=None, + is_train_split=is_train, + epoch=epoch, + seed=self.args.seed, + ) + dsitem = self.create_modalitydatasetitem( + "sup_speech_s2s", sup_speech_s2s_dataset + ) + msets.append(dsitem) + if self.args.unsup_speech_train_data != "": + unsup_speech_dataset = FileAudioDatasetWrapper( + self.args.unsup_speech_train_data, + self.args.sample_rate, + max_sample_size=self.args.max_sample_size, + min_sample_size=self.args.min_sample_size, + normalize=False, + ) + dsitem = self.create_modalitydatasetitem( + "unsup_speech", unsup_speech_dataset + ) + msets.append(dsitem) + + pre_train_dataset = MultiModalityDataset(msets) + self.datasets[split] = pre_train_dataset + else: # validation split, load them for each type of data + if dtype == "text": + text_dataset = self.load_dataset_only( + split, self.lang_pairs, epoch=epoch, combine=combine + ) + dsitem = self.create_modalitydatasetitem("text", text_dataset) + self.datasets[split] = MultiModalityDataset([dsitem]) + elif dtype == "bitext": + bitext_dataset = self.load_dataset_only( + split, + self.lang_pairs_bitext, + do_mask=False, + epoch=epoch, + combine=combine, + ) + dsitem = self.create_modalitydatasetitem("bitext", bitext_dataset) + self.datasets[split] = MultiModalityDataset([dsitem]) + + elif dtype in ("sup_speech_ctc", "sup_speech_ali"): + assert self.args.sup_speech_valid_subset != "" + pre_tokenizer = self.build_tokenizer(self.data_cfg) + bpe_tokenizer = self.build_bpe(self.data_cfg) + append_eos = True + if dtype == "sup_speech_ctc": + # CTC mode + append_eos = False # CTC doesn't need eos + assert self.args.use_sup_speech_ctc + + datasets = [] + for split_name in self.args.sup_speech_valid_subset.split(","): + src_dict, tgt_dict = _get_sup_src_tgt_dict( + self.src_dict, self.tgt_dict, False + ) + datasets.append( 
+ SpeechToTextJointDatasetCreator.from_tsv( + self.args.sup_speech_data, + self.data_cfg, + split_name, + tgt_dict=tgt_dict, + src_dict=src_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + src_pre_tokenizer=None, + src_bpe_tokenizer=None, + is_train_split=is_train, + epoch=epoch, + seed=self.args.seed, + append_eos=append_eos, + ) + ) + + dset = datasets[0] if len(datasets) == 1 else ConcatDataset(datasets) + dsitem = self.create_modalitydatasetitem(dtype, dset) + self.datasets[split] = MultiModalityDataset([dsitem]) + + elif dtype == "sup_speech_s2s": + assert self.args.sup_speech_s2s_valid_subset != "" + pre_tokenizer = self.build_tokenizer(self.data_s2s_cfg) + bpe_tokenizer = self.build_bpe(self.data_s2s_cfg) + datasets = [] + for split_name in self.args.sup_speech_s2s_valid_subset.split(","): + src_dict, tgt_dict = _get_sup_src_tgt_dict( + self.src_dict, self.tgt_dict, True + ) + datasets.append( + SpeechToTextJointDatasetCreator.from_tsv( + self.args.sup_speech_s2s_data, + self.data_s2s_cfg, + split_name, + tgt_dict=tgt_dict, + src_dict=src_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + src_pre_tokenizer=None, + src_bpe_tokenizer=None, + is_train_split=is_train, + epoch=epoch, + seed=self.args.seed, + ) + ) + + dset = datasets[0] if len(datasets) == 1 else ConcatDataset(datasets) + dsitem = self.create_modalitydatasetitem("sup_speech_s2s", dset) + self.datasets[split] = MultiModalityDataset([dsitem]) + elif dtype == "unsup_speech": + assert self.args.unsup_speech_valid_data != "" + unsup_speech_dataset = FileAudioDatasetWrapper( + self.args.unsup_speech_valid_data, + self.args.sample_rate, + max_sample_size=self.args.max_sample_size, + min_sample_size=self.args.min_sample_size, + normalize=False, + ) + dsitem = self.create_modalitydatasetitem( + "unsup_speech", unsup_speech_dataset + ) + self.datasets[split] = MultiModalityDataset([dsitem]) + else: + raise ValueError(f"Unsupported type {dtype}") + + def get_sample_ratio(self, epoch): + sup_ratio = ( + self.sup_ratio[epoch] if len(self.sup_ratio) > epoch else self.sup_ratio[-1] + ) + sup_s2s_ratio = ( + self.sup_s2s_ratio[epoch] + if len(self.sup_s2s_ratio) > epoch + else self.sup_s2s_ratio[-1] + ) + unsup_ratio = ( + self.unsup_ratio[epoch] + if len(self.unsup_ratio) > epoch + else self.unsup_ratio[-1] + ) + text_ratio = ( + self.text_ratio[epoch] + if len(self.text_ratio) > epoch + else self.text_ratio[-1] + ) + bitext_ratio = ( + self.bitext_ratio[epoch] + if len(self.bitext_ratio) > epoch + else self.bitext_ratio[-1] + ) + return text_ratio, bitext_ratio, sup_ratio, sup_s2s_ratio, unsup_ratio + + def get_batch_iterator( + self, + dataset, + max_tokens=None, + max_sentences=None, + max_positions=None, + ignore_invalid_inputs=False, + required_batch_size_multiple=1, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=0, + data_buffer_size=0, + disable_iterator_cache=False, + skip_remainder_batch=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, + ): + + assert isinstance(dataset, MultiModalityDataset) + if len(dataset.id_to_mode) == 1: + max_positions = dataset.max_positions[0] + max_tokens = dataset.max_tokens[0] + max_sentences = dataset.max_sentences[0] + return super().get_batch_iterator( + dataset, + max_tokens, + max_sentences, + max_positions, + ignore_invalid_inputs, + required_batch_size_multiple, + seed, + num_shards, + shard_id, + num_workers, + epoch, + data_buffer_size, + disable_iterator_cache, + skip_remainder_batch=skip_remainder_batch, + ) + + 
mult_ratio = [] + ( + text_ratio, + bitext_ratio, + sup_ratio, + sup_s2s_ratio, + unsup_ratio, + ) = self.get_sample_ratio(epoch) + for mode in dataset.id_to_mode: + if mode in ("sup_speech_ctc", "sup_speech_ali"): + mult_ratio.append(sup_ratio) + elif mode == "sup_speech_s2s": + mult_ratio.append(sup_s2s_ratio) + elif mode == "text": + mult_ratio.append(text_ratio) + elif mode == "bitext": + mult_ratio.append(bitext_ratio) + elif mode == "unsup_speech": + mult_ratio.append(unsup_ratio) + + # initialize the dataset with the correct starting epoch + dataset.set_epoch(epoch) + + batch_samplers = dataset.get_batch_samplers( + mult_ratio, required_batch_size_multiple, seed + ) + + # return a reusable, sharded iterator + epoch_iter = GroupedEpochBatchIterator( + dataset=dataset, + collate_fn=dataset.collater, + batch_samplers=batch_samplers, + seed=seed, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + epoch=epoch, + mult_rate=max(self.args.update_freq) if self.args.same_data_update else 1, + buffer_size=data_buffer_size, + skip_remainder_batch=skip_remainder_batch, + ) + self.dataset_to_epoch_iter[dataset] = {} # refresh it every epoch + return epoch_iter diff --git a/examples/speech_text_joint_to_text/tasks/speech_text_joint.py b/examples/speech_text_joint_to_text/tasks/speech_text_joint.py new file mode 100644 index 0000000000..bb04f14f13 --- /dev/null +++ b/examples/speech_text_joint_to_text/tasks/speech_text_joint.py @@ -0,0 +1,377 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import logging +import os +from argparse import Namespace +from pathlib import Path + +import torch +from fairseq.data import ( + encoders, + Dictionary, + ResamplingDataset, + TransformEosLangPairDataset, + ConcatDataset, +) +from fairseq.data.iterators import GroupedEpochBatchIterator +from fairseq.data.audio.multi_modality_dataset import ( + MultiModalityDataset, + LangPairMaskDataset, + ModalityDatasetItem, +) +from fairseq.data.audio.speech_to_text_dataset import ( + SpeechToTextDataset, + SpeechToTextDatasetCreator, +) +from fairseq.data.audio.speech_to_text_joint_dataset import ( + S2TJointDataConfig, + SpeechToTextJointDatasetCreator, +) +from fairseq.tasks import register_task +from fairseq.tasks.speech_to_text import SpeechToTextTask +from fairseq.tasks.translation import load_langpair_dataset + +logger = logging.getLogger(__name__) +LANG_TAG_TEMPLATE = "<lang:{}>" + + +@register_task("speech_text_joint_to_text") +class SpeechTextJointToTextTask(SpeechToTextTask): + """ + Task for joint training speech and text to text. 
+ """ + + @classmethod + def add_args(cls, parser): + """Add task-specific arguments to the parser.""" + super(SpeechTextJointToTextTask, cls).add_args(parser) + ### + parser.add_argument( + "--parallel-text-data", + default="", + help="path to parallel text data directory", + ) + parser.add_argument( + "--max-tokens-text", + type=int, + metavar="N", + help="maximum tokens for encoder text input ", + ) + parser.add_argument( + "--max-positions-text", + type=int, + metavar="N", + default=400, + help="maximum tokens for per encoder text input ", + ) + parser.add_argument( + "--langpairs", + default=None, + metavar="S", + help='language pairs for text training, separated with ","', + ) + parser.add_argument( + "--speech-sample-ratio", + default=1, + type=float, + metavar="N", + help="Multiple Ratio for speech dataset with transcripts ", + ) + parser.add_argument( + "--text-sample-ratio", + default=1, + type=float, + metavar="N", + help="Multiple Ratio for text set ", + ) + parser.add_argument( + "--update-mix-data", + action="store_true", + help="use mixed data in one update when update-freq > 1", + ) + parser.add_argument( + "--load-speech-only", action="store_true", help="load speech data only", + ) + parser.add_argument( + "--mask-text-ratio", + type=float, + metavar="V", + default=0.0, + help="mask V source tokens for text only mode", + ) + parser.add_argument( + "--mask-text-type", + default="random", + choices=["random", "tail"], + help="mask text typed", + ) + parser.add_argument( + "--noise-token", + default="", + help="noise token for masking src text tokens if mask-text-ratio > 0", + ) + parser.add_argument( + "--infer-target-lang", + default="", + metavar="S", + help="target language for inference", + ) + + def __init__(self, args, src_dict, tgt_dict, infer_tgt_lang_id=None): + super().__init__(args, tgt_dict) + self.src_dict = src_dict + self.data_cfg = S2TJointDataConfig(Path(args.data) / args.config_yaml) + assert self.tgt_dict.pad() == self.src_dict.pad() + assert self.tgt_dict.eos() == self.src_dict.eos() + self.speech_only = args.load_speech_only + self._infer_tgt_lang_id = infer_tgt_lang_id + + @classmethod + def setup_task(cls, args, **kwargs): + """Setup the task (e.g., load dictionaries).""" + data_cfg = S2TJointDataConfig(Path(args.data) / args.config_yaml) + tgt_dict_path = Path(args.data) / data_cfg.vocab_filename + src_dict_path = Path(args.data) / data_cfg.src_vocab_filename + if (not os.path.isfile(src_dict_path)) or (not os.path.isfile(tgt_dict_path)): + raise FileNotFoundError("Dict not found: {}".format(args.data)) + src_dict = Dictionary.load(src_dict_path.as_posix()) + tgt_dict = Dictionary.load(tgt_dict_path.as_posix()) + + print("| src dictionary: {} types".format(len(src_dict))) + print("| tgt dictionary: {} types".format(len(tgt_dict))) + + if args.parallel_text_data != "": + if not os.path.isabs(args.parallel_text_data): + args.parallel_text_data = os.path.join( + args.data, args.parallel_text_data + ) + + if args.langpairs is None: + raise Exception( + "Could not infer language pair, please provide it explicitly" + ) + infer_tgt_lang_id = None + if args.infer_target_lang != "" and data_cfg.prepend_tgt_lang_tag_no_change: + tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format( + args.infer_target_lang + ) + infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag) + assert infer_tgt_lang_id != tgt_dict.unk() + return cls(args, src_dict, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id) + + def load_langpair_dataset( + self, prepend_tgt_lang_tag=False, 
sampling_alpha=1.0, epoch=0 + ): + lang_pairs = [] + text_dataset = None + split = "train" + for lp in self.args.langpairs.split(","): + src, tgt = lp.split("-") + text_dataset = load_langpair_dataset( + self.args.parallel_text_data, + split, + src, + self.src_dict, + tgt, + self.tgt_dict, + combine=True, + dataset_impl=None, + upsample_primary=1, + left_pad_source=False, + left_pad_target=False, + max_source_positions=self.args.max_positions_text, + max_target_positions=self.args.max_target_positions, + load_alignments=False, + truncate_source=False, + ) + if prepend_tgt_lang_tag: + # TODO + text_dataset = TransformEosLangPairDataset( + text_dataset, + src_eos=self.src_dict.eos(), + tgt_bos=self.tgt_dict.eos(), # 'prev_output_tokens' starts with eos + new_tgt_bos=self.tgt_dict.index(LANG_TAG_TEMPLATE.format(tgt)), + ) + lang_pairs.append(text_dataset) + if len(lang_pairs) > 1: + if sampling_alpha != 1.0: + size_ratios = SpeechToTextDatasetCreator.get_size_ratios( + self.args.langpairs.split(","), + [len(s) for s in lang_pairs], + alpha=sampling_alpha, + ) + lang_pairs = [ + ResamplingDataset(d, size_ratio=r, epoch=epoch, replace=(r >= 1.0)) + for d, r in zip(lang_pairs, size_ratios) + ] + return ConcatDataset(lang_pairs) + return text_dataset + + def inference_step( + self, generator, models, sample, prefix_tokens=None, constraints=None + ): + with torch.no_grad(): + return generator.generate( + models, + sample, + prefix_tokens=prefix_tokens, + constraints=constraints, + bos_token=self._infer_tgt_lang_id, + ) + + def build_src_tokenizer(self, args): + logger.info(f"src-pre-tokenizer: {self.data_cfg.src_pre_tokenizer}") + return encoders.build_tokenizer(Namespace(**self.data_cfg.src_pre_tokenizer)) + + def build_src_bpe(self, args): + logger.info(f"tokenizer: {self.data_cfg.src_bpe_tokenizer}") + return encoders.build_bpe(Namespace(**self.data_cfg.src_bpe_tokenizer)) + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + is_train_split = split.startswith("train") + pre_tokenizer = self.build_tokenizer(self.args) + bpe_tokenizer = self.build_bpe(self.args) + src_pre_tokenizer = self.build_src_tokenizer(self.args) + src_bpe_tokenizer = self.build_src_bpe(self.args) + ast_dataset = SpeechToTextJointDatasetCreator.from_tsv( + self.args.data, + self.data_cfg, + split, + self.tgt_dict, + src_dict=None if self.speech_only else self.src_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + src_pre_tokenizer=src_pre_tokenizer, + src_bpe_tokenizer=src_bpe_tokenizer, + is_train_split=is_train_split, + epoch=epoch, + seed=self.args.seed, + ) + noise_token_id = -1 + text_dataset = None + if self.args.parallel_text_data != "" and is_train_split: + text_dataset = self.load_langpair_dataset( + self.data_cfg.prepend_tgt_lang_tag_no_change, 1.0, epoch=epoch, + ) + if self.args.mask_text_ratio > 0: + # add mask + noise_token_id = ( + self.src_dict.unk() + if self.args.noise_token == "" + else self.src_dict.index(self.args.noise_token) + ) + text_dataset = LangPairMaskDataset( + text_dataset, + src_bos=self.src_dict.bos(), + src_eos=self.src_dict.eos(), + noise_id=noise_token_id, + mask_ratio=self.args.mask_text_ratio, + mask_type=self.args.mask_text_type, + ) + + if text_dataset is not None: + mdsets = [ + ModalityDatasetItem( + "sup_speech", + ast_dataset, + (self.args.max_source_positions, self.args.max_target_positions), + self.args.max_tokens, + self.args.batch_size, + ), + ModalityDatasetItem( + "text", + text_dataset, + (self.args.max_positions_text, self.args.max_target_positions), + self.args.max_tokens_text + if self.args.max_tokens_text is not None + else self.args.max_tokens, + self.args.batch_size, + ), + ] + ast_dataset = MultiModalityDataset(mdsets) + self.datasets[split] = ast_dataset + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.tgt_dict + + @property + def source_dictionary(self): + """Return the source :class:`~fairseq.data.Dictionary` (if applicable + for this task).""" + return None if self.speech_only else self.src_dict + + def get_batch_iterator( + self, + dataset, + max_tokens=None, + max_sentences=None, + max_positions=None, + ignore_invalid_inputs=False, + required_batch_size_multiple=1, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=0, + data_buffer_size=0, + disable_iterator_cache=False, + skip_remainder_batch=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, + ): + + if not isinstance(dataset, MultiModalityDataset): + return super(SpeechTextJointToTextTask, self).get_batch_iterator( + dataset, + max_tokens, + max_sentences, + max_positions, + ignore_invalid_inputs, + required_batch_size_multiple, + seed, + num_shards, + shard_id, + num_workers, + epoch, + data_buffer_size, + disable_iterator_cache, + skip_remainder_batch=skip_remainder_batch, + update_epoch_batch_itr=update_epoch_batch_itr, + ) + + mult_ratio = [self.args.speech_sample_ratio, self.args.text_sample_ratio] + assert len(dataset.datasets) == 2 + + # initialize the dataset with the correct starting epoch + dataset.set_epoch(epoch) + + batch_samplers = dataset.get_batch_samplers( + mult_ratio, required_batch_size_multiple, seed + ) + + # return a reusable, sharded iterator + epoch_iter = GroupedEpochBatchIterator( + dataset=dataset, + collate_fn=dataset.collater, + batch_samplers=batch_samplers, + seed=seed, + 
num_shards=num_shards,
+            shard_id=shard_id,
+            num_workers=num_workers,
+            epoch=epoch,
+            mult_rate=1 if self.args.update_mix_data else max(self.args.update_freq),
+            buffer_size=data_buffer_size,
+            skip_remainder_batch=skip_remainder_batch,
+        )
+        self.dataset_to_epoch_iter[dataset] = {}  # refresh it every epoch
+        return epoch_iter
diff --git a/examples/speech_to_speech/README.md b/examples/speech_to_speech/README.md
new file mode 100644
index 0000000000..f03f6a32f8
--- /dev/null
+++ b/examples/speech_to_speech/README.md
@@ -0,0 +1,7 @@
+# Speech to speech translation (S2ST)
+
+We provide the implementation and resources for the following work on speech-to-speech translation (S2ST):
+
+* [Direct speech-to-speech translation with discrete units (Lee et al. 2021)](docs/direct_s2st_discrete_units.md)
+* [Textless Speech-to-Speech Translation on Real Data (Lee et al. 2021)](docs/textless_s2st_real_data.md)
+* [Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation](docs/enhanced_direct_s2st_discrete_units.md)
diff --git a/examples/speech_to_speech/__init__.py b/examples/speech_to_speech/__init__.py
new file mode 100644
index 0000000000..812b3c30b9
--- /dev/null
+++ b/examples/speech_to_speech/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import unity  # noqa
diff --git a/examples/speech_to_speech/asr_bleu/README.md b/examples/speech_to_speech/asr_bleu/README.md
new file mode 100644
index 0000000000..6a7ea7fcef
--- /dev/null
+++ b/examples/speech_to_speech/asr_bleu/README.md
@@ -0,0 +1,34 @@
+# ASR-BLEU evaluation toolkit
+
+This toolkit provides a set of public ASR models used to evaluate different speech-to-speech translation systems at FAIR. It enables easier score comparisons between different systems' outputs.
+
+The ASRGenerator wraps CTC-based ASR models from the HuggingFace and fairseq code bases. A torchaudio CTC decoder is built on top of it to decode the given audio files.
+
+Please see `asr_model_cfgs.json` for the list of currently covered languages.
+
+The high-level pipeline is simple by design: given a language tag, the script loads the corresponding ASR model, transcribes the model's predicted audio, and computes the BLEU score against the provided reference translations using sacrebleu.
+
+# Dependencies
+
+Please see `requirements.txt`.
+
+# Usage examples
+
+This toolkit has been used with:
+
+* Speechmatrix project: https://github.com/facebookresearch/fairseq/tree/ust/examples/speech_matrix.
+
+* Hokkien speech-to-speech translation project: https://github.com/facebookresearch/fairseq/tree/ust/examples/hokkien.
+
+# Standalone run example
+
+A high-level example; please substitute the arguments for your own setup:
+
+```bash
+python compute_asr_bleu.py --lang <LANG> \
+--audio_dirpath <PATH_TO_AUDIO_DIR> \
+--reference_path <PATH_TO_REFERENCES_FILE> \
+--reference_format txt
+```
+
+For more details about the arguments, please see the script's argparse help.
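To make the pipeline concrete, here is a minimal sketch that drives the same components programmatically rather than through `compute_asr_bleu.py`. It is only an illustration: it assumes you run it from the `examples/speech_to_speech/asr_bleu/` directory (so `utils.py` and `asr_model_cfgs.json` are reachable), and the audio file names and reference strings below are placeholders.

```python
# Minimal sketch of the ASR-BLEU pipeline; paths and references are placeholders.
import sacrebleu

from utils import retrieve_asr_config, ASRGenerator

# Load the English ASR config ("oct22" version) and build the wrapped CTC model + decoder.
asr_config = retrieve_asr_config("en", "oct22", json_path="./asr_model_cfgs.json")
asr_model = ASRGenerator(asr_config)

# Transcribe each predicted waveform (placeholder file names and references).
predicted_audio = ["0_pred.wav", "1_pred.wav"]
references = ["hello world", "good morning"]
hypotheses = [asr_model.transcribe_audiofile(p).lower() for p in predicted_audio]

# Corpus-level BLEU between the ASR transcripts and the text references.
print(sacrebleu.corpus_bleu(hypotheses, [references]))
```

For full evaluations, `compute_asr_bleu.py` wraps these same steps with manifest handling, Hokkien post-processing, and optional result/transcript output.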
diff --git a/examples/speech_to_speech/asr_bleu/__init__.py b/examples/speech_to_speech/asr_bleu/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/speech_to_speech/asr_bleu/asr_model_cfgs.json b/examples/speech_to_speech/asr_bleu/asr_model_cfgs.json new file mode 100644 index 0000000000..d0a5f3e3aa --- /dev/null +++ b/examples/speech_to_speech/asr_bleu/asr_model_cfgs.json @@ -0,0 +1,198 @@ +{ + "en": { + "oct22": { + "desc": "Wav2Vec 2.0 Large (LV-60) + Self Training from https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec#pre-trained-models", + "ckpt_path": "https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt", + "dict_path": "https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt", + "model_type": "fairseq", + "lang": "en", + "post_process": "collapse" + } + }, + "hok": { + "oct22": { + "desc": "Hokkien ASR model, for details check [TODO add paper link]", + "ckpt_path": "https://dl.fbaipublicfiles.com/ust_asr/hok/checkpoint_best.pt", + "dict_path": "https://dl.fbaipublicfiles.com/ust_asr/hok/dict.ltr.txt", + "model_type": "fairseq", + "lang": "hok", + "post_process": "none" + } + }, + "es": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish", + "model_type": "hf", + "lang": "es", + "post_process": "collapse" + } + }, + "fr": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-large-fr-voxpopuli-french", + "model_type": "hf", + "lang": "fr", + "post_process": "collapse" + } + }, + "zh": { + "oct22": { + "model_path": "ydshieh/wav2vec2-large-xlsr-53-chinese-zh-cn-gpt", + "model_type": "hf", + "lang": "zh", + "post_process": "collapse" + } + }, + "tr": { + "oct22": { + "model_path": "cahya/wav2vec2-large-xlsr-turkish-artificial-cv", + "model_type": "hf", + "lang": "tr", + "post_process": "collapse" + } + }, + "ar": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", + "model_type": "hf", + "lang": "ar", + "post_process": "collapse" + } + }, + "vi": { + "oct22": { + "model_path": "not-tanh/wav2vec2-large-xlsr-53-vietnamese", + "model_type": "hf", + "lang": "vi", + "post_process": "collapse" + } + }, + "de": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-german", + "model_type": "hf", + "lang": "de", + "post_process": "collapse" + } + }, + "pl": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-polish", + "model_type": "hf", + "lang": "pl", + "post_process": "collapse" + } + }, + "it": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-italian", + "model_type": "hf", + "lang": "it", + "post_process": "collapse" + } + }, + "pt": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-portuguese", + "model_type": "hf", + "lang": "pt", + "post_process": "collapse" + } + }, + "ro": { + "oct22": { + "model_path": "gigant/romanian-wav2vec2", + "model_type": "hf", + "lang": "ro", + "post_process": "collapse" + } + }, + "cs": { + "oct22": { + "model_path": "comodoro/wav2vec2-xls-r-300m-cs-250", + "model_type": "hf", + "lang": "cs", + "post_process": "collapse" + } + }, + "sk": { + "oct22": { + "model_path": "anuragshas/wav2vec2-xls-r-300m-sk-cv8-with-lm", + "model_type": "hf", + "lang": "sk", + "post_process": "collapse" + } + }, + "sl": { + "oct22": { + "model_path": "anuragshas/wav2vec2-xls-r-300m-sl-cv8-with-lm", + "model_type": "hf", + "lang": "sl", + "post_process": "collapse" + } + }, + "fi": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish", + "model_type": "hf", + "lang": 
"fi", + "post_process": "collapse" + } + }, + "hu": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian", + "model_type": "hf", + "lang": "hu", + "post_process": "collapse" + } + }, + "et": { + "oct22": { + "model_path": "RASMUS/wav2vec2-xlsr-1b-et", + "model_type": "hf", + "lang": "et", + "post_process": "collapse" + } + }, + "lt": { + "oct22": { + "model_path": "sammy786/wav2vec2-xlsr-lithuanian", + "model_type": "hf", + "lang": "lt", + "post_process": "collapse" + } + }, + "nl": { + "oct22": { + "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-dutch", + "model_type": "hf", + "lang": "nl", + "post_process": "collapse" + } + }, + "lv": { + "oct22": { + "model_path": "reach-vb/wav2vec2-large-xls-r-1B-common_voice7-lv-ft", + "model_type": "hf", + "lang": "lv", + "post_process": "collapse" + } + }, + "sv": { + "oct22": { + "model_path": "marinone94/xls-r-300m-sv-robust", + "model_type": "hf", + "lang": "sv", + "post_process": "collapse" + } + }, + "hr": { + "oct22": { + "model_path": "classla/wav2vec2-xls-r-parlaspeech-hr", + "model_type": "hf", + "lang": "hr", + "post_process": "collapse" + } + } +} diff --git a/examples/speech_to_speech/asr_bleu/compute_asr_bleu.py b/examples/speech_to_speech/asr_bleu/compute_asr_bleu.py new file mode 100644 index 0000000000..d5926194c1 --- /dev/null +++ b/examples/speech_to_speech/asr_bleu/compute_asr_bleu.py @@ -0,0 +1,244 @@ +import os +from typing import Dict, List +import sacrebleu +import pandas as pd +from glob import glob +from pathlib import Path +from utils import retrieve_asr_config, ASRGenerator +from tqdm import tqdm +from argparse import ArgumentParser + + +def merge_tailo_init_final(text): + """ + Hokkien ASR hypothesis post-processing. + """ + sps = text.strip().split() + results = [] + last_syllable = "" + for sp in sps: + if sp == "NULLINIT" or sp == "nullinit": + continue + last_syllable += sp + if sp[-1].isnumeric(): + results.append(last_syllable) + last_syllable = "" + if last_syllable != "": + results.append(last_syllable) + return " ".join(results) + + +def remove_tone(text): + """ + Used for tone-less evaluation of Hokkien + """ + return " ".join([t[:-1] for t in text.split()]) + + +def extract_audio_for_eval(audio_dirpath: str, audio_format: str): + if audio_format == "n_pred.wav": + """ + The assumption here is that 0_pred.wav corresponds to the reference at line position 0 from the reference manifest + """ + audio_list = [] + audio_fp_list = glob((Path(audio_dirpath) / "*_pred.wav").as_posix()) + audio_fp_list = sorted( + audio_fp_list, key=lambda x: int(os.path.basename(x).split("_")[0]) + ) + for i in range(len(audio_fp_list)): + try: + audio_fp = (Path(audio_dirpath) / f"{i}_pred.wav").as_posix() + assert ( + audio_fp in audio_fp_list + ), f"{Path(audio_fp).name} does not exist in {audio_dirpath}" + except AssertionError: + # check the audio with random speaker + audio_fp = Path(audio_dirpath) / f"{i}_spk*_pred.wav" + audio_fp = glob( + audio_fp.as_posix() + ) # resolve audio filepath with random speaker + assert len(audio_fp) == 1 + audio_fp = audio_fp[0] + + audio_list.append(audio_fp) + else: + raise NotImplementedError + + return audio_list + + +def extract_text_for_eval( + references_filepath: str, reference_format: str, reference_tsv_column: str = None +): + if reference_format == "txt": + reference_sentences = open(references_filepath, "r").readlines() + reference_sentences = [l.strip() for l in reference_sentences] + elif reference_format == "tsv": + tsv_df = pd.read_csv(references_filepath, 
sep="\t", quoting=3) + reference_sentences = tsv_df[reference_tsv_column].to_list() + reference_sentences = [l.strip() for l in reference_sentences] + else: + raise NotImplementedError + + return reference_sentences + + +def compose_eval_data( + audio_dirpath: str, + audio_format: str, + references_filepath: str, + reference_format: str, + reference_tsv_column: str = None, + save_manifest_filepath=None, +): + """ + Speech matrix decoding pipeline produces audio with the following mask "N_pred.wav" where N is the order of the corresponding input sample + """ + + reference_sentences = extract_text_for_eval( + references_filepath, reference_format, reference_tsv_column + ) + predicted_audio_fp_list = extract_audio_for_eval(audio_dirpath, audio_format) + assert len(predicted_audio_fp_list) == len(reference_sentences) + + audio_text_pairs = [ + (audio, reference) + for audio, reference in zip(predicted_audio_fp_list, reference_sentences) + ] + + tsv_manifest = pd.DataFrame(audio_text_pairs, columns=["prediction", "reference"]) + + if save_manifest_filepath is not None: + tsv_manifest.to_csv(save_manifest_filepath, sep="\t", quoting=3) + + return tsv_manifest + + +def load_eval_data_from_tsv(eval_data_filepath: str): + """ + We may load the result of `compose_eval_data` directly if needed + """ + eval_df = pd.from_csv(eval_data_filepath, sep="\t") + + return eval_df + + +def run_asr_bleu(args): + + asr_config = retrieve_asr_config( + args.lang, args.asr_version, json_path="./asr_model_cfgs.json" + ) + asr_model = ASRGenerator(asr_config) + + eval_manifest = compose_eval_data( + audio_dirpath=args.audio_dirpath, + audio_format=args.audio_format, + references_filepath=args.reference_path, + reference_format=args.reference_format, + reference_tsv_column=args.reference_tsv_column, + save_manifest_filepath=None, + ) + + prediction_transcripts = [] + for _, eval_pair in tqdm( + eval_manifest.iterrows(), + desc="Transcribing predictions", + total=len(eval_manifest), + ): + transcription = asr_model.transcribe_audiofile(eval_pair.prediction) + prediction_transcripts.append(transcription.lower()) + + if args.lang == "hok": + prediction_transcripts = [ + merge_tailo_init_final(text) for text in prediction_transcripts + ] + + references = eval_manifest["reference"].tolist() + bleu_score = sacrebleu.corpus_bleu(prediction_transcripts, [references]) + + print(bleu_score) + + return prediction_transcripts, bleu_score + + +def main(): + parser = ArgumentParser( + description="This script computes the ASR-BLEU metric between model's generated audio and the text reference sequences." + ) + + parser.add_argument( + "--lang", + help="The target language used to initialize ASR model, see asr_model_cfgs.json for available languages", + type=str, + ) + parser.add_argument( + "--asr_version", + type=str, + default="oct22", + help="For future support we add and extra layer of asr versions. The current most recent version is oct22 meaning October 2022", + ) + parser.add_argument( + "--audio_dirpath", + type=str, + help="Path to the directory containing the audio predictions from the translation model", + ) + parser.add_argument( + "--reference_path", + type=str, + help="Path to the file containing reference translations in the form of normalized text (to be compared to ASR predictions", + ) + parser.add_argument( + "--reference_format", + choices=["txt", "tsv"], + help="Format of reference file. 
Txt means plain text format where each line represents single reference sequence", + ) + parser.add_argument( + "--reference_tsv_column", + default=None, + type=str, + help="If format is tsv, then specify the column name which contains reference sequence", + ) + parser.add_argument( + "--audio_format", + default="n_pred.wav", + choices=["n_pred.wav"], + help="Audio format n_pred.wav corresponds to names like 94_pred.wav or 94_spk7_pred.wav where spk7 is the speaker id", + ) + parser.add_argument( + "--results_dirpath", + default=None, + type=str, + help="If specified, the resulting BLEU score will be written to this file path as txt file", + ) + parser.add_argument( + "--transcripts_path", + default=None, + type=str, + help="If specified, the predicted transcripts will be written to this path as a txt file.", + ) + + args = parser.parse_args() + + prediction_transcripts, bleu_score = run_asr_bleu(args) + result_filename = f"{args.reference_format}_{args.lang}_bleu.txt" + if args.results_dirpath is not None: + if not Path(args.results_dirpath).exists(): + Path(args.results_dirpath).mkdir(parents=True) + with open(Path(args.results_dirpath) / result_filename, "w") as f: + f.write(bleu_score.format(width=2)) + + if args.transcripts_path is not None: + with open(args.transcripts_path, "w") as f: + for transcript in prediction_transcripts: + f.write(transcript + "\n") + + +if __name__ == "__main__": + main() + + +""" +Example to load Sl audio and references, compute BLEU: + +export lang=fi; split=vp && python compute_asr_bleu.py --lang $lang --audio_dirpath /checkpoint/hygong/S2S/speech_matrix_release_ckpts/generated_waveform_release/en-$lang/test_$split/checkpoint.pt --audio_format n_pred.wav --reference_path /large_experiments/ust/hygong/S2S/SpeechEncoder/manifests/vp-vp/en-$lang/test_$split.$lang --reference_format txt --results_dirpath ./ +""" diff --git a/examples/speech_to_speech/asr_bleu/requirements.txt b/examples/speech_to_speech/asr_bleu/requirements.txt new file mode 100644 index 0000000000..cfa90f6aef --- /dev/null +++ b/examples/speech_to_speech/asr_bleu/requirements.txt @@ -0,0 +1,7 @@ +fairseq==0.12.2 +pandas==1.4.3 +sacrebleu==2.2.0 +torch==1.12.1 +torchaudio==0.12.1 +tqdm==4.64.0 +transformers==4.21.1 diff --git a/examples/speech_to_speech/asr_bleu/utils.py b/examples/speech_to_speech/asr_bleu/utils.py new file mode 100644 index 0000000000..0fed55a9b9 --- /dev/null +++ b/examples/speech_to_speech/asr_bleu/utils.py @@ -0,0 +1,306 @@ +import json +import re +import urllib.request +from pathlib import Path + +import fairseq +import torch +from fairseq.data.data_utils import lengths_to_padding_mask +from tqdm import tqdm + +try: + import torchaudio + from torchaudio.models.decoder import ctc_decoder +except ImportError: + raise ImportError("Upgrade torchaudio to 0.12 to enable CTC decoding") + + +class DownloadProgressBar(tqdm): + """A class to represent a download progress bar""" + + def update_to(self, b=1, bsize=1, tsize=None) -> None: + """ + Update the download progress + """ + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) + + +def retrieve_asr_config(lang_key: str, asr_version: str, json_path: str) -> dict: + """ + Retrieve the asr model configs + + Args: + lang_key: the lanuage type as the key name + json_path: the path of the config json file + + Returns: + Dict of all the configs in the json file + """ + + with open(json_path, "r") as f: + asr_model_cfgs = json.load(f) + return asr_model_cfgs[lang_key][asr_version] + + +class 
ASRGenerator(object): + """A class to represent a ASR generator""" + + def __init__( + self, + model_cfg: dict, + cache_dirpath: str = (Path.home() / ".cache" / "ust_asr").as_posix(), + ) -> None: + """ + Construct all the necessary attributes of the ASRGenerator class + + Args: + model_cfg: the dict of the asr model config + cache_dirpath: the default cache path is "Path.home()/.cache/ust_asr" + """ + + self.cache_dirpath = Path(cache_dirpath) / model_cfg["lang"] + self.model_cfg = model_cfg + + self.use_cuda = torch.cuda.is_available() + + torchaudio.set_audio_backend("sox_io") + + if self.model_cfg["model_type"] == "hf": + self.prepare_hf_model(self.model_cfg) + elif self.model_cfg["model_type"] == "fairseq": + self.prepare_fairseq_model(self.model_cfg) + else: + raise NotImplementedError( + f"Model type {self.model_cfg['model_type']} is not supported" + ) + + if self.model_cfg["post_process"] == "collapse": + self.post_process_fn = lambda hypo: "".join(hypo).replace( + self.sil_token, " " + ) + elif self.model_cfg["post_process"] == "none": + self.post_process_fn = lambda hypo: " ".join(hypo).replace( + self.sil_token, " " + ) + else: + raise NotImplementedError + + if self.use_cuda: + self.model.cuda() + self.model.eval() + + self.decoder = ctc_decoder( + lexicon=None, + tokens=self.tokens, + lm=None, + nbest=1, + beam_size=1, + beam_size_token=None, + lm_weight=0.0, + word_score=0.0, + unk_score=float("-inf"), + sil_token=self.sil_token, + sil_score=0.0, + log_add=False, + blank_token=self.blank_token, + ) + + def prepare_hf_model(self, model_cfg: dict) -> None: + """ + Prepare the huggingface asr model + + Args: + model_cfg: dict with the relevant ASR config + """ + + def infer_silence_token(vocab: list): + """ + Different HF checkpoints have different notion of silence token + such as | or " " (space) + Important: when adding new HF asr model in, check what silence token it uses + """ + if "|" in vocab: + return "|" + elif " " in vocab: + return " " + else: + raise RuntimeError("Silence token is not found in the vocabulary") + + try: + from transformers import (AutoFeatureExtractor, AutoTokenizer, + Wav2Vec2ForCTC, Wav2Vec2Processor) + except ImportError: + raise ImportError("Install transformers to load HF wav2vec model") + + model_path = model_cfg["model_path"] + self.model = Wav2Vec2ForCTC.from_pretrained(model_path) + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + self.preprocessor = AutoFeatureExtractor.from_pretrained(model_path) + self.processor = Wav2Vec2Processor.from_pretrained(model_path) + + # extra unk tokens are there to make some models work e.g. 
Finnish ASR has some vocab issue + vocab_list = [ + self.tokenizer.decoder.get(i, f"{self.tokenizer.unk_token}1") + for i in range(self.tokenizer.vocab_size) + ] + + self.sampling_rate = self.preprocessor.sampling_rate + self.normalize_input = self.preprocessor.do_normalize + self.tokens = vocab_list + self.sil_token = infer_silence_token(vocab_list) + self.blank_token = self.tokenizer.pad_token + + def prepare_fairseq_model(self, model_cfg: dict) -> None: + """ + Prepare the fairseq asr model + + Args: + model_cfg: the specific model config dict must have: (1) ckpt_path, (2) dict_path + """ + + def download_file(url: str, cache_dir: Path): + download_path = cache_dir / url.split("/")[-1] + if not (cache_dir / url.split("/")[-1]).exists(): + with DownloadProgressBar( + unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1] + ) as t: + cache_dir.mkdir(parents=True, exist_ok=True) + urllib.request.urlretrieve( + url, filename=download_path.as_posix(), reporthook=t.update_to + ) + else: + print(f"'{url}' exists in {cache_dir}") + + return download_path.as_posix() + + try: + ckpt_path = model_cfg["ckpt_path"] + dict_path = model_cfg["dict_path"] + except KeyError: + raise KeyError( + "Fairseq model cfg must provide (1) ckpt_path, (2) dict_path" + ) + + if re.search("^https", ckpt_path): + ckpt_path = download_file(ckpt_path, self.cache_dirpath) + if re.search("^https", dict_path): + dict_path = download_file(dict_path, self.cache_dirpath) + + model, saved_cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [ckpt_path], + arg_overrides={ + "task": "audio_finetuning", + "data": self.cache_dirpath.as_posix(), + }, # data must have dict in it + ) + + dict_lines = open(dict_path, "r").readlines() + tokens = [l.split()[0] for l in dict_lines] + # adding default fairseq special tokens + tokens = ["<s>", "<pad>", "</s>", "<unk>"] + tokens + + self.model = model[0] + self.tokens = tokens + + if "|" in tokens: + self.sil_token = "|" + else: + self.sil_token = tokens[ + 2 + ] # use eos as silence token if | not presented e.g., Hok ASR model + print(f"Inferring silence token from the dict: {self.sil_token}") + self.blank_token = self.tokens[0] + + self.sampling_rate = saved_cfg.task.sample_rate + self.normalize_input = saved_cfg.task.normalize + + @torch.inference_mode() + def load_audiofile(self, audio_path: str) -> torch.Tensor: + """ + Load the audio files and apply resampling and normalizaion + + Args: + audio_path: the audio file path + + Returns: + audio_waveform: the audio waveform as a torch.Tensor object + """ + + audio_waveform, sampling_rate = torchaudio.load(audio_path) + if audio_waveform.dim == 2: + audio_waveform = audio_waveform.mean(-1) + if self.sampling_rate != sampling_rate: + audio_waveform = torchaudio.functional.resample( + audio_waveform, sampling_rate, self.sampling_rate + ) + if self.normalize_input: + # following fairseq raw audio dataset + audio_waveform = torch.nn.functional.layer_norm( + audio_waveform, audio_waveform.shape + ) + + return audio_waveform + + @torch.inference_mode() + def compute_emissions(self, audio_input: torch.Tensor) -> torch.Tensor: + """ + Compute the emissions for either fairseq or huggingface asr model + + Args: + audio_path: the input audio waveform + + Returns: + emissions: the logits of the encoded prediction. 
+ """ + + if self.use_cuda: + audio_input = audio_input.to("cuda") + if isinstance(self.model, fairseq.models.wav2vec.wav2vec2_asr.Wav2VecCtc): + padding_mask = lengths_to_padding_mask(torch.tensor([audio_input.numel()])) + emissions = self.model.w2v_encoder(audio_input, padding_mask)[ + "encoder_out" + ].transpose(0, 1) + else: + emissions = self.model(audio_input).logits + + return emissions + + def decode_emissions(self, emissions: torch.Tensor) -> str: + """ + Decode the emissions and apply post process functions + + Args: + emissions: the input Tensor object + + Returns: + hypo: the str as the decoded transcriptions + """ + + emissions = emissions.cpu() + results = self.decoder(emissions) + + # assuming the lexicon-free decoder and working with tokens + hypo = self.decoder.idxs_to_tokens(results[0][0].tokens) + hypo = self.post_process_fn(hypo) + + return hypo + + def transcribe_audiofile(self, audio_path: str, lower=True) -> str: + """ + Transcribe the audio into string + + Args: + audio_path: the input audio waveform + lower: the case of the transcriptions with lowercase as the default + + Returns: + hypo: the transcription result + """ + + asr_input = self.load_audiofile(audio_path) + emissions = self.compute_emissions(asr_input) + hypo = self.decode_emissions(emissions) + + return hypo.strip().lower() if lower else hypo.strip() diff --git a/examples/speech_to_speech/benchmarking/README.md b/examples/speech_to_speech/benchmarking/README.md new file mode 100644 index 0000000000..c62fe12963 --- /dev/null +++ b/examples/speech_to_speech/benchmarking/README.md @@ -0,0 +1,31 @@ +# Benchmarking + +## Overview + +The goal of this framework is to support benchmarking various speech to speech translation(S2ST) models in terms of runtime, max-memory consumption and total number of floating point operations(FLOPS). It is a generic framework and can be easily extended to support any fairseq models. To accurately benchmark the performance, core inference modules are re-implemented based on fairseq_cli/generate.py (core.py/Processing) and examples/speech_to_text/generate_waveform.py(core.py/SpeechGeneration. To ensure that the end to end models and cascaded models are compared fairly, for cascaded models we only consider the performance metrics for model inference at all stages ignoring any intermediate data and io processing consumption. We run all the benchmarking runs on CPU as it is generally used in production environment and also due to lack of good benchmarking library support for GPUs. + +1. Runtime: Average time in seconds to run model inference on an example from a given dataset. We use [timeit](https://docs.python.org/3/library/timeit.html) library to measure the runtime. +2. Max memory: Maximum memory in MiB averaged over by running the model inference on all examples from the given dataset. We use [memory_profiler](https://pypi.org/project/memory-profiler/) library to gather memory footprints for a code snippet and find the maximum to get the max memory used by the code. For cascaded models, we find the max of all stages to get the overall max_memory footprint. +3. FLOPS: We compute the average number of floating point operations needed to run model inference for an example from the given dataset. We use [PAPI library](http://www.bnikolic.co.uk/blog/python/flops/2019/10/01/pytorch-count-flops.html) to benchmark the number of flops. 
+ +## CLI Commands + +```{python} +CUBLAS_WORKSPACE_CONFIG=:4096:8 python examples/speech_to_speech/benchmarking/get_metrics.py ‘’ --config $config +``` + + +## Note: + +1. The npy dataset is a list of samples saved as a .npy file. Each sample is a dictionary with id, net_input. +2. The raw dataset is a list of raw audio paths, similar to a wav2vec2 input tsv file. + +```{python} +sample: { + "id": xx, + "net_input": { + "src_tokens": torch.tensor([]), + "src_lengths": torch.tensor([]) + } +} +``` diff --git a/examples/speech_to_speech/benchmarking/configs/2StageS2ST.yaml b/examples/speech_to_speech/benchmarking/configs/2StageS2ST.yaml new file mode 100644 index 0000000000..11deb42e7d --- /dev/null +++ b/examples/speech_to_speech/benchmarking/configs/2StageS2ST.yaml @@ -0,0 +1,19 @@ +general: + dataset_path: $npy_dataset + cpu: True + model_type: 2StageS2ST + dataset_size: 1 + +stage1: + data: $data_bin_stage1 + task: speech_to_text + path: $checkpoint_stage1 + config_yaml: config.yaml + max_len_a: 2 + max_len_b: 500 + +stage2: + data: $data_bin_stage2 + task: text_to_speech + path: $checkpoint_stage2 + config_yaml: config.yaml diff --git a/examples/speech_to_speech/benchmarking/configs/3StageS2ST.yaml b/examples/speech_to_speech/benchmarking/configs/3StageS2ST.yaml new file mode 100644 index 0000000000..9638136150 --- /dev/null +++ b/examples/speech_to_speech/benchmarking/configs/3StageS2ST.yaml @@ -0,0 +1,28 @@ +general: + dataset_path: $npy_dataset + cpu: True + model_type: 3StageS2ST + max_len_a: 2 + max_len_b: 500 + dataset_size: 1 + +stage1: + data: $data_bin_stage1 + task: speech_to_text + path: $checkpoint_stage1 + config_yaml: config.yaml + max_len_a: 2 + max_len_b: 500 + +stage2: + data: $data_bin_stage2 + task: translation + path: $checkpoint_stage2 + config_yaml: config.yaml + + +stage3: + data: $data_bin_stage3 + task: text_to_speech + path: $checkpoint_stage3 + config_yaml: config.yaml diff --git a/examples/speech_to_speech/benchmarking/configs/DirectS2U.yaml b/examples/speech_to_speech/benchmarking/configs/DirectS2U.yaml new file mode 100644 index 0000000000..96264cec68 --- /dev/null +++ b/examples/speech_to_speech/benchmarking/configs/DirectS2U.yaml @@ -0,0 +1,22 @@ +general: + dataset_path: $npy_dataset_path + cpu: True + model_type: S2UT + dataset_size: 5 + dump_speech_waveforms_dir: $dump_waveforms_dir_path + +stage1: + data: $data_bin + task: speech_to_speech + path: $checkpoint + config_yaml: config.yaml + max_len_b: 100000 + beam: 10 + target_is_code: True + max_target_positions: 3000 + target_code_size: 100 + +stage2: + vocoder: $vocoder_path + vocoder_cfg: $vocoder_cfg_json + dur_prediction: True diff --git a/examples/speech_to_speech/benchmarking/configs/S2T.yaml b/examples/speech_to_speech/benchmarking/configs/S2T.yaml new file mode 100644 index 0000000000..3a106a0441 --- /dev/null +++ b/examples/speech_to_speech/benchmarking/configs/S2T.yaml @@ -0,0 +1,13 @@ +general: + dataset_path: $npy_dataset + cpu: True + model_type: S2T + dataset_size: 1 + +stage1: + data: $data_bin + task: speech_to_text + path: $checkpoint + config_yaml: config.yaml + max_len_a: 2 + max_len_b: 500 diff --git a/examples/speech_to_speech/benchmarking/core.py b/examples/speech_to_speech/benchmarking/core.py new file mode 100644 index 0000000000..da22a34ece --- /dev/null +++ b/examples/speech_to_speech/benchmarking/core.py @@ -0,0 +1,487 @@ +import timeit +import logging +import torch +from pypapi import events, papi_high as high +from memory_profiler import memory_usage +from torch import nn
+from argparse import Namespace +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.data import data_utils as fairseq_data_utils +from fairseq import checkpoint_utils, tasks, utils +from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder +from examples.hubert.simple_kmeans.dump_hubert_feature import HubertFeatureReader +from examples.hubert.simple_kmeans.dump_km_label import ApplyKmeans +from fairseq_cli.generate import get_symbols_to_strip_from_output +import soundfile as sf +import ast +import json + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +torch.manual_seed(1) +torch.set_deterministic(True) + + +class BenchmarkingBase(nn.Module): + def __init__(self): + nn.Module.__init__(self) + self.s2x_task = None + + def warm_up(self, sample, repeat): + """Warm up the model""" + for _i in range(repeat): + self.forward(sample) + logger.info(f"Model warmed up by running inference {repeat} times") + + def benchmark_run_time(self, dataset, repeat): + """Benchmark average runtime for the model by calling benchmark_run_time_single_sample function""" + logger.info("Starting run time benchmarking") + time_elapsed = 0 + for i, sample in enumerate(dataset): + time_elapsed += self.benchmark_run_time_single_sample(sample, repeat=repeat) + if i % 100 == 0: + logger.info(f"Benchmarked run time for {i}/{len(dataset)} samples") + total_time_elapsed = time_elapsed / len(dataset) + return total_time_elapsed + + def benchmark_run_time_single_sample(self, sample, repeat): + """Benchmark average runtime for a single sample using timeit library. Units are seconds""" + timer = timeit.Timer(lambda: self.forward(sample)) + time_elapsed = timer.timeit(repeat) + return time_elapsed / repeat + + def count_flops( + self, + dataset, + repeat, + ): + """Use PYPAPI library to count average flops for model inference. + Note: It only works if the model is being run on cpu""" + logger.info("Starting flop counter") + high.start_counters([events.PAPI_DP_OPS]) + for i, sample in enumerate(dataset): + for _r in range(repeat): + self.forward(sample) + if i % 100 == 0: + logger.info(f"Counted flops for {i}/{len(dataset)} samples") + flops = high.stop_counters() + flops = round(flops[0] / (repeat * len(dataset))) + return flops + + def max_memory(self, dataset, repeat): + """Compute average max memory consumed by model inference. Units are MiB""" + logger.info("Starting memory benchmarking") + total_memory = 0 + for i, sample in enumerate(dataset): + for _r in range(repeat): + total_memory += max(memory_usage((self.forward, (sample,), {}))) + if i % 100 == 0: + logger.info(f"Benchmarked memory for {i}/{len(dataset)} samples") + total_memory = total_memory / (repeat * len(dataset)) + return total_memory + + def gather_all_metrics(self, dataset, repeat): + run_time = self.benchmark_run_time(dataset, repeat) + max_memory = self.max_memory(dataset, repeat) + flops = self.count_flops(dataset, repeat) + + return run_time, max_memory, flops + + def dump_final_speech_output( + self, dataset, output_dir, resample_fn, sample_rate, prefix=None + ): + + for i, sample in enumerate(dataset): + hypo = self.forward(sample)[0] + + def to_np(x): + return x.detach().cpu().numpy() + + try: + wave_preds = to_np(resample_fn(hypo["waveform"])) + sf.write( + f"{output_dir}/{prefix}_{i}_pred.wav", + wave_preds, + sample_rate, + ) + except Exception as e: + raise Exception( + f" Encountered {e} - Invalid waveform. 
Make sure the model outputs a waveform" + ) + + +class Processing(BenchmarkingBase): + """Class similar to fairseq_cli/generate.py. Supports ASR, MT and ST model inference""" + + def __init__(self, args): + super().__init__() + self.use_cuda = not getattr(args, "cpu", False) + self.setUp(args) + self.training = False + self.s2x_task = self.task + + def setUp(self, cfg): + if isinstance(cfg, Namespace): + cfg = convert_namespace_to_omegaconf(cfg) + + self.task = tasks.setup_task(cfg.task) + self.tgt_dict = self.task.target_dictionary + + # Load ensemble + logger.info("loading model(s) from {}".format(cfg.common_eval.path)) + models, _ = checkpoint_utils.load_model_ensemble( + utils.split_paths(cfg.common_eval.path), + arg_overrides={}, + task=self.task, + suffix=cfg.checkpoint.checkpoint_suffix, + strict=False, + num_shards=cfg.checkpoint.checkpoint_shard_count, + ) + if len(models) > 1: + raise Exception("Currently loading multiple models is not supported") + self.model = models[0] + + # Optimize model for generation + if cfg.common.fp16: + self.model.half() + if self.use_cuda: + self.model.cuda() + self.model.prepare_for_inference_(cfg) + + self.generator = self.task.build_generator( + [self.model], + cfg.generation, + extra_gen_cls_kwargs={}, + ) + # Handle tokenization and BPE + self.tokenizer = self.task.build_tokenizer(cfg.tokenizer) + self.bpe = self.task.build_bpe(cfg.bpe) + self.remove_bpe = cfg.common_eval.post_process + + def encode_source(self, src): + """Method to generate source tokens from a string""" + if self.tokenizer is not None: + src = self.tokenizer.encode(src) + if self.bpe is not None: + src = self.bpe.encode(src) + src_tokens = self.task.source_dictionary.encode_line(src).long() + src_lens = src_tokens.size(0) + return { + "net_input": { + "src_tokens": src_tokens.view(1, src_lens), + "src_lengths": torch.tensor([src_lens]), + } + } + + def decode_target(self, hypos): + """Method to decode target string from tokens""" + hypo_str = self.tgt_dict.string( + hypos[0][0]["tokens"].int().cpu(), + self.remove_bpe, + get_symbols_to_strip_from_output(self.generator), + ) + if self.bpe is not None: + hypo_str = self.bpe.decode(hypo_str) + if self.tokenizer is not None: + hypo_str = self.tokenizer.decode(hypo_str) + return hypo_str + + def forward(self, sample): + hypos = self.task.inference_step( + self.generator, + [self.model], + sample, + prefix_tokens=None, + constraints=None, + ) + return hypos + + +class GenerateWaveformFromCode(BenchmarkingBase): + """Class to support waveform generation from code. 
Currently, vocoder only supports single speaker""" + + def __init__(self, args): + super().__init__() + with open(args.vocoder_cfg) as f: + vocoder_cfg = json.load(f) + self.dur_prediction = args.dur_prediction + self.vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg) + + def format_units(self, input): + code = torch.LongTensor(list(map(int, input.strip().split()))).view(1, -1) + return {"code": code} + + def generate_vocoder_input(self, dataset): + return [self.format_units(sample) for sample in dataset] + + def forward(self, sample): + return [{"waveform": self.vocoder(sample, self.dur_prediction)}] + + +class HubertUnitExtractor(BenchmarkingBase): + def __init__(self, args): + self.feature_reader = HubertFeatureReader( + args.hubert_ckpt_path, args.hubert_layer + ) + self.kmeans = ApplyKmeans(args.hubert_km_path) + + def forward(self, sample): + with torch.no_grad(): + feat = [] + for start in range(0, sample.size(1), self.feature_reader.max_chunk): + x_chunk = sample[:, start : start + self.max_chunk] + feat_chunk, _ = self.feature_reader.model.extract_features( + source=x_chunk, + padding_mask=None, + mask=False, + output_layer=self.layer, + ) + feat.append(feat_chunk) + torch.cat(feat, 1).squeeze(0) + return self.kmeans(feat).tolist() + + +class SpeechGeneration(BenchmarkingBase): + """Class similar to examples/text_to_speech/generate_waveform.py. + Supports models with speech generation as end goal (TTS, Direct S2ST models etc)""" + + def __init__(self, args): + super().__init__() + self.use_cuda = not getattr(args, "cpu", False) + self.setUp(args) + self.s2x_task = self.task + + def setUp(self, args): + if args.task == "speech_to_speech": + args.normalize_waveform = False + self.task = tasks.setup_task(args) + self.pre_tokenizer = self.task.build_tokenizer(args) + self.bpe_tokenizer = self.task.build_bpe(args) + try: + self.src_dict = self.task.src_dict + except Exception: + self.src_dict = None + ensemble, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [args.path], + arg_overrides=ast.literal_eval(args.model_overrides), + task=self.task, + strict=False, + ) + self.model = ensemble[0] + if self.use_cuda: + self.model.cuda() + # criterion.cuda() + self.model.eval() + self.generator = self.task.build_generator( + [self.model], + args, + ) + + def processTextInput(self, text): + """Generate source tokens from text input""" + if self.pre_tokenizer is not None: + text = self.pre_tokenizer.encode(text) + if self.bpe_tokenizer is not None: + text = self.bpe_tokenizer.encode(text) + target = self.src_dict.encode_line( + text, add_if_not_exist=False, append_eos=True + ).long() + target = fairseq_data_utils.collate_tokens( + [target], + self.src_dict.pad(), + self.src_dict.eos(), + left_pad=False, + move_eos_to_beginning=False, + ) + src_lengths = torch.tensor([target.size(1)], dtype=torch.long) + prev_output_tokens = None + sample = { + "net_input": { + "src_tokens": target, + "src_lengths": src_lengths, + "prev_output_tokens": prev_output_tokens, + } + } + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + return sample + + def forward(self, sample): + sample["speaker"] = None + output = self.generator.generate(self.model, sample) # , has_targ=False + return output + + +class S2UT(BenchmarkingBase): + """Class to support S2UT models. 
Also supports generating waveforms from the units predicted""" + + def __init__(self, s2u_args, vocoder_args=None): + super().__init__() + self.s2u = Processing(s2u_args) + self.vocoder = None + if vocoder_args: + self.vocoder = GenerateWaveformFromCode(vocoder_args) + self.vocoder_input = None + + def forward(self, sample): + s2u_hypos = self.s2u(sample) + s2u_output = self.s2u.decode_target(s2u_hypos) + if not self.vocoder: + return s2u_output + units = self.vocoder.format_units(s2u_output) + vocoder_output = self.vocoder(units) + return vocoder_output + + def generate_s2u_outputs(self, dataset): + return [self.s2u.decode_target(self.s2u(sample)) for sample in dataset] + + def compute_metrics(self, metric_type, dataset, repeat=None): + """Generic function to compute metrics ignoring the io processing time""" + if self.vocoder and not self.vocoder_input: + self.s2u_output = self.generate_s2u_outputs(dataset) + self.vocoder_input = self.vocoder.generate_vocoder_input(self.s2u_output) + + s2u_metrics = getattr(self.s2u, metric_type)( + dataset, + repeat, + ) + vocoder_metrics = 0 + if self.vocoder: + vocoder_metrics = getattr(self.vocoder, metric_type)( + self.vocoder_input, + repeat, + ) + print( + f"metric_type = {metric_type} s2u_metrics = {s2u_metrics} \t vocoder_metrics = {vocoder_metrics}" + ) + if metric_type == "max_memory": + return max(s2u_metrics, vocoder_metrics) + else: + return s2u_metrics + vocoder_metrics + + def benchmark_run_time(self, dataset, repeat): + return self.compute_metrics("benchmark_run_time", dataset, repeat) + + def count_flops(self, dataset, repeat): + return self.compute_metrics("count_flops", dataset, repeat) + + def max_memory(self, dataset, repeat): + return self.compute_metrics("max_memory", dataset, repeat) + + +class Cascaded2StageS2ST(BenchmarkingBase): + """ST + TTS""" + + def __init__(self, s2t_args, tts_args): + super().__init__() + self.s2t = Processing(s2t_args) + self.s2x_task = self.s2t.task + self.tts = SpeechGeneration(tts_args) if tts_args else None + self.training = False + self.tts_inputs = None + + def forward(self, sample): + if not self.tts: + raise Exception( + "Forward function is not callable without tts. 
Reinitialize the class with tts_args" + ) + s2t_hypos = self.s2t(sample) + s2t_output = self.s2t.decode_target(s2t_hypos) + tts_input = self.tts.processTextInput(s2t_output) + tts_output = self.tts(tts_input) + return tts_output + + def generate_s2t_outputs(self, dataset): + """Process dataset and generate s2t outputs""" + return [self.s2t.decode_target(self.s2t(sample)) for sample in dataset] + + def generate_tts_inputs(self, dataset): + """Process dataset and generate tts inputs""" + return [self.tts.processTextInput(sample) for sample in dataset] + + def compute_metrics(self, metric_type, dataset, repeat=None): + """Generic function to compute metrics ignoring the io processing time""" + if not self.tts_inputs: + s2t_outputs = self.generate_s2t_outputs(dataset) + self.tts_inputs = self.generate_tts_inputs(s2t_outputs) + + s2t_metrics = getattr(self.s2t, metric_type)( + dataset, + repeat, + ) + + tts_metrics = getattr(self.tts, metric_type)( + self.tts_inputs, + repeat, + ) + print( + f"metric_type = {metric_type} s2t_metrics = {s2t_metrics} \t tts_metrics = {tts_metrics}" + ) + if metric_type == "max_memory": + return max(s2t_metrics, tts_metrics) + else: + return s2t_metrics + tts_metrics + + def benchmark_run_time(self, dataset, repeat): + return self.compute_metrics("benchmark_run_time", dataset, repeat) + + def count_flops(self, dataset, repeat): + return self.compute_metrics("count_flops", dataset, repeat) + + def max_memory(self, dataset, repeat): + return self.compute_metrics("max_memory", dataset, repeat) + + +class Cascaded3StageS2ST(Cascaded2StageS2ST): + """ASR + MT + TTS""" + + def __init__(self, s2t_args, tts_args, mt_args): + super().__init__(s2t_args, tts_args) + self.mt = Processing(mt_args) + self.mt_inputs = [] + + def forward(self, sample): + s2t_hypos = self.s2t(sample) + s2t_output = self.s2t.decode_target(s2t_hypos) + mt_input = self.mt.encode_source(s2t_output) + mt_hypos = self.mt(mt_input) + mt_output = self.mt.decode_target(mt_hypos) + tts_input = self.tts.processTextInput(mt_output) + tts_output = self.tts(tts_input) + return tts_output + + def generate_mt_inputs(self, dataset): + """Process dataset to generate mt model inputs""" + return [self.mt.encode_source(sample) for sample in dataset] + + def generate_mt_outputs(self, dataset): + """Process dataset to generate mt model outputs""" + return [self.mt.decode_target(self.mt(sample)) for sample in dataset] + + def compute_metrics(self, metric_type, dataset, repeat=None): + """Generic function to compute metrics ignoring the io processing time""" + if not self.tts_inputs: + s2t_outputs = self.generate_s2t_outputs(dataset) + self.mt_inputs = self.generate_mt_inputs(s2t_outputs) + mt_outputs = self.generate_mt_outputs(self.mt_inputs) + self.tts_inputs = self.generate_tts_inputs(mt_outputs) + + s2t_metrics = getattr(self.s2t, metric_type)( + dataset, + repeat, + ) + mt_metrics = getattr(self.mt, metric_type)(self.mt_inputs, repeat) + tts_metrics = getattr(self.tts, metric_type)( + self.tts_inputs, + repeat, + ) + print( + f"metric_type = {metric_type} s2t_metrics = {s2t_metrics} \t mt_metrics = {mt_metrics} \t tts_metrics = {tts_metrics}" + ) + if metric_type == "max_memory": + return max(s2t_metrics, mt_metrics, tts_metrics) + else: + return s2t_metrics + mt_metrics + tts_metrics diff --git a/examples/speech_to_speech/benchmarking/data_utils.py b/examples/speech_to_speech/benchmarking/data_utils.py new file mode 100644 index 0000000000..c73a59951f --- /dev/null +++ 
b/examples/speech_to_speech/benchmarking/data_utils.py @@ -0,0 +1,264 @@ +from fairseq import tasks +import numpy as np +import logging +import random +from fairseq import options +import torch +import os +import soundfile as sf + +from fairseq.data.audio.audio_utils import ( + get_waveform, + parse_path, +) + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +random.seed(1) +np.random.seed(1) +random_number_generator = np.random.RandomState(30) + + +def generate_random_data_sample(T, B=1, D=80): + """Generate random data sample given the T, B, D values""" + net_input = { + "src_tokens": torch.tensor(random_number_generator.randn(B, T, D)).float(), + "src_lengths": torch.tensor([T]), + } + return {"net_input": net_input} + + +def generate_random_dataset(T_range_min, T_range_max, B=1, D=80, dataset_size=100): + """Generate random dataset with T values within a given range, B, D""" + T_values = [random.randint(T_range_min, T_range_max) for i in range(dataset_size)] + dataset = [] + for t in T_values: + dataset.append(generate_random_data_sample(t, B, D)) + return dataset, sum(T_values) / dataset_size + + +def load_dataset_npy(file_name, dataset_size=None): + """Load dataset from a .npy file.""" + data = np.load(file_name, allow_pickle=True) + if dataset_size: + data = data[:dataset_size] + return data + + +def load_dataset_raw_to_waveforms( + file_name, + dataset_size=None, + need_waveform=True, + sample_rate=16000, + read_using_soundfile=False, +): + """Load raw dataset from w2v tsv file. Optionally get waveforms""" + data = [] + with open(file_name, "r") as fp: + lines = fp.readlines() + data = [ + os.path.join(lines[0].strip(), line.strip().split("\t")[0]) + for line in lines[1:] + ] + + if dataset_size: + data = data[:dataset_size] + + if not need_waveform: + return data + + features = [] + if read_using_soundfile: + for _i, d in enumerate(data): + wav = sf.read(d)[0] + if wav.ndim == 2: + wav = wav.mean(-1) + features.append(torch.from_numpy(wav).float().view(1, -1)) + else: + for i, d in enumerate(data): + _path, slice_ptr = parse_path(d) + if len(slice_ptr) == 0: + feat = get_waveform( + _path, always_2d=True, output_sample_rate=sample_rate + )[0] + features.append( + { + "id": i, + "net_input": { + "src_tokens": torch.tensor(feat), + "src_lengths": torch.tensor([feat.shape[1]]), + }, + } + ) + else: + raise Exception("Currently unsupported data format") + return features + + +def load_dataset_task( + args, + batch_size=1, + limit_size=None, + ref_dataset=None, +): + """Loads dataset based on args by creating a task""" + if not args.data or not args.subset or not args.task: + raise Exception( + "Please provide necessary arguments to load the dataset - data, subset and task" + ) + task = tasks.setup_task(args) + + task.load_dataset(args.subset) + if not limit_size: + limit_size = len(task.dataset(args.subset)) + + iter = task.get_batch_iterator( + dataset=task.dataset(args.subset), max_sentences=batch_size + ).next_epoch_itr(shuffle=False) + dataset = [] + for i, sample in enumerate(iter): + sample = { + "id": task.datasets[args.subset].ids[sample["id"].item()], + "net_input": { + "src_tokens": sample["net_input"]["src_tokens"], + "src_lengths": sample["net_input"]["src_lengths"], + }, + } + dataset.append(sample) + if i == limit_size - 1: + break + + if ref_dataset: + try: + ids = get_ids_from_dataset(ref_dataset) + except Exception as e: + raise Exception(f"{e} - Cannot extract ids from 
reference dataset") + + filtered_dataset = [] + for sample in dataset: + if ( + sample["id"] in ids + or sample["id"][5:] in ids + or f"dev_{sample['id']}" in ids + ): + filtered_dataset.append(sample) + dataset = filtered_dataset + + max_len, min_len, avg_len = get_dataset_stats(dataset) + print( + f"{args.subset} dataset stats : num_samples={len(dataset)} max_len = {max_len} min_len = {min_len} avg_len = {avg_len}" + ) + + return dataset + + +def randomly_sample_subset(dataset, size=500): + """Randomly sample subset from a dataset""" + random_indices = [random.randint(0, len(dataset) - 1) for i in range(size)] + return [dataset[i] for i in random_indices] + + +def get_short_data_subset(dataset, size=500): + """Get a subset of desired size by sorting based on src_lengths""" + return sort_dataset(dataset)[:size] + + +def get_long_data_subset(dataset, size=500): + """Get a subset of desired size by sorting based on src_lengths descending""" + return sort_dataset(dataset, reverse=True)[:size] + + +def sort_dataset(dataset, reverse=False): + return sorted( + dataset, key=lambda x: x["net_input"]["src_lengths"].item(), reverse=reverse + ) + + +def save_dataset_npy(dataset, file_name): + """Save a dataset as .npy file""" + np.save(file_name, dataset) + + +def get_dataset_stats(dataset): + """Get stats about dataset based on src_lengths of samples""" + max_len = 0 + min_len = 100000 + avg_len = 0 + for d in dataset: + max_len = max(max_len, d["net_input"]["src_lengths"].item()) + min_len = min(min_len, d["net_input"]["src_lengths"].item()) + avg_len += d["net_input"]["src_lengths"].item() + + return max_len, min_len, avg_len / len(dataset) + + +def make_parser(): + """ + Additional args: + 1. Provide the dataset dir path using --data. + 2. Loading the dataset doesn't require config, provide --config-yaml to apply additional feature transforms + """ + parser = options.get_speech_generation_parser() + parser.add_argument( + "--subset", + default=None, + type=str, + required=True, + help="Subset to use for dataset generation", + ) + parser.add_argument( + "--dataset-save-dir", + default=None, + type=str, + required=False, + help="Dir path in which the datasets are to be saved", + ) + parser.add_argument( + "--ref-dataset", + default=None, + type=str, + required=False, + help="If provided, the ids in the reference dataset will be used to filter the new dataset generated.", + ) + parser.add_argument("--dataset-save-token", default="", type=str, required=False) + + options.add_generation_args(parser) + return parser + + +def get_ids_from_dataset(dataset): + return {sample["id"]: 1 for sample in dataset} + + +def cli_main(): + parser = make_parser() + args = options.parse_args_and_arch(parser) + dataset = load_dataset_task(args) + + random_dataset = randomly_sample_subset(dataset) + short_dataset = get_short_data_subset(dataset) + long_dataset = get_long_data_subset(dataset) + + if args.dataset_save_token: + args.dataset_save_token = f"_{args.dataset_save_token}_" + + if args.dataset_save_dir: + save_dataset_npy( + random_dataset, + f"{args.dataset_save_dir}/random_dataset{args.dataset_save_token}w_ids.npy", + ) + save_dataset_npy( + short_dataset, + f"{args.dataset_save_dir}/short_dataset{args.dataset_save_token}w_ids.npy", + ) + save_dataset_npy( + long_dataset, + f"{args.dataset_save_dir}/long_dataset{args.dataset_save_token}w_ids.npy", + ) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/speech_to_speech/benchmarking/get_metrics.py 
b/examples/speech_to_speech/benchmarking/get_metrics.py new file mode 100644 index 0000000000..773257f5da --- /dev/null +++ b/examples/speech_to_speech/benchmarking/get_metrics.py @@ -0,0 +1,162 @@ +import copy +import torch +import logging +from argparse import Namespace +import yaml +from fairseq import options +from examples.speech_to_speech.benchmarking.core import ( + Processing, + SpeechGeneration, + Cascaded2StageS2ST, + Cascaded3StageS2ST, + S2UT, +) +from examples.speech_to_speech.benchmarking.data_utils import ( + load_dataset_npy, + load_dataset_raw_to_waveforms, +) + + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +torch.manual_seed(1) +torch.set_deterministic(True) + + +def make_parser(): + """Note: As the names indicate use s2x_args(ex:ST, ASR etc) for models with speech input, + x2s_args for models with speech output(ex:TTS) and mt_args for translation models (ex: mt, T2U etc). + For direct S2ST models, use x2s_args to provide model details. + """ + parser = options.get_speech_generation_parser() + parser.add_argument("--target-is-code", action="store_true", default=False) + parser.add_argument("--config", type=str) + parser.add_argument( + "--model-type", + default="S2U", + choices=["S2S", "TTS", "S2UT", "MT", "S2T", "2StageS2ST", "3StageS2ST"], + help="Choose one of the models. For model inference implementation, refer to core.py", + ) + parser.add_argument( + "--dataset-path", + type=str, + help="""File to load dataset from. Assumes dataset is a list of samples. + Each sample is a dict of format {'net_input':{'src_tokens':torch.tenor(),'src_lengths':torch.tensor()}}""", + ) + parser.add_argument( + "--dataset-type", + type=str, + default="npy", + choices=["npy", "raw"], + help="""Type of input dataset file""", + ) + parser.add_argument( + "--read-using-sf", + type=str, + default=False, + help="""If sound file should be used to read the raw dataset""", + ) + parser.add_argument( + "--dataset-size", + default=None, + type=int, + help="Dataset size to use for benchmarking", + ) + parser.add_argument( + "--dump-speech-waveforms-dir", + default=None, + type=str, + help="Directory to dump the speech waveforms computed on the dataset.", + ) + parser.add_argument( + "--dump-waveform-file-prefix", + default="", + type=str, + help="File name prefix for the saved speech waveforms", + ) + parser.add_argument( + "--feat-dim", default=80, type=int, help="Input feature dimension" + ) + parser.add_argument( + "--target-sr", + default=16000, + type=int, + help="Target sample rate for dumping waveforms", + ) + + options.add_generation_args(parser) + options.get_interactive_generation_parser(parser) + return parser + + +def cli_main(): + parser = make_parser() + args = options.parse_args_and_arch(parser) + + with open( + args.config, + "r", + ) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + dict_args = vars(args) + dict_args.update(config["general"]) + args = Namespace(**dict_args) + + i = 1 + stage_args = [] + while i <= 3: + var = f"stage{i}" + tmp_args = copy.deepcopy(dict_args) + if var in config: + tmp_args.update(config[var]) + stage_args.append(Namespace(**tmp_args)) + i += 1 + else: + break + + if args.model_type == "S2S" or args.model_type == "TTS": + model = SpeechGeneration(stage_args[0]) + elif args.model_type == "S2UT": + model = S2UT(stage_args[0], stage_args[1] if len(stage_args) > 1 else None) + elif args.model_type == "MT" or args.model_type == "S2T": + model = 
Processing(stage_args[0]) + elif args.model_type == "2StageS2ST": + model = Cascaded2StageS2ST(stage_args[0], stage_args[1]) + elif args.model_type == "3StageS2ST": + model = Cascaded3StageS2ST(stage_args[0], stage_args[2], stage_args[1]) + else: + raise Exception(f"Currently unsupported model type {args.model_type}") + + print(f"Evaluating on dataset - {args.dataset_path}\n") + + if args.dataset_type == "npy": + dataset = load_dataset_npy(args.dataset_path, dataset_size=args.dataset_size) + elif args.dataset_type == "raw": + dataset = load_dataset_raw_to_waveforms( + args.dataset_path, + dataset_size=args.dataset_size, + read_using_soundfile=args.read_using_sf, + ) + else: + raise Exception(f"Invalid dataset type {args.dataset_type}") + + model.warm_up(sample=dataset[0], repeat=2) + + run_time, memory, flops = model.gather_all_metrics(dataset, repeat=1) + print(f"run_time = {run_time}sec \tmemory = {memory}MiB \tflops = {flops}") + + if args.dump_speech_waveforms_dir: + model.dump_final_speech_output( + dataset, + args.dump_speech_waveforms_dir, + lambda x: x, + args.target_sr, + prefix=args.dump_waveform_file_prefix, + ) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/speech_to_speech/docs/data_augmentation.md b/examples/speech_to_speech/docs/data_augmentation.md new file mode 100644 index 0000000000..c0c17ff223 --- /dev/null +++ b/examples/speech_to_speech/docs/data_augmentation.md @@ -0,0 +1,435 @@ +# Noise and audio augmentation techniques + +The noise and data augmentation techniques were written in an effort to understand how augmentation can affect model robustness and performance in both clean and noisy settings. + +All transforms discussed in this section are subclasses of `AudioFeatureTransform`, `AudioWaveformTransform`, or `AudioDatasetTransform`. Each `Audio*Transform` has a unique interaction with the data. If interested in implementing one's own transforms, it is highly advisable to review the differences (see [Adding your own transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#adding-your-own-transforms)). If only applying the in-built transforms, then one only needs to be mindful that the correct kind of transform is listed in the config (see [Using transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#using-transforms)). These transforms can be applied to instances of `SpeechToTextDataset`. + +### Contents +[In-built transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#in-built-transforms) + +[Benchmark studies](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#benchmark-studies) + +[Using transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#using-transforms) + +[Adding your own transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#adding-your-own-transforms) + + +## In-built transforms +### 1. Utterance concatenation +Utterance concatenation is a data augmentation technique introduced as ConcatAug in [Translatotron 2: High-quality direct speech-to-speech translation +with voice preservation](https://arxiv.org/pdf/2107.08661.pdf). +With some parameterized probability, samples are concatenated with one other randomly chosen sample from the whole dataset.
In the positive (concatenation) case, accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]+source[j]` and `target=target[i]+target[j]`. In the negative (skip concatenation) case, accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]` and `target=target[i]` as usual. + +**Usage**: `concataugment` is an `AudioDatasetTransform` and has three configurable hyperparameters: +- `rate`: probability that any single access will result in the positive (concatenation) case. Defaults to 0.25. +- `max_tokens`: maximum number of tokens allowed for concatenated source sequences. This parameter is meant to limit the length of concatenated samples to avoid out-of-memory errors. Defaults to 300. +- `attempts`: maximum number of invalid concatenation attempts before defaulting to the negative (skip concatenation) case. This parameter aims to limit excessive time spent trying to find candidate samples that are short enough to concatenate with. Defaults to 5. + +Please be wary of OOMs while using this augmentation technique; we used smaller batch sizes as a workaround to avoid OOMs. Batch size is determined by update frequency, batch size hyperparameter, and the number of GPU, so you may want to alter these to this end. + +### 2. Noise augmentation suite + +The four noise augmentation methods in this suite adhere to the following principle: with some parameterized probability, samples are overlayed with a noise track. The content of the noise track is specific to the method. Signal-to-noise ratio with which the noise track is overlayed is determined by choosing a value from a random uniform distribution with parameterized endpoints. The first three methods are based off data augmentation methods suggested in Section 3.3 of [X-Vectors: Robust DNN Embeddings for Speaker Recognition](https://danielpovey.com/files/2018_icassp_xvectors.pdf). + +#### 2.1. Music augmentation +For music augmentation, the noise track consists of one file uniformly randomly selected from a corpus of music files. The music file is cut to size, including being repeated to fill the original sample length if necessary. + +**Usage**: `musicaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters: +- `samples_path`: path where background music files are saved as audios (.wav files). No default. +- `rate`: probability that any single access will result in the positive (background music) case. Defaults to 0.25. +- `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. +- `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. + +#### 2.2. Babble augmentation +For babble augmentation, the noise track consists of multiple audios uniformly randomly selected from a corpus of speech files. The number of speech audios in the background track is chosen randomly with equal probability between 3 and 7 audios. + +**Usage**: `babbleaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters: +- `samples_path`: path where background speech files are saved as audios (.wav files). No default. +- `rate`: probability that any single access will result in the positive (background speech) case. Defaults to 0.25. 
+- `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. +- `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. + +#### 2.3. Sporadic noise augmentation +For sporadic noise augmentation, the noise track is mostly silent except for intermittent short clips of noise which are added at roughly a parameterized frequency. These clips are randomly chosen and cut from a corpus of noise files to lengths according to a parameterized Gaussian distribution. + +**Usage**: `sporadicnoiseaugment` is an `AudioWaveformTransform` and has seven configurable hyperparameters: +- `samples_path`: path where background noise files are saved as audios (.wav files). No default. +- `rate`: probability that any single access will result in the positive (add a sporadic noise track) case. Defaults to 0.25. +- `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. +- `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. +- `noise_rate`: rate in noises per second at which noise clip will be added to the original sample +- `noise_len_mean`: mean of Gaussian normal distribution from which length of noise clip is chosen +- `noise_len_std`: standard deviation of Gaussian normal distribution from which length of noise clip is chosen + +#### 2.4. Background noise augmentation +For background noise augmentation, the noise track is a single track uniformly randomly selected from a corpus of noise files. The noise file is cut to size, including being repeated to fill the original sample length if necessary. + +**Usage**: `backgroundnoiseaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters: +- `samples_path`: path where background noise files are saved as audios (.wav files). No default. +- `rate`: probability that any single access will result in the positive (background noise) case. Defaults to 0.25. +- `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. +- `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. + +### 3. Mixed babble and background noise augmentation with recognizable source speaker + +This augmentation technique is based on Algorithm 1 in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) and is similar to the noise augmentation suite techniques in that it has a background noise track. The noise track consists of either (1) another audio sample from the batch or (2) a background noise track. A key difference is the length of the noise track is chosen from a uniform random distribution between 0 and half of the original sample length. + +**Usage**: `noisyoverlapaugment` is an `AudioDatasetTransform` and has seven configurable hyperparameters: +- `noises_path`: path where background noise files are saved as audios (.wav files). No default. 
+- `rate`: probability that any single access will result in the positive (background noise) case. Defaults to 0.25. +- `mixing_noise_rate`: probability that in a positive (background noise) case, the noise track will consist of background noise (rather than babble from the batch). Defaults to 0.1. +- `noise_snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to -5. +- `noise_snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. +- `utterance_snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add **another audio from the batch** to the original source. Defaults to -5. +- `utterance_snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add **another audio from the batch** to the original source. Defaults to 5. + +## Benchmark studies +### Evaluation on clean data +Augmentation in training data|Hyperparameters|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|---|--- +None||3.954|24.984|23.962|24.448 +ConcatAugment|rate = 0.25, max_tokens = 3000, attempts = 5|3.940|25.322|26.124|26.19 +BabbleAugment|rate = 0.25, MUSAN speech, snr_min = (-5), snr_max = 5|3.957|24.226|23.186|22.368| +BackgroundNoiseAugment|rate = 0.1, MUSAN noises, snr_min = (-10), snr_max = 10|3.955|24.745|23.513|23.819 +MusicAugment|rate = 0.25, MUSAN music, snr_min = 0, snr_max = 20|3.954|25.096|24.301|23.341| +SporadicNoiseAugment|rate = 0.1, noise_rate = 0.25, MUSAN noises, snr_min = 10, snr_max = 35|3.954|24.924|23.951|23.484| +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|as above, except limited rates to sum to 0.25: music (0.074), background (0.029), babble (0.074), sporadic (0.029)|3.953|24.874|23.675|24.249| +NoisyOverlapAugment|rate = 0.25, mixing_noise_rate = 0.5, MUSAN noises, utterance_snr_min = (-10), utterance_snr_max = 0, noise_snr_min = (-5), noise_snr_max = 20|3.954|24.949|24.015|23.768| + +### Evaluation on data with music noise added at SNR = (-5) - 5 +Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|--- +None|3.954|15.785|21.105|16.944 +ConcatAugment|3.940|17.186|23.255|18.24 +BabbleAugment|3.957|19.158|22.064|17.116 +BackgroundNoiseAugment|3.955|17.777|22.0|17.535| +MusicAugment|3.954|20.345|23.126|19.433| +SporadicNoiseAugment|3.954|15.927|21.382|14.736| +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|19.724|22.659|17.852| +NoisyOverlapAugment|3.954|17.49|22.142|17.207| + +### Evaluation on data with babble noise added at SNR = (-5) - 5 +Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|--- +None|3.954|4.092|13.514|5.13 +ConcatAugment|3.940|5.493|15.835|6.893 +BabbleAugment|3.957|16.12|21.097|13.996 +BackgroundNoiseAugment|3.955|4.691|15.784|5.982 +MusicAugment|3.954|8.06|17.764|9.008 +SporadicNoiseAugment|3.954|4.009|13.935|4.814 +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|14.692|20.882|14.45 +NoisyOverlapAugment|3.954|4.032|16.434|7.284 + +### Evaluation on data with sporadic noise added at SNR = (-5) - 5 +Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|--- 
+None|3.954|23.778|23.745|22.748 +ConcatAugment|3.940|24.239|25.907|25.723 +BabbleAugment|3.957|23.42|23.048|21.076 +BackgroundNoiseAugment|3.955|23.998|23.467|22.494 +MusicAugment|3.954|24.142|24.181|19.143 +SporadicNoiseAugment|3.954|23.97|23.894|22.61 +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|24.118|23.59|23.717 +NoisyOverlapAugment|3.954|24.265|24.103|23.167 + +### Evaluation on data with background noise added at SNR = (-5) - 5 +Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|--- +None|3.954|20.201|22.525|19.66 +ConcatAugment|3.940|20.904|24.706|21.353 +BabbleAugment|3.957|20.687|22.374|18.907 +BackgroundNoiseAugment|3.955|21.574|22.998|20.043 +MusicAugment|3.954|21.65|23.529|19.87 +SporadicNoiseAugment|3.954|20.578|22.577|19.096 +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|21.811|23.144|20.986 +NoisyOverlapAugment|3.954|21.312|23.153|20.302 + +### Evaluation on data with all four types of noises added at SNR = (-5) - 5, each applied with prob 0.5 +Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|--- +None|3.954|10.895|19.319|12.748 +ConcatAugment|3.940|13.517|21.658|15.428 +BabbleAugment|3.957|18.09|21.384|16.018 +BackgroundNoiseAugment|3.955|12.837|20.719|13.933 +MusicAugment|3.954|16.589|21.823|15.927 +SporadicNoiseAugment|3.954|11.238|19.91|13.31 +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|18.636|21.935|17.845 +NoisyOverlapAugment|3.954|12.829|20.856|15.048 + +### Evaluation on data with noisy overlap augment +Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) +---|---|---|---|--- +None|3.954|21.245|22.24|20.994 +ConcatAugment|3.940|21.611|24.247|23.068 +BabbleAugment|3.957|21.867|21.987|20.099| +BackgroundNoiseAugment|3.955|21.533|21.806|19.717| +MusicAugment|3.954|21.823|22.643|20.847| +SporadicNoiseAugment|3.954|21.373|22.381|20.672| +MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|22.206|22.414|21.375| +NoisyOverlapAugment|3.954|23.371|23.396|22.627| + +## Using transforms +Transforms are configurable. + +1. Please pay careful attention to the type of transform you are applying. + - `concataugment` and `noisyoverlapaugment` are instances of `AudioDatasetTransform` and should be listed in the config under `dataset_transforms`. + - `musicaugment`, `babbleaugment`, `sporadicnoiseaugment`, and `backgroundnoiseaugment` are instances of `AudioWaveformTransform` and should be listed under `waveform_transforms`. + - Instances of `AudioFeatureTransform` should be listed under `feature_transforms`. +2. Feel free to apply these augmentations in different contexts, e.g., you may use a `_train` or `_eval` flag to specify when the transform will be applied. If the dataset at hand contains `train` in its name, those transforms under the `_train` flag will be applied; else, the remaining transforms will be applied. 
+ +For example, you would add this to your config to apply the musicaugment transform to a training dataset: +```yaml +musicaugment: + samples_path: ${MUSIC_PATH} + snr_min: 10 + snr_max: 15 + rate: 0.25 +waveform_transforms: + _train: + - musicaugment +``` +or add this to apply the concataugment transform: +```yaml +concataugment: + rate: 0.25 + max_tokens: 3000 + attempts: 5 +dataset_transforms: + _train: + - concataugment + ``` +You may also want to add multiple of one type of transform; here, we add multiple `AudioWaveformTransform`s: +```yaml +musicaugment: + samples_path: ${MUSIC_PATH} + snr_min: 5 + snr_max: 20 + rate: 0.25 +backgroundnoiseaugment: + samples_path: ${NOISES_PATH} + snr_min: 10 + snr_max: 20 + rate: 0.1 +sporadicnoiseaugment: + samples_path: ${NOISES_PATH} + snr_min: 5 + snr_max: 15 + rate: 0.1 + noise_rate: 0.25 +waveform_transforms: + _train: + - musicaugment + - backgroundnoiseaugment + - sporadicnoiseaugment +``` + +## Adding your own transforms +Note: We store transform implementations in `fairseq/data/audio/*_transforms` directories. You may refer to these as examples while implementing your own transform. + +### Step 1. Picking the right class for your transform +The integration into SpeechToTextDataset is quite different for each kind of transform, so it is important to understand which one is best suited to your purposes. + +**Feature transforms** +`AudioFeatureTransform` is a base class which allows **some transform to be applied to audio spectrograms** in the data loading step. One thing to note is that the source data is either saved as `np.ndarrays` or as audio files, and is to be returned either as features (spectrogram) or waveform. If and only if the data is to be returned as a spectrogram, then `AudioFeatureTransform`s will be applied. + +**Waveform transforms** +`AudioWaveformTransform` is a base class which allows some **transform to be applied to waveforms** in the data loading step. As mentioned above, there are two source and return types to data loading for this dataset. If and only if the data is saved in audio file format, then `AudioWaveformTransform`s will be applied, whichever return type is used. + +**Dataset transforms** +`AudioDatasetTransform` is a base class for transforms **based on more than one item in a dataset**, ex. concatenation of two random samples in a dataset. Rather than being applied in a consistent way, i.e., to all features or to all waveforms, the integration of a dataset transform is entirely specific. Adding a dataset transform requires actually editing the `fairseq/data/audio/speech_to_text_dataset.py` file. + +### Step 2. Setting up your transform (generic to all types of transforms) +Now that you know which kind of transform you would like to use, we are ready to implement it. This step is generic for all transform types, i.e., `TRANSFORM_TYPE` may be any of `feature`, `waveform`, or `dataset`. We will show how to build utterance concatenation (an `AudioDatasetTransform`) as an example. + +Import the base class and registration function for your transform. +```python +from fairseq.data.audio.dataset_transforms import ( + AudioDatasetTransform, + register_audio_dataset_transform +) +``` + +Define the class and register the transform. The name passed into the registration function is how your transform should be named in the config. +```python +@register_audio_dataset_transform("concataugment") +class ConcatAugment(AudioDatasetTransform): +``` + +We are now ready to add the basic important functions to our new class. 
In this example, `_DEFAULTS` refers to a dictionary with the default hyperparameter values that we defined. `from_config_dict` is called to instantiate the transform given hyperparameters from the config. +```python + @classmethod + def from_config_dict(cls, config=None): + _config = {} if config is None else config + return ConcatAugment( + _config.get("rate", _DEFAULTS["rate"]), + _config.get("max_tokens", _DEFAULTS["max_tokens"]), + _config.get("attempts", _DEFAULTS["attempts"]), + ) +``` +We edit the instantiation function `__init__` to track hyperparameters and do any setup work. +```python + def __init__( + self, + rate=_DEFAULTS["rate"], + max_tokens=_DEFAULTS["max_tokens"], + attempts=_DEFAULTS["attempts"], + ): + self.rate, self.max_tokens, self.attempts = rate, max_tokens, attempts +``` +Lastly `__repr__` gives how the transform will be reported in an output log. +```python + def __repr__(self): + return ( + self.__class__.__name__ + + "(" + + ", ".join( + [ + f"rate={self.rate}", + f"max_tokens={self.max_tokens}", + f"attempts={self.attempts}", + ] + ) + + ")" + ) +``` + +### Step 3. Adding the transform logic +At this point, we are ready to implement the actual transform logic. The flow from here is different for each of the three transforms, so follow the path that is relevant to you. +### ...for feature transforms +The final step is implementing the `__call__` function, which applies the transform logic and **returns** the spectrogram with transform applied. This supports and should take exactly **two arguments**: +- `self` +- `x` (np.ndarray): the spectrogram for one source sample. (This is a positional argument, so you can use another parameter name like `spectrogram` instead of `x`.) + +For example, this is the `__call__` function for GlobalCMVN (cepstral mean and variance normalization). +```python + def __call__(self, x): + x = np.subtract(x, self.mean) + x = np.divide(x, self.std) + return x + +``` +### ...for waveform transforms +The final step is implementing the `__call__` function, which applies the transform logic. This supports and should take exactly **three arguments**: +- `self` +- `source` (numpy.ndarray or torch.Tensor): source audio 2d waveform (channels x length) +- `sample_rate` (optional, defaults to None): sample rate of `source` + +`__call__` **returns**: +- transformed audio waveform +- sample rate of transformed audio waveform + +For example, this is the `__call__` function for augmentations in the Noise Augmentation Suite. +```python + def __call__(self, source, sample_rate=None): + if np.random.random() > self.rate: + return source + + noise = self._get_noise( + source.shape, always_2d=True, use_sample_rate=sample_rate + ) + return self._mix(source, noise, rand_uniform(self.snr_min, self.snr_max)), sample_rate +``` + +### ...for dataset transforms +Dataset transforms are extremely flexible, and implementation involves directly integrating them into `fairseq/data/audio/speech_to_text_dataset.py` in transform-specific ways. +There are two basic components: (1) check whether or not this transform is part of this dataset instance using `self.dataset_transforms.has_transform(TRANSFORM_CLS)`, and (2) if so, get the transform using `self.dataset_transforms.get_transform(TRANSFORM_CLS)` & apply it. +Due to the case-by-case specificity, it is easier to demonstrate this by examples. + +#### Example: NoisyOverlapAugment +This transform requires access to multiple items within the same batch at once. 
+ +**Logic**: We still use the transform classes to keep away the transform logic. For example, `__call__` of `NoisyOverlapAugment` class takes a list of source tokens for items in a mini-batch, applies noise/utterance as dictated by the transform, and returns the list of transformed source tokens for items in the mini-batch. + +```python + def __call__(self, sources): + for i, source in enumerate(sources): + if np.random.random() > self.rate: + continue + + pri = source.numpy() + + # ... some transform code omitted + + pri[s_source : s_source + l] = np.add( + pri[s_source : s_source + l], np.multiply(scl, sec[s_sec : s_sec + l]) + ) + sources[i] = torch.from_numpy(pri).float() + + return sources +``` + +**Integration**: The `collater` function for `SpeechToTextDataset` is responsible for preparing a mini-batch for training, so we integrate NOAug through adding a few lines to the top of this function: +```python +def collater( + self, samples: List[SpeechToTextDatasetItem], return_order: bool = False +) -> Dict: + if len(samples) == 0: + return {} + indices = torch.tensor([x.index for x in samples], dtype=torch.long) + + sources = [x.source for x in samples] + + # NOAUG INTEGRATION BLOCK + # (1) Check whether or not this transform is part of this dataset instance + has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment) + # (2) If so, get & apply the transform + if has_NOAug and self.cfg.use_audio_input: + NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment) + sources = NOAug(sources) + + frames = _collate_frames(sources, self.cfg.use_audio_input) + # sort samples by descending number of frames + n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long) + n_frames, order = n_frames.sort(descending=True) + indices = indices.index_select(0, order) + frames = frames.index_select(0, order) + + # ... rest of function +``` + +#### Example: ConcatAugment +This transform requires access to another item within the dataset at once. + +**Logic**: We abstract the logic for picking indices to concatenate by adding a `find_indices` function to the `ConcatAugment` class, which takes one index in the dataset and finds a compatible second index to concatenate source and target tokens. +```python + def find_indices(self, index: int, n_frames: List[int], n_samples: int): + # skip conditions: application rate, max_tokens limit exceeded + if np.random.random() > self.rate: + return [index] + if self.max_tokens and n_frames[index] > self.max_tokens: + return [index] + + # pick second sample to concatenate + for _ in range(self.attempts): + index2 = np.random.randint(0, n_samples) + if index2 != index and ( + not self.max_tokens + or n_frames[index] + n_frames[index2] < self.max_tokens + ): + return [index, index2] + + return [index] +``` + +**Integration**: `SpeechToTextDataset` uses a custom `__getitem__(self, index)` function (called in the background when you write `dataset[i]`). We edited this function (as well as `_get_source_audio` and `get_tokenized_tgt_text`) to achieve the desired transform effect where accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]+source[j]` and `target=target[i]+target[j]`. 
+```python +def __getitem__(self, index: int) -> SpeechToTextDatasetItem: + + # CONCATAUGMENT INTEGRATION BLOCK + # (1) Check whether or not this transform is part of this dataset instance + has_concat = self.dataset_transforms.has_transform(ConcatAugment) + # (2) If so, get & apply the transform + if has_concat: + concat = self.dataset_transforms.get_transform(ConcatAugment) + indices = concat.find_indices(index, self.n_frames, self.n_samples) + + source = self._get_source_audio(indices if has_concat else index) + source = self.pack_frames(source) + + target = None + if self.tgt_texts is not None: + tokenized = self.get_tokenized_tgt_text(indices if has_concat else index) + target = self.tgt_dict.encode_line( + + # ... rest of function +``` diff --git a/examples/speech_to_speech/docs/direct_s2st_discrete_units.md b/examples/speech_to_speech/docs/direct_s2st_discrete_units.md new file mode 100644 index 0000000000..0c63ffee1c --- /dev/null +++ b/examples/speech_to_speech/docs/direct_s2st_discrete_units.md @@ -0,0 +1,181 @@ +# Direct speech-to-speech translation with discrete units + +We provide the implementation for speech-to-unit translation (S2UT) proposed in "[Direct speech-to-speech translation with discrete units (Lee et al. 2021)](https://arxiv.org/abs/2107.05604)" and also the transformer-based implementation of the speech-to-spectrogram translation (S2SPECT, or transformer-based [Translatotron](https://arxiv.org/abs/1904.06037)) baseline in the paper. + +## Pretrained Models + +### Unit-based HiFi-GAN Vocoder +Unit config | Unit size | Vocoder dataset | Model +|---|---|---|--- +[HuBERT Base, Librispeech](https://github.com/fairinternal/fairseq-py/tree/main/examples/hubert), layer 6 | 100 | [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/hubert_base_100_lj/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/hubert_base_100_lj/config.json) + + +## Data preparation +### Target speech +0. (optional) To prepare S2S data from a speech-to-text translation (ST) dataset, see [fairseq-S^2](https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis) for pre-trained TTS models and instructions on how to train and decode TTS models. +1. Prepare two folders, `$SRC_AUDIO` and `$TGT_AUDIO`, with `${SPLIT}/${SAMPLE_ID}.wav` for source and target speech under each folder, separately. Note that for S2UT experiments, target audio sampling rate should be in 16,000 Hz, and for S2SPECT experiments, target audio sampling rate is recommended to be in 22,050 Hz. +2. To prepare target discrete units for S2UT model training, see [Generative Spoken Language Modeling (speech2unit)](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/speech2unit) for pre-trained k-means models, checkpoints, and instructions on how to decode units from speech. Set the output target unit files (`--out_quantized_file_path`) as `${TGT_AUDIO}/${SPLIT}.txt`. In [Lee et al. 2021](https://arxiv.org/abs/2107.05604), we use 100 units from the sixth layer (`--layer 6`) of the HuBERT Base model. + +### Formatting data +**Speech-to-speech data** + +_S2UT_ + * Set `--reduce-unit` for training S2UT _reduced_ model + * Pre-trained vocoder and config (`$VOCODER_CKPT`, `$VOCODER_CFG`) can be downloaded from the **Pretrained Models** section. They are not required if `--eval-inference` is not going to be set during model training. +``` +# $SPLIT1, $SPLIT2, etc. 
are split names such as train, dev, test, etc. + +python examples/speech_to_speech/preprocessing/prep_s2ut_data.py \ + --source-dir $SRC_AUDIO --target-dir $TGT_AUDIO --data-split $SPLIT1 $SPLIT2 \ + --output-root $DATA_ROOT --reduce-unit \ + --vocoder-checkpoint $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG +``` + +_S2SPECT_ +``` +# $SPLIT1, $SPLIT2, etc. are split names such as train, dev, test, etc. + +python examples/speech_to_speech/preprocessing/prep_s2spect_data.py \ + --source-dir $SRC_AUDIO --target-dir $TGT_AUDIO --data-split $SPLIT1 $SPLIT2 \ + --output-root $DATA_ROOT +``` + +**Multitask data** + * For each multitask `$TASK_NAME`, prepare `${DATA_ROOT}/${TASK_NAME}/${SPLIT}.tsv` files for each split following the format below: (Two tab separated columns. The sample_ids should match with the sample_ids for the speech-to-speech data in `${DATA_ROOT}/${SPLIT}.tsv`.) +``` +id tgt_text +sample_id_0 token1 token2 token3 ... +sample_id_1 token1 token2 token3 ... +... +``` + * For each multitask `$TASK_NAME`, prepare `${DATA_ROOT}/${TASK_NAME}/dict.txt`, a dictionary in fairseq format with all tokens for the targets for `$TASK_NAME`. + * Create `config_multitask.yaml`. Below is an example of the config used for S2UT _reduced_ with Fisher experiments including two encoder multitasks (`source_letter`, `target_letter`) and one decoder CTC task (`decoder_target_ctc`). +``` +source_letter: # $TASK_NAME + decoder_type: transformer + dict: ${DATA_ROOT}/source_letter/dict.txt + data: ${DATA_ROOT}/source_letter + encoder_layer: 6 + loss_weight: 8.0 +target_letter: + decoder_type: transformer + dict: ${DATA_ROOT}/target_letter/dict.txt + data: ${DATA_ROOT}/target_letter + encoder_layer: 8 + loss_weight: 8.0 +decoder_target_ctc: + decoder_type: ctc + dict: ${DATA_ROOT}/decoder_target_ctc/dict.txt + data: ${DATA_ROOT}/decoder_target_ctc + decoder_layer: 3 + loss_weight: 1.6 +``` + + +## Training + +**Speech-to-unit translation (S2UT)** + +Here's an example for training Fisher S2UT models with 100 discrete units as target: +``` +fairseq-train $DATA_ROOT \ + --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ + --task speech_to_speech --target-is-code --target-code-size 100 --vocoder code_hifigan \ + --criterion speech_to_unit --label-smoothing 0.2 \ + --arch s2ut_transformer_fisher --share-decoder-input-output-embed \ + --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \ + --train-subset train --valid-subset dev \ + --save-dir ${MODEL_DIR} \ + --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-7 --warmup-updates 10000 \ + --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 \ + --max-update 400000 --max-tokens 20000 --max-target-positions 3000 --update-freq 4 \ + --seed 1 --fp16 --num-workers 8 +``` +* Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 4` to simulate training with 4 GPUs. +* Set `--n-frames-per-step 5` to train an S2UT _stacked_ system with reduction ratio r=5. (Use `$DATA_ROOT` prepared without `--reduce-unit`.) +* (optional) one can turn on tracking MCD loss during training for checkpoint selection by setting `--eval-inference --eval-args '{"beam": 1, "max_len_a": 1}' --best-checkpoint-metric mcd_loss`. It is recommended to sample a smaller subset as the validation set as MCD loss computation is time-consuming. 
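+
+For instance, here is a minimal sketch for carving out such a smaller validation subset (the split name `dev_small` and the 500-sample size are arbitrary placeholders; it assumes the tab-separated manifests with a single header line produced by the preprocessing scripts above, and `shuf` from GNU coreutils):
+```
+# keep the header line plus a random subset of 500 samples
+head -n 1 ${DATA_ROOT}/dev.tsv > ${DATA_ROOT}/dev_small.tsv
+tail -n +2 ${DATA_ROOT}/dev.tsv | shuf -n 500 >> ${DATA_ROOT}/dev_small.tsv
+```
+Then pass `--valid-subset dev_small` to the training command above.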
+ +**Speech-to-spectrogram translation (S2SPECT)** + +Here's an example for training Fisher S2SPECT models with reduction ratio r=5: +``` +fairseq-train $DATA_ROOT \ + --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ + --task speech_to_speech --n-frames-per-step 5 \ + --criterion speech_to_spectrogram \ + --arch s2spect_transformer_fisher --decoder-normalize-before \ + --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \ + --train-subset train --valid-subset dev \ + --save-dir ${MODEL_DIR} \ + --eval-inference --best-checkpoint-metric mcd_loss \ + --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-7 --warmup-updates 10000 \ + --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 --weight-decay 1e-6 \ + --max-update 400000 --max-tokens 80000 --max-tokens-valid 30000 --required-batch-size-multiple 1 \ + --max-target-positions 3000 --update-freq 16 \ + --seed 1 --fp16 --num-workers 8 +``` +* Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 16` to simulate training with 16 GPUs. +* We recommend turning on MCD loss during training for the best checkpoint selection. + +**Unit-based HiFi-GAN vocoder** + +The vocoder is trained with the [speech-resynthesis repo](https://github.com/facebookresearch/speech-resynthesis). See [here](https://github.com/facebookresearch/speech-resynthesis/tree/main/examples/speech_to_speech_translation) for instructions on how to train the unit-based HiFi-GAN vocoder with duration prediction. The same vocoder can support waveform generation for both _reduced_ unit sequences (with `--dur-prediction` set during inference) and original unit sequences. + +## Inference + +**Speech-to-unit translation (S2UT)** + +1. Follow the same inference process as in [fairseq-S2T](https://github.com/pytorch/fairseq/tree/main/examples/speech_to_text) to generate unit sequences (`${RESULTS_PATH}/generate-${GEN_SUBSET}.txt`). +``` +fairseq-generate $DATA_ROOT \ + --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ + --task speech_to_speech --target-is-code --target-code-size 100 --vocoder code_hifigan \ + --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \ + --max-tokens 50000 \ + --beam 10 --max-len-a 1 \ + --results-path ${RESULTS_PATH} +``` + * Set `--beam 1 --n-frames-per-step $r` for decoding with S2UT _stacked_ models. + +2. Convert unit sequences to waveform. +``` +grep "^D\-" ${RESULTS_PATH}/generate-${GEN_SUBSET}.txt | \ + sed 's/^D-//ig' | sort -nk1 | cut -f3 \ + > ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit + +python examples/speech_to_speech/generate_waveform_from_code.py \ + --in-code-file ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit \ + --vocoder $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG \ + --results-path ${RESULTS_PATH} --dur-prediction +``` + * Set `--dur-prediction` for generating audio for S2UT _reduced_ models. + + +**Speech-to-spectrogram translation (S2SPECT)** + +Follow the same inference process as in [fairseq-S^2](https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis) to generate waveform. 
+
+```
+# assume using a default Griffin-Lim vocoder
+
+python examples/speech_synthesis/generate_waveform.py $DATA_ROOT \
+  --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \
+  --task speech_to_speech --n-frames-per-step 5 \
+  --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \
+  --max-tokens 50000 \
+  --results-path ${RESULTS_PATH} --dump-waveforms --output-sample-rate 16000
+```
+
+In addition to using the default Griffin-Lim vocoder, one can also finetune a HiFi-GAN vocoder for the S2SPECT model by following the instructions in the [HiFi-GAN repo](https://github.com/jik876/hifi-gan).
+
+**Multitask decoding**
+
+Coming soon.
+
+## Evaluation
+
+To evaluate speech translation output, we first apply ASR on the speech output and then compute the BLEU score between the ASR decoded text and the references using sacreBLEU.
+
+**En**
+* ASR: We use the "[Wav2Vec 2.0 Large (LV-60) + Self Training / 960 hours / Libri-Light + Librispeech](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt)" En ASR model open-sourced by the [wav2vec](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) project. See [instructions](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model) on how to run inference with a wav2vec-based ASR model. The model is also available on [Hugging Face](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
+* Text normalization: We use the text cleaner at [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron) for pre-processing reference English text for ASR BLEU evaluation.
diff --git a/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md b/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md
new file mode 100644
index 0000000000..fbfa5dd16a
--- /dev/null
+++ b/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md
@@ -0,0 +1,125 @@
+# Speech-to-speech translation (S2ST)
+
+We provide the implementation for speech-to-unit translation (S2UT) proposed in [Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation (Popuri et al. 2022)](https://arxiv.org/abs/2204.02967) and the various pretrained models used.
+
+## Pretrained Models
+
+### Unit extraction
+
+We use the multilingual HuBERT model open-sourced in [Textless S2ST with Real Data](textless_s2st_real_data.md).
+
+### Wav2vec 2.0
+
+Language | Block type | Model size | Dataset | Model |
+--- | --- | --- | --- | --- |
+Es | Transformer | BASE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/transformer_B.pt) |
+Es | Transformer | LARGE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/transformer_L.pt) |
+Es | Conformer | LARGE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/conformer_L.pt) |
+En | Transformer | BASE | Librilight | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/en/transformer_B.pt) |
+En | Conformer | LARGE | Librilight | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/en/conformer_L.pt) |
+
+### Unit mBART
+
+Unit size | Dataset | Unit config | Model |
+--- | --- | --- | --- |
+1000 | [Voxpopuli](https://aclanthology.org/2021.acl-long.80) En, Es unlabelled speech | [mbart_large](https://github.com/pytorch/fairseq/blob/f591cc94caa85098ccf125a4782f91125b6a086d/fairseq/models/bart/model.py#L368) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/unit_mBART/checkpoint.pt) |
+
+## Data preparation
+
+1. To prepare data for S2UT finetuning, follow the steps from [Direct S2ST with Discrete Units](./direct_s2st_discrete_units.md) and format the data in the _S2UT_ format. Note that here we use 1000 units from the eleventh layer (`--layer 11`) of the multilingual HuBERT model linked above, instead of the 100 units from the sixth layer of the English HuBERT Base model used there.
+2. Run the following to set the expected column names in each manifest header:
+
+```
+var="id\taudio\tn_frames\ttgt_text\ttgt_n_frames"
+sed -i "1s/.*/$var/" ${SPLIT}.tsv
+```
+
+## Training
+
+**Speech-to-unit translation (S2UT)**
+
+Here's an example for finetuning S2UT models with 1000 discrete units as the target. A sample [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/config.yaml) file and [vocabulary](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/dict.txt) for Es-En are available for download:
+
+```
+fairseq-train $DATA_ROOT \
+  --config-yaml config.yaml \
+  --task speech_to_text --arch xm_transformer \
+  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
+  --share-decoder-input-output-embed --adaptor-n-layers 1 --normalize \
+  --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \
+  --train-subset train --valid-subset dev \
+  --load-pretrained-decoder-from ${unit_mBART} --w2v-path ${wav2vec2.0} \
+  --mask-prob 0.3 --mask-channel-length 32 --mask-channel-prob 0.25 \
+  --save-dir ${MODEL_DIR} --checkpoint-activations --encoder-proj \
+  --lr 0.0005 --lr-scheduler inverse_sqrt \
+  --warmup-init-lr 1e-7 --warmup-updates 10000 \
+  --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 \
+  --max-update 20000 --max-tokens 4000 --max-tokens-valid 4000 --max-source-positions 4000 \
+  --max-target-positions 4000 --update-freq 120 \
+  --seed 1 --fp16 --num-workers 1
+```
+
+* Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 120` to simulate training with 120 GPUs.
+* In the above setting we finetune the model end to end, corresponding to the full setup in the paper.
+
+* To apply LNA-E partial finetuning, add `--finetune-w2v-params layer_norm,self_attn`.
+* For LNA-D partial finetuning, add `--finetune-decoder-params encoder_attn,layer_norm,self_attn`. To optionally freeze the encoder for the first ${K} updates, use `--freeze-finetune-updates ${K}`.
+* For LNA-E,D partial finetuning, add both of the above options.
+
+**Unit-based HiFi-GAN vocoder**
+
+We apply the open-sourced unit-based HiFi-GAN vocoders from [Textless S2ST with Real Data](textless_s2st_real_data.md) to convert the predicted unit sequences to waveform.
+
+## Inference
+
+**Speech-to-unit translation (S2UT)**
+
+1. Follow the same inference process as in [fairseq-S2T](https://github.com/pytorch/fairseq/tree/main/examples/speech_to_text) to generate unit sequences (`${RESULTS_PATH}/generate-${GEN_SUBSET}.txt`).
+
+```
+fairseq-generate $DATA_ROOT \
+  --config-yaml config.yaml \
+  --task speech_to_text \
+  --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \
+  --max-tokens 10000 --max-source-positions 10000 --max-target-positions 10000 \
+  --beam 10 --max-len-a 1 --max-len-b 200 \
+  --results-path ${RESULTS_PATH}
+```
+
+2. Convert unit sequences to waveform.
+
+```
+grep "^D\-" ${RESULTS_PATH}/generate-${GEN_SUBSET}.txt | \
+  sed 's/^D-//ig' | sort -nk1 | cut -f3 \
+  > ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit
+
+python examples/speech_to_speech/generate_waveform_from_code.py \
+  --in-code-file ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit \
+  --vocoder $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG \
+  --results-path ${RESULTS_PATH} --dur-prediction
+```
+
+## Evaluation
+
+To evaluate speech translation output, we first apply ASR on the speech output and then compute the BLEU score between the ASR decoded text and the references using sacreBLEU (a rough sketch of this step follows the list below).
+
+* Text normalization: We use the text cleaner at [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron) for pre-processing reference English text for ASR BLEU evaluation. The text cleaner used for Spanish text normalization will be updated here shortly.
+* En ASR: We use the "[Wav2Vec 2.0 Large (LV-60) + Self Training / 960 hours / Libri-Light + Librispeech](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt)" En ASR model open-sourced by the [wav2vec](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) project. The model is also available on [Hugging Face](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
+* Es ASR: We use the [Wav2Vec2-Large-XLSR-53-Spanish](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish) Es ASR model, a [wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) model fine-tuned on Spanish Common Voice and open-sourced by jonatasgrosman on Hugging Face.
+* See [instructions](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model) on how to run inference with a wav2vec-based ASR model.
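+
+A rough sketch of the BLEU step (not the released evaluation tooling; the file names are placeholders, and the keithito/tacotron cleaning is approximated here by simple lowercasing and punctuation stripping):
+```python
+import re
+import string
+
+import sacrebleu  # pip install sacrebleu
+
+
+def normalize(text: str) -> str:
+    # crude stand-in for the tacotron text cleaners referenced above
+    text = text.lower().translate(str.maketrans("", "", string.punctuation))
+    return re.sub(r"\s+", " ", text).strip()
+
+
+# one ASR hypothesis / reference translation per line, in the same order
+with open("asr_transcripts.txt") as f:
+    hyps = [normalize(line) for line in f]
+with open("references.txt") as f:
+    refs = [normalize(line) for line in f]
+
+print(sacrebleu.corpus_bleu(hyps, [refs]))
+```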
+ + +## Finetuned Model Checkpoints + +ID | En - Es | Es - En | +| --- | --- | --- | +**S2UT systems without pre-training** +S2UT with multitask | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//S2UT_w_multitask.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//S2UT_w_multitask.pt) | +**S2UT systems with model pre-training** +w2v2-L | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_only.pt ) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_only.pt) | +w2v2-L + mBART (LNA-E) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LNE.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LNE.pt) | +w2v2-L + mBART (LNA-D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LND.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LND.pt) | +w2v2-L + mBART (LNA-E,D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LNED.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LNED.pt) | +**S2UT systems with model pre-training and data augmentation** +w2v2-L + mBART (LNA-D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LND_w_ASR.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LND_w_ASR.pt) | + +Note: Some of the tasks use speech_to_text_sharded task which is yet to be open sourced. So make sure to override the task to speech_to_text to use those models. diff --git a/examples/speech_to_speech/docs/textless_s2st_real_data.md b/examples/speech_to_speech/docs/textless_s2st_real_data.md new file mode 100644 index 0000000000..ca6044be1a --- /dev/null +++ b/examples/speech_to_speech/docs/textless_s2st_real_data.md @@ -0,0 +1,89 @@ +# Textless Speech-to-Speech Translation (S2ST) on Real Data + +We provide instructions and pre-trained models for the work "[Textless Speech-to-Speech Translation on Real Data (Lee et al. 2021)](https://arxiv.org/abs/2112.08352)". 
+ +## Pre-trained Models + +### HuBERT +Model | Pretraining Data | Model | Quantizer +|---|---|---|--- +mHuBERT Base | [VoxPopuli](https://github.com/facebookresearch/voxpopuli) En, Es, Fr speech from the 100k subset | [download](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3.pt) | [L11 km1000](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin) + + +### Unit-based HiFi-GAN vocoder +Unit config | Unit size | Vocoder language | Dataset | Model +|---|---|---|---|--- +mHuBERT, layer 11 | 1000 | En | [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json) +mHuBERT, layer 11 | 1000 | Es | [CSS10](https://github.com/Kyubyong/css10) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/config.json) +mHuBERT, layer 11 | 1000 | Fr | [CSS10](https://github.com/Kyubyong/css10) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/config.json) + + +### Speech normalizer +Language | Training data | Target unit config | Model +|---|---|---|--- +En | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_10min.tar.gz) +En | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_1h.tar.gz) +En | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_10h.tar.gz) +Es | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_10min.tar.gz) +Es | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_1h.tar.gz) +Es | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_10h.tar.gz) +Fr | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_10min.tar.gz) +Fr | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_1h.tar.gz) +Fr | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_10h.tar.gz) + +* Refer to the paper for the details of the training data. + +## Inference with Pre-trained Models + +### Speech normalizer +1. Download the pre-trained models, including the dictionary, to `DATA_DIR`. +2. Format the audio data. +```bash +# AUDIO_EXT: audio extension, e.g. wav, flac, etc. 
+
+# Assume all audio files are at ${AUDIO_DIR}/*.${AUDIO_EXT}
+
+python examples/speech_to_speech/preprocessing/prep_sn_data.py \
+  --audio-dir ${AUDIO_DIR} --ext ${AUDIO_EXT} \
+  --data-name ${GEN_SUBSET} --output-dir ${DATA_DIR} \
+  --for-inference
+```
+
+3. Run the speech normalizer and post-process the output.
+```bash
+mkdir -p ${RESULTS_PATH}
+
+python examples/speech_recognition/new/infer.py \
+  --config-dir examples/hubert/config/decode/ \
+  --config-name infer_viterbi \
+  task.data=${DATA_DIR} \
+  task.normalize=false \
+  common_eval.results_path=${RESULTS_PATH}/log \
+  common_eval.path=${DATA_DIR}/checkpoint_best.pt \
+  dataset.gen_subset=${GEN_SUBSET} \
+  '+task.labels=["unit"]' \
+  +decoding.results_path=${RESULTS_PATH} \
+  common_eval.post_process=none \
+  +dataset.batch_size=1 \
+  common_eval.quiet=True
+
+# Post-process and generate output at ${RESULTS_PATH}/${GEN_SUBSET}.txt
+python examples/speech_to_speech/preprocessing/prep_sn_output_data.py \
+  --in-unit ${RESULTS_PATH}/hypo.units \
+  --in-audio ${DATA_DIR}/${GEN_SUBSET}.tsv \
+  --output-root ${RESULTS_PATH}
+```
+
+
+### Unit-to-waveform conversion with unit vocoder
+The pre-trained vocoders support generating audio for both full unit sequences and reduced unit sequences (i.e. with duplicate consecutive units removed). Set `--dur-prediction` for generating audio with reduced unit sequences.
+```bash
+# IN_CODE_FILE contains one unit sequence per line. Units are separated by space.
+
+python examples/speech_to_speech/generate_waveform_from_code.py \
+  --in-code-file ${IN_CODE_FILE} \
+  --vocoder ${VOCODER_CKPT} --vocoder-cfg ${VOCODER_CFG} \
+  --results-path ${RESULTS_PATH} --dur-prediction
+```
+
+## Training new models
+To be updated.
diff --git a/examples/speech_to_speech/generate_waveform_from_code.py b/examples/speech_to_speech/generate_waveform_from_code.py
new file mode 100644
index 0000000000..82aa7acfb8
--- /dev/null
+++ b/examples/speech_to_speech/generate_waveform_from_code.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
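+
+"""Convert discrete unit sequences into waveform with a unit-based HiFi-GAN
+vocoder (CodeHiFiGANVocoder).
+
+Reads one space-separated unit sequence per line from --in-code-file and writes
+one 16 kHz *_pred.wav file per input line to --results-path. Use
+--dur-prediction for reduced/unique unit sequences, and --speaker-id to select
+a speaker for multi-speaker vocoders (-1 samples a random speaker).
+"""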
+ +import argparse +import json +import logging +from pathlib import Path +import random +import soundfile as sf +import torch + +from tqdm import tqdm + +from fairseq import utils +from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder + + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def dump_result(args, sample_id, pred_wav, suffix=""): + sf.write( + f"{args.results_path}/{sample_id}{suffix}_pred.wav", + pred_wav.detach().cpu().numpy(), + 16000, + ) + + +def load_code(in_file): + with open(in_file) as f: + out = [list(map(int, line.strip().split())) for line in f] + return out + + +def main(args): + logger.info(args) + + use_cuda = torch.cuda.is_available() and not args.cpu + + with open(args.vocoder_cfg) as f: + vocoder_cfg = json.load(f) + vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg) + if use_cuda: + vocoder = vocoder.cuda() + + multispkr = vocoder.model.multispkr + if multispkr: + logger.info("multi-speaker vocoder") + num_speakers = vocoder_cfg.get( + "num_speakers", 200 + ) # following the default in codehifigan to set to 200 + assert ( + args.speaker_id < num_speakers + ), f"invalid --speaker-id ({args.speaker_id}) with total #speakers = {num_speakers}" + + data = load_code(args.in_code_file) + Path(args.results_path).mkdir(exist_ok=True, parents=True) + for i, d in tqdm(enumerate(data), total=len(data)): + x = { + "code": torch.LongTensor(d).view(1, -1), + } + suffix = "" + if multispkr: + spk = ( + random.randint(0, num_speakers - 1) + if args.speaker_id == -1 + else args.speaker_id + ) + suffix = f"_spk{spk}" + x["spkr"] = torch.LongTensor([spk]).view(1, 1) + + x = utils.move_to_cuda(x) if use_cuda else x + wav = vocoder(x, args.dur_prediction) + dump_result(args, i, wav, suffix=suffix) + + +def cli_main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--in-code-file", type=str, required=True, help="one unit sequence per line" + ) + parser.add_argument( + "--vocoder", type=str, required=True, help="path to the CodeHiFiGAN vocoder" + ) + parser.add_argument( + "--vocoder-cfg", + type=str, + required=True, + help="path to the CodeHiFiGAN vocoder config", + ) + parser.add_argument("--results-path", type=str, required=True) + parser.add_argument( + "--dur-prediction", + action="store_true", + help="enable duration prediction (for reduced/unique code sequences)", + ) + parser.add_argument( + "--speaker-id", + type=int, + default=-1, + help="Speaker id (for vocoder that supports multispeaker). Set to -1 to randomly sample speakers.", + ) + parser.add_argument("--cpu", action="store_true", help="run on CPU") + + args = parser.parse_args() + + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/speech_to_speech/preprocessing/__init__.py b/examples/speech_to_speech/preprocessing/__init__.py new file mode 100644 index 0000000000..6264236915 --- /dev/null +++ b/examples/speech_to_speech/preprocessing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/examples/speech_to_speech/preprocessing/data_utils.py b/examples/speech_to_speech/preprocessing/data_utils.py new file mode 100644 index 0000000000..a83a67f954 --- /dev/null +++ b/examples/speech_to_speech/preprocessing/data_utils.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from pathlib import Path +from typing import List, Optional + +from examples.speech_to_text.data_utils import S2TDataConfigWriter + + +def gen_config_yaml( + manifest_root: Path, + yaml_filename: str = "config.yaml", + specaugment_policy: Optional[str] = "lb", + feature_transform: Optional[List[str]] = None, + input_channels: Optional[int] = 1, + input_feat_per_channel: Optional[int] = 80, + audio_root: str = "", + vocoder_type: Optional[str] = None, + vocoder_checkpoint: Optional[str] = None, + vocoder_cfg: Optional[str] = None, + extra=None, +): + manifest_root = manifest_root.absolute() + writer = S2TDataConfigWriter(manifest_root / yaml_filename) + + if input_channels is not None: + writer.set_input_channels(input_channels) + if input_feat_per_channel is not None: + writer.set_input_feat_per_channel(input_feat_per_channel) + specaugment_setters = { + "lb": writer.set_specaugment_lb_policy, + "ld": writer.set_specaugment_ld_policy, + "sm": writer.set_specaugment_sm_policy, + "ss": writer.set_specaugment_ss_policy, + } + specaugment_setter = specaugment_setters.get(specaugment_policy, None) + if specaugment_setter is not None: + specaugment_setter() + + if feature_transform is None: + feature_transform = [] + else: + writer.set_feature_transforms("*", feature_transform) + + if specaugment_policy is not None: + writer.set_feature_transforms("_train", feature_transform + ["specaugment"]) + + if len(audio_root) > 0: + writer.set_audio_root(audio_root) + + if ( + vocoder_type is not None + and vocoder_checkpoint is not None + and vocoder_cfg is not None + ): + writer.set_extra( + { + "vocoder": { + "type": vocoder_type, + "config": vocoder_cfg, + "checkpoint": vocoder_checkpoint, + } + } + ) + + if extra is not None: + writer.set_extra(extra) + writer.flush() + + +def load_units(in_file): + out = {} + with open(in_file) as f: + for line in f: + sample_id, units = line.strip().split("|", 1) + out[sample_id] = units.split() + + return out + + +def process_units(units, reduce=False): + if not reduce: + return units + + out = [u for i, u in enumerate(units) if i == 0 or u != units[i - 1]] + return out diff --git a/examples/speech_to_speech/preprocessing/prep_s2spect_data.py b/examples/speech_to_speech/preprocessing/prep_s2spect_data.py new file mode 100644 index 0000000000..2748b37aef --- /dev/null +++ b/examples/speech_to_speech/preprocessing/prep_s2spect_data.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
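+
+"""Prepare S2SPECT training data: extract log-Mel spectrogram features for the
+target audio into a ZIP, write per-split tsv manifests pairing each source
+audio with its target features, and generate the config YAML.
+"""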
+ +import argparse +import logging +import os +from pathlib import Path +import shutil +import torchaudio + +import soundfile as sf +from tqdm import tqdm +import pandas as pd + +from examples.speech_synthesis.data_utils import extract_logmel_spectrogram +from examples.speech_to_speech.preprocessing.data_utils import gen_config_yaml +from examples.speech_to_text.data_utils import create_zip, get_zip_manifest, save_df_to_tsv +from fairseq.data.audio.audio_utils import convert_waveform + + +logger = logging.getLogger(__name__) + +MANIFEST_COLUMNS = ["id", "src_audio", "src_n_frames", "tgt_audio", "tgt_n_frames"] + + +def prepare_target_data(args, tgt_audios): + feature_name = "logmelspec80" + zip_path = args.output_root / f"{feature_name}.zip" + if zip_path.exists(): + print(f"{zip_path} exists.") + return zip_path + + feature_root = args.output_root / feature_name + feature_root.mkdir(exist_ok=True) + + print("Extracting Mel spectrogram features...") + for tgt_audio in tqdm(tgt_audios): + sample_id = tgt_audio.stem + waveform, sample_rate = torchaudio.load(tgt_audio.as_posix()) + waveform, sample_rate = convert_waveform( + waveform, sample_rate, normalize_volume=args.normalize_volume, + to_sample_rate=args.sample_rate + ) + extract_logmel_spectrogram( + waveform, sample_rate, feature_root / f"{sample_id}.npy", + win_length=args.win_length, hop_length=args.hop_length, + n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min, + f_max=args.f_max + ) + print("ZIPing features...") + create_zip(feature_root, zip_path) + shutil.rmtree(feature_root) + + return zip_path + + +def process(args): + os.makedirs(args.output_root, exist_ok=True) + + manifest = {} + tgt_audios = [] + for split in args.data_split: + print(f"Processing {split}...") + + manifest[split] = {c: [] for c in MANIFEST_COLUMNS} + missing_tgt_audios = [] + src_audios = list(args.source_dir.glob(f"{split}/*.wav")) + for src_audio in tqdm(src_audios): + sample_id = src_audio.stem + + tgt_audio = args.target_dir / split / f"{sample_id}.wav" + if not tgt_audio.is_file(): + missing_tgt_audios.append(sample_id) + continue + + tgt_audios.append(tgt_audio) + + src_n_frames = sf.info(src_audio.as_posix()).frames + manifest[split]["id"].append(sample_id) + manifest[split]["src_audio"].append(src_audio.as_posix()) + manifest[split]["src_n_frames"].append( + src_n_frames // 160 + ) # estimation of 10-ms frame for 16kHz audio + + print(f"Processed {len(manifest[split]['id'])} samples") + if len(missing_tgt_audios) > 0: + print( + f"{len(missing_tgt_audios)} with missing target data (first 3 examples: {', '.join(missing_tgt_audios[:3])})" + ) + + # Extract features and pack features into ZIP + zip_path = prepare_target_data(args, tgt_audios) + + print("Fetching ZIP manifest...") + tgt_audio_paths, tgt_audio_lengths = get_zip_manifest(zip_path) + + print("Generating manifest...") + for split in args.data_split: + print(f"Processing {split}...") + + for sample_id in tqdm(manifest[split]["id"]): + manifest[split]["tgt_audio"].append(tgt_audio_paths[sample_id]) + manifest[split]["tgt_n_frames"].append(tgt_audio_lengths[sample_id]) + + out_manifest = args.output_root / f"{split}.tsv" + print(f"Writing manifest to {out_manifest}...") + save_df_to_tsv(pd.DataFrame.from_dict(manifest[split]), out_manifest) + + # Generate config YAML + win_len_t = args.win_length / args.sample_rate + hop_len_t = args.hop_length / args.sample_rate + extra = { + "features": { + "type": "spectrogram+melscale+log", + "sample_rate": args.sample_rate, + "eps": 1e-5, "n_mels": 
args.n_mels, "n_fft": args.n_fft, + "window_fn": "hann", "win_length": args.win_length, + "hop_length": args.hop_length, + "win_len_t": win_len_t, "hop_len_t": hop_len_t, + "f_min": args.f_min, "f_max": args.f_max, + "n_stft": args.n_fft // 2 + 1 + } + } + gen_config_yaml( + args.output_root, + audio_root=args.output_root.as_posix(), + specaugment_policy="lb", + feature_transform=["utterance_cmvn", "delta_deltas"], + extra=extra, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--source-dir", required=True, type=Path, help="source audio directory" + ) + parser.add_argument( + "--target-dir", required=True, type=Path, help="target audio directory" + ) + parser.add_argument( + "--data-split", + default=["train", "valid", "test"], + nargs="+", + help="data split names", + ) + parser.add_argument( + "--output-root", required=True, type=Path, help="output directory" + ) + # target feature related + parser.add_argument("--win-length", type=int, default=1024) + parser.add_argument("--hop-length", type=int, default=256) + parser.add_argument("--n-fft", type=int, default=1024) + parser.add_argument("--n-mels", type=int, default=80) + parser.add_argument("--f-min", type=int, default=20) + parser.add_argument("--f-max", type=int, default=8000) + parser.add_argument("--sample-rate", type=int, default=22050) + parser.add_argument("--normalize-volume", "-n", action="store_true") + + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_to_speech/preprocessing/prep_s2ut_data.py b/examples/speech_to_speech/preprocessing/prep_s2ut_data.py new file mode 100644 index 0000000000..c97c0fe9be --- /dev/null +++ b/examples/speech_to_speech/preprocessing/prep_s2ut_data.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
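+
+"""Prepare S2UT training data: write per-split tsv manifests pairing each
+source audio with its target discrete unit sequence loaded from
+<target-dir>/<split>.txt (optionally reduced with --reduce-unit), and generate
+the config YAML with the vocoder entry.
+"""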
+ +import argparse +import logging +from pathlib import Path + +import soundfile as sf +from tqdm import tqdm +import pandas as pd + +from examples.speech_to_speech.preprocessing.data_utils import ( + gen_config_yaml, + load_units, + process_units, +) +from examples.speech_to_text.data_utils import save_df_to_tsv + +logger = logging.getLogger(__name__) + +MANIFEST_COLUMNS = ["id", "src_audio", "src_n_frames", "tgt_audio", "tgt_n_frames"] + + +def process(args): + args.output_root.mkdir(exist_ok=True) + + print("Generating manifest...") + for split in args.data_split: + print(f"Processing {split}") + + # load target units + target_unit_data = load_units(args.target_dir / f"{split}.txt") + + manifest = {c: [] for c in MANIFEST_COLUMNS} + missing_tgt_audios = [] + src_audios = list(args.source_dir.glob(f"{split}/*.wav")) + for src_audio in tqdm(src_audios): + sample_id = src_audio.stem + + if sample_id not in target_unit_data: + missing_tgt_audios.append(sample_id) + continue + + src_n_frames = sf.info(src_audio.as_posix()).frames + manifest["id"].append(sample_id) + manifest["src_audio"].append(src_audio.as_posix()) + manifest["src_n_frames"].append( + src_n_frames // 160 + ) # estimation of 10-ms frame for 16kHz audio + + target_units = process_units(target_unit_data[sample_id], args.reduce_unit) + manifest["tgt_audio"].append(" ".join(target_units)) + manifest["tgt_n_frames"].append(len(target_units)) + + print(f"Processed {len(manifest['id'])} samples") + if len(missing_tgt_audios) > 0: + print( + f"{len(missing_tgt_audios)} with missing target data (first 3 examples: {', '.join(missing_tgt_audios[:3])})" + ) + + out_manifest = args.output_root / f"{split}.tsv" + print(f"Writing manifest to {out_manifest}...") + save_df_to_tsv(pd.DataFrame.from_dict(manifest), out_manifest) + + # Generate config YAML + gen_config_yaml( + args.output_root, + specaugment_policy="lb", + feature_transform=["utterance_cmvn"], + vocoder_type="code_hifigan", + vocoder_checkpoint=args.vocoder_checkpoint, + vocoder_cfg=args.vocoder_cfg, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--source-dir", required=True, type=Path, help="source audio directory" + ) + parser.add_argument( + "--target-dir", required=True, type=Path, help="target audio directory" + ) + parser.add_argument( + "--data-split", + default=["train", "valid", "test"], + nargs="+", + help="data split names", + ) + parser.add_argument( + "--output-root", required=True, type=Path, help="output directory" + ) + parser.add_argument( + "--reduce-unit", + action="store_true", + help="reduce a target unit sequence to a unique unit sequence, i.e. '1 1 1 2 2' -> '1 2'", + ) + parser.add_argument( + "--vocoder-checkpoint", default=None, type=str, help="vocoder checkpoint" + ) + parser.add_argument( + "--vocoder-cfg", default=None, type=str, help="vocoder config file" + ) + + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_to_speech/preprocessing/prep_sn_data.py b/examples/speech_to_speech/preprocessing/prep_sn_data.py new file mode 100644 index 0000000000..ea94175634 --- /dev/null +++ b/examples/speech_to_speech/preprocessing/prep_sn_data.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# Adapted from examples/wav2vec/wav2vec_manifest.py +""" +Data preparation for the speech normalizer +""" + +import argparse +import glob +import os + +import soundfile + +from examples.speech_to_speech.preprocessing.data_utils import load_units, process_units + + +def process(args): + assert ( + args.for_inference or args.target_unit is not None + ), "missing --target-unit or --for-inference" + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + dir_path = os.path.realpath(args.audio_dir) + search_path = os.path.join(dir_path, "**/*." + args.ext) + + if args.target_unit: + unit_data = load_units(args.target_unit) + + with open(os.path.join(args.output_dir, f"{args.data_name}.tsv"), "w") as o_t, open( + os.path.join(args.output_dir, f"{args.data_name}.unit"), "w" + ) as o_u: + print(dir_path, file=o_t) + for fname in glob.iglob(search_path, recursive=True): + file_path = os.path.realpath(fname) + frames = soundfile.info(fname).frames + print( + "{}\t{}".format(os.path.relpath(file_path, dir_path), frames), file=o_t + ) + + if args.for_inference: + print("0", file=o_u) + else: + sample_id = os.path.basename(file_path)[: -len(args.ext) - 1] + assert ( + sample_id in unit_data + ), f'{fname} does not have unit data in {args.target_unit}. Expecting sample_id "{sample_id}".' + target_units = process_units(unit_data[sample_id], reduce=True) + print(" ".join(target_units), file=o_u) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--audio-dir", required=True, type=str, help="audio directory") + parser.add_argument("--ext", default="flac", type=str, help="audio extension") + parser.add_argument( + "--data-name", + required=True, + type=str, + help="dataset name", + ) + parser.add_argument( + "--output-dir", required=True, type=str, help="output directory" + ) + parser.add_argument( + "--for-inference", + action="store_true", + help="set this if preparing data for running inference with a speech normalizer", + ) + parser.add_argument( + "--target-unit", + default=None, + type=str, + help="a file containing unit sequences in the format: sample_id|u1 u2 ...", + ) + + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_to_speech/preprocessing/prep_sn_output_data.py b/examples/speech_to_speech/preprocessing/prep_sn_output_data.py new file mode 100644 index 0000000000..06991343bd --- /dev/null +++ b/examples/speech_to_speech/preprocessing/prep_sn_output_data.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
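+
+"""Post-process speech normalizer output: align the decoded unit hypotheses
+(--in-unit, e.g. hypo.units) with the rows of the input manifest (--in-audio)
+and write one "sample_id|unit1 unit2 ..." line per sample to
+<output-root>/<in-audio stem>.txt.
+"""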
+ +import argparse +from pathlib import Path + +from tqdm import tqdm + + +def process(args): + args.output_root.mkdir(exist_ok=True) + + # load units + units = {} + with open(args.in_unit) as f: + for line in f: + unit_seq, utt_id = line.strip().rsplit(" ", 1) + utt_id = int(utt_id[6:-1]) # remove "(None-" + units[utt_id] = unit_seq + + with open(args.in_audio) as f, open( + args.output_root / f"{args.in_audio.stem}.txt", "w" + ) as o: + f.readline() + for i, line in enumerate(tqdm(f.readlines())): + audio, _ = line.strip().split("\t", 1) + sample_id = Path(audio).stem + o.write(f"{sample_id}|{units[i]}\n") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--in-unit", + required=True, + type=Path, + help="unit file (output from the speech normalizer)", + ) + parser.add_argument( + "--in-audio", + required=True, + type=Path, + help="tsv file (input to the normalizer)", + ) + parser.add_argument( + "--output-root", required=True, type=Path, help="output directory" + ) + + args = parser.parse_args() + + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_to_speech/unity/__init__.py b/examples/speech_to_speech/unity/__init__.py new file mode 100644 index 0000000000..349db7c65e --- /dev/null +++ b/examples/speech_to_speech/unity/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import sequence_generator # noqa +from . import sequence_generator_multi_decoder # noqa diff --git a/examples/speech_to_speech/unity/sequence_generator.py b/examples/speech_to_speech/unity/sequence_generator.py new file mode 100644 index 0000000000..c482098feb --- /dev/null +++ b/examples/speech_to_speech/unity/sequence_generator.py @@ -0,0 +1,626 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import sys +from typing import Dict, List, Optional + +import torch +from torch import Tensor + +from fairseq.sequence_generator import EnsembleModel as EnsembleModelBase +from fairseq.sequence_generator import SequenceGenerator as SequenceGeneratorBase + + +class SequenceGenerator(SequenceGeneratorBase): + def __init__( + self, + models, + tgt_dict, + beam_size=1, + max_len_a=0, + max_len_b=200, + max_len=0, + min_len=1, + normalize_scores=True, + len_penalty=1.0, + unk_penalty=0.0, + temperature=1.0, + match_source_len=False, + no_repeat_ngram_size=0, + search_strategy=None, + eos=None, + symbols_to_strip_from_output=None, + lm_model=None, + lm_weight=1.0, + tokens_to_suppress=(), + ): + """Generates translations of a given source sentence. 
+ + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models, + currently support fairseq.models.TransformerModel for scripting + beam_size (int, optional): beam width (default: 1) + max_len_a/b (int, optional): generate sequences of maximum length + ax + b, where x is the source length + max_len (int, optional): the maximum length of the generated output + (not including end-of-sentence) + min_len (int, optional): the minimum length of the generated output + (not including end-of-sentence) + normalize_scores (bool, optional): normalize scores by the length + of the output (default: True) + len_penalty (float, optional): length penalty, where <1.0 favors + shorter, >1.0 favors longer sentences (default: 1.0) + unk_penalty (float, optional): unknown word penalty, where <0 + produces more unks, >0 produces fewer (default: 0.0) + temperature (float, optional): temperature, where values + >1.0 produce more uniform samples and values <1.0 produce + sharper samples (default: 1.0) + match_source_len (bool, optional): outputs should match the source + length (default: False) + """ + super().__init__( + models=models, + tgt_dict=tgt_dict, + beam_size=beam_size, + max_len_a=max_len_a, + max_len_b=max_len_b, + max_len=max_len, + min_len=min_len, + normalize_scores=normalize_scores, + len_penalty=len_penalty, + unk_penalty=unk_penalty, + temperature=temperature, + match_source_len=match_source_len, + no_repeat_ngram_size=no_repeat_ngram_size, + search_strategy=search_strategy, + eos=eos, + symbols_to_strip_from_output=symbols_to_strip_from_output, + lm_model=lm_model, + lm_weight=lm_weight, + tokens_to_suppress=tokens_to_suppress, + ) + + if isinstance(models, EnsembleModel): + self.model = models + else: + self.model = EnsembleModel(models) + + self.model.set_decoder_beam_size(self.beam_size) + self.model.eval() + + def _generate( + self, + sample: Dict[str, Dict[str, Tensor]], + prefix_tokens: Optional[Tensor] = None, + constraints: Optional[Tensor] = None, + bos_token: Optional[int] = None, + ): + net_input = sample["net_input"] + + if "src_tokens" in net_input: + src_tokens = net_input["src_tokens"] + # length of the source text being the character length except EndOfSentence and pad + # if src_lengths exists in net_input (speech_to_text dataset case), then use it + if "src_lengths" in net_input: + src_lengths = net_input["src_lengths"] + else: + src_lengths = ( + (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)) + .long() + .sum(dim=1) + ) + elif "source" in net_input: + src_tokens = net_input["source"] + src_lengths = ( + net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) + if net_input["padding_mask"] is not None + else torch.tensor(src_tokens.size(-1)).to(src_tokens) + ) + elif "features" in net_input: + src_tokens = net_input["features"] + src_lengths = ( + net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) + if net_input["padding_mask"] is not None + else torch.tensor(src_tokens.size(-1)).to(src_tokens) + ) + else: + raise Exception( + "expected src_tokens or source in net input. 
input keys: " + + str(net_input.keys()) + ) + + if constraints is not None and not self.search.supports_constraints: + raise NotImplementedError( + "Target-side constraints were provided, but search method doesn't support them" + ) + + # Initialize constraints, when active + self.search.init_constraints(constraints, self.beam_size) + + # compute the encoder output for each beam + with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"): + encoder_outs = self.model.forward_encoder(net_input) + + finalized = self.generate_decoder( + encoder_outs, + src_tokens, + src_lengths, + sample, + prefix_tokens, + constraints, + bos_token, + ) + return finalized + + def generate_decoder( + self, + encoder_outs, + src_tokens, + src_lengths, + sample: Dict[str, Dict[str, Tensor]], + prefix_tokens: Optional[Tensor] = None, + constraints: Optional[Tensor] = None, + bos_token: Optional[int] = None, + aux_task_name="", + encoder_outs_aug: Optional[ + Tensor + ] = None, # an additional/augmented encoder_outs + ): + incremental_states = torch.jit.annotate( + List[Dict[str, Dict[str, Optional[Tensor]]]], + [ + torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) + for i in range(self.model.models_size) + ], + ) + + # bsz: total number of sentences in beam + # Note that src_tokens may have more than 2 dimensions (i.e. audio features) + bsz, src_len = src_tokens.size()[:2] + beam_size = self.beam_size + + decoder_name = f"{aux_task_name}_decoder" if aux_task_name else "decoder" + + max_len: int = -1 + if self.match_source_len: + max_len = src_lengths.max().item() + else: + max_len = min( + int(self.max_len_a * src_len + self.max_len_b), + self.max_len - 1, + ) + assert ( + self.min_len <= max_len + ), "min_len cannot be larger than max_len, please adjust these!" + + # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores + new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) + new_order = new_order.to(src_tokens.device).long() + encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) + # ensure encoder_outs is a List. + assert encoder_outs is not None + if encoder_outs_aug is not None: + encoder_outs_aug = self.model.reorder_encoder_out( + encoder_outs_aug, new_order + ) + + # initialize buffers + scores = ( + torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() + ) # +1 for eos; pad is never chosen for scoring + tokens = ( + torch.zeros(bsz * beam_size, max_len + 2) + .to(src_tokens) + .long() + .fill_(self.pad) + ) # +2 for eos and pad + tokens[:, 0] = self.eos if bos_token is None else bos_token + attn: Optional[Tensor] = None + + # A list that indicates candidates that should be ignored. + # For example, suppose we're sampling and have already finalized 2/5 + # samples. Then cands_to_ignore would mark 2 positions as being ignored, + # so that we only finalize the remaining 3 samples. 
+ cands_to_ignore = ( + torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) + ) # forward and backward-compatible False mask + + # list of completed sentences + finalized = torch.jit.annotate( + List[List[Dict[str, Tensor]]], + [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], + ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step + + # a boolean array indicating if the sentence at the index is finished or not + finished = [False for i in range(bsz)] + num_remaining_sent = bsz # number of sentences remaining + + # number of candidate hypos per step + cand_size = 2 * beam_size # 2 x beam size in case half are EOS + + # offset arrays for converting between different indexing schemes + bbsz_offsets = ( + (torch.arange(0, bsz) * beam_size) + .unsqueeze(1) + .type_as(tokens) + .to(src_tokens.device) + ) + cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device) + + reorder_state: Optional[Tensor] = None + batch_idxs: Optional[Tensor] = None + + original_batch_idxs: Optional[Tensor] = None + if "id" in sample and isinstance(sample["id"], Tensor): + original_batch_idxs = sample["id"] + else: + original_batch_idxs = torch.arange(0, bsz).type_as(tokens) + + for step in range(max_len + 1): # one extra step for EOS marker + # reorder decoder internal states based on the prev choice of beams + if reorder_state is not None: + if batch_idxs is not None: + # update beam indices to take into account removed sentences + corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as( + batch_idxs + ) + reorder_state.view(-1, beam_size).add_( + corr.unsqueeze(-1) * beam_size + ) + original_batch_idxs = original_batch_idxs[batch_idxs] + self.model.reorder_incremental_state( + incremental_states, reorder_state, decoder_name + ) + encoder_outs = self.model.reorder_encoder_out( + encoder_outs, reorder_state + ) + if encoder_outs_aug is not None: + encoder_outs_aug = self.model.reorder_encoder_out( + encoder_outs_aug, reorder_state + ) + with torch.autograd.profiler.record_function( + "EnsembleModel: forward_decoder" + ): + lprobs, avg_attn_scores = self.model.forward_decoder( + tokens[:, : step + 1], + encoder_outs, + incremental_states, + self.temperature, + decoder_name=decoder_name, + encoder_outs_aug=encoder_outs_aug, + ) + + if self.lm_model is not None and not aux_task_name: + lm_out = self.lm_model(tokens[:, : step + 1]) + probs = self.lm_model.get_normalized_probs( + lm_out, log_probs=True, sample=None + ) + probs = probs[:, -1, :] * self.lm_weight + lprobs += probs + + lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs) + + lprobs[:, self.pad] = -math.inf # never select pad + lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty + + # handle max length constraint + if step >= max_len: + lprobs[:, : self.eos] = -math.inf + lprobs[:, self.eos + 1 :] = -math.inf + + # handle prefix tokens (possibly with different lengths) + if ( + prefix_tokens is not None + and step < prefix_tokens.size(1) + and step < max_len + ): + lprobs, tokens, scores = self._prefix_tokens( + step, lprobs, scores, tokens, prefix_tokens, beam_size + ) + else: + if step < self.min_len: + # minimum length constraint (does not apply if using prefix_tokens) + lprobs[:, self.eos] = -math.inf + + if self.token_indices_to_suppress is not None: + lprobs[:, self.token_indices_to_suppress] = -math.inf + + # Record attention scores, only support avg_attn_scores is a Tensor + if avg_attn_scores is not None: + if attn is None: + attn = 
torch.empty( + bsz * beam_size, avg_attn_scores.size(1), max_len + 2 + ).to(scores) + attn[:, :, step + 1].copy_(avg_attn_scores) + + scores = scores.type_as(lprobs) + eos_bbsz_idx = torch.empty(0).to( + tokens + ) # indices of hypothesis ending with eos (finished sentences) + eos_scores = torch.empty(0).to( + scores + ) # scores of hypothesis ending with eos (finished sentences) + + if self.should_set_src_lengths: + self.search.set_src_lengths(src_lengths) + + if self.repeat_ngram_blocker is not None: + lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step) + + # Shape: (batch, cand_size) + cand_scores, cand_indices, cand_beams = self.search.step( + step, + lprobs.view(bsz, -1, self.vocab_size), + scores.view(bsz, beam_size, -1)[:, :, :step], + tokens[:, : step + 1], + original_batch_idxs, + ) + + # cand_bbsz_idx contains beam indices for the top candidate + # hypotheses, with a range of values: [0, bsz*beam_size), + # and dimensions: [bsz, cand_size] + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + + # finalize hypotheses that end in eos + # Shape of eos_mask: (batch size, beam size) + eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) + eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) + + # only consider eos when it's among the top beam_size indices + # Now we know what beam item(s) to finish + # Shape: 1d list of absolute-numbered + eos_bbsz_idx = torch.masked_select( + cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + + finalized_sents: List[int] = [] + if eos_bbsz_idx.numel() > 0: + eos_scores = torch.masked_select( + cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + + finalized_sents = self.finalize_hypos( + step, + eos_bbsz_idx, + eos_scores, + tokens, + scores, + finalized, + finished, + beam_size, + attn, + src_lengths, + max_len, + ) + num_remaining_sent -= len(finalized_sents) + + assert num_remaining_sent >= 0 + if num_remaining_sent == 0: + break + if self.search.stop_on_max_len and step >= max_len: + break + assert step < max_len, f"{step} < {max_len}" + + # Remove finalized sentences (ones for which {beam_size} + # finished hypotheses have been generated) from the batch. 
+ if len(finalized_sents) > 0: + new_bsz = bsz - len(finalized_sents) + + # construct batch_idxs which holds indices of batches to keep for the next pass + batch_mask = torch.ones( + bsz, dtype=torch.bool, device=cand_indices.device + ) + batch_mask[finalized_sents] = False + # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it + batch_idxs = torch.arange( + bsz, device=cand_indices.device + ).masked_select(batch_mask) + + # Choose the subset of the hypothesized constraints that will continue + self.search.prune_sentences(batch_idxs) + + eos_mask = eos_mask[batch_idxs] + cand_beams = cand_beams[batch_idxs] + bbsz_offsets.resize_(new_bsz, 1) + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + cand_scores = cand_scores[batch_idxs] + cand_indices = cand_indices[batch_idxs] + + if prefix_tokens is not None: + prefix_tokens = prefix_tokens[batch_idxs] + src_lengths = src_lengths[batch_idxs] + cands_to_ignore = cands_to_ignore[batch_idxs] + + scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + if attn is not None: + attn = attn.view(bsz, -1)[batch_idxs].view( + new_bsz * beam_size, attn.size(1), -1 + ) + bsz = new_bsz + else: + batch_idxs = None + + # Set active_mask so that values > cand_size indicate eos hypos + # and values < cand_size indicate candidate active hypos. + # After, the min values per row are the top candidate active hypos + + # Rewrite the operator since the element wise or is not supported in torchscript. + + eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) + active_mask = torch.add( + eos_mask.type_as(cand_offsets) * cand_size, + cand_offsets[: eos_mask.size(1)], + ) + + # get the top beam_size active hypotheses, which are just + # the hypos with the smallest values in active_mask. + # {active_hypos} indicates which {beam_size} hypotheses + # from the list of {2 * beam_size} candidates were + # selected. Shapes: (batch size, beam size) + new_cands_to_ignore, active_hypos = torch.topk( + active_mask, k=beam_size, dim=1, largest=False + ) + + # update cands_to_ignore to ignore any finalized hypos. + cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] + # Make sure there is at least one active item for each sentence in the batch. + assert (~cands_to_ignore).any(dim=1).all() + + # update cands_to_ignore to ignore any finalized hypos + + # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam + # can be selected more than once). 
+ active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) + active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) + + active_bbsz_idx = active_bbsz_idx.view(-1) + active_scores = active_scores.view(-1) + + # copy tokens and scores for active hypotheses + + # Set the tokens for each beam (can select the same row more than once) + tokens[:, : step + 1] = torch.index_select( + tokens[:, : step + 1], dim=0, index=active_bbsz_idx + ) + # Select the next token for each of them + tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather( + cand_indices, dim=1, index=active_hypos + ) + if step > 0: + scores[:, :step] = torch.index_select( + scores[:, :step], dim=0, index=active_bbsz_idx + ) + scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather( + cand_scores, dim=1, index=active_hypos + ) + + # Update constraints based on which candidates were selected for the next beam + self.search.update_constraints(active_hypos) + + # copy attention for active hypotheses + if attn is not None: + attn[:, :, : step + 2] = torch.index_select( + attn[:, :, : step + 2], dim=0, index=active_bbsz_idx + ) + + # reorder incremental state in decoder + reorder_state = active_bbsz_idx + + # sort by score descending + for sent in range(len(finalized)): + scores = torch.tensor( + [float(elem["score"].item()) for elem in finalized[sent]] + ) + _, sorted_scores_indices = torch.sort(scores, descending=True) + finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] + finalized[sent] = torch.jit.annotate( + List[Dict[str, Tensor]], finalized[sent] + ) + return finalized + + +class EnsembleModel(EnsembleModelBase): + """A wrapper around an ensemble of models.""" + + def __init__(self, models): + super().__init__(models) + + @torch.jit.export + def forward_decoder( + self, + tokens, + encoder_outs: List[Dict[str, List[Tensor]]], + incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], + temperature: float = 1.0, + decoder_name="decoder", + encoder_outs_aug: List[Dict[str, List[Tensor]]] = None, + ): + log_probs = [] + avg_attn: Optional[Tensor] = None + encoder_out: Optional[Dict[str, List[Tensor]]] = None + encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None + for i, model in enumerate(self.models): + if self.has_encoder(): + encoder_out = encoder_outs[i] + if encoder_outs_aug is not None: + encoder_out_aug = encoder_outs_aug[i] + # decode each model + if self.has_incremental_states(): + if encoder_out_aug is not None: + decoder_out = getattr(model, decoder_name).forward( + tokens, + encoder_out=encoder_out, + encoder_out_aug=encoder_out_aug, + incremental_state=incremental_states[i], + ) + else: + decoder_out = getattr(model, decoder_name).forward( + tokens, + encoder_out=encoder_out, + incremental_state=incremental_states[i], + ) + else: + if hasattr(model, decoder_name): + decoder_out = getattr(model, decoder_name).forward( + tokens, encoder_out=encoder_out + ) + else: + decoder_out = model.forward(tokens) + + attn: Optional[Tensor] = None + decoder_len = len(decoder_out) + if decoder_len > 1 and decoder_out[1] is not None: + if isinstance(decoder_out[1], Tensor): + attn = decoder_out[1] + else: + attn_holder = decoder_out[1]["attn"] + if isinstance(attn_holder, Tensor): + attn = attn_holder + elif attn_holder is not None: + attn = attn_holder[0] + if attn is not None: + attn = attn[:, -1, :] + + decoder_out_tuple = ( + decoder_out[0][:, -1:, :].div_(temperature), + None if decoder_len <= 1 else decoder_out[1], + ) + probs = getattr(model, 
decoder_name).get_normalized_probs( + decoder_out_tuple, log_probs=True, sample=None + ) + probs = probs[:, -1, :] + if self.models_size == 1: + return probs, attn + + log_probs.append(probs) + if attn is not None: + if avg_attn is None: + avg_attn = attn + else: + avg_attn.add_(attn) + + avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log( + self.models_size + ) + + if avg_attn is not None: + avg_attn.div_(self.models_size) + return avg_probs, avg_attn + + @torch.jit.export + def reorder_incremental_state( + self, + incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], + new_order, + decoder_name="decoder", + ): + if not self.has_incremental_states(): + return + for i, model in enumerate(self.models): + getattr(model, decoder_name).reorder_incremental_state_scripting( + incremental_states[i], new_order + ) diff --git a/examples/speech_to_speech/unity/sequence_generator_multi_decoder.py b/examples/speech_to_speech/unity/sequence_generator_multi_decoder.py new file mode 100644 index 0000000000..af99a960b8 --- /dev/null +++ b/examples/speech_to_speech/unity/sequence_generator_multi_decoder.py @@ -0,0 +1,267 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import search + + +class MultiDecoderSequenceGenerator(nn.Module): + def __init__( + self, + models, + tgt_dict, + tgt_dict_mt, + beam_size=1, + beam_size_mt=1, + max_len_a=0, + max_len_b=200, + max_len_a_mt=0, + max_len_b_mt=200, + max_len=0, + min_len=1, + normalize_scores=True, + len_penalty=1.0, + len_penalty_mt=1.0, + unk_penalty=0.0, + temperature=1.0, + match_source_len=False, + no_repeat_ngram_size=0, + eos=None, + eos_mt=None, + symbols_to_strip_from_output=None, + lm_model=None, + lm_weight=1.0, + ): + """Generates translations of a given source sentence. 
+ + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models, + currently support fairseq.models.TransformerModel for scripting + beam_size (int, optional): beam width (default: 1) + max_len_a/b (int, optional): generate sequences of maximum length + ax + b, where x is the source length for the second pass + max_len_a_mt/b_mt (int, optional): generate sequences of maximum length + ax + b, where x is the source length for the first pass + max_len (int, optional): the maximum length of the generated output + (not including end-of-sentence) + min_len (int, optional): the minimum length of the generated output + (not including end-of-sentence) + normalize_scores (bool, optional): normalize scores by the length + of the output (default: True) + len_penalty (float, optional): length penalty in the second pass, where <1.0 favors + shorter, >1.0 favors longer sentences (default: 1.0) + len_penalty (float, optional): length penalty in the first pass, where <1.0 favors + shorter, >1.0 favors longer sentences (default: 1.0) + unk_penalty (float, optional): unknown word penalty, where <0 + produces more unks, >0 produces fewer (default: 0.0) + temperature (float, optional): temperature, where values + >1.0 produce more uniform samples and values <1.0 produce + sharper samples (default: 1.0) + match_source_len (bool, optional): outputs should match the source + length (default: False) + """ + super().__init__() + + from examples.speech_to_speech.unity.sequence_generator import SequenceGenerator + + self.generator = SequenceGenerator( + models, + tgt_dict, + beam_size=beam_size, + max_len_a=max_len_a, + max_len_b=max_len_b, + max_len=max_len, + min_len=min_len, + normalize_scores=normalize_scores, + len_penalty=len_penalty, + unk_penalty=unk_penalty, + temperature=temperature, + match_source_len=match_source_len, + no_repeat_ngram_size=no_repeat_ngram_size, + search_strategy=search.BeamSearch(tgt_dict), + eos=eos, + symbols_to_strip_from_output=symbols_to_strip_from_output, + lm_model=lm_model, + lm_weight=lm_weight, + ) + self.eos = self.generator.eos + + self.generator_mt = SequenceGenerator( + models, + tgt_dict_mt, + beam_size=beam_size_mt, + max_len_a=max_len_a_mt, + max_len_b=max_len_b_mt, + max_len=max_len, + min_len=min_len, + normalize_scores=normalize_scores, + len_penalty=len_penalty_mt, + unk_penalty=unk_penalty, + temperature=temperature, + match_source_len=match_source_len, + no_repeat_ngram_size=no_repeat_ngram_size, + search_strategy=search.BeamSearch(tgt_dict_mt), + eos=eos_mt, + symbols_to_strip_from_output=symbols_to_strip_from_output, + ) + + @torch.no_grad() + def generate( + self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs + ) -> List[List[Dict[str, Tensor]]]: + """Generate translations. Match the api of other fairseq generators. 
+ + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models + sample (dict): batch + prefix_tokens (torch.LongTensor, optional): force decoder to begin + with these tokens + constraints (torch.LongTensor, optional): force decoder to include + the list of constraints + bos_token (int, optional): beginning of sentence token + (default: self.eos) + """ + return self._generate(sample, **kwargs) + + def _generate( + self, + sample: Dict[str, Dict[str, Tensor]], + prefix_tokens: Optional[Tensor] = None, + constraints: Optional[Tensor] = None, + bos_token: Optional[int] = None, + ): + net_input = sample["net_input"] + + if "src_tokens" in net_input: + src_tokens = net_input["src_tokens"] + # length of the source text being the character length except EndOfSentence and pad + # if src_lengths exists in net_input (speech_to_text dataset case), then use it + if "src_lengths" in net_input: + src_lengths = net_input["src_lengths"] + else: + src_lengths = ( + ( + src_tokens.ne(self.generator.eos) + & src_tokens.ne(self.generator.pad) + ) + .long() + .sum(dim=1) + ) + else: + raise Exception( + "expected src_tokens or source in net input. input keys: " + + str(net_input.keys()) + ) + + if constraints is not None and not self.generator.search.supports_constraints: + raise NotImplementedError( + "Target-side constraints were provided, but search method doesn't support them" + ) + + # Initialize constraints, when active + self.generator.search.init_constraints(constraints, self.generator.beam_size) + self.generator_mt.search.init_constraints( + constraints, self.generator_mt.beam_size + ) + + # compute the encoder output for each beam + with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"): + encoder_outs = self.generator.model.forward_encoder(net_input) + + single_model = self.generator.model.single_model + mt_decoder = getattr(single_model, f"{single_model.mt_task_name}_decoder") + + # 1. MT decoder + finalized_mt = self.generator_mt.generate_decoder( + encoder_outs, + src_tokens, + src_lengths, + sample, + prefix_tokens, + constraints, + bos_token, + aux_task_name=single_model.mt_task_name, + ) + + # extract decoder output corresponding to the best hypothesis + max_tgt_len = max([len(hypo[0]["tokens"]) for hypo in finalized_mt]) + prev_output_tokens_mt = ( + src_tokens.new_zeros(src_tokens.shape[0], max_tgt_len) + .fill_(mt_decoder.padding_idx) + .int() + ) # B x T + for i, hypo in enumerate(finalized_mt): + i_beam = 0 + tmp = hypo[i_beam]["tokens"].int() # hyp + eos + prev_output_tokens_mt[i, 0] = self.generator_mt.eos + if tmp[-1] == self.generator_mt.eos: + tmp = tmp[:-1] + prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp + + text = "".join([self.generator_mt.tgt_dict[c] for c in tmp]) + text = text.replace("_", " ") + text = text.replace("▁", " ") + text = text.replace("<unk>", " ") + text = text.replace("<s>", "") + text = text.replace("</s>", "") + if len(text) > 0 and text[0] == " ": + text = text[1:] + sample_id = sample["id"].tolist()[i] + print("{} (None-{})".format(text, sample_id)) + + x = mt_decoder( + prev_output_tokens_mt, + encoder_out=encoder_outs[0], + features_only=True, + )[0].transpose(0, 1) + + if getattr(single_model, "proj", None) is not None: + x = single_model.proj(x) + + mt_decoder_padding_mask = None + if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any(): + mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx) + + # 2. 
T2U encoder + if getattr(single_model, "synthesizer_encoder", None) is not None: + t2u_encoder_out = single_model.synthesizer_encoder( + x, + mt_decoder_padding_mask, + ) + else: + t2u_encoder_out = { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [mt_decoder_padding_mask] + if mt_decoder_padding_mask is not None + else [], # B x T + "encoder_embedding": [], + "encoder_states": [], + "src_tokens": [], + "src_lengths": [], + } + + if getattr(single_model, "t2u_augmented_cross_attn", False): + encoder_outs_aug = [t2u_encoder_out] + else: + encoder_outs = [t2u_encoder_out] + encoder_outs_aug = None + + # 3. T2U decoder + finalized = self.generator.generate_decoder( + encoder_outs, + src_tokens, + src_lengths, + sample, + prefix_tokens, + constraints, + bos_token, + encoder_outs_aug=encoder_outs_aug, + ) + return finalized diff --git a/examples/speech_to_text/README.md b/examples/speech_to_text/README.md index 4030af0144..f639d300d3 100644 --- a/examples/speech_to_text/README.md +++ b/examples/speech_to_text/README.md @@ -1,8 +1,8 @@ # Speech-to-Text (S2T) Modeling -[https://arxiv.org/abs/2010.05171](https://arxiv.org/abs/2010.05171) +[https://www.aclweb.org/anthology/2020.aacl-demo.6](https://www.aclweb.org/anthology/2020.aacl-demo.6.pdf) -Examples for speech recognition (ASR) and speech-to-text translation (ST) with fairseq. +Speech recognition (ASR) and speech-to-text translation (ST) with fairseq. ## Data Preparation S2T modeling data consists of source speech features, target text and other optional information @@ -19,241 +19,44 @@ Fairseq S2T also employs a YAML file for data related configurations: tokenizer for the target text, feature transforms such as CMVN (cepstral mean and variance normalization) and SpecAugment, temperature-based resampling, etc. -## Model Training & Evaluation -Fairseq S2T uses the unified `fairseq-train`/`fairseq-generate` interface for model training and evaluation. -It requires arguments `--task speech_to_text` and `--arch <arch in fairseq.models.speech_to_text.*>`. +## Model Training +Fairseq S2T uses the unified `fairseq-train` interface for model training. It requires arguments `--task speech_to_text`, + `--arch <model architecture in fairseq.models.speech_to_text.*>` and `--config-yaml <config YAML filename>`. +## Inference & Evaluation +Fairseq S2T uses the unified `fairseq-generate`/`fairseq-interactive` interface for inference and evaluation. It +requires arguments `--task speech_to_text` and `--config-yaml <config YAML filename>`. The interactive console takes +audio paths (one per line) as inputs. -## Example 1: Speech Recognition (ASR) on LibriSpeech -#### Data preparation -Download and preprocess [LibriSpeech](https://www.danielpovey.com/files/2015_icassp_librispeech.pdf) data with -```bash -python examples/speech_to_text/prep_librispeech_data.py --output-root ${LS_ROOT} --vocab-type unigram --vocab-size 10000 -``` -where `LS_ROOT` is the root path for downloaded data as well as generated manifest and feature files. - -#### Training -```bash -fairseq-train ${LS_ROOT} --train-subset train --valid-subset dev --save-dir ${SAVE_DIR} --num-workers 4 \ - --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy --max-update 300000 \ - --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \ - --clip-norm 10.0 --seed 1 --update-freq 8 -``` -where `SAVE_DIR` is the checkpoint root path. Here we use `--arch s2t_transformer_s` (31M parameters) as example. 
-You may switch to `s2t_transformer_m` (71M) or `s2t_transformer_l` (268M) for better performance. We set -`--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. - -#### Inference & Evaluation -Average the last 10 checkpoints and evaluate on the 4 splits -(`dev-clean`, `dev-other`, `test-clean` and `test-other`): -```bash -CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt -python scripts/average_checkpoints.py --inputs ${SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${SAVE_DIR}/${CHECKPOINT_FILENAME}" -for SUBSET in dev-clean dev-other test-clean test-other; do - fairseq-generate ${LS_ROOT} --gen-subset ${SUBSET} --task speech_to_text \ - --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring wer -done -``` - -#### Result - -| --arch | Params | dev-clean | dev-other | test-clean | test-other | -|---|---|---|---|---|---| -| s2t_transformer_s | 30M | 4.1 | 9.3 | 4.4 | 9.2 | -| s2t_transformer_sp | 35M | 3.9 | 9.3 | 4.3 | 8.8 | -| s2t_transformer_m | 71M | 3.5 | 8.1 | 3.7 | 8.1 | -| s2t_transformer_mp | 84M | 3.3 | 7.8 | 3.7 | 8.2 | -| s2t_transformer_l | 268M | 3.3 | 7.7 | 3.5 | 7.8 | -| s2t_transformer_lp | 318M | 3.1 | 7.5 | 3.4 | 7.6 | - - -## Example 2: Speech Translation (ST) on MuST-C +## Examples +- [Speech Recognition (ASR) on LibriSpeech](docs/librispeech_example.md) -#### Data Preparation -[Download](https://ict.fbk.eu/must-c) and unpack [MuST-C](https://www.aclweb.org/anthology/N19-1202) data -to a path `${MUSTC_ROOT}/en-${TARGET_LANG_ID}`, then preprocess it with -```bash -# Generate TSV manifests, features, vocabulary and configuration for each language -python examples/speech_to_text/prep_mustc_data.py --data-root ${MUSTC_ROOT} --task asr \ - --vocab-type unigram --vocab-size 5000 -python examples/speech_to_text/prep_mustc_data.py --data-root ${MUSTC_ROOT} --task st \ - --vocab-type unigram --vocab-size 8000 +- [Speech-to-Text Translation (ST) on MuST-C](docs/mustc_example.md) -# Add vocabulary and configuration for joint data (based on the manifests and features generated above) -python examples/speech_to_text/prep_mustc_data.py --data-root ${MUSTC_ROOT} --task asr --joint \ - --vocab-type unigram --vocab-size 10000 -python examples/speech_to_text/prep_mustc_data.py --data-root ${MUSTC_ROOT} --task st --joint \ - --vocab-type unigram --vocab-size 10000 -``` -The generated files will be available under `${MUSTC_ROOT}/en-${TARGET_LANG_ID}` (per-language data) and -`MUSTC_ROOT` (joint data). 
+- [Speech-to-Text Translation (ST) on CoVoST 2](docs/covost_example.md) -#### ASR -###### Training -ASR data from En-De as example: -```bash -fairseq-train ${MUSTC_ROOT}/en-de --train-subset train_asr --valid-subset dev_asr --save-dir ${ASR_SAVE_DIR} \ - --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \ - --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 1e-3 \ - --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 -``` -Using joint data from all directions: -```bash -fairseq-train ${MUSTC_ROOT} \ - --train-subset train_de_asr,train_nl_asr,train_es_asr,train_fr_asr,train_it_asr,train_pt_asr,train_ro_asr,train_ru_asr \ - --valid-subset dev_de_asr,dev_nl_asr,dev_es_asr,dev_fr_asr,dev_it_asr,dev_pt_asr,dev_ro_asr,dev_ru_asr \ - --save-dir ${JOINT_ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --task speech_to_text --arch s2t_transformer_s \ - --criterion label_smoothed_cross_entropy --report-accuracy --max-update 100000 --optimizer adam --lr 1e-3 \ - --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 -``` -where `ASR_SAVE_DIR` (`JOINT_ASR_SAVE_DIR`) is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs -with 1 GPU. You may want to update it accordingly when using more than 1 GPU. +- [Speech-to-Text Translation (ST) on Multilingual TEDx](docs/mtedx_example.md) +- [Simultaneous Speech-to-Text Translation (SimulST) on MuST-C](docs/simulst_mustc_example.md) -###### Inference & Evaluation -```bash -CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt -python scripts/average_checkpoints.py --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" -fairseq-generate ${MUSTC_ROOT}/en-de --gen-subset tst-COMMON_asr --task speech_to_text \ - --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ - --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct +## Updates +- 02/04/2021: Added interactive decoding (`fairseq-interactive`) support. Examples: + [ASR (LibriSpeech)](docs/librispeech_example.md#interactive-decoding) + and [ST (CoVoST 2)](docs/covost_example.md#interactive-decoding). +- 01/08/2021: Several fixes for S2T Transformer model, inference-time de-tokenization, scorer configuration and data + preparation scripts. We also add pre-trained models to the examples and revise the instructions. + Breaking changes: the data preparation scripts now extract filterbank features without CMVN. CMVN is instead applied + on-the-fly (defined in the config YAML). 
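The update above says CMVN is no longer baked into the extracted filterbank features but applied on the fly through a transform named in the config YAML. As a rough illustration of what per-utterance CMVN amounts to (a minimal sketch, not the fairseq transform itself; the `.npy` path is hypothetical):

```python
import numpy as np

def utterance_cmvn(feats: np.ndarray, norm_means: bool = True, norm_vars: bool = True) -> np.ndarray:
    """Normalize one utterance's features by its own mean/variance statistics."""
    # feats: (num_frames, num_mel_bins), e.g. the 80-dim fbank saved by the prep scripts
    if norm_means:
        feats = feats - feats.mean(axis=0)
    if norm_vars:
        feats = feats / (feats.std(axis=0) + 1e-8)
    return feats

# e.g. feats = utterance_cmvn(np.load("some_utterance.npy"))
```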
-# For models trained on joint data -python scripts/average_checkpoints.py --inputs ${JOINT_ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" -for LANG in de nl es fr it pt ro ru; do - fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_${LANG}_asr --task speech_to_text \ - --path ${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ - --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct -done -``` -###### Result -| Data | --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru | -|---|---|---|---|---|---|---|---|---|---|---| -| Single | s2t_transformer_s | 31M | 18.2 | 17.6 | 17.7 | 17.2 | 17.9 | 19.1 | 18.1 | 17.7 | -| Joint | s2t_transformer_m | 76M | 16.8 | 16.7 | 16.9 | 16.9 | 17.0 | 17.4 | 17.0 | 16.9 | - -#### ST -###### Training -En-De as example: -```bash -fairseq-train ${MUSTC_ROOT}/en-de --train-subset train_st --valid-subset dev_st --save-dir ${ST_SAVE_DIR} \ - --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \ - --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 2e-3 \ - --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ - --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} -``` -Example for multilingual models: -```bash -fairseq-train ${MUSTC_ROOT} \ - --train-subset train_de_st,train_nl_st,train_es_st,train_fr_st,train_it_st,train_pt_st,train_ro_st,train_ru_st \ - --valid-subset dev_de_st,dev_nl_st,dev_es_st,dev_fr_st,dev_it_st,dev_pt_st,dev_ro_st,dev_ru_st \ - --save-dir ${MULTILINGUAL_ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --task speech_to_text \ - --arch s2t_transformer_s --criterion label_smoothed_cross_entropy --report-accuracy --ignore-prefix-size 1 \ - --max-update 100000 --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 \ - --seed 1 --update-freq 8 --load-pretrained-encoder-from ${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} -``` -where `ST_SAVE_DIR` (`MULTILINGUAL_ST_SAVE_DIR`) is the checkpoint root path. The ST encoder is pre-trained by ASR -for faster training and better performance: `--load-pretrained-encoder-from <(JOINT_)ASR checkpoint path>`. We set -`--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. -For multilingual models, we prepend target language ID token as target BOS, which should be excluded from -the training loss via `--ignore-prefix-size 1`. 
- -###### Inference & Evaluation -Average the last 10 checkpoints and evaluate on the `tst-COMMON` split: -```bash -CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt -python scripts/average_checkpoints.py --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" -fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_st --task speech_to_text \ - --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu - -# For multilingual models -python scripts/average_checkpoints.py --inputs ${MULTILINGUAL_ST_SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" -for LANG in de nl es fr it pt ro ru; do - fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_${LANG}_st --task speech_to_text --prefix-size 1 \ - --path ${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu -done -``` -For multilingual models, we force decoding from the target language ID token (as BOS) via `--prefix-size 1`. - -###### Result -| Data | --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru | -|---|---|---|---|---|---|---|---|---|---|---| -| Bilingual | s2t_transformer_s | 31M | 22.7 | 27.3 | 27.2 | 32.9 | 22.7 | 28.1 | 21.9 | 15.3 | -| Multilingual | s2t_transformer_m | 76M | 24.5 | 28.6 | 28.2 | 34.9 | 24.6 | 31.1 | 23.8 | 16.0 | - - -## Example 3: ST on CoVoST -We replicate the experiments in -[CoVoST 2 and Massively Multilingual Speech-to-Text Translation (Wang et al., 2020)](https://arxiv.org/abs/2007.10310). - -#### Data Preparation -Download and preprocess [CoVoST (version 2)](https://arxiv.org/abs/2007.10310) data with -```bash -# En ASR -python examples/speech_to_text/prep_covost_data.py --data-root ${COVOST_ROOT} --vocab-type char --src-lang en -# ST -python examples/speech_to_text/prep_covost_data.py --data-root ${COVOST_ROOT} --vocab-type char \ - --src-lang fr --tgt-lang en -``` -where `COVOST_ROOT` is the root path for downloaded data as well as generated manifest and feature files. - -#### ASR -###### Training -```bash -fairseq-train ${COVOST_ROOT} --train-subset train_asr --valid-subset dev_asr --save-dir ${ASR_SAVE_DIR} \ - --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \ - --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 1e-3 \ - --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 -``` -where `ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. -You may want to update it accordingly when using more than 1 GPU. 
- -###### Inference & Evaluation -```bash -CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt -python scripts/average_checkpoints.py --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" -fairseq-generate ${COVOST_ROOT} --gen-subset test_asr_en --task speech_to_text \ - --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ - --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct -``` -###### Result -| --arch | Params | En | -|---|---|---| -| s2t_transformer_s | 31M | 25.6 | - -#### ST -###### Training -```bash -fairseq-train ${COVOST_ROOT} --train-subset train_st_fr_en --valid-subset dev_st_fr_en --save-dir ${ST_SAVE_DIR} \ - --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \ - --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 2e-3 \ - --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ - --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} -``` -where `ST_SAVE_DIR` is the checkpoint root path. The ST encoder is pre-trained by En ASR for faster training and better -performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. -You may want to update it accordingly when using more than 1 GPU. - -###### Inference & Evaluation -Average the last 10 checkpoints and evaluate on test split: -```bash -CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt -python scripts/average_checkpoints.py --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ - --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" -fairseq-generate ${COVOST_ROOT} --gen-subset test_st_fr_en --task speech_to_text \ - --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu -``` - -###### Result -| --arch | Params | Fr-En | De-En | Es-En | Ca-En | En-De | En-Ca | En-Fa | En-Et | -|---|---|---|---|---|---|---|---|---|---| -| s2t_transformer_s | 31M | 26.3 | 17.1 | 23.0 | 18.8 | 16.3 | 21.8 | 13.1 | 13.2 | +## What's Next +- We are migrating the old fairseq [ASR example](../speech_recognition) into this S2T framework and + merging the features from both sides. +- The following papers also base their experiments on fairseq S2T. We are adding more examples for replication. + - [Improving Cross-Lingual Transfer Learning for End-to-End Speech Recognition with Speech Translation (Wang et al., 2020)](https://arxiv.org/abs/2006.05474) + - [Self-Supervised Representations Improve End-to-End Speech Translation (Wu et al., 2020)](https://arxiv.org/abs/2006.12124) + - [Self-Training for End-to-End Speech Translation (Pino et al., 2020)](https://arxiv.org/abs/2006.02490) + - [CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus (Wang et al., 2020)](https://arxiv.org/abs/2002.01320) + - [Harnessing Indirect Training Data for End-to-End Automatic Speech Translation: Tricks of the Trade (Pino et al., 2019)](https://arxiv.org/abs/1909.06515) ## Citation Please cite as: @@ -272,12 +75,3 @@ Please cite as: year = {2019}, } ``` - -## More Paper Code -The following papers also base their experiments on fairseq S2T. We are adding more examples for replication. 
- -- [Improving Cross-Lingual Transfer Learning for End-to-End Speech Recognition with Speech Translation (Wang et al., 2020)](https://arxiv.org/abs/2006.05474) -- [Self-Supervised Representations Improve End-to-End Speech Translation (Wu et al., 2020)](https://arxiv.org/abs/2006.12124) -- [Self-Training for End-to-End Speech Translation (Pino et al., 2020)](https://arxiv.org/abs/2006.02490) -- [CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus (Wang et al., 2020)](https://arxiv.org/abs/2002.01320) -- [Harnessing Indirect Training Data for End-to-End Automatic Speech Translation: Tricks of the Trade (Pino et al., 2019)](https://arxiv.org/abs/1909.06515) diff --git a/examples/speech_to_text/data_utils.py b/examples/speech_to_text/data_utils.py index 083d7316cd..b8648cb2a0 100644 --- a/examples/speech_to_text/data_utils.py +++ b/examples/speech_to_text/data_utils.py @@ -1,23 +1,25 @@ -#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import csv -import os -import os.path as op +from pathlib import Path import zipfile from functools import reduce -from glob import glob from multiprocessing import cpu_count -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Union +import io import numpy as np import pandas as pd import sentencepiece as sp -from fairseq.data.audio.audio_utils import _get_kaldi_fbank, _get_torchaudio_fbank -from fairseq.data.audio.feature_transforms.utterance_cmvn import UtteranceCMVN +from fairseq.data.audio.audio_utils import ( + convert_waveform, _get_kaldi_fbank, _get_torchaudio_fbank, is_npy_data, + is_sf_audio_data +) +import torch +import soundfile as sf from tqdm import tqdm @@ -28,12 +30,13 @@ def gen_vocab( - input_path: str, output_path_prefix: str, model_type="bpe", vocab_size=1000, + input_path: Path, output_path_prefix: Path, model_type="bpe", + vocab_size=1000, special_symbols: Optional[List[str]] = None ): # Train SentencePiece Model arguments = [ - f"--input={input_path}", - f"--model_prefix={output_path_prefix}", + f"--input={input_path.as_posix()}", + f"--model_prefix={output_path_prefix.as_posix()}", f"--model_type={model_type}", f"--vocab_size={vocab_size}", "--character_coverage=1.0", @@ -43,10 +46,13 @@ def gen_vocab( f"--eos_id={EOS_TOKEN_ID}", f"--pad_id={PAD_TOKEN_ID}", ] + if special_symbols is not None: + _special_symbols = ",".join(special_symbols) + arguments.append(f"--user_defined_symbols={_special_symbols}") sp.SentencePieceTrainer.Train(" ".join(arguments)) # Export fairseq dictionary spm = sp.SentencePieceProcessor() - spm.Load(output_path_prefix + ".model") + spm.Load(output_path_prefix.as_posix() + ".model") vocab = {i: spm.IdToPiece(i) for i in range(spm.GetPieceSize())} assert ( vocab.get(UNK_TOKEN_ID) == UNK_TOKEN @@ -59,110 +65,147 @@ def gen_vocab( for i, s in vocab.items() if s not in {UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, PAD_TOKEN} } - with open(output_path_prefix + ".txt", "w") as f_out: + with open(output_path_prefix.as_posix() + ".txt", "w") as f_out: for _, s in sorted(vocab.items(), key=lambda x: x[0]): f_out.write(f"{s} 1\n") def extract_fbank_features( - waveform, - sample_rate, - output_path=None, - n_mel_bins=80, - apply_utterance_cmvn=True, - overwrite=False, + waveform: torch.FloatTensor, + sample_rate: int, + output_path: Optional[Path] = None, + n_mel_bins: int = 80, + overwrite: bool = False, ): - if output_path is not None and 
op.exists(output_path) and not overwrite: + if output_path is not None and output_path.is_file() and not overwrite: return - _waveform = waveform * (2 ** 15) # Kaldi compliance: 16-bit signed integers - _waveform = _waveform.squeeze().numpy() + _waveform, _ = convert_waveform(waveform, sample_rate, to_mono=True) + # Kaldi compliance: 16-bit signed integers + _waveform = _waveform * (2 ** 15) + _waveform = _waveform.numpy() features = _get_kaldi_fbank(_waveform, sample_rate, n_mel_bins) if features is None: features = _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins) if features is None: raise ImportError( - "Please install pyKaldi or torchaudio to enable " - "online filterbank feature extraction" + "Please install pyKaldi or torchaudio to enable fbank feature extraction" ) - if apply_utterance_cmvn: - cmvn = UtteranceCMVN(norm_means=True, norm_vars=True) - features = cmvn(features) if output_path is not None: - np.save(output_path, features) - else: - return features + np.save(output_path.as_posix(), features) + return features -def create_zip(data_root, zip_path): - cwd = os.path.abspath(os.curdir) - os.chdir(data_root) +def create_zip(data_root: Path, zip_path: Path): + paths = list(data_root.glob("*.npy")) + paths.extend(data_root.glob("*.flac")) with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_STORED) as f: - for filename in tqdm(glob("*.npy")): - f.write(filename) - os.chdir(cwd) + for path in tqdm(paths): + f.write(path, arcname=path.name) -def is_npy_data(data: bytes) -> bool: - return data[0] == 147 and data[1] == 78 - - -def get_zip_manifest(zip_root, zip_filename): - zip_path = op.join(zip_root, zip_filename) - with zipfile.ZipFile(zip_path, mode="r") as f: +def get_zip_manifest( + zip_path: Path, zip_root: Optional[Path] = None, is_audio=False +): + _zip_path = Path.joinpath(zip_root or Path(""), zip_path) + with zipfile.ZipFile(_zip_path, mode="r") as f: info = f.infolist() - manifest = {} + paths, lengths = {}, {} for i in tqdm(info): - utt_id = op.splitext(i.filename)[0] + utt_id = Path(i.filename).stem offset, file_size = i.header_offset + 30 + len(i.filename), i.file_size - manifest[utt_id] = f"{zip_filename}:{offset}:{file_size}" - with open(zip_path, "rb") as f: + paths[utt_id] = f"{zip_path.as_posix()}:{offset}:{file_size}" + with open(_zip_path, "rb") as f: f.seek(offset) - data = f.read(file_size) - assert len(data) > 1 and is_npy_data(data) - return manifest + byte_data = f.read(file_size) + assert len(byte_data) > 1 + if is_audio: + assert is_sf_audio_data(byte_data), i + else: + assert is_npy_data(byte_data), i + byte_data_fp = io.BytesIO(byte_data) + if is_audio: + lengths[utt_id] = sf.info(byte_data_fp).frames + else: + lengths[utt_id] = np.load(byte_data_fp).shape[0] + return paths, lengths def gen_config_yaml( - data_root, - spm_filename, - yaml_filename="config.yaml", - specaugment_policy="lb", - prepend_tgt_lang_tag=False, - sampling_alpha=1.0, + manifest_root: Path, + spm_filename: Optional[str] = None, + vocab_name: Optional[str] = None, + yaml_filename: str = "config.yaml", + specaugment_policy: Optional[str] = "lb", + prepend_tgt_lang_tag: bool = False, + sampling_alpha: Optional[float] = None, + input_channels: Optional[int] = 1, + input_feat_per_channel: Optional[int] = 80, + audio_root: str = "", + cmvn_type: str = "utterance", + gcmvn_path: Optional[Path] = None, + extra=None ): - data_root = op.abspath(data_root) - writer = S2TDataConfigWriter(op.join(data_root, yaml_filename)) - writer.set_audio_root(op.abspath(data_root)) - 
writer.set_vocab_filename(spm_filename.replace(".model", ".txt")) - writer.set_input_channels(1) - writer.set_input_feat_per_channel(80) + manifest_root = manifest_root.absolute() + writer = S2TDataConfigWriter(manifest_root / yaml_filename) + assert spm_filename is not None or vocab_name is not None + vocab_name = spm_filename.replace(".model", ".txt") if vocab_name is None \ + else vocab_name + writer.set_vocab_filename(vocab_name) + if input_channels is not None: + writer.set_input_channels(input_channels) + if input_feat_per_channel is not None: + writer.set_input_feat_per_channel(input_feat_per_channel) specaugment_setters = { "lb": writer.set_specaugment_lb_policy, "ld": writer.set_specaugment_ld_policy, "sm": writer.set_specaugment_sm_policy, "ss": writer.set_specaugment_ss_policy, } - assert specaugment_policy in specaugment_setters - specaugment_setters[specaugment_policy]() - writer.set_bpe_tokenizer( - { - "bpe": "sentencepiece", - "sentencepiece_model": op.join(data_root, spm_filename), - } - ) + specaugment_setter = specaugment_setters.get(specaugment_policy, None) + if specaugment_setter is not None: + specaugment_setter() + if spm_filename is not None: + writer.set_bpe_tokenizer( + { + "bpe": "sentencepiece", + "sentencepiece_model": (manifest_root / spm_filename).as_posix(), + } + ) if prepend_tgt_lang_tag: writer.set_prepend_tgt_lang_tag(True) - writer.set_sampling_alpha(sampling_alpha) - writer.set_feature_transforms("_train", ["specaugment"]) + if sampling_alpha is not None: + writer.set_sampling_alpha(sampling_alpha) + + if cmvn_type not in ["global", "utterance"]: + raise NotImplementedError + + if specaugment_policy is not None: + writer.set_feature_transforms( + "_train", [f"{cmvn_type}_cmvn", "specaugment"] + ) + writer.set_feature_transforms("*", [f"{cmvn_type}_cmvn"]) + + if cmvn_type == "global": + if gcmvn_path is None: + raise ValueError("Please provide path of global cmvn file.") + else: + writer.set_global_cmvn(gcmvn_path.as_posix()) + + if len(audio_root) > 0: + writer.set_audio_root(audio_root) + + if extra is not None: + writer.set_extra(extra) writer.flush() -def load_df_from_tsv(path: str): +def load_df_from_tsv(path: Union[str, Path]) -> pd.DataFrame: + _path = path if isinstance(path, str) else path.as_posix() return pd.read_csv( - path, + _path, sep="\t", header=0, encoding="utf-8", @@ -172,9 +215,10 @@ def load_df_from_tsv(path: str): ) -def save_df_to_tsv(dataframe, path): +def save_df_to_tsv(dataframe, path: Union[str, Path]): + _path = path if isinstance(path, str) else path.as_posix() dataframe.to_csv( - path, + _path, sep="\t", header=True, index=False, @@ -184,6 +228,20 @@ def save_df_to_tsv(dataframe, path): ) +def load_tsv_to_dicts(path: Union[str, Path]) -> List[dict]: + with open(path, "r") as f: + reader = csv.DictReader( + f, + delimiter="\t", + quotechar=None, + doublequote=False, + lineterminator="\n", + quoting=csv.QUOTE_NONE, + ) + rows = [dict(e) for e in reader] + return rows + + def filter_manifest_df( df, is_train_split=False, extra_filters=None, min_n_frames=5, max_n_frames=3000 ): @@ -206,16 +264,26 @@ def filter_manifest_df( return df[valid] +def cal_gcmvn_stats(features_list): + features = np.concatenate(features_list) + square_sums = (features ** 2).sum(axis=0) + mean = features.mean(axis=0) + features = np.subtract(features, mean) + var = square_sums / features.shape[0] - mean ** 2 + std = np.sqrt(np.maximum(var, 1e-8)) + return {"mean": mean.astype("float32"), "std": std.astype("float32")} + + class 
S2TDataConfigWriter(object): DEFAULT_VOCAB_FILENAME = "dict.txt" DEFAULT_INPUT_FEAT_PER_CHANNEL = 80 DEFAULT_INPUT_CHANNELS = 1 - def __init__(self, yaml_path): + def __init__(self, yaml_path: Path): try: import yaml except ImportError: - print("Please install PyYAML to load YAML files for S2T data config") + print("Please install PyYAML for S2T data config YAML files") self.yaml = yaml self.yaml_path = yaml_path self.config = {} @@ -227,7 +295,7 @@ def flush(self): def set_audio_root(self, audio_root=""): self.config["audio_root"] = audio_root - def set_vocab_filename(self, vocab_filename="dict.txt"): + def set_vocab_filename(self, vocab_filename: str = "dict.txt"): self.config["vocab_filename"] = vocab_filename def set_specaugment( @@ -288,22 +356,28 @@ def set_specaugment_ss_policy(self): time_mask_p=0.2, ) - def set_input_channels(self, input_channels=1): + def set_input_channels(self, input_channels: int = 1): self.config["input_channels"] = input_channels - def set_input_feat_per_channel(self, input_feat_per_channel=80): + def set_input_feat_per_channel(self, input_feat_per_channel: int = 80): self.config["input_feat_per_channel"] = input_feat_per_channel def set_bpe_tokenizer(self, bpe_tokenizer: Dict[str, Any]): self.config["bpe_tokenizer"] = bpe_tokenizer - def set_feature_transforms(self, split, transforms: List[str]): + def set_global_cmvn(self, stats_npz_path: str): + self.config["global_cmvn"] = {"stats_npz_path": stats_npz_path} + + def set_feature_transforms(self, split: str, transforms: List[str]): if "transforms" not in self.config: self.config["transforms"] = {} self.config["transforms"][split] = transforms - def set_prepend_tgt_lang_tag(self, flag=True): + def set_prepend_tgt_lang_tag(self, flag: bool = True): self.config["prepend_tgt_lang_tag"] = flag - def set_sampling_alpha(self, sampling_alpha=1.0): + def set_sampling_alpha(self, sampling_alpha: float = 1.0): self.config["sampling_alpha"] = sampling_alpha + + def set_extra(self, data): + self.config.update(data) diff --git a/examples/speech_to_text/docs/covost_example.md b/examples/speech_to_text/docs/covost_example.md new file mode 100644 index 0000000000..6282428b16 --- /dev/null +++ b/examples/speech_to_text/docs/covost_example.md @@ -0,0 +1,140 @@ +[[Back]](..) + +# S2T Example: ST on CoVoST + +We replicate the experiments in +[CoVoST 2 and Massively Multilingual Speech-to-Text Translation (Wang et al., 2020)](https://arxiv.org/abs/2007.10310). + +## Data Preparation + +[Download](https://commonvoice.mozilla.org/en/datasets) and unpack Common Voice v4 to a path +`${COVOST_ROOT}/${SOURCE_LANG_ID}`, then preprocess it with + +```bash +# additional Python packages for S2T data processing/model training +pip install pandas torchaudio sentencepiece + +# En ASR +python examples/speech_to_text/prep_covost_data.py \ + --data-root ${COVOST_ROOT} --vocab-type char --src-lang en +# ST +python examples/speech_to_text/prep_covost_data.py \ + --data-root ${COVOST_ROOT} --vocab-type char \ + --src-lang fr --tgt-lang en +``` + +The generated files (manifest, features, vocabulary and data configuration) will be added to +`${COVOST_ROOT}/${SOURCE_LANG_ID}`. 
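+
+As a quick sanity check, the generated TSV manifests can be inspected with the helper from `examples/speech_to_text/data_utils.py`. This is a minimal sketch (run from the fairseq root); it assumes the Fr-En ST preprocessing above has finished, and the split name `train_st_fr_en` and the column names are taken from the prep script:
+
+```python
+from pathlib import Path
+
+from examples.speech_to_text.data_utils import load_df_from_tsv
+
+covost_root = Path("/path/to/covost")  # hypothetical ${COVOST_ROOT}
+df = load_df_from_tsv(covost_root / "fr" / "train_st_fr_en.tsv")
+print(df.columns.tolist())  # id, audio, n_frames, tgt_text, speaker
+print(df.head())
+```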
+ 
+Download our vocabulary files if you want to use our pre-trained models:
+
+- ASR: [En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_asr_vocab_char.zip)
+- ST: [Fr-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_fr_en_st_vocab_char.zip), [De-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_de_en_st_vocab_char.zip), [Es-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_es_en_st_vocab_char.zip), [Ca-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_ca_en_st_vocab_char.zip), [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_de_st_vocab_char.zip), [En-Ca](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_ca_st_vocab_char.zip), [En-Fa](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_fa_st_vocab_char.zip), [En-Et](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_et_st_vocab_char.zip)
+
+## ASR
+
+#### Training
+
+We train an En ASR model, which is also used to pre-train the encoders of some of the ST models.
+
+```bash
+fairseq-train ${COVOST_ROOT}/en \
+  --config-yaml config_asr_en.yaml --train-subset train_asr_en --valid-subset dev_asr_en \
+  --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 50000 --max-update 60000 \
+  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
+  --report-accuracy --arch s2t_transformer_s --dropout 0.15 --optimizer adam --lr 2e-3 \
+  --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
+  --attn-type None --pos-enc-type ${POS_ENC_TYPE}
+```
+
+where `ASR_SAVE_DIR` is the checkpoint root path and `POS_ENC_TYPE` is the positional encoding used in the conformer encoder.
+Set it to `abs`, `rope` or `rel_pos` to use absolute, rotary or relative positional encoding in the conformer layers, respectively.
+The transformer encoder supports only absolute positional encoding and is used by default.
+To switch to the conformer encoder, set `--attn-type espnet` and `--pos-enc-type ${POS_ENC_TYPE}`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to adjust it accordingly when using more than 1 GPU.
+ 
+#### Inference & Evaluation
+
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+  --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \
+  --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}"
+fairseq-generate ${COVOST_ROOT}/en \
+  --config-yaml config_asr_en.yaml --gen-subset test_asr_en --task speech_to_text \
+  --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \
+  --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct
+```
+
+#### Results
+
+| --arch | --pos-enc-type | Params | En (WER) | Model |
+|---|---|---|---|---|
+| s2t_transformer_s | - | 31M | 25.6 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_asr_transformer_s.pt) |
+| s2t_conformer | rel_pos | 42.9M | 23.18 | [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/rel_pos_asr_checkpoint_best.pt) |
+| s2t_conformer | rope | 42.1M | 23.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/rope_pos_asr_checkpoint_best.pt) |
+| s2t_conformer | abs | 42.1M | 23.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/abs_asr_checkpoint_best.pt) |
+
+## ST
+
+#### Training
+
+Fr-En as an example:
+
+```bash
+# use --max-tokens 50000 instead for the En-* directions
+fairseq-train ${COVOST_ROOT}/fr \
+  --config-yaml config_st_fr_en.yaml --train-subset train_st_fr_en --valid-subset dev_st_fr_en \
+  --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-update 30000 --max-tokens 40000 \
+  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
+  --arch s2t_transformer_s --encoder-freezing-updates 1000 --optimizer adam --lr 2e-3 \
+  --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
+  --attn-type None --pos-enc-type ${POS_ENC_TYPE} \
+  --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}
+```
+
+where `ST_SAVE_DIR` is the checkpoint root path and `POS_ENC_TYPE` is the positional encoding used in the conformer encoder.
+Set it to `abs`, `rope` or `rel_pos` to use absolute, rotary or relative positional encoding in the conformer layers, respectively.
+The transformer encoder supports only absolute positional encoding and is used by default.
+To switch to the conformer encoder, set `--attn-type espnet` and `--pos-enc-type ${POS_ENC_TYPE}`. Optionally load the pre-trained En ASR encoder for faster training and better
+performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
+You may want to adjust it accordingly when using more than 1 GPU.
+
+#### Inference & Evaluation
+
+Average the last 10 checkpoints and evaluate on the test split:
+
+```bash
+CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
+python scripts/average_checkpoints.py \
+  --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \
+  --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}"
+fairseq-generate ${COVOST_ROOT}/fr \
+  --config-yaml config_st_fr_en.yaml --gen-subset test_st_fr_en --task speech_to_text \
+  --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \
+  --max-tokens 50000 --beam 5 --scoring sacrebleu
+```
+
+## Interactive Decoding
+
+Launch the interactive console via
+
+```bash
+fairseq-interactive ${COVOST_ROOT}/fr --config-yaml config_st_fr_en.yaml \
+  --task speech_to_text --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} \
+  --max-tokens 50000 --beam 5
+```
+
+Type in WAV/FLAC/OGG audio paths (one per line) after the prompt.
+ +#### Results + +| --arch | --pos-enc-type | Params | ASR PT | Fr-En | De-En | Es-En | Ca-En | En-De | En-Ca | En-Fa | En-Et | Model | +|---|---|---|---|---|---|---|---|---|---|---|---|---| +| s2t_transformer | - | 31M | Yes | [27.2](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_fr_en_st_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_de_en_st_transformer_s.pt) | [23.1](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_es_en_st_transformer_s.pt) | [19.3](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_ca_en_st_transformer_s.pt) | [16.1](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_de_st_transformer_s.pt) | [21.6](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_ca_st_transformer_s.pt) | [12.9](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_fa_st_transformer_s.pt) | [12.8](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_et_st_transformer_s.pt) | (<-Download) | +| s2t_conformer | rel_pos | 42.9M | No | [28.32](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [18.21](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [25.98](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [21.13](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [20.37](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [25.89](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [15.59](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [14.49](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) | +| s2t_conformer | rel_pos | 42.9M | Yes| [27.15](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [18.22](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [25.14](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [21.68](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [20.35](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [25.92](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [15.76](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [16.52](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) | +| s2t_conformer | rope | 42.1M | No | [27.61](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [17.6](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [24.91](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [20.78](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rope_from_scratch_avg_last_10_checkpoint.pt) | 
[19.7](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rope_from_scratch_avg_last_10_checkpoint.pt) | [25.13](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rope_from_scratch_avg_last_10_checkpoint.pt) | [15.22](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rope_from_scratch_avg_last_10_checkpoint.pt) | [15.87](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rope_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) |
+| s2t_conformer | rope | 42.1M | Yes | [26.99](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [17.71](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [24.24](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [21.24](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [19.9](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rope_asr_pt_avg_last_10_checkpoint.pt) | [25.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rope_asr_pt_avg_last_10_checkpoint.pt) | [15.58](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rope_asr_pt_avg_last_10_checkpoint.pt) | [15.97](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rope_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) |
+| s2t_conformer | abs | 42.1M | No | [27.45](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [17.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [25.01](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [20.26](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [19.86](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/abs_from_scratch_avg_last_10_checkpoint.pt) | [25.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/abs_from_scratch_avg_last_10_checkpoint.pt) | [15.46](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/abs_from_scratch_avg_last_10_checkpoint.pt) | [15.81](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/abs_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) |
+| s2t_conformer | abs | 42.1M | Yes | [26.52](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [17.37](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [25.40](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [20.45](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [19.57](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/abs_asr_pt_avg_last_10_checkpoint.pt) | [25.40](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/abs_asr_pt_avg_last_10_checkpoint.pt) | [15.17](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/abs_asr_pt_avg_last_10_checkpoint.pt) | [15.83](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/abs_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) |
+
+[[Back]](..)
diff --git a/examples/speech_to_text/docs/librispeech_example.md b/examples/speech_to_text/docs/librispeech_example.md new file mode 100644 index 0000000000..4040fda942 --- /dev/null +++ b/examples/speech_to_text/docs/librispeech_example.md @@ -0,0 +1,69 @@ +[[Back]](..) + +# S2T Example: Speech Recognition (ASR) on LibriSpeech +[LibriSpeech](https://www.danielpovey.com/files/2015_icassp_librispeech.pdf) is a de-facto standard English ASR +benchmark. We provide competitive +vanilla [Transformer](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) baselines. + +## Data preparation +Download and preprocess LibriSpeech data with +```bash +# additional Python packages for S2T data processing/model training +pip install pandas torchaudio sentencepiece + +python examples/speech_to_text/prep_librispeech_data.py \ + --output-root ${LS_ROOT} --vocab-type unigram --vocab-size 10000 +``` +where `LS_ROOT` is the root path for downloaded data as well as generated files (manifest, features, vocabulary and +data configuration). + +[Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_vocab_unigram10000.zip) our vocabulary files +if you want to use our pre-trained models. + +## Training +```bash +fairseq-train ${LS_ROOT} --save-dir ${SAVE_DIR} \ + --config-yaml config.yaml --train-subset train-clean-100,train-clean-360,train-other-500 --valid-subset dev-clean,dev-other \ + --num-workers 4 --max-tokens 40000 --max-update 300000 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ + --arch s2t_transformer_s --share-decoder-input-output-embed \ + --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \ + --clip-norm 10.0 --seed 1 --update-freq 8 +``` +where `SAVE_DIR` is the checkpoint root path. Here we use `--arch s2t_transformer_s` (31M parameters) as example. +For better performance, you may switch to `s2t_transformer_m` (71M, with `--lr 1e-3`) or `s2t_transformer_l` +(268M, with `--lr 5e-4`). We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly +when using more than 1 GPU. + +## Inference & Evaluation +Average the last 10 checkpoints and evaluate on the 4 splits +(`dev-clean`, `dev-other`, `test-clean` and `test-other`): +```bash +CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt +python scripts/average_checkpoints.py --inputs ${SAVE_DIR} \ + --num-epoch-checkpoints 10 \ + --output "${SAVE_DIR}/${CHECKPOINT_FILENAME}" +for SUBSET in dev-clean dev-other test-clean test-other; do + fairseq-generate ${LS_ROOT} --config-yaml config.yaml --gen-subset ${SUBSET} \ + --task speech_to_text --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --max-tokens 50000 --beam 5 --scoring wer +done +``` + +## Interactive Decoding +Launch the interactive console via +```bash +fairseq-interactive ${LS_ROOT} --config-yaml config.yaml --task speech_to_text \ + --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 +``` +Type in WAV/FLAC/OGG audio paths (one per line) after the prompt. 
+ 
+## Results
+
+| --arch | Params | dev-clean | dev-other | test-clean | test-other | Model |
+|---|---|---|---|---|---|---|
+| s2t_transformer_s | 30M | 3.8 | 8.9 | 4.4 | 9.0 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_transformer_s.pt) |
+| s2t_transformer_m | 71M | 3.2 | 8.0 | 3.4 | 7.9 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_transformer_m.pt) |
+| s2t_transformer_l | 268M | 3.0 | 7.5 | 3.2 | 7.5 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_transformer_l.pt) |
+
+[[Back]](..)
diff --git a/examples/speech_to_text/docs/mtedx_example.md b/examples/speech_to_text/docs/mtedx_example.md
new file mode 100644
index 0000000000..7e3d759557
--- /dev/null
+++ b/examples/speech_to_text/docs/mtedx_example.md
@@ -0,0 +1,201 @@
+[[Back]](..)
+
+# S2T Example: Speech Translation (ST) on Multilingual TEDx
+
+[Multilingual TEDx](https://arxiv.org/abs/2102.01757) is a multilingual corpus for speech recognition and
+speech translation. The data is derived from TEDx talks in 8 source languages
+with translations to a subset of 5 target languages.
+
+## Data Preparation
+[Download](http://openslr.org/100/) and unpack Multilingual TEDx data to a path
+`${MTEDX_ROOT}/${LANG_PAIR}`, then preprocess it with
+```bash
+# additional Python packages for S2T data processing/model training
+pip install pandas torchaudio soundfile sentencepiece
+
+# Generate TSV manifests, features, vocabulary
+# and configuration for each language
+python examples/speech_to_text/prep_mtedx_data.py \
+  --data-root ${MTEDX_ROOT} --task asr \
+  --vocab-type unigram --vocab-size 1000
+python examples/speech_to_text/prep_mtedx_data.py \
+  --data-root ${MTEDX_ROOT} --task st \
+  --vocab-type unigram --vocab-size 1000
+
+# Add vocabulary and configuration for joint data
+# (based on the manifests and features generated above)
+python examples/speech_to_text/prep_mtedx_data.py \
+  --data-root ${MTEDX_ROOT} --task asr --joint \
+  --vocab-type unigram --vocab-size 8000
+python examples/speech_to_text/prep_mtedx_data.py \
+  --data-root ${MTEDX_ROOT} --task st --joint \
+  --vocab-type unigram --vocab-size 8000
+```
+The generated files (manifest, features, vocabulary and data configuration) will be added to
+`${MTEDX_ROOT}/${LANG_PAIR}` (per-language data) and `MTEDX_ROOT` (joint data).
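+
+For debugging, note that each `audio` entry in these TSVs points into the (uncompressed) `fbank80.zip` as `<zip path>:<byte offset>:<byte length>`, the format written by `get_zip_manifest` in `examples/speech_to_text/data_utils.py`, so a single utterance's features can be read back directly. Below is a minimal sketch for the filter-bank setup (not `--use-audio-input`); the offset and length in the comment are made up:
+
+```python
+import io
+
+import numpy as np
+
+
+def read_fbank(audio_entry: str) -> np.ndarray:
+    # audio_entry is one value from the TSV "audio" column, e.g.
+    # "/path/to/mtedx/es-es/fbank80.zip:62743:287872" (illustrative numbers)
+    zip_path, offset, length = audio_entry.rsplit(":", 2)
+    with open(zip_path, "rb") as f:
+        f.seek(int(offset))
+        data = f.read(int(length))
+    return np.load(io.BytesIO(data))  # (n_frames, 80) log mel filter bank matrix
+```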
+ + +## ASR +#### Training +Spanish as example: +```bash +fairseq-train ${MTEDX_ROOT}/es-es \ + --config-yaml config_asr.yaml --train-subset train_asr --valid-subset valid_asr \ + --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ + --arch s2t_transformer_xs --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ + --load-pretrained-encoder-from ${PRETRAINED_ENCODER} \ + --skip-invalid-size-inputs-valid-test \ + --keep-last-epochs 10 --update-freq 8 --patience 10 +``` +For joint model (using ASR data from all 8 languages): +```bash +fairseq-train ${MTEDX_ROOT} \ + --config-yaml config_asr.yaml \ + --train-subset train_es-es_asr,train_fr-fr_asr,train_pt-pt_asr,train_it-it_asr,train_ru-ru_asr,train_el-el_asr,train_ar-ar_asr,train_de-de_asr \ + --valid-subset valid_es-es_asr,valid_fr-fr_asr,valid_pt-pt_asr,valid_it-it_asr,valid_ru-ru_asr,valid_el-el_asr,valid_ar-ar_asr,valid_de-de_asr \ + --save-dir ${MULTILINGUAL_ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ + --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ + --skip-invalid-size-inputs-valid-test \ + --keep-last-epochs 10 --update-freq 8 --patience 10 \ + --ignore-prefix-size 1 +``` +where `MULTILINGUAL_ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs +with 1 GPU. You may want to update it accordingly when using more than 1 GPU. +For multilingual models, we prepend target language ID token as target BOS, which should be excluded from +the training loss via `--ignore-prefix-size 1`. 
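+
+The language tags themselves are ordinary vocabulary symbols added by the prep script (see `process_joint` in the `prep_mtedx_data.py` diff later in this patch). A minimal sketch of how they are formed, assuming nothing beyond that script:
+
+```python
+# The leading <lang:*> tag on each target sequence is what --ignore-prefix-size 1
+# excludes from the training loss (and what --prefix-size 1 forces at decoding time).
+LANGPAIRS = ["es-es", "fr-fr", "pt-pt", "it-it", "ru-ru", "el-el", "ar-ar",
+             "de-de", "es-en", "es-fr", "es-pt", "es-it", "fr-en", "fr-es",
+             "fr-pt", "pt-en", "pt-es", "it-en", "it-es", "ru-en", "el-en"]
+special_symbols = sorted({f'<lang:{pair.split("-")[1]}>' for pair in LANGPAIRS})
+print(special_symbols)
+# ['<lang:ar>', '<lang:de>', '<lang:el>', '<lang:en>', '<lang:es>', '<lang:fr>', '<lang:it>', '<lang:pt>', '<lang:ru>']
+```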
+ +#### Inference & Evaluation +```bash +CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt +python scripts/average_checkpoints.py \ + --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" + +fairseq-generate ${MTEDX_ROOT}/es-es \ + --config-yaml config_asr.yaml --gen-subset test --task speech_to_text \ + --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ + --skip-invalid-size-inputs-valid-test \ + --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct --remove-bpe + +# For models trained on joint data +CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt +python scripts/average_checkpoints.py \ + --inputs ${MULTILINGUAL_ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${MULTILINGUAL_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" + +for LANG in es fr pt it ru el ar de; do + fairseq-generate ${MTEDX_ROOT} \ + --config-yaml config_asr.yaml --gen-subset test_${LANG}-${LANG}_asr --task speech_to_text \ + --prefix-size 1 --path ${MULTILINGUAL_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --max-tokens 40000 --beam 5 \ + --skip-invalid-size-inputs-valid-test \ + --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct --remove-bpe +done +``` +#### Results +| Data | --arch | Params | Es | Fr | Pt | It | Ru | El | Ar | De | +|--------------|--------------------|--------|------|------|------|------|------|-------|-------|-------| +| Monolingual | s2t_transformer_xs | 10M | 46.4 | 45.6 | 54.8 | 48.0 | 74.7 | 109.5 | 104.4 | 111.1 | + + +## ST +#### Training +Es-En as example: +```bash +fairseq-train ${MTEDX_ROOT}/es-en \ + --config-yaml config_st.yaml --train-subset train_st --valid-subset valid_st \ + --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ + --arch s2t_transformer_xs --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ + --load-pretrained-encoder-from ${PRETRAINED_ENCODER} \ + --skip-invalid-size-inputs-valid-test \ + --keep-last-epochs 10 --update-freq 8 --patience 10 +``` +For multilingual model (all 12 directions): +```bash +fairseq-train ${MTEDX_ROOT} \ + --config-yaml config_st.yaml \ + --train-subset train_el-en_st,train_es-en_st,train_es-fr_st,train_es-it_st,train_es-pt_st,train_fr-en_st,train_fr-es_st,train_fr-pt_st,train_it-en_st,train_it-es_st,train_pt-en_st,train_pt-es_st,train_ru-en_st \ + --valid-subset valid_el-en_st,valid_es-en_st,valid_es-fr_st,valid_es-it_st,valid_es-pt_st,valid_fr-en_st,valid_fr-es_st,valid_fr-pt_st,valid_it-en_st,valid_it-es_st,valid_pt-en_st,valid_pt-es_st,valid_ru-en_st \ + --save-dir ${MULTILINGUAL_ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ + --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ + --skip-invalid-size-inputs-valid-test \ + --keep-last-epochs 10 --update-freq 8 --patience 10 \ + --ignore-prefix-size 1 \ + --load-pretrained-encoder-from ${PRETRAINED_ENCODER} +``` +where `ST_SAVE_DIR` (`MULTILINGUAL_ST_SAVE_DIR`) is the checkpoint root path. The ST encoder is pre-trained by ASR +for faster training and better performance: `--load-pretrained-encoder-from <(JOINT_)ASR checkpoint path>`. 
We set +`--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. +For multilingual models, we prepend target language ID token as target BOS, which should be excluded from +the training loss via `--ignore-prefix-size 1`. + +#### Inference & Evaluation +Average the last 10 checkpoints and evaluate on the `test` split: +```bash +CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt +python scripts/average_checkpoints.py \ + --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" + +fairseq-generate ${MTEDX_ROOT}/es-en \ + --config-yaml config_st.yaml --gen-subset test --task speech_to_text \ + --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --max-tokens 50000 --beam 5 --scoring sacrebleu --remove-bpe + +# For multilingual models +python scripts/average_checkpoints.py \ + --inputs ${MULTILINGUAL_ST_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" + +for LANGPAIR in es-en es-fr es-pt fr-en fr-es fr-pt pt-en pt-es it-en it-es ru-en el-en; do + fairseq-generate ${MTEDX_ROOT} \ + --config-yaml config_st.yaml --gen-subset test_${LANGPAIR}_st --task speech_to_text \ + --prefix-size 1 --path ${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --max-tokens 40000 --beam 5 \ + --skip-invalid-size-inputs-valid-test \ + --scoring sacrebleu --remove-bpe +done +``` +For multilingual models, we force decoding from the target language ID token (as BOS) via `--prefix-size 1`. + +#### Results +| Data | --arch | Params | Es-En | Es-Pt | Es-Fr | Fr-En | Fr-Es | Fr-Pt | Pt-En | Pt-Es | It-En | It-Es | Ru-En | El-En | +|--------------|--------------------|-----|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------| +| Bilingual | s2t_transformer_xs | 10M | 7.0 | 12.2 | 1.7 | 8.9 | 10.6 | 7.9 | 8.1 | 8.7 | 6.4 | 1.0 | 0.7 | 0.6 | +| Multilingual | s2t_transformer_s | 31M | 12.3 | 17.4 | 6.1 | 12.0 | 13.6 | 13.2 | 12.0 | 13.7 | 10.7 | 13.1 | 0.6 | 0.8 | + + +## Citation +Please cite as: +``` +@inproceedings{salesky2021mtedx, + title={Multilingual TEDx Corpus for Speech Recognition and Translation}, + author={Elizabeth Salesky and Matthew Wiesner and Jacob Bremerman and Roldano Cattoni and Matteo Negri and Marco Turchi and Douglas W. Oard and Matt Post}, + booktitle={Proceedings of Interspeech}, + year={2021}, +} + +@inproceedings{wang2020fairseqs2t, + title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq}, + author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino}, + booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations}, + year = {2020}, +} + +@inproceedings{ott2019fairseq, + title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, + author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, + booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, + year = {2019}, +} +``` + +[[Back]](..) diff --git a/examples/speech_to_text/docs/mustc_example.md b/examples/speech_to_text/docs/mustc_example.md new file mode 100644 index 0000000000..c95ef3e156 --- /dev/null +++ b/examples/speech_to_text/docs/mustc_example.md @@ -0,0 +1,155 @@ +[[Back]](..) 
+ +# S2T Example: Speech Translation (ST) on MuST-C + +[MuST-C](https://www.aclweb.org/anthology/N19-1202) is multilingual speech-to-text translation corpus with +8-language translations on English TED talks. We match the state-of-the-art performance in +[ESPNet-ST](https://arxiv.org/pdf/2004.10234.pdf) with a simpler model training pipeline. + +## Data Preparation +[Download](https://ict.fbk.eu/must-c) and unpack MuST-C data to a path +`${MUSTC_ROOT}/en-${TARGET_LANG_ID}`, then preprocess it with +```bash +# additional Python packages for S2T data processing/model training +pip install pandas torchaudio soundfile sentencepiece + +# Generate TSV manifests, features, vocabulary +# and configuration for each language +python examples/speech_to_text/prep_mustc_data.py \ + --data-root ${MUSTC_ROOT} --task asr \ + --vocab-type unigram --vocab-size 5000 +python examples/speech_to_text/prep_mustc_data.py \ + --data-root ${MUSTC_ROOT} --task st \ + --vocab-type unigram --vocab-size 8000 + +# Add vocabulary and configuration for joint data +# (based on the manifests and features generated above) +python examples/speech_to_text/prep_mustc_data.py \ + --data-root ${MUSTC_ROOT} --task asr --joint \ + --vocab-type unigram --vocab-size 10000 +python examples/speech_to_text/prep_mustc_data.py \ + --data-root ${MUSTC_ROOT} --task st --joint \ + --vocab-type unigram --vocab-size 10000 +``` +The generated files (manifest, features, vocabulary and data configuration) will be added to +`${MUSTC_ROOT}/en-${TARGET_LANG_ID}` (per-language data) and `MUSTC_ROOT` (joint data). + +Download our vocabulary files if you want to use our pre-trained models: +- ASR: [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_asr_vocab_unigram5000.zip), [En-Nl](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_asr_vocab_unigram5000.zip), [En-Es](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_asr_vocab_unigram5000.zip), [En-Fr](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_asr_vocab_unigram5000.zip), [En-It](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_asr_vocab_unigram5000.zip), [En-Pt](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_asr_vocab_unigram5000.zip), [En-Ro](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_asr_vocab_unigram5000.zip), [En-Ru](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_asr_vocab_unigram5000.zip), [Joint](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_vocab_unigram10000.zip) +- ST: [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_st_vocab_unigram8000.zip), [En-Nl](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_st_vocab_unigram8000.zip), [En-Es](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_st_vocab_unigram8000.zip), [En-Fr](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_st_vocab_unigram8000.zip), [En-It](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_st_vocab_unigram8000.zip), [En-Pt](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_st_vocab_unigram8000.zip), [En-Ro](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_st_vocab_unigram8000.zip), [En-Ru](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_st_vocab_unigram8000.zip), [Multilingual](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_multilingual_st_vocab_unigram10000.zip) + +## ASR +#### Training +En-De as example: +```bash +fairseq-train ${MUSTC_ROOT}/en-de \ + --config-yaml config_asr.yaml --train-subset train_asr --valid-subset dev_asr \ + --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ + --task speech_to_text 
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ + --arch s2t_transformer_s --optimizer adam --lr 1e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 +``` +For joint model (using ASR data from all 8 directions): +```bash +fairseq-train ${MUSTC_ROOT} \ + --config-yaml config_asr.yaml \ + --train-subset train_de_asr,train_nl_asr,train_es_asr,train_fr_asr,train_it_asr,train_pt_asr,train_ro_asr,train_ru_asr \ + --valid-subset dev_de_asr,dev_nl_asr,dev_es_asr,dev_fr_asr,dev_it_asr,dev_pt_asr,dev_ro_asr,dev_ru_asr \ + --save-dir ${JOINT_ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ + --arch s2t_transformer_s --optimizer adam --lr 1e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 +``` +where `ASR_SAVE_DIR` (`JOINT_ASR_SAVE_DIR`) is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs +with 1 GPU. You may want to update it accordingly when using more than 1 GPU. + +#### Inference & Evaluation +```bash +CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt +python scripts/average_checkpoints.py \ + --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" +fairseq-generate ${MUSTC_ROOT}/en-de \ + --config-yaml config_asr.yaml --gen-subset tst-COMMON_asr --task speech_to_text \ + --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ + --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct + +# For models trained on joint data +python scripts/average_checkpoints.py \ + --inputs ${JOINT_ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" +for LANG in de nl es fr it pt ro ru; do + fairseq-generate ${MUSTC_ROOT} \ + --config-yaml config_asr.yaml --gen-subset tst-COMMON_${LANG}_asr --task speech_to_text \ + --path ${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ + --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct +done +``` +#### Results +| Data | --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru | Model | +|---|---|---|---|---|---|---|---|---|---|---|---| +| Single | s2t_transformer_s | 31M | [18.2](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_asr_transformer_s.pt) | [17.6](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_asr_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_asr_transformer_s.pt) | [17.2](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_asr_transformer_s.pt) | [17.9](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_asr_transformer_s.pt) | [19.1](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_asr_transformer_s.pt) | [18.1](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_asr_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_asr_transformer_s.pt) | (<-Download) | +| Joint | s2t_transformer_m | 76M | 16.8 | 16.7 | 16.9 | 16.9 | 17.0 | 17.4 | 17.0 | 16.9 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_transformer_m.pt) | + +## ST +#### Training +En-De as example: +```bash +fairseq-train ${MUSTC_ROOT}/en-de \ + --config-yaml config_st.yaml --train-subset train_st --valid-subset dev_st \ + --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ + --task speech_to_text --criterion 
label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ + --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ + --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} +``` +For multilingual model (all 8 directions): +```bash +fairseq-train ${MUSTC_ROOT} \ + --config-yaml config_st.yaml \ + --train-subset train_de_st,train_nl_st,train_es_st,train_fr_st,train_it_st,train_pt_st,train_ro_st,train_ru_st \ + --valid-subset dev_de_st,dev_nl_st,dev_es_st,dev_fr_st,dev_it_st,dev_pt_st,dev_ro_st,dev_ru_st \ + --save-dir ${MULTILINGUAL_ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ + --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ + --arch s2t_transformer_s --ignore-prefix-size 1 --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ + --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ + --load-pretrained-encoder-from ${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} +``` +where `ST_SAVE_DIR` (`MULTILINGUAL_ST_SAVE_DIR`) is the checkpoint root path. The ST encoder is pre-trained by ASR +for faster training and better performance: `--load-pretrained-encoder-from <(JOINT_)ASR checkpoint path>`. We set +`--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. +For multilingual models, we prepend target language ID token as target BOS, which should be excluded from +the training loss via `--ignore-prefix-size 1`. + +#### Inference & Evaluation +Average the last 10 checkpoints and evaluate on the `tst-COMMON` split: +```bash +CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt +python scripts/average_checkpoints.py \ + --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" +fairseq-generate ${MUSTC_ROOT}/en-de \ + --config-yaml config_st.yaml --gen-subset tst-COMMON_st --task speech_to_text \ + --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --max-tokens 50000 --beam 5 --scoring sacrebleu + +# For multilingual models +python scripts/average_checkpoints.py \ + --inputs ${MULTILINGUAL_ST_SAVE_DIR} --num-epoch-checkpoints 10 \ + --output "${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" +for LANG in de nl es fr it pt ro ru; do + fairseq-generate ${MUSTC_ROOT} \ + --config-yaml config_st.yaml --gen-subset tst-COMMON_${LANG}_st --task speech_to_text \ + --prefix-size 1 --path ${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --max-tokens 50000 --beam 5 --scoring sacrebleu +done +``` +For multilingual models, we force decoding from the target language ID token (as BOS) via `--prefix-size 1`. 
+ 
+#### Results
+| Data | --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru | Model |
+|---|---|---|---|---|---|---|---|---|---|---|---|
+| Bilingual | s2t_transformer_s | 31M | [22.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_st_transformer_s.pt) | [27.3](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_st_transformer_s.pt) | [27.2](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_st_transformer_s.pt) | [32.9](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_st_transformer_s.pt) | [22.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_st_transformer_s.pt) | [28.1](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_st_transformer_s.pt) | [21.9](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_st_transformer_s.pt) | [15.3](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_st_transformer_s.pt) | (<-Download) |
+| Multilingual | s2t_transformer_m | 76M | 24.5 | 28.6 | 28.2 | 34.9 | 24.6 | 31.1 | 23.8 | 16.0 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_multilingual_st_transformer_m.pt) |
+
+[[Back]](..)
diff --git a/examples/speech_to_text/docs/simulst_mustc_example.md b/examples/speech_to_text/docs/simulst_mustc_example.md
new file mode 100644
index 0000000000..f3b5a413a2
--- /dev/null
+++ b/examples/speech_to_text/docs/simulst_mustc_example.md
@@ -0,0 +1,190 @@
+# Simultaneous Speech Translation (SimulST) on MuST-C
+
+This is a tutorial on training and evaluating a transformer *wait-k* simultaneous model on the MuST-C English-German dataset, following [SimulMT to SimulST: Adapting Simultaneous Text Translation to End-to-End Simultaneous Speech Translation](https://www.aclweb.org/anthology/2020.aacl-main.58.pdf).
+
+[MuST-C](https://www.aclweb.org/anthology/N19-1202) is a multilingual speech-to-text translation corpus with translations of English TED talks into 8 languages.
+
+## Data Preparation
+This section introduces the data preparation for training and evaluation. If you only want to evaluate the model, please jump to [Inference & Evaluation](#inference--evaluation).
+
+[Download](https://ict.fbk.eu/must-c) and unpack MuST-C data to a path
+`${MUSTC_ROOT}/en-${TARGET_LANG_ID}`, then preprocess it with
+```bash
+# Additional Python packages for S2T data processing/model training
+pip install pandas torchaudio sentencepiece
+
+# Generate TSV manifests, features, vocabulary,
+# global cepstral mean and variance statistics,
+# and configuration for each language
+cd fairseq
+
+python examples/speech_to_text/prep_mustc_data.py \
+  --data-root ${MUSTC_ROOT} --task asr \
+  --vocab-type unigram --vocab-size 10000 \
+  --cmvn-type global
+
+python examples/speech_to_text/prep_mustc_data.py \
+  --data-root ${MUSTC_ROOT} --task st \
+  --vocab-type unigram --vocab-size 10000 \
+  --cmvn-type global
+```
+
+## ASR Pretraining
+We need a pretrained offline ASR model; assume its save directory is `${ASR_SAVE_DIR}`.
+The following command (and the subsequent training commands in this tutorial) assumes training on 1 GPU (you can also train on 8 GPUs and remove the `--update-freq 8` option).
+```bash
+fairseq-train ${MUSTC_ROOT}/en-de \
+  --config-yaml config_asr.yaml --train-subset train_asr --valid-subset dev_asr \
+  --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \
+  --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \
+  --arch convtransformer_espnet --optimizer adam --lr 0.0005 --lr-scheduler inverse_sqrt \
+  --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8
+```
+A pretrained ASR checkpoint can be downloaded [here](https://dl.fbaipublicfiles.com/simultaneous_translation/must_c_v1_en_de_pretrained_asr).
+
+## Simultaneous Speech Translation Training
+
+### Wait-K with fixed pre-decision module
+Fixed pre-decision means that the model applies its simultaneous policy at the boundaries of fixed-size chunks.
+Here is an example with a fixed pre-decision ratio of 7 (a simultaneous decision is made every 7 encoder states) and
+a wait-3 policy. Assuming the save directory is `${ST_SAVE_DIR}`:
+```bash
+fairseq-train ${MUSTC_ROOT}/en-de \
+  --config-yaml config_st.yaml --train-subset train_st --valid-subset dev_st \
+  --save-dir ${ST_SAVE_DIR} --num-workers 8 \
+  --optimizer adam --lr 0.0001 --lr-scheduler inverse_sqrt --clip-norm 10.0 \
+  --criterion label_smoothed_cross_entropy \
+  --warmup-updates 4000 --max-update 100000 --max-tokens 40000 --seed 2 \
+  --load-pretrained-encoder-from ${ASR_SAVE_DIR}/checkpoint_best.pt \
+  --task speech_to_text \
+  --arch convtransformer_simul_trans_espnet \
+  --simul-type waitk_fixed_pre_decision \
+  --waitk-lagging 3 \
+  --fixed-pre-decision-ratio 7 \
+  --update-freq 8
+```
+### Monotonic multihead attention with fixed pre-decision module
+```bash
+fairseq-train ${MUSTC_ROOT}/en-de \
+  --config-yaml config_st.yaml --train-subset train_st --valid-subset dev_st \
+  --save-dir ${ST_SAVE_DIR} --num-workers 8 \
+  --optimizer adam --lr 0.0001 --lr-scheduler inverse_sqrt --clip-norm 10.0 \
+  --warmup-updates 4000 --max-update 100000 --max-tokens 40000 --seed 2 \
+  --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} \
+  --task speech_to_text \
+  --criterion latency_augmented_label_smoothed_cross_entropy \
+  --latency-weight-avg 0.1 \
+  --arch convtransformer_simul_trans_espnet \
+  --simul-type infinite_lookback_fixed_pre_decision \
+  --fixed-pre-decision-ratio 7 \
+  --update-freq 8
+```
+## Inference & Evaluation
+[SimulEval](https://github.com/facebookresearch/SimulEval) is used for evaluation.
+Install it and run the evaluation with the following commands:
+
+```bash
+git clone https://github.com/facebookresearch/SimulEval.git
+cd SimulEval
+pip install -e .
+
+simuleval \
+  --agent ${FAIRSEQ}/examples/speech_to_text/simultaneous_translation/agents/fairseq_simul_st_agent.py \
+  --source ${SRC_LIST_OF_AUDIO} \
+  --target ${TGT_FILE} \
+  --data-bin ${MUSTC_ROOT}/en-de \
+  --config config_st.yaml \
+  --model-path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \
+  --output ${OUTPUT} \
+  --scores
+```
+
+The source file `${SRC_LIST_OF_AUDIO}` is a list of paths to audio files. Assuming your audio files are stored at `/home/user/data`,
+it should look like this:
+
+```bash
+/home/user/data/audio-1.wav
+/home/user/data/audio-2.wav
+```
+
+Each line of the target file `${TGT_FILE}` is the translation of the corresponding audio file.
+```bash
+Translation_1
+Translation_2
+```
+The evaluation runs on the original MuST-C segmentation.
+The following command will generate the wav list and text file for an evaluation set `${SPLIT}` (chosen from `dev`, `tst-COMMON` and `tst-HE`) in MuST-C and write them to `${EVAL_DATA}`.
+```bash +python ${FAIRSEQ}/examples/speech_to_text/seg_mustc_data.py \ + --data-root ${MUSTC_ROOT} --lang de \ + --split ${SPLIT} --task st \ + --output ${EVAL_DATA} +``` + +The `--data-bin` and `--config` should be the same in previous section if you prepare the data from the scratch. +If only for evaluation, a prepared data directory can be found [here](https://dl.fbaipublicfiles.com/simultaneous_translation/must_c_v1.0_en_de_databin.tgz). It contains +- `spm_unigram10000_st.model`: a sentencepiece model binary. +- `spm_unigram10000_st.txt`: the dictionary file generated by the sentencepiece model. +- `gcmvn.npz`: the binary for global cepstral mean and variance. +- `config_st.yaml`: the config yaml file. It looks like this. +You will need to set the absolute paths for `sentencepiece_model` and `stats_npz_path` if the data directory is downloaded. +```yaml +bpe_tokenizer: + bpe: sentencepiece + sentencepiece_model: ABS_PATH_TO_SENTENCEPIECE_MODEL +global_cmvn: + stats_npz_path: ABS_PATH_TO_GCMVN_FILE +input_channels: 1 +input_feat_per_channel: 80 +sampling_alpha: 1.0 +specaugment: + freq_mask_F: 27 + freq_mask_N: 1 + time_mask_N: 1 + time_mask_T: 100 + time_mask_p: 1.0 + time_wrap_W: 0 +transforms: + '*': + - global_cmvn + _train: + - global_cmvn + - specaugment +vocab_filename: spm_unigram10000_st.txt +``` + +Notice that once a `--data-bin` is set, the `--config` is the base name of the config yaml, not the full path. + +Set `--model-path` to the model checkpoint. +A pretrained checkpoint can be downloaded from [here](https://dl.fbaipublicfiles.com/simultaneous_translation/convtransformer_wait5_pre7), which is a wait-5 model with a pre-decision of 280 ms. + +The result of this model on `tst-COMMON` is: +```bash +{ + "Quality": { + "BLEU": 13.94974229366959 + }, + "Latency": { + "AL": 1751.8031870037803, + "AL_CA": 2338.5911762796536, + "AP": 0.7931395378788959, + "AP_CA": 0.9405103863210942, + "DAL": 1987.7811616943081, + "DAL_CA": 2425.2751560926167 + } +} +``` + +If `--output ${OUTPUT}` option is used, the detailed log and scores will be stored under the `${OUTPUT}` directory. + + +The quality is measured by detokenized BLEU. So make sure that the predicted words sent to the server are detokenized. + +The latency metrics are +* Average Proportion +* Average Lagging +* Differentiable Average Lagging + +Again they will also be evaluated on detokenized text. diff --git a/examples/speech_to_text/prep_covost_data.py b/examples/speech_to_text/prep_covost_data.py index e8a028b446..411e9b5515 100644 --- a/examples/speech_to_text/prep_covost_data.py +++ b/examples/speech_to_text/prep_covost_data.py @@ -5,10 +5,8 @@ # LICENSE file in the root directory of this source tree. import argparse -import csv import logging -import os -import os.path as op +from pathlib import Path import shutil from tempfile import NamedTemporaryFile from typing import Optional, Tuple @@ -22,6 +20,7 @@ gen_config_yaml, gen_vocab, get_zip_manifest, + load_df_from_tsv, save_df_to_tsv, ) from torch import Tensor @@ -49,10 +48,6 @@ class CoVoST(Dataset): found at root path. (default: ``False``). 
""" - CV_URL_TEMPLATE = ( - "https://voice-prod-bundler-ee1969a6ce8178826482b88" - "e843c335139bd3fb4.s3.amazonaws.com/{ver}/{lang}.tar.gz" - ) COVOST_URL_TEMPLATE = ( "https://dl.fbaipublicfiles.com/covost/" "covost_v2.{src_lang}_{tgt_lang}.tsv.tar.gz" @@ -61,8 +56,6 @@ class CoVoST(Dataset): VERSIONS = {2} SPLITS = ["train", "dev", "test"] - CV_VERSION_ID = {1: "cv-corpus-3", 2: "cv-corpus-4-2019-12-10"} - XX_EN_LANGUAGES = { 1: ["fr", "de", "nl", "ru", "es", "it", "tr", "fa", "sv-SE", "mn", "zh-CN"], 2: [ @@ -117,7 +110,6 @@ def __init__( source_language: str, target_language: Optional[str] = None, version: int = 2, - download: bool = False, ) -> None: assert version in self.VERSIONS and split in self.SPLITS assert source_language is not None @@ -134,30 +126,22 @@ def __init__( # to Common Voice train split. target_language = "de" if source_language == "en" else "en" - self.root = os.path.join(root, "raw") - os.makedirs(self.root, exist_ok=True) + self.root: Path = Path(root) - cv_url = self.CV_URL_TEMPLATE.format( - ver=self.CV_VERSION_ID[version], lang=source_language - ) - cv_archive = os.path.join(self.root, os.path.basename(cv_url)) - if download: - if not os.path.isfile(cv_archive): - download_url(cv_url, self.root, hash_value=None) - extract_archive(cv_archive) + cv_tsv_path = self.root / "validated.tsv" + assert cv_tsv_path.is_file() covost_url = self.COVOST_URL_TEMPLATE.format( src_lang=source_language, tgt_lang=target_language ) - covost_archive = os.path.join(self.root, os.path.basename(covost_url)) - if download: - if not os.path.isfile(covost_archive): - download_url(covost_url, self.root, hash_value=None) - extract_archive(covost_archive) + covost_archive = self.root / Path(covost_url).name + if not covost_archive.is_file(): + download_url(covost_url, self.root.as_posix(), hash_value=None) + extract_archive(covost_archive.as_posix()) - cv_tsv = self.load_from_tsv(os.path.join(self.root, "validated.tsv")) - covost_tsv = self.load_from_tsv( - os.path.join(self.root, os.path.basename(covost_url).replace(".tar.gz", "")) + cv_tsv = load_df_from_tsv(cv_tsv_path) + covost_tsv = load_df_from_tsv( + self.root / Path(covost_url).name.replace(".tar.gz", "") ) df = pd.merge( left=cv_tsv[["path", "sentence", "client_id"]], @@ -169,20 +153,16 @@ def __init__( df = df[(df["split"] == split) | (df["split"] == f"{split}_covost")] else: df = df[df["split"] == split] - self.data = df.to_dict(orient="index").items() - self.data = [v for k, v in sorted(self.data, key=lambda x: x[0])] - - @classmethod - def load_from_tsv(cls, path: str): - return pd.read_csv( - path, - sep="\t", - header=0, - encoding="utf-8", - escapechar="\\", - quoting=csv.QUOTE_NONE, - na_filter=False, - ) + data = df.to_dict(orient="index").items() + data = [v for k, v in sorted(data, key=lambda x: x[0])] + self.data = [] + for e in data: + try: + path = self.root / "clips" / e["path"] + _ = torchaudio.info(path.as_posix()) + self.data.append(e) + except RuntimeError: + pass def __getitem__( self, n: int @@ -197,7 +177,7 @@ def __getitem__( sample_id)`` """ data = self.data[n] - path = os.path.join(self.root, "clips", data["path"]) + path = self.root / "clips" / data["path"] waveform, sample_rate = torchaudio.load(path) sentence = data["sentence"] translation = None if self.no_translation else data["translation"] @@ -210,26 +190,26 @@ def __len__(self) -> int: def process(args): - root = op.join(args.data_root, args.src_lang) - os.makedirs(root, exist_ok=True) + root = Path(args.data_root).absolute() / args.src_lang + if 
not root.is_dir(): + raise NotADirectoryError(f"{root} does not exist") # Extract features - feature_root = op.join(root, "fbank80") - os.makedirs(feature_root, exist_ok=True) + feature_root = root / "fbank80" + feature_root.mkdir(exist_ok=True) for split in CoVoST.SPLITS: print(f"Fetching split {split}...") - dataset = CoVoST(root, split, args.src_lang, args.tgt_lang, download=True) + dataset = CoVoST(root, split, args.src_lang, args.tgt_lang) print("Extracting log mel filter bank features...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): extract_fbank_features( - waveform, sample_rate, op.join(feature_root, f"{utt_id}.npy") + waveform, sample_rate, feature_root / f"{utt_id}.npy" ) # Pack features into ZIP - zip_filename = "fbank80.zip" - zip_path = op.join(root, zip_filename) + zip_path = root / "fbank80.zip" print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") - zip_manifest = get_zip_manifest(args.data_root, f"{args.src_lang}/{zip_filename}") + audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] @@ -239,11 +219,10 @@ def process(args): for split in CoVoST.SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = CoVoST(root, split, args.src_lang, args.tgt_lang) - for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): + for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): manifest["id"].append(utt_id) - manifest["audio"].append(zip_manifest[utt_id]) - duration_ms = int(wav.size(1) / sr * 1000) - manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) + manifest["audio"].append(audio_paths[utt_id]) + manifest["n_frames"].append(audio_lengths[utt_id]) manifest["tgt_text"].append(src_utt if args.tgt_lang is None else tgt_utt) manifest["speaker"].append(speaker_id) is_train_split = split.startswith("train") @@ -251,7 +230,7 @@ def process(args): train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) - save_df_to_tsv(df, op.join(root, f"{split}_{task}.tsv")) + save_df_to_tsv(df, root / f"{split}_{task}.tsv") # Generate vocab vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{task}" @@ -259,12 +238,15 @@ def process(args): for t in train_text: f.write(t + "\n") gen_vocab( - f.name, op.join(root, spm_filename_prefix), args.vocab_type, args.vocab_size + Path(f.name), + root / spm_filename_prefix, + args.vocab_type, + args.vocab_size ) # Generate config YAML gen_config_yaml( root, - spm_filename_prefix + ".model", + spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{task}.yaml", specaugment_policy="lb", ) @@ -274,7 +256,10 @@ def process(args): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--data-root", "-d", required=True, type=str) + parser.add_argument( + "--data-root", "-d", required=True, type=str, + help="data root with sub-folders for each language <root>/<src_lang>" + ) parser.add_argument( "--vocab-type", default="unigram", diff --git a/examples/speech_to_text/prep_librispeech_data.py b/examples/speech_to_text/prep_librispeech_data.py index 95fcec8fe3..f379fa7bf1 100644 --- a/examples/speech_to_text/prep_librispeech_data.py +++ b/examples/speech_to_text/prep_librispeech_data.py @@ -6,8 +6,7 @@ import argparse import logging -import os -import os.path as op +from pathlib import Path import shutil from tempfile 
import NamedTemporaryFile @@ -40,42 +39,41 @@ def process(args): - os.makedirs(args.output_root, exist_ok=True) + out_root = Path(args.output_root).absolute() + out_root.mkdir(exist_ok=True) # Extract features - feature_root = op.join(args.output_root, "fbank80") - os.makedirs(feature_root, exist_ok=True) + feature_root = out_root / "fbank80" + feature_root.mkdir(exist_ok=True) for split in SPLITS: print(f"Fetching split {split}...") - dataset = LIBRISPEECH(args.output_root, url=split, download=True) + dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True) print("Extracting log mel filter bank features...") - for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset): - sample_id = f"{spk_id}-{chapter_id}-{utt_id}" + for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset): + sample_id = f"{spk_id}-{chapter_no}-{utt_no}" extract_fbank_features( - wav, sample_rate, op.join(feature_root, f"{sample_id}.npy") + wav, sample_rate, feature_root / f"{sample_id}.npy" ) # Pack features into ZIP - zip_filename = "fbank80.zip" - zip_path = op.join(args.output_root, zip_filename) + zip_path = out_root / "fbank80.zip" print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") - zip_manifest = get_zip_manifest(args.output_root, zip_filename) + audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} - dataset = LIBRISPEECH(args.output_root, url=split) - for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset): - sample_id = f"{spk_id}-{chapter_id}-{utt_id}" + dataset = LIBRISPEECH(out_root.as_posix(), url=split) + for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset): + sample_id = f"{spk_id}-{chapter_no}-{utt_no}" manifest["id"].append(sample_id) - manifest["audio"].append(zip_manifest[sample_id]) - duration_ms = int(wav.size(1) / sample_rate * 1000) - manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) - manifest["tgt_text"].append(utt) + manifest["audio"].append(audio_paths[sample_id]) + manifest["n_frames"].append(audio_lengths[sample_id]) + manifest["tgt_text"].append(utt.lower()) manifest["speaker"].append(spk_id) save_df_to_tsv( - pd.DataFrame.from_dict(manifest), op.join(args.output_root, f"{split}.tsv") + pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv" ) if split.startswith("train"): train_text.extend(manifest["tgt_text"]) @@ -86,14 +84,16 @@ def process(args): for t in train_text: f.write(t + "\n") gen_vocab( - f.name, - op.join(args.output_root, spm_filename_prefix), + Path(f.name), + out_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML gen_config_yaml( - args.output_root, spm_filename_prefix + ".model", specaugment_policy="ld" + out_root, + spm_filename=spm_filename_prefix + ".model", + specaugment_policy="ld" ) # Clean up shutil.rmtree(feature_root) diff --git a/examples/speech_to_text/prep_mtedx_data.py b/examples/speech_to_text/prep_mtedx_data.py new file mode 100644 index 0000000000..2dfd631763 --- /dev/null +++ b/examples/speech_to_text/prep_mtedx_data.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import logging +import os +from pathlib import Path +import shutil +from itertools import groupby +from tempfile import NamedTemporaryFile +from typing import Tuple + +import pandas as pd +import soundfile as sf +from examples.speech_to_text.data_utils import ( + create_zip, + extract_fbank_features, + filter_manifest_df, + gen_config_yaml, + gen_vocab, + get_zip_manifest, + load_df_from_tsv, + save_df_to_tsv, +) +import torch +from torch.utils.data import Dataset +from tqdm import tqdm + +from fairseq.data.audio.audio_utils import get_waveform, convert_waveform + + +log = logging.getLogger(__name__) + + +MANIFEST_COLUMNS = [ + "id", "audio", "n_frames", "tgt_text", "speaker", "tgt_lang" +] + + +class mTEDx(Dataset): + """ + Create a Dataset for Multilingual TEDx. + Each item is a tuple of the form: waveform, sample_rate, source utterance, + target utterance, speaker_id, utterance_id + """ + + SPLITS = ["train", "valid", "test"] + LANGPAIRS = ["es-es", "fr-fr", "pt-pt", "it-it", "ru-ru", "el-el", "ar-ar", + "de-de", "es-en", "es-fr", "es-pt", "es-it", "fr-en", "fr-es", + "fr-pt", "pt-en", "pt-es", "it-en", "it-es", "ru-en", "el-en"] + + def __init__(self, root: str, lang: str, split: str) -> None: + assert split in self.SPLITS and lang in self.LANGPAIRS + _root = Path(root) / f"{lang}" / "data" / split + wav_root, txt_root = _root / "wav", _root / "txt" + assert _root.is_dir() and wav_root.is_dir() and txt_root.is_dir() + # Load audio segments + try: + import yaml + except ImportError: + print( + "Please install PyYAML to load the Multilingual TEDx YAML files" + ) + with open(txt_root / f"{split}.yaml") as f: + segments = yaml.load(f, Loader=yaml.BaseLoader) + # Load source and target utterances + src, tgt = lang.split("-") + for _lang in [src, tgt]: + with open(txt_root / f"{split}.{_lang}") as f: + utterances = [r.strip() for r in f] + assert len(segments) == len(utterances) + for i, u in enumerate(utterances): + segments[i][_lang] = u + # Gather info + self.data = [] + for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]): + wav_filename = wav_filename.replace(".wav", ".flac") + wav_path = wav_root / wav_filename + sample_rate = sf.info(wav_path.as_posix()).samplerate + seg_group = sorted(_seg_group, key=lambda x: float(x["offset"])) + for i, segment in enumerate(seg_group): + offset = int(float(segment["offset"]) * sample_rate) + n_frames = int(float(segment["duration"]) * sample_rate) + _id = f"{wav_path.stem}_{i}" + self.data.append( + ( + wav_path.as_posix(), + offset, + n_frames, + sample_rate, + segment[src], + segment[tgt], + segment["speaker_id"], + tgt, + _id, + ) + ) + + def __getitem__( + self, n: int + ) -> Tuple[torch.Tensor, int, str, str, str, str, str]: + wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, tgt_lang, \ + utt_id = self.data[n] + waveform, _ = get_waveform(wav_path, frames=n_frames, start=offset) + waveform = torch.from_numpy(waveform) + return waveform, sr, src_utt, tgt_utt, spk_id, tgt_lang, utt_id + + def __len__(self) -> int: + return len(self.data) + + +def process(args): + root = Path(args.data_root).absolute() + for lang in mTEDx.LANGPAIRS: + cur_root = root / f"{lang}" + if not cur_root.is_dir(): + print(f"{cur_root.as_posix()} does not exist. 
Skipped.") + continue + # Extract features + audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80") + audio_root.mkdir(exist_ok=True) + for split in mTEDx.SPLITS: + print(f"Fetching split {split}...") + dataset = mTEDx(root.as_posix(), lang, split) + if args.use_audio_input: + print("Converting audios...") + for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): + tgt_sample_rate = 16_000 + _wavform, _ = convert_waveform( + waveform, sample_rate, to_mono=True, + to_sample_rate=tgt_sample_rate + ) + sf.write( + (audio_root / f"{utt_id}.flac").as_posix(), + _wavform.numpy(), tgt_sample_rate + ) + else: + print("Extracting log mel filter bank features...") + for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset): + extract_fbank_features( + waveform, sample_rate, audio_root / f"{utt_id}.npy" + ) + # Pack features into ZIP + zip_path = cur_root / f"{audio_root.name}.zip" + print("ZIPing audios/features...") + create_zip(audio_root, zip_path) + print("Fetching ZIP manifest...") + audio_paths, audio_lengths = get_zip_manifest(zip_path) + # Generate TSV manifest + print("Generating manifest...") + train_text = [] + for split in mTEDx.SPLITS: + is_train_split = split.startswith("train") + manifest = {c: [] for c in MANIFEST_COLUMNS} + ds = mTEDx(args.data_root, lang, split) + for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds): + manifest["id"].append(utt_id) + manifest["audio"].append(audio_paths[utt_id]) + manifest["n_frames"].append(audio_lengths[utt_id]) + manifest["tgt_text"].append( + src_utt if args.task == "asr" else tgt_utt + ) + manifest["speaker"].append(spk_id) + manifest["tgt_lang"].append(tgt_lang) + if is_train_split: + train_text.extend(manifest["tgt_text"]) + df = pd.DataFrame.from_dict(manifest) + df = filter_manifest_df(df, is_train_split=is_train_split) + save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv") + # Generate vocab + v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) + spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" + with NamedTemporaryFile(mode="w") as f: + for t in train_text: + f.write(t + "\n") + gen_vocab( + Path(f.name), + cur_root / spm_filename_prefix, + args.vocab_type, + args.vocab_size, + ) + # Generate config YAML + if args.use_audio_input: + gen_config_yaml( + cur_root, + spm_filename=spm_filename_prefix + ".model", + yaml_filename=f"config_{args.task}.yaml", + specaugment_policy=None, + extra={"use_audio_input": True} + ) + else: + gen_config_yaml( + cur_root, + spm_filename=spm_filename_prefix + ".model", + yaml_filename=f"config_{args.task}.yaml", + specaugment_policy="lb", + ) + # Clean up + shutil.rmtree(audio_root) + + +def process_joint(args): + cur_root = Path(args.data_root) + assert all((cur_root / f"{lang}").is_dir() for lang in mTEDx.LANGPAIRS), \ + "do not have downloaded data available for all languages" + # Generate vocab + vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) + spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}" + with NamedTemporaryFile(mode="w") as f: + for lang in mTEDx.LANGPAIRS: + tsv_path = cur_root / f"{lang}" / f"train_{args.task}.tsv" + df = load_df_from_tsv(tsv_path) + for t in df["tgt_text"]: + f.write(t + "\n") + special_symbols = None + if args.joint: + # Add tgt_lang tags to dict + special_symbols = list( + {f'<lang:{lang.split("-")[1]}>' for lang in mTEDx.LANGPAIRS} + ) + gen_vocab( + Path(f.name), + cur_root / spm_filename_prefix, + args.vocab_type, + args.vocab_size, + 
special_symbols=special_symbols + ) + # Generate config YAML + gen_config_yaml( + cur_root, + spm_filename=spm_filename_prefix + ".model", + yaml_filename=f"config_{args.task}.yaml", + specaugment_policy="ld", + prepend_tgt_lang_tag=(args.joint), + ) + # Make symbolic links to manifests + for lang in mTEDx.LANGPAIRS: + for split in mTEDx.SPLITS: + src_path = cur_root / f"{lang}" / f"{split}_{args.task}.tsv" + desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv" + if not desc_path.is_symlink(): + os.symlink(src_path, desc_path) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data-root", "-d", required=True, type=str) + parser.add_argument( + "--vocab-type", + default="unigram", + required=True, + type=str, + choices=["bpe", "unigram", "char"], + ), + parser.add_argument("--vocab-size", default=8000, type=int) + parser.add_argument("--task", type=str, choices=["asr", "st"]) + parser.add_argument("--joint", action="store_true", help="") + parser.add_argument("--use-audio-input", action="store_true") + args = parser.parse_args() + + if args.joint: + process_joint(args) + else: + process(args) + + +if __name__ == "__main__": + main() diff --git a/examples/speech_to_text/prep_mustc_data.py b/examples/speech_to_text/prep_mustc_data.py index 59a42803f9..c2362f76fa 100644 --- a/examples/speech_to_text/prep_mustc_data.py +++ b/examples/speech_to_text/prep_mustc_data.py @@ -7,14 +7,15 @@ import argparse import logging import os -import os.path as op +from pathlib import Path import shutil from itertools import groupby from tempfile import NamedTemporaryFile from typing import Tuple +import numpy as np import pandas as pd -import torchaudio +import soundfile as sf from examples.speech_to_text.data_utils import ( create_zip, extract_fbank_features, @@ -24,11 +25,14 @@ get_zip_manifest, load_df_from_tsv, save_df_to_tsv, + cal_gcmvn_stats, ) -from torch import Tensor +import torch from torch.utils.data import Dataset from tqdm import tqdm +from fairseq.data.audio.audio_utils import get_waveform, convert_waveform + log = logging.getLogger(__name__) @@ -48,19 +52,19 @@ class MUSTC(Dataset): def __init__(self, root: str, lang: str, split: str) -> None: assert split in self.SPLITS and lang in self.LANGUAGES - _root = op.join(root, f"en-{lang}", "data", split) - wav_root, txt_root = op.join(_root, "wav"), op.join(_root, "txt") - assert op.isdir(_root) and op.isdir(wav_root) and op.isdir(txt_root) + _root = Path(root) / f"en-{lang}" / "data" / split + wav_root, txt_root = _root / "wav", _root / "txt" + assert _root.is_dir() and wav_root.is_dir() and txt_root.is_dir() # Load audio segments try: import yaml except ImportError: - print("Please install PyYAML to load YAML files for " "the MuST-C dataset") - with open(op.join(txt_root, f"{split}.yaml")) as f: + print("Please install PyYAML to load the MuST-C YAML files") + with open(txt_root / f"{split}.yaml") as f: segments = yaml.load(f, Loader=yaml.BaseLoader) # Load source and target utterances for _lang in ["en", lang]: - with open(op.join(txt_root, f"{split}.{_lang}")) as f: + with open(txt_root / f"{split}.{_lang}") as f: utterances = [r.strip() for r in f] assert len(segments) == len(utterances) for i, u in enumerate(utterances): @@ -68,16 +72,16 @@ def __init__(self, root: str, lang: str, split: str) -> None: # Gather info self.data = [] for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]): - wav_path = op.join(wav_root, wav_filename) - sample_rate = torchaudio.info(wav_path)[0].rate + wav_path = wav_root 
/ wav_filename + sample_rate = sf.info(wav_path.as_posix()).samplerate seg_group = sorted(_seg_group, key=lambda x: x["offset"]) for i, segment in enumerate(seg_group): offset = int(float(segment["offset"]) * sample_rate) n_frames = int(float(segment["duration"]) * sample_rate) - _id = f"{op.splitext(wav_filename)[0]}_{i}" + _id = f"{wav_path.stem}_{i}" self.data.append( ( - wav_path, + wav_path.as_posix(), offset, n_frames, sample_rate, @@ -88,9 +92,13 @@ def __init__(self, root: str, lang: str, split: str) -> None: ) ) - def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str, str]: - wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, utt_id = self.data[n] - waveform, _ = torchaudio.load(wav_path, offset=offset, num_frames=n_frames) + def __getitem__( + self, n: int + ) -> Tuple[torch.Tensor, int, str, str, str, str]: + wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, \ + utt_id = self.data[n] + waveform, _ = get_waveform(wav_path, frames=n_frames, start=offset) + waveform = torch.from_numpy(waveform) return waveform, sr, src_utt, tgt_utt, spk_id, utt_id def __len__(self) -> int: @@ -98,29 +106,60 @@ def __len__(self) -> int: def process(args): + root = Path(args.data_root).absolute() for lang in MUSTC.LANGUAGES: - cur_root = op.join(args.data_root, f"en-{lang}") - if not op.isdir(cur_root): - print(f"{cur_root} does not exist. Skipped.") + cur_root = root / f"en-{lang}" + if not cur_root.is_dir(): + print(f"{cur_root.as_posix()} does not exist. Skipped.") continue # Extract features - feature_root = op.join(cur_root, "fbank80") - os.makedirs(feature_root, exist_ok=True) + audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80") + audio_root.mkdir(exist_ok=True) + for split in MUSTC.SPLITS: print(f"Fetching split {split}...") - dataset = MUSTC(args.data_root, lang, split) - print("Extracting log mel filter bank features...") - for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): - extract_fbank_features( - waveform, sample_rate, op.join(feature_root, f"{utt_id}.npy") - ) + dataset = MUSTC(root.as_posix(), lang, split) + if args.use_audio_input: + print("Converting audios...") + for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): + tgt_sample_rate = 16_000 + _wavform, _ = convert_waveform( + waveform, sample_rate, to_mono=True, + to_sample_rate=tgt_sample_rate + ) + sf.write( + (audio_root / f"{utt_id}.flac").as_posix(), + _wavform.T.numpy(), tgt_sample_rate + ) + else: + print("Extracting log mel filter bank features...") + gcmvn_feature_list = [] + if split == 'train' and args.cmvn_type == "global": + print("And estimating cepstral mean and variance stats...") + + for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): + features = extract_fbank_features( + waveform, sample_rate, audio_root / f"{utt_id}.npy" + ) + if split == 'train' and args.cmvn_type == "global": + if len(gcmvn_feature_list) < args.gcmvn_max_num: + gcmvn_feature_list.append(features) + + if split == 'train' and args.cmvn_type == "global": + # Estimate and save cmv + stats = cal_gcmvn_stats(gcmvn_feature_list) + with open(cur_root / "gcmvn.npz", "wb") as f: + np.savez(f, mean=stats["mean"], std=stats["std"]) + # Pack features into ZIP - zip_filename = "fbank80.zip" - zip_path = op.join(cur_root, zip_filename) - print("ZIPing features...") - create_zip(feature_root, zip_path) + zip_path = cur_root / f"{audio_root.name}.zip" + print("ZIPing audios/features...") + create_zip(audio_root, zip_path) print("Fetching ZIP manifest...") - zip_manifest = 
get_zip_manifest(args.data_root, f"en-{lang}/{zip_filename}") + audio_paths, audio_lengths = get_zip_manifest( + zip_path, + is_audio=args.use_audio_input, + ) # Generate TSV manifest print("Generating manifest...") train_text = [] @@ -128,18 +167,19 @@ def process(args): is_train_split = split.startswith("train") manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = MUSTC(args.data_root, lang, split) - for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): + for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): manifest["id"].append(utt_id) - manifest["audio"].append(zip_manifest[utt_id]) - duration_ms = int(wav.size(1) / sr * 1000) - manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) - manifest["tgt_text"].append(src_utt if args.task == "asr" else tgt_utt) + manifest["audio"].append(audio_paths[utt_id]) + manifest["n_frames"].append(audio_lengths[utt_id]) + manifest["tgt_text"].append( + src_utt if args.task == "asr" else tgt_utt + ) manifest["speaker"].append(speaker_id) if is_train_split: train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) - save_df_to_tsv(df, op.join(cur_root, f"{split}_{args.task}.tsv")) + save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv") # Generate vocab v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" @@ -147,56 +187,74 @@ def process(args): for t in train_text: f.write(t + "\n") gen_vocab( - f.name, - op.join(cur_root, spm_filename_prefix), + Path(f.name), + cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML - gen_config_yaml( - cur_root, - spm_filename_prefix + ".model", - yaml_filename=f"config_{args.task}.yaml", - specaugment_policy="lb", - ) + if args.use_audio_input: + gen_config_yaml( + cur_root, + spm_filename=spm_filename_prefix + ".model", + yaml_filename=f"config_{args.task}.yaml", + specaugment_policy=None, + extra={"use_audio_input": True} + ) + else: + gen_config_yaml( + cur_root, + spm_filename=spm_filename_prefix + ".model", + yaml_filename=f"config_{args.task}.yaml", + specaugment_policy="lb", + cmvn_type=args.cmvn_type, + gcmvn_path=( + cur_root / "gcmvn.npz" if args.cmvn_type == "global" + else None + ), + ) # Clean up - shutil.rmtree(feature_root) + shutil.rmtree(audio_root) def process_joint(args): + cur_root = Path(args.data_root) assert all( - op.isdir(op.join(args.data_root, f"en-{lang}")) for lang in MUSTC.LANGUAGES + (cur_root / f"en-{lang}").is_dir() for lang in MUSTC.LANGUAGES ), "do not have downloaded data available for all 8 languages" - cur_root = args.data_root # Generate vocab vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for lang in MUSTC.LANGUAGES: - tsv_path = op.join(cur_root, f"en-{lang}", f"train_{args.task}.tsv") + tsv_path = cur_root / f"en-{lang}" / f"train_{args.task}.tsv" df = load_df_from_tsv(tsv_path) for t in df["tgt_text"]: f.write(t + "\n") + special_symbols = None + if args.task == 'st': + special_symbols = [f'<lang:{lang}>' for lang in MUSTC.LANGUAGES] gen_vocab( - f.name, - op.join(cur_root, spm_filename_prefix), + Path(f.name), + cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, + special_symbols=special_symbols ) # Generate config YAML gen_config_yaml( cur_root, - spm_filename_prefix + ".model", + 
spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", - specaugment_policy="lb", + specaugment_policy="ld", prepend_tgt_lang_tag=(args.task == "st"), ) # Make symbolic links to manifests for lang in MUSTC.LANGUAGES: for split in MUSTC.SPLITS: - src_path = op.join(cur_root, f"en-{lang}", f"{split}_{args.task}.tsv") - desc_path = op.join(cur_root, f"{split}_{lang}_{args.task}.tsv") - if not op.islink(desc_path): + src_path = cur_root / f"en-{lang}" / f"{split}_{args.task}.tsv" + desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv" + if not desc_path.is_symlink(): os.symlink(src_path, desc_path) @@ -213,6 +271,17 @@ def main(): parser.add_argument("--vocab-size", default=8000, type=int) parser.add_argument("--task", type=str, choices=["asr", "st"]) parser.add_argument("--joint", action="store_true", help="") + parser.add_argument( + "--cmvn-type", default="utterance", + choices=["global", "utterance"], + help="The type of cepstral mean and variance normalization" + ) + parser.add_argument( + "--gcmvn-max-num", default=150000, type=int, + help="Maximum number of sentences to use to estimate global mean and " + "variance" + ) + parser.add_argument("--use-audio-input", action="store_true") args = parser.parse_args() if args.joint: diff --git a/examples/speech_to_text/seg_mustc_data.py b/examples/speech_to_text/seg_mustc_data.py new file mode 100644 index 0000000000..1ee665d639 --- /dev/null +++ b/examples/speech_to_text/seg_mustc_data.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +from pathlib import Path +import soundfile as sf +from examples.speech_to_text.prep_mustc_data import ( + MUSTC +) + +from tqdm import tqdm + +log = logging.getLogger(__name__) + + +def main(args): + root = Path(args.data_root).absolute() + lang = args.lang + split = args.split + + cur_root = root / f"en-{lang}" + assert cur_root.is_dir(), ( + f"{cur_root.as_posix()} does not exist. Skipped." 
+ ) + + dataset = MUSTC(root.as_posix(), lang, split) + output = Path(args.output).absolute() + output.mkdir(exist_ok=True) + f_text = open(output / f"{split}.{lang}", "w") + f_wav_list = open(output / f"{split}.wav_list", "w") + for waveform, sample_rate, _, text, _, utt_id in tqdm(dataset): + sf.write( + output / f"{utt_id}.wav", + waveform.squeeze(0).numpy(), + samplerate=int(sample_rate) + ) + f_text.write(text + "\n") + f_wav_list.write(str(output / f"{utt_id}.wav") + "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data-root", "-d", required=True, type=str) + parser.add_argument("--task", required=True, type=str, choices=["asr", "st"]) + parser.add_argument("--lang", required=True, type=str) + parser.add_argument("--output", required=True, type=str) + parser.add_argument("--split", required=True, choices=MUSTC.SPLITS) + args = parser.parse_args() + + main(args) diff --git a/examples/speech_to_text/simultaneous_translation/agents/fairseq_simul_st_agent.py b/examples/speech_to_text/simultaneous_translation/agents/fairseq_simul_st_agent.py new file mode 100644 index 0000000000..61617a1739 --- /dev/null +++ b/examples/speech_to_text/simultaneous_translation/agents/fairseq_simul_st_agent.py @@ -0,0 +1,363 @@ +import math +import os +import json +import numpy as np +import torch +import torchaudio.compliance.kaldi as kaldi +import yaml +from fairseq import checkpoint_utils, tasks +from fairseq.file_io import PathManager + +try: + from simuleval import READ_ACTION, WRITE_ACTION, DEFAULT_EOS + from simuleval.agents import SpeechAgent + from simuleval.states import ListEntry, SpeechStates +except ImportError: + print("Please install simuleval 'pip install simuleval'") + +SHIFT_SIZE = 10 +WINDOW_SIZE = 25 +SAMPLE_RATE = 16000 +FEATURE_DIM = 80 +BOW_PREFIX = "\u2581" + + +class OnlineFeatureExtractor: + """ + Extract speech feature on the fly. 
+ """ + + def __init__(self, args): + self.shift_size = args.shift_size + self.window_size = args.window_size + assert self.window_size >= self.shift_size + + self.sample_rate = args.sample_rate + self.feature_dim = args.feature_dim + self.num_samples_per_shift = int(self.shift_size * self.sample_rate / 1000) + self.num_samples_per_window = int(self.window_size * self.sample_rate / 1000) + self.len_ms_to_samples = lambda x: x * self.sample_rate / 1000 + self.previous_residual_samples = [] + self.global_cmvn = args.global_cmvn + + def clear_cache(self): + self.previous_residual_samples = [] + + def __call__(self, new_samples): + samples = self.previous_residual_samples + new_samples + if len(samples) < self.num_samples_per_window: + self.previous_residual_samples = samples + return + + # num_frames is the number of frames from the new segment + num_frames = math.floor( + (len(samples) - self.len_ms_to_samples(self.window_size - self.shift_size)) + / self.num_samples_per_shift + ) + + # the number of frames used for feature extraction + # including some part of thte previous segment + effective_num_samples = int( + num_frames * self.len_ms_to_samples(self.shift_size) + + self.len_ms_to_samples(self.window_size - self.shift_size) + ) + + input_samples = samples[:effective_num_samples] + self.previous_residual_samples = samples[ + num_frames * self.num_samples_per_shift: + ] + + torch.manual_seed(1) + output = kaldi.fbank( + torch.FloatTensor(input_samples).unsqueeze(0), + num_mel_bins=self.feature_dim, + frame_length=self.window_size, + frame_shift=self.shift_size, + ).numpy() + + output = self.transform(output) + + return torch.from_numpy(output) + + def transform(self, input): + if self.global_cmvn is None: + return input + + mean = self.global_cmvn["mean"] + std = self.global_cmvn["std"] + + x = np.subtract(input, mean) + x = np.divide(x, std) + return x + + +class TensorListEntry(ListEntry): + """ + Data structure to store a list of tensor. 
+ """ + + def append(self, value): + + if len(self.value) == 0: + self.value = value + return + + self.value = torch.cat([self.value] + [value], dim=0) + + def info(self): + return { + "type": str(self.new_value_type), + "length": self.__len__(), + "value": "" if type(self.value) is list else self.value.size(), + } + + +class FairseqSimulSTAgent(SpeechAgent): + + speech_segment_size = 40 # in ms, 4 pooling ratio * 10 ms step size + + def __init__(self, args): + super().__init__(args) + + self.eos = DEFAULT_EOS + + self.gpu = getattr(args, "gpu", False) + + self.args = args + + self.load_model_vocab(args) + + if getattr( + self.model.decoder.layers[0].encoder_attn, + 'pre_decision_ratio', + None + ) is not None: + self.speech_segment_size *= ( + self.model.decoder.layers[0].encoder_attn.pre_decision_ratio + ) + + args.global_cmvn = None + if args.config: + with open(os.path.join(args.data_bin, args.config), "r") as f: + config = yaml.load(f, Loader=yaml.BaseLoader) + + if "global_cmvn" in config: + args.global_cmvn = np.load(config["global_cmvn"]["stats_npz_path"]) + + if args.global_stats: + with PathManager.open(args.global_stats, "r") as f: + global_cmvn = json.loads(f.read()) + self.global_cmvn = {"mean": global_cmvn["mean"], "std": global_cmvn["stddev"]} + + self.feature_extractor = OnlineFeatureExtractor(args) + + self.max_len = args.max_len + + self.force_finish = args.force_finish + + torch.set_grad_enabled(False) + + def build_states(self, args, client, sentence_id): + # Initialize states here, for example add customized entry to states + # This function will be called at beginning of every new sentence + states = SpeechStates(args, client, sentence_id, self) + self.initialize_states(states) + return states + + def to_device(self, tensor): + if self.gpu: + return tensor.cuda() + else: + return tensor.cpu() + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--model-path', type=str, required=True, + help='path to your pretrained model.') + parser.add_argument("--data-bin", type=str, required=True, + help="Path of data binary") + parser.add_argument("--config", type=str, default=None, + help="Path to config yaml file") + parser.add_argument("--global-stats", type=str, default=None, + help="Path to json file containing cmvn stats") + parser.add_argument("--tgt-splitter-type", type=str, default="SentencePiece", + help="Subword splitter type for target text") + parser.add_argument("--tgt-splitter-path", type=str, default=None, + help="Subword splitter model path for target text") + parser.add_argument("--user-dir", type=str, default="examples/simultaneous_translation", + help="User directory for simultaneous translation") + parser.add_argument("--max-len", type=int, default=200, + help="Max length of translation") + parser.add_argument("--force-finish", default=False, action="store_true", + help="Force the model to finish the hypothsis if the source is not finished") + parser.add_argument("--shift-size", type=int, default=SHIFT_SIZE, + help="Shift size of feature extraction window.") + parser.add_argument("--window-size", type=int, default=WINDOW_SIZE, + help="Window size of feature extraction window.") + parser.add_argument("--sample-rate", type=int, default=SAMPLE_RATE, + help="Sample rate") + parser.add_argument("--feature-dim", type=int, default=FEATURE_DIM, + help="Acoustic feature dimension.") + + # fmt: on + return parser + + def load_model_vocab(self, args): + + filename = args.model_path + if not os.path.exists(filename): + raise IOError("Model file 
not found: {}".format(filename)) + + state = checkpoint_utils.load_checkpoint_to_cpu(filename) + + task_args = state["cfg"]["task"] + task_args.data = args.data_bin + + if args.config is not None: + task_args.config_yaml = args.config + + task = tasks.setup_task(task_args) + + # build model for ensemble + state["cfg"]["model"].load_pretrained_encoder_from = None + state["cfg"]["model"].load_pretrained_decoder_from = None + self.model = task.build_model(state["cfg"]["model"]) + self.model.load_state_dict(state["model"], strict=True) + self.model.eval() + self.model.share_memory() + + if self.gpu: + self.model.cuda() + + # Set dictionary + self.dict = {} + self.dict["tgt"] = task.target_dictionary + + def initialize_states(self, states): + self.feature_extractor.clear_cache() + states.units.source = TensorListEntry() + states.units.target = ListEntry() + states.incremental_states = dict() + + def segment_to_units(self, segment, states): + # Convert speech samples to features + features = self.feature_extractor(segment) + if features is not None: + return [features] + else: + return [] + + def units_to_segment(self, units, states): + # Merge sub word to full word. + if self.model.decoder.dictionary.eos() == units[0]: + return DEFAULT_EOS + + segment = [] + if None in units.value: + units.value.remove(None) + + for index in units: + if index is None: + units.pop() + token = self.model.decoder.dictionary.string([index]) + if token.startswith(BOW_PREFIX): + if len(segment) == 0: + segment += [token.replace(BOW_PREFIX, "")] + else: + for j in range(len(segment)): + units.pop() + + string_to_return = ["".join(segment)] + + if self.model.decoder.dictionary.eos() == units[0]: + string_to_return += [DEFAULT_EOS] + + return string_to_return + else: + segment += [token.replace(BOW_PREFIX, "")] + + if ( + len(units) > 0 + and self.model.decoder.dictionary.eos() == units[-1] + or len(states.units.target) > self.max_len + ): + tokens = [self.model.decoder.dictionary.string([unit]) for unit in units] + return ["".join(tokens).replace(BOW_PREFIX, "")] + [DEFAULT_EOS] + + return None + + def update_model_encoder(self, states): + if len(states.units.source) == 0: + return + src_indices = self.to_device( + states.units.source.value.unsqueeze(0) + ) + src_lengths = self.to_device( + torch.LongTensor([states.units.source.value.size(0)]) + ) + + states.encoder_states = self.model.encoder(src_indices, src_lengths) + torch.cuda.empty_cache() + + def update_states_read(self, states): + # Happens after a read action. 
+ self.update_model_encoder(states) + + def policy(self, states): + if not getattr(states, "encoder_states", None): + return READ_ACTION + + tgt_indices = self.to_device( + torch.LongTensor( + [self.model.decoder.dictionary.eos()] + + [x for x in states.units.target.value if x is not None] + ).unsqueeze(0) + ) + + states.incremental_states["steps"] = { + "src": states.encoder_states["encoder_out"][0].size(0), + "tgt": 1 + len(states.units.target), + } + + states.incremental_states["online"] = {"only": torch.tensor(not states.finish_read())} + + x, outputs = self.model.decoder.forward( + prev_output_tokens=tgt_indices, + encoder_out=states.encoder_states, + incremental_state=states.incremental_states, + ) + + states.decoder_out = x + + states.decoder_out_extra = outputs + + torch.cuda.empty_cache() + + if outputs.action == 0: + return READ_ACTION + else: + return WRITE_ACTION + + def predict(self, states): + decoder_states = states.decoder_out + + lprobs = self.model.get_normalized_probs( + [decoder_states[:, -1:]], log_probs=True + ) + + index = lprobs.argmax(dim=-1) + + index = index[0, 0].item() + + if ( + self.force_finish + and index == self.model.decoder.dictionary.eos() + and not states.finish_read() + ): + # If we want to force finish the translation + # (don't stop before finish reading), return a None + # self.model.decoder.clear_cache(states.incremental_states) + index = None + + return index diff --git a/examples/textless_nlp/dgslm/README.md b/examples/textless_nlp/dgslm/README.md new file mode 100644 index 0000000000..917dbb2765 --- /dev/null +++ b/examples/textless_nlp/dgslm/README.md @@ -0,0 +1,183 @@ +# Generative Spoken Dialogue Language Modeling +[[paper]](https://arxiv.org/abs/2203.16502) [[demo samples]](https://speechbot.github.io/dgslm/index.html) [[blog]](https://ai.facebook.com/blog/generating-chit-chat-including-laughs-yawns-ums-and-other-nonverbal-cues-from-raw-audio/) + +This repo contains the code and pre-trained models for the paper _Generative Spoken Dialogue Language Modeling_. +<details> + <summary>Paper abstract </summary> + +> We introduce dGSLM, the first "textless" model able to generate audio samples of naturalistic spoken dialogues. It uses recent work on unsupervised spoken unit discovery coupled with a dual-tower transformer architecture with cross-attention trained on 2000 hours of two-channel raw conversational audio (Fisher dataset) without any text or labels. We show that our model is able to generate speech, laughter and other paralinguistic signals in the two channels simultaneously and reproduces more naturalistic and fluid turn taking compared to a text-based cascaded model. + +</details> + +## [Speech-to-Unit Encoder for dGSLM: The Fisher HuBERT model](hubert_fisher/) +The [hubert_fisher](hubert_fisher/) repository contains the pre-trained models and recipies to produce discrete units for the dGSLM model. + +## [Unit-to-Speech Decoder for dGSLM](vocoder_hifigan/) +The [vocoder_hifigan](vocoder_hifigan/) repo contains the vocoder and recipies to synthesize the waveform from the discrete units. 
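
The two components above can be chained with the SpeechDLM model described in the next section. Below is a rough, untested end-to-end sketch: all paths are placeholders, it relies only on the `HubertTokenizer`, `SpeechDLM` and `HifiganVocoder` interfaces added in this patch, and the exact output structure of `sample()` is shown in the sampling section below.

```python
# Rough end-to-end sketch (placeholder paths; CPU for simplicity).
from examples.textless_nlp.dgslm.dgslm_utils import HubertTokenizer, HifiganVocoder
from fairseq.models.speech_dlm import SpeechDLM

# 1) Speech -> discrete units, one unit string per channel of the stereo prompt
encoder = HubertTokenizer(
    hubert_path="/path/to/hubert_fisher.pt",
    hubert_layer=12,
    km_path="/path/to/hubert_fisher_km_500.bin",
    use_cuda=False,
)
unitA, unitB = encoder.wav2codes("/path/to/stereo/prompt.wav")

# 2) Continue the dialogue with the SpeechDLM language model
speech_dlm = SpeechDLM.from_pretrained(
    model_name_or_path="/path/to/model/dir",
    checkpoint_file="speech_dlm_base.pt",
    data_name_or_path="/path/to/data/dir",
)
speech_dlm.eval()
generated = speech_dlm.sample(
    [{"unitA": unitA, "unitB": unitB}],
    sampling=True, beam=5, max_len_a=0, max_len_b=500,
)
# One dict of continued 'unitA'/'unitB' strings per input prompt
gen_units = generated[0] if isinstance(generated, list) else generated

# 3) Discrete units -> stereo waveform (numpy array of shape (2, num_samples) at 16 kHz)
decoder = HifiganVocoder(
    vocoder_path="/path/to/hifigan_vocoder",
    vocoder_cfg_path="/path/to/config.json",
    use_cuda=False,
)
wav = decoder.codes2wav(gen_units)
```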
+ +## Spoken Dialogue Transformer Language Model (SpeechDLM) +### Pre-trained model +We share the pre-trained model checkpoint for the best configuration in the paper (DLM-5 model, with Edge Unit Prediction & Delayed Duration Prediction objectives), dubbed as `SpeechDLM`, trained on the 2000 hours of Fisher dataset : +| Pre-trained SpeechDLM model trained on Fisher dataset | +|-----------------------------------------------| +|[model checkpoint](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/speech_dlm/speech_dlm_base.pt) - [dictionary 1](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/speech_dlm/dict.unitA.txt) - [dictionary 2](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/speech_dlm/dict.unitB.txt)| +the two dictionary files correspond to the two channels, and actually have the same content. + +### Sample from a trained model +You can sample from a trained SpeechDLM model interactively : +```python +from fairseq.models.speech_dlm import SpeechDLM + +# Load SpeechDLM model +speech_dlm = SpeechDLM.from_pretrained( + model_name_or_path='/path/to/model/dir', + checkpoint_file='speech_dlm_base.pt', + data_name_or_path='/path/to/data/dir' + ) +# Disable dropout +speech_dlm.eval() +# Move model to GPU +speech_dlm.cuda() + +# Define the input sequences +input_sequences = [{ + 'unitA': '7 376 376 133 178 486 486 486 486 486 486 486 486 2 486', + 'unitB': '7 499 415 177 7 7 7 7 7 7 136 136 289 289 408' + }] + +# Sample from the SpeechDLM model +generated_units = speech_dlm.sample( + input_sequences, + max_len_a = 0, + max_len_b = 500, + sampling=True, + beam=5, + ) +# >> {'unitA': '7 376 376 133 178 486 486 486 486 486 486 486 486 2 486 486 178 486 486 2 2 376 376 486 486 486 376 376 387 387 ...', +# >> 'unitB': '7 499 415 177 7 7 7 7 7 7 136 136 289 289 408 32 428 95 356 141 331 439 350 350 192 331 445 202 104 104 ...'} +``` + +Or using the `sample_speech_dlm.py` script : +```bash +python sample_speech_dlm.py \ + --in-file $INPUT_CODE_FILE --out-file $OUTPUT_FILE \ + --ckpt $CHECKPOINT_PATH --data $DATA_DIR +``` +where each line of INPUT_CODE_FILE is a dictionary with keys `'audio', 'unitA', 'unitB'` as follows : +``` +{'audio': 'file_1', 'unitA': '8 8 ... 352 352', 'unitB': '217 8 ... 8 8'} +{'audio': 'file_2', 'unitA': '5 5 ... 65 65', 'unitB': '6 35 ... 8 9'} +... +``` +This code file can be created with the script `create_input_code.py` (using the outputs of `quantize_with_kmeans.py` [here](hubert_fisher/#encode-audio-to-discrete-units)) : +```bash +python examples/textless_nlp/dgslm/vocoder_hifigan/create_input_code.py \ + $CHANNEL1_UNITS $CHANNEL2_UNITS $OUTPUT_CODE_FILE +``` + +### Training a SpeechDLM model +#### 1) Data preparation +First, you need to prepare the raw dataset. For each `split` (train, valid), you need two files corresponding to two channels (namely `unitA` and `unitB` for example) containing the units from each channel separately. Make sure that 2 files have the same number of lines and each corresponding line has the same number of units. + +Here is an example of `.unitA` file : +``` +7 376 376 133 178 +486 486 486 +486 376 +``` +and the corresponding `.unitB` file : +``` +7 499 415 177 7 +7 7 136 +331 445 +``` +These two files can be obtained using the [example command](hubert_fisher/#encode-audio-to-discrete-units) of hubert fisher, with the `--hide-fname` option added. 
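
A mismatch between the two channel files usually only surfaces later, during preprocessing or training, so a quick sanity check along these lines can save time (a minimal sketch; the data directory and channel names are placeholders matching the example above):

```python
# Check that the two channel files of each split are parallel: same number of
# lines, and the same number of units on every corresponding line.
from pathlib import Path

raw_data_dir = Path("/path/to/RAW_DATA_DIR")  # placeholder
for split in ["train", "valid"]:
    lines_a = (raw_data_dir / f"{split}.unitA").read_text().splitlines()
    lines_b = (raw_data_dir / f"{split}.unitB").read_text().splitlines()
    assert len(lines_a) == len(lines_b), f"{split}: different number of lines"
    for i, (a, b) in enumerate(zip(lines_a, lines_b)):
        assert len(a.split()) == len(b.split()), f"{split}, line {i}: unit count mismatch"
    print(f"{split}: OK ({len(lines_a)} parallel sequences)")
```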
+ +The raw dataset directory should contain the following files : +``` +train.unitA valid.unitA +train.unitB valid.unitB +``` + +Next preprocess/binarize the data with `fairseq-preprocess`, but make sure to preprocess each channel separately, and **rename** the preprocessed files under the following format `${split}.${channel}.{bin, idx}`. Each channel also needs a separate dictionary file under the name `dict.${channel}.txt` . + +Here is an example pre-processing code : + +```bash +# Preprocess the first channel (unitA) +fairseq-preprocess --source-lang unitA \ + --only-source \ + --trainpref $RAW_DATA_DIR/train \ + --validpref $RAW_DATA_DIR/valid \ + --destdir $BIN_DATA_DIR \ + --workers 20 + +# Preprocess the second channel (unitB) and reuse the dictionary from the first channel +fairseq-preprocess --source-lang unitB \ + --srcdict $BIN_DATA_DIR/dict.unitA.txt \ + --only-source \ + --trainpref $RAW_DATA_DIR/train \ + --validpref $RAW_DATA_DIR/valid \ + --destdir $BIN_DATA_DIR \ + --workers 20 + +# Rename the bin & index files +for channel in unitA unitB; do + for split in train valid; do + mv $BIN_DATA_DIR/${split}.${channel}-None.${channel}.bin $BIN_DATA_DIR/${split}.${channel}.bin + mv $BIN_DATA_DIR/${split}.${channel}-None.${channel}.idx $BIN_DATA_DIR/${split}.${channel}.idx + done +done +``` +Finally, the preprocessed (bin) dataset directory should contain the following files : +``` +dict.unitA.txt train.unitA.idx train.unitA.bin valid.unitA.idx valid.unitA.bin +dict.unitB.txt train.unitB.idx train.unitB.bin valid.unitB.idx valid.unitB.bin +``` + +#### 2) Train the model +To train the SpeechDLM (with the configuration as the pre-trained model) on 2 GPUs : +```bash +fairseq-train $BIN_DATA_DIR \ + --save-dir $CHECKPOINT_DIR \ + --tensorboard-logdir $CHECKPOINT_DIR \ + --task speech_dlm_task --channels unitA,unitB \ + --next-unit-prediction "False" --edge-unit-prediction "True" \ + --duration-prediction "True" --delayed-duration-target "True" \ + --criterion speech_dlm_criterion \ + --arch speech_dlm --decoder-cross-layers 4 \ + --share-decoder-input-output-embed \ + --dropout 0.1 --attention-dropout 0.1 \ + --optimizer adam --adam-betas "(0.9, 0.98)" --clip-norm 1.0 \ + --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 \ + --max-tokens 18432 --tokens-per-sample 6144 --sample-break-mode none \ + --update-freq 16 --num-workers 4 --skip-invalid-size-inputs-valid-test \ + --max-update 250000 --warmup-updates 20000 \ + --save-interval-updates 10000 --keep-last-epochs 1 --no-epoch-checkpoints \ + --log-interval 50 --seed 100501 \ + --fp16 --checkpoint-activations +``` + +#### 3) Validate +The model can be validated via the `fairseq-validate` command : +```bash +fairseq-validate $BIN_DATA_DIR \ + --task speech_dlm_task \ + --path $CHECKPOINT_PATH \ + --max-tokens 6144 +``` + +## Reference + +If you find our work useful in your research, please consider citing our paper: + +```bibtex +@article{nguyen2022dgslm, + title = {Generative Spoken Dialogue Language Modeling}, + author = {Nguyen, Tu Anh and Kharitonov, Eugene and Copet, Jade and Adi, Yossi and Hsu, Wei-Ning and Elkahky, Ali and Tomasello, Paden and Algayres, Robin and Sagot, Benoit and Mohamed, Abdelrahman and Dupoux, Emmanuel}, + eprint={2203.16502}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + year={2022} +} +``` diff --git a/examples/textless_nlp/dgslm/create_code_file.py b/examples/textless_nlp/dgslm/create_code_file.py new file mode 100644 index 0000000000..d10f9484ad --- /dev/null +++ 
b/examples/textless_nlp/dgslm/create_code_file.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + + +def main(): + """ + Create code file with the following format: + {'audio': 'file1', 'unitA': 'file1_chnl1_units', 'unitB': 'file1_chnl2_units'} + {'audio': 'file2', 'unitA': 'file2_chnl1_units', 'unitB': 'file2_chnl2_units'} + ... + + Given the input units files + - channel1_units_file: + file1|file1_chnl1_units + file2|file2_chnl1_units + ... + - channel2_units_file: + file1|file1_chnl2_units + file2|file2_chnl2_units + ... + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "channel1_units_file", + type=str, + help="Units of the first channel.", + ) + parser.add_argument( + "channel2_units_file", + type=str, + help="Units of the second channel.", + ) + parser.add_argument( + "output_file", + type=str, + help="Output file.", + ) + parser.add_argument( + "--channels", + type=str, + default='unitA,unitB', + help="Comma-separated list of the channel names to create in the code" + "(Default: 'unitA,unitB').", + ) + + args = parser.parse_args() + + channel_names = args.channels.split(',') + + with open(args.channel1_units_file) as funit1, \ + open(args.channel2_units_file) as funit2, \ + open(args.output_file, 'w') as fout: + for line1, line2 in zip(funit1, funit2): + fname1, units1 = line1.strip().split('|') + fname2, units2 = line2.strip().split('|') + assert len(units1.split()) == len(units2.split()), \ + f"Mismatch units length ({len(units1.split())} vs {len(units2.split())})" + base_fname1 = fname1[:-9] + base_fname2 = fname2[:-9] + assert base_fname1 == base_fname2, \ + f"Mismatch filenames ({base_fname1} vs {base_fname2}). " \ + f"Expected $filename-channel1 and $filename-channel2 in two files" + code = { + "audio" : base_fname1, + channel_names[0] : units1, + channel_names[1] : units2, + } + fout.write(str(code)) + fout.write("\n") + print(f"Codes written to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/examples/textless_nlp/dgslm/dgslm_utils.py b/examples/textless_nlp/dgslm/dgslm_utils.py new file mode 100644 index 0000000000..8049d49793 --- /dev/null +++ b/examples/textless_nlp/dgslm/dgslm_utils.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np +import torch +import json + +from fairseq import utils +from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder + +# from examples.hubert.simple_kmeans.dump_hubert_feature import HubertFeatureReader +from examples.textless_nlp.gslm.speech2unit.pretrained.hubert_feature_reader import HubertFeatureReader +from examples.hubert.simple_kmeans.dump_km_label import ApplyKmeans + + +# Hubert tokenizer +class HubertTokenizer: + def __init__( + self, + hubert_path, + hubert_layer, + km_path, + use_cuda=True, + ): + self.feature_extractor = HubertFeatureReader(hubert_path, hubert_layer, use_cuda=use_cuda) + self.quantizer = ApplyKmeans(km_path) + if not use_cuda: + self.quantizer.C = self.quantizer.C.cpu() + self.quantizer.Cnorm = self.quantizer.Cnorm.cpu() + + def wav2code(self, path, channel_id=1): + feat = self.feature_extractor.get_feats(path, channel_id=channel_id) + code = self.quantizer(feat) + return ' '.join(map(str, code)) + + def wav2codes(self, path): + codes = [ + self.wav2code(path, channel_id=1), + self.wav2code(path, channel_id=2) + ] + return codes + + +# Vocoder +class HifiganVocoder: + def __init__( + self, + vocoder_path, + vocoder_cfg_path, + use_cuda=True, + ): + with open(vocoder_cfg_path) as f: + cfg = json.load(f) + self.vocoder = CodeHiFiGANVocoder(vocoder_path, cfg).eval() + self.use_cuda = use_cuda + if self.use_cuda: + self.vocoder.cuda() + + def code2wav(self, code, speaker_id=0, pred_dur=False): + if isinstance(code, str): + code = list(map(int, code.split())) + inp = {"code": torch.LongTensor(code).view(1, -1)} + if self.vocoder.model.multispkr: + inp["spkr"] = torch.LongTensor([speaker_id]).view(1, 1) + if self.use_cuda: + inp = utils.move_to_cuda(inp) + return self.vocoder(inp, pred_dur).detach().cpu().numpy() + + def codes2wav(self, codes, speaker_ids=[0, 4], pred_dur=False): + if isinstance(codes, dict): + codes = list(codes.values()) + assert len(codes) == 2 + wav1 = self.code2wav(codes[0], speaker_ids[0], pred_dur) + wav2 = self.code2wav(codes[1], speaker_ids[1], pred_dur) + wav = np.stack([wav1, wav2]) + return wav diff --git a/examples/textless_nlp/dgslm/hubert_fisher/README.md b/examples/textless_nlp/dgslm/hubert_fisher/README.md new file mode 100644 index 0000000000..52c528fa1e --- /dev/null +++ b/examples/textless_nlp/dgslm/hubert_fisher/README.md @@ -0,0 +1,47 @@ +# Dialogue Speech-to-Unit Encoder for dGSLM: The Fisher HuBERT model +For the speech2unit encoder, we train a [HuBERT model](https://arxiv.org/pdf/2106.07447.pdf) on the [Fisher dataset](http://www.lrec-conf.org/proceedings/lrec2004/pdf/767.pdf) for 3 iterations (see [our paper](https://arxiv.org/pdf/2203.16502.pdf) for more details) and train a k-means model with 500 units on the layer 12 features of the HuBERT model. 
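
Conceptually, the speech2unit step assigns every HuBERT frame to its nearest k-means centroid. A minimal numpy illustration of that assignment step follows (this is not the repo's `ApplyKmeans` implementation, and the shapes below are made up for the example):

```python
import numpy as np

def quantize(features: np.ndarray, centroids: np.ndarray) -> np.ndarray:
    """Assign each frame (row of `features`, T x D) to the nearest of K centroids (K x D)."""
    # Squared Euclidean distance between every frame and every centroid: (T, K)
    dists = ((features[:, None, :] - centroids[None, :, :]) ** 2).sum(-1)
    return dists.argmin(axis=1)  # (T,) cluster ids, i.e. the discrete units

# Toy example: 10 frames of 768-dim features, 500 clusters
units = quantize(np.random.randn(10, 768), np.random.randn(500, 768))
print(" ".join(map(str, units)))
```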
+ +## Model checkpoints +The pre-trained HuBERT and k-means model checkpoints can be found here: + +| Fisher HuBERT model | k-means model | +|---------------------|---------------| +|[download](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/hubert/hubert_fisher.pt)|[download](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/hubert/hubert_fisher_km_500.bin)| + + +## Encode audio to discrete units +Below is an example command to encode a stereo dataset to discrete units using the pre-trained model checkpoints : +```bash +for CHANNEL_ID in 1 2; do + python examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py \ + --feature_type hubert \ + --kmeans_model_path path/to/hubert_fisher_km_500.bin \ + --acoustic_model_path path/to/hubert_fisher.pt \ + --layer 12 \ + --manifest_path $MANIFEST_FILE \ + --out_quantized_file_path ${OUTPUT_FILE}-channel${CHANNEL_ID} \ + --extension $EXTENSION \ + --channel_id $CHANNEL_ID +done +``` +where MANIFEST_FILE is the output of [wav2vec manifest script](https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/wav2vec_manifest.py), which can be obtained through the following command : +``` +python examples/wav2vec/wav2vec_manifest.py --valid-percent=0.0 $AUDIO_DIR --dest=$OUTPUT_DIR --ext=$EXTENSION +``` + +Otherwise, you can encode an audio file in python interactively with the HubertTokenizer class : +```python +# Load the Hubert tokenizer +from examples.textless_nlp.dgslm.dgslm_utils import HubertTokenizer +encoder = HubertTokenizer( + hubert_path = "/path/to/hubert_ckpt.pt", + hubert_layer = 12, + km_path = "path/to/km.bin" +) + +# Encode the audio to units +path = "/path/to/stereo/audio.wav" +codes = encoder.wav2codes(path) +# > ['7 376 376 133 178 486 486 486 486 486 486 486 486 2 486', +# > '7 499 415 177 7 7 7 7 7 7 136 136 289 289 408'] +``` \ No newline at end of file diff --git a/examples/textless_nlp/dgslm/sample_speech_dlm.py b/examples/textless_nlp/dgslm/sample_speech_dlm.py new file mode 100644 index 0000000000..484cbabd3e --- /dev/null +++ b/examples/textless_nlp/dgslm/sample_speech_dlm.py @@ -0,0 +1,202 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import ast +import argparse +import logging +import torch + +from fairseq import utils +from fairseq.models.speech_dlm import SpeechDLM + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def load_data(in_file): + with open(in_file) as f: + data = [ast.literal_eval(line.strip()) for line in f] + return data + + +def write_data(out_file, data): + with open(out_file, 'w') as f: + for d in data: + f.write(str(d)) + f.write('\n') + + +def limit(codes, n): + new_codes = {} + for k, v in codes.items(): + new_codes[k] = ' '.join(v.split()[:n]) + return new_codes + + +def main(args): + logger.info(args) + + use_cuda = torch.cuda.is_available() + + # Load the data + data = load_data(args.in_file) + channels = args.channels.split(',') + unit_sequences = [{ + channels[0]: d[channels[0]], + channels[1]: d[channels[1]], + } for d in data] + fnames = [d['audio'] for d in data] + print(f"Found {len(data)} sequences from {args.in_file}") + + # Limit the prefix size + if args.prefix_size is not None: + print(f"Limit the prefix size to {args.prefix_size}") + unit_sequences = [limit(codes, args.prefix_size) for codes in unit_sequences] + + # Load model from ckpt + print(f"Loading the SpeechDLM model from {args.ckpt}") + model = SpeechDLM.from_pretrained( + model_name_or_path=os.path.dirname(args.ckpt), + checkpoint_file=os.path.basename(args.ckpt), + data_name_or_path=args.data + ) + model.eval() + if use_cuda: + model.cuda() + + # Set batch sizes + model.cfg.dataset.max_tokens = args.batch_max_tokens + model.max_positions = args.batch_max_positions + if args.batch_max_sentences is not None: + model.cfg.dataset.batch_size = args.batch_max_sentences + + # Set seed (if needed) + if args.seed is not None: + utils.set_torch_seed(args.seed) + + # Sample from the SpeechDLM model + print(f"Generating {len(unit_sequences)} sequences with SpeechDLM model...\n" + f"Generation args: sampling={(not args.beam_search)}, " + f"sampling_topk={args.sampling_topk}, sampling_topp={args.sampling_topp}, " + f"beam={args.beam_size}, min_len={args.min_len}, " + f"max_len_a={args.max_len_a}, max_len_b={args.max_len_b}, " + f"temperature={args.temperature}, dur_temperature={args.dur_temperature}, " + f"seed={args.seed}") + generated_units = model.sample( + unit_sequences, + sampling=(not args.beam_search), + sampling_topk=args.sampling_topk, + sampling_topp=args.sampling_topp, + beam=args.beam_size, + max_len_a=args.max_len_a, + max_len_b=args.max_len_b, + min_len=args.min_len, + temperature=args.temperature, + duration_temperature=args.dur_temperature, + verbose=args.verbose, + skip_invalid_size_inputs=args.skip_invalid_size_batch, + ) + + # Create the generated sequences + generated_data = [] + for fname, gen_units in zip(fnames, generated_units): + d = { + "audio" : fname+'-generated', + **gen_units + } + generated_data.append(d) + + # Write the generated sequences + print(f"Write the generated units to {args.out_file}") + if args.out_file: + os.makedirs(os.path.dirname(args.out_file), exist_ok=True) + write_data(args.out_file, generated_data) + + +def cli_main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--in-file", + type=str, + required=True, + help="Input file following the same format of the output from create_input.py", + ) + parser.add_argument( + "--ckpt", + type=str, + required=True, + help="Path to the model checkpoint." 
+ ) + parser.add_argument( + "--data", + type=str, + required=True, + help="path to the model data dir (containing dict files)", + ) + parser.add_argument( + "--out-file", + type=str, + required=True, + help="Path of the output file.", + ) + parser.add_argument( + "--channels", + type=str, + default='unitA,unitB', + help="Comma-separated list of the channel names" + "(Default: 'unitA,unitB').", + ) + parser.add_argument("--prefix-size", type=int, default=None, + help='Limit the prefix size') + + # Batch sizes + parser.add_argument("--batch-max-tokens", type=int, default=9216, + help='maximum number of tokens considered in a batch') + parser.add_argument("--batch-max-positions", type=int, default=6144, + help='maximum number of tokens allowed for a sentence in a batch') + parser.add_argument("--batch-max-sentences", type=int, default=None, + help='maximum number of sentences considered in a batch') + parser.add_argument("--skip-invalid-size-batch", action='store_true', + help='skip sentences with more tokens than --batch-max-positions') + + # Generation args + parser.add_argument("--beam-search", action='store_true', + help='perform beam search instead of sampling') + parser.add_argument("--beam-size", type=int, default=5, + help="beam width (used in both sampling and beam search mode) " + "(default: 5)") + parser.add_argument("--sampling-topk", type=int, default=-1, + help="only sample from top-k candidates (default: -1, non applied)") + parser.add_argument("--sampling-topp", type=float, default=-1.0, + help="only sample among the smallest set of elements whose cumulative " + "probability mass exceeds p (default: -1.0, non applied)") + parser.add_argument("--max-len-a", type=int, default=0, + help="generate sequences of maximum length ax + b, " + "where x is the source length (default: 0)") + parser.add_argument("--max-len-b", type=int, default=500, + help="generate sequences of maximum length ax + b, " + "where x is the source length (default: 500 ~ 10s)") + parser.add_argument("--min-len", type=int, default=1, + help="generate sequences of maximum length ax + b, " + "where x is the source length (default: 1)") + parser.add_argument("--temperature", type=float, default=1.0, + help="temperature when generating unit tokens (default: 1.0)") + parser.add_argument("--dur-temperature", type=float, default=1.0, + help="temperature when generating duration tokens (default: 1.0)") + parser.add_argument("--verbose", action='store_true', + help="print the scores given by the model to generated sequences") + parser.add_argument("--seed", type=int, default=123, + help="seed of the generation model") + + args = parser.parse_args() + + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/textless_nlp/dgslm/vocoder_hifigan/README.md b/examples/textless_nlp/dgslm/vocoder_hifigan/README.md new file mode 100644 index 0000000000..5d4a59a9ac --- /dev/null +++ b/examples/textless_nlp/dgslm/vocoder_hifigan/README.md @@ -0,0 +1,47 @@ +# Dialogue Unit-to-Speech Decoder for dGSLM +For the unit2speech decoder, we train a [discrete unit-based HiFi-GAN vocoder](https://arxiv.org/pdf/2104.00355.pdf) on the [Fisher dataset](http://www.lrec-conf.org/proceedings/lrec2004/pdf/767.pdf). 
+ +## Model checkpoint +The pre-trained model checkpoint can be found here : + +| HiFi-GAN vocoder based on HuBERT Fisher Units | +|-----------------------------------------------| +|[model checkpoint](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/hifigan/hifigan_vocoder) - [config](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/hifigan/config.json) | + +## Decode discrete units to audio +To create waveform from discrete units, use the script `generate_stereo_waveform.py` : +```bash +python examples/textless_nlp/dgslm/vocoder_hifigan/generate_stereo_waveform.py \ + --in-file $INPUT_CODE_FILE \ + --vocoder $VOCODER_PATH \ + --vocoder-cfg $VOCODER_CONFIG \ + --results-path $OUTPUT_DIR +``` +where INPUT_CODE_FILE is expected to have the following format : +``` +{'audio': 'file_1', 'unitA': '8 8 ... 352 352', 'unitB': '217 8 ... 8 8'} +{'audio': 'file_2', 'unitA': '5 5 ... 65 65', 'unitB': '6 35 ... 8 9'} +... +``` + +You can also use the HifiganVocoder class to generate waveform from the codes interactively : +```python +# Load the Hifigan vocoder +from examples.textless_nlp.dgslm.dgslm_utils import HifiganVocoder +decoder = HifiganVocoder( + vocoder_path = "/path/to/hifigan_vocoder", + vocoder_cfg_path = "/path/to/config.json", +) + +# Decode the units to waveform +codes = [ + '7 376 376 133 178 486 486 486 486 486 486 486 486 2 486', + '7 499 415 177 7 7 7 7 7 7 136 136 289 289 408', +] +wav = decoder.codes2wav(codes) +# > array of shape (2, 4800) + +# Play the waveform +import IPython.display as ipd +ipd.Audio(wav, rate=16_000) +``` diff --git a/examples/textless_nlp/dgslm/vocoder_hifigan/generate_stereo_waveform.py b/examples/textless_nlp/dgslm/vocoder_hifigan/generate_stereo_waveform.py new file mode 100644 index 0000000000..1e15f43241 --- /dev/null +++ b/examples/textless_nlp/dgslm/vocoder_hifigan/generate_stereo_waveform.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import ast +import argparse +import json +import logging +from pathlib import Path +import soundfile as sf +import torch + +from tqdm import tqdm + +from fairseq import utils +from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder + + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def dump_result(args, data, sample_id, pred_wav): + assert "audio" in data or args.results_path is not None + if args.results_path: + fname = Path(data["audio"]).stem + ".wav" if "audio" in data else f"{sample_id}_pred.wav" + out_file = Path(args.results_path) / fname + + sf.write( + out_file.as_posix(), + pred_wav.detach().cpu().numpy(), + args.sample_rate, + ) + + +def load_data(in_file): + with open(in_file) as f: + data = [ast.literal_eval(line.strip()) for line in f] + + return data + + +def load_vocoder(vocoder_path, vocoder_cfg_path, use_cuda=True): + with open(vocoder_cfg_path) as f: + cfg = json.load(f) + vocoder = CodeHiFiGANVocoder(vocoder_path, cfg).eval() + if use_cuda: + vocoder = vocoder.cuda() + return vocoder + + +def code2wav(vocoder, code, speaker_id, use_cuda=True): + if isinstance(code, str): + code = list(map(int, code.split())) + inp = dict() + inp["code"] = torch.LongTensor(code).view(1, -1) + if vocoder.model.multispkr: + inp["spkr"] = torch.LongTensor([speaker_id]).view(1, 1) + if use_cuda: + inp = utils.move_to_cuda(inp) + return vocoder(inp) + + +def main(args): + logger.info(args) + + use_cuda = torch.cuda.is_available() and not args.cpu + + vocoder = load_vocoder(args.vocoder, args.vocoder_cfg, use_cuda) + + data = load_data(args.in_file) + + if args.results_path: + Path(args.results_path).mkdir(exist_ok=True, parents=True) + + channels = args.channels.split(',') + speakers = [args.channel1_spk, args.channel2_spk] + + for i, d in tqdm(enumerate(data), total=len(data)): + wavs = [] + for key, speaker_id in zip(channels, speakers): + wav = code2wav(vocoder, d[key], speaker_id, use_cuda=use_cuda) + wavs.append(wav) + + wav = torch.stack(wavs, dim=-1) + if args.mix: + wav = torch.mean(wav, dim=-1) + + dump_result(args, d, i, wav) + + +def cli_main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--in-file", + type=str, + required=True, + help="Input file following the same format of the output from create_input.py", + ) + parser.add_argument( + "--vocoder", type=str, required=True, help="path to the vocoder" + ) + parser.add_argument( + "--vocoder-cfg", + type=str, + required=True, + help="path to the vocoder config", + ) + parser.add_argument( + "--channels", + type=str, + default='unitA,unitB', + help="Comma-separated list of the channel names" + "(Default: 'unitA,unitB').", + ) + parser.add_argument("--sample-rate", type=int, default=16_000) + parser.add_argument( + "--results-path", + type=str, + default=None, + help="Output directory. 
If not set, the audios will be stored following the 'audio' field specified in the input file", + ) + parser.add_argument("--channel1-spk", type=int, default=0, help="Speaker of the first channel",) + parser.add_argument("--channel2-spk", type=int, default=4, help="Speaker of the second channel",) + parser.add_argument("--mix", action="store_true", help="Mix the two channels to create output mono files") + parser.add_argument("--cpu", action="store_true", help="run on CPU") + + args = parser.parse_args() + + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/textless_nlp/gslm/README.md b/examples/textless_nlp/gslm/README.md new file mode 100644 index 0000000000..7a76ffd57c --- /dev/null +++ b/examples/textless_nlp/gslm/README.md @@ -0,0 +1,21 @@ +# Generative Spoken Language Modeling + +* [Paper](https://arxiv.org/abs/2102.01192) +* [Demo](https://speechbot.github.io/gslm/index.html) + +We build and evaluate generative speech2speech systems using [Log Mel Filtebank](https://pytorch.org/audio/stable/compliance.kaldi.html#fbank), [Modified CPC](https://github.com/facebookresearch/CPC_audio), [HuBERT Base](https://github.com/pytorch/fairseq/tree/main/examples/hubert) and [Wav2Vec 2.0 Large](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). Our system is composed of three components, namely, *speech2unit*, *ulm* and *unit2speech*. We explain about models and usage of these components in their respective sub-directories. See the links below. + +## Speech to Unit Model (speech2unit) +Speech to unit model is used for quantizing raw speech into learned discrete speech units. [More details](speech2unit) + +## Unit Language Model (ulm) +Unit Language Model is a generative language model trained on discrete speech units. [More details](ulm) + +## Unit to Speech Model (unit2speech) +Unit to speech model is used for synthesizing speech from discrete speech units. [More details](unit2speech) + +## Metrics +We show how to compute ASR based metrics as well as zero-shot metrics proposed in our paper [here](metrics). + +## Tools +We share two tools to resynthesize a given spoken utterance, and generate novel spoken language given a spoken prompt. [More detail](tools) diff --git a/examples/textless_nlp/gslm/metrics/README.md b/examples/textless_nlp/gslm/metrics/README.md new file mode 100644 index 0000000000..0a63e2f0d8 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/README.md @@ -0,0 +1,10 @@ +# GSLM Metrics + +## ASR Metrics +The suite of metrics here uses an ASR model to transcribe the synthesized speech into text, and then uses text-based metrics. We also use word error rate from ASR transcription itself as one of the metrics. [More details](asr_metrics) + +## ABX Metrics +We use [ABX](https://www.semanticscholar.org/paper/ABX-Discriminability-Measures-and-Applications-Schatz/13d3537228f728c1063cc83743cb118bba3367a0) to evaluate how well-separated phonetic categories are with quantized representations. [More details](abx_metrics) + +## sWUGGY and sBLIMP +We refer to [ZeroSpeech challenge](https://www.zerospeech.com/2021/track_s.html#scoring-based-metrics) for details on the sWUGGY and sBLIMP metrics. 
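+
+## Note on WER
+The ASR metrics above rely on the standard Levenshtein-based word error rate. The snippet below is a minimal, self-contained illustration of that computation (it is not the exact implementation used by the evaluation scripts):
+```python
+def wer(ref_words, hyp_words):
+    """(substitutions + deletions + insertions) / len(reference), via edit distance."""
+    d = [[0] * (len(hyp_words) + 1) for _ in range(len(ref_words) + 1)]
+    for i in range(len(ref_words) + 1):
+        d[i][0] = i  # delete every reference word
+    for j in range(len(hyp_words) + 1):
+        d[0][j] = j  # insert every hypothesis word
+    for i in range(1, len(ref_words) + 1):
+        for j in range(1, len(hyp_words) + 1):
+            sub = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
+            d[i][j] = min(d[i - 1][j] + 1,        # deletion
+                          d[i][j - 1] + 1,        # insertion
+                          d[i - 1][j - 1] + sub)  # match / substitution
+    return d[-1][-1] / max(1, len(ref_words))
+
+print(wer("the cat sat on the mat".split(), "the cat sat mat".split()))  # 0.333...
+```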
diff --git a/examples/textless_nlp/gslm/metrics/abx_metrics/README.md b/examples/textless_nlp/gslm/metrics/abx_metrics/README.md new file mode 100644 index 0000000000..aa2560f045 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/abx_metrics/README.md @@ -0,0 +1,77 @@ +# ABX-based evaluation + +ABX is used to evaluate the quality of the obtained discrete units. + +The life cycle of the ABX-based evaluation for the Speech-to-Unit contains the following steps: +1. Training an acoustic model (or use an existing acoustic model) ([description](./../..)) +2. Perform quantization of speech by learning a K-means clustering model ([description](./../..)) +3. Compute discrete features for ABX computation using the learned clusters +4. Compute the ABX score over the discrete features taking advantage of [libri-light's ABX evaluation script][ll-abx] + +Here we assume that you already went throught the first two steps and focus solely on extracting features and computing ABX scores. + +## Libri-light setup + +Follow [libri-light's instructions][ll-instructions] for installation and [ABX evaluation setup][ll-abx] (including the download of the data items required for ABX computation). + +## Computing ABX + +### Dumping quantized features + +The first step for the ABX computation is to dump the quantized representations corresponding to the test files. + +```shell +TYPE="hubert" +LAYER=6 +CKPT_PATH="<PATH_TO_HUBERT_MODEL_CHECKPOINT_FILE>" +KM_MODEL_PATH="<PATH_TO_PRETRAINED_KM_MODEL_FILE>" + +SUBSET="dev-clean" +MANIFEST="<PATH_TO_MANIFEST_FOR_LS_DEV-CLEAN>" +DATA_DIR="<PATH_TO_DIR_TO_STORE_FEATURES>/$SUBSET" + +PYTHONPATH=. python examples/textless_nlp/gslm/metrics/abx_metrics/dump_abx_feats.py \ + --feature_type $TYPE \ + --kmeans_model_path $KM_MODEL_PATH \ + --checkpoint_path $CKPT_PATH \ + --layer $LAYER \ + --manifest_path $MANIFEST \ + --out_dir_path $DATA_DIR \ + --extension ".flac" +``` + +Again the manifest file follows the same structure than elsewhere in the codebase. + +### Compute ABX with Libri-light + +Use libri-light's `eval_ABX.py` script (within the appropriate environment set up) as followed: + +```shell +LIBRILIGHT_ROOT="<PATH_TO_LIBRILIGHT>" + +SUBSET="dev-clean" +DATA_DIR="<PATH_TO_DIR_TO_STORE_FEATURES>/$SUBSET" +ITEM_FILE_PATH="$LIBRILIGHT_ROOT/eval/ABX_data/$SUBSET.item" +OUT_DIR="<PATH_TO_DIR_TO_STORE_ABX_SCORES>/$SUBSET" + +FILE_EXTENSION=".npy" +FEATURE_SIZE=0.02 # depends on the model used + +PYTHONPATH=$LIBRILIGHT_ROOT \ + python $LIBRILIGHT_ROOT/eval/eval_ABX.py \ + $DATA_DIR \ + $ITEM_FILE_PATH \ + --file_extension $FILE_EXTENSION \ + --feature_size $FEATURE_SIZE \ + --out $OUT_DIR \ + --mode "all" +``` + +Note that `FEATURE_SIZE` will depend on the model type you are using to extract the acoustic features: +* For HuBERT and Wav2Vec2.0, use `FEATURE_SIZE=0.02` +* For CPC and Log Mel, use `FEATURE_SIZE=0.01` + +If you have a gpu available, make sure you add the `--cuda` flag for faster computation. + +[ll-instructions]: https://github.com/facebookresearch/libri-light +[ll-abx]: https://github.com/facebookresearch/libri-light/tree/master/eval#abx diff --git a/examples/textless_nlp/gslm/metrics/abx_metrics/dump_abx_feats.py b/examples/textless_nlp/gslm/metrics/abx_metrics/dump_abx_feats.py new file mode 100644 index 0000000000..41cf558970 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/abx_metrics/dump_abx_feats.py @@ -0,0 +1,107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os + +import joblib +import numpy as np + +from examples.textless_nlp.gslm.speech2unit.clustering.utils import get_audio_files +from examples.textless_nlp.gslm.speech2unit.pretrained.utils import get_features + +def get_logger(): + log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" + logging.basicConfig(format=log_format, level=logging.INFO) + logger = logging.getLogger(__name__) + return logger + +def get_parser(): + parser = argparse.ArgumentParser( + description="Quantize using K-means clustering over acoustic features." + ) + parser.add_argument( + "--feature_type", + type=str, + choices=["logmel", "hubert", "w2v2", "cpc"], + default=None, + required=True, + help="Acoustic feature type", + ) + parser.add_argument( + "--kmeans_model_path", + type=str, + required=True, + help="K-means model file path to use for inference", + ) + parser.add_argument( + "--manifest_path", + type=str, + default=None, + help="Manifest file containing the root dir and file names", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + help="Pretrained model checkpoint", + ) + parser.add_argument( + "--layer", + type=int, + help="The layer of the pretrained model to extract features from", + default=-1, + ) + parser.add_argument( + "--out_dir_path", + required=True, + type=str, + help="File path of quantized output.", + ) + parser.add_argument( + "--extension", type=str, default=".flac", help="Features file path" + ) + return parser + + +def one_hot(feat, n_clusters): + return np.eye(n_clusters)[feat] + +def main(args, logger): + # Feature extraction + logger.info(f"Extracting {args.feature_type} acoustic features...") + features_batch = get_features( + feature_type=args.feature_type, + checkpoint_path=args.checkpoint_path, + layer=args.layer, + manifest_path=args.manifest_path, + sample_pct=1.0, + flatten=False, + ) + logger.info(f"Features extracted for {len(features_batch)} utterances.\n") + logger.info(f"Dimensionality of representation = {features_batch[0].shape[1]}") + + logger.info(f"Loading K-means model from {args.kmeans_model_path} ...") + kmeans_model = joblib.load(open(args.kmeans_model_path, "rb")) + kmeans_model.verbose = False + + _, fnames, _ = get_audio_files(args.manifest_path) + + os.makedirs(args.out_dir_path, exist_ok=True) + logger.info(f"Writing quantized features to {args.out_dir_path}") + for i, feats in enumerate(features_batch): + pred = kmeans_model.predict(feats) + emb = one_hot(pred, kmeans_model.n_clusters) + base_fname = os.path.basename(fnames[i]).rstrip(args.extension) + output_path = os.path.join(args.out_dir_path, f"{base_fname}.npy") + with open(output_path, "wb") as f: + np.save(f, emb) + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + logger = get_logger() + logger.info(args) + main(args, logger) diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/README.md b/examples/textless_nlp/gslm/metrics/asr_metrics/README.md new file mode 100644 index 0000000000..90741f42b0 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/README.md @@ -0,0 +1,87 @@ +# ASR-based evaluation + +Overall, the life cycle of the ASR-based evaluation for an ULM contains the following steps: + 1. Training an ULM and sampling from it [[description]](./../../ulm) + 2. Running UTS on the sampled unit sequences [[description]](./../../unit2speech) + 3. 
Pre-processing for the ASR (down-sampling to 16 kHz, aligning the length of the generated audio with the ground-truth utterances)
+ 4. Running ASR
+ 5. Calculation of the post-ASR evaluation metrics
+
+Here we assume that you have already gone through the first two steps and focus on the rest.
+
+## Preprocessing
+### Down-sampling to 16 kHz
+The bulk conversion can be done by running
+```bash
+python $FAIRSEQ_ROOT/examples/textless_nlp/gslm/unit2speech/convert_to_16k.py $UTS_OUTPUT $UTS_OUTPUT_DOWNSAMPLE
+```
+where `$UTS_OUTPUT` specifies the directory with the generated audio and `$UTS_OUTPUT_DOWNSAMPLE` is the directory where the downsampled audio will be saved.
+
+### Matching by length
+This step is optional. However, if you want to compare the fluency and diversity of a generated speech utterance to that of the ground-truth speech with the same prefix, it is a good idea to force them to have the same length.
+```bash
+python $FAIRSEQ_ROOT/examples/textless_nlp/gslm/metrics/asr_metrics/misc/cut_as.py \
+    --samples_dir=$UTS_OUTPUT_DOWNSAMPLE --out_dir=$UTS_OUTPUT_DOWNSAMPLE_CUT \
+    --prompts_description=data/ground_truth_continuation_dev.json
+```
+
+Here `ground_truth_continuation_dev.json` is a JSON file with ground-truth text from LibriSpeech dev-clean, associated with some metadata (assuming the evaluation is done on dev-clean). This file can be downloaded [[here]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/ground_truth_continuation_dev.json). A similar file for test-clean is [[here]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/ground_truth_continuation_test.json). These files are used for the evaluation and contain texts for audio sequences that are at least 6s long.
+
+## Running ASR
+We use a pre-trained wav2vec model to run the ASR step. We first need to prepare manifest files which, roughly, tell the ASR system which files we want to transcribe. You can find more details and download the `960h_scratch.pt` checkpoint
+[[here]](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md). To run ASR, you will also need to
+install KenLM, the Flashlight decoder, and download the KenLM 4-gram English language model.
+
+```bash
+python $FAIRSEQ_ROOT/examples/wav2vec/wav2vec_manifest.py \
+    $UTS_OUTPUT_DOWNSAMPLE_CUT --valid-percent 0.0 --dest $MANIFEST_DIR --ext wav
+```
+where `$UTS_OUTPUT_DOWNSAMPLE_CUT` specifies the directory with the preprocessed UTS outputs and `$MANIFEST_DIR` is the output directory.
+
+We will be running an out-of-the-box evaluation script which requires ground-truth transcripts to measure quality metrics.
We are only +interested in the transcripts (and we don't have ground-truth outputs for when our ULM generated!), hence we will just generate +some dummy transcripts instead: +```bash +cp $FAIRSEQ_ROOT/examples/textless_nlp/gslm/asr_metrics/misc/dict.ltr.txt $MANIFEST_DIR +python $FAIRSEQ_ROOT/examples/textless_nlp/gslm/asr_metrics/misc/dummy_asr_data.py --tsv=$MANIFEST_DIR/train.tsv \ + --output-dir=$MANIFEST_DIR +``` + +Now we are ready for running ASR: +``` +mkdir -p asr +python $FAIRSEQ_ROOT/examples/speech_recognition/infer.py \ + $MANIFEST_DIR \ + --task audio_pretraining --nbest 1 --path 960h_scratch.pt \ + --gen-subset=train --results-path $PATH_TO_ASR_OUTPUT \ + --w2l-decoder kenlm --lm-model 4-gram.bin \ + --lexicon librispeech/lexicon_ltr.lst --word-score -1 \ + --sil-weight 0 --lm-weight 2 --criterion ctc --labels ltr --max-tokens 300000 --remove-bpe letter +``` +where `lexicon_ltr.lst` is the LibriSpeech lexicon and `$PATH_TO_ASR_OUTPUT` is the output directory (can be downloaded [[here]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/lexicon_ltr.lst)). + +## Evaluation metrics +We run evaluation on the 1_000 shortest sequences that are at least 6s long. To filter those from the ASR transcript, we additionally provide each metric script with the paths to the manifest and `ground_truth_continuation_*` files. + +### Perplexity (PPX) +To get a PPX metric estimate on an ASR transcript, you need to run the following command: +```bash +python ppx.py $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt --cut-tail\ + --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json +``` +where `--cut-tail` tells the script to ignore the last token on each line (ASR puts the sequence ID there). + +### Self- and Auto-BLEU +```bash +python self_bleu.py $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt --cut-tail \ + --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json +``` + +### Continuation-BLEU +```bash +python continuation_eval.py --asr-transcript $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt \ + --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json +``` + +### AUC +Based on the metrics calculated above, we can estimate the AUC of the perplexity/diversity trade-off. We provide an illustration in a [Colab notebook](https://colab.research.google.com/drive/1pVPfOVax_PU3MkYdHRSsa-SI8GBUldNt?usp=sharing). diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/continuation_eval.py b/examples/textless_nlp/gslm/metrics/asr_metrics/continuation_eval.py new file mode 100644 index 0000000000..72b92a341d --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/continuation_eval.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
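+
+# Computes continuation BLEU2: for every selected prompt (the shortest ones that
+# are at least 6s long, see --take-shortest), BLEU-2 is computed between the ASR
+# transcripts of the generated continuations and the ground-truth continuation
+# text from --prompts-description (equal unigram/bigram weights, geometric mean,
+# no brevity penalty), and the median over prompts is reported.
+# Example invocation (a sketch; paths are placeholders):
+#
+#   python continuation_eval.py \
+#       --asr-transcript $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt \
+#       --manifest=$MANIFEST_DIR/train.tsv \
+#       --prompts-description=data/ground_truth_continuation_dev.json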
+ + +from collections import defaultdict +import numpy as np +from misc.bleu_utils import sentence_bleu +import json +import warnings + + +def get_args(): + import argparse + + parser = argparse.ArgumentParser("Tool to calculate Continuation-BLEU2") + parser.add_argument('--asr-transcript', type=str, + help='Path to the transcript file.') + parser.add_argument('--prompts-description', type=str, + help='Path to the ground-truth continuation') + parser.add_argument('--manifest', type=str, required=True) + parser.add_argument('--take-shortest', type=int, default=1000) + + args = parser.parse_args() + + return args + + +def main(): + # NLTK produces warnings + warnings.filterwarnings("ignore") + + args = get_args() + + with open(args.prompts_description, 'r') as fin: + original_continuations = json.loads(fin.read()) + + sequence2length = [(k, v[0]) for k, v in original_continuations.items()] + assert all(float(v) >= 6.0 for (_, v) in sequence2length) # 6 seconds + + sequence2length.sort(key=lambda x: x[1]) + to_take = set(v[0] for v in sequence2length[:args.take_shortest]) + + with open(args.manifest, 'r') as fin: + fin.readline() + + linenum2file = dict([ + (i, l.split("__")[0]) for (i, l) in enumerate(fin) + ]) + + max_files = max(linenum2file.keys()) + continuations = defaultdict(list) + + mean_length_after = 0 + n_examples = 0 + + with open(args.asr_transcript, 'r') as fin: + for line in fin: + n_examples += 1 + line = line.split() + sequence_id = int(line[-1].split('-')[1][:-1]) + + assert sequence_id <= max_files + + sequence_name = linenum2file[sequence_id] + + continuations[sequence_name].append(line[:-1]) + mean_length_after += len(line) + + mean_length_after /= n_examples + print(f'Mean length of continuations, in words: {mean_length_after}') + metric_values = [] + + mean_ground_truth_words = 0 + n_examples = 0 + n_candidates = 0 + + for k, candidates in continuations.items(): + if k not in to_take: + continue + + n_examples += 1 + + ground_truth = original_continuations[k][1].split() + n_candidates += len(candidates) + bleu = sentence_bleu(candidates, ground_truth, weights=( + 0.5, 0.5), no_length_penalty=True, averaging_mode="geometric") + mean_ground_truth_words += len(ground_truth) + + metric_values.append(bleu) + + n = len(metric_values) + print( + f'Median BLEU over {n} examples: {np.median(metric_values)} +- {np.std(metric_values) / np.sqrt(n)}') + + +if __name__ == '__main__': + main() diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/misc/bleu_utils.py b/examples/textless_nlp/gslm/metrics/asr_metrics/misc/bleu_utils.py new file mode 100644 index 0000000000..75cc5272d3 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/misc/bleu_utils.py @@ -0,0 +1,166 @@ +""" + +TODO: the code is take from Apache-2 Licensed NLTK: make sure we do this properly! + + +Copied over from nltk.tranlate.bleu_score. This code has two major changes: + - allows to turn off length/brevity penalty --- it has no sense for self-bleu, + - allows to use arithmetic instead of geometric mean +""" + +import math +import sys +from fractions import Fraction +import warnings +from collections import Counter +from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, SmoothingFunction + + +def corpus_bleu( + list_of_references, + hypotheses, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, + averaging_mode="geometric", + no_length_penalty=False +): + """ + Calculate a single corpus-level BLEU score (aka. 
system-level BLEU) for all + the hypotheses and their respective references. + + Instead of averaging the sentence level BLEU scores (i.e. marco-average + precision), the original BLEU metric (Papineni et al. 2002) accounts for + the micro-average precision (i.e. summing the numerators and denominators + for each hypothesis-reference(s) pairs before the division). + + >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', + ... 'ensures', 'that', 'the', 'military', 'always', + ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] + >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', + ... 'ensures', 'that', 'the', 'military', 'will', 'forever', + ... 'heed', 'Party', 'commands'] + >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', + ... 'guarantees', 'the', 'military', 'forces', 'always', + ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] + >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', + ... 'army', 'always', 'to', 'heed', 'the', 'directions', + ... 'of', 'the', 'party'] + + >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', + ... 'interested', 'in', 'world', 'history'] + >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', + ... 'because', 'he', 'read', 'the', 'book'] + + >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] + >>> hypotheses = [hyp1, hyp2] + >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS + 0.5920... + + The example below show that corpus_bleu() is different from averaging + sentence_bleu() for hypotheses + + >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) + >>> score2 = sentence_bleu([ref2a], hyp2) + >>> (score1 + score2) / 2 # doctest: +ELLIPSIS + 0.6223... + + :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses + :type list_of_references: list(list(list(str))) + :param hypotheses: a list of hypothesis sentences + :type hypotheses: list(list(str)) + :param weights: weights for unigrams, bigrams, trigrams and so on + :type weights: list(float) + :param smoothing_function: + :type smoothing_function: SmoothingFunction + :param auto_reweigh: Option to re-normalize the weights uniformly. + :type auto_reweigh: bool + :return: The corpus-level BLEU score. + :rtype: float + """ + # Before proceeding to compute BLEU, perform sanity checks. + + p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. + p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. + hyp_lengths, ref_lengths = 0, 0 + + assert len(list_of_references) == len(hypotheses), ( + "The number of hypotheses and their reference(s) should be the " "same " + ) + + # Iterate through each hypothesis and their corresponding references. + for references, hypothesis in zip(list_of_references, hypotheses): + # For each order of ngram, calculate the numerator and + # denominator for the corpus-level modified precision. + for i, _ in enumerate(weights, start=1): + p_i = modified_precision(references, hypothesis, i) + p_numerators[i] += p_i.numerator + p_denominators[i] += p_i.denominator + + # Calculate the hypothesis length and the closest reference length. + # Adds them to the corpus-level hypothesis and reference counts. + hyp_len = len(hypothesis) + hyp_lengths += hyp_len + ref_lengths += closest_ref_length(references, hyp_len) + + # Calculate corpus-level brevity penalty. 
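+    # When the length penalty is disabled (as done for self-/auto-BLEU), use a
+    # neutral value instead: 1.0 is the multiplicative identity for the geometric
+    # mean below, while the arithmetic branch never multiplies by bp, so the 0.0
+    # assigned there is only a placeholder.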
+ if no_length_penalty and averaging_mode == 'geometric': + bp = 1.0 + elif no_length_penalty and averaging_mode == 'arithmetic': + bp = 0.0 + else: + assert not no_length_penalty + assert averaging_mode != 'arithmetic', 'Not sure how to apply length penalty when aurithmetic mode' + bp = brevity_penalty(ref_lengths, hyp_lengths) + + # Uniformly re-weighting based on maximum hypothesis lengths if largest + # order of n-grams < 4 and weights is set at default. + if auto_reweigh: + if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25): + weights = (1 / hyp_lengths,) * hyp_lengths + + # Collects the various precision values for the different ngram orders. + p_n = [ + Fraction(p_numerators[i], p_denominators[i], _normalize=False) + for i, _ in enumerate(weights, start=1) + ] + + # Returns 0 if there's no matching n-grams + # We only need to check for p_numerators[1] == 0, since if there's + # no unigrams, there won't be any higher order ngrams. + if p_numerators[1] == 0: + return 0 + + # If there's no smoothing, set use method0 from SmoothinFunction class. + if not smoothing_function: + smoothing_function = SmoothingFunction().method0 + # Smoothen the modified precision. + # Note: smoothing_function() may convert values into floats; + # it tries to retain the Fraction object as much as the + # smoothing method allows. + p_n = smoothing_function( + p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths + ) + + if averaging_mode == "geometric": + s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n)) + s = bp * math.exp(math.fsum(s)) + elif averaging_mode == "arithmetic": + s = (w_i * p_i for w_i, p_i in zip(weights, p_n)) + s = math.fsum(s) + + return s + + +def sentence_bleu( + references, + hypothesis, + weights=(0.25, 0.25, 0.25, 0.25), + smoothing_function=None, + auto_reweigh=False, + averaging_mode="geometric", + no_length_penalty=False +): + return corpus_bleu( + [references], [hypothesis], weights, smoothing_function, auto_reweigh, averaging_mode, no_length_penalty + ) \ No newline at end of file diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/misc/cut_as.py b/examples/textless_nlp/gslm/metrics/asr_metrics/misc/cut_as.py new file mode 100644 index 0000000000..5b7e1e9685 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/misc/cut_as.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
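+
+# Trims every generated .wav in --samples_dir to the duration of the corresponding
+# ground-truth continuation listed in --prompts_description (keyed by the part of
+# the file name before '__'), writing the results to --out_dir.
+# Example invocation (a sketch; paths are placeholders):
+#
+#   python cut_as.py \
+#       --samples_dir=$UTS_OUTPUT_DOWNSAMPLE --out_dir=$UTS_OUTPUT_DOWNSAMPLE_CUT \
+#       --prompts_description=data/ground_truth_continuation_dev.json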
+ + +import torchaudio +import argparse +import json +import pathlib + + +def get_args(): + parser = argparse.ArgumentParser( + "Assuring generated audio have the same length as ground-truth audio") + parser.add_argument('--samples_dir', required=True, type=str) + parser.add_argument('--out_dir', required=True, type=str) + parser.add_argument('--prompts_description', required=True, type=str) + return parser.parse_args() + + +def cut(src, tgt, l): + x, sr = torchaudio.load(str(src)) + assert sr == 16_000 + + x = x.squeeze() + target_frames = int(l * sr) + + flag = 0 + if target_frames <= x.size(0): + x = x[:target_frames] + flag = 1 + else: + flag = 0 + torchaudio.save(str(tgt), x.unsqueeze(0), sr) + return flag + + +def main(): + args = get_args() + tgt_dir = pathlib.Path(args.out_dir) + tgt_dir.mkdir(exist_ok=True, parents=True) + + total_files, sufficiently_long = 0, 0 + + with open(args.prompts_description, 'r') as f: + description = json.loads(f.read()) + + for src_f in pathlib.Path(args.samples_dir).glob('*.wav'): + name_prompt = src_f.with_suffix('').name.split('__')[0] + + assert name_prompt in description, f'Cannot find {name_prompt}!' + + target_length = description[name_prompt][0] + tgt_f = tgt_dir / (src_f.name) + + is_long_enough = cut(src_f, tgt_f, target_length) + sufficiently_long += is_long_enough + if not is_long_enough: + print(f'{src_f} is not long enough') + + total_files += 1 + + print( + f'Total files: {total_files}; sufficiently long: {sufficiently_long}') + + +if __name__ == '__main__': + main() diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/misc/dict.ltr.txt b/examples/textless_nlp/gslm/metrics/asr_metrics/misc/dict.ltr.txt new file mode 100644 index 0000000000..69929e1666 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/misc/dict.ltr.txt @@ -0,0 +1,28 @@ +| 94802 +E 51860 +T 38431 +A 33152 +O 31495 +N 28855 +I 28794 +H 27187 +S 26071 +R 23546 +D 18289 +L 16308 +U 12400 +M 10685 +W 10317 +C 9844 +F 9062 +G 8924 +Y 8226 +P 6890 +B 6339 +V 3936 +K 3456 +' 1023 +X 636 +J 598 +Q 437 +Z 213 diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/ppx.py b/examples/textless_nlp/gslm/metrics/asr_metrics/ppx.py new file mode 100644 index 0000000000..d6a40e4d35 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/ppx.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
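+
+# Estimates the perplexity (PPX) of ASR transcripts under a pre-trained English LM
+# (transformer_lm.wmt19.en, loaded through torch.hub). When --manifest and
+# --prompts-description are given, only the transcripts of the selected prompts
+# are scored; mean/median log-probability and perplexity are printed.
+# Example invocation (a sketch; paths are placeholders):
+#
+#   python ppx.py $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt --cut-tail \
+#       --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json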
+ + +import torch +import numpy as np +import warnings + + +def get_target_sequences(manifest, ground_truth, to_take=1000): + import json + import pathlib + + with open(ground_truth, 'r') as fin: + original_continuations = json.loads(fin.read()) + + sequence2length = [(k, v[0]) for k, v in original_continuations.items()] + assert all(float(v) >= 6.0 for (_, v) in sequence2length) # 6 seconds + + sequence2length.sort(key=lambda x: x[1]) + to_take_sequences = set(v[0] for v in sequence2length[:to_take]) + to_take_ids = [] + + with open(manifest, 'r') as f: + f.readline() + + for i, line in enumerate(f.readlines()): + seq_id = line.split()[0] + seq_id = pathlib.Path(seq_id).name.split('__')[0] + + if seq_id in to_take_sequences: + to_take_ids.append(i) + + print(f'Took {len(to_take_ids)} ids') + return set(to_take_ids) + + +def get_args(): + import argparse + + parser = argparse.ArgumentParser("Evaluate PPX metric of a transcript.") + parser.add_argument('--asr-transcript', type=str, + help='Path to the transcript file.') + parser.add_argument('--cut-id', action='store_true', + help='Whether cut the first token (typically a seq id)') + parser.add_argument('--cut-tail', action='store_true', + help='Whether cut the last token (typically a speaker id)') + + parser.add_argument('--manifest', type=str, default=None) + parser.add_argument('--prompts-description', type=str, default=None) + + args = parser.parse_args() + + return args + + +def main(): + args = get_args() + + lm = torch.hub.load( + 'pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe') + + lm.eval().cuda() # disable dropout + + if args.manifest is None and args.prompts_description is None: + target_ids = None + else: + target_ids = get_target_sequences( + args.manifest, args.prompts_description) + + with open(args.asr_transcript, 'r') as fin: + lines = fin.readlines() + + if target_ids is not None: + filtered = [] + for line in lines: + line_id = line.split()[-1] + line_id = int(line_id.split('-')[1][:-1]) + if line_id in target_ids: + filtered.append(line) + lines = filtered + else: + pass + + if args.cut_id: + lines = [' '.join(x.split()[1:]) for x in lines] + if args.cut_tail: + lines = [' '.join(x.split()[:-1]) for x in lines] + lines = [x.strip().lower() for x in lines] + + def get_logprob(sent): return \ + lm.score(sent)['positional_scores'].mean().neg().item() + + logprobs = [get_logprob(l) for l in lines] + + filtered = [x for x in logprobs if not np.isnan(x)] + if len(filtered) != len(logprobs): + warnings.warn("NaNs detected!") + logprobs = filtered + + perplexities = [np.exp(l) for l in logprobs] + + for name, stats in [('logprob', logprobs), ('perplexity', perplexities)]: + mean = np.mean(stats) + sem = np.std(stats) / np.sqrt(len(stats)) + + median = np.median(stats) + interval = list(np.percentile(stats, [10, 90])) + + mean, sem, median, percentile10, percentile90 = [ + round(x, 2) for x in [mean, sem, median] + interval] + + print(name) + print(f"\tMean {mean} +- {sem}") + print( + f"\tMedian {median}, 90% confidence interval {percentile10}...{percentile90}") + + +if __name__ == '__main__': + main() diff --git a/examples/textless_nlp/gslm/metrics/asr_metrics/self_auto_bleu.py b/examples/textless_nlp/gslm/metrics/asr_metrics/self_auto_bleu.py new file mode 100644 index 0000000000..062bb82f66 --- /dev/null +++ b/examples/textless_nlp/gslm/metrics/asr_metrics/self_auto_bleu.py @@ -0,0 +1,201 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import nltk +from misc.bleu_utils import sentence_bleu +import warnings + + +def get_target_sequences(manifest, ground_truth, to_take=1000): + import json + import pathlib + + with open(ground_truth, 'r') as fin: + original_continuations = json.loads(fin.read()) + + sequence2length = [(k, v[0]) for k, v in original_continuations.items()] + assert all(float(v) >= 6.0 for (_, v) in sequence2length) # 6 seconds + + sequence2length.sort(key=lambda x: x[1]) + to_take_sequences = set(v[0] for v in sequence2length[:to_take]) + to_take_ids = [] + + with open(manifest, 'r') as f: + f.readline() + + for i, line in enumerate(f.readlines()): + seq_id = line.split()[0] + seq_id = pathlib.Path(seq_id).name.split('__')[0] + + if seq_id in to_take_sequences: + to_take_ids.append(i) + + print(f'Took {len(to_take_ids)} ids') + return set(to_take_ids) + + +def get_args(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--asr-transcript', type=str, + help='Path to the transcript file.') + + parser.add_argument('--manifest', required=True) + parser.add_argument('--prompts-description', required=True) + + parser.add_argument('--cut-id', action='store_true', + help='Whether cut the first token (typically a seq id)') + parser.add_argument('--cut-tail', action='store_true', + help='Whether cut the last token (typically a speaker id)') + parser.add_argument('--debug', action='store_true') + + args = parser.parse_args() + + return args + + +def get_self_bleu(utterances, averaging_mode, weights): + self_bleu = [] + + for i in range(len(utterances)): + hypo = utterances[i] + rest = utterances[:i] + utterances[i+1:] + + self_bleu.append(sentence_bleu(rest, hypo, weights, + no_length_penalty=True, averaging_mode=averaging_mode)) + + return self_bleu + + +def get_self_bleu2_arithmetic(utterances): + weights = (0.5, 0.5) # equal weight for unigrams and bigrams + return get_self_bleu(utterances, averaging_mode='arithmetic', weights=weights) + + +def get_self_bleu2_geometric(utterances): + weights = (0.5, 0.5) + return get_self_bleu(utterances, averaging_mode='geometric', weights=weights) + + +def get_auto_bleu2_arithmetic(utterances): + weights = (0.5, 0.5) + return [auto_bleu(u, mean_mode='arithmetic', weights=weights) for u in utterances] + + +def get_auto_bleu2_geometric(utterances): + weights = (0.5, 0.5) + return [auto_bleu(u, mean_mode='geometric', weights=weights) for u in utterances] + + +def get_auto_bleu3_geometric(utterances): + weights = (1./3, 1./3, 1./3) + return [auto_bleu(u, mean_mode='geometric', weights=weights) for u in utterances] + + +def get_auto_bleu3_arithmetic(utterances): + weights = (1./3, 1./3, 1./3) + return [auto_bleu(u, mean_mode='arithmetic', weights=weights) for u in utterances] + + +def get_self_bleu3_arithmetic(utterances): + weights = (1./3, 1./3, 1./3) + return get_self_bleu(utterances, averaging_mode='arithmetic', weights=weights) + + +def get_self_bleu3_geometric(utterances): + weights = (1./3, 1./3, 1./3) + return get_self_bleu(utterances, averaging_mode='geometric', weights=weights) + + +def auto_bleu(sentence, weights, mean_mode='arithmetic'): + if len(sentence) <= 1: + return 0 + + N = len(weights) + + bleu_n = np.zeros([N]) + for n in range(N): + targ_ngrams = list(nltk.ngrams(sentence, n+1)) + for p in range(len(targ_ngrams)): + left = sentence[:p] + right = sentence[(p+n+1):] + rest_ngrams = 
list(nltk.ngrams(left, n+1)) + \ + list(nltk.ngrams(right, n+1)) + # compute the nb of matching ngrams + bleu_n[n] += targ_ngrams[p] in rest_ngrams + bleu_n[n] /= len(targ_ngrams) # average them to get a proportion + + weights = np.array(weights) + if mean_mode == 'arithmetic': + return (bleu_n * weights).sum() + elif mean_mode == 'geometric': + return (bleu_n ** weights).prod() + else: + raise ValueError(f'Unknown agggregation mode {mean_mode}') + + +def main(): + from multiprocessing import Pool + + args = get_args() + target_ids = get_target_sequences(args.manifest, args.prompts_description) + + with open(args.asr_transcript, 'r') as fin: + lines = fin.readlines() + + terms = [x.strip().split() for x in lines] + filtered = [] + for term in terms: + line_id = int(term[-1].split('-')[1][:-1]) + if line_id in target_ids: + filtered.append(term) + terms = filtered + + if args.cut_id: + terms = [x[1:] for x in terms] + if args.cut_tail: + terms = [x[:-1] for x in terms] + + if args.debug: + terms = terms[:10] + + tasks = [ + ('Self-BLEU2-arithmetic', get_self_bleu2_arithmetic), + ('Self-BLEU2-geometric', get_self_bleu2_geometric), + ('Auto-BLEU2-arithmetic', get_auto_bleu2_arithmetic), + ('Auto-BLEU2-geometric', get_auto_bleu2_geometric), + + ('Self-BLEU3-arithmetic', get_self_bleu3_arithmetic), + ('Self-BLEU3-geometric', get_self_bleu3_geometric), + ('Auto-BLEU3-arithmetic', get_auto_bleu3_arithmetic), + ('Auto-BLEU3-geometric', get_auto_bleu3_geometric), + ] + + n_processes = min(16, len(tasks)) + with Pool(n_processes) as pool: + metrics = pool.map(run_f, [(t[1], terms) for t in tasks]) + + for (metric_name, _), metric in zip(tasks, metrics): + metric, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric)) + + metric, sem = [ + round(100 * x, 2) for x in [metric, sem] + ] + + print(f'{metric_name} {metric} +- {sem}') + + +def run_f(task_params): + f, terms = task_params + return f(terms) + + +if __name__ == '__main__': + # NLTK produces warnings + warnings.filterwarnings("ignore") + + main() diff --git a/examples/textless_nlp/gslm/speech2unit/README.md b/examples/textless_nlp/gslm/speech2unit/README.md new file mode 100644 index 0000000000..9dff9d33ac --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/README.md @@ -0,0 +1,68 @@ +# Speech to Unit Model (speech2unit) + +## Acoustic Model +For quantizing speech we learn a K-means clustering over acoustic representations for which we either use Log-Mel Filterbank or pretrained acoustic representation models. For using pretrained models, please download from their respective locations linked below. +* [Modified CPC](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/cpc_big_ll6kh_top_ctc.pt) +* [HuBERT-Base](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) +* [Wav2Vec 2.0-Base](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt) + +## Quantization Model +You can download pretrained quantized model from the list below. 
+ +K-Means Model | Download Link +|-|- +Log Mel Filterbank + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/km50/km.bin) +Log Mel Filterbank + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/km100/km.bin) +Log Mel Filterbank + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/km200/km.bin) +Modified CPC + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km50/km.bin) +Modified CPC + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km100/km.bin) +Modified CPC + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km200/km.bin) +HuBERT Base + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km50/km.bin) +HuBERT Base + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km100/km.bin) +HuBERT Base + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km200/km.bin) +wav2vec 2.0 Large + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/km50/km.bin) +wav2vec 2.0 Large + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/km100/km.bin) +wav2vec 2.0 Large + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/km200/km.bin) + +### Quantization +For quantizing speech with a given acoustic representation, please follow the steps below. +1. Learn K-means clustering model +``` +N_CLUSTERS=<number_of_clusters_used_for_kmeans> +TYPE=<one_of_logmel/cpc/hubert/w2v2> +CKPT_PATH=<path_of_pretrained_acoustic_model> +LAYER=<layer_of_acoustic_model_to_extract_features_from> +MANIFEST=<tab_separated_manifest_of_audio_files_for_training_kmeans> +KM_MODEL_PATH=<output_path_of_the_kmeans_model> + +PYTHONPATH=. python examples/textless_nlp/gslm/speech2unit/clustering/cluster_kmeans.py \ + --num_clusters $N_CLUSTERS \ + --feature_type $TYPE \ + --checkpoint_path $CKPT_PATH \ + --layer $LAYER \ + --manifest_path $MANIFEST \ + --out_kmeans_model_path $KM_MODEL_PATH +``` +2. Quantize using the learned clusters +``` +MANIFEST=<tab_separated_manifest_of_audio_files_to_quantize> +OUT_QUANTIZED_FILE=<output_quantized_audio_file_path> + +python examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py \ + --feature_type $TYPE \ + --kmeans_model_path $KM_MODEL_PATH \ + --acoustic_model_path $CKPT_PATH \ + --layer $LAYER \ + --manifest_path $MANIFEST \ + --out_quantized_file_path $OUT_QUANTIZED_FILE \ + --extension ".flac" +``` + +Note about the manifest file is a file with paths and length of input audio files. The format of the file is as follows: +``` +<path_of_root_directory_containing_audio_files> +<relative_path_of_audio_file_1>\t<number_of_frames_1> +<relative_path_of_audio_file_2>\t<number_of_frames_1> +... +``` + diff --git a/examples/textless_nlp/gslm/speech2unit/__init__.py b/examples/textless_nlp/gslm/speech2unit/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/textless_nlp/gslm/speech2unit/clustering/__init__.py b/examples/textless_nlp/gslm/speech2unit/clustering/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/textless_nlp/gslm/speech2unit/clustering/cluster_kmeans.py b/examples/textless_nlp/gslm/speech2unit/clustering/cluster_kmeans.py new file mode 100644 index 0000000000..7cf844a95a --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/clustering/cluster_kmeans.py @@ -0,0 +1,212 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os +import time + +import numpy as np +from sklearn.cluster import MiniBatchKMeans + +import joblib +from examples.textless_nlp.gslm.speech2unit.pretrained.utils import ( + get_and_dump_features, + get_features, +) + + +def get_logger(): + log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" + logging.basicConfig(format=log_format, level=logging.INFO) + logger = logging.getLogger(__name__) + return logger + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Learn K-means clustering over acoustic features." + ) + + # Features arguments + parser.add_argument( + "--in_features_path", type=str, default=None, help="Features file path" + ) + parser.add_argument( + "--feature_type", + type=str, + choices=["logmel", "hubert", "w2v2", "cpc"], + default=None, + help="Acoustic feature type", + ) + parser.add_argument( + "--manifest_path", + type=str, + default=None, + help="Manifest file containing the root dir and file names", + ) + parser.add_argument( + "--out_features_path", + type=str, + default=None, + help="Features file path to write to", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + help="Pretrained acoustic model checkpoint", + ) + parser.add_argument( + "--layer", + type=int, + help="The layer of the pretrained model to extract features from", + default=-1, + ) + parser.add_argument( + "--sample_pct", + type=float, + help="Percent data to use for K-means training", + default=0.1, + ) + + # K-means arguments + parser.add_argument( + "--num_clusters", type=int, help="Nubmer of clusters", default=50 + ) + parser.add_argument("--init", default="k-means++") + parser.add_argument( + "--max_iter", + type=int, + help="Maximum number of iterations for K-means training", + default=150, + ) + parser.add_argument( + "--batch_size", + type=int, + help="Batch size for K-means training", + default=10000, + ) + parser.add_argument("--tol", default=0.0, type=float) + parser.add_argument("--max_no_improvement", default=100, type=int) + parser.add_argument("--n_init", default=20, type=int) + parser.add_argument("--reassignment_ratio", default=0.5, type=float) + parser.add_argument( + "--out_kmeans_model_path", + type=str, + required=True, + help="Path to save K-means model", + ) + + # Leftovers + parser.add_argument( + "--seed", + type=int, + help="Random seed to use for K-means training", + default=1369, + ) + + return parser + + +def get_kmeans_model( + n_clusters, + init, + max_iter, + batch_size, + tol, + max_no_improvement, + n_init, + reassignment_ratio, + random_state, +): + return MiniBatchKMeans( + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + batch_size=batch_size, + tol=tol, + max_no_improvement=max_no_improvement, + n_init=n_init, + reassignment_ratio=reassignment_ratio, + random_state=random_state, + verbose=1, + compute_labels=True, + init_size=None, + ) + + +def train_kmeans(kmeans_model, features_batch): + start_time = time.time() + kmeans_model.fit(features_batch) + time_taken = round((time.time() - start_time) // 60, 2) + return kmeans_model, time_taken + + +def main(args, logger): + # Features loading/extraction for K-means + if args.in_features_path: + # Feature loading + logger.info(f"Loading features from {args.in_features_path}...") + features_batch = np.load(args.in_features_path, allow_pickle=True) + else: + # Feature extraction + logger.info(f"Extracting 
{args.feature_type} acoustic features...") + features_batch = ( + get_features( + feature_type=args.feature_type, + checkpoint_path=args.checkpoint_path, + layer=args.layer, + manifest_path=args.manifest_path, + sample_pct=args.sample_pct, + flatten=True, + ) + if not args.out_features_path + else get_and_dump_features( + feature_type=args.feature_type, + checkpoint_path=args.checkpoint_path, + layer=args.layer, + manifest_path=args.manifest_path, + sample_pct=args.sample_pct, + flatten=True, + out_features_path=args.out_features_path, + ) + ) + if args.out_features_path: + logger.info( + f"Saved extracted features at {args.out_features_path}" + ) + logger.info(f"Features shape = {features_batch.shape}\n") + + # Learn and save K-means model + kmeans_model = get_kmeans_model( + n_clusters=args.num_clusters, + init=args.init, + max_iter=args.max_iter, + batch_size=args.batch_size, + tol=args.tol, + max_no_improvement=args.max_no_improvement, + n_init=args.n_init, + reassignment_ratio=args.reassignment_ratio, + random_state=args.seed, + ) + logger.info("Starting k-means training...") + kmeans_model, time_taken = train_kmeans( + kmeans_model=kmeans_model, features_batch=features_batch + ) + logger.info(f"...done k-means training in {time_taken} minutes") + inertia = -kmeans_model.score(features_batch) / len(features_batch) + logger.info(f"Total intertia: {round(inertia, 2)}\n") + + logger.info(f"Saving k-means model to {args.out_kmeans_model_path}") + os.makedirs(os.path.dirname(args.out_kmeans_model_path), exist_ok=True) + joblib.dump(kmeans_model, open(args.out_kmeans_model_path, "wb")) + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + logger = get_logger() + logger.info(args) + main(args, logger) diff --git a/examples/textless_nlp/gslm/speech2unit/clustering/dump_feats.py b/examples/textless_nlp/gslm/speech2unit/clustering/dump_feats.py new file mode 100644 index 0000000000..031567c6d8 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/clustering/dump_feats.py @@ -0,0 +1,91 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging + +from examples.textless_nlp.gslm.speech2unit.pretrained.utils import ( + get_and_dump_features, +) + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Compute and dump log mel fbank features." 
+ ) + parser.add_argument( + "--feature_type", + type=str, + choices=["logmel", "hubert", "w2v2", "cpc"], + default=None, + help="Acoustic feature type", + ) + parser.add_argument( + "--manifest_path", + type=str, + default=None, + help="Manifest file containing the root dir and file names", + ) + parser.add_argument( + "--out_features_path", + type=str, + default=None, + help="Features file path to write to", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + help="Pretrained acoustic model checkpoint", + ) + parser.add_argument( + "--layer", + type=int, + help="The layer of the pretrained model to extract features from", + default=-1, + ) + parser.add_argument( + "--sample_pct", + type=float, + help="Percent data to use for K-means training", + default=0.1, + ) + parser.add_argument( + "--out_features_path", + type=str, + help="Path to save log mel fbank features", + ) + return parser + + +def get_logger(): + log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" + logging.basicConfig(format=log_format, level=logging.INFO) + logger = logging.getLogger(__name__) + return logger + + +if __name__ == "__main__": + """ + Example command: + python ~/speechbot/clustering/dump_logmelfank_feats.py \ + --manifest_path /checkpoint/kushall/data/LJSpeech-1.1/asr_input_wavs_16k/train.tsv + --out_features_path /checkpoint/kushall/experiments/speechbot/logmelfbank/features/ljspeech/train.npy + """ + parser = get_parser() + args = parser.parse_args() + logger = get_logger() + logger.info(args) + + logger.info(f"Extracting {args.feature_type} acoustic features...") + get_and_dump_features( + feature_type=args.feature_type, + checkpoint_path=args.checkpoint_path, + layer=args.layer, + manifest_path=args.manifest_path, + sample_pct=args.sample_pct, + flatten=True, + out_features_path=args.out_features_path, + ) + logger.info(f"Saved extracted features at {args.out_features_path}") diff --git a/examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py b/examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py new file mode 100644 index 0000000000..dd95105232 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py @@ -0,0 +1,141 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os + +import numpy as np + +import joblib +from examples.textless_nlp.gslm.speech2unit.clustering.utils import ( + get_audio_files, +) +from examples.textless_nlp.gslm.speech2unit.pretrained.utils import ( + get_features, +) + + +def get_logger(): + log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" + logging.basicConfig(format=log_format, level=logging.INFO) + logger = logging.getLogger(__name__) + return logger + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Quantize using K-means clustering over acoustic features." 
+ ) + parser.add_argument( + "--feature_type", + type=str, + choices=["logmel", "hubert", "w2v2", "cpc"], + default=None, + required=True, + help="Acoustic feature type", + ) + parser.add_argument( + "--acoustic_model_path", + type=str, + help="Pretrained acoustic model checkpoint" + ) + parser.add_argument( + "--layer", + type=int, + help="The layer of the pretrained model to extract features from", + default=-1, + ) + parser.add_argument( + "--kmeans_model_path", + type=str, + required=True, + help="K-means model file path to use for inference", + ) + parser.add_argument( + "--features_path", + type=str, + default=None, + help="Features file path. You don't need to enter acoustic model details if you have dumped features", + ) + parser.add_argument( + "--manifest_path", + type=str, + default=None, + help="Manifest file containing the root dir and file names", + ) + parser.add_argument( + "--out_quantized_file_path", + required=True, + type=str, + help="File path of quantized output.", + ) + parser.add_argument( + "--extension", type=str, default=".flac", help="Features file path" + ) + parser.add_argument( + "--channel_id", + choices=['1', '2'], + help="The audio channel to extract the units in case of stereo file.", + default=None, + ) + parser.add_argument( + "--hide-fname", action='store_true', + help="Hide file names in the output file." + ) + return parser + + +def main(args, logger): + # Feature extraction + if args.features_path is not None: + logger.info(f"Loading acoustic features from {args.features_path}...") + features_batch = np.load(args.features_path) + else: + logger.info(f"Extracting {args.feature_type} acoustic features...") + features_batch = get_features( + feature_type=args.feature_type, + checkpoint_path=args.acoustic_model_path, + layer=args.layer, + manifest_path=args.manifest_path, + sample_pct=1.0, + flatten=False, + channel_id=int(args.channel_id) if args.channel_id else None, + ) + logger.info( + f"Features extracted for {len(features_batch)} utterances.\n" + ) + logger.info( + f"Dimensionality of representation = {features_batch[0].shape[1]}" + ) + + # K-means model + logger.info(f"Loading K-means model from {args.kmeans_model_path} ...") + kmeans_model = joblib.load(open(args.kmeans_model_path, "rb")) + kmeans_model.verbose = False + + _, fnames, _ = get_audio_files(args.manifest_path) + + os.makedirs(os.path.dirname(args.out_quantized_file_path), exist_ok=True) + print(f"Writing quantized predictions to {args.out_quantized_file_path}") + with open(args.out_quantized_file_path, "w") as fout: + for i, feats in enumerate(features_batch): + pred = kmeans_model.predict(feats) + pred_str = " ".join(str(p) for p in pred) + base_fname = os.path.basename(fnames[i]).rstrip('.'+args.extension.lstrip('.')) + if args.channel_id is not None: + base_fname = base_fname+f'-channel{args.channel_id}' + if not args.hide_fname: + fout.write(f"{base_fname}|{pred_str}\n") + else: + fout.write(f"{pred_str}\n") + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + logger = get_logger() + logger.info(args) + main(args, logger) diff --git a/examples/textless_nlp/gslm/speech2unit/clustering/utils.py b/examples/textless_nlp/gslm/speech2unit/clustering/utils.py new file mode 100644 index 0000000000..cf08d1fe4b --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/clustering/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Tuple + + +def get_audio_files(manifest_path: str) -> Tuple[str, List[str], List[int]]: + fnames, sizes = [], [] + with open(manifest_path, "r") as f: + root_dir = f.readline().strip() + for line in f: + items = line.strip().split("\t") + assert ( + len(items) == 2 + ), f"File must have two columns separated by tab. Got {line}" + fnames.append(items[0]) + sizes.append(int(items[1])) + return root_dir, fnames, sizes diff --git a/examples/textless_nlp/gslm/speech2unit/pretrained/cpc_feature_reader.py b/examples/textless_nlp/gslm/speech2unit/pretrained/cpc_feature_reader.py new file mode 100644 index 0000000000..2ea3890c28 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/pretrained/cpc_feature_reader.py @@ -0,0 +1,204 @@ +import soundfile as sf +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class CpcFeatureReader: + """ + Wrapper class to run inference on CPC model. + Helps extract features for a given audio file. + """ + + def __init__( + self, + checkpoint_path, + layer, + use_encoder_layer=False, + norm_features=False, + sample_rate=16000, + max_chunk=64000, + use_cuda=True, + ): + self.model = load_cpc_model(checkpoint_path, layer).eval() + self.sample_rate = sample_rate + self.max_chunk = max_chunk + self.norm_features = norm_features + self.use_encoder_layer = use_encoder_layer + self.use_cuda = use_cuda + if self.use_cuda: + self.model.cuda() + + def read_audio(self, path, ref_len=None, channel_id=None): + wav, sr = sf.read(path) + if channel_id is not None: + assert wav.ndim == 2, \ + f"Expected stereo input when channel_id is given ({path})" + assert channel_id in [1, 2], \ + "channel_id is expected to be in [1, 2]" + wav = wav[:, channel_id-1] + if wav.ndim == 2: + wav = wav.mean(-1) + assert wav.ndim == 1, wav.ndim + assert sr == self.sample_rate, sr + if ref_len is not None and abs(ref_len - len(wav)) > 160: + print(f"ref {ref_len} != read {len(wav)} ({path})") + return wav + + def get_feats(self, file_path, ref_len=None, channel_id=None): + x = self.read_audio(file_path, ref_len, channel_id) + # Inspired from CPC_audio feature_loader.py + with torch.no_grad(): + x = torch.from_numpy(x).float() + if self.use_cuda: + x = x.cuda() + x = x.view(1, 1, -1) + size = x.size(2) + feat = [] + start = 0 + while start < size: + if start + self.max_chunk > size: + break + x_chunk = x[..., start : start + self.max_chunk] + feat_chunk = self.model.extract_features( + source=x_chunk, + get_encoded=self.use_encoder_layer, + norm_output=self.norm_features, + ) + feat.append(feat_chunk) + start += self.max_chunk + + if start < size: + x_chunk = x[:, -self.max_chunk :] + feat_chunk = self.model.extract_features( + source=x_chunk, + get_encoded=self.use_encoder_layer, + norm_output=self.norm_features, + ) + df = x_chunk.size(2) // feat_chunk.size(1) + delta = (size - start) // df + feat.append(feat_chunk[:, -delta:]) + return torch.cat(feat, 1).squeeze(0) + + +def load_cpc_model(checkpoint_path, layer=None): + state_dict = torch.load(checkpoint_path) + weights = state_dict["weights"] + config = state_dict["config"] + if layer is not None: + config["nLevelsGRU"] = layer + + encoder = CPCEncoder(config["hiddenEncoder"]) + ar_net = CPCAR( + config["hiddenEncoder"], config["hiddenGar"], False, config["nLevelsGRU"] + ) + + model = CPCModel(encoder, ar_net) + model.load_state_dict(weights, strict=False) + 
model.config = config + + return model + + +class ChannelNorm(nn.Module): + def __init__(self, num_features, epsilon=1e-05, affine=True): + super(ChannelNorm, self).__init__() + if affine: + self.weight = nn.parameter.Parameter(torch.Tensor(1, num_features, 1)) + self.bias = nn.parameter.Parameter(torch.Tensor(1, num_features, 1)) + else: + self.weight = None + self.bias = None + self.epsilon = epsilon + self.p = 0 + self.affine = affine + self.reset_parameters() + + def reset_parameters(self): + if self.affine: + torch.nn.init.ones_(self.weight) + torch.nn.init.zeros_(self.bias) + + def forward(self, x): + cum_mean = x.mean(dim=1, keepdim=True) + cum_var = x.var(dim=1, keepdim=True) + x = (x - cum_mean) * torch.rsqrt(cum_var + self.epsilon) + if self.weight is not None: + x = x * self.weight + self.bias + return x + + +class CPCEncoder(nn.Module): + def __init__(self, hidden_dim=512): + super(CPCEncoder, self).__init__() + self.conv0 = nn.Conv1d(1, hidden_dim, 10, stride=5, padding=3) + self.batchNorm0 = ChannelNorm(hidden_dim) + self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, 8, stride=4, padding=2) + self.batchNorm1 = ChannelNorm(hidden_dim) + self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) + self.batchNorm2 = ChannelNorm(hidden_dim) + self.conv3 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) + self.batchNorm3 = ChannelNorm(hidden_dim) + self.conv4 = nn.Conv1d(hidden_dim, hidden_dim, 4, stride=2, padding=1) + self.batchNorm4 = ChannelNorm(hidden_dim) + self.DOWNSAMPLING = 160 + + def get_output_dim(self): + return self.conv4.out_channels + + def forward(self, x): + x = F.relu(self.batchNorm0(self.conv0(x))) + x = F.relu(self.batchNorm1(self.conv1(x))) + x = F.relu(self.batchNorm2(self.conv2(x))) + x = F.relu(self.batchNorm3(self.conv3(x))) + x = F.relu(self.batchNorm4(self.conv4(x))) + return x + + +class CPCAR(nn.Module): + def __init__(self, dim_encoded, dim_output, keep_hidden, num_layers): + super(CPCAR, self).__init__() + self.baseNet = nn.LSTM( + dim_encoded, dim_output, num_layers=num_layers, batch_first=True + ) + self.hidden = None + self.keep_hidden = keep_hidden + + def get_output_dim(self): + return self.baseNet.hidden_size + + def forward(self, x): + try: + self.baseNet.flatten_parameters() + except RuntimeError: + pass + x, h = self.baseNet(x, self.hidden) + if self.keep_hidden: + if isinstance(h, tuple): + self.hidden = tuple(x.detach() for x in h) + else: + self.hidden = h.detach() + return x + + +class CPCModel(nn.Module): + def __init__(self, encoder, ar_net): + super(CPCModel, self).__init__() + self.gEncoder = encoder + self.gAR = ar_net + self.config = None + + def forward(self, x, label): + encoded = self.gEncoder(x).permute(0, 2, 1) + cpc_feature = self.gAR(encoded) + return cpc_feature, encoded, label + + def extract_features(self, source, get_encoded=False, norm_output=False): + cpc_feature, encoded, _ = self.forward(source, None) + if get_encoded: + cpc_feature = encoded + if norm_output: + mean = cpc_feature.mean(dim=1, keepdim=True) + var = cpc_feature.var(dim=1, keepdim=True) + cpc_feature = (cpc_feature - mean) / torch.sqrt(var + 1e-08) + return cpc_feature diff --git a/examples/textless_nlp/gslm/speech2unit/pretrained/hubert_feature_reader.py b/examples/textless_nlp/gslm/speech2unit/pretrained/hubert_feature_reader.py new file mode 100644 index 0000000000..4fef859fb3 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/pretrained/hubert_feature_reader.py @@ -0,0 +1,70 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import fairseq +import soundfile as sf +import torch.nn.functional as F + + +class HubertFeatureReader: + """ + Wrapper class to run inference on HuBERT model. + Helps extract features for a given audio file. + """ + + def __init__(self, checkpoint_path, layer, max_chunk=1600000, use_cuda=True): + ( + model, + cfg, + task, + ) = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [checkpoint_path] + ) + self.model = model[0].eval() + self.task = task + self.layer = layer + self.max_chunk = max_chunk + self.use_cuda = use_cuda + if self.use_cuda: + self.model.cuda() + + def read_audio(self, path, ref_len=None, channel_id=None): + wav, sr = sf.read(path) + if channel_id is not None: + assert wav.ndim == 2, \ + f"Expected stereo input when channel_id is given ({path})" + assert channel_id in [1, 2], \ + "channel_id is expected to be in [1, 2]" + wav = wav[:, channel_id-1] + if wav.ndim == 2: + wav = wav.mean(-1) + assert wav.ndim == 1, wav.ndim + assert sr == self.task.cfg.sample_rate, sr + if ref_len is not None and abs(ref_len - len(wav)) > 160: + print(f"ref {ref_len} != read {len(wav)} ({path})") + return wav + + def get_feats(self, file_path, ref_len=None, channel_id=None): + x = self.read_audio(file_path, ref_len, channel_id) + with torch.no_grad(): + x = torch.from_numpy(x).float() + if self.use_cuda: + x = x.cuda() + if self.task.cfg.normalize: + x = F.layer_norm(x, x.shape) + x = x.view(1, -1) + + feat = [] + for start in range(0, x.size(1), self.max_chunk): + x_chunk = x[:, start: start + self.max_chunk] + feat_chunk, _ = self.model.extract_features( + source=x_chunk, + padding_mask=None, + mask=False, + output_layer=self.layer, + ) + feat.append(feat_chunk) + return torch.cat(feat, 1).squeeze(0) diff --git a/examples/textless_nlp/gslm/speech2unit/pretrained/logmel_feature_reader.py b/examples/textless_nlp/gslm/speech2unit/pretrained/logmel_feature_reader.py new file mode 100644 index 0000000000..5879da7067 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/pretrained/logmel_feature_reader.py @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import soundfile as sf +import torch +import torchaudio.compliance.kaldi as kaldi + + +class LogMelFeatureReader: + """ + Wrapper class to run inference on HuBERT model. + Helps extract features for a given audio file. + """ + + def __init__(self, *args, **kwargs): + self.num_mel_bins = kwargs.get("num_mel_bins", 80) + self.frame_length = kwargs.get("frame_length", 25.0) + + def get_feats(self, file_path, channel_id=None): + wav, sr = sf.read(file_path) + if channel_id is not None: + assert wav.ndim == 2, \ + f"Expected stereo input when channel_id is given ({file_path})" + wav = wav[:, channel_id-1] + feats = torch.from_numpy(wav).float() + feats = kaldi.fbank( + feats.unsqueeze(0), + num_mel_bins=self.num_mel_bins, + frame_length=self.frame_length, + sample_frequency=sr, + ) + return feats diff --git a/examples/textless_nlp/gslm/speech2unit/pretrained/utils.py b/examples/textless_nlp/gslm/speech2unit/pretrained/utils.py new file mode 100644 index 0000000000..2eca68e800 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/pretrained/utils.py @@ -0,0 +1,127 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import gc +import os +import random +import shutil +import numpy as np + +import torch +import tqdm +from examples.textless_nlp.gslm.speech2unit.pretrained.cpc_feature_reader import ( + CpcFeatureReader, +) +from examples.textless_nlp.gslm.speech2unit.pretrained.hubert_feature_reader import ( + HubertFeatureReader, +) +from examples.textless_nlp.gslm.speech2unit.pretrained.logmel_feature_reader import ( + LogMelFeatureReader, +) +from examples.textless_nlp.gslm.speech2unit.pretrained.w2v2_feature_reader import ( + Wav2VecFeatureReader, +) + + +def get_feature_reader(feature_type): + if feature_type == "logmel": + return LogMelFeatureReader + elif feature_type == "hubert": + return HubertFeatureReader + elif feature_type == "w2v2": + return Wav2VecFeatureReader + elif feature_type == "cpc": + return CpcFeatureReader + else: + raise NotImplementedError(f"{feature_type} is not supported.") + + +def get_feature_iterator( + feature_type, checkpoint_path, layer, manifest_path, sample_pct, channel_id +): + feature_reader_cls = get_feature_reader(feature_type) + with open(manifest_path, "r") as fp: + lines = fp.read().split("\n") + root = lines.pop(0).strip() + file_path_list = [ + os.path.join(root, line.split("\t")[0]) + for line in lines + if len(line) > 0 + ] + if sample_pct < 1.0: + file_path_list = random.sample( + file_path_list, int(sample_pct * len(file_path_list)) + ) + num_files = len(file_path_list) + reader = feature_reader_cls( + checkpoint_path=checkpoint_path, layer=layer + ) + + def iterate(): + for file_path in file_path_list: + feats = reader.get_feats(file_path, channel_id=channel_id) + yield feats.cpu().numpy() + + return iterate, num_files + + +def get_features( + feature_type, checkpoint_path, layer, manifest_path, sample_pct, flatten, channel_id +): + generator, num_files = get_feature_iterator( + feature_type=feature_type, + checkpoint_path=checkpoint_path, + layer=layer, + manifest_path=manifest_path, + sample_pct=sample_pct, + channel_id=channel_id + ) + iterator = generator() + + features_list = [] + for features in tqdm.tqdm(iterator, total=num_files): + features_list.append(features) + + # Explicit clean up + del iterator + del generator + gc.collect() + torch.cuda.empty_cache() + + if flatten: + return np.concatenate(features_list) + + return features_list + + +def get_and_dump_features( + feature_type, + checkpoint_path, + layer, + manifest_path, + sample_pct, + flatten, + out_features_path, +): + # Feature extraction + features_batch = get_features( + feature_type=feature_type, + checkpoint_path=checkpoint_path, + layer=layer, + manifest_path=manifest_path, + sample_pct=sample_pct, + flatten=flatten, + ) + + # Save features + out_dir_path = os.path.dirname(out_features_path) + os.makedirs(out_dir_path, exist_ok=True) + shutil.copyfile( + manifest_path, + os.path.join(out_dir_path, os.path.basename(manifest_path)), + ) + np.save(out_features_path, features_batch) + + return features_batch diff --git a/examples/textless_nlp/gslm/speech2unit/pretrained/w2v2_feature_reader.py b/examples/textless_nlp/gslm/speech2unit/pretrained/w2v2_feature_reader.py new file mode 100644 index 0000000000..9f9da6c499 --- /dev/null +++ b/examples/textless_nlp/gslm/speech2unit/pretrained/w2v2_feature_reader.py @@ -0,0 +1,56 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import fairseq +import soundfile as sf + + +class Wav2VecFeatureReader: + """ + Wrapper class to run inference on Wav2Vec 2.0 model. + Helps extract features for a given audio file. + """ + + def __init__(self, checkpoint_path, layer, use_cuda=True): + state = fairseq.checkpoint_utils.load_checkpoint_to_cpu( + checkpoint_path + ) + + w2v_args = state["args"] + self.task = fairseq.tasks.setup_task(w2v_args) + model = self.task.build_model(w2v_args) + model.load_state_dict(state["model"], strict=True) + model.eval() + self.model = model + self.layer = layer + self.use_cuda = use_cuda + if self.use_cuda: + self.model.cuda() + + def read_audio(self, fname, channel_id=None): + wav, sr = sf.read(fname) + if channel_id is not None: + assert wav.ndim == 2, \ + f"Expected stereo input when channel_id is given ({fname})" + assert channel_id in [1, 2], \ + "channel_id is expected to be in [1, 2]" + wav = wav[:, channel_id-1] + if wav.ndim == 2: + wav = wav.mean(-1) + assert wav.ndim == 1, wav.ndim + assert sr == self.task.cfg.sample_rate, sr + return wav + + def get_feats(self, file_path, channel_id=None): + x = self.read_audio(file_path, channel_id) + with torch.no_grad(): + source = torch.from_numpy(x).view(1, -1).float() + if self.use_cuda: + source = source.cuda() + res = self.model( + source=source, mask=False, features_only=True, layer=self.layer + ) + return res["layer_results"][self.layer][0].squeeze(1) diff --git a/examples/textless_nlp/gslm/tools/README.md b/examples/textless_nlp/gslm/tools/README.md new file mode 100644 index 0000000000..385834841c --- /dev/null +++ b/examples/textless_nlp/gslm/tools/README.md @@ -0,0 +1,25 @@ +# GSLM Tools + +## Resynthesis +You can use the command line tool below to input an audio file and get the resynthesized audio. This tool implements the unsupervised method for resynthesis described in the paper. The way to invoke the command line tool is shown below. +``` +FAIRSEQ_ROOT=<path_to_your_fairseq_repo_root> +TYPE=<one_of_logmel/cpc/hubert/w2v2> +ACOUSTIC_MODEL_PATH=<path_of_pretrained_acoustic_model> +LAYER=<layer_of_acoustic_model_to_extract_features_from> +KM_MODEL_PATH=<output_path_of_the_kmeans_model> +TTS_MODEL_PATH=<unit2speech_model_file_path> +# A text file containing the codes, one per line +CODE_DICT_PATH=<unit2speech_code_dict_path> +WAVEGLOW_PATH=<path_where_you_have_downloaded_waveglow_checkpoint> + +PYTHONPATH=${FAIRSEQ_ROOT}:${FAIRSEQ_ROOT}/examples/textless_nlp/gslm/unit2speech python ${FAIRSEQ_ROOT}/examples/textless_nlp/gslm/tools/resynthesize_speech.py \ + --feature_type $TYPE \ + --acoustic_model_path $ACOUSTIC_MODEL_PATH \ + --layer $LAYER \ + --kmeans_model_path $KM_MODEL_PATH \ + --tts_model_path $TTS_MODEL_PATH \ + --code_dict_path $CODE_DICT_PATH \ + --waveglow_path $WAVEGLOW_PATH \ + --max_decoder_steps 2000 +``` \ No newline at end of file diff --git a/examples/textless_nlp/gslm/tools/resynthesize_speech.py b/examples/textless_nlp/gslm/tools/resynthesize_speech.py new file mode 100644 index 0000000000..309877212e --- /dev/null +++ b/examples/textless_nlp/gslm/tools/resynthesize_speech.py @@ -0,0 +1,132 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
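+# Interactive resynthesis tool: reads input/output audio file paths from stdin,
+# extracts acoustic features (logmel / CPC / HuBERT / wav2vec 2.0), quantizes
+# them with a pretrained k-means model, and synthesizes speech back with the
+# unit-based Tacotron2 model and the WaveGlow vocoder (with denoising).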
+ +import argparse +import gc +import logging +import os + +import joblib +import soundfile as sf +import torch +from examples.textless_nlp.gslm.speech2unit.pretrained.utils import get_feature_reader +from examples.textless_nlp.gslm.unit2speech.tts_data import TacotronInputDataset +from examples.textless_nlp.gslm.unit2speech.utils import ( + load_tacotron, + load_waveglow, + synthesize_audio, +) + + +def get_logger(): + log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" + logging.basicConfig(format=log_format, level=logging.INFO) + logger = logging.getLogger(__name__) + return logger + + +def get_parser(): + parser = argparse.ArgumentParser(description="GSLM U2S tool") + parser.add_argument( + "--feature_type", + type=str, + choices=["logmel", "hubert", "w2v2", "cpc"], + default=None, + required=True, + help="Acoustic feature type", + ) + parser.add_argument( + "--acoustic_model_path", + type=str, + help="Pretrained acoustic model checkpoint", + ) + parser.add_argument("--layer", type=int, help="Layer of acoustic model") + parser.add_argument( + "--kmeans_model_path", + type=str, + required=True, + help="K-means model file path to use for inference", + ) + parser.add_argument( + "--tts_model_path", + type=str, + help="TTS model file path to use for inference", + ) + parser.add_argument( + "--code_dict_path", + type=str, + help="Code dict file path to use for inference", + ) + parser.add_argument( + "--waveglow_path", + type=str, + help="Waveglow (vocoder) model file path to use for inference", + ) + parser.add_argument("--max_decoder_steps", type=int, default=2000) + parser.add_argument("--denoiser_strength", type=float, default=0.1) + return parser + + +################################################ +def main(args, logger): + # Acoustic Model + logger.info(f"Loading acoustic model from {args.tts_model_path}...") + feature_reader_cls = get_feature_reader(args.feature_type) + reader = feature_reader_cls( + checkpoint_path=args.acoustic_model_path, layer=args.layer + ) + + # K-means Model + logger.info(f"Loading K-means model from {args.kmeans_model_path} ...") + kmeans_model = joblib.load(open(args.kmeans_model_path, "rb")) + kmeans_model.verbose = False + + # TTS Model + logger.info(f"Loading TTS model from {args.tts_model_path}...") + tacotron_model, sample_rate, hparams = load_tacotron( + tacotron_model_path=args.tts_model_path, + max_decoder_steps=args.max_decoder_steps, + ) + + # Waveglow Model + logger.info(f"Loading Waveglow model from {args.waveglow_path}...") + waveglow, denoiser = load_waveglow(waveglow_path=args.waveglow_path) + + # Dataset + if not os.path.exists(hparams.code_dict): + hparams.code_dict = args.code_dict_path + tts_dataset = TacotronInputDataset(hparams) + + iters = 0 + while True: + in_file_path = input("Input: Enter the full file path of audio file...\n") + out_file_path = input("Output: Enter the full file path of audio file...\n") + feats = reader.get_feats(in_file_path).cpu().numpy() + iters += 1 + if iters == 1000: + gc.collect() + torch.cuda.empty_cache() + + quantized_units = kmeans_model.predict(feats) + quantized_units_str = " ".join(map(str, quantized_units)) + + tts_input = tts_dataset.get_tensor(quantized_units_str) + mel, aud, aud_dn, has_eos = synthesize_audio( + tacotron_model, + waveglow, + denoiser, + tts_input.unsqueeze(0), + strength=args.denoiser_strength, + ) + sf.write(f"{out_file_path}", aud_dn[0].cpu().float().numpy(), sample_rate) + logger.info("Resynthesis done!\n") + + +if __name__ == "__main__": + parser = get_parser() + args 
= parser.parse_args() + logger = get_logger() + logger.info(args) + main(args, logger) diff --git a/examples/textless_nlp/gslm/ulm/README.md b/examples/textless_nlp/gslm/ulm/README.md new file mode 100644 index 0000000000..01459121ce --- /dev/null +++ b/examples/textless_nlp/gslm/ulm/README.md @@ -0,0 +1,72 @@ +# Unit Language Model (ULM) + +Here you can find links to the pre-trained ULMs and instructions on training new models using fairseq. At the end of the page, we also share how to run sampling for those models and provide pointers to the transcribed prompts we used. + +## Pre-trained models + +Using the links below, you can download pre-trained models for various unit types and vocabulary sizes: + +| | 50 | 100 | 200 +|-|-|-|- +| LogMel Filterbank | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/lm_km50/logmel50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/lm_km100/logmel100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/lm_km200/logmel200_lm.tgz) +| Modified CPC | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/lm_km50/cpc50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/lm_km100/cpc100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/lm_km200/cpc200_lm.tgz) +| HuBERT | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km50/hubert50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km100/hubert100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km200/hubert200_lm.tgz) +| Wav2Vec 2.0 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/lm_km50/w2v2_50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/lm_km100/w2v2_100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/lm_km200/w2v2_200_lm.tgz) + + +## Preprocessing data +Assuming that unit-transcribed train, valid, and test sets are located in `data/train.txt`, `data/valid.txt`, and `data/test.txt`, respectively, +we run the following command to get a preprocessed version of the datast in `data-bin`: + +```bash +fairseq-preprocess --only-source \ + --trainpref data/train.txt --validpref data/valid.txt --testpref data/test.txt \ + --destdir data-bin/ --workers 40 +``` +As a result, the `data-bin` directory should appear. + +## Fitting a Unit Language Model (ULM) +As an ULM, we train a standard fairseq Transformer LM. Assuming 8 GPUs used for training, a good starting point for an ULM training would be: +```bash + fairseq-train data-bin/ \ + --task=language_modeling \ + --arch=transformer_lm_big \ + --share-decoder-input-output-embed \ + --dropout=0.1 \ + --attention-dropout=0.1 \ + --optimizer=adam \ + --adam-betas='(0.9, 0.98)' \ + --clip-norm=1.0 \ + --lr=0.0005 \ + --lr-scheduler=inverse_sqrt \ + --warmup-updates=4000 \ + --warmup-init-lr=1e-07 \ + --tokens-per-sample=3072 \ + --update-freq=16 \ + --max-tokens=4096 \ + --num-workers=4 \ + --skip-invalid-size-inputs-valid-test \ + --max-update=500000 \ + --log-interval=10 \ + --seed=100501 \ + --fp16 \ + --sample-break-mode=eos +``` +This command will train a Transformer-large model (12 layers). You can train other standard LM models provided by fairseq, e.g. specify `--arch=transformer_lm` to train a smaller (6-layer) Transformer model. When training with a different number of GPUs, it might be a good idea to adjust the `update-freq` parameter. 
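For example (a rule of thumb on our side, not a setting taken from the paper), keeping the effective batch size `num_gpus * update_freq` roughly constant means that the 8-GPU recipe above with `--update-freq=16` would become `--update-freq=64` when run on 2 GPUs.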
To save the GPU memory at an expense of additional computation, it can be useful to enable activation checkpointing with `--checkpoint-activations`. + +## Sampling from an ULM +Once an ULM was trained, we can use it for generating new utterances. Suppose, that the prompts are given in a file named `prompts.txt`. Then we can sample continuations by running the following command: + +```bash + python sample.py data-bin/ \ + --path=checkpoints/checkpoint_best.pt --task=language_modeling --sampling --temperature=0.7 \ + --seed=1 --prompts=prompts.txt --output=samples.txt --max-len-a=0 --max-len-b=500 \ + --prefix-size=-1 --batch-size=16 --fp16 --samples-per-prompt=10 +``` +Here, `--prefix-size` controls the number of tokens that are used to prime the ULM. When set to a positive value, the sampling script will take first `prefix-size` tokens to prompt the ULM; with `0` it runs unconditional sampling and with `-1` the entire prompt is used. +`--samples-per-prompt` specifies how many utterances are generated with every prompt which can be useful when generating multiple prompt continuations. In this command, `--max-len-a` and `--max-len-b` control the number of generated tokens. + +When using a pretrained model from above, `data-bin` should point to the unpacked directory (with `dict.txt` file). + +Evaluation-time, to generate prompts, we used utterances from LibriSpeech dev-clean and test-clean that are longer than 6s. We took first 3s from an utterance as a prompt. Unit transcripts of those prompts can be downloaded here: [[dev]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/dev_prompts.tgz) [[test]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/test_prompts.tgz) + diff --git a/examples/textless_nlp/gslm/ulm/sample.py b/examples/textless_nlp/gslm/ulm/sample.py new file mode 100644 index 0000000000..77302a6894 --- /dev/null +++ b/examples/textless_nlp/gslm/ulm/sample.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
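+# Prompt file format: one "<seq_id>|<unit sequence>" line per prompt; each
+# sampled continuation is written to --output as "<seq_id>__<sample_index>|<units>".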
+""" +Sample from a trained LM; hacked fairseq-interactive +""" +from collections import namedtuple +import os +import ast +import numpy as np + +from fairseq import checkpoint_utils, options, tasks, utils + +import tqdm + +Batch = namedtuple('Batch', 'ids src_tokens src_lengths') +Translation = namedtuple('Translation', 'src_str hypos pos_scores alignments') + + +def make_batches(lines, args, task, max_positions): + tokens = [ + task.source_dictionary.encode_line( + src_str, add_if_not_exist=False + ).long() + for src_str in lines + ] + lengths = [t.numel() for t in tokens] + itr = task.get_batch_iterator( + dataset=task.build_dataset_for_inference(tokens, lengths), + max_tokens=args.dataset.max_tokens, + max_sentences=args.dataset.batch_size, + max_positions=max_positions, + ignore_invalid_inputs=args.dataset.skip_invalid_size_inputs_valid_test + ).next_epoch_itr(shuffle=False) + for batch in itr: + yield Batch( + ids=batch['id'], + src_tokens=batch['net_input']['src_tokens'], src_lengths=batch['net_input']['src_lengths'], + ) + + +def main(args): + arg_prompts = args.prompts + arg_output = args.output + arg_debug = args.debug + arg_sample_size = args.samples_per_prompt + + try: + from fairseq.dataclass.utils import convert_namespace_to_omegaconf + args = convert_namespace_to_omegaconf(args) + except: + pass + + # if args.max_tokens is None and args.max_sentences is None: + if args.common.seed is not None: + np.random.seed(args.common.seed) + utils.set_torch_seed(args.common.seed) + + if args.generation.sampling: + args.generation.nbest = args.generation.beam = arg_sample_size + + task = tasks.setup_task(args.task) + + overrides = ast.literal_eval(args.common_eval.model_overrides) + + models, _model_args = checkpoint_utils.load_model_ensemble( + args.common_eval.path.split(os.pathsep), + arg_overrides=overrides, + task=task, + suffix=getattr(args, "checkpoint_suffix", ""), + ) + + # Set dictionaries + src_dict = task.source_dictionary + tgt_dict = task.target_dictionary + + # Optimize ensemble for generation + for model in models: + model.prepare_for_inference_(args) + model.cuda() + + # Load alignment dictionary for unknown word replacement + # (None if no unknown word replacement, empty if no path to align dictionary) + align_dict = utils.load_align_dict(args.generation.replace_unk) + + max_positions = utils.resolve_max_positions( + task.max_positions(), + *[model.max_positions() for model in models] + ) + + output_file = open(arg_output, 'w') + + with open(arg_prompts, 'r') as fin: + lines = fin.readlines() + + split = [x.split('|', 1) for x in lines] + seq_id = [x[0] for x in split] + prompts = [x[1] for x in split] + + if args.generation.prefix_size >= 0: + prompts = [' '.join(l.split()[:args.generation.prefix_size]) + for l in prompts] + + if arg_debug: + prompts = prompts[:10] + + generator = task.build_generator(models, args.generation) + + start_id = 0 + pbar = tqdm.tqdm(total=len(prompts)) + for batch in make_batches(prompts, args, task, max_positions): + src_tokens = batch.src_tokens + src_lengths = batch.src_lengths + src_tokens = src_tokens.cuda() + src_lengths = src_lengths.cuda() + + sample = { + 'net_input': { + 'src_tokens': src_tokens, + 'src_lengths': src_lengths, + }, + } + + results = [] + translations = task.inference_step(generator, models, sample) + for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): + src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad()) + results.append((i + start_id, src_tokens_i, hypos)) + + # sort output to match 
input order + for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]): + if src_dict is not None: + src_str = src_dict.string( + src_tokens, args.common_eval.post_process) + + # Process top predictions + for hypo_id, hypo in enumerate(hypos): + _hypo_tokens, hypo_str, _alignment = utils.post_process_prediction( + hypo_tokens=hypo['tokens'].int().cpu(), + src_str=src_str, + alignment=hypo['alignment'], + align_dict=align_dict, + tgt_dict=tgt_dict, + remove_bpe=args.common_eval.post_process, + ) + + detok_hypo_str = hypo_str + utterance = detok_hypo_str + print(f'{seq_id[id]}__{hypo_id}|{utterance}', file=output_file) + pbar.update(1) + start_id += len(results) + + # output_file.close() + + +def cli_main(): + parser = options.get_interactive_generation_parser() + parser.add_argument('--prompts', type=str, default=None, required=True) + parser.add_argument('--output', type=str, default=None, required=True) + parser.add_argument('--debug', action='store_true') + parser.add_argument('--samples-per-prompt', type=int, default=1) + + args = options.parse_args_and_arch(parser) + + np.random.seed(args.seed) + utils.set_torch_seed(args.seed) + + main(args) + + +if __name__ == '__main__': + cli_main() diff --git a/examples/textless_nlp/gslm/unit2speech/README.md b/examples/textless_nlp/gslm/unit2speech/README.md new file mode 100644 index 0000000000..e61601392b --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/README.md @@ -0,0 +1,40 @@ +# Unit to Speech Model (unit2speech) + +Unit to speech model is modified Tacotron2 model that learns to synthesize speech from discrete speech units. All models are trained on quantized [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). + +Upstream Units | Download Links | model md5 +|-|-|- +Log Mel Filterbank + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km50/code_dict) | 932b3b8527c0125f5f964b57762eba49 +Log Mel Filterbank + KM100 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km100/code_dict) | cde0b0d278a39011d0acbd5df27abdf4 +Log Mel Filterbank + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km200/code_dict) | dba0f1d4de64bc7976718834010b23e7 +Modified CPC + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km50/code_dict) | a585e8dd8890ea56164f17635dd8e613 +Modified CPC + KM100 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km100/code_dict) | 5c0ee2869b4f483d17f37f1a41a548e0 +Modified CPC + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km200/code_dict) | 2f0c9951cf37020d9464514bff48bc5d +HuBERT Base + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km50/code_dict) | 85ffce8baec5aa90035ab696fe676fce +HuBERT Base + KM100 | 
[model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km100/code_dict) | df4a9c6ffd1bb00c91405432c234aba3 +HuBERT Base + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km200/code_dict) | ac72f2c0c563589819bec116c7f8d274 +wav2vec 2.0 Large + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km50/code_dict) | e3503d0ad822b2c24b89f68b857fedff +wav2vec 2.0 Large + KM100 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km100/code_dict) | eb3666e456ae4c96bf2a1eec825c13ed +wav2vec 2.0 Large + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km200/code_dict) | 777d343e963c4d64f04d78eef032f4e8 + +## Run inference using a unit2speech model +* Install librosa, unidecode and inflect using `pip install librosa, unidecode, inflect` +* Download [Waveglow checkpoint](https://dl.fbaipublicfiles.com/textless_nlp/gslm/waveglow_256channels_new.pt). This is the vocoder. + +Sample commnd to run inference using trained unit2speech models. Please note that the quantized audio to synthesized should be using the same units as the unit2speech model was trained with. +``` +FAIRSEQ_ROOT=<path_to_your_fairseq_repo_root> +TTS_MODEL_PATH=<unit2speech_model_file_path> +QUANTIZED_UNIT_PATH=<quantized_audio_file_path> +OUT_DIR=<dir_to_dump_synthesized_audio_files> +WAVEGLOW_PATH=<path_where_you_have_downloaded_waveglow_checkpoint> +CODE_DICT_PATH=<unit2speech_code_dict_path> + +PYTHONPATH=${FAIRSEQ_ROOT}:${FAIRSEQ_ROOT}/examples/textless_nlp/gslm/unit2speech python ${FAIRSEQ_ROOT}/examples/textless_nlp/gslm/unit2speech/synthesize_audio_from_units.py \ + --tts_model_path $TTS_MODEL_PATH \ + --quantized_unit_path $QUANTIZED_UNIT_PATH \ + --out_audio_dir $OUT_DIR \ + --waveglow_path $WAVEGLOW_PATH \ + --code_dict_path $CODE_DICT_PATH \ + --max_decoder_steps 2000 +``` diff --git a/examples/textless_nlp/gslm/unit2speech/convert_to_16k.py b/examples/textless_nlp/gslm/unit2speech/convert_to_16k.py new file mode 100644 index 0000000000..2be848fcea --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/convert_to_16k.py @@ -0,0 +1,56 @@ +import os +import shlex +import subprocess +import progressbar +from time import time +from pathlib import Path + +def find_all_files(path_dir, extension): + out = [] + for root, dirs, filenames in os.walk(path_dir): + for f in filenames: + if f.endswith(extension): + out.append(((str(Path(f).stem)), os.path.join(root, f))) + return out + +def convert16k(inputfile, outputfile16k): + command = ('sox -c 1 -b 16 {} -t wav {} rate 16k'.format(inputfile, outputfile16k)) + subprocess.call(shlex.split(command)) + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description='Convert to wav 16k audio using sox.') + parser.add_argument('input_dir', type=str, + help='Path to the input dir.') + parser.add_argument('output_dir', type=str, + help='Path to the output dir.') + parser.add_argument('--extension', type=str, default='wav', + help='Audio file extension in 
the input. Default: mp3') + args = parser.parse_args() + + # Find all sequences + print(f"Finding all audio files with extension '{args.extension}' from {args.input_dir}...") + audio_files = find_all_files(args.input_dir, args.extension) + print(f"Done! Found {len(audio_files)} files.") + + # Convert to relative path + audio_files = [os.path.relpath(file[-1], start=args.input_dir) for file in audio_files] + + # Create all the directories needed + rel_dirs_set = set([os.path.dirname(file) for file in audio_files]) + for rel_dir in rel_dirs_set: + Path(os.path.join(args.output_dir, rel_dir)).mkdir(parents=True, exist_ok=True) + + # Converting wavs files + print("Converting the audio to wav files...") + bar = progressbar.ProgressBar(maxval=len(audio_files)) + bar.start() + start_time = time() + for index, file in enumerate(audio_files): + bar.update(index) + input_file = os.path.join(args.input_dir, file) + output_file = os.path.join(args.output_dir, os.path.splitext(file)[0]+".wav") + convert16k(input_file, output_file) + bar.finish() + print(f"...done {len(audio_files)} files in {time()-start_time} seconds.") \ No newline at end of file diff --git a/examples/textless_nlp/gslm/unit2speech/glow.py b/examples/textless_nlp/gslm/unit2speech/glow.py new file mode 100644 index 0000000000..41fd437feb --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/glow.py @@ -0,0 +1,312 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** +import copy +import torch +from torch.autograd import Variable +import torch.nn.functional as F + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a+input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class WaveGlowLoss(torch.nn.Module): + def __init__(self, sigma=1.0): + super(WaveGlowLoss, self).__init__() + self.sigma = sigma + + def forward(self, model_output): + z, log_s_list, log_det_W_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = torch.sum(log_s) + log_det_W_total = log_det_W_list[i] + else: + log_s_total = log_s_total + torch.sum(log_s) + log_det_W_total += log_det_W_list[i] + + loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total + return loss/(z.size(0)*z.size(1)*z.size(2)) + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, + bias=False) + + # Sample a random orthonormal matrix to initialize weights + _qr = torch.linalg.qr if torch.__version__ >= "1.8" else torch.qr + W = _qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:,0] = -1*W[:,0] + W = W.view(c, c, 1) + self.conv.weight.data = W + + def forward(self, z, reverse=False): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if reverse: + if not hasattr(self, 'W_inverse'): + # Reverse computation + W_inverse = W.float().inverse() + W_inverse = Variable(W_inverse[..., None]) + if z.type() == 'torch.cuda.HalfTensor': + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + # Forward computation + log_det_W = batch_size * n_of_groups * torch.logdet(W) + z = self.conv(z) + return z, log_det_W + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary difference + from WaveNet is the convolutions need not be causal. There is also no dilation + size reset. The dilation only doubles on each layer + """ + def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, + kernel_size): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + assert(n_channels % 2 == 0) + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name='weight') + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = 2 ** i + padding = int((kernel_size*dilation - dilation)/2) + in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2*n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + output = torch.zeros_like(audio) + n_channels_tensor = torch.IntTensor([self.n_channels]) + + spect = self.cond_layer(spect) + + for i in range(self.n_layers): + spect_offset = i*2*self.n_channels + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + spect[:,spect_offset:spect_offset+2*self.n_channels,:], + n_channels_tensor) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = audio + res_skip_acts[:,:self.n_channels,:] + output = output + res_skip_acts[:,self.n_channels:,:] + else: + output = output + res_skip_acts + + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, + n_early_size, WN_config): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, + n_mel_channels, + 1024, stride=256) + assert(n_group % 2 == 0) + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group/2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size/2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels # Useful during inference + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert(spect.size(2) >= audio.size(1)) + if spect.size(2) > audio.size(1): + spect = spect[:, :, :audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:,:self.n_early_size,:]) + audio = audio[:,self.n_early_size:,:] + + audio, log_det_W = 
self.convinv[k](audio) + log_det_W_list.append(log_det_W) + + n_half = int(audio.size(1)/2) + audio_0 = audio[:,:n_half,:] + audio_1 = audio[:,n_half:,:] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s)*audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1],1) + + output_audio.append(audio) + return torch.cat(output_audio,1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) + + if spect.type() == 'torch.cuda.HalfTensor': + audio = torch.cuda.HalfTensor(spect.size(0), + self.n_remaining_channels, + spect.size(2)).normal_() + else: + audio = torch.cuda.FloatTensor(spect.size(0), + self.n_remaining_channels, + spect.size(2)).normal_() + + audio = torch.autograd.Variable(sigma*audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1)/2) + audio_0 = audio[:,:n_half,:] + audio_1 = audio[:,n_half:,:] + + output = self.WN[k]((audio_0, spect)) + + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1],1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == 'torch.cuda.HalfTensor': + z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() + else: + z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() + audio = torch.cat((sigma*z, audio),1) + + audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data + return audio + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + return new_conv_list diff --git a/examples/textless_nlp/gslm/unit2speech/multiproc.py b/examples/textless_nlp/gslm/unit2speech/multiproc.py new file mode 100644 index 0000000000..2a287a4e97 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/multiproc.py @@ -0,0 +1,27 @@ +import os +import time +import torch +import sys +import subprocess + +argslist = list(sys.argv)[1:] +log_dir = argslist[-1] +num_gpus = torch.cuda.device_count() +argslist.append('--n_gpus={}'.format(num_gpus)) +workers = [] +job_id = time.strftime("%Y_%m_%d-%H%M%S") +argslist.append("--group_name=group_{}".format(job_id)) + +print("GPU log directory is {}".format(log_dir)) +os.makedirs(log_dir, exist_ok=True) +for i in range(num_gpus): + argslist.append('--rank={}'.format(i)) + stdout = None if i == 0 else open("{}/{}_GPU_{}.log".format(log_dir, job_id, i), + "w") + print(argslist) + p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) + workers.append(p) + argslist = argslist[:-1] + +for p in workers: + p.wait() diff --git 
a/examples/textless_nlp/gslm/unit2speech/synthesize_audio_from_units.py b/examples/textless_nlp/gslm/unit2speech/synthesize_audio_from_units.py new file mode 100644 index 0000000000..80730843bf --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/synthesize_audio_from_units.py @@ -0,0 +1,105 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os + +import soundfile as sf +from examples.textless_nlp.gslm.unit2speech.tts_data import ( + TacotronInputDataset, +) +from examples.textless_nlp.gslm.unit2speech.utils import ( + load_quantized_audio_from_file, + load_tacotron, + load_waveglow, + synthesize_audio, +) + + +def get_logger(): + log_format = "[%(asctime)s] [%(levelname)s]: %(message)s" + logging.basicConfig(format=log_format, level=logging.INFO) + logger = logging.getLogger(__name__) + return logger + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Wav2Vec 2.0 speech generator." + ) + parser.add_argument( + "--quantized_unit_path", + type=str, + help="K-means model file path to use for inference", + ) + parser.add_argument( + "--tts_model_path", + type=str, + help="TTS model file path to use for inference", + ) + parser.add_argument( + "--waveglow_path", + type=str, + help="Path to the waveglow checkpoint (vocoder).", + ) + parser.add_argument( + "--code_dict_path", + type=str, + help="Code dict file path to use for inference", + ) + parser.add_argument("--max_decoder_steps", type=int, default=2000) + parser.add_argument("--denoiser_strength", type=float, default=0.1) + parser.add_argument( + "--out_audio_dir", + type=str, + help="Output directory to dump audio files", + ) + + return parser + + +def main(args, logger): + # Load quantized audio + logger.info(f"Loading quantized audio from {args.quantized_unit_path}...") + names_batch, quantized_units_batch = load_quantized_audio_from_file( + file_path=args.quantized_unit_path + ) + + logger.info(f"Loading TTS model from {args.tts_model_path}...") + tacotron_model, sample_rate, hparams = load_tacotron( + tacotron_model_path=args.tts_model_path, + max_decoder_steps=args.max_decoder_steps, + ) + + logger.info(f"Loading Waveglow model from {args.waveglow_path}...") + waveglow, denoiser = load_waveglow(waveglow_path=args.waveglow_path) + + if not os.path.exists(hparams.code_dict): + hparams.code_dict = args.code_dict_path + tts_dataset = TacotronInputDataset(hparams) + + for name, quantized_units in zip(names_batch, quantized_units_batch): + quantized_units_str = " ".join(map(str, quantized_units)) + tts_input = tts_dataset.get_tensor(quantized_units_str) + mel, aud, aud_dn, has_eos = synthesize_audio( + tacotron_model, + waveglow, + denoiser, + tts_input.unsqueeze(0), + strength=args.denoiser_strength, + ) + out_file_path = os.path.join(args.out_audio_dir, f"{name}.wav") + sf.write( + f"{out_file_path}", aud_dn[0].cpu().float().numpy(), sample_rate + ) + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + logger = get_logger() + logger.info(args) + main(args, logger) diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/__init__.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/audio_processing.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/audio_processing.py new 
file mode 100644 index 0000000000..b5af7f723e --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/audio_processing.py @@ -0,0 +1,93 @@ +import torch +import numpy as np +from scipy.signal import get_window +import librosa.util as librosa_util + + +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. + + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm)**2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + + +def griffin_lim(magnitudes, stft_fn, n_iters=30): + """ + PARAMS + ------ + magnitudes: spectrogram magnitudes + stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods + """ + + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) + angles = angles.astype(np.float32) + angles = torch.autograd.Variable(torch.from_numpy(angles)) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + + for i in range(n_iters): + _, angles = stft_fn.transform(signal) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + return signal + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/cleaners.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/cleaners.py new file mode 100644 index 0000000000..e2e35c1a8c --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/cleaners.py @@ -0,0 +1,90 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. 
"basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). +''' + +import re +from unidecode import unidecode +from .numbers import normalize_numbers + + +# Regular expression matching whitespace: +_whitespace_re = re.compile(r'\s+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def expand_numbers(text): + return normalize_numbers(text) + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return re.sub(_whitespace_re, ' ', text) + + +def convert_to_ascii(text): + return unidecode(text) + + +def basic_cleaners(text): + '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def transliteration_cleaners(text): + '''Pipeline for non-English text that transliterates to ASCII.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def english_cleaners(text): + '''Pipeline for English text, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/cmudict.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/cmudict.py new file mode 100644 index 0000000000..62bfef745c --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/cmudict.py @@ -0,0 +1,65 @@ +""" from https://github.com/keithito/tacotron """ + +import re + + +valid_symbols = [ + 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', + 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', + 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', + 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', + 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', + 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', + 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' +] + +_valid_symbol_set = set(valid_symbols) + + +class CMUDict: + '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' + def __init__(self, file_or_path, keep_ambiguous=True): + if isinstance(file_or_path, str): + with open(file_or_path, encoding='latin-1') as f: + entries = _parse_cmudict(f) + else: + entries = _parse_cmudict(file_or_path) + if not keep_ambiguous: + entries = {word: pron for word, pron in entries.items() if len(pron) == 1} + self._entries = entries + + + def __len__(self): + return len(self._entries) + + + def lookup(self, word): + '''Returns list of ARPAbet pronunciations of the given word.''' + return self._entries.get(word.upper()) + + + +_alt_re = re.compile(r'\([0-9]+\)') + + +def _parse_cmudict(file): + cmudict = {} + for line in file: + if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): + parts = line.split(' ') + word = re.sub(_alt_re, '', parts[0]) + pronunciation = _get_pronunciation(parts[1]) + if pronunciation: + if word in cmudict: + cmudict[word].append(pronunciation) + else: + cmudict[word] = [pronunciation] + return cmudict + + +def _get_pronunciation(s): + parts = s.strip().split(' ') + for part in parts: + if part not in _valid_symbol_set: + return None + return ' '.join(parts) diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/layers.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/layers.py new file mode 100644 index 0000000000..f10d557ff5 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/layers.py @@ -0,0 +1,103 @@ +import torch +from librosa.filters import mel as librosa_mel_fn +from .audio_processing import dynamic_range_compression +from .audio_processing import dynamic_range_decompression +from .stft import STFT +from .utils import get_mask_from_lengths + + +class LinearNorm(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): + super(LinearNorm, self).__init__() + self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_uniform_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, x): + return self.linear_layer(x) + + +class ConvNorm(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, + padding=None, dilation=1, bias=True, w_init_gain='linear'): + super(ConvNorm, self).__init__() + if padding is None: + assert(kernel_size % 2 == 1) + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d(in_channels, out_channels, + kernel_size=kernel_size, stride=stride, + padding=padding, dilation=dilation, + bias=bias) + + torch.nn.init.xavier_uniform_( + self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class GlobalAvgPool(torch.nn.Module): + def __init__(self): + super(GlobalAvgPool, self).__init__() + + def forward(self, x, lengths=None): + """Average pooling across time steps (dim=1) with optionally lengths. + Args: + x: torch.Tensor of shape (N, T, ...) 
+ lengths: None or torch.Tensor of shape (N,) + dim: dimension to pool + """ + if lengths is None: + return x.mean(dim=1, keepdim=False) + else: + mask = get_mask_from_lengths(lengths).type(x.type()).to(x.device) + mask_shape = list(mask.size()) + [1 for _ in range(x.ndimension()-2)] + mask = mask.reshape(*mask_shape) + numer = (x * mask).sum(dim=1, keepdim=False) + denom = mask.sum(dim=1, keepdim=False) + return numer / denom + + +class TacotronSTFT(torch.nn.Module): + def __init__(self, filter_length=1024, hop_length=256, win_length=1024, + n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, + mel_fmax=8000.0): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.stft_fn = STFT(filter_length, hop_length, win_length) + mel_basis = librosa_mel_fn( + sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer('mel_basis', mel_basis) + + def spectral_normalize(self, magnitudes): + output = dynamic_range_compression(magnitudes) + return output + + def spectral_de_normalize(self, magnitudes): + output = dynamic_range_decompression(magnitudes) + return output + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert(torch.min(y.data) >= -1) + assert(torch.max(y.data) <= 1) + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/model.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/model.py new file mode 100644 index 0000000000..ccf132b150 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/model.py @@ -0,0 +1,669 @@ +from math import sqrt +import torch +import torch.distributions as distr +from torch.autograd import Variable +from torch import nn +from torch.nn import functional as F +from .layers import ConvNorm, LinearNorm, GlobalAvgPool +from .utils import to_gpu, get_mask_from_lengths + + +class LocationLayer(nn.Module): + def __init__(self, attention_n_filters, attention_kernel_size, + attention_dim): + super(LocationLayer, self).__init__() + padding = int((attention_kernel_size - 1) / 2) + self.location_conv = ConvNorm(2, attention_n_filters, + kernel_size=attention_kernel_size, + padding=padding, bias=False, stride=1, + dilation=1) + self.location_dense = LinearNorm(attention_n_filters, attention_dim, + bias=False, w_init_gain='tanh') + + def forward(self, attention_weights_cat): + processed_attention = self.location_conv(attention_weights_cat) + processed_attention = processed_attention.transpose(1, 2) + processed_attention = self.location_dense(processed_attention) + return processed_attention + + +class Attention(nn.Module): + def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, + attention_location_n_filters, attention_location_kernel_size): + super(Attention, self).__init__() + self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, + bias=False, w_init_gain='tanh') + self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, + w_init_gain='tanh') + self.v = LinearNorm(attention_dim, 1, bias=False) + self.location_layer = 
LocationLayer(attention_location_n_filters, + attention_location_kernel_size, + attention_dim) + self.score_mask_value = -float("inf") + + def get_alignment_energies(self, query, processed_memory, + attention_weights_cat): + """ + PARAMS + ------ + query: decoder output (batch, n_mel_channels * n_frames_per_step) + processed_memory: processed encoder outputs (B, T_in, attention_dim) + attention_weights_cat: cumulative and prev. att weights (B, 2, max_time) + + RETURNS + ------- + alignment (batch, max_time) + """ + + processed_query = self.query_layer(query.unsqueeze(1)) + processed_attention_weights = self.location_layer(attention_weights_cat) + energies = self.v(torch.tanh( + processed_query + processed_attention_weights + processed_memory)) + + energies = energies.squeeze(-1) + return energies + + def forward(self, attention_hidden_state, memory, processed_memory, + attention_weights_cat, mask): + """ + PARAMS + ------ + attention_hidden_state: attention rnn last output + memory: encoder outputs + processed_memory: processed encoder outputs + attention_weights_cat: previous and cummulative attention weights + mask: binary mask for padded data + """ + alignment = self.get_alignment_energies( + attention_hidden_state, processed_memory, attention_weights_cat) + + if mask is not None: + alignment.data.masked_fill_(mask, self.score_mask_value) + + attention_weights = F.softmax(alignment, dim=1) + attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) + attention_context = attention_context.squeeze(1) + + return attention_context, attention_weights + + +class Prenet(nn.Module): + def __init__(self, in_dim, sizes): + super(Prenet, self).__init__() + in_sizes = [in_dim] + sizes[:-1] + self.layers = nn.ModuleList( + [LinearNorm(in_size, out_size, bias=False) + for (in_size, out_size) in zip(in_sizes, sizes)]) + + def forward(self, x): + for linear in self.layers: + x = F.dropout(F.relu(linear(x)), p=0.5, training=True) + return x + + +class Postnet(nn.Module): + """Postnet + - Five 1-d convolution with 512 channels and kernel size 5 + """ + + def __init__(self, hparams): + super(Postnet, self).__init__() + self.convolutions = nn.ModuleList() + + self.convolutions.append( + nn.Sequential( + ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim, + kernel_size=hparams.postnet_kernel_size, stride=1, + padding=int((hparams.postnet_kernel_size - 1) / 2), + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(hparams.postnet_embedding_dim)) + ) + + for i in range(1, hparams.postnet_n_convolutions - 1): + self.convolutions.append( + nn.Sequential( + ConvNorm(hparams.postnet_embedding_dim, + hparams.postnet_embedding_dim, + kernel_size=hparams.postnet_kernel_size, stride=1, + padding=int((hparams.postnet_kernel_size - 1) / 2), + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(hparams.postnet_embedding_dim)) + ) + + self.convolutions.append( + nn.Sequential( + ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels, + kernel_size=hparams.postnet_kernel_size, stride=1, + padding=int((hparams.postnet_kernel_size - 1) / 2), + dilation=1, w_init_gain='linear'), + nn.BatchNorm1d(hparams.n_mel_channels)) + ) + + def forward(self, x): + for i in range(len(self.convolutions) - 1): + x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) + x = F.dropout(self.convolutions[-1](x), 0.5, self.training) + + return x + + +class Encoder(nn.Module): + """Encoder module: + - Three 1-d convolution banks + - Bidirectional LSTM + """ + def __init__(self, hparams): + super(Encoder, 
self).__init__() + + convolutions = [] + for _ in range(hparams.encoder_n_convolutions): + conv_layer = nn.Sequential( + ConvNorm(hparams.encoder_embedding_dim, + hparams.encoder_embedding_dim, + kernel_size=hparams.encoder_kernel_size, stride=1, + padding=int((hparams.encoder_kernel_size - 1) / 2), + dilation=1, w_init_gain='relu'), + nn.BatchNorm1d(hparams.encoder_embedding_dim)) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm = nn.LSTM(hparams.encoder_embedding_dim, + int(hparams.encoder_embedding_dim / 2), 1, + batch_first=True, bidirectional=True) + + def forward(self, x, input_lengths): + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) + + # pytorch tensor are not reversible, hence the conversion + input_lengths = input_lengths.cpu().numpy() + x = nn.utils.rnn.pack_padded_sequence( + x, input_lengths, batch_first=True) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + + outputs, _ = nn.utils.rnn.pad_packed_sequence( + outputs, batch_first=True) + + return outputs + + def inference(self, x): + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + + return outputs + + +class AudioEncoder(nn.Module): + def __init__(self, hparams): + super(AudioEncoder, self).__init__() + + assert hparams.lat_dim > 0 + + convolutions = [] + inp_dim = hparams.n_mel_channels + for _ in range(hparams.lat_n_convolutions): + conv_layer = nn.Sequential( + ConvNorm(inp_dim, hparams.lat_n_filters, + kernel_size=hparams.lat_kernel_size, stride=1, + padding=int((hparams.lat_kernel_size - 1) / 2), + dilation=1, w_init_gain='tanh'), + nn.BatchNorm1d(hparams.lat_n_filters)) + inp_dim = hparams.lat_n_filters + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm = nn.LSTM(hparams.lat_n_filters, + int(hparams.lat_n_filters / 2), + hparams.lat_n_blstms, batch_first=True, + bidirectional=True) + self.pool = GlobalAvgPool() + + self.mu_proj = LinearNorm(hparams.lat_n_filters, hparams.lat_dim) + self.logvar_proj = LinearNorm(hparams.lat_n_filters, hparams.lat_dim) + self.lat_dim = hparams.lat_dim + + def forward(self, x, lengths): + """ + Args: + x (torch.Tensor): (B, F, T) + """ + + for conv in self.convolutions: + x = F.dropout(F.tanh(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) # (B, T, D) + + # x may not be sorted by length. 
Sort->process->unsort + max_len = x.size(1) + assert max_len == torch.max(lengths).item() + + lengths, perm_idx = lengths.sort(0, descending=True) + x = x[perm_idx] + x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) + + _, unperm_idx = perm_idx.sort(0) + outputs = outputs[unperm_idx] # (B, T, D) + lengths = lengths[unperm_idx] # (B, T, D) + + outputs = self.pool(outputs, lengths) # (B, D) + + mu = self.mu_proj(outputs) + logvar = self.logvar_proj(outputs) + z = distr.Normal(mu, logvar).rsample() + return z, mu, logvar + + +class Decoder(nn.Module): + def __init__(self, hparams): + super(Decoder, self).__init__() + self.n_mel_channels = hparams.n_mel_channels + self.n_frames_per_step = hparams.n_frames_per_step + self.encoder_embedding_dim = hparams.encoder_embedding_dim + self.obs_dim = hparams.obs_dim + self.lat_dim = hparams.lat_dim + self.attention_rnn_dim = hparams.attention_rnn_dim + self.decoder_rnn_dim = hparams.decoder_rnn_dim + self.prenet_dim = hparams.prenet_dim + self.max_decoder_steps = hparams.max_decoder_steps + self.gate_threshold = hparams.gate_threshold + self.p_attention_dropout = hparams.p_attention_dropout + self.p_decoder_dropout = hparams.p_decoder_dropout + + self.prenet = Prenet( + hparams.n_mel_channels * hparams.n_frames_per_step, + [hparams.prenet_dim, hparams.prenet_dim]) + + self.attention_rnn = nn.LSTMCell( + hparams.prenet_dim + hparams.encoder_embedding_dim, + hparams.attention_rnn_dim) + + self.attention_layer = Attention( + hparams.attention_rnn_dim, hparams.encoder_embedding_dim, + hparams.attention_dim, hparams.attention_location_n_filters, + hparams.attention_location_kernel_size) + + encoder_tot_dim = (hparams.encoder_embedding_dim + \ + hparams.lat_dim + hparams.obs_dim) + self.decoder_rnn = nn.LSTMCell( + hparams.attention_rnn_dim + encoder_tot_dim, + hparams.decoder_rnn_dim, 1) + + self.linear_projection = LinearNorm( + hparams.decoder_rnn_dim + encoder_tot_dim, + hparams.n_mel_channels * hparams.n_frames_per_step) + + self.gate_layer = LinearNorm( + hparams.decoder_rnn_dim + encoder_tot_dim, 1, + bias=True, w_init_gain='sigmoid') + + def get_go_frame(self, memory): + """ Gets all zeros frames to use as first decoder input + PARAMS + ------ + memory: decoder outputs + + RETURNS + ------- + decoder_input: all zeros frames + """ + B = memory.size(0) + decoder_input = Variable(memory.data.new( + B, self.n_mel_channels * self.n_frames_per_step).zero_()) + return decoder_input + + def initialize_decoder_states(self, memory, obs_and_lat, mask): + """ Initializes attention rnn states, decoder rnn states, attention + weights, attention cumulative weights, attention context, stores memory + and stores processed memory + PARAMS + ------ + memory: Encoder outputs + obs_and_lat: Observed and latent attribute embeddings + mask: Mask for padded data if training, expects None for inference + """ + B = memory.size(0) + MAX_TIME = memory.size(1) + + self.attention_hidden = Variable(memory.data.new( + B, self.attention_rnn_dim).zero_()) + self.attention_cell = Variable(memory.data.new( + B, self.attention_rnn_dim).zero_()) + + self.decoder_hidden = Variable(memory.data.new( + B, self.decoder_rnn_dim).zero_()) + self.decoder_cell = Variable(memory.data.new( + B, self.decoder_rnn_dim).zero_()) + + self.attention_weights = Variable(memory.data.new( + B, MAX_TIME).zero_()) + self.attention_weights_cum = 
Variable(memory.data.new( + B, MAX_TIME).zero_()) + self.attention_context = Variable(memory.data.new( + B, self.encoder_embedding_dim).zero_()) + + self.memory = memory + self.processed_memory = self.attention_layer.memory_layer(memory) + self.obs_and_lat = obs_and_lat + self.mask = mask + + def parse_decoder_inputs(self, decoder_inputs): + """ Prepares decoder inputs, i.e. mel outputs + PARAMS + ------ + decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs + + RETURNS + ------- + inputs: processed decoder inputs + + """ + # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels) + decoder_inputs = decoder_inputs.transpose(1, 2) + decoder_inputs = decoder_inputs.view( + decoder_inputs.size(0), + int(decoder_inputs.size(1)/self.n_frames_per_step), -1) + # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels) + decoder_inputs = decoder_inputs.transpose(0, 1) + return decoder_inputs + + def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments): + """ Prepares decoder outputs for output + PARAMS + ------ + mel_outputs: + gate_outputs: gate output energies + alignments: + + RETURNS + ------- + mel_outputs: + gate_outpust: gate output energies + alignments: + """ + # (T_out, B) -> (B, T_out) + alignments = torch.stack(alignments).transpose(0, 1) + # (T_out, B) -> (B, T_out) + gate_outputs = torch.stack(gate_outputs).transpose(0, 1) + gate_outputs = gate_outputs.contiguous() + # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels) + mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous() + # decouple frames per step + mel_outputs = mel_outputs.view( + mel_outputs.size(0), -1, self.n_mel_channels) + # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out) + mel_outputs = mel_outputs.transpose(1, 2) + + return mel_outputs, gate_outputs, alignments + + def decode(self, decoder_input): + """ Decoder step using stored states, attention and memory + PARAMS + ------ + decoder_input: previous mel output + + RETURNS + ------- + mel_output: + gate_output: gate output energies + attention_weights: + """ + cell_input = torch.cat((decoder_input, self.attention_context), -1) + self.attention_hidden, self.attention_cell = self.attention_rnn( + cell_input, (self.attention_hidden, self.attention_cell)) + self.attention_hidden = F.dropout( + self.attention_hidden, self.p_attention_dropout, self.training) + + attention_weights_cat = torch.cat( + (self.attention_weights.unsqueeze(1), + self.attention_weights_cum.unsqueeze(1)), dim=1) + self.attention_context, self.attention_weights = self.attention_layer( + self.attention_hidden, self.memory, self.processed_memory, + attention_weights_cat, self.mask) + + self.attention_weights_cum += self.attention_weights + decoder_input = torch.cat( + (self.attention_hidden, self.attention_context), -1) + if self.obs_and_lat is not None: + decoder_input = torch.cat((decoder_input, self.obs_and_lat), -1) + self.decoder_hidden, self.decoder_cell = self.decoder_rnn( + decoder_input, (self.decoder_hidden, self.decoder_cell)) + self.decoder_hidden = F.dropout( + self.decoder_hidden, self.p_decoder_dropout, self.training) + + decoder_hidden_attention_context = torch.cat( + (self.decoder_hidden, self.attention_context), dim=1) + if self.obs_and_lat is not None: + decoder_hidden_attention_context = torch.cat( + (decoder_hidden_attention_context, self.obs_and_lat), dim=1) + decoder_output = self.linear_projection( + decoder_hidden_attention_context) + + gate_prediction = self.gate_layer(decoder_hidden_attention_context) + return 
decoder_output, gate_prediction, self.attention_weights + + def forward(self, memory, obs_and_lat, decoder_inputs, memory_lengths): + """ Decoder forward pass for training + PARAMS + ------ + memory: Encoder outputs + obs_and_lat: Observed and latent attribute embeddings + decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs + memory_lengths: Encoder output lengths for attention masking. + + RETURNS + ------- + mel_outputs: mel outputs from the decoder + gate_outputs: gate outputs from the decoder + alignments: sequence of attention weights from the decoder + """ + + decoder_input = self.get_go_frame(memory).unsqueeze(0) + decoder_inputs = self.parse_decoder_inputs(decoder_inputs) + decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0) + decoder_inputs = self.prenet(decoder_inputs) + + self.initialize_decoder_states( + memory, obs_and_lat, mask=~get_mask_from_lengths(memory_lengths)) + + mel_outputs, gate_outputs, alignments = [], [], [] + while len(mel_outputs) < decoder_inputs.size(0) - 1: + decoder_input = decoder_inputs[len(mel_outputs)] + mel_output, gate_output, attention_weights = self.decode( + decoder_input) + mel_outputs += [mel_output.squeeze(1)] + gate_outputs += [gate_output.squeeze()] + alignments += [attention_weights] + + mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs( + mel_outputs, gate_outputs, alignments) + + return mel_outputs, gate_outputs, alignments + + def inference(self, memory, obs_and_lat, ret_has_eos=False): + """ Decoder inference + PARAMS + ------ + memory: Encoder outputs + obs_and_lat: Observed and latent attribute embeddings + + RETURNS + ------- + mel_outputs: mel outputs from the decoder + gate_outputs: gate outputs from the decoder + alignments: sequence of attention weights from the decoder + """ + decoder_input = self.get_go_frame(memory) + + self.initialize_decoder_states(memory, obs_and_lat, mask=None) + + mel_outputs, gate_outputs, alignments = [], [], [] + has_eos = False + while True: + decoder_input = self.prenet(decoder_input) + mel_output, gate_output, alignment = self.decode(decoder_input) + + mel_outputs += [mel_output.squeeze(1)] + gate_outputs += [gate_output] + alignments += [alignment] + + if torch.sigmoid(gate_output.data) > self.gate_threshold: + has_eos = True + break + elif len(mel_outputs) == self.max_decoder_steps: + # print("Warning! 
Reached max decoder steps") + break + + decoder_input = mel_output + + mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs( + mel_outputs, gate_outputs, alignments) + + if ret_has_eos: + return mel_outputs, gate_outputs, alignments, has_eos + else: + return mel_outputs, gate_outputs, alignments + + +class Tacotron2(nn.Module): + def __init__(self, hparams): + super(Tacotron2, self).__init__() + self.mask_padding = hparams.mask_padding + self.fp16_run = hparams.fp16_run + self.n_mel_channels = hparams.n_mel_channels + self.n_frames_per_step = hparams.n_frames_per_step + + # initialize text encoder embedding + self.embedding = nn.Embedding( + hparams.n_symbols, hparams.symbols_embedding_dim) + std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim)) + val = sqrt(3.0) * std # uniform bounds for std + self.embedding.weight.data.uniform_(-val, val) + + # initialize observed attribute embedding + self.obs_embedding = None + if hparams.obs_dim > 0: + self.obs_embedding = nn.Embedding( + hparams.obs_n_class, hparams.obs_dim) + std = sqrt(2.0 / (hparams.obs_n_class + hparams.obs_dim)) + val = sqrt(3.0) * std # uniform bounds for std + self.obs_embedding.weight.data.uniform_(-val, val) + + self.encoder = Encoder(hparams) + self.decoder = Decoder(hparams) + self.postnet = Postnet(hparams) + + self.lat_encoder = None + if hparams.lat_dim > 0: + self.lat_encoder = AudioEncoder(hparams) + + def parse_batch(self, batch): + (text_padded, input_lengths, obs_labels, + mel_padded, gate_padded, output_lengths) = batch + text_padded = to_gpu(text_padded).long() + input_lengths = to_gpu(input_lengths).long() + obs_labels = to_gpu(obs_labels).long() + max_len = torch.max(input_lengths.data).item() + mel_padded = to_gpu(mel_padded).float() + gate_padded = to_gpu(gate_padded).float() + output_lengths = to_gpu(output_lengths).long() + + return ( + (text_padded, input_lengths, obs_labels, + mel_padded, max_len, output_lengths), + (mel_padded, gate_padded)) + + def parse_output(self, outputs, output_lengths=None): + if self.mask_padding and output_lengths is not None: + mask = ~get_mask_from_lengths(output_lengths) + mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1)) + mask = mask.permute(1, 0, 2) + + outputs[0].data.masked_fill_(mask, 0.0) + outputs[1].data.masked_fill_(mask, 0.0) + outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies + + return outputs + + def forward(self, inputs): + (text_inputs, text_lengths, obs_labels, + mels, max_len, output_lengths) = inputs + text_lengths, output_lengths = text_lengths.data, output_lengths.data + + embedded_inputs = self.embedding(text_inputs).transpose(1, 2) + + encoder_outputs = self.encoder(embedded_inputs, text_lengths) + + obs = None + if self.obs_embedding is not None: + obs = self.obs_embedding(obs_labels) + + lat, lat_mu, lat_logvar = None, None, None + if self.lat_encoder is not None: + (lat, lat_mu, lat_logvar) = self.lat_encoder(mels, output_lengths) + + obs_and_lat = [x for x in [obs, lat] if x is not None] + if bool(obs_and_lat): + obs_and_lat = torch.cat(obs_and_lat, dim=-1) + else: + obs_and_lat = None + + mel_outputs, gate_outputs, alignments = self.decoder( + encoder_outputs, obs_and_lat, mels, memory_lengths=text_lengths) + + mel_outputs_postnet = self.postnet(mel_outputs) + mel_outputs_postnet = mel_outputs + mel_outputs_postnet + + return self.parse_output( + [mel_outputs, mel_outputs_postnet, gate_outputs, alignments, + lat_mu, lat_logvar], + output_lengths) + + def inference(self, inputs, 
obs_labels=None, lat=None, ret_has_eos=False): + embedded_inputs = self.embedding(inputs).transpose(1, 2) + encoder_outputs = self.encoder.inference(embedded_inputs) + + if obs_labels is None: + obs_labels = torch.LongTensor(len(inputs)) + obs_labels = obs_labels.to(inputs.device).zero_() + + obs = None + if self.obs_embedding is not None: + obs = self.obs_embedding(obs_labels) + + if self.lat_encoder is not None: + if lat is None: + lat = torch.FloatTensor(len(inputs), self.lat_encoder.lat_dim) + lat = lat.to(inputs.device).zero_().type(encoder_outputs.type()) + + obs_and_lat = [x for x in [obs, lat] if x is not None] + if bool(obs_and_lat): + obs_and_lat = torch.cat(obs_and_lat, dim=-1) + else: + obs_and_lat = None + + mel_outputs, gate_outputs, alignments, has_eos = self.decoder.inference( + encoder_outputs, obs_and_lat, ret_has_eos=True) + + mel_outputs_postnet = self.postnet(mel_outputs) + mel_outputs_postnet = mel_outputs + mel_outputs_postnet + + outputs = self.parse_output( + [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]) + + if ret_has_eos: + return outputs + [has_eos] + else: + return outputs diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/numbers.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/numbers.py new file mode 100644 index 0000000000..0d5f7fa818 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/numbers.py @@ -0,0 +1,71 @@ +""" from https://github.com/keithito/tacotron """ + +import inflect +import re + + +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, 
text) + return text diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/stft.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/stft.py new file mode 100644 index 0000000000..63fcd431e2 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/stft.py @@ -0,0 +1,141 @@ +""" +BSD 3-Clause License + +Copyright (c) 2017, Prem Seetharaman +All rights reserved. + +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +from .audio_processing import window_sumsquare + + +class STFT(torch.nn.Module): + """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" + def __init__(self, filter_length=800, hop_length=200, win_length=800, + window='hann'): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :])]) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :]) + + if window is not None: + assert(filter_length >= win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float()) + self.register_buffer('inverse_basis', inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode='reflect') + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part**2 + imag_part**2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data)) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) + + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + Variable(self.inverse_basis, requires_grad=False), + stride=self.hop_length, + padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.autograd.Variable( + torch.from_numpy(window_sum), requires_grad=False) + window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] + inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] + + return 
inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/symbols.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/symbols.py new file mode 100644 index 0000000000..5f0d70fdad --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/symbols.py @@ -0,0 +1,18 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Defines the set of symbols used in text input to the model. + +The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' +from . import cmudict + +_pad = '_' +_punctuation = '!\'(),.:;? ' +_special = '-' +_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' + +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): +_arpabet = ['@' + s for s in cmudict.valid_symbols] + +# Export all symbols: +symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/text.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/text.py new file mode 100644 index 0000000000..49e2ca498b --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/text.py @@ -0,0 +1,107 @@ +""" from https://github.com/keithito/tacotron """ +import numpy as np +import re +from . import cleaners +from .symbols import symbols + + +# Mappings from symbol to numeric ID and vice versa: +_symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} + +# Regular expression matching text enclosed in curly braces: +_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') + +# Special symbols +SOS_TOK = '<s>' +EOS_TOK = '</s>' + +def text_to_sequence(text, cleaner_names): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + + The text can optionally have ARPAbet sequences enclosed in curly braces embedded + in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
+ + Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + + Returns: + List of integers corresponding to the symbols in the text + ''' + sequence = [] + + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) + break + sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) + sequence += _arpabet_to_sequence(m.group(2)) + text = m.group(3) + + return sequence + + +def sample_code_chunk(code, size): + assert(size > 0 and size <= len(code)) + start = np.random.randint(len(code) - size + 1) + end = start + size + return code[start:end], start, end + + +def code_to_sequence(code, code_dict, collapse_code): + if collapse_code: + prev_c = None + sequence = [] + for c in code: + if c in code_dict and c != prev_c: + sequence.append(code_dict[c]) + prev_c = c + else: + sequence = [code_dict[c] for c in code if c in code_dict] + if len(sequence) < 0.95 * len(code): + print('WARNING : over 5%% codes are OOV') + + return sequence + + +def sequence_to_text(sequence): + '''Converts a sequence of IDs back to a string''' + result = '' + for symbol_id in sequence: + if symbol_id in _id_to_symbol: + s = _id_to_symbol[symbol_id] + # Enclose ARPAbet back in curly braces: + if len(s) > 1 and s[0] == '@': + s = '{%s}' % s[1:] + result += s + return result.replace('}{', ' ') + + +def sequence_to_code(sequence, code_dict): + '''Analogous to sequence_to_text''' + id_to_code = {i: c for c, i in code_dict.items()} + return ' '.join([id_to_code[i] for i in sequence]) + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + return text + + +def _symbols_to_sequence(symbols): + return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] + + +def _arpabet_to_sequence(text): + return _symbols_to_sequence(['@' + s for s in text.split()]) + + +def _should_keep_symbol(s): + return s in _symbol_to_id and s != '_' and s != '~' diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/utils.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/utils.py new file mode 100644 index 0000000000..b72ae0e35b --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/utils.py @@ -0,0 +1,171 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import collections +import io +import json +import librosa +import numpy as np +import soundfile as sf +import time +import torch +from scipy.io.wavfile import read +from .text import SOS_TOK, EOS_TOK + + +def get_mask_from_lengths(lengths): + max_len = torch.max(lengths).item() + ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)) + mask = (ids < lengths.unsqueeze(1)) + return mask + + +def load_wav_to_torch(full_path, sr=None): + data, sr = librosa.load(full_path, sr=sr) + data = np.clip(data, -1, 1) # potentially out of [-1, 1] due to resampling + data = data * 32768.0 # match values loaded by scipy + return torch.FloatTensor(data.astype(np.float32)), sr + + +def read_binary_audio(bin_data, tar_sr=None): + """ + read binary audio (`bytes` or `uint8` `numpy.ndarray`) to `float32` + `numpy.ndarray` + + RETURNS: + data (np.ndarray) : audio of shape (n,) or (2, n) + tar_sr (int) : sample rate + """ + data, ori_sr = sf.read(io.BytesIO(bin_data), dtype='float32') + data = data.T + if (tar_sr is not None) and (ori_sr != tar_sr): + data = librosa.resample(data, ori_sr, tar_sr) + else: + tar_sr = ori_sr + data = np.clip(data, -1, 1) + data = data * 32768.0 + return torch.FloatTensor(data.astype(np.float32)), tar_sr + + +def load_filepaths_and_text(filename): + with open(filename, encoding='utf-8') as f: + data = [json.loads(line.rstrip()) for line in f] + return data + + +def to_gpu(x): + x = x.contiguous() + + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) + return torch.autograd.Variable(x) + + +def load_code_dict(path, add_sos=False, add_eos=False): + if not path: + return {} + + with open(path, 'r') as f: + codes = ['_'] + [line.rstrip() for line in f] # '_' for pad + code_dict = {c: i for i, c in enumerate(codes)} + + if add_sos: + code_dict[SOS_TOK] = len(code_dict) + if add_eos: + code_dict[EOS_TOK] = len(code_dict) + assert(set(code_dict.values()) == set(range(len(code_dict)))) + + return code_dict + + +def load_obs_label_dict(path): + if not path: + return {} + with open(path, 'r') as f: + obs_labels = [line.rstrip() for line in f] + return {c: i for i, c in enumerate(obs_labels)} + + +# A simple timer class inspired from `tnt.TimeMeter` +class CudaTimer: + def __init__(self, keys): + self.keys = keys + self.reset() + + def start(self, key): + s = torch.cuda.Event(enable_timing=True) + s.record() + self.start_events[key].append(s) + return self + + def stop(self, key): + e = torch.cuda.Event(enable_timing=True) + e.record() + self.end_events[key].append(e) + return self + + def reset(self): + self.start_events = collections.defaultdict(list) + self.end_events = collections.defaultdict(list) + self.running_times = collections.defaultdict(float) + self.n = collections.defaultdict(int) + return self + + def value(self): + self._synchronize() + return {k: self.running_times[k] / self.n[k] for k in self.keys} + + def _synchronize(self): + torch.cuda.synchronize() + for k in self.keys: + starts = self.start_events[k] + ends = self.end_events[k] + if len(starts) == 0: + raise ValueError("Trying to divide by zero in TimeMeter") + if len(ends) != len(starts): + raise ValueError("Call stop before checking value!") + time = 0 + for start, end in zip(starts, ends): + time += start.elapsed_time(end) + self.running_times[k] += time * 1e-3 + self.n[k] += len(starts) + self.start_events = collections.defaultdict(list) + self.end_events = collections.defaultdict(list) + + +# Used to measure the time taken for multiple events +class Timer: + def __init__(self, keys): + 
self.keys = keys + self.n = {} + self.running_time = {} + self.total_time = {} + self.reset() + + def start(self, key): + self.running_time[key] = time.time() + return self + + def stop(self, key): + self.total_time[key] = time.time() - self.running_time[key] + self.n[key] += 1 + self.running_time[key] = None + return self + + def reset(self): + for k in self.keys: + self.total_time[k] = 0 + self.running_time[k] = None + self.n[k] = 0 + return self + + def value(self): + vals = {} + for k in self.keys: + if self.n[k] == 0: + raise ValueError("Trying to divide by zero in TimeMeter") + else: + vals[k] = self.total_time[k] / self.n[k] + return vals diff --git a/examples/textless_nlp/gslm/unit2speech/tacotron2/waveglow_denoiser.py b/examples/textless_nlp/gslm/unit2speech/tacotron2/waveglow_denoiser.py new file mode 100644 index 0000000000..6a6585e8b6 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tacotron2/waveglow_denoiser.py @@ -0,0 +1,40 @@ +# import sys +# sys.path.append('tacotron2') +import torch +from .layers import STFT + + +class Denoiser(torch.nn.Module): + """ Removes model bias from audio produced with waveglow """ + + def __init__(self, waveglow, filter_length=1024, n_overlap=4, + win_length=1024, mode='zeros'): + super(Denoiser, self).__init__() + self.stft = STFT(filter_length=filter_length, + hop_length=int(filter_length/n_overlap), + win_length=win_length).cuda() + if mode == 'zeros': + mel_input = torch.zeros( + (1, 80, 88), + dtype=waveglow.upsample.weight.dtype, + device=waveglow.upsample.weight.device) + elif mode == 'normal': + mel_input = torch.randn( + (1, 80, 88), + dtype=waveglow.upsample.weight.dtype, + device=waveglow.upsample.weight.device) + else: + raise Exception("Mode {} if not supported".format(mode)) + + with torch.no_grad(): + bias_audio = waveglow.infer(mel_input, sigma=0.0).float() + bias_spec, _ = self.stft.transform(bias_audio) + + self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) + + def forward(self, audio, strength=0.1): + audio_spec, audio_angles = self.stft.transform(audio.cuda().float()) + audio_spec_denoised = audio_spec - self.bias_spec * strength + audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) + audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) + return audio_denoised diff --git a/examples/textless_nlp/gslm/unit2speech/tts_data.py b/examples/textless_nlp/gslm/unit2speech/tts_data.py new file mode 100644 index 0000000000..d2b04c0fee --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/tts_data.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import torch +import numpy as np +from examples.textless_nlp.gslm.unit2speech.tacotron2.text import ( + EOS_TOK, + SOS_TOK, + code_to_sequence, + text_to_sequence, +) +from examples.textless_nlp.gslm.unit2speech.tacotron2.utils import ( + load_code_dict, +) + + +class TacotronInputDataset: + def __init__(self, hparams, append_str=""): + self.is_text = getattr(hparams, "text_or_code", "text") == "text" + if not self.is_text: + self.code_dict = load_code_dict( + hparams.code_dict, hparams.add_sos, hparams.add_eos + ) + self.code_key = hparams.code_key + self.add_sos = hparams.add_sos + self.add_eos = hparams.add_eos + self.collapse_code = hparams.collapse_code + self.append_str = append_str + + def process_code(self, inp_str): + inp_toks = inp_str.split() + if self.add_sos: + inp_toks = [SOS_TOK] + inp_toks + if self.add_eos: + inp_toks = inp_toks + [EOS_TOK] + return code_to_sequence(inp_toks, self.code_dict, self.collapse_code) + + def process_text(self, inp_str): + return text_to_sequence(inp_str, ["english_cleaners"]) + + def get_tensor(self, inp_str): + # uid, txt, inp_str = self._get_data(idx) + inp_str = inp_str + self.append_str + if self.is_text: + inp_toks = self.process_text(inp_str) + else: + inp_toks = self.process_code(inp_str) + return torch.from_numpy(np.array(inp_toks)).long() + + def __len__(self): + return len(self.data) diff --git a/examples/textless_nlp/gslm/unit2speech/utils.py b/examples/textless_nlp/gslm/unit2speech/utils.py new file mode 100644 index 0000000000..7aced08d38 --- /dev/null +++ b/examples/textless_nlp/gslm/unit2speech/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+
+import torch
+from examples.textless_nlp.gslm.unit2speech.tacotron2.model import Tacotron2
+from examples.textless_nlp.gslm.unit2speech.tacotron2.waveglow_denoiser import (
+    Denoiser,
+)
+
+
+def load_quantized_audio_from_file(file_path):
+    base_fname_batch, quantized_units_batch = [], []
+    with open(file_path) as f:
+        for line in f:
+            base_fname, quantized_units_str = line.rstrip().split("|")
+            quantized_units = [int(q) for q in quantized_units_str.split(" ")]
+            base_fname_batch.append(base_fname)
+            quantized_units_batch.append(quantized_units)
+    return base_fname_batch, quantized_units_batch
+
+
+def synthesize_audio(model, waveglow, denoiser, inp, lab=None, strength=0.0):
+    assert inp.size(0) == 1
+    inp = inp.cuda()
+    if lab is not None:
+        lab = torch.LongTensor(1).cuda().fill_(lab)
+
+    with torch.no_grad():
+        _, mel, _, ali, has_eos = model.inference(inp, lab, ret_has_eos=True)
+        aud = waveglow.infer(mel, sigma=0.666)
+        aud_dn = denoiser(aud, strength=strength).squeeze(1)
+    return mel, aud, aud_dn, has_eos
+
+
+def load_tacotron(tacotron_model_path, max_decoder_steps):
+    ckpt_dict = torch.load(tacotron_model_path)
+    hparams = ckpt_dict["hparams"]
+    hparams.max_decoder_steps = max_decoder_steps
+    sr = hparams.sampling_rate
+    model = Tacotron2(hparams)
+    model.load_state_dict(ckpt_dict["model_dict"])
+    model = model.cuda().eval().half()
+    return model, sr, hparams
+
+
+def load_waveglow(waveglow_path):
+    waveglow = torch.load(waveglow_path)["model"]
+    waveglow = waveglow.cuda().eval().half()
+    for k in waveglow.convinv:
+        k.float()
+    denoiser = Denoiser(waveglow)
+    return waveglow, denoiser
diff --git a/examples/textless_nlp/pgslm/README.md b/examples/textless_nlp/pgslm/README.md
new file mode 100644
index 0000000000..596467fe82
--- /dev/null
+++ b/examples/textless_nlp/pgslm/README.md
@@ -0,0 +1,318 @@
+# Text-Free Prosody-Aware Generative Spoken Language Modeling
+
+This folder contains code and recipes to reproduce results reported in the paper _Text-Free Prosody-Aware Generative Spoken Language Modeling_,
+Eugene Kharitonov*, Ann Lee*, Adam Polyak, Yossi Adi, Jade Copet, Kushal Lakhotia, Tu-Anh Nguyen, Morgane Rivière, Abdelrahman Mohamed, Emmanuel Dupoux, Wei-Ning Hsu, 2021. arxiv/2109.03264 [[arxiv]](https://arxiv.org/abs/2109.03264).
+
+`*` denotes equal contribution.
+
+You can find demo samples [[here]](https://speechbot.github.io/pgslm/index.html).
+
+<details>
+  <summary>If you find this code useful, please consider citing our work using this bibtex </summary>
+
+```
+  @misc{Kharitonov2021,
+      title={Text-Free Prosody-Aware Generative Spoken Language Modeling},
+      author={Eugene Kharitonov and Ann Lee and Adam Polyak and Yossi Adi and Jade Copet and Kushal Lakhotia and Tu-Anh Nguyen and Morgane Rivière and Abdelrahman Mohamed and Emmanuel Dupoux and Wei-Ning Hsu},
+      year={2021},
+      eprint={2109.03264},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+</details>
+
+
+## Additional requirements
+The following packages are required in addition to fairseq; they can be installed with pip:
+```bash
+pip install AMFM-decompy SoundFile scipy sklearn torchaudio npy-append-array
+```
+
+## Data preprocessing
+
+### Prepare unit pseudo-text transcriptions of the audio
+To get unit transcripts of the speech data, we rely on the preprocessing steps of the [GSLM](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/speech2unit/) work.
+ +Firstly, we will need to prepare manifest files for the dataset we want to preprocess +``` +mkdir manifests/ +python examples/wav2vec/wav2vec_manifest.py --valid-percent=0.0 $DATA_PATH --dest=manifests/train/ +``` +Next, we need a pre-trained HuBERT-base-ls960 model [[download]](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) and a corresponding kmeans-100 quantizer [[download]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km100/km.bin). Having those we can quantize the dataset: +``` +python examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py \ + --feature_type hubert \ + --kmeans_model_path km.bin \ + --acoustic_model_path hubert_base_ls960.pt \ + --layer 6 \ + --manifest_path manifests/train/train.tsv \ + --out_quantized_file_path manifests/train/units +``` + +Finally, by running +``` +python examples/textless_nlp/pgslm/scripts/join_units_manifest.py --manifest=manifests/train/train.tsv --units=manifests/train/units --output=train.txt +``` +We will get the training data description `train.txt` in the format that pGSLM expects. The above steps have to be repeated for +dev/test sets. Importantly, we rely on an assumption that the directories are structured as in LibriSpeech, i.e. the file paths follow the +`<spk_id>/<session_id>/<sample_id>.wav` format. + +### Preprocess data for pGSLM +The very first step is to obtain the F0 quantization bins. +Assume the vocoder training manifest is `vocoder_train.txt` (in pGSLM data format prepared with the same process above). +We prepare the quantized F0 from the vocoder training data by running +```sh +bash examples/textless_nlp/pgslm/scripts/prepare_f0_quantization.sh \ + vocoder_train.txt <sample_rate> 32 <preprocessed_dir> <output_prefix> # we use 32 bins in the paper +``` +- `<sample_rate>`: sampling rate of the audio files in the manifest +- `<preprocessed_dir>`: where to output the output files +- `<output_prefix>`: prefix of the output files + +The script will generate +- `<output_prefix>.f0_stat.pt`: the speaker-level F0 statistics, which can be used in vocoder training +- `<output_prefix>_mean_norm_log_f0_bin.th`: the quantized F0, which should be used in `prepare_data.sh` below + +**Note:** See "Pre-trained models" for the pre-computed speaker-level F0 statistics and quantized F0 bins. We suggest using the pre-computed statistics for the data preparation below in order to take advantage of the pre-trained vocoder for waveform generation. + +Next prepare the pGSLM data. +Assume train/valid/test manifests are `{train,valid,test}.txt`. +Here is an example of how to preprocess data: + +```sh +bash examples/textless_nlp/pgslm/scripts/prepare_data.sh \ + train.txt valid.txt test.txt <n_unit> <hop_size> <sample_rate> \ + <preprocessed_dir>/<output_prefix>_mean_norm_log_f0_bin.th <preprocessed_dir> +``` +- `<n_unit>`: discrete unit vocabulary size (we used a kmeans quantizer with the number of units equal to 100 in the example above) +- `<hop_size>`: downsampling rate relative to the waveform (e.g., 320 for HuBERT units) +- `<sample_rate>`: sampling rate of the audio files in the manifest +- `<preprocessed_dir>`: where to output the preprocessed files + +This will create the dataset json config used for the next section at +`<preprocessed_dir>/data_config.json`. + +Note that the example script uses only one thread to compute F0, which can take +_very long_ for preprocessing large datasets. 
It is suggested to distribute
+jobs over multiple nodes/processes with `--nshards=x` and `--rank=z` (where z is
+in [1, x]) in `preprocess_f0.py`, and set `--nshards_list=x` in
+`prepare_data.py` correspondingly to collect the sharded F0 data.
+
+Now, everything is ready for training a model.
+
+## Training Multi-Stream Transformer Unit Language Model (MS-TLM)
+
+Below is an example command that trains a Multi-Stream Transformer Language Model (MS-TLM) on a prepared dataset:
+```bash
+DATASET=data_config.json
+
+fairseq-train $DATASET \
+    --task=speech_unit_modeling \
+    --arch="transformer_ulm_tiny" \
+    --criterion=speech_unit_lm_criterion \
+    --share-decoder-input-output-embed \
+    --dropout=0.1 \
+    --attention-dropout=0.1 \
+    --optimizer="adam" \
+    --adam-betas="(0.9, 0.98)" \
+    --clip-norm=1.0 \
+    --lr=0.0005 \
+    --lr-scheduler="inverse_sqrt" \
+    --warmup-updates=4000 \
+    --warmup-init-lr=1e-07 \
+    --tokens-per-sample=3072 \
+    --max-tokens=3072 \
+    --update-freq=4 \
+    --max-epoch=70 \
+    --num-workers=0 \
+    --skip-invalid-size-inputs-valid-test \
+    --loss-weights="1.0;0.5;0.0" \
+    --ignore-f0-input \
+    --checkpoint-activations \
+    --fp16 \
+    --max-target-positions=4096 \
+    --stream-shifts="1,1" \
+    --log-f0 --normalize-f0-mean --interpolate-f0 \
+    --ignore-unused-valid-subsets \
+    --discrete-duration --discrete-f0
+```
+
+Some of the important parameters that are specific to MS-TLM:
+ * `arch`: specifies the Transformer architecture used. Supported options are:
+   * `transformer_ulm_tiny` - a tiny model that can be used for debugging; it has 2 layers, 1 attention head, FFN and embedding dimensions of 64,
+   * `transformer_ulm` - a base model with 6 layers, 8 heads, embedding dimension 512, and FFN dimensionality of 2048,
+   * `transformer_ulm_big` - the largest model we experiment with in the paper: 12-layer/16 heads, 1024/4096 embedding and FFN dimensions;
+ * `loss-weights`: this parameter sets importance weights (must be non-negative) for the components of the loss that correspond to the unit, duration, and F0 streams. To turn off a component of the loss, its weight has to be set to 0. For instance, to predict only the unit stream the parameter should be set to "1;0;0";
+ * `stream-shifts`: specifies relative shifts of the two prosodic streams w.r.t. the unit stream (duration and F0, respectively). No shift corresponds to "0,0";
+ * `ignore-duration-input`/`ignore-f0-input`: setting these flags zeroes out the corresponding input streams;
+ * `max-token-duration`: duration values are capped at the specified value;
+ * `discrete-duration`/`discrete-f0`: whether the duration and F0 streams should be quantized;
+ * `log_f0`, `normalize-f0-mean`, `normalize-f0-std`, `interpolate-f0`: configure how the F0 stream is treated. `log_f0` sets up modelling in log-space, `normalize-f0-mean`/`normalize-f0-std` control per-speaker normalization, and `interpolate-f0` enables F0 interpolation for unvoiced regions where F0 was set to 0;
+ * `mask-dur-prob`, `mask-f0-prob`, `mask-dur-seg-prob`, `mask-f0-seg-prob`, `mask-unit-seg-prob`, `mask-unit-seg-leng`: this family of parameters sets the probabilities of masking individual steps and spans on each stream, as well as the lengths of the masked spans.
+
+
+## Pre-trained models
+### MS-TLM
+Below you can find checkpoints for the four best-performing models from the paper (IDs 9..12 in Table 1). These models are trained on Hubert-100 transcripts of the LibriLight-6K dataset. They have the prosody streams shifted by 1 w.r.t. the unit stream.
+
+
+## Pre-trained models
+### MS-TLM
+Below you can find checkpoints for the four best-performing models from the paper (IDs 9-12 in Table 1). These models are trained on HuBERT-100 transcripts of the LibriLight-6K dataset. They have the prosody streams shifted by 1 w.r.t. the unit stream. All models predict all three streams (units, duration, and F0), but two of them only have the unit stream in their input.
+
+|                   | Continuous prosody | Quantized prosody |
+|-------------------|--------------------|-------------------|
+| No prosody input  | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/continuous_no_prosody_shift_1_1.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/discrete_no_prosody_shift_1_1.pt) |
+| Has prosody input | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/continuous_prosody_shift_1_1.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/discrete_prosody_shift_1_1.pt) |
+
+The optimal per-stream sampling temperatures/scaling parameters that we have identified for these models, in the (`T-token, T-duration, T-f0`) format, are:
+
+|                   | Continuous prosody    | Quantized prosody |
+|-------------------|-----------------------|-------------------|
+| No prosody input  | 0.7, 0.125, 0.0003125 | 0.7, 0.25, 0.5    |
+| Has prosody input | 0.7, 0.125, 0.00125   | 0.7, 0.25, 0.7    |
+
+## Vocoder
+| Units | Prosody | F0 stats | Checkpoint | Config |
+|-------|---------|----------|------------|--------|
+| HuBERT-base-ls960, kmeans-100 | [[Quantized 32 bins]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_seg_bin.th) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/f0_stats.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/naive_quant_32_norm_log_seg_hubert/checkpoint.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/naive_quant_32_norm_log_seg_hubert/config.json) |
+| HuBERT-base-ls960, kmeans-100 | Continuous | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/f0_stats.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_hubert/checkpoint.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_hubert/config.json) |
+
+
+## Evaluating a trained model
+Evaluation is done with the `eval/cont_metrics.py` script. As described in the paper, several metrics are used.
+
+**Teacher-forced metrics**
+```bash
+SET=valid
+CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt
+DATA=data_config.json
+
+python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \
+  --metric=teacher_force_everything \
+  --path=$CHECKPOINT_PATH \
+  --batch-size=16 \
+  --fp16 \
+  --seed=111 \
+  --eval-subset=$SET \
+  --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th --dequantize-prosody
+```
+(Using this command, our provided `discrete_prosody_shift_1_1.pt` checkpoint should produce `{'token_loss': 1.408..., 'duration_loss': 0.5424..., 'f0_loss': 0.0474...}` on LibriSpeech dev-clean).
+
+The parameters `--f0-discretization-bounds=mean_norm_log_f0_seg_bin.th --dequantize-prosody` are specific to quantized-prosody models. They signal that the prosody streams must be decoded back into the continuous domain before the metrics are computed. This is the same `*_mean_norm_log_f0_bin.th` file as we prepared above.
+The `mean_norm_log_f0_seg_bin.th` file we used with the pre-trained models can be downloaded [[here]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_seg_bin.th).
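+
+For a continuous-prosody checkpoint (e.g. `continuous_prosody_shift_1_1.pt` from the table above), the same teacher-forced evaluation is run without the two dequantization flags, since its prosody streams are already continuous. A hedged sketch, not a command from the paper:
+```bash
+# Hypothetical variant: teacher-forced metrics for a continuous-prosody model,
+# so --f0-discretization-bounds / --dequantize-prosody are simply omitted.
+SET=valid
+CHECKPOINT_PATH=continuous_prosody_shift_1_1.pt
+DATA=data_config.json
+
+python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \
+  --metric=teacher_force_everything \
+  --path=$CHECKPOINT_PATH \
+  --batch-size=16 \
+  --fp16 \
+  --seed=111 \
+  --eval-subset=$SET
+```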
+
+
+**Consistency (aka Correlation) metrics**
+
+The following command estimates the correlation between the mean values of the F0 stream in the prompt and in the generated continuation (the unit and duration streams are fixed).
+
+```bash
+T_F0=0.7
+EXPLOSION=20
+SET=test
+CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt
+DATA=data_config.json
+
+python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \
+  --prefix-length=150 \
+  --metric=correlation \
+  --path=$CHECKPOINT_PATH \
+  --batch-size=16 \
+  --fp16 \
+  --seed=111 \
+  --teacher-force-tokens \
+  --teacher-force-duration \
+  --min-length=300 \
+  --batch-explosion-rate=$EXPLOSION \
+  --T-f0=$T_F0 \
+  --eval-subset=$SET \
+  --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th \
+  --dequantize-prosody --n-workers=8
+```
+(Using this command, our provided `discrete_prosody_shift_1_1.pt` checkpoint should produce `{...'F0 corr': 0.315 ..}` on LibriSpeech test-clean).
+
+ * By using the flags `--teacher-force-tokens, --teacher-force-duration, --teacher-force-f0` one can calculate the correlation along each stream while keeping the other two streams fixed to their ground-truth values (or freeze all three streams to get the ground-truth correlation values; see the sketch at the end of this section);
+ * The parameters `T-f0`, `T-duration`, and `T-token` specify the per-stream temperatures and, in the case of continuous-valued prosody, the scaling parameter of the corresponding Laplace distribution (setting a temperature to 0 enforces greedy sampling);
+ * `min-length` filters out sequences that are shorter than 300 duration units (i.e. 6s in the case of Hubert units);
+ * `prefix-length` specifies that we want to use the first 150 duration units as the prompt (i.e. 3s in the case of Hubert units).
+
+
+**Correctness (aka Continuation) and Expressiveness (aka Std) metrics**
+
+By running the following command, we can get the min MAE and Std of the log-F0 stream for a model with quantized prosody.
+```bash
+DATA=data_config.json
+EXPLOSION=20
+SET=test
+CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt
+T_F0=0.7
+
+python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \
+  --prefix-length=150 \
+  --metric=continuation \
+  --path=$CHECKPOINT_PATH \
+  --batch-size=16 \
+  --fp16 \
+  --seed=111 \
+  --batch-explosion-rate=$EXPLOSION \
+  --teacher-force-tokens \
+  --teacher-force-duration \
+  --T-f0=$T_F0 \
+  --eval-subset=$SET \
+  --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th --dequantize-prosody
+```
+(Using this command, our provided `discrete_prosody_shift_1_1.pt` checkpoint should produce `{...'F0 MAE': 0.0772, 'F0 Std': 0.1489...}` on LibriSpeech test-clean).
+
+Again, by setting `--teacher-force-tokens, --teacher-force-duration, --teacher-force-f0` we can calculate Token BLEU for the token stream (when `--teacher-force-duration` & `--teacher-force-f0` are on) and the per-stream min MAE for each prosody stream individually.
+
+Finally, `cont_metrics.py` allows specifying the number of workers (e.g., `--n-workers=8`), which speeds up the computation by spreading multiple worker processes
+over the available GPUs.
+
+**Cont Word BLEU**
+
+We used the code and the evaluation protocol of [(Lakhotia et al., 2021)](https://arxiv.org/abs/2102.01192).
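+
+As mentioned above, freezing all three streams yields the ground-truth correlation values, which is a useful sanity check. A hedged sketch of such a run (not a command from the paper; it simply adds `--teacher-force-f0` to the correlation command and drops the sampling-related parameters):
+```bash
+# Hypothetical sanity check: all three streams are teacher-forced, so the reported
+# "F0 corr" / "Duration corr" are those of the ground-truth continuations.
+SET=test
+CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt
+DATA=data_config.json
+
+python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \
+  --prefix-length=150 \
+  --metric=correlation \
+  --path=$CHECKPOINT_PATH \
+  --batch-size=16 \
+  --fp16 \
+  --seed=111 \
+  --teacher-force-tokens \
+  --teacher-force-duration \
+  --teacher-force-f0 \
+  --min-length=300 \
+  --batch-explosion-rate=1 \
+  --eval-subset=$SET \
+  --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th \
+  --dequantize-prosody
+```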
+
+## Sampling from a trained model
+
+To get (prompted or unprompted) samples from a trained model, it is enough to run `sample.py`:
+```bash
+CHECKPOINT_PATH=checkpoints/checkpoint_best.pt
+DATASET=examples/textless_nlp/pgslm/repro/dataset/data_config.json
+python examples/textless_nlp/pgslm/sample/sample.py $DATASET \
+  --output=$SAMPLES \
+  --path=$CHECKPOINT_PATH \
+  --sampling \
+  --T-token=0.7 \
+  --T-duration=0.25 \
+  --T-f0=0.7 \
+  --max-length=500 \
+  --prefix-length=150 \
+  --subset=valid \
+  --seed=1 \
+  --match-duration \
+  --code-type=hubert \
+  --batch-explosion-rate=2
+```
+
+Some useful parameters:
+ * `T-token`, `T-duration`, `T-f0` specify the sampling temperatures for the three streams. Setting a temperature to `0` switches sampling to greedy (argmax) decoding;
+ * `prefix-length`: length of the prompt, measured in timesteps (e.g. for Hubert (CPC) each timestep is 20 (10) ms);
+ * `subset`: which subset of the dataset to use as prompts (can be `train`, `valid`, `test`);
+ * `teacher-force-tokens`, `teacher-force-duration`, `teacher-force-f0`: if set, at each autoregressive step the ground-truth values replace the produced ones;
+ * `short-curcuit`: replaces sampling with the ground-truth inputs;
+ * `match-duration`: forces the produced sample to have the same duration (in time) as the entire reference sequence (beyond the prompt, if there is any);
+ * `batch-explosion-rate`: number of samples per prompt;
+ * `f0-discretization-bounds`: path to a file with quantization boundaries. If it is set, F0 values are de-quantized back to the continuous domain
+ (the model must be a quantized one);
+ * `max-length` sets the maximal number of segment steps to be produced.
+
+Note that `sample.py` automatically uses all available GPUs; to avoid that, restrict the visible devices with the `CUDA_VISIBLE_DEVICES` environment variable.
+
+## Vocoding samples
+To generate audio for the output of `sample.py` (`$IN_FILE`):
+```bash
+python examples/textless_nlp/pgslm/generate_waveform.py \
+  --in-file=$IN_FILE \
+  --vocoder=$VOCODER \
+  --vocoder-cfg=$VOCODER_CFG \
+  --results-path=$RESULTS_PATH
+```
+See "Pre-trained models" for `$VOCODER` and `$VOCODER_CFG`.
diff --git a/examples/textless_nlp/pgslm/data_utils.py b/examples/textless_nlp/pgslm/data_utils.py
new file mode 100644
index 0000000000..2033697b37
--- /dev/null
+++ b/examples/textless_nlp/pgslm/data_utils.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import torch
+
+from tqdm import tqdm
+
+
+class Stat:
+    def __init__(self, keep_raw=False):
+        self.x = 0.0
+        self.x2 = 0.0
+        self.z = 0.0  # z = logx
+        self.z2 = 0.0
+        self.n = 0.0
+        self.u = 0.0
+        self.keep_raw = keep_raw
+        self.raw = []
+
+    def update(self, new_x):
+        new_z = new_x.log()
+
+        self.x += new_x.sum()
+        self.x2 += (new_x**2).sum()
+        self.z += new_z.sum()
+        self.z2 += (new_z**2).sum()
+        self.n += len(new_x)
+        self.u += 1
+
+        if self.keep_raw:
+            self.raw.append(new_x)
+
+    @property
+    def mean(self):
+        return self.x / self.n
+
+    @property
+    def std(self):
+        return (self.x2 / self.n - self.mean**2) ** 0.5
+
+    @property
+    def mean_log(self):
+        return self.z / self.n
+
+    @property
+    def std_log(self):
+        return (self.z2 / self.n - self.mean_log**2) ** 0.5
+
+    @property
+    def n_frms(self):
+        return self.n
+
+    @property
+    def n_utts(self):
+        return self.u
+
+    @property
+    def raw_data(self):
+        assert self.keep_raw, "does not support storing raw data!"
+ return torch.cat(self.raw) + + +class F0Stat(Stat): + def update(self, new_x): + # assume unvoiced frames are 0 and consider only voiced frames + if new_x is not None: + super().update(new_x[new_x != 0]) + + +def dump_speaker_f0_stat(speaker_to_f0_stat, out_prefix): + path = f"{out_prefix}.f0_stat.pt" + assert not os.path.exists(path) + + d = { + speaker: { + "f0_mean": speaker_to_f0_stat[speaker].mean, + "f0_std": speaker_to_f0_stat[speaker].std, + "logf0_mean": speaker_to_f0_stat[speaker].mean_log, + "logf0_std": speaker_to_f0_stat[speaker].std_log, + } + for speaker in speaker_to_f0_stat + } + torch.save(d, path) + + return d + + +def load_audio_path(path): + audio_paths = [] + with open(path) as f: + for line in f.readlines(): + sample = eval(line.strip()) + audio_paths.append(sample["audio"]) + + return audio_paths + + +def load_f0(f0_dir, nshards): + path_to_f0 = {} + for rank in tqdm(range(1, nshards + 1), desc=f"load f0"): + f0_shard_path = f"{f0_dir}/f0_{rank}_{nshards}.pt" + shard_path_to_f0 = torch.load(f0_shard_path) + path_to_f0.update(shard_path_to_f0) + return path_to_f0 diff --git a/examples/textless_nlp/pgslm/eval/__init__.py b/examples/textless_nlp/pgslm/eval/__init__.py new file mode 100644 index 0000000000..0e028c26b9 --- /dev/null +++ b/examples/textless_nlp/pgslm/eval/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/examples/textless_nlp/pgslm/eval/cont_metrics.py b/examples/textless_nlp/pgslm/eval/cont_metrics.py new file mode 100644 index 0000000000..e98abadde3 --- /dev/null +++ b/examples/textless_nlp/pgslm/eval/cont_metrics.py @@ -0,0 +1,730 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import numpy as np +import scipy + +import torch +import torch.multiprocessing as mp +from fairseq import checkpoint_utils, options +from fairseq.data.codedataset import CodeDataset, ExpressiveCodeDataConfig +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from torch.utils.data import DataLoader, DistributedSampler +from fairseq.utils import move_to_cuda +from fairseq import utils +from fairseq.criterions.speech_ulm_criterion import nll_loss, mae_loss + +import time +from types import SimpleNamespace + +import sys, pathlib + +sys.path.append(str(pathlib.Path(__file__).parent.parent.resolve())) + +from naive_decoder import Naive_F0_Decoder +from inference_dataset import InferenceDataset, explode_batch +from sample.sample import do_sampling, TemperatureDecoder, FilterNamesDataset + +try: + from nltk.translate.bleu_score import sentence_bleu +except ImportError: + print("Please install nltk: `pip install --user -U nltk`") + raise + + +@torch.no_grad() +def teacher_force_everything( + args, dataset, model, criterion, tgt_dict, rank, world_size +): + prefix = args.prefix_length + + f0_decoder = None + if args.dequantize_prosody: + assert dataset.discrete_f0 + print("Reporting MAE for a discrete model") + f0_decoder = Naive_F0_Decoder( + args.f0_discretization_bounds, dataset.config.f0_vq_n_units + ).cuda() + + dataset = InferenceDataset( + dataset, + prefix=args.prefix_length, + only_prefix=False, + filter_short=True, + presort_by_length=True, + ) + sampler = ( + None + if world_size == 1 + else DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=False + ) + ) + dataloader = DataLoader( + dataset, + args.batch_size, + shuffle=False, + collate_fn=dataset.collater, + sampler=sampler, + ) + + total_token_loss, total_duration_loss, total_f0_loss, total_tokens = ( + 0.0, + 0.0, + 0.0, + 0.0, + ) + + i = 0 + for batch in dataloader: + i += 1 + batch = move_to_cuda(batch) + output = model(**batch["net_input"]) + + tokens, durations, f0 = output["token"], output["duration"], output["f0"] + durations, f0 = durations.squeeze(), f0.squeeze() + + token_loss = nll_loss( + tokens[:, prefix - 1 :], + batch["target"][:, prefix - 1 :].contiguous(), + batch["mask"][:, prefix - 1 :].contiguous(), + reduce=True, + ) + + if args.dequantize_prosody: + durations = durations.argmax(dim=-1) + duration_loss = mae_loss( + durations[:, prefix - 1 :].contiguous().float(), + batch["dur_target"][:, prefix - 1 :].contiguous().float(), + batch["dur_mask"][:, prefix - 1 :].contiguous(), + reduce=True, + ) + else: + duration_loss = criterion.dur_loss_fn( + durations[:, prefix - 1 :].contiguous(), + batch["dur_target"][:, prefix - 1 :].contiguous(), + batch["dur_mask"][:, prefix - 1 :].contiguous(), + reduce=True, + ) + + if f0_decoder: + f0 = f0.argmax(dim=-1) + f0 = f0_decoder(f0).squeeze(-1) + + f0_target = batch["raw_f0"] + f0_loss = mae_loss( + f0[:, prefix - 1 :].contiguous(), + f0_target[:, prefix - 1 :].contiguous(), + batch["f0_mask"][:, prefix - 1 :].contiguous(), + reduce=True, + ) + else: + f0_loss = criterion.f0_loss_fn( + f0[:, prefix - 1 :].contiguous(), + batch["f0_target"][:, prefix - 1 :].contiguous(), + batch["f0_mask"][:, prefix - 1 :].contiguous(), + reduce=True, + ) + + n_tokens = (~batch["dur_mask"])[:, prefix - 1 :].sum() + + total_token_loss += token_loss.item() + total_duration_loss += duration_loss.item() + total_f0_loss += f0_loss.item() + + total_tokens += n_tokens.item() + if args.debug and i > 5: + break + + values = 
torch.tensor([total_token_loss, total_duration_loss, total_f0_loss]) + normalizers = torch.tensor([total_tokens for _ in range(3)]) + + return values, normalizers + + +def get_bleu(produced_tokens, target_tokens, tgt_dict): + assert target_tokens.ndim == 1 + assert produced_tokens.size(1) == target_tokens.size(0) + + # we can have padding due to shifted channels + shift = 0 + for token in reversed(target_tokens.cpu().tolist()): + if token in [tgt_dict.pad(), tgt_dict.eos()]: + shift += 1 + else: + break + target_tokens = target_tokens[:-shift] + produced_tokens = produced_tokens[:, :-shift] + + string_target = tgt_dict.string(target_tokens).split() + string_candidates = [ + tgt_dict.string(produced_tokens[i, :]).split() + for i in range(produced_tokens.size(0)) + ] + + bleu3 = sentence_bleu( + references=string_candidates, + hypothesis=string_target, + weights=(1.0 / 3, 1.0 / 3, 1.0 / 3), + ) + return bleu3 + + +@torch.no_grad() +def continuation(args, dataset, model, criterion, tgt_dict, rank, world_size): + is_discrete_duration = dataset.discrete_dur + is_discrete_f0 = dataset.discrete_f0 + + f0_decoder = None + if args.dequantize_prosody: + assert dataset.discrete_f0 + print("Reporting MAE F0 for a discrete model") + f0_decoder = Naive_F0_Decoder( + args.f0_discretization_bounds, dataset.config.f0_vq_n_units + ).cuda() + + dataset = InferenceDataset( + dataset, args.prefix_length, filter_short=True, presort_by_length=True + ) + sampler = ( + None + if world_size == 1 + else DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=False + ) + ) + dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + collate_fn=dataset.collater, + sampler=sampler, + ) + + Ts = args.T_token, args.T_duration, args.T_f0 + decoder = TemperatureDecoder( + Ts, discrete_dur=is_discrete_duration, discrete_f0=is_discrete_f0 + ) + + running_stats = SimpleNamespace( + token_bleu=0.0, + duration_nll=0.0, + duration_mae=0.0, + f0_nll=0.0, + f0_mae=0.0, + n_tokens=0.0, + n_sentences=0.0, + f0_sum=0.0, + f0_sum_sq=0.0, + dur_sum=0.0, + dur_sum_sq=0.0, + ) + + for i, batch in enumerate(dataloader): + batch = explode_batch(batch, args.batch_explosion_rate) + bsz = batch["target"].size(0) + + batch = move_to_cuda(batch) + prefix = batch["prefix"][0] + + max_length_to_unroll = batch["target"].size(1) + prefix_length = batch["net_input"]["src_tokens"].size(1) + steps = max_length_to_unroll - prefix_length + 1 + + assert steps > 0 + produced_tokens, produced_durations, produced_f0, outputs = do_sampling( + model, + batch, + tgt_dict.eos(), + decoder, + autoregressive_steps=steps, + teacher_force_tokens=args.teacher_force_tokens, + teacher_force_duration=args.teacher_force_duration, + teacher_force_f0=args.teacher_force_f0, + ) + + if args.teacher_force_tokens: + assert (produced_tokens[:, 1:] == batch["target"]).all() + if args.teacher_force_duration: + assert (produced_durations[:, 1:] == batch["dur_target"]).all() + if args.teacher_force_f0: + assert (produced_f0[:, 1:] == batch["f0_target"]).all() + + dur_target = batch["dur_target"][:, prefix - 1 :].contiguous() + f0_target = batch["f0_target"][:, prefix - 1 :].contiguous() + + f0_mask = batch["f0_mask"][:, prefix - 1 :].contiguous() + dur_mask = batch["dur_mask"][:, prefix - 1 :].contiguous() + + duration_mae = mae_loss( + produced_durations[:, prefix:].float(), + dur_target.float(), + dur_mask, + reduce=False, + ) + min_duration_mae = duration_mae.view(bsz, -1).sum(dim=-1).min(dim=0)[0] + running_stats.duration_mae += min_duration_mae 
+ + running_stats.dur_sum += ( + produced_durations[:, prefix:].float() * (~dur_mask) + ).sum() / args.batch_explosion_rate + running_stats.dur_sum_sq += ( + produced_durations[:, prefix:].float() * (~dur_mask) + ).pow(2.0).sum() / args.batch_explosion_rate + + if is_discrete_duration: + duration_loss = criterion.dur_loss_fn( + torch.stack([x[1] for x in outputs], dim=1), + dur_target, + dur_mask, + reduce=False, + ) + min_duration_loss = duration_loss.view(bsz, -1).sum(dim=-1).min(dim=0)[0] + running_stats.duration_nll += min_duration_loss + + if f0_decoder: # can only exist for discrete F0 models + decoded_produced_f0 = f0_decoder(produced_f0[:, prefix:]) + decoded_f0_target = batch["raw_f0"][:, prefix - 1 :].contiguous() + + if produced_f0.ndim == 3: + decoded_produced_f0 = decoded_produced_f0.squeeze(2) + decoded_f0_target = decoded_f0_target.squeeze(2) + + f0_mae = mae_loss( + decoded_produced_f0, decoded_f0_target, f0_mask, reduce=False + ) + f0_mae = f0_mae.view(bsz, -1).sum(dim=-1).min(dim=0)[0] + running_stats.f0_mae += f0_mae + + f0_loss = criterion.f0_loss_fn( + torch.stack([x[2] for x in outputs], dim=1), + f0_target.long(), + f0_mask, + reduce=False, + ) + f0_loss = f0_loss.view(bsz, -1).sum(dim=-1).min(dim=0)[0] + running_stats.f0_nll += f0_loss + + running_stats.f0_sum += ( + decoded_produced_f0 * (~f0_mask) + ).sum() / args.batch_explosion_rate + running_stats.f0_sum_sq += (decoded_produced_f0 * (~f0_mask)).pow( + 2.0 + ).sum() / args.batch_explosion_rate + + else: + assert not is_discrete_duration + + f0_loss = mae_loss( + produced_f0[:, prefix:], f0_target, f0_mask, reduce=False + ) + f0_loss = f0_loss.view(bsz, -1).sum(dim=-1).min(dim=0)[0] + running_stats.f0_mae += f0_loss + + running_stats.f0_sum += ( + produced_f0[:, prefix:].sum() / args.batch_explosion_rate + ) + running_stats.f0_sum_sq += ( + produced_f0[:, prefix:].pow(2.0).sum() / args.batch_explosion_rate + ) + + running_stats.n_tokens += (~dur_mask)[0, ...].sum() + + token_loss = get_bleu( + produced_tokens[:, prefix:], batch["target"][0, prefix - 1 :], tgt_dict + ) + running_stats.token_bleu += token_loss + running_stats.n_sentences += 1 + + if args.debug: + break + + values = torch.tensor( + [ + running_stats.token_bleu, + running_stats.duration_nll, + running_stats.duration_mae, + running_stats.f0_nll, + running_stats.f0_mae, + running_stats.f0_sum, + running_stats.f0_sum_sq, + running_stats.dur_sum, + running_stats.dur_sum_sq, + ] + ) + normalizers = torch.tensor( + [running_stats.n_sentences] + [running_stats.n_tokens] * 8 + ) + + return values, normalizers + + +@torch.no_grad() +def correlation(args, dataset, model, criterion, tgt_dict, rank, world_size): + is_discrete_duration = dataset.discrete_dur + is_discrete_f0 = dataset.discrete_f0 + + f0_decoder = None + if is_discrete_f0: + assert dataset.discrete_f0 + f0_decoder = Naive_F0_Decoder( + args.f0_discretization_bounds, dataset.config.f0_vq_n_units + ).cuda() + + if is_discrete_f0: + assert f0_decoder # correlation on tokens is meaningless + + dataset = InferenceDataset( + dataset, + args.prefix_length, + filter_short=True, + presort_by_length=True, + min_length=args.min_length, + ) + sampler = ( + None + if world_size == 1 + else DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=False + ) + ) + dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + collate_fn=dataset.collater, + sampler=sampler, + ) + + Ts = args.T_token, args.T_duration, args.T_f0 + decoder = TemperatureDecoder( + Ts, 
discrete_dur=is_discrete_duration, discrete_f0=is_discrete_f0 + ) + + mean_dur_prefix, mean_dur_cont = [], [] + mean_f0_prefix, mean_f0_cont = [], [] + + for batch in dataloader: + batch = explode_batch(batch, args.batch_explosion_rate) + batch = move_to_cuda(batch) + + assert len(batch["prefix"]) == 1 + + if args.teacher_force_tokens: + autoregressive_steps = batch["target"].size(1) - args.prefix_length - 1 + else: + autoregressive_steps = args.max_length - args.prefix_length # + max_shift? + + if args.copy_target: + produced_durations, produced_f0 = batch["dur_target"], batch["f0_target"] + else: + _, produced_durations, produced_f0, outputs = do_sampling( + model, + batch, + tgt_dict.eos(), + decoder, + autoregressive_steps=autoregressive_steps, + teacher_force_tokens=args.teacher_force_tokens, + teacher_force_duration=args.teacher_force_duration, + teacher_force_f0=args.teacher_force_f0, + ) + + # first tokens actually correspond to BOS + produced_durations = produced_durations[:, 1:] + produced_f0 = produced_f0[:, 1:] + + dur_target = batch["dur_target"] + if is_discrete_duration: + produced_durations = produced_durations.float() + dur_target = dur_target.float() + + if is_discrete_f0: + produced_f0 = f0_decoder(produced_f0).squeeze(-1) + f0_target = batch["raw_f0"] + else: + f0_target = batch["f0_target"] + + # prefix values + prefix = batch["prefix"][0] + dur_prefix_mean = dur_target[:, :prefix].sum(dim=-1) / ( + (~batch["dur_mask"][:, :prefix]).sum(dim=-1) + ) + + non_voiced = f0_target[:, :prefix] == 0.0 + f0_mask = batch["f0_mask"][:, :prefix].logical_or(non_voiced) + f0_prefix_mean = f0_target[:, :prefix].sum(dim=-1) / ((~f0_mask).sum(dim=-1)) + + # continuation values + dur_cont_mean = produced_durations[:, prefix:].sum(dim=-1) / ( + (~batch["dur_mask"][:, prefix:]).sum(dim=-1) + ) + + non_voiced = produced_f0[:, prefix:] == 0.0 + f0_mask = non_voiced + f0_cont_mean = produced_f0[:, prefix:].sum(dim=-1) / ((~f0_mask).sum(dim=-1)) + + assert not f0_cont_mean.isnan().any() + + mean_dur_prefix.append(dur_prefix_mean.cpu()) + mean_dur_cont.append(dur_cont_mean.cpu()) + + mean_f0_prefix.append(f0_prefix_mean.cpu()) + mean_f0_cont.append(f0_cont_mean.cpu()) + + if args.debug and len(mean_dur_prefix) > 10: + break + + mean_dur_prefix, mean_dur_cont = torch.cat(mean_dur_prefix), torch.cat( + mean_dur_cont + ) + mean_f0_prefix, mean_f0_cont = torch.cat(mean_f0_prefix), torch.cat(mean_f0_cont) + + return mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont + + +def main(rank, world_size, args): + start = time.time() + + if world_size > 1: + torch.distributed.init_process_group( + backend="gloo", init_method="env://", world_size=world_size, rank=rank + ) + torch.cuda.set_device(rank % torch.cuda.device_count()) + + raw_args = args + + args = convert_namespace_to_omegaconf(args) + if args.common.seed is not None: + np.random.seed(args.common.seed) + utils.set_torch_seed(args.common.seed) + + models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( + [raw_args.path], arg_overrides={"data": args.task.data} + ) + + tgt_dict = task.target_dictionary + + for model in models: + model.prepare_for_inference_(args) + model.cuda().eval() + if raw_args.fp16: + model = model.half() + model = models[0] + + config = ExpressiveCodeDataConfig(args.task.data) + + dataset = CodeDataset( + manifest=config.manifests[raw_args.eval_subset], + dictionary=task.source_dictionary, + dur_dictionary=task.source_duration_dictionary, + f0_dictionary=task.source_f0_dictionary, + config=config, + 
discrete_dur=task.cfg.discrete_duration, + discrete_f0=task.cfg.discrete_f0, + log_f0=task.cfg.log_f0, + normalize_f0_mean=task.cfg.normalize_f0_mean, + normalize_f0_std=task.cfg.normalize_f0_std, + interpolate_f0=task.cfg.interpolate_f0, + shifts=task.cfg.stream_shifts, + return_filename=True, + strip_filename=False, + return_continuous_f0=raw_args.dequantize_prosody, + ) + + if raw_args.filter_names: + dataset = FilterNamesDataset(dataset, raw_args.filter_names) + + criterion = task.build_criterion(model_args.criterion) + + name2metric = { + "continuation": continuation, + "teacher_force_everything": teacher_force_everything, + "correlation": correlation, + } + + name2keys = { + "continuation": ( + "Token BLEU3", + "Duration NLL", + "Duration MAE", + "F0 NLL", + "F0 MAE", + "F0 sum", + "F0 sum_sq", + "Dur sum", + "Dur sum_sq", + ), + "teacher_force_everything": ("token_loss", "duration_loss", "f0_loss"), + "correlation": ("Duration corr", "F0 corr"), + } + metric_name = raw_args.metric + + metric = name2metric[metric_name] + results = metric(raw_args, dataset, model, criterion, tgt_dict, rank, world_size) + + values = None + + if metric_name not in [ + "correlation", + ]: + values, normalizers = results + values = maybe_aggregate_normalize(values, normalizers, world_size) + elif metric_name == "correlation": + values = maybe_aggregate_correlations(results, world_size) + else: + assert False + + assert values is not None + summary = dict(zip(name2keys[raw_args.metric], values.tolist())) + if metric_name == "continuation": + summary["F0 Std"] = np.sqrt(-summary["F0 sum"] ** 2 + summary["F0 sum_sq"]) + summary["Dur Std"] = np.sqrt(-summary["Dur sum"] ** 2 + summary["Dur sum_sq"]) + del summary["F0 sum"] + del summary["F0 sum_sq"] + del summary["Dur sum"] + del summary["Dur sum_sq"] + + summary["metric"] = metric_name + + if rank == 0: + print(summary) + if raw_args.wandb: + wandb_results(summary, raw_args) + print("# finished in ", time.time() - start, "seconds") + + +def wandb_results(summary, raw_args): + import wandb + + run = wandb.init( + project=raw_args.wandb_project_name, tags=raw_args.wandb_tags.split(",") + ) + run.config.metric = raw_args.metric + run.config.model = raw_args.path + run.config.data = raw_args.data + + if raw_args.wandb_run_name: + run.name = raw_args.wandb_run_name + run.save() + + wandb.log(summary) + wandb.finish() + + +def maybe_aggregate_normalize(values, normalizers, world_size): + if world_size > 1: + torch.distributed.barrier() + + torch.distributed.all_reduce_multigpu([values]) + torch.distributed.all_reduce_multigpu([normalizers]) + + return values / normalizers + + +def maybe_aggregate_correlations(results, world_size): + if world_size > 1: + output = [None for _ in range(world_size)] + torch.distributed.all_gather_object(output, results) + mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont = [ + torch.cat([x[i] for x in output]) for i in range(4) + ] + else: + mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont = results + + corr_dur = scipy.stats.pearsonr(mean_dur_prefix.numpy(), mean_dur_cont.numpy())[0] + corr_f0 = scipy.stats.pearsonr(mean_f0_prefix.numpy(), mean_f0_cont.numpy())[0] + values = torch.tensor([corr_dur, corr_f0]) + + return values + + +def cli_main(): + parser = options.get_interactive_generation_parser() + parser.add_argument( + "--prefix-length", + type=int, + default=1, + help="Prompt prefix length (including <s>)", + ) + parser.add_argument( + "--duration-scale", + type=float, + default=1, + help="Multiply 
durations by the given scaler", + ) + parser.add_argument( + "--debug", action="store_true", help="Process only the first batch" + ) + parser.add_argument("--n_hypotheses", type=int, default=1) + parser.add_argument("--filter-names", type=str, default=None) + parser.add_argument( + "--max-length", type=int, default=200, help="Maximal produced length" + ) + + parser.add_argument("--teacher-force-tokens", action="store_true", default=False) + parser.add_argument("--teacher-force-duration", action="store_true", default=False) + parser.add_argument("--teacher-force-f0", action="store_true", default=False) + + parser.add_argument("--copy-target", action="store_true", default=False) + parser.add_argument("--min-length", type=int, default=None) + parser.add_argument("--f0-discretization-bounds", type=str, default=None) + parser.add_argument("--dequantize-prosody", action="store_true") + parser.add_argument("--batch-explosion-rate", type=int, default=1) + + parser.add_argument( + "--metric", + choices=["continuation", "teacher_force_everything", "correlation"], + required=True, + ) + + parser.add_argument("--wandb", action="store_true") + parser.add_argument("--wandb-project-name", type=str, default="eslm") + parser.add_argument("--wandb-tags", type=str, default="") + parser.add_argument("--wandb-run-name", type=str, default="") + + parser.add_argument("--T-token", type=float, default=1.0) + parser.add_argument("--T-duration", type=float, default=1.0) + parser.add_argument("--T-f0", type=float, default=1.0) + + parser.add_argument("--n-workers", type=int, default=1) + + parser.add_argument( + "--eval-subset", type=str, default="valid", choices=["valid", "test"] + ) + + args = options.parse_args_and_arch(parser) + + assert ( + args.prefix_length >= 1 + ), "Prefix length includes bos token <s>, hence the minimum is 1." + assert args.temperature >= 0.0, "T must be non-negative!" + + if args.dequantize_prosody: + assert args.f0_discretization_bounds + + world_size = args.n_workers or torch.cuda.device_count() + if world_size > 1: + import random + + mp.set_start_method("spawn", force=True) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(random.randint(10_000, 50_000)) + + mp.spawn( + main, + nprocs=world_size, + args=( + world_size, + args, + ), + join=True, + ) + else: + main(rank=0, world_size=world_size, args=args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/textless_nlp/pgslm/generate_waveform.py b/examples/textless_nlp/pgslm/generate_waveform.py new file mode 100644 index 0000000000..a6f348bb9b --- /dev/null +++ b/examples/textless_nlp/pgslm/generate_waveform.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import ast +import argparse +import json +import logging +from pathlib import Path +import soundfile as sf +import torch + +from tqdm import tqdm + +from fairseq import utils +from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder + + +logging.basicConfig() +logging.root.setLevel(logging.INFO) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def dump_result(args, data, sample_id, pred_wav): + assert "audio" in data or args.results_path is not None + if args.results_path: + fname = Path(data["audio"]).name if "audio" in data else f"{sample_id}_pred.wav" + out_file = Path(args.results_path) / fname + + sf.write( + out_file.as_posix(), + pred_wav.detach().cpu().numpy(), + args.sample_rate, + ) + + +def load_data(in_file): + with open(in_file) as f: + data = [ast.literal_eval(line.strip()) for line in f] + + return data + + +def get_f0_upsample_ratio(code_hop_size, f_hop_size): + ratio = (code_hop_size // 160) // (f_hop_size // 256) * 2 + return ratio + + +def main(args): + logger.info(args) + + use_cuda = torch.cuda.is_available() and not args.cpu + + with open(args.vocoder_cfg) as f: + vocoder_cfg = json.load(f) + vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg) + if use_cuda: + vocoder = vocoder.cuda() + + data = load_data(args.in_file) + + if args.results_path: + Path(args.results_path).mkdir(exist_ok=True, parents=True) + + for i, d in tqdm(enumerate(data), total=len(data)): + code_key = "cpc_km100" if "cpc_km100" in d else "hubert" + code = list(map(int, d[code_key].split())) + + x = { + "code": torch.LongTensor(code).view(1, -1), + "f0": torch.Tensor(d["f0"]).view(1, -1), + } + + f0_up_ratio = get_f0_upsample_ratio( + vocoder_cfg["code_hop_size"], vocoder_cfg["hop_size"] + ) + if f0_up_ratio > 1: + bsz, cond_length = x["f0"].size() + x["f0"] = x["f0"].unsqueeze(2).repeat(1, 1, f0_up_ratio).view(bsz, -1) + + x = utils.move_to_cuda(x) if use_cuda else x + wav = vocoder(x) + dump_result(args, d, i, wav) + + +def cli_main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--in-file", + type=str, + required=True, + help="Input file following the same format of the output from sample.py ('f0' and 'cpc_km100/hubert' are required fields)", + ) + parser.add_argument( + "--vocoder", type=str, required=True, help="path to the vocoder" + ) + parser.add_argument( + "--vocoder-cfg", + type=str, + required=True, + help="path to the vocoder config", + ) + parser.add_argument("--sample-rate", type=int, default=16_000) + parser.add_argument( + "--results-path", + type=str, + default=None, + help="Output directory. If not set, the audios will be stored following the 'audio' field specified in the input file.", + ) + parser.add_argument("--cpu", action="store_true", help="run on CPU") + + args = parser.parse_args() + + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/textless_nlp/pgslm/inference_dataset.py b/examples/textless_nlp/pgslm/inference_dataset.py new file mode 100644 index 0000000000..9f7cfa5f54 --- /dev/null +++ b/examples/textless_nlp/pgslm/inference_dataset.py @@ -0,0 +1,103 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import torch + + +class InferenceDataset: + def __init__( + self, + dataset, + prefix, + only_prefix=True, + presort_by_length=True, + filter_short=False, + min_length=None, + ): + self.dataset = dataset + self.collater = self.dataset.collater + self.prefix = prefix + self.only_prefix = only_prefix + self.filter_short = filter_short + + self.remapping = list(range(len(self.dataset))) + if min_length: + assert min_length >= prefix + 1 + + length_thr = prefix + 1 if not min_length else min_length + + if filter_short: + self.remapping = list( + filter( + lambda i: self.dataset[i]["dur_source"].sum() > length_thr, + self.remapping, + ) + ) + print( + f"# the initial dataset of {len(self.dataset)} examples became {len(self.remapping)} after filtering" + f" examples shorter than {length_thr} (in duration units)" + ) + + if presort_by_length: + lengths = {index: dataset.size(index) for index in self.remapping} + self.remapping.sort(key=lambda i: lengths[i]) + + @property + def pads(self): + return self.dataset.pads + + def __len__(self): + return len(self.remapping) + + def original_size(self, k): + k = self.remapping[k] + return self.dataset.size(k) + + def __getitem__(self, k): + k = self.remapping[k] + channels = self.dataset[k] + + if self.prefix and self.only_prefix: + dur_channel = channels["dur_source"] + assert dur_channel.sum() >= self.prefix + + token_times = dur_channel.cumsum(dim=-1) + cut_after = torch.searchsorted(token_times, torch.tensor(self.prefix)) + + r = {} + for channel_name, value in channels.items(): + if isinstance(value, torch.Tensor) and "source" in channel_name: + # if self.filter_short: assert value.size(0) >= self.prefix + r[channel_name] = value[: cut_after + 1] + else: + r[channel_name] = value + + r["prefix"] = cut_after + 1 + else: + r = channels + + return r + + +def explode_batch(batch, times): + if times == 1: + return batch + + new_batch = {} + + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + assert value.size(0) == 1 + new_batch[key] = torch.cat([value] * times) + elif key in ["ntokens", "nsentences"]: + new_batch[key] = value * times + elif key in ["prefix", "filename"]: + new_batch[key] = value + elif key == "net_input": + new_batch[key] = explode_batch(value, times) + else: + assert False, key + return new_batch diff --git a/examples/textless_nlp/pgslm/naive_decoder.py b/examples/textless_nlp/pgslm/naive_decoder.py new file mode 100644 index 0000000000..5132889792 --- /dev/null +++ b/examples/textless_nlp/pgslm/naive_decoder.py @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +import warnings + + +class Naive_F0_Decoder(torch.nn.Module): + def __init__(self, bounds_path, n_units=32): + super().__init__() + + bounds = torch.load(bounds_path) + bounds = torch.from_numpy(bounds[n_units]) + assert bounds.ndim == 1 + + pad = torch.tensor([-5.0, -5.0]) # bos, eos, pad are in the dictionary + centers = torch.cat( + [bounds[0:1], 0.5 * (bounds[1:] + bounds[:-1]), bounds[-1:], pad[:]] + ) + + self.embedding = torch.nn.Embedding.from_pretrained( + centers.unsqueeze(-1), freeze=True + ) + self.max_n = self.embedding.weight.numel() + + def forward(self, discrete_f0: torch.Tensor): + in_bounds = (0 <= discrete_f0).all() and (discrete_f0 < self.max_n).all() + if not in_bounds: + warnings.warn( + f"F0 contains some weird outputs: discrete_f0.max().item()={discrete_f0.max().item()} discrete_f0.min().item()={discrete_f0.min().item()}; " + f"while we have embeddings for {self.max_n} values. " + "Assuming this is a no-prosody model -- but be careful!" + ) + + mask = discrete_f0 >= self.max_n + discrete_f0 = discrete_f0.masked_fill(mask, self.max_n - 1) + + return self.embedding(discrete_f0).squeeze(-1) diff --git a/examples/textless_nlp/pgslm/prepare_dataset.py b/examples/textless_nlp/pgslm/prepare_dataset.py new file mode 100644 index 0000000000..3d5edaa58f --- /dev/null +++ b/examples/textless_nlp/pgslm/prepare_dataset.py @@ -0,0 +1,143 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from multiprocessing import Pool + +import os +from collections import defaultdict +from itertools import starmap + +import torch +from npy_append_array import NpyAppendArray +from tqdm import tqdm + +from data_utils import dump_speaker_f0_stat, F0Stat, load_f0 +from fairseq.data.codedataset import ( + ExpressiveCodeDataConfig, + parse_manifest, + F0_FRAME_SPACE, + align_f0_to_durations, +) +from fairseq.tasks.speech_ulm_task import UnitDictionary + + +def load_meta(meta_path, split): + config = ExpressiveCodeDataConfig(meta_path) + manifest_path = config.manifests[split] + dictionary = UnitDictionary(n_units=config.n_units) + audio_paths, codes, durs, speakers = parse_manifest(manifest_path, dictionary) + return config, audio_paths, codes, durs, speakers + + +def _align_f0(f0, dur, ratio, frm_tol=5): + if f0 is None: + seg_f0 = torch.zeros_like(dur, dtype=torch.float) + else: + seg_f0 = align_f0_to_durations(f0, dur, ratio, tol=frm_tol * ratio) + return seg_f0.numpy() # try a hacky stuff + + +def align_f0(path_to_f0, audio_paths, durs, ratio, mp=False): + chunk_size = 2000 + num_procs = 40 + iterable = ((path_to_f0[p], d, ratio) for p, d in zip(audio_paths, durs)) + + seg_f0s = [] + if mp: + with Pool(num_procs) as pool: + iterator = tqdm( + pool.istarmap(_align_f0, iterable, chunk_size), + desc="align f0", + total=len(durs), + ) + for seg_f0 in iterator: + seg_f0s.append(torch.from_numpy(seg_f0).float()) + else: + iterator = tqdm(starmap(_align_f0, iterable), desc="align f0", total=len(durs)) + for seg_f0 in iterator: + seg_f0s.append(torch.from_numpy(seg_f0).float()) + + return seg_f0s + + +def prepare_seg_data(config, audio_paths, codes, durs, speakers, path_to_f0): + ratio = config.code_hop_size / (config.sampling_rate * F0_FRAME_SPACE) + seg_f0s = align_f0(path_to_f0, audio_paths, durs, ratio) + data = { + "codes": codes, + "duration": durs, + "f0": seg_f0s, + "speaker": speakers, + "path": audio_paths, + } + return data + + +def 
dump_seg_data(data, out_prefix): + key_targs = { + "codes": f"{out_prefix}.code.npy", + "duration": f"{out_prefix}.dur.npy", + "f0": f"{out_prefix}.f0.npy", + } + for key, targ in key_targs.items(): + assert not os.path.exists(targ) + npaa = NpyAppendArray(targ) + for utt_data in tqdm(data[key], desc=f"dumping {key}"): + npaa.append(utt_data.numpy()) + + assert not os.path.exists(f"{out_prefix}.path.txt") + with open(f"{out_prefix}.path.txt", "w") as f: + for x in data["path"]: + f.write(f"{str(x)}\n") + + assert not os.path.exists(f"{out_prefix}.leng.txt") + with open(f"{out_prefix}.leng.txt", "w") as f: + for x in data["codes"]: + f.write(f"{len(x)}\n") + + assert not os.path.exists(f"{out_prefix}.speaker.txt") + with open(f"{out_prefix}.speaker.txt", "w") as f: + for x in data["speaker"]: + f.write(f"{str(x)}\n") + + print(f"wrote to files with prefix {out_prefix}") + + +def main(meta_path, f0_dir, splits, nshards_list): + speaker_to_stat = defaultdict(F0Stat) + if len(nshards_list) == 1: + nshards_list = nshards_list * len(splits) + else: + assert len(nshards_list) == len(splits) + + for split, nshards in zip(splits, nshards_list): + config, audio_paths, codes, durs, speakers = load_meta(meta_path, split) + path_to_f0 = load_f0(f"{f0_dir}/{split}", nshards) + + # segment-level data + data = prepare_seg_data(config, audio_paths, codes, durs, speakers, path_to_f0) + dump_seg_data(data, config.manifests[split]) + + # speaker f0 + for audio_path, speaker in tqdm(zip(audio_paths, speakers)): + f0 = path_to_f0[audio_path] + speaker_to_stat[speaker].update(f0) + dump_speaker_f0_stat(speaker_to_stat, config.manifests[split]) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("meta_path") + parser.add_argument("f0_dir", help="out_dir from preprocess_f0") + parser.add_argument("--splits", nargs="+", default=["train", "valid"]) + parser.add_argument( + "--nshards_list", type=int, nargs="+", default=[20], help="number of f0 shards" + ) + args = parser.parse_args() + print(args) + + main(**vars(args)) diff --git a/examples/textless_nlp/pgslm/preprocess_f0.py b/examples/textless_nlp/pgslm/preprocess_f0.py new file mode 100644 index 0000000000..afe899cb85 --- /dev/null +++ b/examples/textless_nlp/pgslm/preprocess_f0.py @@ -0,0 +1,65 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import torch +from tqdm import tqdm +from data_utils import load_audio_path +from fairseq.data.codedataset import get_f0_by_filename + + +def process_one(path, sr): + """ + Args: + path: audio file path + sr: sampling rate + """ + try: + # YAAPT throws errors in some rare cases + f0 = get_f0_by_filename(path, sr) + except Exception as e: + print( + f"WARNING: error when processing {path}. set f0 to zero. 
original error message:\n{e}" + ) + f0 = None + return f0 + + +def main(file_path, out_dir, nshards, rank, sampling_rate): + # load data + audio_paths = load_audio_path(file_path) + + # shard + assert nshards <= len(audio_paths) and nshards > 0 + shard_size = len(audio_paths) / nshards + s = int(round((rank - 1) * shard_size)) + e = int(round(rank * shard_size)) + audio_paths = audio_paths[s:e] + + # process + path_to_f0 = {} + for i, audio_path in enumerate(tqdm(audio_paths)): + f0 = process_one(audio_path, sampling_rate) + path_to_f0[audio_path] = f0 + print(f"finished processing {len(path_to_f0)} utterances ({s}-{e})") + + f0_path = f"{out_dir}/f0_{rank}_{nshards}.pt" + os.makedirs(out_dir, exist_ok=True) + torch.save(path_to_f0, f0_path) + print(f"saved to {f0_path}") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("file_path") + parser.add_argument("out_dir") + parser.add_argument("--nshards", type=int, default=20) + parser.add_argument("--rank", type=int, default=1) + parser.add_argument("--sampling_rate", type=int, default=16000) + args = parser.parse_args() + + main(**vars(args)) diff --git a/examples/textless_nlp/pgslm/quantize_f0.py b/examples/textless_nlp/pgslm/quantize_f0.py new file mode 100644 index 0000000000..d9e3df2fe2 --- /dev/null +++ b/examples/textless_nlp/pgslm/quantize_f0.py @@ -0,0 +1,94 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from collections import defaultdict +from functools import partial + +import numpy as np +import torch +from tqdm import tqdm + +from data_utils import dump_speaker_f0_stat, F0Stat, load_audio_path, load_f0 + + +def load_speaker(path): + speakers = [] + with open(path) as f: + for line in f.readlines(): + sample = eval(line.strip()) + assert "speaker" in sample + speakers.append(sample["speaker"]) + return speakers + + +def quantize_f0(speaker_to_f0, f0_stats, nbins, normalize, log): + f0_all = [] + for speaker, f0 in speaker_to_f0.items(): + f0 = f0.raw_data + if log: + f0 = f0.log() + mean = f0_stats[speaker]["logf0_mean"] if log else f0_stats[speaker]["f0_mean"] + std = f0_stats[speaker]["logf0_std"] if log else f0_stats[speaker]["f0_std"] + if normalize == "mean": + f0 = f0 - mean + elif normalize == "meanstd": + f0 = (f0 - mean) / std + f0_all.extend(f0.tolist()) + + hist, bin_x = np.histogram(f0_all, 100000) + cum_hist = np.cumsum(hist) / len(f0_all) * 100 + + f0_bin = {} + for num_bin in nbins: + bin_offset = [] + bin_size = 100 / num_bin + threshold = bin_size + for i in range(num_bin - 1): + index = (np.abs(cum_hist - threshold)).argmin() + bin_offset.append(bin_x[index]) + threshold += bin_size + f0_bin[num_bin] = np.array(bin_offset) + + return f0_bin + + +def main(file_path, f0_dir, out_dir, out_prefix, nbins, nshards, normalize, log): + audio_paths = load_audio_path(file_path) + path_to_f0 = load_f0(f0_dir, nshards) + + speakers = load_speaker(file_path) + speaker_to_f0 = defaultdict(partial(F0Stat, True)) + + # speaker f0 stats + for audio_path, speaker in tqdm(zip(audio_paths, speakers)): + f0 = path_to_f0[audio_path] + speaker_to_f0[speaker].update(f0) + f0_stats = dump_speaker_f0_stat(speaker_to_f0, f"{out_dir}/{out_prefix}") + + # quantize + f0_bin = quantize_f0(speaker_to_f0, f0_stats, nbins, normalize, log) + log_suffix = "_log" if log else "" + f0_bin_out_file = 
f"{out_dir}/{out_prefix}_{normalize}_norm{log_suffix}_f0_bin.th" + torch.save(f0_bin, f0_bin_out_file) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("file_path") + parser.add_argument("f0_dir", help="out_dir from preprocess_f0") + parser.add_argument("out_dir") + parser.add_argument("out_prefix") + parser.add_argument("--nbins", nargs="+", type=int, default=[32]) + parser.add_argument("--nshards", type=int, default=20, help="number of f0 shards") + parser.add_argument( + "--normalize", type=str, choices=["meanstd", "mean", "none"], default="mean" + ) + parser.add_argument("--log", action="store_true") + args = parser.parse_args() + print(args) + + main(**vars(args)) diff --git a/examples/textless_nlp/pgslm/sample/__init__.py b/examples/textless_nlp/pgslm/sample/__init__.py new file mode 100644 index 0000000000..0e028c26b9 --- /dev/null +++ b/examples/textless_nlp/pgslm/sample/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/examples/textless_nlp/pgslm/sample/sample.py b/examples/textless_nlp/pgslm/sample/sample.py new file mode 100644 index 0000000000..55ec7a955e --- /dev/null +++ b/examples/textless_nlp/pgslm/sample/sample.py @@ -0,0 +1,612 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import torch.multiprocessing as mp +import numpy as np +import json + +import torch +from torch.distributions.categorical import Categorical + +from fairseq import checkpoint_utils, options, utils +from fairseq.data.codedataset import CodeDataset, ExpressiveCodeDataConfig +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from torch.utils.data import DataLoader, DistributedSampler +from fairseq.utils import move_to_cuda + +import tqdm +import random +import pathlib + +import sys, pathlib + +sys.path.append(str(pathlib.Path(__file__).parent.parent)) +from inference_dataset import InferenceDataset, explode_batch +from naive_decoder import Naive_F0_Decoder +from truncated_laplace import truncated_laplace + +CODETYPE_TO_FRAMETIME = {"cpc_km100": 0.01, "hubert": 0.02} # 10ms # 20ms + + +class TemperatureDecoder: + def __init__(self, Ts, discrete_dur=False, discrete_f0=False): + self.T_token, self.T_dur, self.T_f0 = Ts + self.discrete_dur = discrete_dur + self.discrete_f0 = discrete_f0 + + def __call__(self, output): + def sample_multinomial(key, T): + logits = output[key][:, -1, :].float() + return Categorical(logits=logits / T).sample().unsqueeze(-1) + + def sample_laplace(key, T, truncate_at_zero): + mean = output[key][:, -1, :].float() + return truncated_laplace(mean=mean, T=T, truncate_by_zero=truncate_at_zero) + + if self.T_token > 0: + new_tokens = sample_multinomial("token", self.T_token) + else: + new_tokens = output["token"][:, -1, :].argmax(dim=-1, keepdim=True) + + if not self.discrete_dur and self.T_dur == 0: + new_durations = output["duration"][:, -1].round().int() + elif not self.discrete_dur and self.T_dur > 0: + new_durations = ( + sample_laplace("duration", self.T_dur, truncate_at_zero=True) + .round() + .int() + ) + elif self.discrete_dur and self.T_dur > 0: + new_durations = sample_multinomial("duration", self.T_dur) + elif self.discrete_dur and self.T_dur == 0: + new_durations = 
output["duration"][:, -1, :].argmax(dim=-1, keepdim=True) + else: + assert False + + if not self.discrete_f0 and self.T_f0 == 0: + new_f0 = output["f0"][:, -1] + elif not self.discrete_f0 and self.T_f0 > 0: + new_f0 = sample_laplace("f0", self.T_f0, truncate_at_zero=False) + elif self.discrete_f0 and self.T_f0 > 0: + new_f0 = sample_multinomial("f0", self.T_f0) + elif self.discrete_f0 and self.T_f0 == 0: + new_f0 = output["f0"][:, -1, :].argmax(dim=-1, keepdim=True) + else: + assert False + + return new_tokens, new_durations, new_f0 + + +class FilterNamesDataset: + def __init__(self, dataset, fnames_path): + self.dataset = dataset + + with open(fnames_path, "r") as fin: + fnames = set((eval(line)["audio"] for line in fin)) + print(f"# will retrict the dataset for {len(fnames)} files") + + self.indexes = [] + + for i, datapoint in enumerate(dataset): + if datapoint["filename"] in fnames: + self.indexes.append(i) + assert len(self.indexes) == len(fnames), f"{len(self.indexes)} {len(fnames)}" + + self.collater = self.dataset.collater + self.discrete_dur = self.dataset.discrete_dur + self.discrete_f0 = self.dataset.discrete_f0 + + def __len__(self): + return len(self.indexes) + + def __getitem__(self, k): + k = self.indexes[k] + return self.dataset[k] + + def size(self, k): + k = self.indexes[k] + return self.dataset.size(k) + + +@torch.no_grad() +def do_sampling( + model, + batch, + eos_token, + decoder, + autoregressive_steps=100, + teacher_force_tokens=False, + teacher_force_duration=False, + teacher_force_f0=False, + match_duration=False, +): + def autoregressive_step_(output, autoregressive_steps): + new_tokens, new_durations, new_f0 = decoder(output) + + n = output["token"].size(1) if output["token"].ndim == 3 else 1 + + if teacher_force_tokens: + new_tokens = batch["target"][:, n - 1].unsqueeze(-1) + if teacher_force_duration: + new_durations = batch["dur_target"][:, n - 1].unsqueeze(-1) + if teacher_force_f0: + new_f0 = batch["f0_target"][:, n - 1].unsqueeze(-1) + + batch["net_input"]["src_tokens"] = torch.cat( + [batch["net_input"]["src_tokens"], new_tokens], dim=1 + ) + batch["net_input"]["dur_src"] = torch.cat( + [batch["net_input"]["dur_src"], new_durations], dim=1 + ) + batch["net_input"]["f0_src"] = torch.cat( + [batch["net_input"]["f0_src"], new_f0], dim=1 + ) + + outputs = [] + + if teacher_force_tokens or teacher_force_duration or teacher_force_f0: + max_time = batch["target"].size(1) + prefix_time = batch["net_input"]["src_tokens"].size(1) + + autoregressive_steps = max_time - prefix_time + 1 # should be 0 + + for _ in range(autoregressive_steps): + output = model(**batch["net_input"]) + + last_steps = ( + output["token"][:, -1, ...], + output["duration"][:, -1, ...], + output["f0"][:, -1, ...], + ) + outputs.append(last_steps) + + autoregressive_step_(output, autoregressive_steps) + tokens, duration, f0 = ( + batch["net_input"]["src_tokens"], + batch["net_input"]["dur_src"], + batch["net_input"]["f0_src"], + ) + + if ( + match_duration + and (batch["dur_target"].sum(dim=-1) < duration.sum(dim=-1)).all() + ): + break + + return tokens, duration, f0, outputs + + +def unroll_duration(token_stream, duration_stream): + assert len(token_stream) == len( + duration_stream + ), f"{len(token_stream)} != {len(duration_stream)}" + non_positive_durations = sum(d <= 0 for d in duration_stream) + if non_positive_durations > 0: + print( + f"# {non_positive_durations} durations are non-positive, they will be capped to 1" + ) + + result = [] + + duration_stream_rounded_capped = [max(1, 
int(round(x))) for x in duration_stream] + for t, d in zip(token_stream, duration_stream_rounded_capped): + result.extend([t] * d) + + return result + + +def realign_shifted_streams(tokens, durations, F0s, shifts): + """ + Durations are shifted by 1, F0 by 2 + >>> tokens = ["<s>", "t1", "t2", "t3", "</s>", "x", "x"] + >>> durations = ["<0>", "<0>", "d1", "d2", "d3", "<0>", "x"] + >>> F0s = ["<0>", "<0>", "<0>", "f1", "f2", "f3", "<0>"] + >>> shifts = [1,2] + >>> realign_shifted_streams(tokens, durations, F0s, shifts) + (['<s>', 't1', 't2', 't3', '</s>'], ['<0>', 'd1', 'd2', 'd3', '<0>'], ['<0>', 'f1', 'f2', 'f3', '<0>']) + """ + max_shift = max(shifts) + if max_shift > 0: + shift_durations, shift_F0s = shifts + + tokens = tokens[:-max_shift] + durations = durations[shift_durations:] + if shift_durations < max_shift: + durations = durations[: -(max_shift - shift_durations)] + + if F0s is not None: + F0s = F0s[shift_F0s:] + if shift_F0s < max_shift: + F0s = F0s[: -(max_shift - shift_F0s)] + + assert len(tokens) == len(durations), f"{len(tokens)} =! {len(durations)}" + if F0s is not None: + assert len(tokens) == len(F0s), f"{len(tokens)} =! {len(F0s)}" + + return tokens, durations, F0s + + +def maybe_cut_eos(produced_tokens, produced_duration, produced_f0, eos_idx): + if eos_idx in produced_tokens: + eos_index = produced_tokens.index(eos_idx) + produced_tokens = produced_tokens[:eos_index] + produced_duration = produced_duration[:eos_index] + produced_f0 = produced_f0[:eos_index] + return produced_tokens, produced_duration, produced_f0 + + +def maybe_filter_pad(produced_tokens, produced_duration, produced_f0, pad_idx): + if pad_idx not in produced_tokens: + return produced_tokens, produced_duration, produced_f0 + + assert len(produced_tokens) == len(produced_duration) == len(produced_f0) + + print("<pad> is detected in the output!") + filtered_tokens, filtered_duration, filtered_f0 = [], [], [] + + for t, d, f in zip(produced_tokens, produced_duration, produced_f0): + if t != pad_idx: + filtered_tokens.append(t) + filtered_duration.append(d) + filtered_f0.append(f) + return filtered_tokens, filtered_duration, filtered_f0 + + +def match_duration(produced_tokens, produced_duration, produced_f0, target_duration): + """ + >>> tokens = ['t'] * 4 + >>> F0s = ['f0'] * 4 + >>> produced_duration = [1, 10, 10, 10] + >>> match_duration(tokens, produced_duration, F0s, target_duration=100) + (['t', 't', 't', 't'], [1, 10, 10, 10], ['f0', 'f0', 'f0', 'f0']) + >>> match_duration(tokens, produced_duration, F0s, target_duration=5) + (['t', 't'], [1, 4], ['f0', 'f0']) + """ + if sum(produced_duration) <= target_duration: + return produced_tokens, produced_duration, produced_f0 + + running_duration = 0 + filtered_duration = [] + + for next_tok_duration in produced_duration: + if running_duration + next_tok_duration < target_duration: + filtered_duration.append(next_tok_duration) + running_duration += next_tok_duration + else: + to_add = target_duration - running_duration + assert to_add <= next_tok_duration + filtered_duration.append(to_add) + break + + produced_duration = filtered_duration + assert sum(produced_duration) == target_duration + + n_tok = len(filtered_duration) + + return produced_tokens[:n_tok], produced_duration, produced_f0[:n_tok] + + +def main(rank, world_size, args): + if world_size > 1: + torch.distributed.init_process_group( + backend="gloo", init_method="env://", world_size=world_size, rank=rank + ) + torch.cuda.set_device(rank) + + raw_args = args + args = 
convert_namespace_to_omegaconf(args) + if args.common.seed is not None: + random.seed(args.common.seed) + np.random.seed(args.common.seed) + utils.set_torch_seed(args.common.seed) + + models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( + [raw_args.path], arg_overrides={"data": args.task.data} + ) + tgt_dict = task.target_dictionary + + for model in models: + model.prepare_for_inference_(args) + model.cuda().eval() + if raw_args.fp16: + model = model.half() + model = models[0] + + config = ExpressiveCodeDataConfig(args.task.data) + + dataset = CodeDataset( + manifest=config.manifests[raw_args.subset], + dictionary=task.source_dictionary, + dur_dictionary=task.source_duration_dictionary, + f0_dictionary=task.source_f0_dictionary, + config=config, + discrete_dur=task.cfg.discrete_duration, + discrete_f0=task.cfg.discrete_f0, + log_f0=task.cfg.log_f0, + normalize_f0_mean=task.cfg.normalize_f0_mean, + normalize_f0_std=task.cfg.normalize_f0_std, + interpolate_f0=task.cfg.interpolate_f0, + shifts=task.cfg.stream_shifts, + return_filename=True, + strip_filename=False, + ) + tgt_dict = task.target_dictionary + shifts = dataset.shifts.dur, dataset.shifts.f0 + max_shift = max(shifts) + + fname = raw_args.output + if world_size > 1: + fname += f"_{rank}" + output_file = open(fname, "w") + + if raw_args.filter_names: + dataset = FilterNamesDataset(dataset, raw_args.filter_names) + + dataset = InferenceDataset(dataset, raw_args.prefix_length, filter_short=True) + print(f"Dataset size {len(dataset)}") + sampler = ( + None + if world_size == 1 + else DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=False + ) + ) + dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + collate_fn=dataset.collater, + sampler=sampler, + ) + + Ts = raw_args.T_token, raw_args.T_duration, raw_args.T_f0 + decoder = TemperatureDecoder( + Ts, discrete_dur=task.cfg.discrete_duration, discrete_f0=task.cfg.discrete_f0 + ) + + dataset_size = len(dataset) + + f0_decoder = None + if raw_args.f0_discretization_bounds: + assert task.cfg.discrete_f0 + f0_decoder = Naive_F0_Decoder(raw_args.f0_discretization_bounds).cuda() + + pbar = ( + tqdm.tqdm( + total=dataset_size + if raw_args.max_samples is None + else min(raw_args.max_samples, dataset_size) + ) + if world_size == 1 + else None + ) + + samples_produced = 0 + + for batch in dataloader: + if ( + raw_args.max_samples is not None + and samples_produced >= raw_args.max_samples + ): + break + + prefix = batch["prefix"][0] + + batch = explode_batch(batch, raw_args.batch_explosion_rate) + batch = move_to_cuda(batch) + + if not raw_args.short_curcuit: + produced_tokens, produced_durations, produced_f0, _ = do_sampling( + models[0], + batch, + tgt_dict.eos(), + decoder, + autoregressive_steps=raw_args.max_length - prefix + max_shift, + teacher_force_tokens=raw_args.teacher_force_tokens, + match_duration=raw_args.match_duration, + teacher_force_duration=raw_args.teacher_force_duration, + teacher_force_f0=raw_args.teacher_force_f0, + ) + + # stip entries corresponding to <s> + produced_tokens = produced_tokens[:, 1:] + produced_durations = produced_durations[:, 1:] + produced_f0 = produced_f0[:, 1:] + + else: + max_length = raw_args.max_length + max_shift + produced_tokens, produced_durations, produced_f0 = ( + batch["target"][:, :max_length], + batch["dur_target"][:, :max_length], + batch["f0_target"][:, :max_length], + ) + + if f0_decoder is not None: + produced_f0 = f0_decoder(produced_f0) + + produced_tokens, produced_durations, 
produced_f0 = ( + produced_tokens.cpu().tolist(), + produced_durations.cpu().tolist(), + produced_f0.cpu().tolist(), + ) + + bsz = batch["target"].size(0) + assert bsz == raw_args.batch_explosion_rate + + for i in range(bsz): + if ( + raw_args.max_samples is not None + and samples_produced >= raw_args.max_samples + ): + break + + produced_tokens_i = produced_tokens[i] + produced_durations_i = produced_durations[i] + produced_f0_i = produced_f0[i] + + ( + produced_tokens_i, + produced_durations_i, + produced_f0_i, + ) = realign_shifted_streams( + produced_tokens_i, produced_durations_i, produced_f0_i, shifts + ) + + produced_tokens_i, produced_durations_i, produced_f0_i = maybe_cut_eos( + produced_tokens_i, produced_durations_i, produced_f0_i, tgt_dict.eos() + ) + + produced_tokens_i, produced_durations_i, produced_f0_i = maybe_filter_pad( + produced_tokens_i, produced_durations_i, produced_f0_i, tgt_dict.pad() + ) + + if raw_args.match_duration: + # NB: here we cheat a bit and use that padding has duration 0 + # so no need to re-align and remove padding + dur_target_i = batch["dur_target"][i, :].sum().item() + produced_tokens_i, produced_durations_i, produced_f0_i = match_duration( + produced_tokens_i, produced_durations_i, produced_f0_i, dur_target_i + ) + + if raw_args.cut_prompt: + produced_tokens_i, produced_durations_i, produced_f0_i = ( + produced_tokens_i[prefix:], + produced_durations_i[prefix:], + produced_f0_i[prefix:], + ) + + prompt_fname = batch["filename"][0] + fname = str(pathlib.Path(prompt_fname).with_suffix("")) + f"__{i}.wav" + + token_stream = unroll_duration(produced_tokens_i, produced_durations_i) + f0_stream = unroll_duration(produced_f0_i, produced_durations_i) + output_line = json.dumps( + { + "audio": fname, + "prompt": prompt_fname, + raw_args.code_type: " ".join(map(str, token_stream)), + "duration": round( + sum(produced_durations_i) + * CODETYPE_TO_FRAMETIME[raw_args.code_type], + 3, + ), + "raw_duration": produced_durations_i, + "raw_f0": produced_f0_i, + "f0": [round(f0, 3) for f0 in f0_stream], + } + ) + print(output_line, file=output_file) + + if pbar: + pbar.update(1) + samples_produced += 1 + + if raw_args.debug: + break + + output_file.close() + + if world_size > 1: + # important that everything is flushed before aggregating + torch.distributed.barrier() + + if world_size > 1 and rank == 0: + with open(raw_args.output, "w") as fout: + for i in range(world_size): + f = raw_args.output + f"_{i}" + with open(f, "r") as fin: + fout.write(fin.read()) + os.remove(f) + + +def cli_main(): + parser = options.get_interactive_generation_parser() + parser.add_argument( + "--prefix-length", + type=int, + default=1, + help="Prompt prefix length (including <s>)", + ) + parser.add_argument("--output", type=str, default=None, required=True) + parser.add_argument( + "--debug", action="store_true", help="Process only the first batch" + ) + parser.add_argument( + "--ignore-durations", + action="store_true", + help="If set, the duration stream is ignored", + ) + parser.add_argument( + "--max-length", type=int, default=200, help="Maximal produced length" + ) + parser.add_argument( + "--code-type", choices=["cpc_km100", "hubert"], default="cpc_km100" + ) + parser.add_argument("--max-samples", type=int, default=None) + parser.add_argument("--prompt-duration-scaler", type=float, default=1.0) + parser.add_argument("--teacher-force-tokens", action="store_true", default=False) + parser.add_argument("--teacher-force-duration", action="store_true", default=False) + 
parser.add_argument("--teacher-force-f0", action="store_true", default=False) + parser.add_argument("--filter-names", type=str, default=None) + parser.add_argument( + "--match-duration", + action="store_true", + help="Do not produce sequences longer that ground-truth", + ) + parser.add_argument( + "--cut-prompt", + action="store_true", + help="Remove prompt from the produced audio", + ) + parser.add_argument( + "--short-curcuit", action="store_true", help="Use 'target' as a sample" + ) + parser.add_argument("--f0-discretization-bounds", type=str, default=None) + + parser.add_argument("--batch-explosion-rate", type=int, default=1) + + parser.add_argument("--T-token", type=float, default=1.0) + parser.add_argument("--T-duration", type=float, default=1.0) + parser.add_argument("--T-f0", type=float, default=1.0) + + parser.add_argument( + "--subset", type=str, default="valid", choices=["test", "valid"] + ) + + args = options.parse_args_and_arch(parser) + + assert ( + args.prefix_length >= 1 + ), "Prefix length includes bos token <s>, hence the minimum is 1." + assert all( + t >= 0 for t in [args.T_token, args.T_f0, args.T_duration] + ), "T must be non-negative!" + + world_size = torch.cuda.device_count() + if world_size > 1: + import random + + mp.set_start_method("spawn", force=True) + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(random.randint(10_000, 50_000)) + + print(f"Using {world_size} devices, master port {os.environ['MASTER_PORT']}") + + mp.spawn( + main, + nprocs=world_size, + args=( + world_size, + args, + ), + join=True, + ) + else: + main(rank=0, world_size=world_size, args=args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/textless_nlp/pgslm/scripts/join_units_manifest.py b/examples/textless_nlp/pgslm/scripts/join_units_manifest.py new file mode 100644 index 0000000000..ed14fc5f59 --- /dev/null +++ b/examples/textless_nlp/pgslm/scripts/join_units_manifest.py @@ -0,0 +1,48 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import argparse +import pathlib + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--manifest", required=True) + parser.add_argument("--units", required=True) + parser.add_argument("--output", required=True) + parser.add_argument("--sample_rate", type=int, default=16_000) + + args = parser.parse_args() + + with open(args.manifest, "r") as manifest, open(args.units, "r") as units, open( + args.output, "w" + ) as outp: + root = manifest.readline().strip() + root = pathlib.Path(root) + + for manifest_line, unit_line in zip(manifest.readlines(), units.readlines()): + path, frames = manifest_line.split() + duration = int(frames) / float(args.sample_rate) + fname = root / path + speaker = fname.parent.parent.name + + units = unit_line.split("|")[1] + + print( + json.dumps( + dict( + audio=str(root / path), + duration=duration, + hubert_km100=units.strip(), + speaker=speaker, + ) + ), + file=outp, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/textless_nlp/pgslm/scripts/prepare_data.sh b/examples/textless_nlp/pgslm/scripts/prepare_data.sh new file mode 100644 index 0000000000..ec892e59a4 --- /dev/null +++ b/examples/textless_nlp/pgslm/scripts/prepare_data.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -eu + +train_json=$1 +valid_json=$2 +test_json=$3 +n_units=$4 +hop_size=$5 +sr=$6 +f0_quantizer=$7 +out_dir=$8 + +meta_path="$out_dir/data_config.json" +f0_dir="$out_dir/f0" + +mkdir -p $out_dir +ln -sf $train_json $out_dir/train.txt +ln -sf $valid_json $out_dir/valid.txt +ln -sf $test_json $out_dir/test.txt + +cat <<EOF >$meta_path +{ + "manifests": { + "train": "$out_dir/train.txt", + "valid": "$out_dir/valid.txt", + "test": "$out_dir/test.txt" + }, + "n_units": $n_units, + "code_hop_size": $hop_size, + "sampling_rate": $sr, + "multispkr": "parent_parent_name", + + "f0_vq_type": "naive", + "f0_vq_naive_quantizer": { + "log_mean_norm": "$f0_quantizer" + }, + "f0_vq_n_units": 32 +} +EOF + +for split in train valid test; do + python examples/textless_nlp/pgslm/preprocess_f0.py \ + $out_dir/$split.txt $f0_dir/$split --nshards=1 --rank=1 --sampling_rate=$sr + + #NSHARDS=16 + #seq 1 $NSHARDS | parallel -j $NSHARDS python examples/textless_nlp/pgslm/preprocess_f0.py \ + # $out_dir/$split.txt $f0_dir/$split --nshards=$NSHARDS --sampling_rate=$sr --rank +done + +# Please make sure that the number of shards (--nshards_list) is consistent across commands +python examples/textless_nlp/pgslm/prepare_dataset.py \ + $meta_path $f0_dir --splits test valid train --nshards_list 1 diff --git a/examples/textless_nlp/pgslm/scripts/prepare_f0_quantization.sh b/examples/textless_nlp/pgslm/scripts/prepare_f0_quantization.sh new file mode 100644 index 0000000000..3a285a39bc --- /dev/null +++ b/examples/textless_nlp/pgslm/scripts/prepare_f0_quantization.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -eu + +train_json=$1 +sr=$2 +nbins=$3 +out_dir=$4 +out_prefix=$5 + +f0_dir="$out_dir/f0" + +python examples/textless_nlp/pgslm/preprocess_f0.py \ + $train_json $f0_dir/${out_prefix}_f0_quant --nshards 1 --rank 1 --sampling_rate $sr + +# NB: one can use parallel here: +# NSHARDS=16 +# +#seq 1 $NSHARDS | parallel -j $NSHARDS python examples/textless_nlp/pgslm/preprocess_f0.py \ +# $train_json $f0_dir/${out_prefix}_f0_quant --nshards $NSHARDS --sampling_rate $sr --rank + +python examples/textless_nlp/pgslm/quantize_f0.py \ + $train_json $f0_dir/${out_prefix}_f0_quant $out_dir $out_prefix --nbins $nbins --nshards 1 --normalize mean --log diff --git a/examples/textless_nlp/pgslm/truncated_laplace.py b/examples/textless_nlp/pgslm/truncated_laplace.py new file mode 100644 index 0000000000..089f8a8cfc --- /dev/null +++ b/examples/textless_nlp/pgslm/truncated_laplace.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import warnings + + +def truncated_laplace(mean, T, truncate_by_zero=False): + """Generating a sample from a Laplace distribution, possible left-truncated at zero. + A bit of explanation here https://stats.stackexchange.com/a/357598 . 
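+
+    Sketch of the idea, matching the code below: the CDF of Laplace(mean, T) is
+    F(x) = 0.5 + 0.5 * sign(x - mean) * (1 - exp(-|x - mean| / T)).
+    With truncation enabled we compute percentile = F(0), draw
+    p ~ Uniform[percentile, 1), and apply the inverse CDF
+    x = mean - T * sign(p - 0.5) * log(1 - 2 * |p - 0.5|),
+    which keeps every sample at or above zero; without truncation, percentile is 0.0
+    and this reduces to ordinary Laplace sampling.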
+ """ + assert isinstance(mean, torch.Tensor) + + if not truncate_by_zero: + percentile = 0.0 + else: + if not (mean >= 0.0).all(): + warnings.warn(f"means are supposed to be non-negative, but got {mean}") + mean = torch.clamp_min(mean, 0.0) + + lower_bound = mean.new_tensor([0.0]) + percentile = 0.5 + 0.5 * torch.sign(lower_bound - mean) * ( + 1.0 - torch.exp(-1.0 / T * torch.abs(mean - lower_bound)) + ) + + p = torch.empty_like(mean).uniform_() * (1.0 - percentile) + percentile + return mean - T * torch.sign(p - 0.5) * torch.log(1 - 2 * torch.abs(p - 0.5)) diff --git a/examples/textless_nlp/speech-resynth/README.md b/examples/textless_nlp/speech-resynth/README.md new file mode 100644 index 0000000000..a099682cdb --- /dev/null +++ b/examples/textless_nlp/speech-resynth/README.md @@ -0,0 +1,28 @@ + +# Speech Resynthesis from Discrete Disentangled Self-Supervised Representations +Landing page with usfull resources for the [Speech Resynthesis from Discrete Disentangled Self-Supervised Representations](https://arxiv.org/abs/2104.00355) paper. + +<p align="center"><img width="70%" src="img/fig.png" /></p> + +__Abstract__: We propose using self-supervised discrete representations for the task of speech resynthesis. To generate disentangled representation, we separately extract low-bitrate representations for speech content, prosodic information, and speaker identity. This allows to synthesize speech in a controllable manner. We analyze various state-of-the-art, self-supervised representation learning methods and shed light on the advantages of each method while considering reconstruction quality and disentanglement properties. Specifically, we evaluate the F0 reconstruction, speaker identification performance (for both resynthesis and voice conversion), recordings' intelligibility, and overall quality using subjective human evaluation. Lastly, we demonstrate how these representations can be used for an ultra-lightweight speech codec. Using the obtained representations, we can get to a rate of 365 bits per second while providing better speech quality than the baseline methods. + + +## Quick Links +- [Paper](https://arxiv.org/pdf/2104.00355.pdf) +- [Samples](https://speechbot.github.io/resynthesis/index.html) +- [Code](https://github.com/facebookresearch/speech-resynthesis) + +The codebase for the [Speech Resynthesis from Discrete Disentangled Self-Supervised Representations](https://arxiv.org/abs/2104.00355) paper can be found under the following [repository](https://github.com/facebookresearch/speech-resynthesis). + + +## Citation +``` +@inproceedings{polyak21_interspeech, + author={Adam Polyak and Yossi Adi and Jade Copet and + Eugene Kharitonov and Kushal Lakhotia and + Wei-Ning Hsu and Abdelrahman Mohamed and Emmanuel Dupoux}, + title={{Speech Resynthesis from Discrete Disentangled Self-Supervised Representations}}, + year=2021, + booktitle={Proc. 
Interspeech 2021}, +} +``` diff --git a/examples/textless_nlp/speech-resynth/img/fig.png b/examples/textless_nlp/speech-resynth/img/fig.png new file mode 100644 index 0000000000..585bbbce14 Binary files /dev/null and b/examples/textless_nlp/speech-resynth/img/fig.png differ diff --git a/examples/translation/README.md b/examples/translation/README.md index 3eb8e01310..2941f5eb84 100644 --- a/examples/translation/README.md +++ b/examples/translation/README.md @@ -263,12 +263,12 @@ fairseq-preprocess --source-lang fr --target-lang en \ mkdir -p checkpoints/multilingual_transformer CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \ --max-epoch 50 \ - --ddp-backend=no_c10d \ + --ddp-backend=legacy_ddp \ --task multilingual_translation --lang-pairs de-en,fr-en \ --arch multilingual_transformer_iwslt_de_en \ --share-decoders --share-decoder-input-output-embed \ --optimizer adam --adam-betas '(0.9, 0.98)' \ - --lr 0.0005 --lr-scheduler inverse_sqrt --min-lr '1e-09' \ + --lr 0.0005 --lr-scheduler inverse_sqrt \ --warmup-updates 4000 --warmup-init-lr '1e-07' \ --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \ --dropout 0.3 --weight-decay 0.0001 \ diff --git a/examples/translation/prepare-iwslt14.sh b/examples/translation/prepare-iwslt14.sh index 0bf0dc2a2e..2fb6643fbc 100644 --- a/examples/translation/prepare-iwslt14.sh +++ b/examples/translation/prepare-iwslt14.sh @@ -15,7 +15,7 @@ CLEAN=$SCRIPTS/training/clean-corpus-n.perl BPEROOT=subword-nmt/subword_nmt BPE_TOKENS=10000 -URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz" +URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz" GZ=de-en.tgz if [ ! -d "$SCRIPTS" ]; then diff --git a/examples/translation_moe/README.md b/examples/translation_moe/README.md index 33f1bee5cb..2e5c8af617 100644 --- a/examples/translation_moe/README.md +++ b/examples/translation_moe/README.md @@ -15,16 +15,16 @@ The model is trained with online responsibility assignment and shared parameteri The following command will train a `hMoElp` model with `3` experts: ```bash -fairseq-train --ddp-backend='no_c10d' \ +fairseq-train --ddp-backend='legacy_ddp' \ data-bin/wmt17_en_de \ --max-update 100000 \ - --task translation_moe --user-dir examples/translation_moe/src \ + --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --arch transformer_wmt_en_de --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ - --lr 0.0007 --min-lr 1e-09 \ + --lr 0.0007 \ --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \ --max-tokens 3584 ``` @@ -37,7 +37,7 @@ For example, to generate from expert 0: fairseq-generate data-bin/wmt17_en_de \ --path checkpoints/checkpoint_best.pt \ --beam 1 --remove-bpe \ - --task translation_moe --user-dir examples/translation_moe/src \ + --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --gen-expert 0 @@ -61,7 +61,7 @@ for EXPERT in $(seq 0 2); do \ --beam 1 \ --bpe subword_nmt --bpe-codes $BPE_CODE \ --buffer-size 500 --max-tokens 6000 \ - --task translation_moe --user-dir examples/translation_moe/src \ + --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --gen-expert $EXPERT ; \ diff --git 
a/examples/translation_moe/score.py b/examples/translation_moe/score.py index 9a529a9850..e45b2cb62e 100644 --- a/examples/translation_moe/score.py +++ b/examples/translation_moe/score.py @@ -17,8 +17,8 @@ from itertools import chain import numpy as np -from sacrebleu import compute_bleu, corpus_bleu as _corpus_bleu - +import sacrebleu +from sacrebleu import corpus_bleu as _corpus_bleu def main(): parser = argparse.ArgumentParser(sys.argv[0]) @@ -119,7 +119,7 @@ def sentence_bleu(hypothesis, reference): for i in range(1, 4): bleu.counts[i] += 1 bleu.totals[i] += 1 - bleu = compute_bleu( + bleu = sacrebleu.BLEU.compute_bleu( bleu.counts, bleu.totals, bleu.sys_len, diff --git a/examples/translation_moe/src/__init__.py b/examples/translation_moe/translation_moe_src/__init__.py similarity index 100% rename from examples/translation_moe/src/__init__.py rename to examples/translation_moe/translation_moe_src/__init__.py diff --git a/examples/translation_moe/src/logsumexp_moe.py b/examples/translation_moe/translation_moe_src/logsumexp_moe.py similarity index 100% rename from examples/translation_moe/src/logsumexp_moe.py rename to examples/translation_moe/translation_moe_src/logsumexp_moe.py diff --git a/examples/translation_moe/src/mean_pool_gating_network.py b/examples/translation_moe/translation_moe_src/mean_pool_gating_network.py similarity index 83% rename from examples/translation_moe/src/mean_pool_gating_network.py rename to examples/translation_moe/translation_moe_src/mean_pool_gating_network.py index 484b6ac912..efc7ae40bf 100644 --- a/examples/translation_moe/src/mean_pool_gating_network.py +++ b/examples/translation_moe/translation_moe_src/mean_pool_gating_network.py @@ -26,15 +26,15 @@ def __init__(self, embed_dim, num_experts, dropout=None): def forward(self, encoder_out): if not ( - hasattr(encoder_out, "encoder_out") - and hasattr(encoder_out, "encoder_padding_mask") - and encoder_out.encoder_out.size(2) == self.embed_dim + "encoder_out" in encoder_out + and "encoder_padding_mask" in encoder_out + and encoder_out["encoder_out"][0].size(2) == self.embed_dim ): raise ValueError("Unexpected format for encoder_out") # mean pooling over time - encoder_padding_mask = encoder_out.encoder_padding_mask # B x T - encoder_out = encoder_out.encoder_out.transpose(0, 1) # B x T x C + encoder_padding_mask = encoder_out["encoder_padding_mask"][0] # B x T + encoder_out = encoder_out["encoder_out"][0].transpose(0, 1) # B x T x C if encoder_padding_mask is not None: encoder_out = encoder_out.clone() # required because of transpose above encoder_out[encoder_padding_mask] = 0 diff --git a/examples/translation_moe/src/translation_moe.py b/examples/translation_moe/translation_moe_src/translation_moe.py similarity index 73% rename from examples/translation_moe/src/translation_moe.py rename to examples/translation_moe/translation_moe_src/translation_moe.py index ae458aaad3..a829bf7dcb 100644 --- a/examples/translation_moe/src/translation_moe.py +++ b/examples/translation_moe/translation_moe_src/translation_moe.py @@ -3,16 +3,53 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+from dataclasses import dataclass, field import torch -from fairseq import metrics, utils +from omegaconf import II + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.dataclass import ChoiceEnum from fairseq.tasks import register_task -from fairseq.tasks.translation import TranslationTask +from fairseq.tasks.translation import TranslationConfig, TranslationTask from .logsumexp_moe import LogSumExpMoE from .mean_pool_gating_network import MeanPoolGatingNetwork -@register_task("translation_moe") +METHOD_CHOICES = ChoiceEnum(["sMoElp", "sMoEup", "hMoElp", "hMoEup"]) + + +@dataclass +class TranslationMoEConfig(TranslationConfig): + method: METHOD_CHOICES = field( + default="hMoEup", + metadata={"help": "MoE method"}, + ) + num_experts: int = field( + default=3, + metadata={"help": "number of experts"}, + ) + mean_pool_gating_network: bool = field( + default=False, + metadata={"help": "use a simple mean-pooling gating network"}, + ) + mean_pool_gating_network_dropout: float = field( + default=0, + metadata={"help": "dropout for mean-pooling gating network"}, + ) + mean_pool_gating_network_encoder_dim: int = field( + default=0, + metadata={"help": "encoder output dim for mean-pooling gating network"}, + ) + gen_expert: int = field( + default=0, + metadata={"help": "which expert to use for generation"}, + ) + sentence_avg: bool = II("optimization.sentence_avg") + + +@register_task("translation_moe", dataclass=TranslationMoEConfig) class TranslationMoETask(TranslationTask): """ Translation task for Mixture of Experts (MoE) models. @@ -37,77 +74,60 @@ class TranslationMoETask(TranslationTask): :prog: """ - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - TranslationTask.add_args(parser) - parser.add_argument('--method', default='hMoEup', - choices=['sMoElp', 'sMoEup', 'hMoElp', 'hMoEup']) - parser.add_argument('--num-experts', default=3, type=int, metavar='N', - help='number of experts') - parser.add_argument('--mean-pool-gating-network', action='store_true', - help='use a simple mean-pooling gating network') - parser.add_argument('--mean-pool-gating-network-dropout', type=float, - help='dropout for mean-pooling gating network') - parser.add_argument('--mean-pool-gating-network-encoder-dim', type=float, - help='encoder output dim for mean-pooling gating network') - parser.add_argument('--gen-expert', type=int, default=0, - help='which expert to use for generation') - # fmt: on - - def __init__(self, args, src_dict, tgt_dict): - if args.method == "sMoElp": + cfg: TranslationMoEConfig + + def __init__(self, cfg: TranslationMoEConfig, src_dict, tgt_dict): + if cfg.method == "sMoElp": # soft MoE with learned prior self.uniform_prior = False self.hard_selection = False - elif args.method == "sMoEup": + elif cfg.method == "sMoEup": # soft MoE with uniform prior self.uniform_prior = True self.hard_selection = False - elif args.method == "hMoElp": + elif cfg.method == "hMoElp": # hard MoE with learned prior self.uniform_prior = False self.hard_selection = True - elif args.method == "hMoEup": + elif cfg.method == "hMoEup": # hard MoE with uniform prior self.uniform_prior = True self.hard_selection = True # add indicator tokens for each expert - for i in range(args.num_experts): + for i in range(cfg.num_experts): # add to both dictionaries in case we're sharing embeddings src_dict.add_symbol("<expert_{}>".format(i)) tgt_dict.add_symbol("<expert_{}>".format(i)) - super().__init__(args, src_dict, tgt_dict) + 
super().__init__(cfg, src_dict, tgt_dict) - def build_model(self, args): + def build_model(self, cfg, from_checkpoint=False): from fairseq import models - model = models.build_model(args, self) + model = models.build_model(cfg, self) if not self.uniform_prior and not hasattr(model, "gating_network"): - if self.args.mean_pool_gating_network: - if getattr(args, "mean_pool_gating_network_encoder_dim", None): - encoder_dim = args.mean_pool_gating_network_encoder_dim - elif getattr(args, "encoder_embed_dim", None): + if self.cfg.mean_pool_gating_network: + if self.cfg.mean_pool_gating_network_encoder_dim > 0: + encoder_dim = self.cfg.mean_pool_gating_network_encoder_dim + elif getattr(cfg, "encoder_embed_dim", None): # assume that encoder_embed_dim is the encoder's output dimension - encoder_dim = args.encoder_embed_dim + encoder_dim = cfg.encoder_embed_dim else: raise ValueError( "Must specify --mean-pool-gating-network-encoder-dim" ) - if getattr(args, "mean_pool_gating_network_dropout", None): - dropout = args.mean_pool_gating_network_dropout - elif getattr(args, "dropout", None): - dropout = args.dropout + if self.cfg.mean_pool_gating_network_dropout > 0: + dropout = self.cfg.mean_pool_gating_network_dropout + elif getattr(cfg, "dropout", None): + dropout = cfg.dropout else: - raise ValueError("Must specify --mean-pool-gating-network-dropout") + raise ValueError("Must specify task.mean_pool_gating_network_dropout") model.gating_network = MeanPoolGatingNetwork( encoder_dim, - args.num_experts, + self.cfg.num_experts, dropout, ) else: @@ -125,7 +145,7 @@ def _get_loss(self, sample, model, criterion): criterion, "compute_loss" ), "translation_moe task requires the criterion to implement the compute_loss() method" - k = self.args.num_experts + k = self.cfg.num_experts bsz = sample["target"].size(0) def get_lprob_y(encoder_out, prev_output_tokens_k): @@ -185,7 +205,7 @@ def get_lprob_yz(winners=None): loss = loss.sum() sample_size = ( - sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"] + sample["target"].size(0) if self.cfg.sentence_avg else sample["ntokens"] ) logging_output = { "loss": utils.item(loss.data), @@ -221,7 +241,7 @@ def inference_step( expert=None, constraints=None, ): - expert = expert or self.args.gen_expert + expert = expert or self.cfg.gen_expert with torch.no_grad(): return generator.generate( models, diff --git a/examples/truncated_bptt/README.md b/examples/truncated_bptt/README.md new file mode 100644 index 0000000000..86518c9d5e --- /dev/null +++ b/examples/truncated_bptt/README.md @@ -0,0 +1,70 @@ +# Truncated Backpropagation Through Time (BPTT) + +Truncated BPTT is a useful technique for training language models on very long +sequences. Typically a long sequences is split into chunks and a language model +is trained over the chunks sequentially. The LM may condition on previous +chunks, but gradients only flow through the current chunk. This technique was +the basis for the paper: [Transformer-XL: Attentive Language Models Beyond a +Fixed-Length Context](https://arxiv.org/abs/1901.02860), which achieved +state-of-the-art language modeling results at the time of publication. + +It is slightly tricky to implement Truncated BPTT efficiently in fairseq, since +we need to iterate over the data sequentially and disable any batch shuffling +logic. The code provided in this example illustrates how to implement Truncated +BPTT in fairseq by overriding ``FairseqTask::get_batch_iterator`` to iterate +over the data sequentially. 
Crucially, this example supports batching and +multi-GPU (data parallel) training. + +##### 0. Setup + +First, see the general [language modeling README](README.md) for instructions on +preprocessing the WikiText-103 data. + +##### 1. Train a Transformer-XL model on WikiText-103 + +We will train a 16-layer Transformer-XL model following the [hyperparameters +used in the original +paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_wt103_base.sh). + +The following command assumes 4 GPUs, so that the total batch size is 60 +sequences (15 x 4). Training should take ~24 hours on 4 V100 GPUs: +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \ + --user-dir examples/truncated_bptt \ + data-bin/wikitext-103/ \ + --task truncated_bptt_lm --tokens-per-sample 150 \ + --batch-size 15 --max-update 200000 \ + --arch transformer_xl --n-layer 16 --d-model 410 --n-head 10 \ + --d-head 41 --d-inner 2100 --dropout 0.1 --dropatt 0.0 --mem-len 150 \ + --optimizer adam --clip-norm 0.25 \ + --lr-scheduler cosine --warmup-updates 0 --min-lr 0.0 --lr 0.00025 \ + --log-format json --log-interval 25 \ + --fp16 +``` + +If training on a single GPU, set `--update-freq=4` to accumulate 4x gradients +and simulate training on 4 GPUs. + +##### 2. Evaluate + +```bash +fairseq-eval-lm data-bin/wikitext-103/ \ + --path checkpoints/checkpoint_best.pt \ + --user-dir examples/truncated_bptt/ \ + --task truncated_bptt_lm \ + --batch-size 1 --required-batch-size-multiple 1 \ + --model-overrides '{"mem_len":640,"clamp_len":400,"same_length":True}' \ + --tokens-per-sample 64 +# ... | INFO | fairseq_cli.eval_lm | num. model params: 151123537 +# ... | INFO | fairseq_cli.eval_lm | Evaluated 245569 tokens in 83.1s (2956.82 tokens/s) +# ... | INFO | fairseq_cli.eval_lm | Loss (base 2): 4.5668, Perplexity: 23.70 +# Compare to 24.0 test perplexity from the paper +``` + +*Note:* During training the model saw 150 tokens of context +(``--tokens-per-sample=150``) and 150 extra memory tokens (``--mem-len=150``). +During evaluation we measure perplexity on sequences of 64 tokens +(``--tokens-per-sample=64``) and increase the memory length +(``--model-overrides='{"mem_len":640}'``). These settings match the evaluation +settings from [the original +paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_wt103_base.sh). diff --git a/examples/truncated_bptt/__init__.py b/examples/truncated_bptt/__init__.py new file mode 100644 index 0000000000..eee484d427 --- /dev/null +++ b/examples/truncated_bptt/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import transformer_xl_model, truncated_bptt_lm_task # noqa diff --git a/examples/truncated_bptt/transformer_xl_model.py b/examples/truncated_bptt/transformer_xl_model.py new file mode 100644 index 0000000000..58c0f6ad8a --- /dev/null +++ b/examples/truncated_bptt/transformer_xl_model.py @@ -0,0 +1,143 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +import torch +from fairseq.dataclass import FairseqDataclass +from fairseq.models import ( + FairseqIncrementalDecoder, + FairseqLanguageModel, + register_model, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from omegaconf import II + + +logger = logging.getLogger(__name__) + + +@dataclass +class TransformerXLConfig(FairseqDataclass): + # defaults come from the original Transformer-XL code + cutoffs: List[int] = field(default_factory=lambda: [20000, 40000, 200000]) + d_model: int = 500 + n_head: int = 10 + d_head: int = 50 + d_inner: int = 1000 + div_val: int = 1 + n_layer: int = 12 + mem_len: int = 0 + clamp_len: int = -1 + same_length: bool = False + dropout: float = 0.0 + dropatt: float = 0.0 + checkpoint_activations: bool = False + offload_activations: bool = False + max_target_positions: int = II("task.max_target_positions") + + +@register_model("transformer_xl", dataclass=TransformerXLConfig) +class TransformerXLLanguageModel(FairseqLanguageModel): + @classmethod + def build_model(cls, cfg: TransformerXLConfig, task): + return cls(TransformerXLDecoder(cfg, task)) + + +class TransformerXLDecoder(FairseqIncrementalDecoder): + def __init__(self, cfg, task): + try: + from transformers.models.transfo_xl import ( + TransfoXLConfig, + TransfoXLLMHeadModel, + ) + except ImportError: + from transformers.configuration_transfo_xl import TransfoXLConfig + from transformers.modeling_transfo_xl import TransfoXLLMHeadModel + + super().__init__(task.target_dictionary) + self.cfg = cfg + + # remove any cutoffs larger than the vocab size + cutoffs = [ + cutoff for cutoff in cfg.cutoffs if cutoff < len(task.target_dictionary) + ] + + config = TransfoXLConfig( + vocab_size=len(task.target_dictionary), + cutoffs=cutoffs, + d_model=cfg.d_model, + d_embed=cfg.d_model, + n_head=cfg.n_head, + d_head=cfg.d_head, + d_inner=cfg.d_inner, + div_val=cfg.div_val, + n_layer=cfg.n_layer, + mem_len=cfg.mem_len, + clamp_len=cfg.clamp_len, + same_length=cfg.same_length, + dropout=cfg.dropout, + dropatt=cfg.dropatt, + ) + logger.info(config) + self.model = TransfoXLLMHeadModel(config) + + if cfg.checkpoint_activations or cfg.offload_activations: + for i in range(len(self.model.transformer.layers)): + self.model.transformer.layers[i] = checkpoint_wrapper( + self.model.transformer.layers[i], + offload_to_cpu=cfg.offload_activations, + ) + # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3]) + + self._mems = None + + def forward( + self, + src_tokens, + src_lengths=None, # unused + incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, + encoder_out=None, + ): + if incremental_state is not None: # used during inference + mems = self.get_incremental_state(incremental_state, "mems") + src_tokens = src_tokens[:, -1:] # only keep the most recent token + else: + mems = self._mems + + output = self.model( + input_ids=src_tokens, + mems=mems, + return_dict=False, + ) + + if len(output) >= 2: + if incremental_state is not None: + self.set_incremental_state(incremental_state, "mems", output[1]) + else: + self._mems = output[1] + + return (output[0],) + + def max_positions(self): + return self.cfg.max_target_positions + + def reorder_incremental_state( + self, + incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]], + new_order: torch.Tensor, + ): + """Reorder incremental state. + + This will be called when the order of the input has changed from the + previous time step. 
A typical use case is beam search, where the input + order changes between time steps based on the selection of beams. + """ + mems = self.get_incremental_state(incremental_state, "mems") + if mems is not None: + new_mems = [mems_i.index_select(1, new_order) for mems_i in mems] + self.set_incremental_state(incremental_state, "mems", new_mems) diff --git a/examples/truncated_bptt/truncated_bptt_lm_task.py b/examples/truncated_bptt/truncated_bptt_lm_task.py new file mode 100644 index 0000000000..9978481b6d --- /dev/null +++ b/examples/truncated_bptt/truncated_bptt_lm_task.py @@ -0,0 +1,285 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +from dataclasses import dataclass, field +from typing import List, Optional, Tuple + +import torch +from fairseq import utils +from fairseq.data import ( + Dictionary, + TokenBlockDataset, + data_utils, + iterators, +) +from fairseq.dataclass import FairseqDataclass +from fairseq.distributed import utils as dist_utils +from fairseq.tasks import FairseqTask, register_task +from omegaconf import II + + +logger = logging.getLogger(__name__) + + +@dataclass +class TruncatedBPTTLMConfig(FairseqDataclass): + data: str = field(default="???", metadata={"help": "path to data directory"}) + tokens_per_sample: int = field( + default=1024, metadata={"help": "max number of tokens per sequence"}, + ) + batch_size: int = II("dataset.batch_size") + # Some models use *max_target_positions* to know how many positional + # embeddings to learn. We use II(...) to make it default to + # *tokens_per_sample*, but in principle there could be more positional + # embeddings than tokens in a single batch. This may also be irrelevant for + # custom model implementations. 
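+    # (II("task.tokens_per_sample") is OmegaConf interpolation, i.e. the string
+    # "${task.tokens_per_sample}", resolved against the full config at runtime.)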
+ max_target_positions: int = II("task.tokens_per_sample") + # these will be populated automatically if not provided + data_parallel_rank: Optional[int] = None + data_parallel_size: Optional[int] = None + + +@register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig) +class TruncatedBPTTLMTask(FairseqTask): + def __init__(self, cfg: TruncatedBPTTLMConfig): + super().__init__(cfg) + + if cfg.data_parallel_rank is None or cfg.data_parallel_size is None: + if torch.distributed.is_initialized(): + cfg.data_parallel_rank = dist_utils.get_data_parallel_rank() + cfg.data_parallel_size = dist_utils.get_data_parallel_world_size() + else: + cfg.data_parallel_rank = 0 + cfg.data_parallel_size = 1 + + # load the dictionary + paths = utils.split_paths(cfg.data) + assert len(paths) > 0 + self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) + logger.info("dictionary: {} types".format(len(self.dictionary))) + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + """Load a given dataset split (e.g., train, valid, test)""" + + # support sharded datasets + paths = utils.split_paths(self.cfg.data) + assert len(paths) > 0 + data_path = paths[(epoch - 1) % len(paths)] + split_path = os.path.join(data_path, split) + + # each element of *data* will be a tensorized line from the original + # text dataset, similar to ``open(split_path).readlines()`` + data = data_utils.load_indexed_dataset( + split_path, self.dictionary, combine=combine + ) + if data is None: + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, split_path) + ) + + # this is similar to ``data.view(-1).split(tokens_per_sample)`` + data = TokenBlockDataset( + data, + data.sizes, + block_size=self.cfg.tokens_per_sample, + pad=None, # unused + eos=None, # unused + break_mode="none", + ) + + self.datasets[split] = TruncatedBPTTDataset( + data=data, + bsz_per_shard=self.cfg.batch_size, + shard_id=self.cfg.data_parallel_rank, + num_shards=self.cfg.data_parallel_size, + ) + + def dataset(self, split): + return self.datasets[split] + + def get_batch_iterator( + self, + dataset, + num_workers=0, + epoch=1, + data_buffer_size=0, + skip_remainder_batch=False, + **kwargs + ): + return iterators.EpochBatchIterator( + dataset=dataset, + collate_fn=self._collate_fn, + num_workers=num_workers, + epoch=epoch, + buffer_size=data_buffer_size, + # we don't use the batching functionality from EpochBatchIterator; + # instead every item in *dataset* is a whole batch + batch_sampler=[[i] for i in range(len(dataset))], + disable_shuffling=True, + skip_remainder_batch=skip_remainder_batch, + ) + + def _collate_fn(self, items: List[List[torch.Tensor]]): + # we don't use fairseq's batching functionality, so we expect a single + # Tensor of type List[torch.Tensor] + assert len(items) == 1 + + # item will have shape B x T (the last batch may have length < T) + id, item = items[0] + item = data_utils.collate_tokens(item, pad_idx=self.source_dictionary.pad()) + B, T = item.size() + + # shift item one position over and append a padding token for the target + target = torch.nn.functional.pad( + item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad() + ) + + # fairseq expects batches to have the following structure + return { + "id": torch.tensor([id] * item.size(0)), + "net_input": {"src_tokens": item,}, + "target": target, + "nsentences": item.size(0), + "ntokens": item.numel(), + } + + def build_dataset_for_inference( + self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs + ) -> 
torch.utils.data.Dataset: + eos = self.source_dictionary.eos() + dataset = TokenBlockDataset( + src_tokens, + src_lengths, + block_size=None, # ignored for "eos" break mode + pad=self.source_dictionary.pad(), + eos=eos, + break_mode="eos", + ) + + class Dataset(torch.utils.data.Dataset): + def __getitem__(self, i): + item = dataset[i] + if item[-1] == eos: + # remove eos to support generating with a prefix + item = item[:-1] + return (i, [item]) + + def __len__(self): + return len(dataset) + + return Dataset() + + def inference_step( + self, generator, models, sample, prefix_tokens=None, constraints=None + ): + with torch.no_grad(): + if constraints is not None: + raise NotImplementedError + + # SequenceGenerator doesn't use *src_tokens* directly, we need to + # pass the *prefix_tokens* argument instead. + if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement(): + prefix_tokens = sample["net_input"]["src_tokens"] + + # begin generation with the end-of-sentence token + bos_token = self.source_dictionary.eos() + + return generator.generate( + models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token + ) + + def eval_lm_dataloader( + self, + dataset, + max_tokens: Optional[int] = 36000, + batch_size: Optional[int] = None, + max_positions: Optional[int] = None, + num_shards: int = 1, + shard_id: int = 0, + num_workers: int = 1, + data_buffer_size: int = 10, + context_window: int = 0, + ): + if context_window > 0: + raise NotImplementedError( + "Transformer-XL doesn't need --context-window, try " + "--model-overrides '{\"mem_len\":42}' instead " + ) + return self.get_batch_iterator( + dataset=dataset, + max_tokens=max_tokens, + max_sentences=batch_size, + max_positions=max_positions, + ignore_invalid_inputs=True, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + data_buffer_size=data_buffer_size, + ).next_epoch_itr(shuffle=False) + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.dictionary + + +class TruncatedBPTTDataset(torch.utils.data.Dataset): + def __init__( + self, + data: List[torch.Tensor], # ordered list of items + bsz_per_shard, # number of items processed per GPUs per forward + shard_id, # current GPU ID + num_shards, # number of GPUs + ): + super().__init__() + self.data = data + + def batchify(data, bsz): + # Work out how cleanly we can divide the dataset into bsz parts. + nbatch = data.size(0) // bsz + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, nbatch * bsz) + # Evenly divide the data across the bsz batches. 
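+            # e.g. 16 items with bsz=6: nbatch=2, keep the first 12 items, reshape to 6 x 2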
+ data = data.view(bsz, -1).contiguous() + return data + + # total number of sequences processed by all GPUs in each forward pass + global_batch_size = bsz_per_shard * num_shards + + """ + With a 16 item dataset, bsz_per_shard=2 and num_shards=3, + *indices* might look like: + + indices = [[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11]] + + The size of the TruncatedBPTTDataset instance will be 2, + and shard 1 will see items: + + [(0, [data[4], data[6]]), + (1, [data[5], data[7]])] + """ + indices = batchify(torch.arange(len(data)), global_batch_size) + assert indices.size(0) == global_batch_size + + self.my_indices = indices[ + shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard + ] + assert self.my_indices.size(0) == bsz_per_shard + + def __len__(self): + return self.my_indices.size(1) + + def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]: + return (i, [self.data[idx] for idx in self.my_indices[:, i]]) diff --git a/examples/unsupervised_quality_estimation/README.md b/examples/unsupervised_quality_estimation/README.md index aeb96a14b1..e86a0d13b8 100644 --- a/examples/unsupervised_quality_estimation/README.md +++ b/examples/unsupervised_quality_estimation/README.md @@ -55,7 +55,7 @@ Translate ``` CUDA_VISIBLE_DEVICES=$GPU fairseq-generate $TMP/bin --path ${MODEL_DIR}/${SRC_LANG}-${TGT_LANG}.pt --beam 5 --source-lang $SRC_LANG --target-lang $TGT_LANG --no-progress-bar --unkpen 5 > $TMP/fairseq.out -grep ^H $TMP/fairseq.out | cut -f3- > $TMP/mt.out +grep ^H $TMP/fairseq.out | cut -d- -f2- | sort -n | cut -f3- > $TMP/mt.out ``` Post-process @@ -88,7 +88,7 @@ CUDA_VISIBLE_DEVICES=${GPU} fairseq-generate ${TMP}/bin-repeated --path ${MODEL_ --retain-dropout-modules '["TransformerModel","TransformerEncoder","TransformerDecoder","TransformerEncoderLayer"]' TransformerDecoderLayer --seed 46 > $TMP/dropout.scoring.out -grep ^H $TMP/dropout.scoring.out | cut -f2- > $TMP/dropout.scores +grep ^H $TMP/dropout.scoring.out | cut -d- -f2- | sort -n | cut -f2 > $TMP/dropout.scores ``` @@ -112,7 +112,7 @@ CUDA_VISIBLE_DEVICES=${GPU} fairseq-generate ${TMP}/bin-repeated --path ${MODEL_ --unkpen 5 --retain-dropout-modules TransformerModel TransformerEncoder TransformerDecoder TransformerEncoderLayer TransformerDecoderLayer --seed 46 > $TMP/dropout.generation.out -grep ^H $TMP/dropout.generation.out | cut -f3- > $TMP/dropout.hypotheses_ +grep ^H $TMP/dropout.generation.out | cut -d- -f2- | sort -n | cut -f3- > $TMP/dropout.hypotheses_ sed -r 's/(@@ )| (@@ ?$)//g' < $TMP/dropout.hypotheses_ | perl $MOSES_DECODER/scripts/tokenizer/detokenizer.perl -l $TGT_LANG > $TMP/dropout.hypotheses diff --git a/examples/unsupervised_quality_estimation/meteor.py b/examples/unsupervised_quality_estimation/meteor.py index 4a214e794d..2ee0448cf1 100644 --- a/examples/unsupervised_quality_estimation/meteor.py +++ b/examples/unsupervised_quality_estimation/meteor.py @@ -85,19 +85,19 @@ def read_output(meteor_output_path, n_repeats): def main(): parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input") + parser.add_argument("-i", "--infile") parser.add_argument("-n", "--repeat_times", type=int) parser.add_argument("-m", "--meteor") parser.add_argument("-o", "--output") args = parser.parse_args() - translations = read_translations(args.infile, args.repetitions) + translations = read_translations(args.infile, args.repeat_times) sys.stderr.write("\nGenerating input for Meteor...") - ref_path, mt_path = generate_input(translations, args.repetitions) + ref_path, mt_path = 
generate_input(translations, args.repeat_times) sys.stderr.write("\nRunning Meteor...") out_path = run_meteor(ref_path, mt_path, args.meteor) sys.stderr.write("\nReading output...") - scores = read_output(out_path, args.repetitions) + scores = read_output(out_path, args.repeat_times) sys.stderr.write("\nWriting results...") with open(args.output, "w") as o: for scr in scores: diff --git a/examples/wav2vec/README.md b/examples/wav2vec/README.md index 518d8f86cb..e979733075 100644 --- a/examples/wav2vec/README.md +++ b/examples/wav2vec/README.md @@ -2,6 +2,14 @@ wav2vec 2.0 learns speech representations on unlabeled data as described in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](https://arxiv.org/abs/2006.11477). +We learned speech representations in multiple languages as well in [Unsupervised Cross-lingual Representation Learning for Speech Recognition (Conneau et al., 2020)](https://arxiv.org/abs/2006.13979). + +We also combined wav2vec 2.0 with self-training in [Self-training and Pre-training are Complementary for Speech Recognition (Xu et al., 2020)](https://arxiv.org/abs/2010.11430). + +We combined speech data from multiple domains in [Robust wav2vec 2.0: Analyzing Domain Shift in Self-Supervised Pre-Training (Hsu, et al., 2021)](https://arxiv.org/abs/2104.01027). + +We finetuned XLSR-53 on multiple languages to transcribe unseen languages in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., 2021)](https://arxiv.org/abs/2109.11680). + ## Pre-trained models Model | Finetuning split | Dataset | Model @@ -14,18 +22,59 @@ Wav2Vec 2.0 Large | No finetuning | [Librispeech](http://www.openslr.org/12) | Wav2Vec 2.0 Large | 10 minutes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_10m.pt) Wav2Vec 2.0 Large | 100 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_100h.pt) Wav2Vec 2.0 Large | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_960h.pt) -Wav2Vec 2.0 Large (LV-60) | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox.pt) -Wav2Vec 2.0 Large (LV-60) | 10 minutes | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_10m.pt) -Wav2Vec 2.0 Large (LV-60) | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_100h.pt) -Wav2Vec 2.0 Large (LV-60) | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec2_vox_960h.pt) +Wav2Vec 2.0 Large (LV-60)* | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt) +Wav2Vec 2.0 Large conformer - rel_pos (LV-60)* | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_relpos_PT_no_FT) +Wav2Vec 2.0 Large conformer - rope (LV-60)* | No finetuning | 
[Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_rope_PT_no_FT) +Wav2Vec 2.0 Large (LV-60)* | 10 minutes | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_10m_new.pt) +Wav2Vec 2.0 Large (LV-60)* | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_100h_new.pt) +Wav2Vec 2.0 Large conformer - rel_pos (LV-60)* | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_relpos_PT_100h_FT.pt) +Wav2Vec 2.0 Large conformer - rope (LV-60)* | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_rope_PT_100h_FT.pt) +Wav2Vec 2.0 Large (LV-60)* | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec2_vox_960h_new.pt) +Wav2Vec 2.0 Large conformer - rel_pos (LV-60)* | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_relpos_PT_960h_FT.pt) +Wav2Vec 2.0 Large conformer - rope (LV-60)* | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_rope_PT_960h_FT.pt) +Wav2Vec 2.0 Large (LV-60) + Self Training * | 10 minutes | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_10m_pl.pt) +Wav2Vec 2.0 Large (LV-60) + Self Training * | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_100h_pl.pt) +Wav2Vec 2.0 Large (LV-60) + Self Training * | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt) +Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH) ** | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) + [CommonVoice](https://commonvoice.mozilla.org/en/languages) + [Switchboard](https://catalog.ldc.upenn.edu/LDC97S62) + [Fisher](https://catalog.ldc.upenn.edu/LDC2004T19) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv.pt) +Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH) ** | 960 hours Librispeech | [Libri-Light](https://github.com/facebookresearch/libri-light) + [CommonVoice](https://commonvoice.mozilla.org/en/languages) + [Switchboard](https://catalog.ldc.upenn.edu/LDC97S62) + [Fisher](https://catalog.ldc.upenn.edu/LDC2004T19) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv_ftls960_updated.pt) +Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH) ** | 300 hours Switchboard | [Libri-Light](https://github.com/facebookresearch/libri-light) + [CommonVoice](https://commonvoice.mozilla.org/en/languages) + 
[Switchboard](https://catalog.ldc.upenn.edu/LDC97S62) + [Fisher](https://catalog.ldc.upenn.edu/LDC2004T19) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv_ftsb300_updated.pt) + +\* updated (Oct. 24, 2020)\ +** updated (Nov. 13, 2021) + +We also release multilingual pre-trained wav2vec 2.0 (XLSR) models: + +Model | Architecture | Hours | Languages | Datasets | Model +|---|---|---|---|---|--- +XLSR-53 | Large | 56k | 53 | MLS, CommonVoice, BABEL | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_53_56k.pt) + +The XLSR model uses the following datasets for multilingual pretraining: + +* **[MLS: Multilingual LibriSpeech](https://indico2.conference4me.psnc.pl/event/35/contributions/3585/attachments/1060/1101/Wed-2-6-10.pdf)** (8 languages, 50.7k hours): *Dutch, English, French, German, Italian, Polish, Portuguese, Spanish* + +* **[CommonVoice](https://commonvoice.mozilla.org/en/languages)** (36 languages, 3.6k hours): *Arabic, Basque, Breton, Chinese (CN), Chinese (HK), Chinese (TW), Chuvash, Dhivehi, Dutch, English, Esperanto, Estonian, French, German, Hakh-Chin, Indonesian, Interlingua, Irish, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Mongolian, Persian, Portuguese, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Welsh* (see also [finetuning splits]([https://dl.fbaipublicfiles.com/cpc_audio/common_voices_splits.tar.gz]) from [this paper](https://arxiv.org/abs/2002.02848)). + +* **[Babel](https://catalog.ldc.upenn.edu/byyear)** (17 languages, 1.7k hours): *Assamese, Bengali, Cantonese, Cebuano, Georgian, Haitian, Kazakh, Kurmanji, Lao, Pashto, Swahili, Tagalog, Tamil, Tok, Turkish, Vietnamese, Zulu* + +We also finetuned several models on languages from [CommonVoice](https://commonvoice.mozilla.org/en/languages) (version 6.1) and [Babel](https://catalog.ldc.upenn.edu/byyear). Please refer to [our paper](https://arxiv.org/abs/2109.11680) for details about which languages are used. + +Pretrained Model | Fintune Dataset | # Languages | Phonemizer | Model | Dictionary +|---|---|---|---|---|--- +LV-60 | CommonVoice | 26 | [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_en_26lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_dict.txt) +XLSR-53 | CommonVoice | 26 | [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_26lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_dict.txt) +XLSR-53 | CommonVoice | 21 | [Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_21lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_dict.txt) +XLSR-53 | CommonVoice, BABEL | 21, 19 | [Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_40lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_40lang.dict.txt) + +We release 2 models that are finetuned on data from 2 different phonemizers. 
Although the phonemes are all [IPA](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) symbols, there are still subtle differences between the phonemized transcriptions from the 2 phonemizers. Thus, it's better to use the corresponding model if your data is phonemized by either phonemizer above. ## Training a new model with the CLI tools Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length) -### Prepare training data manifest: +### Prepare training data manifest First, install the `soundfile` library: + ```shell script pip install soundfile ``` @@ -33,7 +82,7 @@ pip install soundfile Next, run: ```shell script -$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext --valid-percent $valid +python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext --valid-percent $valid ``` $ext should be set to flac, wav, or whatever format your dataset happens to use that soundfile can read. @@ -42,50 +91,63 @@ $valid should be set to some reasonable percentage (like 0.01) of training data To use a pre-defined validation set (like dev-other from librispeech), set it to 0 and then overwrite valid.tsv with a separately pre-processed manifest file. -### Train a wav2vec 2.0 base model: +### Train a wav2vec 2.0 base model This configuration was used for the base model trained on the Librispeech dataset in the wav2vec 2.0 paper -Note that this was tested with pytorch 1.4.0 and the input is expected to be single channel, sampled at 16 kHz +Note that the input is expected to be single channel, sampled at 16 kHz ```shell script -$ python train.py --distributed-world-size 64 --distributed-port $PORT /manifest/path \ ---save-dir /model/path --fp16 --num-workers 6 --task audio_pretraining --criterion wav2vec --arch wav2vec2 \ ---log-keys '["prob_perplexity","code_perplexity","temp"]' --quantize-targets --extractor-mode default \ ---conv-feature-layers '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2' --final-dim 256 --latent-vars 320 \ ---latent-groups 2 --latent-temp '(2,0.5,0.999995)' --infonce --optimizer adam \ ---adam-betas '(0.9,0.98)' --adam-eps 1e-06 --lr-scheduler polynomial_decay --total-num-update 400000 \ ---lr 0.0005 --warmup-updates 32000 --mask-length 10 --mask-prob 0.65 --mask-selection static --mask-other 0 \ ---encoder-layerdrop 0.05 --dropout-input 0.1 --dropout-features 0.1 --feature-grad-mult 0.1 \ ---loss-weights '[0.1, 10]' --conv-pos 128 --conv-pos-groups 16 --num-negatives 100 --cross-sample-negatives 0 \ ---max-sample-size 250000 --min-sample-size 32000 --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ ---max-tokens 1400000 --max-update 400000 --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_base_librispeech ``` -Note: you can simulate 64 GPUs by using k GPUs and setting --update-freq 64/k +Note: you can simulate 64 GPUs by using k GPUs and adding command line parameters (before `--config-dir`) +`distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 64/k (for example, with k = 8 GPUs, pass `distributed_training.distributed_world_size=8` `+optimization.update_freq='[8]'`) -### Train a wav2vec 2.0 large model: +### Train a wav2vec 2.0 large model This configuration was used for the large model trained on the Libri-light dataset in the wav2vec 2.0 paper ```shell script -$ python train.py --distributed-world-size 128
--distributed-port $PORT /manifest/path \ ---save-dir /model/path --fp16 --num-workers 6 --task audio_pretraining --criterion wav2vec --arch wav2vec2 \ ---log-keys '["prob_perplexity","code_perplexity","temp"]' --quantize-targets --extractor-mode default \ ---conv-feature-layers '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2' --final-dim 768 --latent-vars 320 \ ---latent-groups 2 --latent-temp '(2.0,0.1,0.999995)' --infonce --optimizer adam \ ---adam-betas '(0.9,0.98)' --adam-eps 1e-06 --lr-scheduler polynomial_decay --total-num-update 600000 \ ---lr 0.0003 --warmup-updates 32000 --mask-length 10 --mask-prob 0.65 --mask-selection static --mask-other 0 \ ---encoder-layerdrop 0.0 --dropout-input 0.1 --dropout-features 0.1 --feature-grad-mult 0.03 \ ---loss-weights '[0.1, 10]' --conv-pos 128 --conv-pos-groups 16 --encoder-layers 24 --encoder-embed-dim 1024 \ ---encoder-ffn-embed-dim 4096 --encoder-attention-heads 16 --num-negatives 100 --cross-sample-negatives 0 \ ---max-sample-size 320000 --min-sample-size 32000 --dropout 0.0 --attention-dropout 0.1 --weight-decay 0.01 \ ---max-tokens 1200000 --max-update 600000 --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_large_librivox +``` + +Note: you can simulate 128 GPUs by using k GPUs and adding command line parameters (before `--config-dir`) +`distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 128/k + +### Train a wav2vec 2.0 model with conformer backbone + +To replace the transformer layers in the encoder with conformer layers, set `--layer-type conformer --attn-type espnet --pos-enc-type ${POS_ENC_TYPE}`. `POS_ENC_TYPE` refers to the positional encoding to be used in the conformer encoder. +Set it to `abs`, `rope` or `rel_pos` to use the absolute positional encoding, rotary positional encoding or relative positional encoding in the conformer layer, respectively. + +To train a base model with conformer: + +```shell script +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_conformer_base_librispeech \ + --attn-type espnet --pos-enc-type ${POS_ENC_TYPE} ``` -Note: you can simulate 128 GPUs by using k GPUs and setting --update-freq 128/k +To train a large model with conformer: + +```shell script +$ fairseq-hydra-train \ + task.data=/path/to/data \ + --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_conformer_large_librivox \ + --attn-type espnet --pos-enc-type ${POS_ENC_TYPE} + +``` -### Fine-tune a pre-trained model with CTC: +### Fine-tune a pre-trained model with CTC Fine-tuning a model requires parallel audio and label files, as well as a vocabulary file in fairseq format. A letter vocabulary can be downloaded [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt).
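To make the expected inputs concrete, here is a minimal sketch of how such parallel audio and label files could be laid out. It assumes the conventions used by `wav2vec_manifest.py` and `libri_labels.py` (a `.tsv` manifest whose first line is the audio root directory, followed by tab-separated relative paths and sample counts, plus a `.ltr` file of space-separated letters with `|` as the word boundary); the paths, sample counts, and transcripts below are made-up placeholders.

```python
# Sketch: write a toy fine-tuning split (manifest + word/letter targets).
# All file names, sample counts, and transcripts are illustrative only.
from pathlib import Path

audio_root = Path("/path/to/audio")     # directory containing the wav/flac files
manifest_dir = Path("/manifest/path")   # where train.tsv / train.ltr / train.wrd go
manifest_dir.mkdir(parents=True, exist_ok=True)

# (relative audio path, number of samples, transcript) per utterance
utterances = [
    ("spk1/utt1.flac", 53440, "HELLO WORLD"),
    ("spk2/utt2.flac", 98160, "A SECOND EXAMPLE"),
]

with open(manifest_dir / "train.tsv", "w") as tsv, \
     open(manifest_dir / "train.wrd", "w") as wrd, \
     open(manifest_dir / "train.ltr", "w") as ltr:
    print(audio_root, file=tsv)                        # first line: audio root directory
    for rel_path, n_samples, text in utterances:
        print(f"{rel_path}\t{n_samples}", file=tsv)    # tab-separated manifest entry
        print(text, file=wrd)                          # word-level transcript
        # letter targets: space-separated characters with "|" marking word boundaries
        print(" ".join(list(text.replace(" ", "|"))) + " |", file=ltr)
```

Together with the letter dictionary above, this gives the directory layout the fine-tuning configs expect (`train.tsv`, `train.ltr`, and `dict.ltr.txt` side by side); adjust the sketch if your labels follow a different convention.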
@@ -97,33 +159,28 @@ $ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $sp ``` Fine-tuning on 100h of Librispeech with letter targets: + ```shell script -valid_subset=dev_other -python train.py --distributed-world-size 24 --distributed-port $PORT /path/to/training_data --save-dir /model/path --fp16 \ ---wer-args '("/path/to/lm/4-gram.bin","/path/to/lexicon",2,-1)' \ ---post-process letter --valid-subset $valid_subset --no-epoch-checkpoints --best-checkpoint-metric wer --num-workers 4 \ ---max-update 80000 --sentence-avg --task audio_pretraining --arch wav2vec_ctc --w2v-path /path/to/pretrained/model \ ---labels ltr --apply-mask --mask-selection static --mask-other 0 --mask-length 10 --mask-prob 0.5 --layerdrop 0.1 \ ---mask-channel-selection static --mask-channel-other 0 --mask-channel-length 64 --mask-channel-prob 0.5 --zero-infinity \ ---feature-grad-mult 0.0 --freeze-finetune-updates 10000 --validate-after-updates 10000 --optimizer adam \ ---adam-betas '(0.9, 0.98)' --adam-eps 1e-08 --lr 2e-05 --lr-scheduler tri_stage --warmup-steps 8000 --hold-steps 32000 \ ---decay-steps 40000 --final-lr-scale 0.05 --final-dropout 0.0 --dropout 0.0 --activation-dropout 0.1 --criterion ctc \ ---attention-dropout 0.0 --max-tokens 1280000 --seed 2337 --log-format json --log-interval 500 --ddp-backend no_c10d +$ fairseq-hydra-train \ + distributed_training.distributed_port=$PORT \ + task.data=/path/to/data \ + model.w2v_path=/path/to/model.pt \ + --config-dir /path/to/fairseq-py/examples/wav2vec/config/finetuning \ + --config-name base_100h ``` -Note: you can simulate 24 GPUs by using k GPUs and setting --update-freq 24/k +There are other config files in the config/finetuning directory that can be used to fine-tune on other splits. +You can specify the right config via the `--config-name` parameter. -Decoding with a language model during training requires wav2letter [python bindings](https://github.com/facebookresearch/wav2letter/wiki/Building-Python-bindings). -Alternatively, simply omit the --wer-args flag. +Note: you can simulate 24 GPUs by using k GPUs and adding command line parameters (before `--config-dir`) +`distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 24/k -For hyper-parameters to fine-tune other Librispeech splits (10 minutes, 1 hour, etc) please refer to the table in Appendix B in the wav2vec 2.0 paper. -The main changes to make are adjusting --max-update, and then adjusting --warmup-steps, --hold-steps, and --decay steps so that they use 0.1/0.4/0.5 of max-update respectively. You then need to adjust --mask-prob and --mask-channel-prob. This should be set to the mask-length * x where x is the number in the table and mask-length is what you use for --mask-length (10 in this example. Use --mask-channel-length value for --mask-channel-prob). +Decoding with a language model during training requires flashlight [python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter). +If you want to use a language model, add `+criterion.wer_args='[/path/to/kenlm, /path/to/lexicon, 2, -1]'` to the command line. -For example, for 10 hours, we see in the paper that timestep mask prob should be 0.065, so we set --mask-prob to 10* 0.065 = 0.65. channel mask prob is 0.004, so we set it to 64 * 0.004 = 0.256. 
then we set --max-updates to 20000 and change --warmup-steps to 20000 * 0.1 = 2000, --hold-steps to 8000 and --decay-steps to 10000. +### Evaluating a CTC model -### Evaluating a CTC model: - -Evaluating a CTC model with a language model requires wav2letter [python bindings](https://github.com/facebookresearch/wav2letter/wiki/Building-Python-bindings) to be installed. +Evaluating a CTC model with a language model requires [flashlight python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter) to be installed. Fairseq transformer language model used in the wav2vec 2.0 paper can be obtained from the [wav2letter model repository](https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019). Be sure to upper-case the language model vocab after downloading it. @@ -134,7 +191,7 @@ Next, run the evaluation command: ```shell script $subset=dev_other -python examples/speech_recognition/infer.py /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw --task audio_pretraining \ +python examples/speech_recognition/infer.py /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw --task audio_finetuning \ --nbest 1 --path /path/to/model --gen-subset $subset --results-path /path/to/save/results/for/sclite --w2l-decoder kenlm \ --lm-model /path/to/kenlm.bin --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 \ --post-process letter @@ -142,6 +199,58 @@ python examples/speech_recognition/infer.py /checkpoint/abaevski/data/speech/lib To get raw numbers, use --w2l-decoder viterbi and omit the lexicon. To use the transformer language model, use --w2l-decoder fairseqlm. +## Use wav2vec 2.0 with 🤗Transformers + +Wav2Vec2 is also available in the [🤗Transformers library](https://github.com/huggingface/transformers) since version 4.4. + +Pretrained Models can be found on the [hub](https://huggingface.co/models?filter=wav2vec2) +and documentation can be found [here](https://huggingface.co/transformers/master/model_doc/wav2vec2.html). + +Usage example: + +```python +# !pip install transformers +# !pip install datasets +import soundfile as sf +import torch +from datasets import load_dataset +from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor + +# load pretrained model +processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") +model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") + + +librispeech_samples_ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + +# load audio +audio_input, sample_rate = sf.read(librispeech_samples_ds[0]["file"]) + +# pad input values and return pt tensor +input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values + +# INFERENCE + +# retrieve logits & take argmax +logits = model(input_values).logits +predicted_ids = torch.argmax(logits, dim=-1) + +# transcribe +transcription = processor.decode(predicted_ids[0]) + +# FINE-TUNE + +target_transcription = "A MAN SAID TO THE UNIVERSE I EXIST" + +# encode labels +with processor.as_target_processor(): + labels = processor(target_transcription, return_tensors="pt").input_ids + +# compute loss by passing labels +loss = model(input_values, labels=labels).loss +loss.backward() +``` + # wav2vec Example to train a wav2vec model as described in [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](https://arxiv.org/abs/1904.05862). 
@@ -152,14 +261,15 @@ Description | Dataset | Model ---|---|--- Wav2Vec large | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt) -#### Example usage: +#### Example usage + ```python import torch -from fairseq.models.wav2vec import Wav2VecModel +import fairseq -cp = torch.load('/path/to/wav2vec.pt') -model = Wav2VecModel.build_model(cp['args'], task=None) -model.load_state_dict(cp['model']) +cp_path = '/path/to/wav2vec.pt' +model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path]) +model = model[0] model.eval() wav_input_16khz = torch.randn(1,10000) @@ -171,24 +281,79 @@ c = model.feature_aggregator(z) Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length) -### Prepare training data manifest: +### Prepare training data manifest ``` -$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav +python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav ``` -### Train a wav2vec model: +### Train a wav2vec model ``` $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \ ---arch wav2vec --task audio_pretraining --lr 1e-06 --min-lr 1e-09 --optimizer adam --max-lr 0.005 --lr-scheduler cosine \ +--arch wav2vec --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --lr 0.005 --lr-scheduler cosine \ --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \ --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ --skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \ --max-sample-size 150000 --max-tokens 1500000 --skip-invalid-size-inputs-valid-test ``` -### Extract embeddings from the downstream task data: +### Run wav2vec2 pre-training on Google Cloud TPUs + +Wav2Vec2 is now supported on TPUs! It's currently pre-training only. + +#### Using hydra on a v3-8 + +``` +$ OMP_NUM_THREADS=1 fairseq-hydra-train \ + task.data=/manifest/path \ + --config-dir /PATH/TO/FAIRSEQ/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_large_librivox_tpu.yaml +``` + +#### Using command line arguments on a v3-8 + +Note: Commandline arguments way of execution has a [known-problem](https://github.com/pytorch/fairseq/issues/3741) currently. 
+ +``` +$ OMP_NUM_THREADS=1 python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \ +--arch wav2vec2 --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --lr 0.005 --lr-scheduler cosine \ +--conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \ +--conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ +--skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \ +--max-sample-size 150000 --max-tokens 1500000 --skip-invalid-size-inputs-valid-test \ +--tpu --distributed-world-size 8 --num-batch-buckets 3 --enable-padding \ +--encoder-layerdrop 0 --mask-channel-prob 0.1 +``` + +#### Using hydra on a pod slice (v3-N with N > 8) + +``` +$ OMP_NUM_THREADS=1 fairseq-hydra-train \ + task.data=/manifest/path \ + --config-dir /PATH/TO/FAIRSEQ/examples/wav2vec/config/pretraining \ + --config-name wav2vec2_large_librivox_tpu-pod.yaml # edit distributed-world-size accordingly +``` + +#### Using command line arguments on a pod slice (v3-N with N > 8) + +Note: Commandline arguments way of execution has a [known-problem](https://github.com/pytorch/fairseq/issues/3741) currently. + +``` +$ python -m torch_xla.distributed.xla_dist \ + --tpu ${TPUNAME} --conda-env=torch-xla-${TORCH_XLA_VERSION} --env OMP_NUM_THREADS=1 \ + -- \ +python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \ +--arch wav2vec2 --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --lr 0.005 --lr-scheduler cosine \ +--conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \ +--conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ +--skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \ +--max-sample-size 150000 --max-tokens 1500000 --skip-invalid-size-inputs-valid-test \ +--tpu --distributed-world-size ${WORLD_SIZE} --num-batch-buckets 3 --enable-padding \ +--encoder-layerdrop 0 --mask-channel-prob 0.1 +``` + +### Extract embeddings from the downstream task data ``` $ PYTHONPATH=/path/to/fairseq python examples/wav2vec/wav2vec_featurize.py --input /path/to/task/waves --output /path/to/output \ @@ -209,14 +374,15 @@ vq-wav2vec Gumbel | [Librispeech](http://www.openslr.org/12) | [download](https: vq-wav2vec K-means | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/vq-wav2vec_kmeans.pt) Roberta on K-means codes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/bert_kmeans.tar) -#### Example usage: +#### Example usage + ```python import torch -from fairseq.models.wav2vec import Wav2VecModel +import fairseq cp = torch.load('/path/to/vq-wav2vec.pt') -model = Wav2VecModel.build_model(cp['args'], task=None) -model.load_state_dict(cp['model']) +model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp]) +model = model[0] model.eval() wav_input_16khz = 
torch.randn(1,10000) @@ -229,18 +395,18 @@ print(idxs.shape) # output: torch.Size([1, 60, 2]), 60 timesteps with 2 indexes Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate file 10 to 30 seconds in length) -### Prepare training data manifest: +### Prepare training data manifest ``` -$ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav +python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav ``` -### Train a gumbel vq-wav2vec model: +### Train a gumbel vq-wav2vec model ``` $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 \ ---save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --lr 1e-06 --min-lr 1e-09 \ ---optimizer adam --max-lr 1e-05 --lr-scheduler cosine \ +--save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 \ +--optimizer adam --lr 1e-05 --lr-scheduler cosine \ --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)] \ --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ --activation gelu --offset auto --skip-connections-agg --residual-scale 0.5 \ @@ -252,7 +418,7 @@ $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 - for k-means training, set vq-type with "kmeans" and add --loss-weights [1] argument. Pre-trained models were trained on 16 GPUs. -### Tokenize audio data (e.g. for BERT training): +### Tokenize audio data (e.g. for BERT training) ``` $ PYTHONPATH=/path/to/fairseq python examples/wav2vec/vq-wav2vec_featurize.py --data-dir /manifest/path --output-dir /path/to/output \ diff --git a/examples/wav2vec/__init__.py b/examples/wav2vec/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/wav2vec/config/finetuning/base_100h.yaml b/examples/wav2vec/config/finetuning/base_100h.yaml new file mode 100644 index 0000000000..153b5df170 --- /dev/null +++ b/examples/wav2vec/config/finetuning/base_100h.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: false + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 3200000 + skip_invalid_size_inputs_valid_test: true + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 2 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 80000 + lr: [0.00003] + sentence_avg: true + update_freq: [4] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.65 + mask_channel_prob: 0.5 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 0 diff --git a/examples/wav2vec/config/finetuning/base_10h.yaml b/examples/wav2vec/config/finetuning/base_10h.yaml new file mode 100644 index 0000000000..5044518025 --- /dev/null +++ b/examples/wav2vec/config/finetuning/base_10h.yaml @@ -0,0 +1,63 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval: 50 + save_interval_updates: 10000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: false + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 3200000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 50 + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 2 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 20000 + lr: [0.00005] + sentence_avg: true + update_freq: [4] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.65 + mask_channel_prob: 0.5 + mask_channel_length: 64 + layerdrop: 0.05 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/base_10m.yaml b/examples/wav2vec/config/finetuning/base_10m.yaml new file mode 100644 index 0000000000..14abc013bd --- /dev/null +++ b/examples/wav2vec/config/finetuning/base_10m.yaml @@ -0,0 +1,63 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval: 1000 + save_interval_updates: 50 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: false + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 3200000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 1000 + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 2 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 13000 + lr: [0.00005] + sentence_avg: true + update_freq: [4] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.65 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/base_1h.yaml b/examples/wav2vec/config/finetuning/base_1h.yaml new file mode 100644 index 0000000000..a0af1cfad7 --- /dev/null +++ b/examples/wav2vec/config/finetuning/base_1h.yaml @@ -0,0 +1,63 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval: 50 + save_interval_updates: 1000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? 
+ normalize: false + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 3200000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 1000 + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 2 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 13000 + lr: [0.00005] + sentence_avg: true + update_freq: [4] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.65 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/base_960h.yaml b/examples/wav2vec/config/finetuning/base_960h.yaml new file mode 100644 index 0000000000..3eadc36b37 --- /dev/null +++ b/examples/wav2vec/config/finetuning/base_960h.yaml @@ -0,0 +1,57 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: false + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 3200000 + skip_invalid_size_inputs_valid_test: true + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 320000 + lr: [0.0001] + sentence_avg: true + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.5 + mask_channel_prob: 0.1 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 0 diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_1.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_1.yaml new file mode 100644 index 0000000000..4a848435c1 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_1.yaml @@ -0,0 +1,26 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 \ No newline at end of file diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_16.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_16.yaml new file mode 100644 index 0000000000..041843a9b9 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_16.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 16 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 + exclude: learnfair1381,learnfair5192,learnfair2304 \ No newline at end of file diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_1_aws.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000..b9335df782 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_1_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_1_old.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_1_old.yaml new file mode 100644 index 0000000000..a8d2363dc5 --- /dev/null +++ 
b/examples/wav2vec/config/finetuning/run_config/slurm_1_old.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_wav2vec3_small_librispeech + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 + exclude: learnfair1381 \ No newline at end of file diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_2.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_2.yaml new file mode 100644 index 0000000000..65ec48920d --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_2.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 + exclude: learnfair7491,learnfair7477,learnfair7487 \ No newline at end of file diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_2_aws.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000..e7590efc0a --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_2_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_2g.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_2g.yaml new file mode 100644 index 0000000000..aaa20ebd03 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_2g.yaml @@ -0,0 +1,26 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: 
/checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 2 + tasks_per_node: 2 + mem_gb: 200 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_3.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_3.yaml new file mode 100644 index 0000000000..9614ececae --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_3.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 + exclude: learnfair7491,learnfair7477,learnfair7487 \ No newline at end of file diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_4g.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_4g.yaml new file mode 100644 index 0000000000..c0c9f60436 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_4g.yaml @@ -0,0 +1,26 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 200 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_4g_aws.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_4g_aws.yaml new file mode 100644 index 0000000000..6bbbf3b646 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_4g_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 4 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: 
wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/run_config/slurm_8.yaml b/examples/wav2vec/config/finetuning/run_config/slurm_8.yaml new file mode 100644 index 0000000000..984f218885 --- /dev/null +++ b/examples/wav2vec/config/finetuning/run_config/slurm_8.yaml @@ -0,0 +1,26 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 400 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_100h.yaml b/examples/wav2vec/config/finetuning/vox_100h.yaml new file mode 100644 index 0000000000..b8f81e5e18 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_100h.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 80000 + lr: [0.00003] + sentence_avg: true + update_freq: [5] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.5 + mask_channel_prob: 0.5 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/vox_100h_2.yaml b/examples/wav2vec/config/finetuning/vox_100h_2.yaml new file mode 100644 index 0000000000..9bf588f587 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_100h_2.yaml @@ -0,0 +1,106 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 1 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: 0 + wer_sil_weight: -2 + +optimization: + max_update: 100000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_100h_2_aws.yaml b/examples/wav2vec/config/finetuning/vox_100h_2_aws.yaml new file mode 100644 index 0000000000..3a0d517ebb --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_100h_2_aws.yaml @@ -0,0 +1,82 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/libri/100h/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 1 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin + wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: 0 + wer_sil_weight: -2 + +optimization: + max_update: 100000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 82000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 7 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + diff --git a/examples/wav2vec/config/finetuning/vox_100h_3.yaml b/examples/wav2vec/config/finetuning/vox_100h_3.yaml new file mode 100644 index 0000000000..46778666f6 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_100h_3.yaml @@ -0,0 +1,101 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 1 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 + +optimization: + max_update: 100000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 8000 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.4 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_10h.yaml b/examples/wav2vec/config/finetuning/vox_10h.yaml new file mode 100644 index 0000000000..8f1ca71ee2 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10h.yaml @@ -0,0 +1,63 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval: 50 + save_interval_updates: 10000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? 
+ normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 50 + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 20000 + lr: [0.0001] + sentence_avg: true + update_freq: [5] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.75 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/vox_10h_2.yaml b/examples/wav2vec/config/finetuning/vox_10h_2.yaml new file mode 100644 index 0000000000..05ee76f147 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10h_2.yaml @@ -0,0 +1,102 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 10 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + keep_interval_updates: 1 + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 10 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 + +optimization: + max_update: 60000 + lr: [2e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 8000 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.5 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_10h_2_aws.yaml b/examples/wav2vec/config/finetuning/vox_10h_2_aws.yaml new file mode 100644 index 0000000000..a0afc9c5d5 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10h_2_aws.yaml @@ -0,0 +1,81 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 10 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 10 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin + wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: 4 + wer_sil_weight: -5 + +optimization: + max_update: 60000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.75 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 diff --git a/examples/wav2vec/config/finetuning/vox_10h_aws.yaml b/examples/wav2vec/config/finetuning/vox_10h_aws.yaml new file mode 100644 index 0000000000..c754373657 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10h_aws.yaml @@ -0,0 +1,104 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 10 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 10 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter +# wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin +# wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst +# wer_lm_weight: 2.0 +# wer_word_score: -1.0 + +optimization: + max_update: 60000 + lr: [2e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_10h_aws_v100.yaml b/examples/wav2vec/config/finetuning/vox_10h_aws_v100.yaml new file mode 100644 index 0000000000..58ad2acf71 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10h_aws_v100.yaml @@ -0,0 +1,102 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 +# tensorboard_logdir: tb + +checkpoint: + save_interval: 10 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx/abaevski/data/libri/10h/wav2vec/raw + labels: ltr + cache_in_scratch: true + + +dataset: + num_workers: 10 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 10 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_lexicon: /fsx/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 + +optimization: + max_update: 60000 + lr: [2e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 72000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.6 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /fsx/${env:USER}/w2v_ft/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: learnfair + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_10m.yaml b/examples/wav2vec/config/finetuning/vox_10m.yaml new file mode 100644 index 0000000000..07e327fe74 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10m.yaml @@ -0,0 +1,63 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval: 1000 + save_interval_updates: 50 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 1000 + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 13000 + lr: [0.0001] + sentence_avg: true + update_freq: [5] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.65 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/vox_10m_2.yaml b/examples/wav2vec/config/finetuning/vox_10m_2.yaml new file mode 100644 index 0000000000..1ac7c1217f --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10m_2.yaml @@ -0,0 +1,114 @@ +# @package _group_ + +common: + fp16: true + fp16_no_flatten_grads: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 500 + save_interval_updates: 500 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/10m/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 500 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 5 + wer_word_score: 2 + wer_sil_weight: -2 + +optimization: + max_update: 10000 + lr: [2e-6] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [4] # base 10h we -> 2/4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 2e-6 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + lr_scheduler: + _name: cosine + warmup_updates: 1000 + +lr_scheduler: pass_through + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 3 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + freeze_finetune_updates: 100 + + zero_mask: true + feature_grad_mult: 0.0 + activation_dropout: 0.1 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + update_alibi: false + +#hydra: +# job: +# config: +# override_dirname: +# kv_sep: ':' +# item_sep: '__' +# exclude_keys: +# - run_config +# - distributed_training.distributed_port +# sweep: +# dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} +# subdir: ${hydra.job.num} +# launcher: +# submitit_folder: ${hydra.sweep.dir} +# timeout_min: 3000 +# cpus_per_task: 10 +# gpus_per_node: 4 +# tasks_per_node: 4 +# mem_gb: 250 +# nodes: 1 +# name: ${env:PREFIX}_${hydra.job.config_name} +# partition: devlab,learnlab,learnfair,scavenge +# constraint: volta32gb +# max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_10m_2_aws.yaml b/examples/wav2vec/config/finetuning/vox_10m_2_aws.yaml new file mode 100644 index 0000000000..a9c270855b --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10m_2_aws.yaml @@ -0,0 +1,114 @@ +# @package _group_ + +common: + fp16: true + fp16_no_flatten_grads: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 500 + save_interval_updates: 500 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/libri/10m/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 500 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin + wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 5 + wer_word_score: 2 + wer_sil_weight: -2 + +optimization: + max_update: 10000 + lr: [2e-6] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [4] # base 10h we -> 2/4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 2e-6 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + lr_scheduler: + _name: cosine + warmup_updates: 1000 + +lr_scheduler: pass_through + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 3 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + freeze_finetune_updates: 100 + + zero_mask: true + feature_grad_mult: 0.0 + activation_dropout: 0.1 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + update_alibi: false + +#hydra: +# job: +# config: +# override_dirname: +# kv_sep: ':' +# item_sep: '__' +# exclude_keys: +# - run_config +# - distributed_training.distributed_port +# sweep: +# dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} +# subdir: ${hydra.job.num} +# launcher: +# submitit_folder: ${hydra.sweep.dir} +# timeout_min: 3000 +# cpus_per_task: 10 +# gpus_per_node: 4 +# tasks_per_node: 4 +# mem_gb: 250 +# nodes: 1 +# name: ${env:PREFIX}_${hydra.job.config_name} +# partition: devlab,learnlab,learnfair,scavenge +# constraint: volta32gb +# max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_10m_3.yaml b/examples/wav2vec/config/finetuning/vox_10m_3.yaml new file mode 100644 index 0000000000..b6804126cf --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_10m_3.yaml @@ -0,0 +1,105 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1000 + save_interval_updates: 100 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/10m/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 500 + valid_subset: dev_other + required_batch_size_multiple: 8 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 8 + wer_word_score: 5.8 + wer_sil_weight: -8 + +optimization: + max_update: 13000 + lr: [2e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [5] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.65 + mask_length: 10 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_1h.yaml b/examples/wav2vec/config/finetuning/vox_1h.yaml new file mode 100644 index 0000000000..fac1bbb32f --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_1h.yaml @@ -0,0 +1,63 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval: 1000 + save_interval_updates: 50 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 1000 + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 13000 + lr: [0.0003] + sentence_avg: true + update_freq: [5] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.75 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/vox_1h_2.yaml b/examples/wav2vec/config/finetuning/vox_1h_2.yaml new file mode 100644 index 0000000000..75f4aafd71 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_1h_2.yaml @@ -0,0 +1,104 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 100 + save_interval_updates: 500 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 100 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 6 + wer_word_score: -0.1 + wer_sil_weight: -4.7 + +optimization: + max_update: 60000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 4000 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.65 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_1h_2_aws.yaml b/examples/wav2vec/config/finetuning/vox_1h_2_aws.yaml new file mode 100644 index 0000000000..cc4d511d14 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_1h_2_aws.yaml @@ -0,0 +1,114 @@ +# @package _group_ + +common: + fp16: true + fp16_no_flatten_grads: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 100 + save_interval_updates: 500 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 500 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin + wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 5 + wer_word_score: 0 + wer_sil_weight: -4 + +optimization: + max_update: 10000 + lr: [2e-6] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [4] # base 10h we -> 2/4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 2e-6 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + lr_scheduler: + _name: cosine + warmup_updates: 1000 + +lr_scheduler: pass_through + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 3 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + freeze_finetune_updates: 100 + + zero_mask: true + feature_grad_mult: 0.0 + activation_dropout: 0.1 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + update_alibi: false + +#hydra: +# job: +# config: +# override_dirname: +# kv_sep: ':' +# item_sep: '__' +# exclude_keys: +# - run_config +# - distributed_training.distributed_port +# sweep: +# dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} +# subdir: ${hydra.job.num} +# launcher: +# submitit_folder: ${hydra.sweep.dir} +# timeout_min: 3000 +# cpus_per_task: 10 +# gpus_per_node: 4 +# tasks_per_node: 4 +# mem_gb: 250 +# nodes: 1 +# name: ${env:PREFIX}_${hydra.job.config_name} +# partition: devlab,learnlab,learnfair,scavenge +# constraint: volta32gb +# max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_1h_3.yaml b/examples/wav2vec/config/finetuning/vox_1h_3.yaml new file mode 100644 index 0000000000..842c89717e --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_1h_3.yaml @@ -0,0 +1,104 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 100 + save_interval_updates: 500 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 640000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 100 + valid_subset: dev_other + required_batch_size_multiple: 8 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 6 + wer_word_score: -0.1 + wer_sil_weight: -4.7 + +optimization: + max_update: 13000 + lr: [6e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [5] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 4000 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.3 + mask_length: 3 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_1h_4.yaml b/examples/wav2vec/config/finetuning/vox_1h_4.yaml new file mode 100644 index 0000000000..698ed8c4da --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_1h_4.yaml @@ -0,0 +1,104 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 100 + save_interval_updates: 1000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 640000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 100 + valid_subset: dev_other + required_batch_size_multiple: 8 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 + +optimization: + max_update: 13000 + lr: [6e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [5] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.65 + mask_length: 10 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_1h_aws.yaml b/examples/wav2vec/config/finetuning/vox_1h_aws.yaml new file mode 100644 index 0000000000..aa6700415b --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_1h_aws.yaml @@ -0,0 +1,80 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 100 + save_interval_updates: 500 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/libri/10m/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval: 100 + valid_subset: dev_other + required_batch_size_multiple: 8 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin + wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 5 + wer_word_score: -0.1 + wer_sil_weight: -4.7 + +optimization: + max_update: 13000 + lr: [6e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [5] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 4000 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.3 + mask_length: 3 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.25 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + update_alibi: false diff --git a/examples/wav2vec/config/finetuning/vox_960h.yaml b/examples/wav2vec/config/finetuning/vox_960h.yaml new file mode 100644 index 0000000000..9d72404fa3 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_960h.yaml @@ -0,0 +1,57 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? 
+ normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + valid_subset: dev_other + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 24 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: 320000 + lr: [0.00003] + sentence_avg: true + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.5 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 diff --git a/examples/wav2vec/config/finetuning/vox_960h_2.yaml b/examples/wav2vec/config/finetuning/vox_960h_2.yaml new file mode 100644 index 0000000000..d96e2325be --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_960h_2.yaml @@ -0,0 +1,105 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/960h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 1 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 16 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 + +optimization: + max_update: 200000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 200000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.4 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/finetuning/vox_960h_2_aws.yaml b/examples/wav2vec/config/finetuning/vox_960h_2_aws.yaml new file mode 100644 index 0000000000..41d2b38f85 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_960h_2_aws.yaml @@ -0,0 +1,82 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /data/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /fsx-wav2vec/abaevski/data/librispeech + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 1 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 16 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin + wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 1.5 + wer_word_score: 0 + wer_sil_weight: -1 + +optimization: + max_update: 200000 + lr: [2e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: null + warmup_steps: 8000 + hold_steps: 0 + decay_steps: 192000 + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? 
+ apply_mask: true + mask_prob: 0.3 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + diff --git a/examples/wav2vec/config/finetuning/vox_960h_3.yaml b/examples/wav2vec/config/finetuning/vox_960h_3.yaml new file mode 100644 index 0000000000..ef6597aa67 --- /dev/null +++ b/examples/wav2vec/config/finetuning/vox_960h_3.yaml @@ -0,0 +1,101 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec +# tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw + labels: ltr + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 100 + validate_interval: 1 + valid_subset: dev_other + required_batch_size_multiple: 1 + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 16 + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin + wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst + wer_lm_weight: 2.0 + wer_word_score: -1.0 + +optimization: + max_update: 200000 + lr: [1e-5] +# lr: [1e-5] # base 10h wer + sentence_avg: true + update_freq: [1] # base 10h we -> 2/4 + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: cosine + warmup_updates: 8000 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.4 + mask_length: 5 +# mask_prob: 0.65 # base 10h wer + mask_channel_prob: 0.1 +# mask_channel_prob: 0.6 # base 10h wer + mask_channel_length: 64 + layerdrop: 0.1 +# layerdrop: 0.05 # base 10h wer + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 100 + dropout: 0 + final_dropout: 0 + attention_dropout: 0 + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 4 + tasks_per_node: 4 + mem_gb: 250 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml b/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml new file mode 100644 index 0000000000..b686e21ab1 --- /dev/null +++ b/examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml @@ -0,0 +1,57 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? 
+ max_sample_size: 250000 + min_sample_size: 32000 + normalize: false + +dataset: + num_workers: 6 + max_tokens: 1400000 + skip_invalid_size_inputs_valid_test: true + +distributed_training: + distributed_world_size: 64 + ddp_backend: legacy_ddp + +criterion: + _name: wav2vec + infonce: true + log_keys: ["prob_perplexity","code_perplexity","temp"] + loss_weights: [0.1, 10] + +optimization: + max_update: 400000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: wav2vec2 + quantize_targets: true + final_dim: 256 + encoder_layerdrop: 0.05 + dropout_input: 0.1 + dropout_features: 0.1 + feature_grad_mult: 0.1 + encoder_embed_dim: 768 diff --git a/examples/wav2vec/config/pretraining/wav2vec2_conformer_base_librispeech.yaml b/examples/wav2vec/config/pretraining/wav2vec2_conformer_base_librispeech.yaml new file mode 100644 index 0000000000..912ac152fd --- /dev/null +++ b/examples/wav2vec/config/pretraining/wav2vec2_conformer_base_librispeech.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? + max_sample_size: 250000 + min_sample_size: 32000 + normalize: false + +dataset: + num_workers: 6 + max_tokens: 1400000 + skip_invalid_size_inputs_valid_test: true + +distributed_training: + distributed_world_size: 64 + ddp_backend: legacy_ddp + +criterion: + _name: wav2vec + infonce: true + log_keys: ["prob_perplexity","code_perplexity","temp"] + loss_weights: [0.1, 10] + +optimization: + max_update: 400000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: wav2vec2 + quantize_targets: true + final_dim: 256 + encoder_layerdrop: 0.05 + dropout_input: 0.1 + dropout_features: 0.1 + feature_grad_mult: 0.1 + encoder_embed_dim: 768 + layer_type: conformer + attn_type: espnet + pos_enc_type: rel_pos diff --git a/examples/wav2vec/config/pretraining/wav2vec2_conformer_large_librivox.yaml b/examples/wav2vec/config/pretraining/wav2vec2_conformer_large_librivox.yaml new file mode 100644 index 0000000000..676166b6b7 --- /dev/null +++ b/examples/wav2vec/config/pretraining/wav2vec2_conformer_large_librivox.yaml @@ -0,0 +1,72 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? 
+ max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + +dataset: + num_workers: 6 + max_tokens: 1200000 + skip_invalid_size_inputs_valid_test: true + +distributed_training: + distributed_world_size: 128 + ddp_backend: legacy_ddp + +criterion: + _name: wav2vec + infonce: true + log_keys: ["prob_perplexity","code_perplexity","temp"] + loss_weights: [0.1, 0] + +optimization: + max_update: 1000000 + lr: [0.005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: wav2vec2 + quantize_targets: true + extractor_mode: layer_norm + layer_norm_first: true + final_dim: 768 + latent_temp: [2.0,0.1,0.999995] + encoder_layerdrop: 0.00 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + conv_bias: true + + encoder_layers: 24 + encoder_embed_dim: 1024 + encoder_ffn_embed_dim: 4096 + encoder_attention_heads: 16 + + feature_grad_mult: 1.0 + + layer_type: conformer + attn_type: espnet + pos_enc_type: rel_pos diff --git a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml new file mode 100644 index 0000000000..3192ce4cba --- /dev/null +++ b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml @@ -0,0 +1,70 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + +dataset: + batch_size: 4 + num_workers: 6 + max_tokens: 1200000 + skip_invalid_size_inputs_valid_test: true + +distributed_training: + distributed_world_size: 128 + ddp_backend: legacy_ddp + +criterion: + _name: wav2vec + infonce: true + log_keys: ["prob_perplexity","code_perplexity","temp"] + loss_weights: [0.1, 0] + +optimization: + max_update: 1000000 + lr: [0.005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: wav2vec2 + quantize_targets: true + extractor_mode: layer_norm + layer_norm_first: true + final_dim: 768 + latent_temp: [2.0,0.1,0.999995] + encoder_layerdrop: 0.00 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + conv_bias: true + + encoder_layers: 24 + encoder_embed_dim: 1024 + encoder_ffn_embed_dim: 4096 + encoder_attention_heads: 16 + + feature_grad_mult: 1.0 + diff --git a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu-pod.yaml b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu-pod.yaml new file mode 100644 index 0000000000..ff35a95b65 --- /dev/null +++ b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu-pod.yaml @@ -0,0 +1,72 @@ +# @package _group_ + +common: + tpu: true + fp16: false + log_format: json + log_interval: 10 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? 
+ max_sample_size: 250000 + min_sample_size: 32000 + normalize: true + num_batch_buckets: 3 + precompute_mask_indices: true + enable_padding: true + +dataset: + num_workers: 6 + max_tokens: 1200000 + skip_invalid_size_inputs_valid_test: true + +distributed_training: + distributed_world_size: 128 + ddp_backend: legacy_ddp + +criterion: + _name: wav2vec + infonce: true + log_keys: ["prob_perplexity","code_perplexity","temp"] + loss_weights: [0.1, 0] + +optimization: + max_update: 1000000 + lr: [0.005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: wav2vec2 + quantize_targets: true + extractor_mode: layer_norm + layer_norm_first: true + final_dim: 768 + latent_temp: [2.0,0.1,0.999995] + encoder_layerdrop: 0.00 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + conv_bias: true + + encoder_layers: 24 + encoder_embed_dim: 1024 + encoder_ffn_embed_dim: 4096 + encoder_attention_heads: 16 + + feature_grad_mult: 1.0 diff --git a/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu.yaml b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu.yaml new file mode 100644 index 0000000000..ee55bdab72 --- /dev/null +++ b/examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu.yaml @@ -0,0 +1,77 @@ +# @package _group_ + +common: + tpu: true + fp16: false + log_format: json + log_interval: 10 + +checkpoint: + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? + max_sample_size: 250000 + min_sample_size: 32000 + normalize: true + num_batch_buckets: 3 + precompute_mask_indices: true + enable_padding: true + inferred_w2v_config: + mask_prob: 0.65 + mask_selection: 'static' + mask_other: 0 + mask_channel_prob: 0.1 + +dataset: + num_workers: 6 + max_tokens: 1200000 + skip_invalid_size_inputs_valid_test: true + +distributed_training: + distributed_world_size: 8 + ddp_backend: legacy_ddp + +criterion: + _name: wav2vec + infonce: true + log_keys: ["prob_perplexity","code_perplexity","temp"] + loss_weights: [0.1, 0] + +optimization: + max_update: 1000000 + lr: [0.005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 32000 + +model: + _name: wav2vec2 + quantize_targets: true + extractor_mode: layer_norm + layer_norm_first: true + final_dim: 768 + latent_temp: [2.0,0.1,0.999995] + encoder_layerdrop: 0.00 + dropout_input: 0.0 + dropout_features: 0.0 + dropout: 0.0 + attention_dropout: 0.0 + conv_bias: true + + encoder_layers: 24 + encoder_embed_dim: 1024 + encoder_ffn_embed_dim: 4096 + encoder_attention_heads: 16 + + feature_grad_mult: 1.0 diff --git a/examples/wav2vec/libri_labels.py b/examples/wav2vec/libri_labels.py index 3fa1ec4c8b..694a202604 100644 --- a/examples/wav2vec/libri_labels.py +++ b/examples/wav2vec/libri_labels.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. 
""" -Helper script to pre-compute embeddings for a wav2letter++ dataset +Helper script to pre-compute embeddings for a flashlight (previously called wav2letter++) dataset """ import argparse diff --git a/examples/wav2vec/scripts/binarize_manifest.sh b/examples/wav2vec/scripts/binarize_manifest.sh new file mode 100644 index 0000000000..6f201bdb52 --- /dev/null +++ b/examples/wav2vec/scripts/binarize_manifest.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# usage: bash binarize_manifest <dest_dir> <train_split> <valid_split> + +DEST_DIR=$1 +TRAIN_SPLIT=$2 +VALID_SPLIT=$3 +FAIRSEQ_ROOT=$4 + +mkdir -p $DEST_DIR + +# split file path and lengths into separate files +cut -f1 $TRAIN_SPLIT.tsv > $DEST_DIR/train_fnames.txt +cut -f1 $VALID_SPLIT.tsv > $DEST_DIR/valid_fnames.txt +cut -f2 $TRAIN_SPLIT.tsv > $DEST_DIR/train.lengths +cut -f2 $VALID_SPLIT.tsv > $DEST_DIR/valid.lengths + +# copy root directory +head -1 $TRAIN_SPLIT.tsv > $DEST_DIR/train.root +head -1 $VALID_SPLIT.tsv > $DEST_DIR/valid.root + +# remove root directory +sed -i '1d' $DEST_DIR/train_fnames.txt +sed -i '1d' $DEST_DIR/valid_fnames.txt +sed -i '1d' $DEST_DIR/train.lengths +sed -i '1d' $DEST_DIR/valid.lengths + +# insert spaces between characters +sed -i -e 's/\(.\)/\1 /g' $DEST_DIR/train_fnames.txt +sed -i -e 's/\(.\)/\1 /g' $DEST_DIR/valid_fnames.txt + +# run preprocessor +PYTHONPATH=$FAIRSEQ_ROOT python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $DEST_DIR/train_fnames.txt --validpref $DEST_DIR/valid_fnames.txt --workers 60 --only-source --destdir $DEST_DIR diff --git a/examples/wav2vec/unsupervised/README.md b/examples/wav2vec/unsupervised/README.md new file mode 100644 index 0000000000..b9d6f6762c --- /dev/null +++ b/examples/wav2vec/unsupervised/README.md @@ -0,0 +1,119 @@ +# wav2vec Unsupervised (wav2vec-U) + +Wav2vec Unsupervised (wav2vec-U) and the 2.0 version are frameworks for building speech recognition systems without any labeled training data as described in [Unsupervised Speech Recognition (Baevski et al., 2021)](https://ai.facebook.com/research/publications/unsupervised-speech-recognition) and [Towards End-to-end Unsupervised Speech Recognition (Liu, et al., 2022)](https://arxiv.org/abs/2204.02492). The model takes as input wav2vec 2.0 or XLSR representations (see [pretrained models](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec)) as well as unlabeled speech and text data. + + The training procedure consists of three consecutive main steps: +* Preparation of speech representations and text data +* Generative adversarial training (GAN) +* Iterative self-training + Kaldi LM-decoding + +## Preparation of speech and text data +Similar to [wav2vec 2.0](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md), data folders contain {train,valid,test}.{tsv,wrd,phn} files, where audio paths are stored in tsv files, and word, letter or phoneme transcriptions are stored in .{wrd,ltr,phn}. + +In **/path/to/data/with_silence** you need a *train.tsv* file as well as (optionally) *{valid,test}.{tsv,wrd,phn}*. It is nice to have *10h.{tsv,phn}* files there too for reproducing the ablation study on layer selection. In **/path/to/data/without_silence** you have the same files, except *.tsv* files contain audios with silences removed using rVAD. 
+ +Pre-requisites: +* set FAIRSEQ_ROOT environmental variable to your fairseq installation +* set RVAD_ROOT environmental variable to a checkout of [rVADfast](https://github.com/zhenghuatan/rVADfast) +* set KENLM_ROOT environmental variable to the location of [KenLM](https://github.com/kpu/kenlm) binaries +* install [PyKaldi](https://github.com/pykaldi/pykaldi) and set KALDI_ROOT environmental variable to the location of your kaldi installation. To use the version bundled with PyKaldi, you can use /path/to/pykaldi/tools/kaldi + +Create new audio files without silences: +```shell +# create a manifest file for the original set of audio files +python $FAIRSEQ_ROOT/examples/wav2vec/wav2vec_manifest.py /dir/to/save/audio/files --ext wav --dest /path/to/new/train.tsv --valid-percent 0 + +python scripts/vads.py -r $RVAD_ROOT < /path/to/train.tsv > train.vads + +python scripts/remove_silence.py --tsv /path/to/train.tsv --vads train.vads --out /dir/to/save/audio/files + +python $FAIRSEQ_ROOT/examples/wav2vec/wav2vec_manifest.py /dir/to/save/audio/files --ext wav --dest /path/to/new/train.tsv --valid-percent 0.01 +``` + +Next, we need to preprocess the audio data to better match phonemized text data: + +```shell +# wav2vec-U +zsh scripts/prepare_audio.sh /dir/with/{train,test,valid}.tsv /output/dir /path/to/wav2vec2/model.pt 512 14 +# wav2vec-U 2.0 +zsh scripts/prepare_audio_v2.sh /dir/with/{train,test,valid}.tsv /output/dir /path/to/wav2vec2/model.pt 64 14 +``` +Note that if you have splits different from train/valid/test, you will need to modify this script. The third argument is the PCA dimensionality for wav2vec-U and the number of MFCC clusters for wav2vec-U 2.0. The last argument is the 0-based index of the layer from which to extract representations. + +Now we need to prepare text data: +```shell +zsh scripts/prepare_text.sh language /path/to/text/file /output/dir 1000 espeak /path/to/fasttext/lid/model sil_prob +``` + +The fourth argument is the minimum number of phone observations to keep. If your text corpus is small, you might want to reduce this number. + +The fifth argument is which phonemizer to use. Supported values are [espeak](http://espeak.sourceforge.net/), [espeak-ng](https://github.com/espeak-ng/espeak-ng), and [G2P](https://github.com/Kyubyong/g2p) (English only). + +Pre-trained fasttext LID models can be downloaded [here](https://fasttext.cc/docs/en/language-identification.html). + +The last argument is the probability of introducing silence (`<SIL>`) between word boundaries. We found that values of `0.25`/`0.5` work well in general for wav2vec-U and the 2.0 version, respectively, but you may want to tune this for languages that have not been tested. + +### Prepare TIMIT data +TIMIT transcripts include silence. Therefore VAD is not used for audio preprocessing, and we do not wrap transcripts with silences or insert random silence in between words. + +To prepare TIMIT data for both the matched and unmatched setups: +```shell +bash scripts/prepare_timit.sh /dir/to/timit/raw/data /output/dir /path/to/wav2vec2/model.pt +``` + +Note that we assume the TIMIT distribution with capitalized directories and filenames is used (e.g., `TRAIN/DR1/FCJF0/SA1.PHN`). + +## Generative adversarial training (GAN) + +We then use a GAN model to build a first unsupervised ASR model. The preparation of both speech features and text data described above is what enables the generator to match speech to text in an unsupervised way.
+ +GAN training on top of the preprocessed features, with default hyperparameters, can be launched with: + +``` +PREFIX=w2v_unsup_gan_xp + +# For wav2vec-U, audio features are pre-segmented +CONFIG_NAME=w2vu +TASK_DATA=/path/to/features/precompute_unfiltered_pca512_cls128_mean_pooled + +# For wav2vec-U 2.0, use raw audio features +CONFIG_NAME=w2vu2 +TASK_DATA=/path/to/features/ + +# Unpaired text input +TEXT_DATA=/path/to/data/phones # path to fairseq-preprocessed GAN data (phones dir) +KENLM_PATH=/path/to/data/phones/kenlm.phn.o4.bin # KenLM 4-gram phoneme language model (LM data = GAN data here) + +PYTHONPATH=$FAIRSEQ_ROOT PREFIX=$PREFIX fairseq-hydra-train \ + -m --config-dir config/gan \ + --config-name $CONFIG_NAME \ + task.data=${TASK_DATA} \ + task.text_data=${TEXT_DATA} \ + task.kenlm_path=${KENLM_PATH} \ + common.user_dir=${FAIRSEQ_ROOT}/examples/wav2vec/unsupervised \ + model.code_penalty=2,4 model.gradient_penalty=1.5,2.0 \ + model.smoothness_weight=0.5,0.75,1.0 'common.seed=range(0,5)' +``` + + +Once we find the best checkpoint (chosen using an unsupervised metric that combines language model perplexity and vocabulary usage), we can use it to generate phone labels (or word labels with an appropriate Kaldi WFST): + +```shell +python w2vu_generate.py --config-dir config/generate --config-name viterbi \ +fairseq.common.user_dir=${FAIRSEQ_ROOT}/examples/wav2vec/unsupervised \ +fairseq.task.data=/path/to/dir/with/features \ +fairseq.common_eval.path=/path/to/gan/checkpoint \ +fairseq.dataset.gen_subset=valid results_path=/where/to/save/transcriptions +``` + +Decoding without an LM works best on the same adjacent-mean-pooled features that the GAN was trained on, while decoding with an LM works better on the features before the adjacent-timestep mean-pooling step (without the "_pooled" suffix). + +While the generator of wav2vec-U 2.0 is trained with an output frequency of 16 Hz, we found that decoding at a higher frequency produces better results. This can be done by adding `decode_stride=1` or `2` to the arguments above. + +## Iterative self-training + Kaldi LM-decoding +After the GAN training provides a first unsupervised model, we can progressively refine the quality of transcriptions using several iterations of semi-supervised learning. We perform two iterations: first, we pseudo-label the training data with the unsupervised GAN model and train an HMM on the pseudo-labels. Second, we relabel the training data with the HMM and then fine-tune the original wav2vec 2.0 model using the HMM pseudo-labels with a CTC loss. Note that HMM models use phonemes as output, while wav2vec 2.0 uses letters. Both are decoded into words using WFST decoders. + + +Please see [this README](kaldi_self_train/README.md) for more instructions on how to do iterative self-training + Kaldi LM-decoding.
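As a rough, hedged illustration of the final fine-tuning step (not the exact recipe from the linked README), the `w2v_finetune.yaml` config added in this diff can be launched along these lines; the data directory and checkpoint paths are placeholders, and the pseudo-labeled `{train,valid}.{tsv,ltr}` files are assumed to come from the HMM relabeling step:

```shell
# Sketch only: CTC fine-tuning of the original wav2vec 2.0 model on HMM pseudo-labels (letter targets).
PYTHONPATH=$FAIRSEQ_ROOT fairseq-hydra-train \
    --config-dir $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/config/finetuning \
    --config-name w2v_finetune \
    task.data=/path/to/pseudo_labeled_data \
    model.w2v_path=/path/to/wav2vec2/model.pt
```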
+ +*** Note: these instructions are a work in progress and will be updated over the next few days diff --git a/examples/wav2vec/unsupervised/__init__.py b/examples/wav2vec/unsupervised/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/wav2vec/unsupervised/config/finetuning/w2v_finetune.yaml b/examples/wav2vec/unsupervised/config/finetuning/w2v_finetune.yaml new file mode 100644 index 0000000000..19a3ef3484 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/finetuning/w2v_finetune.yaml @@ -0,0 +1,62 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 20000 + +task: + _name: audio_finetuning + data: ??? + normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 800000 + skip_invalid_size_inputs_valid_test: true + train_subset: train + valid_subset: valid + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 8 + find_unused_parameters: True + +criterion: + _name: ctc + zero_infinity: true + post_process: letter + +optimization: + max_update: 80000 + lr: [0.00003] + sentence_avg: true + update_freq: [1] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.25 + mask_channel_prob: 0.1 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 0 diff --git a/examples/wav2vec/unsupervised/config/gan/w2vu.yaml b/examples/wav2vec/unsupervised/config/gan/w2vu.yaml new file mode 100644 index 0000000000..74f1829d14 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/gan/w2vu.yaml @@ -0,0 +1,115 @@ +# @package _group_ + +common: + fp16: false + fp16_no_flatten_grads: true + log_format: json + log_interval: 100 + tensorboard_logdir: tb + reset_logging: false + suppress_crashes: false + +checkpoint: + save_interval: 1000 + save_interval_updates: 1000 + no_epoch_checkpoints: true + best_checkpoint_metric: weighted_lm_ppl + save_dir: . + +distributed_training: + distributed_world_size: 1 + +task: + _name: unpaired_audio_text + data: ??? + text_data: ??? + labels: phn + sort_by_length: false + unfiltered: false + max_length: null + append_eos: false + kenlm_path: ??? 
+ +dataset: + num_workers: 6 + batch_size: 160 + skip_invalid_size_inputs_valid_test: true + valid_subset: valid + validate_interval: 1000 + validate_interval_updates: 1000 + +criterion: + _name: model + log_keys: + - accuracy_dense + - accuracy_token + - temp + - code_ppl + +optimization: + max_update: 150000 + clip_norm: 5.0 + lr: [0] + +optimizer: + _name: composite + groups: + generator: + lr: [0.0004] + lr_float: null + optimizer: + _name: adam + adam_betas: [0.5,0.98] + adam_eps: 1e-06 + weight_decay: 0 + amsgrad: false + lr_scheduler: + _name: fixed + warmup_updates: 0 + discriminator: + lr: [ 0.0005 ] + lr_float: null + optimizer: + _name: adam + adam_betas: [0.5,0.98] + adam_eps: 1e-06 + weight_decay: 0.0001 + amsgrad: false + lr_scheduler: + _name: fixed + warmup_updates: 0 + +lr_scheduler: pass_through + +model: + _name: wav2vec_u + + discriminator_dim: 384 + discriminator_depth: 2 + discriminator_kernel: 6 + discriminator_linear_emb: false + discriminator_causal: true + discriminator_max_pool: false + discriminator_act_after_linear: false + discriminator_dropout: 0.0 + discriminator_weight_norm: false + + generator_stride: 1 + generator_kernel: 4 + generator_bias: false + generator_dropout: 0.1 + + smoothness_weight: 0.5 + smoothing: 0 + smoothing_one_sided: false + gumbel: false + hard_gumbel: false + gradient_penalty: 1.5 + code_penalty: 4.0 + temp: [ 2,0.1,0.99995 ] + input_dim: 512 + + segmentation: + type: JOIN + mean_pool_join: false + remove_zeros: false diff --git a/examples/wav2vec/unsupervised/config/gan/w2vu2.yaml b/examples/wav2vec/unsupervised/config/gan/w2vu2.yaml new file mode 100644 index 0000000000..52014222b2 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/gan/w2vu2.yaml @@ -0,0 +1,154 @@ +# @package _group_ + +common: + fp16: false + fp16_no_flatten_grads: true + log_format: json + log_interval: 100 + tensorboard_logdir: tb + reset_logging: false + suppress_crashes: false + +checkpoint: + save_interval: 1000 + save_interval_updates: 1000 + no_epoch_checkpoints: true + best_checkpoint_metric: weighted_lm_ppl + save_dir: . + +distributed_training: + distributed_world_size: 1 + +task: + _name: unpaired_audio_text + data: ??? + text_data: ??? + labels: phn + sort_by_length: false + unfiltered: false + max_length: null + append_eos: false + kenlm_path: ??? 
+ aux_target_postfix: km + +dataset: + num_workers: 6 + batch_size: 160 + skip_invalid_size_inputs_valid_test: true + valid_subset: valid + validate_interval: 1000 + validate_interval_updates: 1000 + +criterion: + _name: model + log_keys: + - accuracy_dense + - accuracy_token + - temp + - code_ppl + +optimization: + max_update: 150000 + clip_norm: 5.0 + lr: [0] + +optimizer: + _name: composite + groups: + generator: + lr: [0.00005] + lr_float: null + optimizer: + _name: adam + adam_betas: [0.5,0.98] + adam_eps: 1e-06 + weight_decay: 0 + amsgrad: false + lr_scheduler: + _name: fixed + warmup_updates: 0 + discriminator: + lr: [ 0.0003 ] + lr_float: null + optimizer: + _name: adam + adam_betas: [0.5,0.98] + adam_eps: 1e-06 + weight_decay: 0.0001 + amsgrad: false + lr_scheduler: + _name: fixed + warmup_updates: 0 + +lr_scheduler: pass_through + +model: + _name: wav2vec_u + + discriminator_dim: 384 + discriminator_depth: 2 + discriminator_kernel: 8 + discriminator_linear_emb: false + discriminator_causal: true + discriminator_max_pool: false + discriminator_act_after_linear: false + discriminator_dropout: 0.0 + discriminator_weight_norm: false + + generator_stride: 3 + generator_kernel: 9 + generator_bias: false + generator_dropout: 0.1 + generator_batch_norm: 30 + generator_residual: true + + smoothness_weight: 1.5 + smoothing: 0 + smoothing_one_sided: false + gumbel: false + hard_gumbel: false + gradient_penalty: 1.0 + code_penalty: 3.0 + temp: [ 2,0.1,0.99995 ] + input_dim: 1024 + mmi_weight: 0.5 + target_dim: 64 + + segmentation: + type: JOIN + mean_pool_join: false + remove_zeros: false + + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - run_config + - distributed_training.distributed_port + - common.user_dir + - task.data + - task.kenlm_path + - task.text_data + - model.generator_layers + - task.labels + - task.force_model_seed + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 3000 + cpus_per_task: 10 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 120 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + comment: intern_endding_soon + constraint: volta32gb + max_num_timeout: 30 diff --git a/examples/wav2vec/unsupervised/config/generate/viterbi.yaml b/examples/wav2vec/unsupervised/config/generate/viterbi.yaml new file mode 100644 index 0000000000..9c88beebcb --- /dev/null +++ b/examples/wav2vec/unsupervised/config/generate/viterbi.yaml @@ -0,0 +1,21 @@ +# @package _group_ + +fairseq: + task: + _name: unpaired_audio_text + labels: phn + data: ??? + sort_by_length: false + shuffle: false + text_data: '' + + common_eval: + path: ??? 
+ quiet: true + + dataset: + gen_subset: valid + batch_size: 1 + +w2l_decoder: VITERBI +post_process: silence diff --git a/examples/wav2vec/unsupervised/config/timit_matched/test.uid b/examples/wav2vec/unsupervised/config/timit_matched/test.uid new file mode 100644 index 0000000000..401008246a --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_matched/test.uid @@ -0,0 +1,192 @@ +FDHC0_SI1559 +FDHC0_SI2189 +FDHC0_SI929 +FDHC0_SX119 +FDHC0_SX209 +FDHC0_SX29 +FDHC0_SX299 +FDHC0_SX389 +FELC0_SI1386 +FELC0_SI2016 +FELC0_SI756 +FELC0_SX126 +FELC0_SX216 +FELC0_SX306 +FELC0_SX36 +FELC0_SX396 +FJLM0_SI1043 +FJLM0_SI1673 +FJLM0_SI2303 +FJLM0_SX143 +FJLM0_SX233 +FJLM0_SX323 +FJLM0_SX413 +FJLM0_SX53 +FMGD0_SI1564 +FMGD0_SI2194 +FMGD0_SI934 +FMGD0_SX124 +FMGD0_SX214 +FMGD0_SX304 +FMGD0_SX34 +FMGD0_SX394 +FMLD0_SI2185 +FMLD0_SI822 +FMLD0_SI925 +FMLD0_SX115 +FMLD0_SX205 +FMLD0_SX25 +FMLD0_SX295 +FMLD0_SX385 +FNLP0_SI1308 +FNLP0_SI1938 +FNLP0_SI678 +FNLP0_SX138 +FNLP0_SX228 +FNLP0_SX318 +FNLP0_SX408 +FNLP0_SX48 +FPAS0_SI1272 +FPAS0_SI2204 +FPAS0_SI944 +FPAS0_SX134 +FPAS0_SX224 +FPAS0_SX314 +FPAS0_SX404 +FPAS0_SX44 +FPKT0_SI1538 +FPKT0_SI2168 +FPKT0_SI908 +FPKT0_SX188 +FPKT0_SX278 +FPKT0_SX368 +FPKT0_SX8 +FPKT0_SX98 +MBPM0_SI1577 +MBPM0_SI1584 +MBPM0_SI947 +MBPM0_SX137 +MBPM0_SX227 +MBPM0_SX317 +MBPM0_SX407 +MBPM0_SX47 +MCMJ0_SI1094 +MCMJ0_SI464 +MCMJ0_SI602 +MCMJ0_SX104 +MCMJ0_SX14 +MCMJ0_SX194 +MCMJ0_SX284 +MCMJ0_SX374 +MDAB0_SI1039 +MDAB0_SI1669 +MDAB0_SI2299 +MDAB0_SX139 +MDAB0_SX229 +MDAB0_SX319 +MDAB0_SX409 +MDAB0_SX49 +MGRT0_SI1450 +MGRT0_SI2080 +MGRT0_SI820 +MGRT0_SX10 +MGRT0_SX100 +MGRT0_SX190 +MGRT0_SX280 +MGRT0_SX370 +MJDH0_SI1354 +MJDH0_SI1984 +MJDH0_SI724 +MJDH0_SX184 +MJDH0_SX274 +MJDH0_SX364 +MJDH0_SX4 +MJDH0_SX94 +MJLN0_SI1449 +MJLN0_SI2079 +MJLN0_SI819 +MJLN0_SX189 +MJLN0_SX279 +MJLN0_SX369 +MJLN0_SX9 +MJLN0_SX99 +MJMP0_SI1535 +MJMP0_SI1791 +MJMP0_SI905 +MJMP0_SX185 +MJMP0_SX275 +MJMP0_SX365 +MJMP0_SX5 +MJMP0_SX95 +MKLT0_SI1213 +MKLT0_SI1843 +MKLT0_SI583 +MKLT0_SX133 +MKLT0_SX223 +MKLT0_SX313 +MKLT0_SX403 +MKLT0_SX43 +MLLL0_SI1363 +MLLL0_SI1993 +MLLL0_SI733 +MLLL0_SX103 +MLLL0_SX13 +MLLL0_SX193 +MLLL0_SX283 +MLLL0_SX373 +MLNT0_SI1574 +MLNT0_SI1902 +MLNT0_SI642 +MLNT0_SX102 +MLNT0_SX12 +MLNT0_SX192 +MLNT0_SX282 +MLNT0_SX372 +MNJM0_SI1580 +MNJM0_SI2210 +MNJM0_SI950 +MNJM0_SX140 +MNJM0_SX230 +MNJM0_SX320 +MNJM0_SX410 +MNJM0_SX50 +MPAM0_SI1189 +MPAM0_SI1819 +MPAM0_SI1961 +MPAM0_SX109 +MPAM0_SX19 +MPAM0_SX199 +MPAM0_SX289 +MPAM0_SX379 +MTAS1_SI1473 +MTAS1_SI2098 +MTAS1_SI838 +MTAS1_SX118 +MTAS1_SX208 +MTAS1_SX28 +MTAS1_SX298 +MTAS1_SX388 +MTLS0_SI1370 +MTLS0_SI2000 +MTLS0_SI740 +MTLS0_SX110 +MTLS0_SX20 +MTLS0_SX200 +MTLS0_SX290 +MTLS0_SX380 +MWBT0_SI1553 +MWBT0_SI2183 +MWBT0_SI923 +MWBT0_SX113 +MWBT0_SX203 +MWBT0_SX23 +MWBT0_SX293 +MWBT0_SX383 +MWEW0_SI1361 +MWEW0_SI1991 +MWEW0_SI731 +MWEW0_SX101 +MWEW0_SX11 +MWEW0_SX191 +MWEW0_SX281 +MWEW0_SX371 diff --git a/examples/wav2vec/unsupervised/config/timit_matched/train.uid b/examples/wav2vec/unsupervised/config/timit_matched/train.uid new file mode 100644 index 0000000000..c39fd0b91d --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_matched/train.uid @@ -0,0 +1,3696 @@ +FAEM0_SI1392 +FAEM0_SI2022 +FAEM0_SI762 +FAEM0_SX132 +FAEM0_SX222 +FAEM0_SX312 +FAEM0_SX402 +FAEM0_SX42 +FAJW0_SI1263 +FAJW0_SI1893 +FAJW0_SI633 +FAJW0_SX183 +FAJW0_SX273 +FAJW0_SX3 +FAJW0_SX363 +FAJW0_SX93 +FALK0_SI1086 +FALK0_SI456 +FALK0_SI658 +FALK0_SX186 +FALK0_SX276 +FALK0_SX366 +FALK0_SX6 +FALK0_SX96 +FALR0_SI1325 +FALR0_SI1955 +FALR0_SI695 
+FALR0_SX155 +FALR0_SX245 +FALR0_SX335 +FALR0_SX425 +FALR0_SX65 +FAPB0_SI1063 +FAPB0_SI1693 +FAPB0_SI2323 +FAPB0_SX163 +FAPB0_SX253 +FAPB0_SX343 +FAPB0_SX433 +FAPB0_SX73 +FBAS0_SI1387 +FBAS0_SI1472 +FBAS0_SI2066 +FBAS0_SX127 +FBAS0_SX217 +FBAS0_SX307 +FBAS0_SX37 +FBAS0_SX397 +FBCG1_SI1612 +FBCG1_SI2242 +FBCG1_SI982 +FBCG1_SX172 +FBCG1_SX262 +FBCG1_SX352 +FBCG1_SX442 +FBCG1_SX82 +FBCH0_SI1586 +FBCH0_SI956 +FBCH0_SI959 +FBCH0_SX146 +FBCH0_SX236 +FBCH0_SX326 +FBCH0_SX416 +FBCH0_SX56 +FBJL0_SI1552 +FBJL0_SI2182 +FBJL0_SI922 +FBJL0_SX112 +FBJL0_SX202 +FBJL0_SX22 +FBJL0_SX292 +FBJL0_SX382 +FBLV0_SI1058 +FBLV0_SI1688 +FBLV0_SI2318 +FBLV0_SX158 +FBLV0_SX248 +FBLV0_SX338 +FBLV0_SX428 +FBLV0_SX68 +FBMH0_SI1136 +FBMH0_SI1766 +FBMH0_SI970 +FBMH0_SX146 +FBMH0_SX236 +FBMH0_SX326 +FBMH0_SX416 +FBMH0_SX56 +FBMJ0_SI1776 +FBMJ0_SI516 +FBMJ0_SI815 +FBMJ0_SX156 +FBMJ0_SX246 +FBMJ0_SX336 +FBMJ0_SX426 +FBMJ0_SX66 +FCAG0_SI1503 +FCAG0_SI1641 +FCAG0_SI2133 +FCAG0_SX153 +FCAG0_SX243 +FCAG0_SX333 +FCAG0_SX423 +FCAG0_SX63 +FCAJ0_SI1479 +FCAJ0_SI1804 +FCAJ0_SI849 +FCAJ0_SX129 +FCAJ0_SX219 +FCAJ0_SX309 +FCAJ0_SX39 +FCAJ0_SX399 +FCDR1_SI1186 +FCDR1_SI1816 +FCDR1_SI556 +FCDR1_SX106 +FCDR1_SX16 +FCDR1_SX196 +FCDR1_SX286 +FCDR1_SX376 +FCEG0_SI1248 +FCEG0_SI1878 +FCEG0_SI618 +FCEG0_SX168 +FCEG0_SX258 +FCEG0_SX348 +FCEG0_SX438 +FCEG0_SX78 +FCJF0_SI1027 +FCJF0_SI1657 +FCJF0_SI648 +FCJF0_SX127 +FCJF0_SX217 +FCJF0_SX307 +FCJF0_SX37 +FCJF0_SX397 +FCJS0_SI1607 +FCJS0_SI2237 +FCJS0_SI977 +FCJS0_SX167 +FCJS0_SX257 +FCJS0_SX347 +FCJS0_SX437 +FCJS0_SX77 +FCKE0_SI1111 +FCKE0_SI1741 +FCKE0_SI481 +FCKE0_SX121 +FCKE0_SX211 +FCKE0_SX301 +FCKE0_SX31 +FCKE0_SX391 +FCLT0_SI1438 +FCLT0_SI2068 +FCLT0_SI808 +FCLT0_SX178 +FCLT0_SX268 +FCLT0_SX358 +FCLT0_SX448 +FCLT0_SX88 +FCMG0_SI1142 +FCMG0_SI1242 +FCMG0_SI1872 +FCMG0_SX162 +FCMG0_SX252 +FCMG0_SX342 +FCMG0_SX432 +FCMG0_SX72 +FCMM0_SI1083 +FCMM0_SI1957 +FCMM0_SI453 +FCMM0_SX183 +FCMM0_SX273 +FCMM0_SX363 +FCMM0_SX420 +FCMM0_SX93 +FCRZ0_SI1913 +FCRZ0_SI2053 +FCRZ0_SI793 +FCRZ0_SX163 +FCRZ0_SX253 +FCRZ0_SX343 +FCRZ0_SX433 +FCRZ0_SX73 +FCYL0_SI1297 +FCYL0_SI1927 +FCYL0_SI667 +FCYL0_SX127 +FCYL0_SX217 +FCYL0_SX349 +FCYL0_SX37 +FCYL0_SX397 +FDAS1_SI1461 +FDAS1_SI2091 +FDAS1_SI831 +FDAS1_SX111 +FDAS1_SX201 +FDAS1_SX21 +FDAS1_SX291 +FDAS1_SX381 +FDAW0_SI1271 +FDAW0_SI1406 +FDAW0_SI2036 +FDAW0_SX146 +FDAW0_SX236 +FDAW0_SX326 +FDAW0_SX416 +FDAW0_SX56 +FDFB0_SI1318 +FDFB0_SI1948 +FDFB0_SI2010 +FDFB0_SX148 +FDFB0_SX238 +FDFB0_SX328 +FDFB0_SX418 +FDFB0_SX58 +FDJH0_SI1565 +FDJH0_SI2195 +FDJH0_SI935 +FDJH0_SX125 +FDJH0_SX215 +FDJH0_SX305 +FDJH0_SX35 +FDJH0_SX395 +FDKN0_SI1081 +FDKN0_SI1202 +FDKN0_SI1711 +FDKN0_SX181 +FDKN0_SX271 +FDKN0_SX361 +FDKN0_SX451 +FDKN0_SX91 +FDML0_SI1149 +FDML0_SI1779 +FDML0_SI2075 +FDML0_SX159 +FDML0_SX249 +FDML0_SX339 +FDML0_SX429 +FDML0_SX69 +FDMY0_SI1197 +FDMY0_SI567 +FDMY0_SI714 +FDMY0_SX117 +FDMY0_SX207 +FDMY0_SX27 +FDMY0_SX297 +FDMY0_SX387 +FDNC0_SI1278 +FDNC0_SI1908 +FDNC0_SI2287 +FDNC0_SX108 +FDNC0_SX18 +FDNC0_SX198 +FDNC0_SX288 +FDNC0_SX378 +FDTD0_SI1561 +FDTD0_SI2191 +FDTD0_SI931 +FDTD0_SX121 +FDTD0_SX211 +FDTD0_SX301 +FDTD0_SX321 +FDTD0_SX391 +FDXW0_SI1511 +FDXW0_SI2141 +FDXW0_SI881 +FDXW0_SX161 +FDXW0_SX251 +FDXW0_SX341 +FDXW0_SX431 +FDXW0_SX71 +FEAC0_SI1245 +FEAC0_SI1875 +FEAC0_SI615 +FEAC0_SX165 +FEAC0_SX255 +FEAC0_SX345 +FEAC0_SX435 +FEAC0_SX75 +FEAR0_SI1252 +FEAR0_SI1882 +FEAR0_SI622 +FEAR0_SX172 +FEAR0_SX262 +FEAR0_SX352 +FEAR0_SX442 +FEAR0_SX82 +FECD0_SI1418 +FECD0_SI2048 +FECD0_SI788 +FECD0_SX158 +FECD0_SX248 +FECD0_SX338 +FECD0_SX428 +FECD0_SX68 +FEEH0_SI1112 
+FEEH0_SI1742 +FEEH0_SI471 +FEEH0_SX122 +FEEH0_SX212 +FEEH0_SX302 +FEEH0_SX32 +FEEH0_SX392 +FEME0_SI1505 +FEME0_SI2135 +FEME0_SI875 +FEME0_SX155 +FEME0_SX245 +FEME0_SX335 +FEME0_SX425 +FEME0_SX65 +FETB0_SI1148 +FETB0_SI1778 +FETB0_SI518 +FETB0_SX158 +FETB0_SX248 +FETB0_SX338 +FETB0_SX428 +FETB0_SX68 +FEXM0_SI1101 +FEXM0_SI1731 +FEXM0_SI482 +FEXM0_SX111 +FEXM0_SX201 +FEXM0_SX291 +FEXM0_SX366 +FEXM0_SX381 +FGCS0_SI1486 +FGCS0_SI2116 +FGCS0_SI856 +FGCS0_SX136 +FGCS0_SX226 +FGCS0_SX316 +FGCS0_SX406 +FGCS0_SX46 +FGDP0_SI1618 +FGDP0_SI2248 +FGDP0_SI988 +FGDP0_SX178 +FGDP0_SX268 +FGDP0_SX358 +FGDP0_SX448 +FGDP0_SX88 +FGMB0_SI1145 +FGMB0_SI1775 +FGMB0_SI515 +FGMB0_SX155 +FGMB0_SX245 +FGMB0_SX335 +FGMB0_SX425 +FGMB0_SX65 +FGRW0_SI1152 +FGRW0_SI1782 +FGRW0_SI1990 +FGRW0_SX162 +FGRW0_SX252 +FGRW0_SX342 +FGRW0_SX432 +FGRW0_SX72 +FHLM0_SI1560 +FHLM0_SI2190 +FHLM0_SI930 +FHLM0_SX120 +FHLM0_SX210 +FHLM0_SX300 +FHLM0_SX349 +FHLM0_SX390 +FHXS0_SI1075 +FHXS0_SI2302 +FHXS0_SI2335 +FHXS0_SX175 +FHXS0_SX265 +FHXS0_SX355 +FHXS0_SX445 +FHXS0_SX85 +FJDM2_SI1582 +FJDM2_SI1964 +FJDM2_SI2212 +FJDM2_SX142 +FJDM2_SX232 +FJDM2_SX322 +FJDM2_SX412 +FJDM2_SX52 +FJEN0_SI1047 +FJEN0_SI1677 +FJEN0_SI2307 +FJEN0_SX147 +FJEN0_SX237 +FJEN0_SX327 +FJEN0_SX417 +FJEN0_SX57 +FJHK0_SI1022 +FJHK0_SI1652 +FJHK0_SI2282 +FJHK0_SX122 +FJHK0_SX212 +FJHK0_SX302 +FJHK0_SX32 +FJHK0_SX392 +FJKL0_SI1562 +FJKL0_SI2192 +FJKL0_SI932 +FJKL0_SX122 +FJKL0_SX212 +FJKL0_SX302 +FJKL0_SX32 +FJKL0_SX392 +FJLG0_SI1506 +FJLG0_SI1889 +FJLG0_SI2306 +FJLG0_SX179 +FJLG0_SX269 +FJLG0_SX359 +FJLG0_SX449 +FJLG0_SX89 +FJLR0_SI1231 +FJLR0_SI1861 +FJLR0_SI601 +FJLR0_SX151 +FJLR0_SX241 +FJLR0_SX331 +FJLR0_SX421 +FJLR0_SX61 +FJRB0_SI1302 +FJRB0_SI1932 +FJRB0_SI672 +FJRB0_SX132 +FJRB0_SX222 +FJRB0_SX312 +FJRB0_SX402 +FJRB0_SX42 +FJRP1_SI1432 +FJRP1_SI2062 +FJRP1_SI802 +FJRP1_SX172 +FJRP1_SX262 +FJRP1_SX352 +FJRP1_SX442 +FJRP1_SX82 +FJSK0_SI1052 +FJSK0_SI1682 +FJSK0_SI2312 +FJSK0_SX152 +FJSK0_SX242 +FJSK0_SX332 +FJSK0_SX422 +FJSK0_SX62 +FJSP0_SI1434 +FJSP0_SI1763 +FJSP0_SI804 +FJSP0_SX174 +FJSP0_SX264 +FJSP0_SX354 +FJSP0_SX444 +FJSP0_SX84 +FJWB1_SI2055 +FJWB1_SI748 +FJWB1_SI795 +FJWB1_SX165 +FJWB1_SX255 +FJWB1_SX345 +FJWB1_SX435 +FJWB1_SX75 +FJXM0_SI1211 +FJXM0_SI1971 +FJXM0_SI581 +FJXM0_SX131 +FJXM0_SX221 +FJXM0_SX311 +FJXM0_SX401 +FJXM0_SX41 +FJXP0_SI1122 +FJXP0_SI1752 +FJXP0_SI492 +FJXP0_SX132 +FJXP0_SX222 +FJXP0_SX312 +FJXP0_SX402 +FJXP0_SX42 +FKAA0_SI1208 +FKAA0_SI1838 +FKAA0_SI578 +FKAA0_SX128 +FKAA0_SX218 +FKAA0_SX308 +FKAA0_SX38 +FKAA0_SX398 +FKDE0_SI1141 +FKDE0_SI1771 +FKDE0_SI2221 +FKDE0_SX151 +FKDE0_SX241 +FKDE0_SX331 +FKDE0_SX421 +FKDE0_SX61 +FKDW0_SI1207 +FKDW0_SI1891 +FKDW0_SI577 +FKDW0_SX127 +FKDW0_SX217 +FKDW0_SX307 +FKDW0_SX37 +FKDW0_SX397 +FKFB0_SI1608 +FKFB0_SI2238 +FKFB0_SI978 +FKFB0_SX168 +FKFB0_SX258 +FKFB0_SX348 +FKFB0_SX438 +FKFB0_SX78 +FKKH0_SI1290 +FKKH0_SI1920 +FKKH0_SI660 +FKKH0_SX120 +FKKH0_SX210 +FKKH0_SX30 +FKKH0_SX300 +FKKH0_SX390 +FKLC0_SI1615 +FKLC0_SI2245 +FKLC0_SI985 +FKLC0_SX175 +FKLC0_SX265 +FKLC0_SX355 +FKLC0_SX445 +FKLC0_SX85 +FKLC1_SI1048 +FKLC1_SI1678 +FKLC1_SI2308 +FKLC1_SX148 +FKLC1_SX238 +FKLC1_SX328 +FKLC1_SX418 +FKLC1_SX58 +FKLH0_SI1257 +FKLH0_SI1887 +FKLH0_SI627 +FKLH0_SX177 +FKLH0_SX267 +FKLH0_SX357 +FKLH0_SX447 +FKLH0_SX87 +FKSR0_SI1117 +FKSR0_SI1747 +FKSR0_SI487 +FKSR0_SX161 +FKSR0_SX217 +FKSR0_SX366 +FKSR0_SX37 +FKSR0_SX397 +FLAC0_SI1339 +FLAC0_SI2161 +FLAC0_SI901 +FLAC0_SX181 +FLAC0_SX271 +FLAC0_SX361 +FLAC0_SX451 +FLAC0_SX91 +FLAG0_SI1464 +FLAG0_SI2094 +FLAG0_SI834 +FLAG0_SX114 +FLAG0_SX204 +FLAG0_SX24 +FLAG0_SX294 
+FLAG0_SX384 +FLEH0_SI1051 +FLEH0_SI1681 +FLEH0_SI2311 +FLEH0_SX151 +FLEH0_SX241 +FLEH0_SX331 +FLEH0_SX421 +FLEH0_SX61 +FLET0_SI1137 +FLET0_SI1767 +FLET0_SI507 +FLET0_SX147 +FLET0_SX237 +FLET0_SX277 +FLET0_SX417 +FLET0_SX57 +FLHD0_SI1344 +FLHD0_SI1827 +FLHD0_SI1974 +FLHD0_SX174 +FLHD0_SX264 +FLHD0_SX354 +FLHD0_SX444 +FLHD0_SX84 +FLJA0_SI1078 +FLJA0_SI1708 +FLJA0_SI2338 +FLJA0_SX178 +FLJA0_SX268 +FLJA0_SX358 +FLJA0_SX448 +FLJA0_SX88 +FLJD0_SI1516 +FLJD0_SI2146 +FLJD0_SI886 +FLJD0_SX166 +FLJD0_SX256 +FLJD0_SX346 +FLJD0_SX436 +FLJD0_SX76 +FLJG0_SI1611 +FLJG0_SI2241 +FLJG0_SI981 +FLJG0_SX171 +FLJG0_SX261 +FLJG0_SX351 +FLJG0_SX441 +FLJG0_SX81 +FLKM0_SI1880 +FLKM0_SI620 +FLKM0_SI686 +FLKM0_SX116 +FLKM0_SX260 +FLKM0_SX350 +FLKM0_SX440 +FLKM0_SX80 +FLMA0_SI1243 +FLMA0_SI1873 +FLMA0_SI613 +FLMA0_SX163 +FLMA0_SX253 +FLMA0_SX343 +FLMA0_SX433 +FLMA0_SX73 +FLMC0_SI1372 +FLMC0_SI2002 +FLMC0_SI742 +FLMC0_SX112 +FLMC0_SX22 +FLMC0_SX292 +FLMC0_SX336 +FLMC0_SX382 +FLMK0_SI1035 +FLMK0_SI1229 +FLMK0_SI2295 +FLMK0_SX135 +FLMK0_SX225 +FLMK0_SX315 +FLMK0_SX405 +FLMK0_SX45 +FLOD0_SI1287 +FLOD0_SI1917 +FLOD0_SI657 +FLOD0_SX117 +FLOD0_SX171 +FLOD0_SX207 +FLOD0_SX297 +FLOD0_SX387 +FLTM0_SI1070 +FLTM0_SI1700 +FLTM0_SI2330 +FLTM0_SX170 +FLTM0_SX260 +FLTM0_SX350 +FLTM0_SX440 +FLTM0_SX80 +FMAH1_SI1509 +FMAH1_SI2139 +FMAH1_SI879 +FMAH1_SX159 +FMAH1_SX249 +FMAH1_SX339 +FMAH1_SX429 +FMAH1_SX69 +FMBG0_SI1160 +FMBG0_SI1790 +FMBG0_SI2264 +FMBG0_SX260 +FMBG0_SX3 +FMBG0_SX350 +FMBG0_SX440 +FMBG0_SX80 +FMEM0_SI1377 +FMEM0_SI2007 +FMEM0_SI747 +FMEM0_SX117 +FMEM0_SX207 +FMEM0_SX297 +FMEM0_SX333 +FMEM0_SX387 +FMJB0_SI1177 +FMJB0_SI1807 +FMJB0_SI547 +FMJB0_SX187 +FMJB0_SX277 +FMJB0_SX367 +FMJB0_SX7 +FMJB0_SX97 +FMJF0_SI1254 +FMJF0_SI1884 +FMJF0_SI624 +FMJF0_SX174 +FMJF0_SX264 +FMJF0_SX354 +FMJF0_SX444 +FMJF0_SX84 +FMJU0_SI1389 +FMJU0_SI2019 +FMJU0_SI759 +FMJU0_SX129 +FMJU0_SX219 +FMJU0_SX309 +FMJU0_SX39 +FMJU0_SX399 +FMKC0_SI1041 +FMKC0_SI1072 +FMKC0_SI1702 +FMKC0_SX172 +FMKC0_SX262 +FMKC0_SX352 +FMKC0_SX442 +FMKC0_SX82 +FMKF0_SI1018 +FMKF0_SI1536 +FMKF0_SI906 +FMKF0_SX186 +FMKF0_SX276 +FMKF0_SX366 +FMKF0_SX6 +FMKF0_SX96 +FMMH0_SI1537 +FMMH0_SI2167 +FMMH0_SI907 +FMMH0_SX187 +FMMH0_SX367 +FMMH0_SX420 +FMMH0_SX7 +FMMH0_SX97 +FMPG0_SI1602 +FMPG0_SI2232 +FMPG0_SI972 +FMPG0_SX162 +FMPG0_SX252 +FMPG0_SX342 +FMPG0_SX432 +FMPG0_SX72 +FNKL0_SI1522 +FNKL0_SI2152 +FNKL0_SI892 +FNKL0_SX172 +FNKL0_SX196 +FNKL0_SX262 +FNKL0_SX442 +FNKL0_SX82 +FNTB0_SI1203 +FNTB0_SI573 +FNTB0_SI679 +FNTB0_SX123 +FNTB0_SX213 +FNTB0_SX303 +FNTB0_SX33 +FNTB0_SX393 +FPAB1_SI1471 +FPAB1_SI2101 +FPAB1_SI841 +FPAB1_SX121 +FPAB1_SX211 +FPAB1_SX301 +FPAB1_SX31 +FPAB1_SX391 +FPAC0_SI1921 +FPAC0_SI2011 +FPAC0_SI661 +FPAC0_SX121 +FPAC0_SX211 +FPAC0_SX301 +FPAC0_SX31 +FPAC0_SX391 +FPAD0_SI1346 +FPAD0_SI1976 +FPAD0_SI716 +FPAD0_SX176 +FPAD0_SX266 +FPAD0_SX356 +FPAD0_SX446 +FPAD0_SX86 +FPAF0_SI1054 +FPAF0_SI1684 +FPAF0_SI2314 +FPAF0_SX154 +FPAF0_SX244 +FPAF0_SX334 +FPAF0_SX424 +FPAF0_SX64 +FPAZ0_SI1593 +FPAZ0_SI2223 +FPAZ0_SI963 +FPAZ0_SX153 +FPAZ0_SX243 +FPAZ0_SX27 +FPAZ0_SX423 +FPAZ0_SX63 +FPJF0_SI1046 +FPJF0_SI1259 +FPJF0_SI1676 +FPJF0_SX146 +FPJF0_SX236 +FPJF0_SX326 +FPJF0_SX352 +FPJF0_SX56 +FPLS0_SI1590 +FPLS0_SI2220 +FPLS0_SI960 +FPLS0_SX150 +FPLS0_SX240 +FPLS0_SX3 +FPLS0_SX330 +FPLS0_SX60 +FPMY0_SI1153 +FPMY0_SI1783 +FPMY0_SI523 +FPMY0_SX163 +FPMY0_SX196 +FPMY0_SX253 +FPMY0_SX343 +FPMY0_SX73 +FREH0_SI1315 +FREH0_SI1945 +FREH0_SI685 +FREH0_SX145 +FREH0_SX235 +FREH0_SX325 +FREH0_SX415 +FREH0_SX55 +FRJB0_SI1427 +FRJB0_SI1470 +FRJB0_SI1794 +FRJB0_SX167 +FRJB0_SX257 
+FRJB0_SX347 +FRJB0_SX437 +FRJB0_SX77 +FRLL0_SI1514 +FRLL0_SI805 +FRLL0_SI884 +FRLL0_SX164 +FRLL0_SX254 +FRLL0_SX344 +FRLL0_SX434 +FRLL0_SX74 +FSAG0_SI1323 +FSAG0_SI1953 +FSAG0_SI693 +FSAG0_SX153 +FSAG0_SX243 +FSAG0_SX333 +FSAG0_SX423 +FSAG0_SX63 +FSAH0_SI1244 +FSAH0_SI1874 +FSAH0_SI614 +FSAH0_SX164 +FSAH0_SX327 +FSAH0_SX344 +FSAH0_SX434 +FSAH0_SX74 +FSAK0_SI1300 +FSAK0_SI1930 +FSAK0_SI670 +FSAK0_SX130 +FSAK0_SX220 +FSAK0_SX310 +FSAK0_SX40 +FSAK0_SX400 +FSBK0_SI1069 +FSBK0_SI1699 +FSBK0_SI2329 +FSBK0_SX169 +FSBK0_SX259 +FSBK0_SX349 +FSBK0_SX439 +FSBK0_SX79 +FSCN0_SI1886 +FSCN0_SI626 +FSCN0_SI705 +FSCN0_SX176 +FSCN0_SX266 +FSCN0_SX356 +FSCN0_SX446 +FSCN0_SX86 +FSDC0_SI1312 +FSDC0_SI1942 +FSDC0_SI2234 +FSDC0_SX142 +FSDC0_SX232 +FSDC0_SX322 +FSDC0_SX412 +FSDC0_SX52 +FSDJ0_SI1115 +FSDJ0_SI1745 +FSDJ0_SI485 +FSDJ0_SX125 +FSDJ0_SX215 +FSDJ0_SX305 +FSDJ0_SX35 +FSDJ0_SX395 +FSGF0_SI1557 +FSGF0_SI2187 +FSGF0_SI927 +FSGF0_SX117 +FSGF0_SX207 +FSGF0_SX27 +FSGF0_SX297 +FSGF0_SX387 +FSJG0_SI1570 +FSJG0_SI2200 +FSJG0_SI940 +FSJG0_SX130 +FSJG0_SX220 +FSJG0_SX310 +FSJG0_SX40 +FSJG0_SX400 +FSJK1_SI1025 +FSJK1_SI2285 +FSJK1_SI696 +FSJK1_SX125 +FSJK1_SX215 +FSJK1_SX305 +FSJK1_SX35 +FSJK1_SX395 +FSJS0_SI1171 +FSJS0_SI1801 +FSJS0_SI541 +FSJS0_SX181 +FSJS0_SX271 +FSJS0_SX361 +FSJS0_SX451 +FSJS0_SX91 +FSJW0_SI1333 +FSJW0_SI1963 +FSJW0_SI703 +FSJW0_SX163 +FSJW0_SX253 +FSJW0_SX343 +FSJW0_SX433 +FSJW0_SX73 +FSKC0_SI1416 +FSKC0_SI2046 +FSKC0_SI786 +FSKC0_SX156 +FSKC0_SX246 +FSKC0_SX336 +FSKC0_SX426 +FSKC0_SX66 +FSKL0_SI1529 +FSKL0_SI2159 +FSKL0_SI899 +FSKL0_SX179 +FSKL0_SX269 +FSKL0_SX359 +FSKL0_SX449 +FSKL0_SX89 +FSKP0_SI1098 +FSKP0_SI1728 +FSKP0_SI468 +FSKP0_SX108 +FSKP0_SX18 +FSKP0_SX198 +FSKP0_SX288 +FSKP0_SX378 +FSLS0_SI1056 +FSLS0_SI1686 +FSLS0_SI2316 +FSLS0_SX156 +FSLS0_SX202 +FSLS0_SX246 +FSLS0_SX426 +FSLS0_SX66 +FSMA0_SI1621 +FSMA0_SI2251 +FSMA0_SI991 +FSMA0_SX181 +FSMA0_SX271 +FSMA0_SX361 +FSMA0_SX451 +FSMA0_SX91 +FSMM0_SI1314 +FSMM0_SI1944 +FSMM0_SI684 +FSMM0_SX144 +FSMM0_SX234 +FSMM0_SX324 +FSMM0_SX414 +FSMM0_SX54 +FSMS1_SI1504 +FSMS1_SI2134 +FSMS1_SI874 +FSMS1_SX154 +FSMS1_SX244 +FSMS1_SX334 +FSMS1_SX347 +FSMS1_SX64 +FSPM0_SI1241 +FSPM0_SI1871 +FSPM0_SI611 +FSPM0_SX161 +FSPM0_SX251 +FSPM0_SX341 +FSPM0_SX431 +FSPM0_SX71 +FSRH0_SI1719 +FSRH0_SI1931 +FSRH0_SI671 +FSRH0_SX131 +FSRH0_SX221 +FSRH0_SX311 +FSRH0_SX401 +FSRH0_SX41 +FSSB0_SI1082 +FSSB0_SI1712 +FSSB0_SI2342 +FSSB0_SX182 +FSSB0_SX272 +FSSB0_SX362 +FSSB0_SX452 +FSSB0_SX92 +FTAJ0_SI1329 +FTAJ0_SI474 +FTAJ0_SI699 +FTAJ0_SX159 +FTAJ0_SX249 +FTAJ0_SX339 +FTAJ0_SX429 +FTAJ0_SX69 +FTBR0_SI1402 +FTBR0_SI2181 +FTBR0_SI921 +FTBR0_SX111 +FTBR0_SX201 +FTBR0_SX21 +FTBR0_SX291 +FTBR0_SX381 +FTBW0_SI1345 +FTBW0_SI1975 +FTBW0_SI715 +FTBW0_SX175 +FTBW0_SX265 +FTBW0_SX355 +FTBW0_SX445 +FTBW0_SX85 +FTLG0_SI1743 +FTLG0_SI483 +FTLG0_SI840 +FTLG0_SX123 +FTLG0_SX213 +FTLG0_SX303 +FTLG0_SX33 +FTLG0_SX393 +FTMG0_SI1532 +FTMG0_SI2162 +FTMG0_SI902 +FTMG0_SX182 +FTMG0_SX272 +FTMG0_SX362 +FTMG0_SX452 +FTMG0_SX92 +FVFB0_SI1032 +FVFB0_SI1510 +FVFB0_SI2292 +FVFB0_SX132 +FVFB0_SX222 +FVFB0_SX312 +FVFB0_SX402 +FVFB0_SX42 +FVKB0_SI1159 +FVKB0_SI1789 +FVKB0_SI529 +FVKB0_SX169 +FVKB0_SX259 +FVKB0_SX349 +FVKB0_SX439 +FVKB0_SX79 +FVMH0_SI1466 +FVMH0_SI2096 +FVMH0_SI836 +FVMH0_SX116 +FVMH0_SX206 +FVMH0_SX26 +FVMH0_SX296 +FVMH0_SX386 +MABC0_SI1620 +MABC0_SI2041 +MABC0_SI781 +MABC0_SX151 +MABC0_SX241 +MABC0_SX331 +MABC0_SX421 +MABC0_SX61 +MADC0_SI1367 +MADC0_SI1997 +MADC0_SI737 +MADC0_SX107 +MADC0_SX17 +MADC0_SX197 +MADC0_SX287 +MADC0_SX377 +MADD0_SI1295 +MADD0_SI1798 +MADD0_SI538 
+MADD0_SX178 +MADD0_SX268 +MADD0_SX358 +MADD0_SX448 +MADD0_SX88 +MAEB0_SI1411 +MAEB0_SI2250 +MAEB0_SI990 +MAEB0_SX180 +MAEB0_SX270 +MAEB0_SX360 +MAEB0_SX450 +MAEB0_SX90 +MAEO0_SI1326 +MAEO0_SI1655 +MAEO0_SI1956 +MAEO0_SX156 +MAEO0_SX246 +MAEO0_SX336 +MAEO0_SX426 +MAEO0_SX66 +MAFM0_SI1569 +MAFM0_SI2199 +MAFM0_SI939 +MAFM0_SX129 +MAFM0_SX219 +MAFM0_SX309 +MAFM0_SX39 +MAFM0_SX399 +MAJP0_SI1074 +MAJP0_SI1704 +MAJP0_SI2334 +MAJP0_SX174 +MAJP0_SX264 +MAJP0_SX354 +MAJP0_SX444 +MAJP0_SX84 +MAKB0_SI1016 +MAKB0_SI1646 +MAKB0_SI2276 +MAKB0_SX116 +MAKB0_SX206 +MAKB0_SX26 +MAKB0_SX296 +MAKB0_SX386 +MAKR0_SI1352 +MAKR0_SI1982 +MAKR0_SI722 +MAKR0_SX182 +MAKR0_SX272 +MAKR0_SX362 +MAKR0_SX452 +MAKR0_SX92 +MAPV0_SI1293 +MAPV0_SI1923 +MAPV0_SI663 +MAPV0_SX123 +MAPV0_SX213 +MAPV0_SX303 +MAPV0_SX33 +MAPV0_SX393 +MARC0_SI1188 +MARC0_SI1818 +MARC0_SI558 +MARC0_SX108 +MARC0_SX18 +MARC0_SX198 +MARC0_SX288 +MARC0_SX378 +MARW0_SI1276 +MARW0_SI1906 +MARW0_SI646 +MARW0_SX106 +MARW0_SX16 +MARW0_SX286 +MARW0_SX349 +MARW0_SX376 +MBAR0_SI1319 +MBAR0_SI1949 +MBAR0_SI689 +MBAR0_SX149 +MBAR0_SX239 +MBAR0_SX329 +MBAR0_SX419 +MBAR0_SX59 +MBBR0_SI1055 +MBBR0_SI1685 +MBBR0_SI2315 +MBBR0_SX155 +MBBR0_SX245 +MBBR0_SX335 +MBBR0_SX425 +MBBR0_SX65 +MBCG0_SI2217 +MBCG0_SI486 +MBCG0_SI957 +MBCG0_SX147 +MBCG0_SX237 +MBCG0_SX327 +MBCG0_SX417 +MBCG0_SX57 +MBEF0_SI1281 +MBEF0_SI1911 +MBEF0_SI651 +MBEF0_SX111 +MBEF0_SX201 +MBEF0_SX21 +MBEF0_SX291 +MBEF0_SX381 +MBGT0_SI1341 +MBGT0_SI1841 +MBGT0_SI711 +MBGT0_SX171 +MBGT0_SX261 +MBGT0_SX351 +MBGT0_SX441 +MBGT0_SX81 +MBJV0_SI1247 +MBJV0_SI1877 +MBJV0_SI617 +MBJV0_SX167 +MBJV0_SX257 +MBJV0_SX347 +MBJV0_SX437 +MBJV0_SX77 +MBMA0_SI1222 +MBMA0_SI1852 +MBMA0_SI592 +MBMA0_SX142 +MBMA0_SX232 +MBMA0_SX322 +MBMA0_SX412 +MBMA0_SX52 +MBMA1_SI2207 +MBMA1_SI2214 +MBMA1_SI954 +MBMA1_SX144 +MBMA1_SX234 +MBMA1_SX324 +MBMA1_SX414 +MBMA1_SX54 +MBML0_SI1169 +MBML0_SI1799 +MBML0_SI539 +MBML0_SX179 +MBML0_SX269 +MBML0_SX359 +MBML0_SX449 +MBML0_SX89 +MBOM0_SI1014 +MBOM0_SI1644 +MBOM0_SI2274 +MBOM0_SX114 +MBOM0_SX204 +MBOM0_SX294 +MBOM0_SX311 +MBOM0_SX384 +MBSB0_SI1353 +MBSB0_SI1983 +MBSB0_SI723 +MBSB0_SX183 +MBSB0_SX273 +MBSB0_SX3 +MBSB0_SX363 +MBSB0_SX93 +MBTH0_SI2102 +MBTH0_SI505 +MBTH0_SI757 +MBTH0_SX122 +MBTH0_SX212 +MBTH0_SX302 +MBTH0_SX32 +MBTH0_SX392 +MBWP0_SI1531 +MBWP0_SI1969 +MBWP0_SI709 +MBWP0_SX169 +MBWP0_SX259 +MBWP0_SX349 +MBWP0_SX439 +MBWP0_SX79 +MCAE0_SI1447 +MCAE0_SI2077 +MCAE0_SI817 +MCAE0_SX187 +MCAE0_SX277 +MCAE0_SX367 +MCAE0_SX7 +MCAE0_SX97 +MCAL0_SI1138 +MCAL0_SI1768 +MCAL0_SI508 +MCAL0_SX148 +MCAL0_SX238 +MCAL0_SX328 +MCAL0_SX418 +MCAL0_SX58 +MCDC0_SI1292 +MCDC0_SI1922 +MCDC0_SI662 +MCDC0_SX122 +MCDC0_SX212 +MCDC0_SX302 +MCDC0_SX32 +MCDC0_SX392 +MCDD0_SI1513 +MCDD0_SI2143 +MCDD0_SI883 +MCDD0_SX163 +MCDD0_SX253 +MCDD0_SX343 +MCDD0_SX433 +MCDD0_SX73 +MCDR0_SI1154 +MCDR0_SI1784 +MCDR0_SI524 +MCDR0_SX164 +MCDR0_SX254 +MCDR0_SX344 +MCDR0_SX434 +MCDR0_SX74 +MCEF0_SI1135 +MCEF0_SI1765 +MCEF0_SI842 +MCEF0_SX145 +MCEF0_SX235 +MCEF0_SX325 +MCEF0_SX415 +MCEF0_SX55 +MCEW0_SI1442 +MCEW0_SI2072 +MCEW0_SI812 +MCEW0_SX182 +MCEW0_SX272 +MCEW0_SX362 +MCEW0_SX452 +MCEW0_SX92 +MCHL0_SI1347 +MCHL0_SI1404 +MCHL0_SI1977 +MCHL0_SX177 +MCHL0_SX267 +MCHL0_SX357 +MCHL0_SX447 +MCHL0_SX87 +MCLK0_SI1660 +MCLK0_SI2290 +MCLK0_SI650 +MCLK0_SX130 +MCLK0_SX220 +MCLK0_SX310 +MCLK0_SX40 +MCLK0_SX400 +MCLM0_SI1456 +MCLM0_SI2086 +MCLM0_SI826 +MCLM0_SX106 +MCLM0_SX16 +MCLM0_SX196 +MCLM0_SX286 +MCLM0_SX376 +MCPM0_SI1194 +MCPM0_SI1824 +MCPM0_SI564 +MCPM0_SX114 +MCPM0_SX204 +MCPM0_SX24 +MCPM0_SX294 +MCPM0_SX384 +MCRE0_SI1121 
+MCRE0_SI1725 +MCRE0_SI1751 +MCRE0_SX131 +MCRE0_SX221 +MCRE0_SX24 +MCRE0_SX401 +MCRE0_SX41 +MCSS0_SI1380 +MCSS0_SI688 +MCSS0_SI750 +MCSS0_SX120 +MCSS0_SX210 +MCSS0_SX30 +MCSS0_SX300 +MCSS0_SX390 +MCTH0_SI1209 +MCTH0_SI1839 +MCTH0_SI579 +MCTH0_SX129 +MCTH0_SX219 +MCTH0_SX309 +MCTH0_SX39 +MCTH0_SX399 +MCTM0_SI1350 +MCTM0_SI1980 +MCTM0_SI720 +MCTM0_SX180 +MCTM0_SX270 +MCTM0_SX360 +MCTM0_SX450 +MCTM0_SX90 +MCXM0_SI1351 +MCXM0_SI1981 +MCXM0_SI721 +MCXM0_SX181 +MCXM0_SX271 +MCXM0_SX361 +MCXM0_SX451 +MCXM0_SX91 +MDAC0_SI1261 +MDAC0_SI1837 +MDAC0_SI631 +MDAC0_SX181 +MDAC0_SX271 +MDAC0_SX361 +MDAC0_SX451 +MDAC0_SX91 +MDAS0_SI1266 +MDAS0_SI1896 +MDAS0_SI636 +MDAS0_SX186 +MDAS0_SX21 +MDAS0_SX276 +MDAS0_SX6 +MDAS0_SX96 +MDBB1_SI1006 +MDBB1_SI1636 +MDBB1_SI2056 +MDBB1_SX106 +MDBB1_SX16 +MDBB1_SX196 +MDBB1_SX286 +MDBB1_SX376 +MDBP0_SI1158 +MDBP0_SI1788 +MDBP0_SI528 +MDBP0_SX168 +MDBP0_SX258 +MDBP0_SX348 +MDBP0_SX438 +MDBP0_SX78 +MDCD0_SI1415 +MDCD0_SI2045 +MDCD0_SI785 +MDCD0_SX155 +MDCD0_SX245 +MDCD0_SX335 +MDCD0_SX425 +MDCD0_SX65 +MDCM0_SI1480 +MDCM0_SI2110 +MDCM0_SI850 +MDCM0_SX130 +MDCM0_SX220 +MDCM0_SX310 +MDCM0_SX40 +MDCM0_SX400 +MDDC0_SI1419 +MDDC0_SI2049 +MDDC0_SI789 +MDDC0_SX159 +MDDC0_SX249 +MDDC0_SX339 +MDDC0_SX429 +MDDC0_SX69 +MDED0_SI1170 +MDED0_SI1800 +MDED0_SI540 +MDED0_SX180 +MDED0_SX270 +MDED0_SX360 +MDED0_SX450 +MDED0_SX90 +MDEF0_SI1123 +MDEF0_SI1563 +MDEF0_SI2193 +MDEF0_SX123 +MDEF0_SX213 +MDEF0_SX303 +MDEF0_SX33 +MDEF0_SX393 +MDEM0_SI1868 +MDEM0_SI608 +MDEM0_SI800 +MDEM0_SX158 +MDEM0_SX248 +MDEM0_SX338 +MDEM0_SX428 +MDEM0_SX68 +MDHL0_SI1439 +MDHL0_SI2069 +MDHL0_SI809 +MDHL0_SX179 +MDHL0_SX269 +MDHL0_SX359 +MDHL0_SX449 +MDHL0_SX89 +MDHS0_SI1530 +MDHS0_SI2160 +MDHS0_SI900 +MDHS0_SX180 +MDHS0_SX270 +MDHS0_SX360 +MDHS0_SX450 +MDHS0_SX90 +MDJM0_SI1455 +MDJM0_SI2085 +MDJM0_SI825 +MDJM0_SX105 +MDJM0_SX15 +MDJM0_SX195 +MDJM0_SX285 +MDJM0_SX375 +MDKS0_SI1066 +MDKS0_SI1696 +MDKS0_SI2326 +MDKS0_SX166 +MDKS0_SX256 +MDKS0_SX346 +MDKS0_SX436 +MDKS0_SX76 +MDLB0_SI1306 +MDLB0_SI1936 +MDLB0_SI676 +MDLB0_SX136 +MDLB0_SX226 +MDLB0_SX316 +MDLB0_SX406 +MDLB0_SX46 +MDLC0_SI1395 +MDLC0_SI2025 +MDLC0_SI765 +MDLC0_SX135 +MDLC0_SX225 +MDLC0_SX315 +MDLC0_SX405 +MDLC0_SX45 +MDLC1_SI1435 +MDLC1_SI2065 +MDLC1_SI2144 +MDLC1_SX175 +MDLC1_SX265 +MDLC1_SX355 +MDLC1_SX445 +MDLC1_SX85 +MDLC2_SI1614 +MDLC2_SI2244 +MDLC2_SI984 +MDLC2_SX174 +MDLC2_SX264 +MDLC2_SX354 +MDLC2_SX444 +MDLC2_SX84 +MDLH0_SI1960 +MDLH0_SI574 +MDLH0_SI700 +MDLH0_SX160 +MDLH0_SX250 +MDLH0_SX340 +MDLH0_SX430 +MDLH0_SX70 +MDLM0_SI1234 +MDLM0_SI1864 +MDLM0_SI604 +MDLM0_SX154 +MDLM0_SX244 +MDLM0_SX334 +MDLM0_SX424 +MDLM0_SX64 +MDLR0_SI1233 +MDLR0_SI1863 +MDLR0_SI603 +MDLR0_SX153 +MDLR0_SX243 +MDLR0_SX333 +MDLR0_SX423 +MDLR0_SX63 +MDLR1_SI1299 +MDLR1_SI1929 +MDLR1_SI669 +MDLR1_SX129 +MDLR1_SX219 +MDLR1_SX309 +MDLR1_SX39 +MDLR1_SX399 +MDMA0_SI1238 +MDMA0_SI1430 +MDMA0_SI2060 +MDMA0_SX170 +MDMA0_SX260 +MDMA0_SX350 +MDMA0_SX440 +MDMA0_SX80 +MDMT0_SI1832 +MDMT0_SI2341 +MDMT0_SI572 +MDMT0_SX122 +MDMT0_SX212 +MDMT0_SX302 +MDMT0_SX32 +MDMT0_SX392 +MDNS0_SI1011 +MDNS0_SI2271 +MDNS0_SI873 +MDNS0_SX111 +MDNS0_SX201 +MDNS0_SX21 +MDNS0_SX291 +MDNS0_SX381 +MDPB0_SI1760 +MDPB0_SI2126 +MDPB0_SI866 +MDPB0_SX146 +MDPB0_SX236 +MDPB0_SX326 +MDPB0_SX416 +MDPB0_SX56 +MDPK0_SI1053 +MDPK0_SI1683 +MDPK0_SI552 +MDPK0_SX153 +MDPK0_SX243 +MDPK0_SX333 +MDPK0_SX423 +MDPK0_SX63 +MDPS0_SI1651 +MDPS0_SI1979 +MDPS0_SI719 +MDPS0_SX179 +MDPS0_SX269 +MDPS0_SX359 +MDPS0_SX449 +MDPS0_SX89 +MDRD0_SI1382 +MDRD0_SI2012 +MDRD0_SI752 +MDRD0_SX122 +MDRD0_SX212 +MDRD0_SX302 +MDRD0_SX32 +MDRD0_SX392 
+MDSJ0_SI1462 +MDSJ0_SI2092 +MDSJ0_SI832 +MDSJ0_SX112 +MDSJ0_SX22 +MDSJ0_SX292 +MDSJ0_SX382 +MDSJ0_SX438 +MDSS0_SI1881 +MDSS0_SI2087 +MDSS0_SI621 +MDSS0_SX171 +MDSS0_SX261 +MDSS0_SX351 +MDSS0_SX441 +MDSS0_SX81 +MDSS1_SI1327 +MDSS1_SI1713 +MDSS1_SI697 +MDSS1_SX157 +MDSS1_SX247 +MDSS1_SX337 +MDSS1_SX427 +MDSS1_SX67 +MDTB0_SI1200 +MDTB0_SI1830 +MDTB0_SI570 +MDTB0_SX120 +MDTB0_SX210 +MDTB0_SX300 +MDTB0_SX321 +MDTB0_SX390 +MDWD0_SI1260 +MDWD0_SI1890 +MDWD0_SI557 +MDWD0_SX180 +MDWD0_SX270 +MDWD0_SX360 +MDWD0_SX450 +MDWD0_SX90 +MDWH0_SI1168 +MDWH0_SI1925 +MDWH0_SI665 +MDWH0_SX125 +MDWH0_SX215 +MDWH0_SX305 +MDWH0_SX35 +MDWH0_SX395 +MDWM0_SI1546 +MDWM0_SI2176 +MDWM0_SI916 +MDWM0_SX106 +MDWM0_SX16 +MDWM0_SX286 +MDWM0_SX376 +MDWM0_SX433 +MEAL0_SI1547 +MEAL0_SI2177 +MEAL0_SI917 +MEAL0_SX107 +MEAL0_SX197 +MEAL0_SX287 +MEAL0_SX347 +MEAL0_SX377 +MEDR0_SI1374 +MEDR0_SI2004 +MEDR0_SI744 +MEDR0_SX114 +MEDR0_SX204 +MEDR0_SX24 +MEDR0_SX294 +MEDR0_SX384 +MEFG0_SI465 +MEFG0_SI491 +MEFG0_SI598 +MEFG0_SX105 +MEFG0_SX15 +MEFG0_SX195 +MEFG0_SX285 +MEFG0_SX375 +MEGJ0_SI1337 +MEGJ0_SI1967 +MEGJ0_SI707 +MEGJ0_SX167 +MEGJ0_SX257 +MEGJ0_SX3 +MEGJ0_SX437 +MEGJ0_SX77 +MEJL0_SI1592 +MEJL0_SI1654 +MEJL0_SI962 +MEJL0_SX152 +MEJL0_SX242 +MEJL0_SX332 +MEJL0_SX422 +MEJL0_SX62 +MEJS0_SI1240 +MEJS0_SI1870 +MEJS0_SI610 +MEJS0_SX160 +MEJS0_SX250 +MEJS0_SX340 +MEJS0_SX430 +MEJS0_SX70 +MESG0_SI1332 +MESG0_SI1962 +MESG0_SI702 +MESG0_SX162 +MESG0_SX252 +MESG0_SX342 +MESG0_SX432 +MESG0_SX72 +MESJ0_SI2039 +MESJ0_SI2257 +MESJ0_SI997 +MESJ0_SX187 +MESJ0_SX277 +MESJ0_SX367 +MESJ0_SX7 +MESJ0_SX97 +MEWM0_SI1348 +MEWM0_SI1978 +MEWM0_SI718 +MEWM0_SX178 +MEWM0_SX268 +MEWM0_SX358 +MEWM0_SX448 +MEWM0_SX88 +MFER0_SI1492 +MFER0_SI2122 +MFER0_SI862 +MFER0_SX142 +MFER0_SX232 +MFER0_SX322 +MFER0_SX412 +MFER0_SX52 +MFMC0_SI1132 +MFMC0_SI1762 +MFMC0_SI502 +MFMC0_SX142 +MFMC0_SX232 +MFMC0_SX322 +MFMC0_SX412 +MFMC0_SX52 +MFRM0_SI1155 +MFRM0_SI1717 +MFRM0_SI1785 +MFRM0_SX165 +MFRM0_SX255 +MFRM0_SX345 +MFRM0_SX435 +MFRM0_SX75 +MFWK0_SI1249 +MFWK0_SI1879 +MFWK0_SI619 +MFWK0_SX169 +MFWK0_SX259 +MFWK0_SX349 +MFWK0_SX439 +MFWK0_SX79 +MFXS0_SI1674 +MFXS0_SI2225 +MFXS0_SI2304 +MFXS0_SX144 +MFXS0_SX234 +MFXS0_SX324 +MFXS0_SX414 +MFXS0_SX54 +MFXV0_SI1005 +MFXV0_SI1342 +MFXV0_SI1635 +MFXV0_SX105 +MFXV0_SX15 +MFXV0_SX195 +MFXV0_SX285 +MFXV0_SX375 +MGAF0_SI1282 +MGAF0_SI1912 +MGAF0_SI652 +MGAF0_SX112 +MGAF0_SX202 +MGAF0_SX22 +MGAF0_SX292 +MGAF0_SX382 +MGAG0_SI1321 +MGAG0_SI645 +MGAG0_SI691 +MGAG0_SX151 +MGAG0_SX241 +MGAG0_SX331 +MGAG0_SX421 +MGAG0_SX61 +MGAK0_SI1036 +MGAK0_SI1666 +MGAK0_SI2296 +MGAK0_SX136 +MGAK0_SX226 +MGAK0_SX316 +MGAK0_SX406 +MGAK0_SX46 +MGAR0_SI1212 +MGAR0_SI1694 +MGAR0_SI1842 +MGAR0_SX132 +MGAR0_SX222 +MGAR0_SX312 +MGAR0_SX402 +MGAR0_SX42 +MGAW0_SI1165 +MGAW0_SI1802 +MGAW0_SI535 +MGAW0_SX175 +MGAW0_SX265 +MGAW0_SX355 +MGAW0_SX445 +MGAW0_SX85 +MGES0_SI1481 +MGES0_SI2111 +MGES0_SI851 +MGES0_SX131 +MGES0_SX221 +MGES0_SX311 +MGES0_SX401 +MGES0_SX41 +MGJC0_SI1256 +MGJC0_SI1335 +MGJC0_SI1965 +MGJC0_SX165 +MGJC0_SX255 +MGJC0_SX345 +MGJC0_SX435 +MGJC0_SX75 +MGRL0_SI1497 +MGRL0_SI2127 +MGRL0_SI867 +MGRL0_SX147 +MGRL0_SX237 +MGRL0_SX327 +MGRL0_SX417 +MGRL0_SX57 +MGRP0_SI1317 +MGRP0_SI1947 +MGRP0_SI687 +MGRP0_SX147 +MGRP0_SX237 +MGRP0_SX327 +MGRP0_SX417 +MGRP0_SX57 +MGSH0_SI1176 +MGSH0_SI1806 +MGSH0_SI546 +MGSH0_SX127 +MGSH0_SX186 +MGSH0_SX276 +MGSH0_SX6 +MGSH0_SX96 +MGSL0_SI1164 +MGSL0_SI534 +MGSL0_SI797 +MGSL0_SX174 +MGSL0_SX264 +MGSL0_SX354 +MGSL0_SX444 +MGSL0_SX84 +MGXP0_SI1087 +MGXP0_SI457 +MGXP0_SI525 +MGXP0_SX187 +MGXP0_SX277 +MGXP0_SX367 +MGXP0_SX7 
+MGXP0_SX97 +MHBS0_SI1575 +MHBS0_SI2205 +MHBS0_SI945 +MHBS0_SX135 +MHBS0_SX225 +MHBS0_SX315 +MHBS0_SX405 +MHBS0_SX45 +MHIT0_SI1613 +MHIT0_SI2243 +MHIT0_SI983 +MHIT0_SX173 +MHIT0_SX263 +MHIT0_SX353 +MHIT0_SX443 +MHIT0_SX83 +MHJB0_SI1017 +MHJB0_SI1647 +MHJB0_SI2277 +MHJB0_SX117 +MHJB0_SX207 +MHJB0_SX27 +MHJB0_SX297 +MHJB0_SX387 +MHMG0_SI1365 +MHMG0_SI1995 +MHMG0_SI735 +MHMG0_SX105 +MHMG0_SX15 +MHMG0_SX195 +MHMG0_SX285 +MHMG0_SX375 +MHMR0_SI1119 +MHMR0_SI1692 +MHMR0_SI489 +MHMR0_SX129 +MHMR0_SX219 +MHMR0_SX309 +MHMR0_SX39 +MHMR0_SX399 +MHRM0_SI1475 +MHRM0_SI2218 +MHRM0_SI958 +MHRM0_SX148 +MHRM0_SX238 +MHRM0_SX328 +MHRM0_SX418 +MHRM0_SX58 +MHXL0_SI1772 +MHXL0_SI512 +MHXL0_SI612 +MHXL0_SX152 +MHXL0_SX242 +MHXL0_SX332 +MHXL0_SX422 +MHXL0_SX62 +MILB0_SI2163 +MILB0_SI807 +MILB0_SI903 +MILB0_SX183 +MILB0_SX273 +MILB0_SX3 +MILB0_SX363 +MILB0_SX93 +MJAC0_SI1331 +MJAC0_SI2148 +MJAC0_SI701 +MJAC0_SX251 +MJAC0_SX307 +MJAC0_SX341 +MJAC0_SX431 +MJAC0_SX71 +MJAE0_SI1524 +MJAE0_SI1999 +MJAE0_SI2154 +MJAE0_SX174 +MJAE0_SX264 +MJAE0_SX354 +MJAE0_SX444 +MJAE0_SX84 +MJAI0_SI1604 +MJAI0_SI682 +MJAI0_SI710 +MJAI0_SX164 +MJAI0_SX254 +MJAI0_SX344 +MJAI0_SX434 +MJAI0_SX74 +MJBG0_SI1232 +MJBG0_SI1724 +MJBG0_SI1862 +MJBG0_SX152 +MJBG0_SX242 +MJBG0_SX332 +MJBG0_SX422 +MJBG0_SX62 +MJDA0_SI1031 +MJDA0_SI1661 +MJDA0_SI2291 +MJDA0_SX131 +MJDA0_SX221 +MJDA0_SX311 +MJDA0_SX401 +MJDA0_SX41 +MJDC0_SI1161 +MJDC0_SI2165 +MJDC0_SI531 +MJDC0_SX171 +MJDC0_SX261 +MJDC0_SX351 +MJDC0_SX441 +MJDC0_SX81 +MJDE0_SI1120 +MJDE0_SI463 +MJDE0_SI490 +MJDE0_SX130 +MJDE0_SX220 +MJDE0_SX310 +MJDE0_SX40 +MJDE0_SX400 +MJDG0_SI1042 +MJDG0_SI1672 +MJDG0_SI1705 +MJDG0_SX142 +MJDG0_SX232 +MJDG0_SX322 +MJDG0_SX412 +MJDG0_SX52 +MJDM0_SI1340 +MJDM0_SI1937 +MJDM0_SI974 +MJDM0_SX170 +MJDM0_SX260 +MJDM0_SX350 +MJDM0_SX440 +MJDM0_SX80 +MJEB0_SI1286 +MJEB0_SI1916 +MJEB0_SI656 +MJEB0_SX170 +MJEB0_SX206 +MJEB0_SX26 +MJEB0_SX296 +MJEB0_SX386 +MJEB1_SI1467 +MJEB1_SI2097 +MJEB1_SI837 +MJEB1_SX117 +MJEB1_SX207 +MJEB1_SX27 +MJEB1_SX297 +MJEB1_SX387 +MJEE0_SI1237 +MJEE0_SI1867 +MJEE0_SI607 +MJEE0_SX157 +MJEE0_SX247 +MJEE0_SX337 +MJEE0_SX427 +MJEE0_SX67 +MJFH0_SI1107 +MJFH0_SI1737 +MJFH0_SI477 +MJFH0_SX117 +MJFH0_SX207 +MJFH0_SX27 +MJFH0_SX297 +MJFH0_SX387 +MJFR0_SI1605 +MJFR0_SI2235 +MJFR0_SI975 +MJFR0_SX165 +MJFR0_SX255 +MJFR0_SX345 +MJFR0_SX435 +MJFR0_SX75 +MJHI0_SI1328 +MJHI0_SI555 +MJHI0_SI698 +MJHI0_SX158 +MJHI0_SX248 +MJHI0_SX338 +MJHI0_SX428 +MJHI0_SX68 +MJJB0_SI1139 +MJJB0_SI1277 +MJJB0_SI1769 +MJJB0_SX149 +MJJB0_SX239 +MJJB0_SX329 +MJJB0_SX419 +MJJB0_SX59 +MJJJ0_SI1163 +MJJJ0_SI1793 +MJJJ0_SI533 +MJJJ0_SX173 +MJJJ0_SX263 +MJJJ0_SX353 +MJJJ0_SX443 +MJJJ0_SX83 +MJJM0_SI1251 +MJJM0_SI1457 +MJJM0_SI827 +MJJM0_SX107 +MJJM0_SX17 +MJJM0_SX197 +MJJM0_SX287 +MJJM0_SX377 +MJKR0_SI1201 +MJKR0_SI1831 +MJKR0_SI571 +MJKR0_SX121 +MJKR0_SX211 +MJKR0_SX301 +MJKR0_SX31 +MJKR0_SX391 +MJLB0_SI1616 +MJLB0_SI2246 +MJLB0_SI986 +MJLB0_SX176 +MJLB0_SX266 +MJLB0_SX356 +MJLB0_SX446 +MJLB0_SX86 +MJLG1_SI1012 +MJLG1_SI1642 +MJLG1_SI2272 +MJLG1_SX112 +MJLG1_SX202 +MJLG1_SX22 +MJLG1_SX292 +MJLG1_SX382 +MJLS0_SI1096 +MJLS0_SI1726 +MJLS0_SI466 +MJLS0_SX106 +MJLS0_SX16 +MJLS0_SX196 +MJLS0_SX286 +MJLS0_SX376 +MJMA0_SI1495 +MJMA0_SI2125 +MJMA0_SI865 +MJMA0_SX145 +MJMA0_SX235 +MJMA0_SX325 +MJMA0_SX415 +MJMA0_SX55 +MJMD0_SI1028 +MJMD0_SI1658 +MJMD0_SI2288 +MJMD0_SX128 +MJMD0_SX218 +MJMD0_SX308 +MJMD0_SX38 +MJMD0_SX398 +MJMM0_SI1255 +MJMM0_SI1885 +MJMM0_SI625 +MJMM0_SX175 +MJMM0_SX265 +MJMM0_SX355 +MJMM0_SX445 +MJMM0_SX85 +MJPG0_SI1191 +MJPG0_SI1821 +MJPG0_SI561 +MJPG0_SX111 +MJPG0_SX201 
+MJPG0_SX21 +MJPG0_SX291 +MJPG0_SX381 +MJPM0_SI1368 +MJPM0_SI1998 +MJPM0_SI738 +MJPM0_SX108 +MJPM0_SX18 +MJPM0_SX198 +MJPM0_SX288 +MJPM0_SX378 +MJPM1_SI1897 +MJPM1_SI2280 +MJPM1_SI761 +MJPM1_SX131 +MJPM1_SX221 +MJPM1_SX311 +MJPM1_SX401 +MJPM1_SX41 +MJRA0_SI1236 +MJRA0_SI1866 +MJRA0_SI606 +MJRA0_SX156 +MJRA0_SX246 +MJRA0_SX336 +MJRA0_SX426 +MJRA0_SX66 +MJRG0_SI1366 +MJRG0_SI1996 +MJRG0_SI736 +MJRG0_SX106 +MJRG0_SX16 +MJRG0_SX286 +MJRG0_SX352 +MJRG0_SX376 +MJRH0_SI1125 +MJRH0_SI1755 +MJRH0_SI1840 +MJRH0_SX135 +MJRH0_SX225 +MJRH0_SX315 +MJRH0_SX405 +MJRH0_SX45 +MJRH1_SI1558 +MJRH1_SI1774 +MJRH1_SI514 +MJRH1_SX154 +MJRH1_SX244 +MJRH1_SX334 +MJRH1_SX424 +MJRH1_SX64 +MJRK0_SI1662 +MJRK0_SI2103 +MJRK0_SI880 +MJRK0_SX160 +MJRK0_SX250 +MJRK0_SX340 +MJRK0_SX430 +MJRK0_SX70 +MJRP0_SI1835 +MJRP0_SI1845 +MJRP0_SI585 +MJRP0_SX135 +MJRP0_SX225 +MJRP0_SX315 +MJRP0_SX405 +MJRP0_SX45 +MJSR0_SI1424 +MJSR0_SI2054 +MJSR0_SI794 +MJSR0_SX164 +MJSR0_SX254 +MJSR0_SX344 +MJSR0_SX434 +MJSR0_SX74 +MJWG0_SI2155 +MJWG0_SI813 +MJWG0_SI895 +MJWG0_SX175 +MJWG0_SX265 +MJWG0_SX355 +MJWG0_SX445 +MJWG0_SX85 +MJWS0_SI1143 +MJWS0_SI1773 +MJWS0_SI513 +MJWS0_SX153 +MJWS0_SX243 +MJWS0_SX333 +MJWS0_SX423 +MJWS0_SX63 +MJWT0_SI1291 +MJWT0_SI1381 +MJWT0_SI751 +MJWT0_SX121 +MJWT0_SX211 +MJWT0_SX301 +MJWT0_SX31 +MJWT0_SX391 +MJXA0_SI1507 +MJXA0_SI2137 +MJXA0_SI877 +MJXA0_SX157 +MJXA0_SX247 +MJXA0_SX337 +MJXA0_SX427 +MJXA0_SX67 +MJXL0_SI1172 +MJXL0_SI1795 +MJXL0_SI542 +MJXL0_SX182 +MJXL0_SX272 +MJXL0_SX362 +MJXL0_SX452 +MJXL0_SX92 +MKAG0_SI1609 +MKAG0_SI2239 +MKAG0_SI979 +MKAG0_SX169 +MKAG0_SX259 +MKAG0_SX30 +MKAG0_SX439 +MKAG0_SX79 +MKAH0_SI1528 +MKAH0_SI2158 +MKAH0_SI898 +MKAH0_SX178 +MKAH0_SX268 +MKAH0_SX358 +MKAH0_SX448 +MKAH0_SX88 +MKAJ0_SI1414 +MKAJ0_SI2044 +MKAJ0_SI784 +MKAJ0_SX154 +MKAJ0_SX244 +MKAJ0_SX334 +MKAJ0_SX424 +MKAJ0_SX64 +MKAM0_SI1250 +MKAM0_SI1316 +MKAM0_SI1465 +MKAM0_SX146 +MKAM0_SX236 +MKAM0_SX326 +MKAM0_SX416 +MKAM0_SX56 +MKDB0_SI2132 +MKDB0_SI588 +MKDB0_SI872 +MKDB0_SX152 +MKDB0_SX242 +MKDB0_SX332 +MKDB0_SX422 +MKDB0_SX62 +MKDD0_SI1567 +MKDD0_SI2197 +MKDD0_SI937 +MKDD0_SX127 +MKDD0_SX217 +MKDD0_SX307 +MKDD0_SX37 +MKDD0_SX397 +MKDT0_SI2153 +MKDT0_SI814 +MKDT0_SI893 +MKDT0_SX173 +MKDT0_SX263 +MKDT0_SX353 +MKDT0_SX443 +MKDT0_SX83 +MKES0_SI1253 +MKES0_SI1883 +MKES0_SI623 +MKES0_SX173 +MKES0_SX263 +MKES0_SX353 +MKES0_SX443 +MKES0_SX83 +MKJO0_SI1517 +MKJO0_SI2147 +MKJO0_SI887 +MKJO0_SX167 +MKJO0_SX257 +MKJO0_SX424 +MKJO0_SX437 +MKJO0_SX77 +MKLN0_SI1598 +MKLN0_SI2228 +MKLN0_SI968 +MKLN0_SX158 +MKLN0_SX248 +MKLN0_SX338 +MKLN0_SX428 +MKLN0_SX68 +MKLR0_SI1059 +MKLR0_SI1689 +MKLR0_SI2319 +MKLR0_SX159 +MKLR0_SX249 +MKLR0_SX339 +MKLR0_SX429 +MKLR0_SX69 +MKLS0_SI1437 +MKLS0_SI1533 +MKLS0_SI2067 +MKLS0_SX177 +MKLS0_SX267 +MKLS0_SX357 +MKLS0_SX447 +MKLS0_SX87 +MKLS1_SI1545 +MKLS1_SI2175 +MKLS1_SI915 +MKLS1_SX105 +MKLS1_SX15 +MKLS1_SX195 +MKLS1_SX285 +MKLS1_SX375 +MKLW0_SI1571 +MKLW0_SI1844 +MKLW0_SI2201 +MKLW0_SX131 +MKLW0_SX221 +MKLW0_SX311 +MKLW0_SX401 +MKLW0_SX41 +MKRG0_SI1491 +MKRG0_SI2121 +MKRG0_SI861 +MKRG0_SX141 +MKRG0_SX231 +MKRG0_SX31 +MKRG0_SX411 +MKRG0_SX51 +MKXL0_SI1185 +MKXL0_SI1815 +MKXL0_SI1958 +MKXL0_SX105 +MKXL0_SX15 +MKXL0_SX195 +MKXL0_SX285 +MKXL0_SX375 +MLBC0_SI1239 +MLBC0_SI1869 +MLBC0_SI609 +MLBC0_SX159 +MLBC0_SX249 +MLBC0_SX339 +MLBC0_SX429 +MLBC0_SX69 +MLEL0_SI1246 +MLEL0_SI1876 +MLEL0_SI616 +MLEL0_SX166 +MLEL0_SX256 +MLEL0_SX346 +MLEL0_SX436 +MLEL0_SX76 +MLJC0_SI1225 +MLJC0_SI1855 +MLJC0_SI595 +MLJC0_SX145 +MLJC0_SX235 +MLJC0_SX325 +MLJC0_SX415 +MLJC0_SX55 +MLJH0_SI1324 +MLJH0_SI1422 +MLJH0_SI694 
+MLJH0_SX154 +MLJH0_SX244 +MLJH0_SX334 +MLJH0_SX424 +MLJH0_SX64 +MLNS0_SI1407 +MLNS0_SI2037 +MLNS0_SI777 +MLNS0_SX147 +MLNS0_SX237 +MLNS0_SX327 +MLNS0_SX417 +MLNS0_SX57 +MLSH0_SI1417 +MLSH0_SI2047 +MLSH0_SI787 +MLSH0_SX157 +MLSH0_SX247 +MLSH0_SX337 +MLSH0_SX427 +MLSH0_SX67 +MMAA0_SI1588 +MMAA0_SI2105 +MMAA0_SI845 +MMAA0_SX125 +MMAA0_SX215 +MMAA0_SX305 +MMAA0_SX35 +MMAA0_SX395 +MMAB1_SI1494 +MMAB1_SI2124 +MMAB1_SI864 +MMAB1_SX144 +MMAB1_SX234 +MMAB1_SX324 +MMAB1_SX414 +MMAB1_SX54 +MMAG0_SI1126 +MMAG0_SI1756 +MMAG0_SI496 +MMAG0_SX136 +MMAG0_SX226 +MMAG0_SX316 +MMAG0_SX406 +MMAG0_SX46 +MMAM0_SI1597 +MMAM0_SI1668 +MMAM0_SI2227 +MMAM0_SX157 +MMAM0_SX247 +MMAM0_SX337 +MMAM0_SX427 +MMAM0_SX67 +MMAR0_SI1336 +MMAR0_SI1966 +MMAR0_SI706 +MMAR0_SX166 +MMAR0_SX256 +MMAR0_SX346 +MMAR0_SX436 +MMAR0_SX76 +MMBS0_SI1151 +MMBS0_SI1781 +MMBS0_SI521 +MMBS0_SX161 +MMBS0_SX251 +MMBS0_SX341 +MMBS0_SX431 +MMBS0_SX71 +MMCC0_SI1338 +MMCC0_SI1968 +MMCC0_SI708 +MMCC0_SX168 +MMCC0_SX258 +MMCC0_SX348 +MMCC0_SX438 +MMCC0_SX78 +MMDB0_SI1358 +MMDB0_SI1617 +MMDB0_SI987 +MMDB0_SX177 +MMDB0_SX267 +MMDB0_SX357 +MMDB0_SX447 +MMDB0_SX87 +MMDG0_SI1780 +MMDG0_SI2035 +MMDG0_SI520 +MMDG0_SX160 +MMDG0_SX250 +MMDG0_SX340 +MMDG0_SX430 +MMDG0_SX70 +MMDM0_SI1311 +MMDM0_SI1941 +MMDM0_SI681 +MMDM0_SX141 +MMDM0_SX231 +MMDM0_SX321 +MMDM0_SX411 +MMDM0_SX51 +MMDM1_SI1650 +MMDM1_SI2043 +MMDM1_SI783 +MMDM1_SX153 +MMDM1_SX243 +MMDM1_SX333 +MMDM1_SX423 +MMDM1_SX63 +MMDS0_SI1343 +MMDS0_SI1973 +MMDS0_SI713 +MMDS0_SX173 +MMDS0_SX263 +MMDS0_SX353 +MMDS0_SX443 +MMDS0_SX83 +MMEA0_SI1388 +MMEA0_SI2018 +MMEA0_SI758 +MMEA0_SX128 +MMEA0_SX218 +MMEA0_SX308 +MMEA0_SX38 +MMEA0_SX398 +MMEB0_SI1357 +MMEB0_SI1987 +MMEB0_SI727 +MMEB0_SX187 +MMEB0_SX327 +MMEB0_SX367 +MMEB0_SX7 +MMEB0_SX97 +MMGC0_SI1305 +MMGC0_SI1935 +MMGC0_SI2184 +MMGC0_SX135 +MMGC0_SX225 +MMGC0_SX315 +MMGC0_SX405 +MMGC0_SX45 +MMGG0_SI1079 +MMGG0_SI1709 +MMGG0_SI2339 +MMGG0_SX179 +MMGG0_SX269 +MMGG0_SX359 +MMGG0_SX449 +MMGG0_SX89 +MMGK0_SI1322 +MMGK0_SI1952 +MMGK0_SI692 +MMGK0_SX152 +MMGK0_SX242 +MMGK0_SX332 +MMGK0_SX422 +MMGK0_SX62 +MMJB1_SI1408 +MMJB1_SI2038 +MMJB1_SI778 +MMJB1_SX148 +MMJB1_SX238 +MMJB1_SX328 +MMJB1_SX418 +MMJB1_SX58 +MMLM0_SI1527 +MMLM0_SI2150 +MMLM0_SI897 +MMLM0_SX177 +MMLM0_SX267 +MMLM0_SX357 +MMLM0_SX447 +MMLM0_SX87 +MMPM0_SI1061 +MMPM0_SI1691 +MMPM0_SI2321 +MMPM0_SX161 +MMPM0_SX251 +MMPM0_SX341 +MMPM0_SX431 +MMPM0_SX71 +MMRP0_SI2034 +MMRP0_SI717 +MMRP0_SI774 +MMRP0_SX144 +MMRP0_SX234 +MMRP0_SX324 +MMRP0_SX414 +MMRP0_SX54 +MMSM0_SI1106 +MMSM0_SI1736 +MMSM0_SI476 +MMSM0_SX116 +MMSM0_SX206 +MMSM0_SX26 +MMSM0_SX296 +MMSM0_SX386 +MMVP0_SI1284 +MMVP0_SI1914 +MMVP0_SI654 +MMVP0_SX114 +MMVP0_SX204 +MMVP0_SX294 +MMVP0_SX347 +MMVP0_SX384 +MMWB0_SI1619 +MMWB0_SI2249 +MMWB0_SI989 +MMWB0_SX179 +MMWB0_SX269 +MMWB0_SX359 +MMWB0_SX449 +MMWB0_SX89 +MMWS0_SI1518 +MMWS0_SI559 +MMWS0_SI888 +MMWS0_SX168 +MMWS0_SX258 +MMWS0_SX348 +MMWS0_SX438 +MMWS0_SX78 +MMWS1_SI1071 +MMWS1_SI1701 +MMWS1_SI2331 +MMWS1_SX261 +MMWS1_SX27 +MMWS1_SX351 +MMWS1_SX441 +MMWS1_SX81 +MMXS0_SI2136 +MMXS0_SI629 +MMXS0_SI876 +MMXS0_SX156 +MMXS0_SX246 +MMXS0_SX336 +MMXS0_SX426 +MMXS0_SX66 +MNET0_SI1446 +MNET0_SI2076 +MNET0_SI816 +MNET0_SX186 +MNET0_SX276 +MNET0_SX366 +MNET0_SX6 +MNET0_SX96 +MNTW0_SI1068 +MNTW0_SI1698 +MNTW0_SI2328 +MNTW0_SX168 +MNTW0_SX202 +MNTW0_SX258 +MNTW0_SX348 +MNTW0_SX78 +MPAR0_SI1576 +MPAR0_SI2206 +MPAR0_SI946 +MPAR0_SX136 +MPAR0_SX226 +MPAR0_SX316 +MPAR0_SX406 +MPAR0_SX46 +MPEB0_SI1034 +MPEB0_SI1860 +MPEB0_SI600 +MPEB0_SX150 +MPEB0_SX240 +MPEB0_SX330 +MPEB0_SX420 +MPEB0_SX60 +MPFU0_SI1258 
+MPFU0_SI1888 +MPFU0_SI628 +MPFU0_SX178 +MPFU0_SX268 +MPFU0_SX358 +MPFU0_SX448 +MPFU0_SX88 +MPGH0_SI1554 +MPGH0_SI675 +MPGH0_SI924 +MPGH0_SX114 +MPGH0_SX204 +MPGH0_SX24 +MPGH0_SX294 +MPGH0_SX384 +MPGR0_SI1410 +MPGR0_SI2040 +MPGR0_SI780 +MPGR0_SX150 +MPGR0_SX240 +MPGR0_SX330 +MPGR0_SX420 +MPGR0_SX60 +MPGR1_SI1269 +MPGR1_SI1499 +MPGR1_SI2129 +MPGR1_SX149 +MPGR1_SX239 +MPGR1_SX329 +MPGR1_SX419 +MPGR1_SX59 +MPMB0_SI1501 +MPMB0_SI2131 +MPMB0_SI871 +MPMB0_SX151 +MPMB0_SX241 +MPMB0_SX331 +MPMB0_SX421 +MPMB0_SX61 +MPPC0_SI1412 +MPPC0_SI2042 +MPPC0_SI782 +MPPC0_SX152 +MPPC0_SX242 +MPPC0_SX332 +MPPC0_SX422 +MPPC0_SX62 +MPRB0_SI1205 +MPRB0_SI1215 +MPRB0_SI575 +MPRB0_SX125 +MPRB0_SX215 +MPRB0_SX305 +MPRB0_SX35 +MPRB0_SX395 +MPRD0_SI1431 +MPRD0_SI2061 +MPRD0_SI801 +MPRD0_SX171 +MPRD0_SX261 +MPRD0_SX351 +MPRD0_SX441 +MPRD0_SX81 +MPRK0_SI1097 +MPRK0_SI1727 +MPRK0_SI467 +MPRK0_SX107 +MPRK0_SX17 +MPRK0_SX197 +MPRK0_SX287 +MPRK0_SX377 +MPRT0_SI1210 +MPRT0_SI495 +MPRT0_SI580 +MPRT0_SX130 +MPRT0_SX220 +MPRT0_SX310 +MPRT0_SX40 +MPRT0_SX400 +MPSW0_SI1067 +MPSW0_SI1697 +MPSW0_SI2327 +MPSW0_SX167 +MPSW0_SX24 +MPSW0_SX257 +MPSW0_SX437 +MPSW0_SX77 +MRAB0_SI1224 +MRAB0_SI1854 +MRAB0_SI594 +MRAB0_SX144 +MRAB0_SX234 +MRAB0_SX324 +MRAB0_SX414 +MRAB0_SX54 +MRAB1_SI1478 +MRAB1_SI2108 +MRAB1_SI848 +MRAB1_SX128 +MRAB1_SX218 +MRAB1_SX308 +MRAB1_SX38 +MRAB1_SX398 +MRAI0_SI1954 +MRAI0_SI2052 +MRAI0_SI792 +MRAI0_SX162 +MRAI0_SX252 +MRAI0_SX342 +MRAI0_SX432 +MRAI0_SX72 +MRAM0_SI1275 +MRAM0_SI1905 +MRAM0_SI1951 +MRAM0_SX105 +MRAM0_SX15 +MRAM0_SX195 +MRAM0_SX285 +MRAM0_SX375 +MRAV0_SI1008 +MRAV0_SI1638 +MRAV0_SI2268 +MRAV0_SX108 +MRAV0_SX18 +MRAV0_SX198 +MRAV0_SX288 +MRAV0_SX378 +MRBC0_SI1665 +MRBC0_SI1859 +MRBC0_SI599 +MRBC0_SX149 +MRBC0_SX239 +MRBC0_SX329 +MRBC0_SX419 +MRBC0_SX59 +MRCG0_SI1428 +MRCG0_SI2058 +MRCG0_SI798 +MRCG0_SX168 +MRCG0_SX258 +MRCG0_SX348 +MRCG0_SX438 +MRCG0_SX78 +MRCW0_SI1371 +MRCW0_SI2001 +MRCW0_SI741 +MRCW0_SX111 +MRCW0_SX201 +MRCW0_SX21 +MRCW0_SX291 +MRCW0_SX381 +MRDD0_SI1050 +MRDD0_SI1680 +MRDD0_SI2310 +MRDD0_SX150 +MRDD0_SX240 +MRDD0_SX277 +MRDD0_SX330 +MRDD0_SX60 +MRDM0_SI1044 +MRDM0_SI1595 +MRDM0_SI965 +MRDM0_SX155 +MRDM0_SX245 +MRDM0_SX335 +MRDM0_SX425 +MRDM0_SX65 +MRDS0_SI1167 +MRDS0_SI1797 +MRDS0_SI537 +MRDS0_SX177 +MRDS0_SX267 +MRDS0_SX357 +MRDS0_SX447 +MRDS0_SX87 +MREE0_SI1104 +MREE0_SI1734 +MREE0_SI1959 +MREE0_SX114 +MREE0_SX204 +MREE0_SX24 +MREE0_SX294 +MREE0_SX384 +MREH1_SI1599 +MREH1_SI2229 +MREH1_SI969 +MREH1_SX159 +MREH1_SX249 +MREH1_SX339 +MREH1_SX429 +MREH1_SX69 +MREM0_SI1591 +MREM0_SI511 +MREM0_SI961 +MREM0_SX151 +MREM0_SX241 +MREM0_SX331 +MREM0_SX421 +MREM0_SX61 +MREW1_SI1500 +MREW1_SI2130 +MREW1_SI870 +MREW1_SX150 +MREW1_SX240 +MREW1_SX330 +MREW1_SX420 +MREW1_SX60 +MRFK0_SI1076 +MRFK0_SI1706 +MRFK0_SI2336 +MRFK0_SX176 +MRFK0_SX266 +MRFK0_SX356 +MRFK0_SX446 +MRFK0_SX86 +MRFL0_SI1156 +MRFL0_SI1786 +MRFL0_SI526 +MRFL0_SX166 +MRFL0_SX256 +MRFL0_SX346 +MRFL0_SX436 +MRFL0_SX76 +MRGM0_SI1162 +MRGM0_SI1792 +MRGM0_SI532 +MRGM0_SX172 +MRGM0_SX262 +MRGM0_SX416 +MRGM0_SX442 +MRGM0_SX82 +MRGS0_SI1356 +MRGS0_SI1986 +MRGS0_SI726 +MRGS0_SX186 +MRGS0_SX276 +MRGS0_SX366 +MRGS0_SX6 +MRGS0_SX96 +MRHL0_SI1515 +MRHL0_SI2145 +MRHL0_SI885 +MRHL0_SX165 +MRHL0_SX255 +MRHL0_SX345 +MRHL0_SX435 +MRHL0_SX75 +MRJB1_SI1020 +MRJB1_SI1413 +MRJB1_SI2021 +MRJB1_SX120 +MRJB1_SX210 +MRJB1_SX30 +MRJB1_SX300 +MRJB1_SX390 +MRJH0_SI1519 +MRJH0_SI889 +MRJH0_SI914 +MRJH0_SX169 +MRJH0_SX259 +MRJH0_SX307 +MRJH0_SX439 +MRJH0_SX79 +MRJM0_SI1095 +MRJM0_SI1228 +MRJM0_SI1858 +MRJM0_SX148 +MRJM0_SX238 +MRJM0_SX328 +MRJM0_SX418 
+MRJM0_SX58 +MRJM1_SI1298 +MRJM1_SI1928 +MRJM1_SI668 +MRJM1_SX128 +MRJM1_SX218 +MRJM1_SX308 +MRJM1_SX38 +MRJM1_SX398 +MRJT0_SI1498 +MRJT0_SI1805 +MRJT0_SI868 +MRJT0_SX148 +MRJT0_SX238 +MRJT0_SX328 +MRJT0_SX418 +MRJT0_SX58 +MRKM0_SI1267 +MRKM0_SI1391 +MRKM0_SI637 +MRKM0_SX187 +MRKM0_SX277 +MRKM0_SX367 +MRKM0_SX7 +MRKM0_SX97 +MRLD0_SI1594 +MRLD0_SI2224 +MRLD0_SI964 +MRLD0_SX154 +MRLD0_SX244 +MRLD0_SX334 +MRLD0_SX424 +MRLD0_SX64 +MRLJ0_SI1420 +MRLJ0_SI2050 +MRLJ0_SI790 +MRLJ0_SX160 +MRLJ0_SX250 +MRLJ0_SX340 +MRLJ0_SX430 +MRLJ0_SX70 +MRLJ1_SI1671 +MRLJ1_SI2301 +MRLJ1_SI2332 +MRLJ1_SX141 +MRLJ1_SX231 +MRLJ1_SX321 +MRLJ1_SX411 +MRLJ1_SX51 +MRLK0_SI1468 +MRLK0_SI2140 +MRLK0_SI843 +MRLK0_SX123 +MRLK0_SX213 +MRLK0_SX303 +MRLK0_SX33 +MRLK0_SX393 +MRLR0_SI1196 +MRLR0_SI1826 +MRLR0_SI566 +MRLR0_SX116 +MRLR0_SX206 +MRLR0_SX26 +MRLR0_SX296 +MRLR0_SX386 +MRMB0_SI1581 +MRMB0_SI2211 +MRMB0_SI951 +MRMB0_SX141 +MRMB0_SX231 +MRMB0_SX321 +MRMB0_SX411 +MRMB0_SX51 +MRMG0_SI1080 +MRMG0_SI1710 +MRMG0_SI2340 +MRMG0_SX180 +MRMG0_SX270 +MRMG0_SX360 +MRMG0_SX450 +MRMG0_SX90 +MRMH0_SI1021 +MRMH0_SI1349 +MRMH0_SI2281 +MRMH0_SX121 +MRMH0_SX211 +MRMH0_SX301 +MRMH0_SX31 +MRMH0_SX391 +MRML0_SI1421 +MRML0_SI2051 +MRML0_SI791 +MRML0_SX161 +MRML0_SX251 +MRML0_SX341 +MRML0_SX431 +MRML0_SX71 +MRMS0_SI1113 +MRMS0_SI2057 +MRMS0_SI2100 +MRMS0_SX120 +MRMS0_SX210 +MRMS0_SX30 +MRMS0_SX300 +MRMS0_SX390 +MRPC1_SI1482 +MRPC1_SI2026 +MRPC1_SI2112 +MRPC1_SX132 +MRPC1_SX222 +MRPC1_SX312 +MRPC1_SX402 +MRPC1_SX42 +MRRE0_SI1334 +MRRE0_SI704 +MRRE0_SI952 +MRRE0_SX164 +MRRE0_SX254 +MRRE0_SX344 +MRRE0_SX434 +MRRE0_SX74 +MRSO0_SI1206 +MRSO0_SI1659 +MRSO0_SI2289 +MRSO0_SX129 +MRSO0_SX219 +MRSO0_SX309 +MRSO0_SX39 +MRSO0_SX399 +MRSP0_SI1429 +MRSP0_SI2059 +MRSP0_SI799 +MRSP0_SX169 +MRSP0_SX196 +MRSP0_SX259 +MRSP0_SX439 +MRSP0_SX79 +MRTC0_SI1458 +MRTC0_SI2088 +MRTC0_SI828 +MRTC0_SX108 +MRTC0_SX18 +MRTC0_SX198 +MRTC0_SX288 +MRTC0_SX378 +MRTJ0_SI1551 +MRTJ0_SI2032 +MRTJ0_SI772 +MRTJ0_SX142 +MRTJ0_SX232 +MRTJ0_SX322 +MRTJ0_SX412 +MRTJ0_SX52 +MRVG0_SI1140 +MRVG0_SI1770 +MRVG0_SI510 +MRVG0_SX150 +MRVG0_SX240 +MRVG0_SX330 +MRVG0_SX420 +MRVG0_SX60 +MRWA0_SI1603 +MRWA0_SI2233 +MRWA0_SI973 +MRWA0_SX163 +MRWA0_SX253 +MRWA0_SX343 +MRWA0_SX433 +MRWA0_SX73 +MRWS0_SI1102 +MRWS0_SI1732 +MRWS0_SI472 +MRWS0_SX112 +MRWS0_SX202 +MRWS0_SX22 +MRWS0_SX292 +MRWS0_SX382 +MRXB0_SI1585 +MRXB0_SI2215 +MRXB0_SI955 +MRXB0_SX145 +MRXB0_SX235 +MRXB0_SX325 +MRXB0_SX415 +MRXB0_SX55 +MSAH1_SI1049 +MSAH1_SI1679 +MSAH1_SI2309 +MSAH1_SX149 +MSAH1_SX239 +MSAH1_SX329 +MSAH1_SX419 +MSAH1_SX59 +MSAS0_SI1376 +MSAS0_SI2006 +MSAS0_SI746 +MSAS0_SX116 +MSAS0_SX206 +MSAS0_SX26 +MSAS0_SX296 +MSAS0_SX386 +MSAT0_SI1526 +MSAT0_SI2156 +MSAT0_SI896 +MSAT0_SX176 +MSAT0_SX266 +MSAT0_SX356 +MSAT0_SX446 +MSAT0_SX86 +MSAT1_SI1073 +MSAT1_SI1703 +MSAT1_SI2333 +MSAT1_SX173 +MSAT1_SX263 +MSAT1_SX353 +MSAT1_SX443 +MSAT1_SX83 +MSDB0_SI1007 +MSDB0_SI1637 +MSDB0_SI2267 +MSDB0_SX107 +MSDB0_SX17 +MSDB0_SX197 +MSDB0_SX287 +MSDB0_SX377 +MSDH0_SI2113 +MSDH0_SI2240 +MSDH0_SI980 +MSDH0_SX170 +MSDH0_SX260 +MSDH0_SX350 +MSDH0_SX440 +MSDH0_SX80 +MSDS0_SI1077 +MSDS0_SI1707 +MSDS0_SI2337 +MSDS0_SX177 +MSDS0_SX267 +MSDS0_SX357 +MSDS0_SX447 +MSDS0_SX87 +MSEM1_SI1440 +MSEM1_SI2070 +MSEM1_SI810 +MSEM1_SX180 +MSEM1_SX270 +MSEM1_SX360 +MSEM1_SX450 +MSEM1_SX90 +MSES0_SI1589 +MSES0_SI2216 +MSES0_SI2219 +MSES0_SX149 +MSES0_SX239 +MSES0_SX329 +MSES0_SX419 +MSES0_SX59 +MSFH0_SI1216 +MSFH0_SI1738 +MSFH0_SI586 +MSFH0_SX136 +MSFH0_SX226 +MSFH0_SX316 +MSFH0_SX406 +MSFH0_SX46 +MSFV0_SI1262 +MSFV0_SI1892 +MSFV0_SI632 +MSFV0_SX182 +MSFV0_SX272 
+MSFV0_SX362 +MSFV0_SX452 +MSFV0_SX92 +MSJK0_SI1596 +MSJK0_SI2226 +MSJK0_SI966 +MSJK0_SX156 +MSJK0_SX246 +MSJK0_SX336 +MSJK0_SX426 +MSJK0_SX66 +MSMC0_SI1907 +MSMC0_SI509 +MSMC0_SI647 +MSMC0_SX107 +MSMC0_SX17 +MSMC0_SX197 +MSMC0_SX287 +MSMC0_SX377 +MSMR0_SI1150 +MSMR0_SI1405 +MSMR0_SI775 +MSMR0_SX145 +MSMR0_SX235 +MSMR0_SX325 +MSMR0_SX415 +MSMR0_SX55 +MSMS0_SI1433 +MSMS0_SI2063 +MSMS0_SI803 +MSMS0_SX173 +MSMS0_SX263 +MSMS0_SX353 +MSMS0_SX443 +MSMS0_SX83 +MSRG0_SI1221 +MSRG0_SI1851 +MSRG0_SI591 +MSRG0_SX141 +MSRG0_SX231 +MSRG0_SX321 +MSRG0_SX411 +MSRG0_SX51 +MSRR0_SI1131 +MSRR0_SI1761 +MSRR0_SI501 +MSRR0_SX141 +MSRR0_SX231 +MSRR0_SX30 +MSRR0_SX411 +MSRR0_SX51 +MSTF0_SI1396 +MSTF0_SI766 +MSTF0_SI852 +MSTF0_SX136 +MSTF0_SX226 +MSTF0_SX316 +MSTF0_SX406 +MSTF0_SX46 +MSVS0_SI1568 +MSVS0_SI2198 +MSVS0_SI938 +MSVS0_SX128 +MSVS0_SX218 +MSVS0_SX308 +MSVS0_SX38 +MSVS0_SX398 +MTAB0_SI1572 +MTAB0_SI2202 +MTAB0_SI942 +MTAB0_SX132 +MTAB0_SX222 +MTAB0_SX312 +MTAB0_SX402 +MTAB0_SX42 +MTAS0_SI1385 +MTAS0_SI2015 +MTAS0_SI755 +MTAS0_SX125 +MTAS0_SX215 +MTAS0_SX305 +MTAS0_SX35 +MTAS0_SX395 +MTAT0_SI1110 +MTAT0_SI1740 +MTAT0_SI811 +MTAT0_SX120 +MTAT0_SX210 +MTAT0_SX30 +MTAT0_SX300 +MTAT0_SX390 +MTAT1_SI1409 +MTAT1_SI1627 +MTAT1_SI779 +MTAT1_SX149 +MTAT1_SX239 +MTAT1_SX329 +MTAT1_SX419 +MTAT1_SX59 +MTBC0_SI1173 +MTBC0_SI1803 +MTBC0_SI543 +MTBC0_SX183 +MTBC0_SX273 +MTBC0_SX347 +MTBC0_SX363 +MTBC0_SX93 +MTCS0_SI1972 +MTCS0_SI2265 +MTCS0_SI712 +MTCS0_SX172 +MTCS0_SX262 +MTCS0_SX352 +MTCS0_SX442 +MTCS0_SX82 +MTDB0_SI1401 +MTDB0_SI2031 +MTDB0_SI771 +MTDB0_SX141 +MTDB0_SX231 +MTDB0_SX321 +MTDB0_SX411 +MTDB0_SX51 +MTDP0_SI1274 +MTDP0_SI1521 +MTDP0_SI2151 +MTDP0_SX171 +MTDP0_SX261 +MTDP0_SX351 +MTDP0_SX441 +MTDP0_SX81 +MTER0_SI1157 +MTER0_SI1787 +MTER0_SI527 +MTER0_SX167 +MTER0_SX17 +MTER0_SX257 +MTER0_SX437 +MTER0_SX77 +MTJG0_SI1520 +MTJG0_SI2157 +MTJG0_SI890 +MTJG0_SX170 +MTJG0_SX260 +MTJG0_SX350 +MTJG0_SX440 +MTJG0_SX80 +MTJM0_SI1226 +MTJM0_SI1856 +MTJM0_SI655 +MTJM0_SX146 +MTJM0_SX236 +MTJM0_SX326 +MTJM0_SX416 +MTJM0_SX56 +MTJS0_SI1192 +MTJS0_SI1822 +MTJS0_SI562 +MTJS0_SX112 +MTJS0_SX202 +MTJS0_SX22 +MTJS0_SX292 +MTJS0_SX382 +MTJU0_SI2020 +MTJU0_SI2269 +MTJU0_SI760 +MTJU0_SX130 +MTJU0_SX220 +MTJU0_SX310 +MTJU0_SX40 +MTJU0_SX400 +MTKD0_SI1187 +MTKD0_SI1817 +MTKD0_SI630 +MTKD0_SX107 +MTKD0_SX17 +MTKD0_SX197 +MTKD0_SX287 +MTKD0_SX377 +MTKP0_SI1023 +MTKP0_SI2283 +MTKP0_SI454 +MTKP0_SX123 +MTKP0_SX213 +MTKP0_SX303 +MTKP0_SX33 +MTKP0_SX393 +MTLB0_SI1134 +MTLB0_SI1764 +MTLB0_SI504 +MTLB0_SX144 +MTLB0_SX234 +MTLB0_SX324 +MTLB0_SX414 +MTLB0_SX54 +MTLC0_SI1313 +MTLC0_SI1477 +MTLC0_SI847 +MTLC0_SX127 +MTLC0_SX217 +MTLC0_SX307 +MTLC0_SX37 +MTLC0_SX397 +MTML0_SI1065 +MTML0_SI1695 +MTML0_SI2325 +MTML0_SX165 +MTML0_SX255 +MTML0_SX345 +MTML0_SX435 +MTML0_SX75 +MTMN0_SI1064 +MTMN0_SI2324 +MTMN0_SI582 +MTMN0_SX164 +MTMN0_SX254 +MTMN0_SX344 +MTMN0_SX434 +MTMN0_SX74 +MTMT0_SI1118 +MTMT0_SI1748 +MTMT0_SI488 +MTMT0_SX128 +MTMT0_SX218 +MTMT0_SX308 +MTMT0_SX38 +MTMT0_SX398 +MTPF0_SI1235 +MTPF0_SI1865 +MTPF0_SI605 +MTPF0_SX155 +MTPF0_SX245 +MTPF0_SX335 +MTPF0_SX425 +MTPF0_SX65 +MTPG0_SI1383 +MTPG0_SI2013 +MTPG0_SI753 +MTPG0_SX123 +MTPG0_SX213 +MTPG0_SX303 +MTPG0_SX33 +MTPG0_SX393 +MTPP0_SI1508 +MTPP0_SI2138 +MTPP0_SI878 +MTPP0_SX158 +MTPP0_SX248 +MTPP0_SX338 +MTPP0_SX428 +MTPP0_SX68 +MTPR0_SI1600 +MTPR0_SI2230 +MTPR0_SI506 +MTPR0_SX160 +MTPR0_SX250 +MTPR0_SX340 +MTPR0_SX430 +MTPR0_SX70 +MTQC0_SI1441 +MTQC0_SI2071 +MTQC0_SI480 +MTQC0_SX181 +MTQC0_SX271 +MTQC0_SX361 +MTQC0_SX451 +MTQC0_SX91 +MTRC0_SI1623 +MTRC0_SI589 +MTRC0_SI993 +MTRC0_SX170 
+MTRC0_SX183 +MTRC0_SX273 +MTRC0_SX363 +MTRC0_SX93 +MTRR0_SI1548 +MTRR0_SI2178 +MTRR0_SI918 +MTRR0_SX108 +MTRR0_SX18 +MTRR0_SX198 +MTRR0_SX288 +MTRR0_SX378 +MTRT0_SI1227 +MTRT0_SI1857 +MTRT0_SI597 +MTRT0_SX147 +MTRT0_SX237 +MTRT0_SX254 +MTRT0_SX417 +MTRT0_SX57 +MTWH1_SI1512 +MTWH1_SI2142 +MTWH1_SI882 +MTWH1_SX162 +MTWH1_SX252 +MTWH1_SX342 +MTWH1_SX432 +MTWH1_SX72 +MTXS0_SI1060 +MTXS0_SI1690 +MTXS0_SI2320 +MTXS0_SX160 +MTXS0_SX250 +MTXS0_SX340 +MTXS0_SX430 +MTXS0_SX70 +MVJH0_SI1556 +MVJH0_SI2186 +MVJH0_SI926 +MVJH0_SX116 +MVJH0_SX206 +MVJH0_SX26 +MVJH0_SX296 +MVJH0_SX386 +MVLO0_SI1147 +MVLO0_SI1777 +MVLO0_SI517 +MVLO0_SX157 +MVLO0_SX247 +MVLO0_SX337 +MVLO0_SX427 +MVLO0_SX67 +MVRW0_SI1485 +MVRW0_SI2115 +MVRW0_SI855 +MVRW0_SX135 +MVRW0_SX225 +MVRW0_SX315 +MVRW0_SX405 +MVRW0_SX45 +MWAC0_SI1601 +MWAC0_SI2231 +MWAC0_SI971 +MWAC0_SX161 +MWAC0_SX251 +MWAC0_SX341 +MWAC0_SX431 +MWAC0_SX71 +MWAD0_SI1062 +MWAD0_SI1749 +MWAD0_SI2322 +MWAD0_SX162 +MWAD0_SX252 +MWAD0_SX342 +MWAD0_SX432 +MWAD0_SX72 +MWAR0_SI1045 +MWAR0_SI1675 +MWAR0_SI2305 +MWAR0_SX145 +MWAR0_SX235 +MWAR0_SX325 +MWAR0_SX415 +MWAR0_SX55 +MWCH0_SI1622 +MWCH0_SI1895 +MWCH0_SI2252 +MWCH0_SX182 +MWCH0_SX272 +MWCH0_SX362 +MWCH0_SX452 +MWCH0_SX92 +MWDK0_SI1436 +MWDK0_SI2017 +MWDK0_SI806 +MWDK0_SX176 +MWDK0_SX266 +MWDK0_SX356 +MWDK0_SX446 +MWDK0_SX86 +MWEM0_SI1320 +MWEM0_SI1393 +MWEM0_SI1950 +MWEM0_SX150 +MWEM0_SX240 +MWEM0_SX330 +MWEM0_SX420 +MWEM0_SX60 +MWGR0_SI1606 +MWGR0_SI2236 +MWGR0_SI976 +MWGR0_SX166 +MWGR0_SX256 +MWGR0_SX346 +MWGR0_SX436 +MWGR0_SX76 +MWRE0_SI1057 +MWRE0_SI1687 +MWRE0_SI2317 +MWRE0_SX157 +MWRE0_SX247 +MWRE0_SX337 +MWRE0_SX427 +MWRE0_SX67 +MWRP0_SI1443 +MWRP0_SI1525 +MWRP0_SI2073 +MWRP0_SX183 +MWRP0_SX273 +MWRP0_SX3 +MWRP0_SX363 +MWRP0_SX93 +MWSB0_SI1626 +MWSB0_SI2256 +MWSB0_SI996 +MWSB0_SX186 +MWSB0_SX276 +MWSB0_SX366 +MWSB0_SX6 +MWSB0_SX96 +MWSH0_SI1426 +MWSH0_SI2266 +MWSH0_SI796 +MWSH0_SX166 +MWSH0_SX256 +MWSH0_SX346 +MWSH0_SX436 +MWSH0_SX76 +MZMB0_SI1166 +MZMB0_SI1796 +MZMB0_SI536 +MZMB0_SX176 +MZMB0_SX266 +MZMB0_SX356 +MZMB0_SX446 +MZMB0_SX86 diff --git a/examples/wav2vec/unsupervised/config/timit_matched/train_text.uid b/examples/wav2vec/unsupervised/config/timit_matched/train_text.uid new file mode 100644 index 0000000000..c39fd0b91d --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_matched/train_text.uid @@ -0,0 +1,3696 @@ +FAEM0_SI1392 +FAEM0_SI2022 +FAEM0_SI762 +FAEM0_SX132 +FAEM0_SX222 +FAEM0_SX312 +FAEM0_SX402 +FAEM0_SX42 +FAJW0_SI1263 +FAJW0_SI1893 +FAJW0_SI633 +FAJW0_SX183 +FAJW0_SX273 +FAJW0_SX3 +FAJW0_SX363 +FAJW0_SX93 +FALK0_SI1086 +FALK0_SI456 +FALK0_SI658 +FALK0_SX186 +FALK0_SX276 +FALK0_SX366 +FALK0_SX6 +FALK0_SX96 +FALR0_SI1325 +FALR0_SI1955 +FALR0_SI695 +FALR0_SX155 +FALR0_SX245 +FALR0_SX335 +FALR0_SX425 +FALR0_SX65 +FAPB0_SI1063 +FAPB0_SI1693 +FAPB0_SI2323 +FAPB0_SX163 +FAPB0_SX253 +FAPB0_SX343 +FAPB0_SX433 +FAPB0_SX73 +FBAS0_SI1387 +FBAS0_SI1472 +FBAS0_SI2066 +FBAS0_SX127 +FBAS0_SX217 +FBAS0_SX307 +FBAS0_SX37 +FBAS0_SX397 +FBCG1_SI1612 +FBCG1_SI2242 +FBCG1_SI982 +FBCG1_SX172 +FBCG1_SX262 +FBCG1_SX352 +FBCG1_SX442 +FBCG1_SX82 +FBCH0_SI1586 +FBCH0_SI956 +FBCH0_SI959 +FBCH0_SX146 +FBCH0_SX236 +FBCH0_SX326 +FBCH0_SX416 +FBCH0_SX56 +FBJL0_SI1552 +FBJL0_SI2182 +FBJL0_SI922 +FBJL0_SX112 +FBJL0_SX202 +FBJL0_SX22 +FBJL0_SX292 +FBJL0_SX382 +FBLV0_SI1058 +FBLV0_SI1688 +FBLV0_SI2318 +FBLV0_SX158 +FBLV0_SX248 +FBLV0_SX338 +FBLV0_SX428 +FBLV0_SX68 +FBMH0_SI1136 +FBMH0_SI1766 +FBMH0_SI970 +FBMH0_SX146 +FBMH0_SX236 +FBMH0_SX326 +FBMH0_SX416 +FBMH0_SX56 +FBMJ0_SI1776 +FBMJ0_SI516 +FBMJ0_SI815 
+FBMJ0_SX156 +FBMJ0_SX246 +FBMJ0_SX336 +FBMJ0_SX426 +FBMJ0_SX66 +FCAG0_SI1503 +FCAG0_SI1641 +FCAG0_SI2133 +FCAG0_SX153 +FCAG0_SX243 +FCAG0_SX333 +FCAG0_SX423 +FCAG0_SX63 +FCAJ0_SI1479 +FCAJ0_SI1804 +FCAJ0_SI849 +FCAJ0_SX129 +FCAJ0_SX219 +FCAJ0_SX309 +FCAJ0_SX39 +FCAJ0_SX399 +FCDR1_SI1186 +FCDR1_SI1816 +FCDR1_SI556 +FCDR1_SX106 +FCDR1_SX16 +FCDR1_SX196 +FCDR1_SX286 +FCDR1_SX376 +FCEG0_SI1248 +FCEG0_SI1878 +FCEG0_SI618 +FCEG0_SX168 +FCEG0_SX258 +FCEG0_SX348 +FCEG0_SX438 +FCEG0_SX78 +FCJF0_SI1027 +FCJF0_SI1657 +FCJF0_SI648 +FCJF0_SX127 +FCJF0_SX217 +FCJF0_SX307 +FCJF0_SX37 +FCJF0_SX397 +FCJS0_SI1607 +FCJS0_SI2237 +FCJS0_SI977 +FCJS0_SX167 +FCJS0_SX257 +FCJS0_SX347 +FCJS0_SX437 +FCJS0_SX77 +FCKE0_SI1111 +FCKE0_SI1741 +FCKE0_SI481 +FCKE0_SX121 +FCKE0_SX211 +FCKE0_SX301 +FCKE0_SX31 +FCKE0_SX391 +FCLT0_SI1438 +FCLT0_SI2068 +FCLT0_SI808 +FCLT0_SX178 +FCLT0_SX268 +FCLT0_SX358 +FCLT0_SX448 +FCLT0_SX88 +FCMG0_SI1142 +FCMG0_SI1242 +FCMG0_SI1872 +FCMG0_SX162 +FCMG0_SX252 +FCMG0_SX342 +FCMG0_SX432 +FCMG0_SX72 +FCMM0_SI1083 +FCMM0_SI1957 +FCMM0_SI453 +FCMM0_SX183 +FCMM0_SX273 +FCMM0_SX363 +FCMM0_SX420 +FCMM0_SX93 +FCRZ0_SI1913 +FCRZ0_SI2053 +FCRZ0_SI793 +FCRZ0_SX163 +FCRZ0_SX253 +FCRZ0_SX343 +FCRZ0_SX433 +FCRZ0_SX73 +FCYL0_SI1297 +FCYL0_SI1927 +FCYL0_SI667 +FCYL0_SX127 +FCYL0_SX217 +FCYL0_SX349 +FCYL0_SX37 +FCYL0_SX397 +FDAS1_SI1461 +FDAS1_SI2091 +FDAS1_SI831 +FDAS1_SX111 +FDAS1_SX201 +FDAS1_SX21 +FDAS1_SX291 +FDAS1_SX381 +FDAW0_SI1271 +FDAW0_SI1406 +FDAW0_SI2036 +FDAW0_SX146 +FDAW0_SX236 +FDAW0_SX326 +FDAW0_SX416 +FDAW0_SX56 +FDFB0_SI1318 +FDFB0_SI1948 +FDFB0_SI2010 +FDFB0_SX148 +FDFB0_SX238 +FDFB0_SX328 +FDFB0_SX418 +FDFB0_SX58 +FDJH0_SI1565 +FDJH0_SI2195 +FDJH0_SI935 +FDJH0_SX125 +FDJH0_SX215 +FDJH0_SX305 +FDJH0_SX35 +FDJH0_SX395 +FDKN0_SI1081 +FDKN0_SI1202 +FDKN0_SI1711 +FDKN0_SX181 +FDKN0_SX271 +FDKN0_SX361 +FDKN0_SX451 +FDKN0_SX91 +FDML0_SI1149 +FDML0_SI1779 +FDML0_SI2075 +FDML0_SX159 +FDML0_SX249 +FDML0_SX339 +FDML0_SX429 +FDML0_SX69 +FDMY0_SI1197 +FDMY0_SI567 +FDMY0_SI714 +FDMY0_SX117 +FDMY0_SX207 +FDMY0_SX27 +FDMY0_SX297 +FDMY0_SX387 +FDNC0_SI1278 +FDNC0_SI1908 +FDNC0_SI2287 +FDNC0_SX108 +FDNC0_SX18 +FDNC0_SX198 +FDNC0_SX288 +FDNC0_SX378 +FDTD0_SI1561 +FDTD0_SI2191 +FDTD0_SI931 +FDTD0_SX121 +FDTD0_SX211 +FDTD0_SX301 +FDTD0_SX321 +FDTD0_SX391 +FDXW0_SI1511 +FDXW0_SI2141 +FDXW0_SI881 +FDXW0_SX161 +FDXW0_SX251 +FDXW0_SX341 +FDXW0_SX431 +FDXW0_SX71 +FEAC0_SI1245 +FEAC0_SI1875 +FEAC0_SI615 +FEAC0_SX165 +FEAC0_SX255 +FEAC0_SX345 +FEAC0_SX435 +FEAC0_SX75 +FEAR0_SI1252 +FEAR0_SI1882 +FEAR0_SI622 +FEAR0_SX172 +FEAR0_SX262 +FEAR0_SX352 +FEAR0_SX442 +FEAR0_SX82 +FECD0_SI1418 +FECD0_SI2048 +FECD0_SI788 +FECD0_SX158 +FECD0_SX248 +FECD0_SX338 +FECD0_SX428 +FECD0_SX68 +FEEH0_SI1112 +FEEH0_SI1742 +FEEH0_SI471 +FEEH0_SX122 +FEEH0_SX212 +FEEH0_SX302 +FEEH0_SX32 +FEEH0_SX392 +FEME0_SI1505 +FEME0_SI2135 +FEME0_SI875 +FEME0_SX155 +FEME0_SX245 +FEME0_SX335 +FEME0_SX425 +FEME0_SX65 +FETB0_SI1148 +FETB0_SI1778 +FETB0_SI518 +FETB0_SX158 +FETB0_SX248 +FETB0_SX338 +FETB0_SX428 +FETB0_SX68 +FEXM0_SI1101 +FEXM0_SI1731 +FEXM0_SI482 +FEXM0_SX111 +FEXM0_SX201 +FEXM0_SX291 +FEXM0_SX366 +FEXM0_SX381 +FGCS0_SI1486 +FGCS0_SI2116 +FGCS0_SI856 +FGCS0_SX136 +FGCS0_SX226 +FGCS0_SX316 +FGCS0_SX406 +FGCS0_SX46 +FGDP0_SI1618 +FGDP0_SI2248 +FGDP0_SI988 +FGDP0_SX178 +FGDP0_SX268 +FGDP0_SX358 +FGDP0_SX448 +FGDP0_SX88 +FGMB0_SI1145 +FGMB0_SI1775 +FGMB0_SI515 +FGMB0_SX155 +FGMB0_SX245 +FGMB0_SX335 +FGMB0_SX425 +FGMB0_SX65 +FGRW0_SI1152 +FGRW0_SI1782 +FGRW0_SI1990 +FGRW0_SX162 +FGRW0_SX252 +FGRW0_SX342 +FGRW0_SX432 +FGRW0_SX72 +FHLM0_SI1560 
+FHLM0_SI2190 +FHLM0_SI930 +FHLM0_SX120 +FHLM0_SX210 +FHLM0_SX300 +FHLM0_SX349 +FHLM0_SX390 +FHXS0_SI1075 +FHXS0_SI2302 +FHXS0_SI2335 +FHXS0_SX175 +FHXS0_SX265 +FHXS0_SX355 +FHXS0_SX445 +FHXS0_SX85 +FJDM2_SI1582 +FJDM2_SI1964 +FJDM2_SI2212 +FJDM2_SX142 +FJDM2_SX232 +FJDM2_SX322 +FJDM2_SX412 +FJDM2_SX52 +FJEN0_SI1047 +FJEN0_SI1677 +FJEN0_SI2307 +FJEN0_SX147 +FJEN0_SX237 +FJEN0_SX327 +FJEN0_SX417 +FJEN0_SX57 +FJHK0_SI1022 +FJHK0_SI1652 +FJHK0_SI2282 +FJHK0_SX122 +FJHK0_SX212 +FJHK0_SX302 +FJHK0_SX32 +FJHK0_SX392 +FJKL0_SI1562 +FJKL0_SI2192 +FJKL0_SI932 +FJKL0_SX122 +FJKL0_SX212 +FJKL0_SX302 +FJKL0_SX32 +FJKL0_SX392 +FJLG0_SI1506 +FJLG0_SI1889 +FJLG0_SI2306 +FJLG0_SX179 +FJLG0_SX269 +FJLG0_SX359 +FJLG0_SX449 +FJLG0_SX89 +FJLR0_SI1231 +FJLR0_SI1861 +FJLR0_SI601 +FJLR0_SX151 +FJLR0_SX241 +FJLR0_SX331 +FJLR0_SX421 +FJLR0_SX61 +FJRB0_SI1302 +FJRB0_SI1932 +FJRB0_SI672 +FJRB0_SX132 +FJRB0_SX222 +FJRB0_SX312 +FJRB0_SX402 +FJRB0_SX42 +FJRP1_SI1432 +FJRP1_SI2062 +FJRP1_SI802 +FJRP1_SX172 +FJRP1_SX262 +FJRP1_SX352 +FJRP1_SX442 +FJRP1_SX82 +FJSK0_SI1052 +FJSK0_SI1682 +FJSK0_SI2312 +FJSK0_SX152 +FJSK0_SX242 +FJSK0_SX332 +FJSK0_SX422 +FJSK0_SX62 +FJSP0_SI1434 +FJSP0_SI1763 +FJSP0_SI804 +FJSP0_SX174 +FJSP0_SX264 +FJSP0_SX354 +FJSP0_SX444 +FJSP0_SX84 +FJWB1_SI2055 +FJWB1_SI748 +FJWB1_SI795 +FJWB1_SX165 +FJWB1_SX255 +FJWB1_SX345 +FJWB1_SX435 +FJWB1_SX75 +FJXM0_SI1211 +FJXM0_SI1971 +FJXM0_SI581 +FJXM0_SX131 +FJXM0_SX221 +FJXM0_SX311 +FJXM0_SX401 +FJXM0_SX41 +FJXP0_SI1122 +FJXP0_SI1752 +FJXP0_SI492 +FJXP0_SX132 +FJXP0_SX222 +FJXP0_SX312 +FJXP0_SX402 +FJXP0_SX42 +FKAA0_SI1208 +FKAA0_SI1838 +FKAA0_SI578 +FKAA0_SX128 +FKAA0_SX218 +FKAA0_SX308 +FKAA0_SX38 +FKAA0_SX398 +FKDE0_SI1141 +FKDE0_SI1771 +FKDE0_SI2221 +FKDE0_SX151 +FKDE0_SX241 +FKDE0_SX331 +FKDE0_SX421 +FKDE0_SX61 +FKDW0_SI1207 +FKDW0_SI1891 +FKDW0_SI577 +FKDW0_SX127 +FKDW0_SX217 +FKDW0_SX307 +FKDW0_SX37 +FKDW0_SX397 +FKFB0_SI1608 +FKFB0_SI2238 +FKFB0_SI978 +FKFB0_SX168 +FKFB0_SX258 +FKFB0_SX348 +FKFB0_SX438 +FKFB0_SX78 +FKKH0_SI1290 +FKKH0_SI1920 +FKKH0_SI660 +FKKH0_SX120 +FKKH0_SX210 +FKKH0_SX30 +FKKH0_SX300 +FKKH0_SX390 +FKLC0_SI1615 +FKLC0_SI2245 +FKLC0_SI985 +FKLC0_SX175 +FKLC0_SX265 +FKLC0_SX355 +FKLC0_SX445 +FKLC0_SX85 +FKLC1_SI1048 +FKLC1_SI1678 +FKLC1_SI2308 +FKLC1_SX148 +FKLC1_SX238 +FKLC1_SX328 +FKLC1_SX418 +FKLC1_SX58 +FKLH0_SI1257 +FKLH0_SI1887 +FKLH0_SI627 +FKLH0_SX177 +FKLH0_SX267 +FKLH0_SX357 +FKLH0_SX447 +FKLH0_SX87 +FKSR0_SI1117 +FKSR0_SI1747 +FKSR0_SI487 +FKSR0_SX161 +FKSR0_SX217 +FKSR0_SX366 +FKSR0_SX37 +FKSR0_SX397 +FLAC0_SI1339 +FLAC0_SI2161 +FLAC0_SI901 +FLAC0_SX181 +FLAC0_SX271 +FLAC0_SX361 +FLAC0_SX451 +FLAC0_SX91 +FLAG0_SI1464 +FLAG0_SI2094 +FLAG0_SI834 +FLAG0_SX114 +FLAG0_SX204 +FLAG0_SX24 +FLAG0_SX294 +FLAG0_SX384 +FLEH0_SI1051 +FLEH0_SI1681 +FLEH0_SI2311 +FLEH0_SX151 +FLEH0_SX241 +FLEH0_SX331 +FLEH0_SX421 +FLEH0_SX61 +FLET0_SI1137 +FLET0_SI1767 +FLET0_SI507 +FLET0_SX147 +FLET0_SX237 +FLET0_SX277 +FLET0_SX417 +FLET0_SX57 +FLHD0_SI1344 +FLHD0_SI1827 +FLHD0_SI1974 +FLHD0_SX174 +FLHD0_SX264 +FLHD0_SX354 +FLHD0_SX444 +FLHD0_SX84 +FLJA0_SI1078 +FLJA0_SI1708 +FLJA0_SI2338 +FLJA0_SX178 +FLJA0_SX268 +FLJA0_SX358 +FLJA0_SX448 +FLJA0_SX88 +FLJD0_SI1516 +FLJD0_SI2146 +FLJD0_SI886 +FLJD0_SX166 +FLJD0_SX256 +FLJD0_SX346 +FLJD0_SX436 +FLJD0_SX76 +FLJG0_SI1611 +FLJG0_SI2241 +FLJG0_SI981 +FLJG0_SX171 +FLJG0_SX261 +FLJG0_SX351 +FLJG0_SX441 +FLJG0_SX81 +FLKM0_SI1880 +FLKM0_SI620 +FLKM0_SI686 +FLKM0_SX116 +FLKM0_SX260 +FLKM0_SX350 +FLKM0_SX440 +FLKM0_SX80 +FLMA0_SI1243 +FLMA0_SI1873 +FLMA0_SI613 +FLMA0_SX163 +FLMA0_SX253 +FLMA0_SX343 +FLMA0_SX433 
+FLMA0_SX73 +FLMC0_SI1372 +FLMC0_SI2002 +FLMC0_SI742 +FLMC0_SX112 +FLMC0_SX22 +FLMC0_SX292 +FLMC0_SX336 +FLMC0_SX382 +FLMK0_SI1035 +FLMK0_SI1229 +FLMK0_SI2295 +FLMK0_SX135 +FLMK0_SX225 +FLMK0_SX315 +FLMK0_SX405 +FLMK0_SX45 +FLOD0_SI1287 +FLOD0_SI1917 +FLOD0_SI657 +FLOD0_SX117 +FLOD0_SX171 +FLOD0_SX207 +FLOD0_SX297 +FLOD0_SX387 +FLTM0_SI1070 +FLTM0_SI1700 +FLTM0_SI2330 +FLTM0_SX170 +FLTM0_SX260 +FLTM0_SX350 +FLTM0_SX440 +FLTM0_SX80 +FMAH1_SI1509 +FMAH1_SI2139 +FMAH1_SI879 +FMAH1_SX159 +FMAH1_SX249 +FMAH1_SX339 +FMAH1_SX429 +FMAH1_SX69 +FMBG0_SI1160 +FMBG0_SI1790 +FMBG0_SI2264 +FMBG0_SX260 +FMBG0_SX3 +FMBG0_SX350 +FMBG0_SX440 +FMBG0_SX80 +FMEM0_SI1377 +FMEM0_SI2007 +FMEM0_SI747 +FMEM0_SX117 +FMEM0_SX207 +FMEM0_SX297 +FMEM0_SX333 +FMEM0_SX387 +FMJB0_SI1177 +FMJB0_SI1807 +FMJB0_SI547 +FMJB0_SX187 +FMJB0_SX277 +FMJB0_SX367 +FMJB0_SX7 +FMJB0_SX97 +FMJF0_SI1254 +FMJF0_SI1884 +FMJF0_SI624 +FMJF0_SX174 +FMJF0_SX264 +FMJF0_SX354 +FMJF0_SX444 +FMJF0_SX84 +FMJU0_SI1389 +FMJU0_SI2019 +FMJU0_SI759 +FMJU0_SX129 +FMJU0_SX219 +FMJU0_SX309 +FMJU0_SX39 +FMJU0_SX399 +FMKC0_SI1041 +FMKC0_SI1072 +FMKC0_SI1702 +FMKC0_SX172 +FMKC0_SX262 +FMKC0_SX352 +FMKC0_SX442 +FMKC0_SX82 +FMKF0_SI1018 +FMKF0_SI1536 +FMKF0_SI906 +FMKF0_SX186 +FMKF0_SX276 +FMKF0_SX366 +FMKF0_SX6 +FMKF0_SX96 +FMMH0_SI1537 +FMMH0_SI2167 +FMMH0_SI907 +FMMH0_SX187 +FMMH0_SX367 +FMMH0_SX420 +FMMH0_SX7 +FMMH0_SX97 +FMPG0_SI1602 +FMPG0_SI2232 +FMPG0_SI972 +FMPG0_SX162 +FMPG0_SX252 +FMPG0_SX342 +FMPG0_SX432 +FMPG0_SX72 +FNKL0_SI1522 +FNKL0_SI2152 +FNKL0_SI892 +FNKL0_SX172 +FNKL0_SX196 +FNKL0_SX262 +FNKL0_SX442 +FNKL0_SX82 +FNTB0_SI1203 +FNTB0_SI573 +FNTB0_SI679 +FNTB0_SX123 +FNTB0_SX213 +FNTB0_SX303 +FNTB0_SX33 +FNTB0_SX393 +FPAB1_SI1471 +FPAB1_SI2101 +FPAB1_SI841 +FPAB1_SX121 +FPAB1_SX211 +FPAB1_SX301 +FPAB1_SX31 +FPAB1_SX391 +FPAC0_SI1921 +FPAC0_SI2011 +FPAC0_SI661 +FPAC0_SX121 +FPAC0_SX211 +FPAC0_SX301 +FPAC0_SX31 +FPAC0_SX391 +FPAD0_SI1346 +FPAD0_SI1976 +FPAD0_SI716 +FPAD0_SX176 +FPAD0_SX266 +FPAD0_SX356 +FPAD0_SX446 +FPAD0_SX86 +FPAF0_SI1054 +FPAF0_SI1684 +FPAF0_SI2314 +FPAF0_SX154 +FPAF0_SX244 +FPAF0_SX334 +FPAF0_SX424 +FPAF0_SX64 +FPAZ0_SI1593 +FPAZ0_SI2223 +FPAZ0_SI963 +FPAZ0_SX153 +FPAZ0_SX243 +FPAZ0_SX27 +FPAZ0_SX423 +FPAZ0_SX63 +FPJF0_SI1046 +FPJF0_SI1259 +FPJF0_SI1676 +FPJF0_SX146 +FPJF0_SX236 +FPJF0_SX326 +FPJF0_SX352 +FPJF0_SX56 +FPLS0_SI1590 +FPLS0_SI2220 +FPLS0_SI960 +FPLS0_SX150 +FPLS0_SX240 +FPLS0_SX3 +FPLS0_SX330 +FPLS0_SX60 +FPMY0_SI1153 +FPMY0_SI1783 +FPMY0_SI523 +FPMY0_SX163 +FPMY0_SX196 +FPMY0_SX253 +FPMY0_SX343 +FPMY0_SX73 +FREH0_SI1315 +FREH0_SI1945 +FREH0_SI685 +FREH0_SX145 +FREH0_SX235 +FREH0_SX325 +FREH0_SX415 +FREH0_SX55 +FRJB0_SI1427 +FRJB0_SI1470 +FRJB0_SI1794 +FRJB0_SX167 +FRJB0_SX257 +FRJB0_SX347 +FRJB0_SX437 +FRJB0_SX77 +FRLL0_SI1514 +FRLL0_SI805 +FRLL0_SI884 +FRLL0_SX164 +FRLL0_SX254 +FRLL0_SX344 +FRLL0_SX434 +FRLL0_SX74 +FSAG0_SI1323 +FSAG0_SI1953 +FSAG0_SI693 +FSAG0_SX153 +FSAG0_SX243 +FSAG0_SX333 +FSAG0_SX423 +FSAG0_SX63 +FSAH0_SI1244 +FSAH0_SI1874 +FSAH0_SI614 +FSAH0_SX164 +FSAH0_SX327 +FSAH0_SX344 +FSAH0_SX434 +FSAH0_SX74 +FSAK0_SI1300 +FSAK0_SI1930 +FSAK0_SI670 +FSAK0_SX130 +FSAK0_SX220 +FSAK0_SX310 +FSAK0_SX40 +FSAK0_SX400 +FSBK0_SI1069 +FSBK0_SI1699 +FSBK0_SI2329 +FSBK0_SX169 +FSBK0_SX259 +FSBK0_SX349 +FSBK0_SX439 +FSBK0_SX79 +FSCN0_SI1886 +FSCN0_SI626 +FSCN0_SI705 +FSCN0_SX176 +FSCN0_SX266 +FSCN0_SX356 +FSCN0_SX446 +FSCN0_SX86 +FSDC0_SI1312 +FSDC0_SI1942 +FSDC0_SI2234 +FSDC0_SX142 +FSDC0_SX232 +FSDC0_SX322 +FSDC0_SX412 +FSDC0_SX52 +FSDJ0_SI1115 +FSDJ0_SI1745 +FSDJ0_SI485 +FSDJ0_SX125 +FSDJ0_SX215 +FSDJ0_SX305 
+FSDJ0_SX35 +FSDJ0_SX395 +FSGF0_SI1557 +FSGF0_SI2187 +FSGF0_SI927 +FSGF0_SX117 +FSGF0_SX207 +FSGF0_SX27 +FSGF0_SX297 +FSGF0_SX387 +FSJG0_SI1570 +FSJG0_SI2200 +FSJG0_SI940 +FSJG0_SX130 +FSJG0_SX220 +FSJG0_SX310 +FSJG0_SX40 +FSJG0_SX400 +FSJK1_SI1025 +FSJK1_SI2285 +FSJK1_SI696 +FSJK1_SX125 +FSJK1_SX215 +FSJK1_SX305 +FSJK1_SX35 +FSJK1_SX395 +FSJS0_SI1171 +FSJS0_SI1801 +FSJS0_SI541 +FSJS0_SX181 +FSJS0_SX271 +FSJS0_SX361 +FSJS0_SX451 +FSJS0_SX91 +FSJW0_SI1333 +FSJW0_SI1963 +FSJW0_SI703 +FSJW0_SX163 +FSJW0_SX253 +FSJW0_SX343 +FSJW0_SX433 +FSJW0_SX73 +FSKC0_SI1416 +FSKC0_SI2046 +FSKC0_SI786 +FSKC0_SX156 +FSKC0_SX246 +FSKC0_SX336 +FSKC0_SX426 +FSKC0_SX66 +FSKL0_SI1529 +FSKL0_SI2159 +FSKL0_SI899 +FSKL0_SX179 +FSKL0_SX269 +FSKL0_SX359 +FSKL0_SX449 +FSKL0_SX89 +FSKP0_SI1098 +FSKP0_SI1728 +FSKP0_SI468 +FSKP0_SX108 +FSKP0_SX18 +FSKP0_SX198 +FSKP0_SX288 +FSKP0_SX378 +FSLS0_SI1056 +FSLS0_SI1686 +FSLS0_SI2316 +FSLS0_SX156 +FSLS0_SX202 +FSLS0_SX246 +FSLS0_SX426 +FSLS0_SX66 +FSMA0_SI1621 +FSMA0_SI2251 +FSMA0_SI991 +FSMA0_SX181 +FSMA0_SX271 +FSMA0_SX361 +FSMA0_SX451 +FSMA0_SX91 +FSMM0_SI1314 +FSMM0_SI1944 +FSMM0_SI684 +FSMM0_SX144 +FSMM0_SX234 +FSMM0_SX324 +FSMM0_SX414 +FSMM0_SX54 +FSMS1_SI1504 +FSMS1_SI2134 +FSMS1_SI874 +FSMS1_SX154 +FSMS1_SX244 +FSMS1_SX334 +FSMS1_SX347 +FSMS1_SX64 +FSPM0_SI1241 +FSPM0_SI1871 +FSPM0_SI611 +FSPM0_SX161 +FSPM0_SX251 +FSPM0_SX341 +FSPM0_SX431 +FSPM0_SX71 +FSRH0_SI1719 +FSRH0_SI1931 +FSRH0_SI671 +FSRH0_SX131 +FSRH0_SX221 +FSRH0_SX311 +FSRH0_SX401 +FSRH0_SX41 +FSSB0_SI1082 +FSSB0_SI1712 +FSSB0_SI2342 +FSSB0_SX182 +FSSB0_SX272 +FSSB0_SX362 +FSSB0_SX452 +FSSB0_SX92 +FTAJ0_SI1329 +FTAJ0_SI474 +FTAJ0_SI699 +FTAJ0_SX159 +FTAJ0_SX249 +FTAJ0_SX339 +FTAJ0_SX429 +FTAJ0_SX69 +FTBR0_SI1402 +FTBR0_SI2181 +FTBR0_SI921 +FTBR0_SX111 +FTBR0_SX201 +FTBR0_SX21 +FTBR0_SX291 +FTBR0_SX381 +FTBW0_SI1345 +FTBW0_SI1975 +FTBW0_SI715 +FTBW0_SX175 +FTBW0_SX265 +FTBW0_SX355 +FTBW0_SX445 +FTBW0_SX85 +FTLG0_SI1743 +FTLG0_SI483 +FTLG0_SI840 +FTLG0_SX123 +FTLG0_SX213 +FTLG0_SX303 +FTLG0_SX33 +FTLG0_SX393 +FTMG0_SI1532 +FTMG0_SI2162 +FTMG0_SI902 +FTMG0_SX182 +FTMG0_SX272 +FTMG0_SX362 +FTMG0_SX452 +FTMG0_SX92 +FVFB0_SI1032 +FVFB0_SI1510 +FVFB0_SI2292 +FVFB0_SX132 +FVFB0_SX222 +FVFB0_SX312 +FVFB0_SX402 +FVFB0_SX42 +FVKB0_SI1159 +FVKB0_SI1789 +FVKB0_SI529 +FVKB0_SX169 +FVKB0_SX259 +FVKB0_SX349 +FVKB0_SX439 +FVKB0_SX79 +FVMH0_SI1466 +FVMH0_SI2096 +FVMH0_SI836 +FVMH0_SX116 +FVMH0_SX206 +FVMH0_SX26 +FVMH0_SX296 +FVMH0_SX386 +MABC0_SI1620 +MABC0_SI2041 +MABC0_SI781 +MABC0_SX151 +MABC0_SX241 +MABC0_SX331 +MABC0_SX421 +MABC0_SX61 +MADC0_SI1367 +MADC0_SI1997 +MADC0_SI737 +MADC0_SX107 +MADC0_SX17 +MADC0_SX197 +MADC0_SX287 +MADC0_SX377 +MADD0_SI1295 +MADD0_SI1798 +MADD0_SI538 +MADD0_SX178 +MADD0_SX268 +MADD0_SX358 +MADD0_SX448 +MADD0_SX88 +MAEB0_SI1411 +MAEB0_SI2250 +MAEB0_SI990 +MAEB0_SX180 +MAEB0_SX270 +MAEB0_SX360 +MAEB0_SX450 +MAEB0_SX90 +MAEO0_SI1326 +MAEO0_SI1655 +MAEO0_SI1956 +MAEO0_SX156 +MAEO0_SX246 +MAEO0_SX336 +MAEO0_SX426 +MAEO0_SX66 +MAFM0_SI1569 +MAFM0_SI2199 +MAFM0_SI939 +MAFM0_SX129 +MAFM0_SX219 +MAFM0_SX309 +MAFM0_SX39 +MAFM0_SX399 +MAJP0_SI1074 +MAJP0_SI1704 +MAJP0_SI2334 +MAJP0_SX174 +MAJP0_SX264 +MAJP0_SX354 +MAJP0_SX444 +MAJP0_SX84 +MAKB0_SI1016 +MAKB0_SI1646 +MAKB0_SI2276 +MAKB0_SX116 +MAKB0_SX206 +MAKB0_SX26 +MAKB0_SX296 +MAKB0_SX386 +MAKR0_SI1352 +MAKR0_SI1982 +MAKR0_SI722 +MAKR0_SX182 +MAKR0_SX272 +MAKR0_SX362 +MAKR0_SX452 +MAKR0_SX92 +MAPV0_SI1293 +MAPV0_SI1923 +MAPV0_SI663 +MAPV0_SX123 +MAPV0_SX213 +MAPV0_SX303 +MAPV0_SX33 +MAPV0_SX393 +MARC0_SI1188 +MARC0_SI1818 +MARC0_SI558 +MARC0_SX108 
+MARC0_SX18 +MARC0_SX198 +MARC0_SX288 +MARC0_SX378 +MARW0_SI1276 +MARW0_SI1906 +MARW0_SI646 +MARW0_SX106 +MARW0_SX16 +MARW0_SX286 +MARW0_SX349 +MARW0_SX376 +MBAR0_SI1319 +MBAR0_SI1949 +MBAR0_SI689 +MBAR0_SX149 +MBAR0_SX239 +MBAR0_SX329 +MBAR0_SX419 +MBAR0_SX59 +MBBR0_SI1055 +MBBR0_SI1685 +MBBR0_SI2315 +MBBR0_SX155 +MBBR0_SX245 +MBBR0_SX335 +MBBR0_SX425 +MBBR0_SX65 +MBCG0_SI2217 +MBCG0_SI486 +MBCG0_SI957 +MBCG0_SX147 +MBCG0_SX237 +MBCG0_SX327 +MBCG0_SX417 +MBCG0_SX57 +MBEF0_SI1281 +MBEF0_SI1911 +MBEF0_SI651 +MBEF0_SX111 +MBEF0_SX201 +MBEF0_SX21 +MBEF0_SX291 +MBEF0_SX381 +MBGT0_SI1341 +MBGT0_SI1841 +MBGT0_SI711 +MBGT0_SX171 +MBGT0_SX261 +MBGT0_SX351 +MBGT0_SX441 +MBGT0_SX81 +MBJV0_SI1247 +MBJV0_SI1877 +MBJV0_SI617 +MBJV0_SX167 +MBJV0_SX257 +MBJV0_SX347 +MBJV0_SX437 +MBJV0_SX77 +MBMA0_SI1222 +MBMA0_SI1852 +MBMA0_SI592 +MBMA0_SX142 +MBMA0_SX232 +MBMA0_SX322 +MBMA0_SX412 +MBMA0_SX52 +MBMA1_SI2207 +MBMA1_SI2214 +MBMA1_SI954 +MBMA1_SX144 +MBMA1_SX234 +MBMA1_SX324 +MBMA1_SX414 +MBMA1_SX54 +MBML0_SI1169 +MBML0_SI1799 +MBML0_SI539 +MBML0_SX179 +MBML0_SX269 +MBML0_SX359 +MBML0_SX449 +MBML0_SX89 +MBOM0_SI1014 +MBOM0_SI1644 +MBOM0_SI2274 +MBOM0_SX114 +MBOM0_SX204 +MBOM0_SX294 +MBOM0_SX311 +MBOM0_SX384 +MBSB0_SI1353 +MBSB0_SI1983 +MBSB0_SI723 +MBSB0_SX183 +MBSB0_SX273 +MBSB0_SX3 +MBSB0_SX363 +MBSB0_SX93 +MBTH0_SI2102 +MBTH0_SI505 +MBTH0_SI757 +MBTH0_SX122 +MBTH0_SX212 +MBTH0_SX302 +MBTH0_SX32 +MBTH0_SX392 +MBWP0_SI1531 +MBWP0_SI1969 +MBWP0_SI709 +MBWP0_SX169 +MBWP0_SX259 +MBWP0_SX349 +MBWP0_SX439 +MBWP0_SX79 +MCAE0_SI1447 +MCAE0_SI2077 +MCAE0_SI817 +MCAE0_SX187 +MCAE0_SX277 +MCAE0_SX367 +MCAE0_SX7 +MCAE0_SX97 +MCAL0_SI1138 +MCAL0_SI1768 +MCAL0_SI508 +MCAL0_SX148 +MCAL0_SX238 +MCAL0_SX328 +MCAL0_SX418 +MCAL0_SX58 +MCDC0_SI1292 +MCDC0_SI1922 +MCDC0_SI662 +MCDC0_SX122 +MCDC0_SX212 +MCDC0_SX302 +MCDC0_SX32 +MCDC0_SX392 +MCDD0_SI1513 +MCDD0_SI2143 +MCDD0_SI883 +MCDD0_SX163 +MCDD0_SX253 +MCDD0_SX343 +MCDD0_SX433 +MCDD0_SX73 +MCDR0_SI1154 +MCDR0_SI1784 +MCDR0_SI524 +MCDR0_SX164 +MCDR0_SX254 +MCDR0_SX344 +MCDR0_SX434 +MCDR0_SX74 +MCEF0_SI1135 +MCEF0_SI1765 +MCEF0_SI842 +MCEF0_SX145 +MCEF0_SX235 +MCEF0_SX325 +MCEF0_SX415 +MCEF0_SX55 +MCEW0_SI1442 +MCEW0_SI2072 +MCEW0_SI812 +MCEW0_SX182 +MCEW0_SX272 +MCEW0_SX362 +MCEW0_SX452 +MCEW0_SX92 +MCHL0_SI1347 +MCHL0_SI1404 +MCHL0_SI1977 +MCHL0_SX177 +MCHL0_SX267 +MCHL0_SX357 +MCHL0_SX447 +MCHL0_SX87 +MCLK0_SI1660 +MCLK0_SI2290 +MCLK0_SI650 +MCLK0_SX130 +MCLK0_SX220 +MCLK0_SX310 +MCLK0_SX40 +MCLK0_SX400 +MCLM0_SI1456 +MCLM0_SI2086 +MCLM0_SI826 +MCLM0_SX106 +MCLM0_SX16 +MCLM0_SX196 +MCLM0_SX286 +MCLM0_SX376 +MCPM0_SI1194 +MCPM0_SI1824 +MCPM0_SI564 +MCPM0_SX114 +MCPM0_SX204 +MCPM0_SX24 +MCPM0_SX294 +MCPM0_SX384 +MCRE0_SI1121 +MCRE0_SI1725 +MCRE0_SI1751 +MCRE0_SX131 +MCRE0_SX221 +MCRE0_SX24 +MCRE0_SX401 +MCRE0_SX41 +MCSS0_SI1380 +MCSS0_SI688 +MCSS0_SI750 +MCSS0_SX120 +MCSS0_SX210 +MCSS0_SX30 +MCSS0_SX300 +MCSS0_SX390 +MCTH0_SI1209 +MCTH0_SI1839 +MCTH0_SI579 +MCTH0_SX129 +MCTH0_SX219 +MCTH0_SX309 +MCTH0_SX39 +MCTH0_SX399 +MCTM0_SI1350 +MCTM0_SI1980 +MCTM0_SI720 +MCTM0_SX180 +MCTM0_SX270 +MCTM0_SX360 +MCTM0_SX450 +MCTM0_SX90 +MCXM0_SI1351 +MCXM0_SI1981 +MCXM0_SI721 +MCXM0_SX181 +MCXM0_SX271 +MCXM0_SX361 +MCXM0_SX451 +MCXM0_SX91 +MDAC0_SI1261 +MDAC0_SI1837 +MDAC0_SI631 +MDAC0_SX181 +MDAC0_SX271 +MDAC0_SX361 +MDAC0_SX451 +MDAC0_SX91 +MDAS0_SI1266 +MDAS0_SI1896 +MDAS0_SI636 +MDAS0_SX186 +MDAS0_SX21 +MDAS0_SX276 +MDAS0_SX6 +MDAS0_SX96 +MDBB1_SI1006 +MDBB1_SI1636 +MDBB1_SI2056 +MDBB1_SX106 +MDBB1_SX16 +MDBB1_SX196 +MDBB1_SX286 +MDBB1_SX376 +MDBP0_SI1158 +MDBP0_SI1788 +MDBP0_SI528 
+MDBP0_SX168 +MDBP0_SX258 +MDBP0_SX348 +MDBP0_SX438 +MDBP0_SX78 +MDCD0_SI1415 +MDCD0_SI2045 +MDCD0_SI785 +MDCD0_SX155 +MDCD0_SX245 +MDCD0_SX335 +MDCD0_SX425 +MDCD0_SX65 +MDCM0_SI1480 +MDCM0_SI2110 +MDCM0_SI850 +MDCM0_SX130 +MDCM0_SX220 +MDCM0_SX310 +MDCM0_SX40 +MDCM0_SX400 +MDDC0_SI1419 +MDDC0_SI2049 +MDDC0_SI789 +MDDC0_SX159 +MDDC0_SX249 +MDDC0_SX339 +MDDC0_SX429 +MDDC0_SX69 +MDED0_SI1170 +MDED0_SI1800 +MDED0_SI540 +MDED0_SX180 +MDED0_SX270 +MDED0_SX360 +MDED0_SX450 +MDED0_SX90 +MDEF0_SI1123 +MDEF0_SI1563 +MDEF0_SI2193 +MDEF0_SX123 +MDEF0_SX213 +MDEF0_SX303 +MDEF0_SX33 +MDEF0_SX393 +MDEM0_SI1868 +MDEM0_SI608 +MDEM0_SI800 +MDEM0_SX158 +MDEM0_SX248 +MDEM0_SX338 +MDEM0_SX428 +MDEM0_SX68 +MDHL0_SI1439 +MDHL0_SI2069 +MDHL0_SI809 +MDHL0_SX179 +MDHL0_SX269 +MDHL0_SX359 +MDHL0_SX449 +MDHL0_SX89 +MDHS0_SI1530 +MDHS0_SI2160 +MDHS0_SI900 +MDHS0_SX180 +MDHS0_SX270 +MDHS0_SX360 +MDHS0_SX450 +MDHS0_SX90 +MDJM0_SI1455 +MDJM0_SI2085 +MDJM0_SI825 +MDJM0_SX105 +MDJM0_SX15 +MDJM0_SX195 +MDJM0_SX285 +MDJM0_SX375 +MDKS0_SI1066 +MDKS0_SI1696 +MDKS0_SI2326 +MDKS0_SX166 +MDKS0_SX256 +MDKS0_SX346 +MDKS0_SX436 +MDKS0_SX76 +MDLB0_SI1306 +MDLB0_SI1936 +MDLB0_SI676 +MDLB0_SX136 +MDLB0_SX226 +MDLB0_SX316 +MDLB0_SX406 +MDLB0_SX46 +MDLC0_SI1395 +MDLC0_SI2025 +MDLC0_SI765 +MDLC0_SX135 +MDLC0_SX225 +MDLC0_SX315 +MDLC0_SX405 +MDLC0_SX45 +MDLC1_SI1435 +MDLC1_SI2065 +MDLC1_SI2144 +MDLC1_SX175 +MDLC1_SX265 +MDLC1_SX355 +MDLC1_SX445 +MDLC1_SX85 +MDLC2_SI1614 +MDLC2_SI2244 +MDLC2_SI984 +MDLC2_SX174 +MDLC2_SX264 +MDLC2_SX354 +MDLC2_SX444 +MDLC2_SX84 +MDLH0_SI1960 +MDLH0_SI574 +MDLH0_SI700 +MDLH0_SX160 +MDLH0_SX250 +MDLH0_SX340 +MDLH0_SX430 +MDLH0_SX70 +MDLM0_SI1234 +MDLM0_SI1864 +MDLM0_SI604 +MDLM0_SX154 +MDLM0_SX244 +MDLM0_SX334 +MDLM0_SX424 +MDLM0_SX64 +MDLR0_SI1233 +MDLR0_SI1863 +MDLR0_SI603 +MDLR0_SX153 +MDLR0_SX243 +MDLR0_SX333 +MDLR0_SX423 +MDLR0_SX63 +MDLR1_SI1299 +MDLR1_SI1929 +MDLR1_SI669 +MDLR1_SX129 +MDLR1_SX219 +MDLR1_SX309 +MDLR1_SX39 +MDLR1_SX399 +MDMA0_SI1238 +MDMA0_SI1430 +MDMA0_SI2060 +MDMA0_SX170 +MDMA0_SX260 +MDMA0_SX350 +MDMA0_SX440 +MDMA0_SX80 +MDMT0_SI1832 +MDMT0_SI2341 +MDMT0_SI572 +MDMT0_SX122 +MDMT0_SX212 +MDMT0_SX302 +MDMT0_SX32 +MDMT0_SX392 +MDNS0_SI1011 +MDNS0_SI2271 +MDNS0_SI873 +MDNS0_SX111 +MDNS0_SX201 +MDNS0_SX21 +MDNS0_SX291 +MDNS0_SX381 +MDPB0_SI1760 +MDPB0_SI2126 +MDPB0_SI866 +MDPB0_SX146 +MDPB0_SX236 +MDPB0_SX326 +MDPB0_SX416 +MDPB0_SX56 +MDPK0_SI1053 +MDPK0_SI1683 +MDPK0_SI552 +MDPK0_SX153 +MDPK0_SX243 +MDPK0_SX333 +MDPK0_SX423 +MDPK0_SX63 +MDPS0_SI1651 +MDPS0_SI1979 +MDPS0_SI719 +MDPS0_SX179 +MDPS0_SX269 +MDPS0_SX359 +MDPS0_SX449 +MDPS0_SX89 +MDRD0_SI1382 +MDRD0_SI2012 +MDRD0_SI752 +MDRD0_SX122 +MDRD0_SX212 +MDRD0_SX302 +MDRD0_SX32 +MDRD0_SX392 +MDSJ0_SI1462 +MDSJ0_SI2092 +MDSJ0_SI832 +MDSJ0_SX112 +MDSJ0_SX22 +MDSJ0_SX292 +MDSJ0_SX382 +MDSJ0_SX438 +MDSS0_SI1881 +MDSS0_SI2087 +MDSS0_SI621 +MDSS0_SX171 +MDSS0_SX261 +MDSS0_SX351 +MDSS0_SX441 +MDSS0_SX81 +MDSS1_SI1327 +MDSS1_SI1713 +MDSS1_SI697 +MDSS1_SX157 +MDSS1_SX247 +MDSS1_SX337 +MDSS1_SX427 +MDSS1_SX67 +MDTB0_SI1200 +MDTB0_SI1830 +MDTB0_SI570 +MDTB0_SX120 +MDTB0_SX210 +MDTB0_SX300 +MDTB0_SX321 +MDTB0_SX390 +MDWD0_SI1260 +MDWD0_SI1890 +MDWD0_SI557 +MDWD0_SX180 +MDWD0_SX270 +MDWD0_SX360 +MDWD0_SX450 +MDWD0_SX90 +MDWH0_SI1168 +MDWH0_SI1925 +MDWH0_SI665 +MDWH0_SX125 +MDWH0_SX215 +MDWH0_SX305 +MDWH0_SX35 +MDWH0_SX395 +MDWM0_SI1546 +MDWM0_SI2176 +MDWM0_SI916 +MDWM0_SX106 +MDWM0_SX16 +MDWM0_SX286 +MDWM0_SX376 +MDWM0_SX433 +MEAL0_SI1547 +MEAL0_SI2177 +MEAL0_SI917 +MEAL0_SX107 +MEAL0_SX197 +MEAL0_SX287 +MEAL0_SX347 +MEAL0_SX377 +MEDR0_SI1374 
+MEDR0_SI2004 +MEDR0_SI744 +MEDR0_SX114 +MEDR0_SX204 +MEDR0_SX24 +MEDR0_SX294 +MEDR0_SX384 +MEFG0_SI465 +MEFG0_SI491 +MEFG0_SI598 +MEFG0_SX105 +MEFG0_SX15 +MEFG0_SX195 +MEFG0_SX285 +MEFG0_SX375 +MEGJ0_SI1337 +MEGJ0_SI1967 +MEGJ0_SI707 +MEGJ0_SX167 +MEGJ0_SX257 +MEGJ0_SX3 +MEGJ0_SX437 +MEGJ0_SX77 +MEJL0_SI1592 +MEJL0_SI1654 +MEJL0_SI962 +MEJL0_SX152 +MEJL0_SX242 +MEJL0_SX332 +MEJL0_SX422 +MEJL0_SX62 +MEJS0_SI1240 +MEJS0_SI1870 +MEJS0_SI610 +MEJS0_SX160 +MEJS0_SX250 +MEJS0_SX340 +MEJS0_SX430 +MEJS0_SX70 +MESG0_SI1332 +MESG0_SI1962 +MESG0_SI702 +MESG0_SX162 +MESG0_SX252 +MESG0_SX342 +MESG0_SX432 +MESG0_SX72 +MESJ0_SI2039 +MESJ0_SI2257 +MESJ0_SI997 +MESJ0_SX187 +MESJ0_SX277 +MESJ0_SX367 +MESJ0_SX7 +MESJ0_SX97 +MEWM0_SI1348 +MEWM0_SI1978 +MEWM0_SI718 +MEWM0_SX178 +MEWM0_SX268 +MEWM0_SX358 +MEWM0_SX448 +MEWM0_SX88 +MFER0_SI1492 +MFER0_SI2122 +MFER0_SI862 +MFER0_SX142 +MFER0_SX232 +MFER0_SX322 +MFER0_SX412 +MFER0_SX52 +MFMC0_SI1132 +MFMC0_SI1762 +MFMC0_SI502 +MFMC0_SX142 +MFMC0_SX232 +MFMC0_SX322 +MFMC0_SX412 +MFMC0_SX52 +MFRM0_SI1155 +MFRM0_SI1717 +MFRM0_SI1785 +MFRM0_SX165 +MFRM0_SX255 +MFRM0_SX345 +MFRM0_SX435 +MFRM0_SX75 +MFWK0_SI1249 +MFWK0_SI1879 +MFWK0_SI619 +MFWK0_SX169 +MFWK0_SX259 +MFWK0_SX349 +MFWK0_SX439 +MFWK0_SX79 +MFXS0_SI1674 +MFXS0_SI2225 +MFXS0_SI2304 +MFXS0_SX144 +MFXS0_SX234 +MFXS0_SX324 +MFXS0_SX414 +MFXS0_SX54 +MFXV0_SI1005 +MFXV0_SI1342 +MFXV0_SI1635 +MFXV0_SX105 +MFXV0_SX15 +MFXV0_SX195 +MFXV0_SX285 +MFXV0_SX375 +MGAF0_SI1282 +MGAF0_SI1912 +MGAF0_SI652 +MGAF0_SX112 +MGAF0_SX202 +MGAF0_SX22 +MGAF0_SX292 +MGAF0_SX382 +MGAG0_SI1321 +MGAG0_SI645 +MGAG0_SI691 +MGAG0_SX151 +MGAG0_SX241 +MGAG0_SX331 +MGAG0_SX421 +MGAG0_SX61 +MGAK0_SI1036 +MGAK0_SI1666 +MGAK0_SI2296 +MGAK0_SX136 +MGAK0_SX226 +MGAK0_SX316 +MGAK0_SX406 +MGAK0_SX46 +MGAR0_SI1212 +MGAR0_SI1694 +MGAR0_SI1842 +MGAR0_SX132 +MGAR0_SX222 +MGAR0_SX312 +MGAR0_SX402 +MGAR0_SX42 +MGAW0_SI1165 +MGAW0_SI1802 +MGAW0_SI535 +MGAW0_SX175 +MGAW0_SX265 +MGAW0_SX355 +MGAW0_SX445 +MGAW0_SX85 +MGES0_SI1481 +MGES0_SI2111 +MGES0_SI851 +MGES0_SX131 +MGES0_SX221 +MGES0_SX311 +MGES0_SX401 +MGES0_SX41 +MGJC0_SI1256 +MGJC0_SI1335 +MGJC0_SI1965 +MGJC0_SX165 +MGJC0_SX255 +MGJC0_SX345 +MGJC0_SX435 +MGJC0_SX75 +MGRL0_SI1497 +MGRL0_SI2127 +MGRL0_SI867 +MGRL0_SX147 +MGRL0_SX237 +MGRL0_SX327 +MGRL0_SX417 +MGRL0_SX57 +MGRP0_SI1317 +MGRP0_SI1947 +MGRP0_SI687 +MGRP0_SX147 +MGRP0_SX237 +MGRP0_SX327 +MGRP0_SX417 +MGRP0_SX57 +MGSH0_SI1176 +MGSH0_SI1806 +MGSH0_SI546 +MGSH0_SX127 +MGSH0_SX186 +MGSH0_SX276 +MGSH0_SX6 +MGSH0_SX96 +MGSL0_SI1164 +MGSL0_SI534 +MGSL0_SI797 +MGSL0_SX174 +MGSL0_SX264 +MGSL0_SX354 +MGSL0_SX444 +MGSL0_SX84 +MGXP0_SI1087 +MGXP0_SI457 +MGXP0_SI525 +MGXP0_SX187 +MGXP0_SX277 +MGXP0_SX367 +MGXP0_SX7 +MGXP0_SX97 +MHBS0_SI1575 +MHBS0_SI2205 +MHBS0_SI945 +MHBS0_SX135 +MHBS0_SX225 +MHBS0_SX315 +MHBS0_SX405 +MHBS0_SX45 +MHIT0_SI1613 +MHIT0_SI2243 +MHIT0_SI983 +MHIT0_SX173 +MHIT0_SX263 +MHIT0_SX353 +MHIT0_SX443 +MHIT0_SX83 +MHJB0_SI1017 +MHJB0_SI1647 +MHJB0_SI2277 +MHJB0_SX117 +MHJB0_SX207 +MHJB0_SX27 +MHJB0_SX297 +MHJB0_SX387 +MHMG0_SI1365 +MHMG0_SI1995 +MHMG0_SI735 +MHMG0_SX105 +MHMG0_SX15 +MHMG0_SX195 +MHMG0_SX285 +MHMG0_SX375 +MHMR0_SI1119 +MHMR0_SI1692 +MHMR0_SI489 +MHMR0_SX129 +MHMR0_SX219 +MHMR0_SX309 +MHMR0_SX39 +MHMR0_SX399 +MHRM0_SI1475 +MHRM0_SI2218 +MHRM0_SI958 +MHRM0_SX148 +MHRM0_SX238 +MHRM0_SX328 +MHRM0_SX418 +MHRM0_SX58 +MHXL0_SI1772 +MHXL0_SI512 +MHXL0_SI612 +MHXL0_SX152 +MHXL0_SX242 +MHXL0_SX332 +MHXL0_SX422 +MHXL0_SX62 +MILB0_SI2163 +MILB0_SI807 +MILB0_SI903 +MILB0_SX183 +MILB0_SX273 +MILB0_SX3 +MILB0_SX363 +MILB0_SX93 
+MJAC0_SI1331 +MJAC0_SI2148 +MJAC0_SI701 +MJAC0_SX251 +MJAC0_SX307 +MJAC0_SX341 +MJAC0_SX431 +MJAC0_SX71 +MJAE0_SI1524 +MJAE0_SI1999 +MJAE0_SI2154 +MJAE0_SX174 +MJAE0_SX264 +MJAE0_SX354 +MJAE0_SX444 +MJAE0_SX84 +MJAI0_SI1604 +MJAI0_SI682 +MJAI0_SI710 +MJAI0_SX164 +MJAI0_SX254 +MJAI0_SX344 +MJAI0_SX434 +MJAI0_SX74 +MJBG0_SI1232 +MJBG0_SI1724 +MJBG0_SI1862 +MJBG0_SX152 +MJBG0_SX242 +MJBG0_SX332 +MJBG0_SX422 +MJBG0_SX62 +MJDA0_SI1031 +MJDA0_SI1661 +MJDA0_SI2291 +MJDA0_SX131 +MJDA0_SX221 +MJDA0_SX311 +MJDA0_SX401 +MJDA0_SX41 +MJDC0_SI1161 +MJDC0_SI2165 +MJDC0_SI531 +MJDC0_SX171 +MJDC0_SX261 +MJDC0_SX351 +MJDC0_SX441 +MJDC0_SX81 +MJDE0_SI1120 +MJDE0_SI463 +MJDE0_SI490 +MJDE0_SX130 +MJDE0_SX220 +MJDE0_SX310 +MJDE0_SX40 +MJDE0_SX400 +MJDG0_SI1042 +MJDG0_SI1672 +MJDG0_SI1705 +MJDG0_SX142 +MJDG0_SX232 +MJDG0_SX322 +MJDG0_SX412 +MJDG0_SX52 +MJDM0_SI1340 +MJDM0_SI1937 +MJDM0_SI974 +MJDM0_SX170 +MJDM0_SX260 +MJDM0_SX350 +MJDM0_SX440 +MJDM0_SX80 +MJEB0_SI1286 +MJEB0_SI1916 +MJEB0_SI656 +MJEB0_SX170 +MJEB0_SX206 +MJEB0_SX26 +MJEB0_SX296 +MJEB0_SX386 +MJEB1_SI1467 +MJEB1_SI2097 +MJEB1_SI837 +MJEB1_SX117 +MJEB1_SX207 +MJEB1_SX27 +MJEB1_SX297 +MJEB1_SX387 +MJEE0_SI1237 +MJEE0_SI1867 +MJEE0_SI607 +MJEE0_SX157 +MJEE0_SX247 +MJEE0_SX337 +MJEE0_SX427 +MJEE0_SX67 +MJFH0_SI1107 +MJFH0_SI1737 +MJFH0_SI477 +MJFH0_SX117 +MJFH0_SX207 +MJFH0_SX27 +MJFH0_SX297 +MJFH0_SX387 +MJFR0_SI1605 +MJFR0_SI2235 +MJFR0_SI975 +MJFR0_SX165 +MJFR0_SX255 +MJFR0_SX345 +MJFR0_SX435 +MJFR0_SX75 +MJHI0_SI1328 +MJHI0_SI555 +MJHI0_SI698 +MJHI0_SX158 +MJHI0_SX248 +MJHI0_SX338 +MJHI0_SX428 +MJHI0_SX68 +MJJB0_SI1139 +MJJB0_SI1277 +MJJB0_SI1769 +MJJB0_SX149 +MJJB0_SX239 +MJJB0_SX329 +MJJB0_SX419 +MJJB0_SX59 +MJJJ0_SI1163 +MJJJ0_SI1793 +MJJJ0_SI533 +MJJJ0_SX173 +MJJJ0_SX263 +MJJJ0_SX353 +MJJJ0_SX443 +MJJJ0_SX83 +MJJM0_SI1251 +MJJM0_SI1457 +MJJM0_SI827 +MJJM0_SX107 +MJJM0_SX17 +MJJM0_SX197 +MJJM0_SX287 +MJJM0_SX377 +MJKR0_SI1201 +MJKR0_SI1831 +MJKR0_SI571 +MJKR0_SX121 +MJKR0_SX211 +MJKR0_SX301 +MJKR0_SX31 +MJKR0_SX391 +MJLB0_SI1616 +MJLB0_SI2246 +MJLB0_SI986 +MJLB0_SX176 +MJLB0_SX266 +MJLB0_SX356 +MJLB0_SX446 +MJLB0_SX86 +MJLG1_SI1012 +MJLG1_SI1642 +MJLG1_SI2272 +MJLG1_SX112 +MJLG1_SX202 +MJLG1_SX22 +MJLG1_SX292 +MJLG1_SX382 +MJLS0_SI1096 +MJLS0_SI1726 +MJLS0_SI466 +MJLS0_SX106 +MJLS0_SX16 +MJLS0_SX196 +MJLS0_SX286 +MJLS0_SX376 +MJMA0_SI1495 +MJMA0_SI2125 +MJMA0_SI865 +MJMA0_SX145 +MJMA0_SX235 +MJMA0_SX325 +MJMA0_SX415 +MJMA0_SX55 +MJMD0_SI1028 +MJMD0_SI1658 +MJMD0_SI2288 +MJMD0_SX128 +MJMD0_SX218 +MJMD0_SX308 +MJMD0_SX38 +MJMD0_SX398 +MJMM0_SI1255 +MJMM0_SI1885 +MJMM0_SI625 +MJMM0_SX175 +MJMM0_SX265 +MJMM0_SX355 +MJMM0_SX445 +MJMM0_SX85 +MJPG0_SI1191 +MJPG0_SI1821 +MJPG0_SI561 +MJPG0_SX111 +MJPG0_SX201 +MJPG0_SX21 +MJPG0_SX291 +MJPG0_SX381 +MJPM0_SI1368 +MJPM0_SI1998 +MJPM0_SI738 +MJPM0_SX108 +MJPM0_SX18 +MJPM0_SX198 +MJPM0_SX288 +MJPM0_SX378 +MJPM1_SI1897 +MJPM1_SI2280 +MJPM1_SI761 +MJPM1_SX131 +MJPM1_SX221 +MJPM1_SX311 +MJPM1_SX401 +MJPM1_SX41 +MJRA0_SI1236 +MJRA0_SI1866 +MJRA0_SI606 +MJRA0_SX156 +MJRA0_SX246 +MJRA0_SX336 +MJRA0_SX426 +MJRA0_SX66 +MJRG0_SI1366 +MJRG0_SI1996 +MJRG0_SI736 +MJRG0_SX106 +MJRG0_SX16 +MJRG0_SX286 +MJRG0_SX352 +MJRG0_SX376 +MJRH0_SI1125 +MJRH0_SI1755 +MJRH0_SI1840 +MJRH0_SX135 +MJRH0_SX225 +MJRH0_SX315 +MJRH0_SX405 +MJRH0_SX45 +MJRH1_SI1558 +MJRH1_SI1774 +MJRH1_SI514 +MJRH1_SX154 +MJRH1_SX244 +MJRH1_SX334 +MJRH1_SX424 +MJRH1_SX64 +MJRK0_SI1662 +MJRK0_SI2103 +MJRK0_SI880 +MJRK0_SX160 +MJRK0_SX250 +MJRK0_SX340 +MJRK0_SX430 +MJRK0_SX70 +MJRP0_SI1835 +MJRP0_SI1845 +MJRP0_SI585 +MJRP0_SX135 +MJRP0_SX225 +MJRP0_SX315 
+MJRP0_SX405 +MJRP0_SX45 +MJSR0_SI1424 +MJSR0_SI2054 +MJSR0_SI794 +MJSR0_SX164 +MJSR0_SX254 +MJSR0_SX344 +MJSR0_SX434 +MJSR0_SX74 +MJWG0_SI2155 +MJWG0_SI813 +MJWG0_SI895 +MJWG0_SX175 +MJWG0_SX265 +MJWG0_SX355 +MJWG0_SX445 +MJWG0_SX85 +MJWS0_SI1143 +MJWS0_SI1773 +MJWS0_SI513 +MJWS0_SX153 +MJWS0_SX243 +MJWS0_SX333 +MJWS0_SX423 +MJWS0_SX63 +MJWT0_SI1291 +MJWT0_SI1381 +MJWT0_SI751 +MJWT0_SX121 +MJWT0_SX211 +MJWT0_SX301 +MJWT0_SX31 +MJWT0_SX391 +MJXA0_SI1507 +MJXA0_SI2137 +MJXA0_SI877 +MJXA0_SX157 +MJXA0_SX247 +MJXA0_SX337 +MJXA0_SX427 +MJXA0_SX67 +MJXL0_SI1172 +MJXL0_SI1795 +MJXL0_SI542 +MJXL0_SX182 +MJXL0_SX272 +MJXL0_SX362 +MJXL0_SX452 +MJXL0_SX92 +MKAG0_SI1609 +MKAG0_SI2239 +MKAG0_SI979 +MKAG0_SX169 +MKAG0_SX259 +MKAG0_SX30 +MKAG0_SX439 +MKAG0_SX79 +MKAH0_SI1528 +MKAH0_SI2158 +MKAH0_SI898 +MKAH0_SX178 +MKAH0_SX268 +MKAH0_SX358 +MKAH0_SX448 +MKAH0_SX88 +MKAJ0_SI1414 +MKAJ0_SI2044 +MKAJ0_SI784 +MKAJ0_SX154 +MKAJ0_SX244 +MKAJ0_SX334 +MKAJ0_SX424 +MKAJ0_SX64 +MKAM0_SI1250 +MKAM0_SI1316 +MKAM0_SI1465 +MKAM0_SX146 +MKAM0_SX236 +MKAM0_SX326 +MKAM0_SX416 +MKAM0_SX56 +MKDB0_SI2132 +MKDB0_SI588 +MKDB0_SI872 +MKDB0_SX152 +MKDB0_SX242 +MKDB0_SX332 +MKDB0_SX422 +MKDB0_SX62 +MKDD0_SI1567 +MKDD0_SI2197 +MKDD0_SI937 +MKDD0_SX127 +MKDD0_SX217 +MKDD0_SX307 +MKDD0_SX37 +MKDD0_SX397 +MKDT0_SI2153 +MKDT0_SI814 +MKDT0_SI893 +MKDT0_SX173 +MKDT0_SX263 +MKDT0_SX353 +MKDT0_SX443 +MKDT0_SX83 +MKES0_SI1253 +MKES0_SI1883 +MKES0_SI623 +MKES0_SX173 +MKES0_SX263 +MKES0_SX353 +MKES0_SX443 +MKES0_SX83 +MKJO0_SI1517 +MKJO0_SI2147 +MKJO0_SI887 +MKJO0_SX167 +MKJO0_SX257 +MKJO0_SX424 +MKJO0_SX437 +MKJO0_SX77 +MKLN0_SI1598 +MKLN0_SI2228 +MKLN0_SI968 +MKLN0_SX158 +MKLN0_SX248 +MKLN0_SX338 +MKLN0_SX428 +MKLN0_SX68 +MKLR0_SI1059 +MKLR0_SI1689 +MKLR0_SI2319 +MKLR0_SX159 +MKLR0_SX249 +MKLR0_SX339 +MKLR0_SX429 +MKLR0_SX69 +MKLS0_SI1437 +MKLS0_SI1533 +MKLS0_SI2067 +MKLS0_SX177 +MKLS0_SX267 +MKLS0_SX357 +MKLS0_SX447 +MKLS0_SX87 +MKLS1_SI1545 +MKLS1_SI2175 +MKLS1_SI915 +MKLS1_SX105 +MKLS1_SX15 +MKLS1_SX195 +MKLS1_SX285 +MKLS1_SX375 +MKLW0_SI1571 +MKLW0_SI1844 +MKLW0_SI2201 +MKLW0_SX131 +MKLW0_SX221 +MKLW0_SX311 +MKLW0_SX401 +MKLW0_SX41 +MKRG0_SI1491 +MKRG0_SI2121 +MKRG0_SI861 +MKRG0_SX141 +MKRG0_SX231 +MKRG0_SX31 +MKRG0_SX411 +MKRG0_SX51 +MKXL0_SI1185 +MKXL0_SI1815 +MKXL0_SI1958 +MKXL0_SX105 +MKXL0_SX15 +MKXL0_SX195 +MKXL0_SX285 +MKXL0_SX375 +MLBC0_SI1239 +MLBC0_SI1869 +MLBC0_SI609 +MLBC0_SX159 +MLBC0_SX249 +MLBC0_SX339 +MLBC0_SX429 +MLBC0_SX69 +MLEL0_SI1246 +MLEL0_SI1876 +MLEL0_SI616 +MLEL0_SX166 +MLEL0_SX256 +MLEL0_SX346 +MLEL0_SX436 +MLEL0_SX76 +MLJC0_SI1225 +MLJC0_SI1855 +MLJC0_SI595 +MLJC0_SX145 +MLJC0_SX235 +MLJC0_SX325 +MLJC0_SX415 +MLJC0_SX55 +MLJH0_SI1324 +MLJH0_SI1422 +MLJH0_SI694 +MLJH0_SX154 +MLJH0_SX244 +MLJH0_SX334 +MLJH0_SX424 +MLJH0_SX64 +MLNS0_SI1407 +MLNS0_SI2037 +MLNS0_SI777 +MLNS0_SX147 +MLNS0_SX237 +MLNS0_SX327 +MLNS0_SX417 +MLNS0_SX57 +MLSH0_SI1417 +MLSH0_SI2047 +MLSH0_SI787 +MLSH0_SX157 +MLSH0_SX247 +MLSH0_SX337 +MLSH0_SX427 +MLSH0_SX67 +MMAA0_SI1588 +MMAA0_SI2105 +MMAA0_SI845 +MMAA0_SX125 +MMAA0_SX215 +MMAA0_SX305 +MMAA0_SX35 +MMAA0_SX395 +MMAB1_SI1494 +MMAB1_SI2124 +MMAB1_SI864 +MMAB1_SX144 +MMAB1_SX234 +MMAB1_SX324 +MMAB1_SX414 +MMAB1_SX54 +MMAG0_SI1126 +MMAG0_SI1756 +MMAG0_SI496 +MMAG0_SX136 +MMAG0_SX226 +MMAG0_SX316 +MMAG0_SX406 +MMAG0_SX46 +MMAM0_SI1597 +MMAM0_SI1668 +MMAM0_SI2227 +MMAM0_SX157 +MMAM0_SX247 +MMAM0_SX337 +MMAM0_SX427 +MMAM0_SX67 +MMAR0_SI1336 +MMAR0_SI1966 +MMAR0_SI706 +MMAR0_SX166 +MMAR0_SX256 +MMAR0_SX346 +MMAR0_SX436 +MMAR0_SX76 +MMBS0_SI1151 +MMBS0_SI1781 +MMBS0_SI521 +MMBS0_SX161 
+MMBS0_SX251 +MMBS0_SX341 +MMBS0_SX431 +MMBS0_SX71 +MMCC0_SI1338 +MMCC0_SI1968 +MMCC0_SI708 +MMCC0_SX168 +MMCC0_SX258 +MMCC0_SX348 +MMCC0_SX438 +MMCC0_SX78 +MMDB0_SI1358 +MMDB0_SI1617 +MMDB0_SI987 +MMDB0_SX177 +MMDB0_SX267 +MMDB0_SX357 +MMDB0_SX447 +MMDB0_SX87 +MMDG0_SI1780 +MMDG0_SI2035 +MMDG0_SI520 +MMDG0_SX160 +MMDG0_SX250 +MMDG0_SX340 +MMDG0_SX430 +MMDG0_SX70 +MMDM0_SI1311 +MMDM0_SI1941 +MMDM0_SI681 +MMDM0_SX141 +MMDM0_SX231 +MMDM0_SX321 +MMDM0_SX411 +MMDM0_SX51 +MMDM1_SI1650 +MMDM1_SI2043 +MMDM1_SI783 +MMDM1_SX153 +MMDM1_SX243 +MMDM1_SX333 +MMDM1_SX423 +MMDM1_SX63 +MMDS0_SI1343 +MMDS0_SI1973 +MMDS0_SI713 +MMDS0_SX173 +MMDS0_SX263 +MMDS0_SX353 +MMDS0_SX443 +MMDS0_SX83 +MMEA0_SI1388 +MMEA0_SI2018 +MMEA0_SI758 +MMEA0_SX128 +MMEA0_SX218 +MMEA0_SX308 +MMEA0_SX38 +MMEA0_SX398 +MMEB0_SI1357 +MMEB0_SI1987 +MMEB0_SI727 +MMEB0_SX187 +MMEB0_SX327 +MMEB0_SX367 +MMEB0_SX7 +MMEB0_SX97 +MMGC0_SI1305 +MMGC0_SI1935 +MMGC0_SI2184 +MMGC0_SX135 +MMGC0_SX225 +MMGC0_SX315 +MMGC0_SX405 +MMGC0_SX45 +MMGG0_SI1079 +MMGG0_SI1709 +MMGG0_SI2339 +MMGG0_SX179 +MMGG0_SX269 +MMGG0_SX359 +MMGG0_SX449 +MMGG0_SX89 +MMGK0_SI1322 +MMGK0_SI1952 +MMGK0_SI692 +MMGK0_SX152 +MMGK0_SX242 +MMGK0_SX332 +MMGK0_SX422 +MMGK0_SX62 +MMJB1_SI1408 +MMJB1_SI2038 +MMJB1_SI778 +MMJB1_SX148 +MMJB1_SX238 +MMJB1_SX328 +MMJB1_SX418 +MMJB1_SX58 +MMLM0_SI1527 +MMLM0_SI2150 +MMLM0_SI897 +MMLM0_SX177 +MMLM0_SX267 +MMLM0_SX357 +MMLM0_SX447 +MMLM0_SX87 +MMPM0_SI1061 +MMPM0_SI1691 +MMPM0_SI2321 +MMPM0_SX161 +MMPM0_SX251 +MMPM0_SX341 +MMPM0_SX431 +MMPM0_SX71 +MMRP0_SI2034 +MMRP0_SI717 +MMRP0_SI774 +MMRP0_SX144 +MMRP0_SX234 +MMRP0_SX324 +MMRP0_SX414 +MMRP0_SX54 +MMSM0_SI1106 +MMSM0_SI1736 +MMSM0_SI476 +MMSM0_SX116 +MMSM0_SX206 +MMSM0_SX26 +MMSM0_SX296 +MMSM0_SX386 +MMVP0_SI1284 +MMVP0_SI1914 +MMVP0_SI654 +MMVP0_SX114 +MMVP0_SX204 +MMVP0_SX294 +MMVP0_SX347 +MMVP0_SX384 +MMWB0_SI1619 +MMWB0_SI2249 +MMWB0_SI989 +MMWB0_SX179 +MMWB0_SX269 +MMWB0_SX359 +MMWB0_SX449 +MMWB0_SX89 +MMWS0_SI1518 +MMWS0_SI559 +MMWS0_SI888 +MMWS0_SX168 +MMWS0_SX258 +MMWS0_SX348 +MMWS0_SX438 +MMWS0_SX78 +MMWS1_SI1071 +MMWS1_SI1701 +MMWS1_SI2331 +MMWS1_SX261 +MMWS1_SX27 +MMWS1_SX351 +MMWS1_SX441 +MMWS1_SX81 +MMXS0_SI2136 +MMXS0_SI629 +MMXS0_SI876 +MMXS0_SX156 +MMXS0_SX246 +MMXS0_SX336 +MMXS0_SX426 +MMXS0_SX66 +MNET0_SI1446 +MNET0_SI2076 +MNET0_SI816 +MNET0_SX186 +MNET0_SX276 +MNET0_SX366 +MNET0_SX6 +MNET0_SX96 +MNTW0_SI1068 +MNTW0_SI1698 +MNTW0_SI2328 +MNTW0_SX168 +MNTW0_SX202 +MNTW0_SX258 +MNTW0_SX348 +MNTW0_SX78 +MPAR0_SI1576 +MPAR0_SI2206 +MPAR0_SI946 +MPAR0_SX136 +MPAR0_SX226 +MPAR0_SX316 +MPAR0_SX406 +MPAR0_SX46 +MPEB0_SI1034 +MPEB0_SI1860 +MPEB0_SI600 +MPEB0_SX150 +MPEB0_SX240 +MPEB0_SX330 +MPEB0_SX420 +MPEB0_SX60 +MPFU0_SI1258 +MPFU0_SI1888 +MPFU0_SI628 +MPFU0_SX178 +MPFU0_SX268 +MPFU0_SX358 +MPFU0_SX448 +MPFU0_SX88 +MPGH0_SI1554 +MPGH0_SI675 +MPGH0_SI924 +MPGH0_SX114 +MPGH0_SX204 +MPGH0_SX24 +MPGH0_SX294 +MPGH0_SX384 +MPGR0_SI1410 +MPGR0_SI2040 +MPGR0_SI780 +MPGR0_SX150 +MPGR0_SX240 +MPGR0_SX330 +MPGR0_SX420 +MPGR0_SX60 +MPGR1_SI1269 +MPGR1_SI1499 +MPGR1_SI2129 +MPGR1_SX149 +MPGR1_SX239 +MPGR1_SX329 +MPGR1_SX419 +MPGR1_SX59 +MPMB0_SI1501 +MPMB0_SI2131 +MPMB0_SI871 +MPMB0_SX151 +MPMB0_SX241 +MPMB0_SX331 +MPMB0_SX421 +MPMB0_SX61 +MPPC0_SI1412 +MPPC0_SI2042 +MPPC0_SI782 +MPPC0_SX152 +MPPC0_SX242 +MPPC0_SX332 +MPPC0_SX422 +MPPC0_SX62 +MPRB0_SI1205 +MPRB0_SI1215 +MPRB0_SI575 +MPRB0_SX125 +MPRB0_SX215 +MPRB0_SX305 +MPRB0_SX35 +MPRB0_SX395 +MPRD0_SI1431 +MPRD0_SI2061 +MPRD0_SI801 +MPRD0_SX171 +MPRD0_SX261 +MPRD0_SX351 +MPRD0_SX441 +MPRD0_SX81 +MPRK0_SI1097 +MPRK0_SI1727 +MPRK0_SI467 
+MPRK0_SX107 +MPRK0_SX17 +MPRK0_SX197 +MPRK0_SX287 +MPRK0_SX377 +MPRT0_SI1210 +MPRT0_SI495 +MPRT0_SI580 +MPRT0_SX130 +MPRT0_SX220 +MPRT0_SX310 +MPRT0_SX40 +MPRT0_SX400 +MPSW0_SI1067 +MPSW0_SI1697 +MPSW0_SI2327 +MPSW0_SX167 +MPSW0_SX24 +MPSW0_SX257 +MPSW0_SX437 +MPSW0_SX77 +MRAB0_SI1224 +MRAB0_SI1854 +MRAB0_SI594 +MRAB0_SX144 +MRAB0_SX234 +MRAB0_SX324 +MRAB0_SX414 +MRAB0_SX54 +MRAB1_SI1478 +MRAB1_SI2108 +MRAB1_SI848 +MRAB1_SX128 +MRAB1_SX218 +MRAB1_SX308 +MRAB1_SX38 +MRAB1_SX398 +MRAI0_SI1954 +MRAI0_SI2052 +MRAI0_SI792 +MRAI0_SX162 +MRAI0_SX252 +MRAI0_SX342 +MRAI0_SX432 +MRAI0_SX72 +MRAM0_SI1275 +MRAM0_SI1905 +MRAM0_SI1951 +MRAM0_SX105 +MRAM0_SX15 +MRAM0_SX195 +MRAM0_SX285 +MRAM0_SX375 +MRAV0_SI1008 +MRAV0_SI1638 +MRAV0_SI2268 +MRAV0_SX108 +MRAV0_SX18 +MRAV0_SX198 +MRAV0_SX288 +MRAV0_SX378 +MRBC0_SI1665 +MRBC0_SI1859 +MRBC0_SI599 +MRBC0_SX149 +MRBC0_SX239 +MRBC0_SX329 +MRBC0_SX419 +MRBC0_SX59 +MRCG0_SI1428 +MRCG0_SI2058 +MRCG0_SI798 +MRCG0_SX168 +MRCG0_SX258 +MRCG0_SX348 +MRCG0_SX438 +MRCG0_SX78 +MRCW0_SI1371 +MRCW0_SI2001 +MRCW0_SI741 +MRCW0_SX111 +MRCW0_SX201 +MRCW0_SX21 +MRCW0_SX291 +MRCW0_SX381 +MRDD0_SI1050 +MRDD0_SI1680 +MRDD0_SI2310 +MRDD0_SX150 +MRDD0_SX240 +MRDD0_SX277 +MRDD0_SX330 +MRDD0_SX60 +MRDM0_SI1044 +MRDM0_SI1595 +MRDM0_SI965 +MRDM0_SX155 +MRDM0_SX245 +MRDM0_SX335 +MRDM0_SX425 +MRDM0_SX65 +MRDS0_SI1167 +MRDS0_SI1797 +MRDS0_SI537 +MRDS0_SX177 +MRDS0_SX267 +MRDS0_SX357 +MRDS0_SX447 +MRDS0_SX87 +MREE0_SI1104 +MREE0_SI1734 +MREE0_SI1959 +MREE0_SX114 +MREE0_SX204 +MREE0_SX24 +MREE0_SX294 +MREE0_SX384 +MREH1_SI1599 +MREH1_SI2229 +MREH1_SI969 +MREH1_SX159 +MREH1_SX249 +MREH1_SX339 +MREH1_SX429 +MREH1_SX69 +MREM0_SI1591 +MREM0_SI511 +MREM0_SI961 +MREM0_SX151 +MREM0_SX241 +MREM0_SX331 +MREM0_SX421 +MREM0_SX61 +MREW1_SI1500 +MREW1_SI2130 +MREW1_SI870 +MREW1_SX150 +MREW1_SX240 +MREW1_SX330 +MREW1_SX420 +MREW1_SX60 +MRFK0_SI1076 +MRFK0_SI1706 +MRFK0_SI2336 +MRFK0_SX176 +MRFK0_SX266 +MRFK0_SX356 +MRFK0_SX446 +MRFK0_SX86 +MRFL0_SI1156 +MRFL0_SI1786 +MRFL0_SI526 +MRFL0_SX166 +MRFL0_SX256 +MRFL0_SX346 +MRFL0_SX436 +MRFL0_SX76 +MRGM0_SI1162 +MRGM0_SI1792 +MRGM0_SI532 +MRGM0_SX172 +MRGM0_SX262 +MRGM0_SX416 +MRGM0_SX442 +MRGM0_SX82 +MRGS0_SI1356 +MRGS0_SI1986 +MRGS0_SI726 +MRGS0_SX186 +MRGS0_SX276 +MRGS0_SX366 +MRGS0_SX6 +MRGS0_SX96 +MRHL0_SI1515 +MRHL0_SI2145 +MRHL0_SI885 +MRHL0_SX165 +MRHL0_SX255 +MRHL0_SX345 +MRHL0_SX435 +MRHL0_SX75 +MRJB1_SI1020 +MRJB1_SI1413 +MRJB1_SI2021 +MRJB1_SX120 +MRJB1_SX210 +MRJB1_SX30 +MRJB1_SX300 +MRJB1_SX390 +MRJH0_SI1519 +MRJH0_SI889 +MRJH0_SI914 +MRJH0_SX169 +MRJH0_SX259 +MRJH0_SX307 +MRJH0_SX439 +MRJH0_SX79 +MRJM0_SI1095 +MRJM0_SI1228 +MRJM0_SI1858 +MRJM0_SX148 +MRJM0_SX238 +MRJM0_SX328 +MRJM0_SX418 +MRJM0_SX58 +MRJM1_SI1298 +MRJM1_SI1928 +MRJM1_SI668 +MRJM1_SX128 +MRJM1_SX218 +MRJM1_SX308 +MRJM1_SX38 +MRJM1_SX398 +MRJT0_SI1498 +MRJT0_SI1805 +MRJT0_SI868 +MRJT0_SX148 +MRJT0_SX238 +MRJT0_SX328 +MRJT0_SX418 +MRJT0_SX58 +MRKM0_SI1267 +MRKM0_SI1391 +MRKM0_SI637 +MRKM0_SX187 +MRKM0_SX277 +MRKM0_SX367 +MRKM0_SX7 +MRKM0_SX97 +MRLD0_SI1594 +MRLD0_SI2224 +MRLD0_SI964 +MRLD0_SX154 +MRLD0_SX244 +MRLD0_SX334 +MRLD0_SX424 +MRLD0_SX64 +MRLJ0_SI1420 +MRLJ0_SI2050 +MRLJ0_SI790 +MRLJ0_SX160 +MRLJ0_SX250 +MRLJ0_SX340 +MRLJ0_SX430 +MRLJ0_SX70 +MRLJ1_SI1671 +MRLJ1_SI2301 +MRLJ1_SI2332 +MRLJ1_SX141 +MRLJ1_SX231 +MRLJ1_SX321 +MRLJ1_SX411 +MRLJ1_SX51 +MRLK0_SI1468 +MRLK0_SI2140 +MRLK0_SI843 +MRLK0_SX123 +MRLK0_SX213 +MRLK0_SX303 +MRLK0_SX33 +MRLK0_SX393 +MRLR0_SI1196 +MRLR0_SI1826 +MRLR0_SI566 +MRLR0_SX116 +MRLR0_SX206 +MRLR0_SX26 +MRLR0_SX296 +MRLR0_SX386 +MRMB0_SI1581 
+MRMB0_SI2211 +MRMB0_SI951 +MRMB0_SX141 +MRMB0_SX231 +MRMB0_SX321 +MRMB0_SX411 +MRMB0_SX51 +MRMG0_SI1080 +MRMG0_SI1710 +MRMG0_SI2340 +MRMG0_SX180 +MRMG0_SX270 +MRMG0_SX360 +MRMG0_SX450 +MRMG0_SX90 +MRMH0_SI1021 +MRMH0_SI1349 +MRMH0_SI2281 +MRMH0_SX121 +MRMH0_SX211 +MRMH0_SX301 +MRMH0_SX31 +MRMH0_SX391 +MRML0_SI1421 +MRML0_SI2051 +MRML0_SI791 +MRML0_SX161 +MRML0_SX251 +MRML0_SX341 +MRML0_SX431 +MRML0_SX71 +MRMS0_SI1113 +MRMS0_SI2057 +MRMS0_SI2100 +MRMS0_SX120 +MRMS0_SX210 +MRMS0_SX30 +MRMS0_SX300 +MRMS0_SX390 +MRPC1_SI1482 +MRPC1_SI2026 +MRPC1_SI2112 +MRPC1_SX132 +MRPC1_SX222 +MRPC1_SX312 +MRPC1_SX402 +MRPC1_SX42 +MRRE0_SI1334 +MRRE0_SI704 +MRRE0_SI952 +MRRE0_SX164 +MRRE0_SX254 +MRRE0_SX344 +MRRE0_SX434 +MRRE0_SX74 +MRSO0_SI1206 +MRSO0_SI1659 +MRSO0_SI2289 +MRSO0_SX129 +MRSO0_SX219 +MRSO0_SX309 +MRSO0_SX39 +MRSO0_SX399 +MRSP0_SI1429 +MRSP0_SI2059 +MRSP0_SI799 +MRSP0_SX169 +MRSP0_SX196 +MRSP0_SX259 +MRSP0_SX439 +MRSP0_SX79 +MRTC0_SI1458 +MRTC0_SI2088 +MRTC0_SI828 +MRTC0_SX108 +MRTC0_SX18 +MRTC0_SX198 +MRTC0_SX288 +MRTC0_SX378 +MRTJ0_SI1551 +MRTJ0_SI2032 +MRTJ0_SI772 +MRTJ0_SX142 +MRTJ0_SX232 +MRTJ0_SX322 +MRTJ0_SX412 +MRTJ0_SX52 +MRVG0_SI1140 +MRVG0_SI1770 +MRVG0_SI510 +MRVG0_SX150 +MRVG0_SX240 +MRVG0_SX330 +MRVG0_SX420 +MRVG0_SX60 +MRWA0_SI1603 +MRWA0_SI2233 +MRWA0_SI973 +MRWA0_SX163 +MRWA0_SX253 +MRWA0_SX343 +MRWA0_SX433 +MRWA0_SX73 +MRWS0_SI1102 +MRWS0_SI1732 +MRWS0_SI472 +MRWS0_SX112 +MRWS0_SX202 +MRWS0_SX22 +MRWS0_SX292 +MRWS0_SX382 +MRXB0_SI1585 +MRXB0_SI2215 +MRXB0_SI955 +MRXB0_SX145 +MRXB0_SX235 +MRXB0_SX325 +MRXB0_SX415 +MRXB0_SX55 +MSAH1_SI1049 +MSAH1_SI1679 +MSAH1_SI2309 +MSAH1_SX149 +MSAH1_SX239 +MSAH1_SX329 +MSAH1_SX419 +MSAH1_SX59 +MSAS0_SI1376 +MSAS0_SI2006 +MSAS0_SI746 +MSAS0_SX116 +MSAS0_SX206 +MSAS0_SX26 +MSAS0_SX296 +MSAS0_SX386 +MSAT0_SI1526 +MSAT0_SI2156 +MSAT0_SI896 +MSAT0_SX176 +MSAT0_SX266 +MSAT0_SX356 +MSAT0_SX446 +MSAT0_SX86 +MSAT1_SI1073 +MSAT1_SI1703 +MSAT1_SI2333 +MSAT1_SX173 +MSAT1_SX263 +MSAT1_SX353 +MSAT1_SX443 +MSAT1_SX83 +MSDB0_SI1007 +MSDB0_SI1637 +MSDB0_SI2267 +MSDB0_SX107 +MSDB0_SX17 +MSDB0_SX197 +MSDB0_SX287 +MSDB0_SX377 +MSDH0_SI2113 +MSDH0_SI2240 +MSDH0_SI980 +MSDH0_SX170 +MSDH0_SX260 +MSDH0_SX350 +MSDH0_SX440 +MSDH0_SX80 +MSDS0_SI1077 +MSDS0_SI1707 +MSDS0_SI2337 +MSDS0_SX177 +MSDS0_SX267 +MSDS0_SX357 +MSDS0_SX447 +MSDS0_SX87 +MSEM1_SI1440 +MSEM1_SI2070 +MSEM1_SI810 +MSEM1_SX180 +MSEM1_SX270 +MSEM1_SX360 +MSEM1_SX450 +MSEM1_SX90 +MSES0_SI1589 +MSES0_SI2216 +MSES0_SI2219 +MSES0_SX149 +MSES0_SX239 +MSES0_SX329 +MSES0_SX419 +MSES0_SX59 +MSFH0_SI1216 +MSFH0_SI1738 +MSFH0_SI586 +MSFH0_SX136 +MSFH0_SX226 +MSFH0_SX316 +MSFH0_SX406 +MSFH0_SX46 +MSFV0_SI1262 +MSFV0_SI1892 +MSFV0_SI632 +MSFV0_SX182 +MSFV0_SX272 +MSFV0_SX362 +MSFV0_SX452 +MSFV0_SX92 +MSJK0_SI1596 +MSJK0_SI2226 +MSJK0_SI966 +MSJK0_SX156 +MSJK0_SX246 +MSJK0_SX336 +MSJK0_SX426 +MSJK0_SX66 +MSMC0_SI1907 +MSMC0_SI509 +MSMC0_SI647 +MSMC0_SX107 +MSMC0_SX17 +MSMC0_SX197 +MSMC0_SX287 +MSMC0_SX377 +MSMR0_SI1150 +MSMR0_SI1405 +MSMR0_SI775 +MSMR0_SX145 +MSMR0_SX235 +MSMR0_SX325 +MSMR0_SX415 +MSMR0_SX55 +MSMS0_SI1433 +MSMS0_SI2063 +MSMS0_SI803 +MSMS0_SX173 +MSMS0_SX263 +MSMS0_SX353 +MSMS0_SX443 +MSMS0_SX83 +MSRG0_SI1221 +MSRG0_SI1851 +MSRG0_SI591 +MSRG0_SX141 +MSRG0_SX231 +MSRG0_SX321 +MSRG0_SX411 +MSRG0_SX51 +MSRR0_SI1131 +MSRR0_SI1761 +MSRR0_SI501 +MSRR0_SX141 +MSRR0_SX231 +MSRR0_SX30 +MSRR0_SX411 +MSRR0_SX51 +MSTF0_SI1396 +MSTF0_SI766 +MSTF0_SI852 +MSTF0_SX136 +MSTF0_SX226 +MSTF0_SX316 +MSTF0_SX406 +MSTF0_SX46 +MSVS0_SI1568 +MSVS0_SI2198 +MSVS0_SI938 +MSVS0_SX128 +MSVS0_SX218 +MSVS0_SX308 +MSVS0_SX38 
+MSVS0_SX398 +MTAB0_SI1572 +MTAB0_SI2202 +MTAB0_SI942 +MTAB0_SX132 +MTAB0_SX222 +MTAB0_SX312 +MTAB0_SX402 +MTAB0_SX42 +MTAS0_SI1385 +MTAS0_SI2015 +MTAS0_SI755 +MTAS0_SX125 +MTAS0_SX215 +MTAS0_SX305 +MTAS0_SX35 +MTAS0_SX395 +MTAT0_SI1110 +MTAT0_SI1740 +MTAT0_SI811 +MTAT0_SX120 +MTAT0_SX210 +MTAT0_SX30 +MTAT0_SX300 +MTAT0_SX390 +MTAT1_SI1409 +MTAT1_SI1627 +MTAT1_SI779 +MTAT1_SX149 +MTAT1_SX239 +MTAT1_SX329 +MTAT1_SX419 +MTAT1_SX59 +MTBC0_SI1173 +MTBC0_SI1803 +MTBC0_SI543 +MTBC0_SX183 +MTBC0_SX273 +MTBC0_SX347 +MTBC0_SX363 +MTBC0_SX93 +MTCS0_SI1972 +MTCS0_SI2265 +MTCS0_SI712 +MTCS0_SX172 +MTCS0_SX262 +MTCS0_SX352 +MTCS0_SX442 +MTCS0_SX82 +MTDB0_SI1401 +MTDB0_SI2031 +MTDB0_SI771 +MTDB0_SX141 +MTDB0_SX231 +MTDB0_SX321 +MTDB0_SX411 +MTDB0_SX51 +MTDP0_SI1274 +MTDP0_SI1521 +MTDP0_SI2151 +MTDP0_SX171 +MTDP0_SX261 +MTDP0_SX351 +MTDP0_SX441 +MTDP0_SX81 +MTER0_SI1157 +MTER0_SI1787 +MTER0_SI527 +MTER0_SX167 +MTER0_SX17 +MTER0_SX257 +MTER0_SX437 +MTER0_SX77 +MTJG0_SI1520 +MTJG0_SI2157 +MTJG0_SI890 +MTJG0_SX170 +MTJG0_SX260 +MTJG0_SX350 +MTJG0_SX440 +MTJG0_SX80 +MTJM0_SI1226 +MTJM0_SI1856 +MTJM0_SI655 +MTJM0_SX146 +MTJM0_SX236 +MTJM0_SX326 +MTJM0_SX416 +MTJM0_SX56 +MTJS0_SI1192 +MTJS0_SI1822 +MTJS0_SI562 +MTJS0_SX112 +MTJS0_SX202 +MTJS0_SX22 +MTJS0_SX292 +MTJS0_SX382 +MTJU0_SI2020 +MTJU0_SI2269 +MTJU0_SI760 +MTJU0_SX130 +MTJU0_SX220 +MTJU0_SX310 +MTJU0_SX40 +MTJU0_SX400 +MTKD0_SI1187 +MTKD0_SI1817 +MTKD0_SI630 +MTKD0_SX107 +MTKD0_SX17 +MTKD0_SX197 +MTKD0_SX287 +MTKD0_SX377 +MTKP0_SI1023 +MTKP0_SI2283 +MTKP0_SI454 +MTKP0_SX123 +MTKP0_SX213 +MTKP0_SX303 +MTKP0_SX33 +MTKP0_SX393 +MTLB0_SI1134 +MTLB0_SI1764 +MTLB0_SI504 +MTLB0_SX144 +MTLB0_SX234 +MTLB0_SX324 +MTLB0_SX414 +MTLB0_SX54 +MTLC0_SI1313 +MTLC0_SI1477 +MTLC0_SI847 +MTLC0_SX127 +MTLC0_SX217 +MTLC0_SX307 +MTLC0_SX37 +MTLC0_SX397 +MTML0_SI1065 +MTML0_SI1695 +MTML0_SI2325 +MTML0_SX165 +MTML0_SX255 +MTML0_SX345 +MTML0_SX435 +MTML0_SX75 +MTMN0_SI1064 +MTMN0_SI2324 +MTMN0_SI582 +MTMN0_SX164 +MTMN0_SX254 +MTMN0_SX344 +MTMN0_SX434 +MTMN0_SX74 +MTMT0_SI1118 +MTMT0_SI1748 +MTMT0_SI488 +MTMT0_SX128 +MTMT0_SX218 +MTMT0_SX308 +MTMT0_SX38 +MTMT0_SX398 +MTPF0_SI1235 +MTPF0_SI1865 +MTPF0_SI605 +MTPF0_SX155 +MTPF0_SX245 +MTPF0_SX335 +MTPF0_SX425 +MTPF0_SX65 +MTPG0_SI1383 +MTPG0_SI2013 +MTPG0_SI753 +MTPG0_SX123 +MTPG0_SX213 +MTPG0_SX303 +MTPG0_SX33 +MTPG0_SX393 +MTPP0_SI1508 +MTPP0_SI2138 +MTPP0_SI878 +MTPP0_SX158 +MTPP0_SX248 +MTPP0_SX338 +MTPP0_SX428 +MTPP0_SX68 +MTPR0_SI1600 +MTPR0_SI2230 +MTPR0_SI506 +MTPR0_SX160 +MTPR0_SX250 +MTPR0_SX340 +MTPR0_SX430 +MTPR0_SX70 +MTQC0_SI1441 +MTQC0_SI2071 +MTQC0_SI480 +MTQC0_SX181 +MTQC0_SX271 +MTQC0_SX361 +MTQC0_SX451 +MTQC0_SX91 +MTRC0_SI1623 +MTRC0_SI589 +MTRC0_SI993 +MTRC0_SX170 +MTRC0_SX183 +MTRC0_SX273 +MTRC0_SX363 +MTRC0_SX93 +MTRR0_SI1548 +MTRR0_SI2178 +MTRR0_SI918 +MTRR0_SX108 +MTRR0_SX18 +MTRR0_SX198 +MTRR0_SX288 +MTRR0_SX378 +MTRT0_SI1227 +MTRT0_SI1857 +MTRT0_SI597 +MTRT0_SX147 +MTRT0_SX237 +MTRT0_SX254 +MTRT0_SX417 +MTRT0_SX57 +MTWH1_SI1512 +MTWH1_SI2142 +MTWH1_SI882 +MTWH1_SX162 +MTWH1_SX252 +MTWH1_SX342 +MTWH1_SX432 +MTWH1_SX72 +MTXS0_SI1060 +MTXS0_SI1690 +MTXS0_SI2320 +MTXS0_SX160 +MTXS0_SX250 +MTXS0_SX340 +MTXS0_SX430 +MTXS0_SX70 +MVJH0_SI1556 +MVJH0_SI2186 +MVJH0_SI926 +MVJH0_SX116 +MVJH0_SX206 +MVJH0_SX26 +MVJH0_SX296 +MVJH0_SX386 +MVLO0_SI1147 +MVLO0_SI1777 +MVLO0_SI517 +MVLO0_SX157 +MVLO0_SX247 +MVLO0_SX337 +MVLO0_SX427 +MVLO0_SX67 +MVRW0_SI1485 +MVRW0_SI2115 +MVRW0_SI855 +MVRW0_SX135 +MVRW0_SX225 +MVRW0_SX315 +MVRW0_SX405 +MVRW0_SX45 +MWAC0_SI1601 +MWAC0_SI2231 +MWAC0_SI971 +MWAC0_SX161 +MWAC0_SX251 
+MWAC0_SX341 +MWAC0_SX431 +MWAC0_SX71 +MWAD0_SI1062 +MWAD0_SI1749 +MWAD0_SI2322 +MWAD0_SX162 +MWAD0_SX252 +MWAD0_SX342 +MWAD0_SX432 +MWAD0_SX72 +MWAR0_SI1045 +MWAR0_SI1675 +MWAR0_SI2305 +MWAR0_SX145 +MWAR0_SX235 +MWAR0_SX325 +MWAR0_SX415 +MWAR0_SX55 +MWCH0_SI1622 +MWCH0_SI1895 +MWCH0_SI2252 +MWCH0_SX182 +MWCH0_SX272 +MWCH0_SX362 +MWCH0_SX452 +MWCH0_SX92 +MWDK0_SI1436 +MWDK0_SI2017 +MWDK0_SI806 +MWDK0_SX176 +MWDK0_SX266 +MWDK0_SX356 +MWDK0_SX446 +MWDK0_SX86 +MWEM0_SI1320 +MWEM0_SI1393 +MWEM0_SI1950 +MWEM0_SX150 +MWEM0_SX240 +MWEM0_SX330 +MWEM0_SX420 +MWEM0_SX60 +MWGR0_SI1606 +MWGR0_SI2236 +MWGR0_SI976 +MWGR0_SX166 +MWGR0_SX256 +MWGR0_SX346 +MWGR0_SX436 +MWGR0_SX76 +MWRE0_SI1057 +MWRE0_SI1687 +MWRE0_SI2317 +MWRE0_SX157 +MWRE0_SX247 +MWRE0_SX337 +MWRE0_SX427 +MWRE0_SX67 +MWRP0_SI1443 +MWRP0_SI1525 +MWRP0_SI2073 +MWRP0_SX183 +MWRP0_SX273 +MWRP0_SX3 +MWRP0_SX363 +MWRP0_SX93 +MWSB0_SI1626 +MWSB0_SI2256 +MWSB0_SI996 +MWSB0_SX186 +MWSB0_SX276 +MWSB0_SX366 +MWSB0_SX6 +MWSB0_SX96 +MWSH0_SI1426 +MWSH0_SI2266 +MWSH0_SI796 +MWSH0_SX166 +MWSH0_SX256 +MWSH0_SX346 +MWSH0_SX436 +MWSH0_SX76 +MZMB0_SI1166 +MZMB0_SI1796 +MZMB0_SI536 +MZMB0_SX176 +MZMB0_SX266 +MZMB0_SX356 +MZMB0_SX446 +MZMB0_SX86 diff --git a/examples/wav2vec/unsupervised/config/timit_matched/valid.uid b/examples/wav2vec/unsupervised/config/timit_matched/valid.uid new file mode 100644 index 0000000000..ab5ef381ab --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_matched/valid.uid @@ -0,0 +1,400 @@ +FADG0_SI1279 +FADG0_SI1909 +FADG0_SI649 +FADG0_SX109 +FADG0_SX19 +FADG0_SX199 +FADG0_SX289 +FADG0_SX379 +FAKS0_SI1573 +FAKS0_SI2203 +FAKS0_SI943 +FAKS0_SX133 +FAKS0_SX223 +FAKS0_SX313 +FAKS0_SX403 +FAKS0_SX43 +FCAL1_SI1403 +FCAL1_SI2033 +FCAL1_SI773 +FCAL1_SX143 +FCAL1_SX233 +FCAL1_SX323 +FCAL1_SX413 +FCAL1_SX53 +FCMH0_SI1454 +FCMH0_SI2084 +FCMH0_SI824 +FCMH0_SX104 +FCMH0_SX14 +FCMH0_SX194 +FCMH0_SX284 +FCMH0_SX374 +FDAC1_SI1474 +FDAC1_SI2104 +FDAC1_SI844 +FDAC1_SX124 +FDAC1_SX214 +FDAC1_SX304 +FDAC1_SX34 +FDAC1_SX394 +FDMS0_SI1218 +FDMS0_SI1502 +FDMS0_SI1848 +FDMS0_SX138 +FDMS0_SX228 +FDMS0_SX318 +FDMS0_SX408 +FDMS0_SX48 +FDRW0_SI1283 +FDRW0_SI1423 +FDRW0_SI653 +FDRW0_SX113 +FDRW0_SX203 +FDRW0_SX23 +FDRW0_SX293 +FDRW0_SX383 +FEDW0_SI1084 +FEDW0_SI1653 +FEDW0_SI1714 +FEDW0_SX184 +FEDW0_SX274 +FEDW0_SX364 +FEDW0_SX4 +FEDW0_SX94 +FGJD0_SI1179 +FGJD0_SI549 +FGJD0_SI818 +FGJD0_SX189 +FGJD0_SX279 +FGJD0_SX369 +FGJD0_SX9 +FGJD0_SX99 +FJEM0_SI1264 +FJEM0_SI1894 +FJEM0_SI634 +FJEM0_SX184 +FJEM0_SX274 +FJEM0_SX364 +FJEM0_SX4 +FJEM0_SX94 +FJMG0_SI1181 +FJMG0_SI1811 +FJMG0_SI551 +FJMG0_SX101 +FJMG0_SX11 +FJMG0_SX191 +FJMG0_SX281 +FJMG0_SX371 +FJSJ0_SI1484 +FJSJ0_SI2114 +FJSJ0_SI854 +FJSJ0_SX134 +FJSJ0_SX224 +FJSJ0_SX314 +FJSJ0_SX404 +FJSJ0_SX44 +FKMS0_SI1490 +FKMS0_SI2120 +FKMS0_SI860 +FKMS0_SX140 +FKMS0_SX230 +FKMS0_SX320 +FKMS0_SX410 +FKMS0_SX50 +FMAH0_SI1289 +FMAH0_SI1919 +FMAH0_SI659 +FMAH0_SX119 +FMAH0_SX209 +FMAH0_SX29 +FMAH0_SX299 +FMAH0_SX389 +FMML0_SI1040 +FMML0_SI1670 +FMML0_SI2300 +FMML0_SX140 +FMML0_SX230 +FMML0_SX320 +FMML0_SX410 +FMML0_SX50 +FNMR0_SI1399 +FNMR0_SI2029 +FNMR0_SI769 +FNMR0_SX139 +FNMR0_SX229 +FNMR0_SX319 +FNMR0_SX409 +FNMR0_SX49 +FREW0_SI1030 +FREW0_SI1280 +FREW0_SI1910 +FREW0_SX110 +FREW0_SX20 +FREW0_SX200 +FREW0_SX290 +FREW0_SX380 +FSEM0_SI1198 +FSEM0_SI1828 +FSEM0_SI568 +FSEM0_SX118 +FSEM0_SX208 +FSEM0_SX28 +FSEM0_SX298 +FSEM0_SX388 +MAJC0_SI1946 +MAJC0_SI2095 +MAJC0_SI835 +MAJC0_SX115 +MAJC0_SX205 +MAJC0_SX25 +MAJC0_SX295 +MAJC0_SX385 +MBDG0_SI1463 +MBDG0_SI2093 +MBDG0_SI833 +MBDG0_SX113 +MBDG0_SX203 +MBDG0_SX23 
+MBDG0_SX293 +MBDG0_SX383 +MBNS0_SI1220 +MBNS0_SI1850 +MBNS0_SI590 +MBNS0_SX140 +MBNS0_SX230 +MBNS0_SX320 +MBNS0_SX410 +MBNS0_SX50 +MBWM0_SI1304 +MBWM0_SI1934 +MBWM0_SI674 +MBWM0_SX134 +MBWM0_SX224 +MBWM0_SX314 +MBWM0_SX404 +MBWM0_SX44 +MCSH0_SI1549 +MCSH0_SI2179 +MCSH0_SI919 +MCSH0_SX109 +MCSH0_SX19 +MCSH0_SX199 +MCSH0_SX289 +MCSH0_SX379 +MDLF0_SI1583 +MDLF0_SI2213 +MDLF0_SI953 +MDLF0_SX143 +MDLF0_SX233 +MDLF0_SX323 +MDLF0_SX413 +MDLF0_SX53 +MDLS0_SI1628 +MDLS0_SI2258 +MDLS0_SI998 +MDLS0_SX188 +MDLS0_SX278 +MDLS0_SX368 +MDLS0_SX8 +MDLS0_SX98 +MDVC0_SI2174 +MDVC0_SI2196 +MDVC0_SI936 +MDVC0_SX126 +MDVC0_SX216 +MDVC0_SX306 +MDVC0_SX36 +MDVC0_SX396 +MERS0_SI1019 +MERS0_SI1649 +MERS0_SI497 +MERS0_SX119 +MERS0_SX209 +MERS0_SX29 +MERS0_SX299 +MERS0_SX389 +MGJF0_SI1901 +MGJF0_SI641 +MGJF0_SI776 +MGJF0_SX101 +MGJF0_SX11 +MGJF0_SX191 +MGJF0_SX281 +MGJF0_SX371 +MGLB0_SI1534 +MGLB0_SI2164 +MGLB0_SI904 +MGLB0_SX184 +MGLB0_SX274 +MGLB0_SX364 +MGLB0_SX4 +MGLB0_SX94 +MGWT0_SI1539 +MGWT0_SI2169 +MGWT0_SI909 +MGWT0_SX189 +MGWT0_SX279 +MGWT0_SX369 +MGWT0_SX9 +MGWT0_SX99 +MJAR0_SI1988 +MJAR0_SI2247 +MJAR0_SI728 +MJAR0_SX188 +MJAR0_SX278 +MJAR0_SX368 +MJAR0_SX8 +MJAR0_SX98 +MJFC0_SI1033 +MJFC0_SI1663 +MJFC0_SI2293 +MJFC0_SX133 +MJFC0_SX223 +MJFC0_SX313 +MJFC0_SX403 +MJFC0_SX43 +MJSW0_SI1010 +MJSW0_SI1640 +MJSW0_SI2270 +MJSW0_SX110 +MJSW0_SX20 +MJSW0_SX200 +MJSW0_SX290 +MJSW0_SX380 +MMDB1_SI1625 +MMDB1_SI2255 +MMDB1_SI995 +MMDB1_SX185 +MMDB1_SX275 +MMDB1_SX365 +MMDB1_SX5 +MMDB1_SX95 +MMDM2_SI1452 +MMDM2_SI1555 +MMDM2_SI2082 +MMDM2_SX102 +MMDM2_SX12 +MMDM2_SX192 +MMDM2_SX282 +MMDM2_SX372 +MMJR0_SI1648 +MMJR0_SI2166 +MMJR0_SI2278 +MMJR0_SX118 +MMJR0_SX208 +MMJR0_SX28 +MMJR0_SX298 +MMJR0_SX388 +MMWH0_SI1089 +MMWH0_SI1301 +MMWH0_SI459 +MMWH0_SX189 +MMWH0_SX279 +MMWH0_SX369 +MMWH0_SX9 +MMWH0_SX99 +MPDF0_SI1542 +MPDF0_SI2172 +MPDF0_SI912 +MPDF0_SX102 +MPDF0_SX12 +MPDF0_SX192 +MPDF0_SX282 +MPDF0_SX372 +MRCS0_SI1223 +MRCS0_SI1853 +MRCS0_SI593 +MRCS0_SX143 +MRCS0_SX233 +MRCS0_SX323 +MRCS0_SX413 +MRCS0_SX53 +MREB0_SI1375 +MREB0_SI2005 +MREB0_SI745 +MREB0_SX115 +MREB0_SX205 +MREB0_SX25 +MREB0_SX295 +MREB0_SX385 +MRJM4_SI1489 +MRJM4_SI2119 +MRJM4_SI859 +MRJM4_SX139 +MRJM4_SX229 +MRJM4_SX319 +MRJM4_SX409 +MRJM4_SX49 +MRJR0_SI1182 +MRJR0_SI1812 +MRJR0_SI2313 +MRJR0_SX102 +MRJR0_SX12 +MRJR0_SX192 +MRJR0_SX282 +MRJR0_SX372 +MROA0_SI1307 +MROA0_SI1970 +MROA0_SI677 +MROA0_SX137 +MROA0_SX227 +MROA0_SX317 +MROA0_SX407 +MROA0_SX47 +MRTK0_SI1093 +MRTK0_SI1723 +MRTK0_SI1750 +MRTK0_SX103 +MRTK0_SX13 +MRTK0_SX193 +MRTK0_SX283 +MRTK0_SX373 +MRWS1_SI1130 +MRWS1_SI1496 +MRWS1_SI500 +MRWS1_SX140 +MRWS1_SX230 +MRWS1_SX320 +MRWS1_SX410 +MRWS1_SX50 +MTAA0_SI1285 +MTAA0_SI1915 +MTAA0_SI596 +MTAA0_SX115 +MTAA0_SX205 +MTAA0_SX25 +MTAA0_SX295 +MTAA0_SX385 +MTDT0_SI1994 +MTDT0_SI2254 +MTDT0_SI994 +MTDT0_SX184 +MTDT0_SX274 +MTDT0_SX364 +MTDT0_SX4 +MTDT0_SX94 +MTEB0_SI1133 +MTEB0_SI2064 +MTEB0_SI503 +MTEB0_SX143 +MTEB0_SX233 +MTEB0_SX323 +MTEB0_SX413 +MTEB0_SX53 +MTHC0_SI1015 +MTHC0_SI1645 +MTHC0_SI2275 +MTHC0_SX115 +MTHC0_SX205 +MTHC0_SX25 +MTHC0_SX295 +MTHC0_SX385 +MWJG0_SI1124 +MWJG0_SI1754 +MWJG0_SI494 +MWJG0_SX134 +MWJG0_SX224 +MWJG0_SX314 +MWJG0_SX404 +MWJG0_SX44 diff --git a/examples/wav2vec/unsupervised/config/timit_unmatched/test.uid b/examples/wav2vec/unsupervised/config/timit_unmatched/test.uid new file mode 100644 index 0000000000..e3967e4242 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_unmatched/test.uid @@ -0,0 +1,1680 @@ +FADG0_SA1 +FADG0_SA2 +FADG0_SI1279 +FADG0_SI1909 +FADG0_SI649 +FADG0_SX109 +FADG0_SX19 
+FADG0_SX199 +FADG0_SX289 +FADG0_SX379 +FAKS0_SA1 +FAKS0_SA2 +FAKS0_SI1573 +FAKS0_SI2203 +FAKS0_SI943 +FAKS0_SX133 +FAKS0_SX223 +FAKS0_SX313 +FAKS0_SX403 +FAKS0_SX43 +FASW0_SA1 +FASW0_SA2 +FASW0_SI1550 +FASW0_SI2180 +FASW0_SI920 +FASW0_SX110 +FASW0_SX20 +FASW0_SX200 +FASW0_SX290 +FASW0_SX380 +FAWF0_SA1 +FAWF0_SA2 +FAWF0_SI1000 +FAWF0_SI1630 +FAWF0_SI2260 +FAWF0_SX10 +FAWF0_SX100 +FAWF0_SX190 +FAWF0_SX280 +FAWF0_SX370 +FCAL1_SA1 +FCAL1_SA2 +FCAL1_SI1403 +FCAL1_SI2033 +FCAL1_SI773 +FCAL1_SX143 +FCAL1_SX233 +FCAL1_SX323 +FCAL1_SX413 +FCAL1_SX53 +FCAU0_SA1 +FCAU0_SA2 +FCAU0_SI1037 +FCAU0_SI1667 +FCAU0_SI2297 +FCAU0_SX137 +FCAU0_SX227 +FCAU0_SX317 +FCAU0_SX407 +FCAU0_SX47 +FCFT0_SA1 +FCFT0_SA2 +FCFT0_SI1178 +FCFT0_SI1808 +FCFT0_SI548 +FCFT0_SX188 +FCFT0_SX278 +FCFT0_SX368 +FCFT0_SX8 +FCFT0_SX98 +FCMH0_SA1 +FCMH0_SA2 +FCMH0_SI1454 +FCMH0_SI2084 +FCMH0_SI824 +FCMH0_SX104 +FCMH0_SX14 +FCMH0_SX194 +FCMH0_SX284 +FCMH0_SX374 +FCMH1_SA1 +FCMH1_SA2 +FCMH1_SI1493 +FCMH1_SI2123 +FCMH1_SI863 +FCMH1_SX143 +FCMH1_SX233 +FCMH1_SX323 +FCMH1_SX413 +FCMH1_SX53 +FCMR0_SA1 +FCMR0_SA2 +FCMR0_SI1105 +FCMR0_SI1735 +FCMR0_SI475 +FCMR0_SX115 +FCMR0_SX205 +FCMR0_SX25 +FCMR0_SX295 +FCMR0_SX385 +FCRH0_SA1 +FCRH0_SA2 +FCRH0_SI1088 +FCRH0_SI1718 +FCRH0_SI458 +FCRH0_SX188 +FCRH0_SX278 +FCRH0_SX368 +FCRH0_SX8 +FCRH0_SX98 +FDAC1_SA1 +FDAC1_SA2 +FDAC1_SI1474 +FDAC1_SI2104 +FDAC1_SI844 +FDAC1_SX124 +FDAC1_SX214 +FDAC1_SX304 +FDAC1_SX34 +FDAC1_SX394 +FDHC0_SA1 +FDHC0_SA2 +FDHC0_SI1559 +FDHC0_SI2189 +FDHC0_SI929 +FDHC0_SX119 +FDHC0_SX209 +FDHC0_SX29 +FDHC0_SX299 +FDHC0_SX389 +FDMS0_SA1 +FDMS0_SA2 +FDMS0_SI1218 +FDMS0_SI1502 +FDMS0_SI1848 +FDMS0_SX138 +FDMS0_SX228 +FDMS0_SX318 +FDMS0_SX408 +FDMS0_SX48 +FDRD1_SA1 +FDRD1_SA2 +FDRD1_SI1544 +FDRD1_SI1566 +FDRD1_SI2149 +FDRD1_SX104 +FDRD1_SX14 +FDRD1_SX194 +FDRD1_SX284 +FDRD1_SX374 +FDRW0_SA1 +FDRW0_SA2 +FDRW0_SI1283 +FDRW0_SI1423 +FDRW0_SI653 +FDRW0_SX113 +FDRW0_SX203 +FDRW0_SX23 +FDRW0_SX293 +FDRW0_SX383 +FEDW0_SA1 +FEDW0_SA2 +FEDW0_SI1084 +FEDW0_SI1653 +FEDW0_SI1714 +FEDW0_SX184 +FEDW0_SX274 +FEDW0_SX364 +FEDW0_SX4 +FEDW0_SX94 +FELC0_SA1 +FELC0_SA2 +FELC0_SI1386 +FELC0_SI2016 +FELC0_SI756 +FELC0_SX126 +FELC0_SX216 +FELC0_SX306 +FELC0_SX36 +FELC0_SX396 +FGJD0_SA1 +FGJD0_SA2 +FGJD0_SI1179 +FGJD0_SI549 +FGJD0_SI818 +FGJD0_SX189 +FGJD0_SX279 +FGJD0_SX369 +FGJD0_SX9 +FGJD0_SX99 +FGMD0_SA1 +FGMD0_SA2 +FGMD0_SI1943 +FGMD0_SI2107 +FGMD0_SI683 +FGMD0_SX143 +FGMD0_SX233 +FGMD0_SX323 +FGMD0_SX413 +FGMD0_SX53 +FGWR0_SA1 +FGWR0_SA2 +FGWR0_SI1578 +FGWR0_SI2208 +FGWR0_SI948 +FGWR0_SX138 +FGWR0_SX228 +FGWR0_SX318 +FGWR0_SX408 +FGWR0_SX48 +FHES0_SA1 +FHES0_SA2 +FHES0_SI1109 +FHES0_SI1739 +FHES0_SI479 +FHES0_SX119 +FHES0_SX209 +FHES0_SX29 +FHES0_SX299 +FHES0_SX389 +FHEW0_SA1 +FHEW0_SA2 +FHEW0_SI2023 +FHEW0_SI690 +FHEW0_SI763 +FHEW0_SX133 +FHEW0_SX223 +FHEW0_SX313 +FHEW0_SX403 +FHEW0_SX43 +FISB0_SA1 +FISB0_SA2 +FISB0_SI1579 +FISB0_SI2209 +FISB0_SI949 +FISB0_SX139 +FISB0_SX229 +FISB0_SX319 +FISB0_SX409 +FISB0_SX49 +FJAS0_SA1 +FJAS0_SA2 +FJAS0_SI1400 +FJAS0_SI2030 +FJAS0_SI770 +FJAS0_SX140 +FJAS0_SX230 +FJAS0_SX320 +FJAS0_SX410 +FJAS0_SX50 +FJCS0_SA1 +FJCS0_SA2 +FJCS0_SI1309 +FJCS0_SI1833 +FJCS0_SI1939 +FJCS0_SX139 +FJCS0_SX229 +FJCS0_SX319 +FJCS0_SX409 +FJCS0_SX49 +FJEM0_SA1 +FJEM0_SA2 +FJEM0_SI1264 +FJEM0_SI1894 +FJEM0_SI634 +FJEM0_SX184 +FJEM0_SX274 +FJEM0_SX364 +FJEM0_SX4 +FJEM0_SX94 +FJLM0_SA1 +FJLM0_SA2 +FJLM0_SI1043 +FJLM0_SI1673 +FJLM0_SI2303 +FJLM0_SX143 +FJLM0_SX233 +FJLM0_SX323 +FJLM0_SX413 +FJLM0_SX53 +FJMG0_SA1 +FJMG0_SA2 +FJMG0_SI1181 +FJMG0_SI1811 +FJMG0_SI551 +FJMG0_SX101 +FJMG0_SX11 
+FJMG0_SX191 +FJMG0_SX281 +FJMG0_SX371 +FJRE0_SA1 +FJRE0_SA2 +FJRE0_SI1116 +FJRE0_SI1587 +FJRE0_SI1746 +FJRE0_SX126 +FJRE0_SX216 +FJRE0_SX306 +FJRE0_SX36 +FJRE0_SX396 +FJSA0_SA1 +FJSA0_SA2 +FJSA0_SI1379 +FJSA0_SI2009 +FJSA0_SI749 +FJSA0_SX119 +FJSA0_SX209 +FJSA0_SX29 +FJSA0_SX299 +FJSA0_SX389 +FJSJ0_SA1 +FJSJ0_SA2 +FJSJ0_SI1484 +FJSJ0_SI2114 +FJSJ0_SI854 +FJSJ0_SX134 +FJSJ0_SX224 +FJSJ0_SX314 +FJSJ0_SX404 +FJSJ0_SX44 +FJWB0_SA1 +FJWB0_SA2 +FJWB0_SI1265 +FJWB0_SI635 +FJWB0_SI992 +FJWB0_SX185 +FJWB0_SX275 +FJWB0_SX365 +FJWB0_SX5 +FJWB0_SX95 +FKMS0_SA1 +FKMS0_SA2 +FKMS0_SI1490 +FKMS0_SI2120 +FKMS0_SI860 +FKMS0_SX140 +FKMS0_SX230 +FKMS0_SX320 +FKMS0_SX410 +FKMS0_SX50 +FLAS0_SA1 +FLAS0_SA2 +FLAS0_SI1026 +FLAS0_SI1488 +FLAS0_SI858 +FLAS0_SX138 +FLAS0_SX228 +FLAS0_SX318 +FLAS0_SX408 +FLAS0_SX48 +FLBW0_SA1 +FLBW0_SA2 +FLBW0_SI1219 +FLBW0_SI1849 +FLBW0_SI2253 +FLBW0_SX139 +FLBW0_SX229 +FLBW0_SX319 +FLBW0_SX409 +FLBW0_SX49 +FLKD0_SA1 +FLKD0_SA2 +FLKD0_SI1369 +FLKD0_SI739 +FLKD0_SI894 +FLKD0_SX109 +FLKD0_SX19 +FLKD0_SX199 +FLKD0_SX289 +FLKD0_SX379 +FLNH0_SA1 +FLNH0_SA2 +FLNH0_SI1214 +FLNH0_SI584 +FLNH0_SI941 +FLNH0_SX134 +FLNH0_SX224 +FLNH0_SX314 +FLNH0_SX404 +FLNH0_SX44 +FMAF0_SA1 +FMAF0_SA2 +FMAF0_SI1459 +FMAF0_SI2089 +FMAF0_SI829 +FMAF0_SX109 +FMAF0_SX19 +FMAF0_SX199 +FMAF0_SX289 +FMAF0_SX379 +FMAH0_SA1 +FMAH0_SA2 +FMAH0_SI1289 +FMAH0_SI1919 +FMAH0_SI659 +FMAH0_SX119 +FMAH0_SX209 +FMAH0_SX29 +FMAH0_SX299 +FMAH0_SX389 +FMCM0_SA1 +FMCM0_SA2 +FMCM0_SI1180 +FMCM0_SI1810 +FMCM0_SI550 +FMCM0_SX10 +FMCM0_SX100 +FMCM0_SX190 +FMCM0_SX280 +FMCM0_SX370 +FMGD0_SA1 +FMGD0_SA2 +FMGD0_SI1564 +FMGD0_SI2194 +FMGD0_SI934 +FMGD0_SX124 +FMGD0_SX214 +FMGD0_SX304 +FMGD0_SX34 +FMGD0_SX394 +FMLD0_SA1 +FMLD0_SA2 +FMLD0_SI2185 +FMLD0_SI822 +FMLD0_SI925 +FMLD0_SX115 +FMLD0_SX205 +FMLD0_SX25 +FMLD0_SX295 +FMLD0_SX385 +FMML0_SA1 +FMML0_SA2 +FMML0_SI1040 +FMML0_SI1670 +FMML0_SI2300 +FMML0_SX140 +FMML0_SX230 +FMML0_SX320 +FMML0_SX410 +FMML0_SX50 +FNLP0_SA1 +FNLP0_SA2 +FNLP0_SI1308 +FNLP0_SI1938 +FNLP0_SI678 +FNLP0_SX138 +FNLP0_SX228 +FNLP0_SX318 +FNLP0_SX408 +FNLP0_SX48 +FNMR0_SA1 +FNMR0_SA2 +FNMR0_SI1399 +FNMR0_SI2029 +FNMR0_SI769 +FNMR0_SX139 +FNMR0_SX229 +FNMR0_SX319 +FNMR0_SX409 +FNMR0_SX49 +FPAS0_SA1 +FPAS0_SA2 +FPAS0_SI1272 +FPAS0_SI2204 +FPAS0_SI944 +FPAS0_SX134 +FPAS0_SX224 +FPAS0_SX314 +FPAS0_SX404 +FPAS0_SX44 +FPKT0_SA1 +FPKT0_SA2 +FPKT0_SI1538 +FPKT0_SI2168 +FPKT0_SI908 +FPKT0_SX188 +FPKT0_SX278 +FPKT0_SX368 +FPKT0_SX8 +FPKT0_SX98 +FRAM1_SA1 +FRAM1_SA2 +FRAM1_SI1360 +FRAM1_SI522 +FRAM1_SI730 +FRAM1_SX10 +FRAM1_SX100 +FRAM1_SX190 +FRAM1_SX280 +FRAM1_SX370 +FREW0_SA1 +FREW0_SA2 +FREW0_SI1030 +FREW0_SI1280 +FREW0_SI1910 +FREW0_SX110 +FREW0_SX20 +FREW0_SX200 +FREW0_SX290 +FREW0_SX380 +FRNG0_SA1 +FRNG0_SA2 +FRNG0_SI1355 +FRNG0_SI1985 +FRNG0_SI725 +FRNG0_SX185 +FRNG0_SX275 +FRNG0_SX365 +FRNG0_SX5 +FRNG0_SX95 +FSEM0_SA1 +FSEM0_SA2 +FSEM0_SI1198 +FSEM0_SI1828 +FSEM0_SI568 +FSEM0_SX118 +FSEM0_SX208 +FSEM0_SX28 +FSEM0_SX298 +FSEM0_SX388 +FSLB1_SA1 +FSLB1_SA2 +FSLB1_SI1904 +FSLB1_SI644 +FSLB1_SI891 +FSLB1_SX104 +FSLB1_SX14 +FSLB1_SX194 +FSLB1_SX284 +FSLB1_SX374 +FSXA0_SA1 +FSXA0_SA2 +FSXA0_SI1108 +FSXA0_SI1846 +FSXA0_SI478 +FSXA0_SX118 +FSXA0_SX208 +FSXA0_SX28 +FSXA0_SX298 +FSXA0_SX388 +FTLH0_SA1 +FTLH0_SA2 +FTLH0_SI1009 +FTLH0_SI1390 +FTLH0_SI1639 +FTLH0_SX109 +FTLH0_SX19 +FTLH0_SX199 +FTLH0_SX289 +FTLH0_SX379 +FUTB0_SA1 +FUTB0_SA2 +FUTB0_SI1204 +FUTB0_SI1330 +FUTB0_SI1834 +FUTB0_SX124 +FUTB0_SX214 +FUTB0_SX304 +FUTB0_SX34 +FUTB0_SX394 +MABW0_SA1 +MABW0_SA2 +MABW0_SI1230 +MABW0_SI1664 +MABW0_SI2294 +MABW0_SX134 +MABW0_SX224 
+MABW0_SX314 +MABW0_SX404 +MABW0_SX44 +MAHH0_SA1 +MAHH0_SA2 +MAHH0_SI1294 +MAHH0_SI1924 +MAHH0_SI664 +MAHH0_SX124 +MAHH0_SX214 +MAHH0_SX304 +MAHH0_SX34 +MAHH0_SX394 +MAJC0_SA1 +MAJC0_SA2 +MAJC0_SI1946 +MAJC0_SI2095 +MAJC0_SI835 +MAJC0_SX115 +MAJC0_SX205 +MAJC0_SX25 +MAJC0_SX295 +MAJC0_SX385 +MBDG0_SA1 +MBDG0_SA2 +MBDG0_SI1463 +MBDG0_SI2093 +MBDG0_SI833 +MBDG0_SX113 +MBDG0_SX203 +MBDG0_SX23 +MBDG0_SX293 +MBDG0_SX383 +MBJK0_SA1 +MBJK0_SA2 +MBJK0_SI1175 +MBJK0_SI2128 +MBJK0_SI545 +MBJK0_SX185 +MBJK0_SX275 +MBJK0_SX365 +MBJK0_SX5 +MBJK0_SX95 +MBNS0_SA1 +MBNS0_SA2 +MBNS0_SI1220 +MBNS0_SI1850 +MBNS0_SI590 +MBNS0_SX140 +MBNS0_SX230 +MBNS0_SX320 +MBNS0_SX410 +MBNS0_SX50 +MBPM0_SA1 +MBPM0_SA2 +MBPM0_SI1577 +MBPM0_SI1584 +MBPM0_SI947 +MBPM0_SX137 +MBPM0_SX227 +MBPM0_SX317 +MBPM0_SX407 +MBPM0_SX47 +MBWM0_SA1 +MBWM0_SA2 +MBWM0_SI1304 +MBWM0_SI1934 +MBWM0_SI674 +MBWM0_SX134 +MBWM0_SX224 +MBWM0_SX314 +MBWM0_SX404 +MBWM0_SX44 +MCCS0_SA1 +MCCS0_SA2 +MCCS0_SI1469 +MCCS0_SI2099 +MCCS0_SI839 +MCCS0_SX119 +MCCS0_SX209 +MCCS0_SX29 +MCCS0_SX299 +MCCS0_SX389 +MCEM0_SA1 +MCEM0_SA2 +MCEM0_SI1398 +MCEM0_SI2028 +MCEM0_SI768 +MCEM0_SX138 +MCEM0_SX228 +MCEM0_SX318 +MCEM0_SX408 +MCEM0_SX48 +MCHH0_SA1 +MCHH0_SA2 +MCHH0_SI1004 +MCHH0_SI1634 +MCHH0_SI530 +MCHH0_SX104 +MCHH0_SX14 +MCHH0_SX194 +MCHH0_SX284 +MCHH0_SX374 +MCMB0_SA1 +MCMB0_SA2 +MCMB0_SI1268 +MCMB0_SI1898 +MCMB0_SI638 +MCMB0_SX188 +MCMB0_SX278 +MCMB0_SX368 +MCMB0_SX8 +MCMB0_SX98 +MCMJ0_SA1 +MCMJ0_SA2 +MCMJ0_SI1094 +MCMJ0_SI464 +MCMJ0_SI602 +MCMJ0_SX104 +MCMJ0_SX14 +MCMJ0_SX194 +MCMJ0_SX284 +MCMJ0_SX374 +MCRC0_SA1 +MCRC0_SA2 +MCRC0_SI1092 +MCRC0_SI1722 +MCRC0_SI462 +MCRC0_SX102 +MCRC0_SX12 +MCRC0_SX192 +MCRC0_SX282 +MCRC0_SX372 +MCSH0_SA1 +MCSH0_SA2 +MCSH0_SI1549 +MCSH0_SI2179 +MCSH0_SI919 +MCSH0_SX109 +MCSH0_SX19 +MCSH0_SX199 +MCSH0_SX289 +MCSH0_SX379 +MCTT0_SA1 +MCTT0_SA2 +MCTT0_SI1144 +MCTT0_SI2188 +MCTT0_SI928 +MCTT0_SX118 +MCTT0_SX208 +MCTT0_SX28 +MCTT0_SX298 +MCTT0_SX388 +MCTW0_SA1 +MCTW0_SA2 +MCTW0_SI1373 +MCTW0_SI2003 +MCTW0_SI743 +MCTW0_SX113 +MCTW0_SX203 +MCTW0_SX23 +MCTW0_SX293 +MCTW0_SX383 +MDAB0_SA1 +MDAB0_SA2 +MDAB0_SI1039 +MDAB0_SI1669 +MDAB0_SI2299 +MDAB0_SX139 +MDAB0_SX229 +MDAB0_SX319 +MDAB0_SX409 +MDAB0_SX49 +MDAC2_SA1 +MDAC2_SA2 +MDAC2_SI2259 +MDAC2_SI560 +MDAC2_SI999 +MDAC2_SX189 +MDAC2_SX279 +MDAC2_SX369 +MDAC2_SX9 +MDAC2_SX99 +MDAW1_SA1 +MDAW1_SA2 +MDAW1_SI1453 +MDAW1_SI2083 +MDAW1_SI823 +MDAW1_SX103 +MDAW1_SX13 +MDAW1_SX193 +MDAW1_SX283 +MDAW1_SX373 +MDBB0_SA1 +MDBB0_SA2 +MDBB0_SI1195 +MDBB0_SI1825 +MDBB0_SI565 +MDBB0_SX115 +MDBB0_SX205 +MDBB0_SX25 +MDBB0_SX295 +MDBB0_SX385 +MDLD0_SA1 +MDLD0_SA2 +MDLD0_SI1543 +MDLD0_SI2173 +MDLD0_SI913 +MDLD0_SX103 +MDLD0_SX13 +MDLD0_SX193 +MDLD0_SX283 +MDLD0_SX373 +MDLF0_SA1 +MDLF0_SA2 +MDLF0_SI1583 +MDLF0_SI2213 +MDLF0_SI953 +MDLF0_SX143 +MDLF0_SX233 +MDLF0_SX323 +MDLF0_SX413 +MDLF0_SX53 +MDLS0_SA1 +MDLS0_SA2 +MDLS0_SI1628 +MDLS0_SI2258 +MDLS0_SI998 +MDLS0_SX188 +MDLS0_SX278 +MDLS0_SX368 +MDLS0_SX8 +MDLS0_SX98 +MDRB0_SA1 +MDRB0_SA2 +MDRB0_SI1174 +MDRB0_SI2109 +MDRB0_SI544 +MDRB0_SX184 +MDRB0_SX274 +MDRB0_SX364 +MDRB0_SX4 +MDRB0_SX94 +MDRM0_SA1 +MDRM0_SA2 +MDRM0_SI1013 +MDRM0_SI1643 +MDRM0_SI2273 +MDRM0_SX113 +MDRM0_SX203 +MDRM0_SX23 +MDRM0_SX293 +MDRM0_SX383 +MDSC0_SA1 +MDSC0_SA2 +MDSC0_SI1038 +MDSC0_SI2298 +MDSC0_SI967 +MDSC0_SX138 +MDSC0_SX228 +MDSC0_SX318 +MDSC0_SX408 +MDSC0_SX48 +MDVC0_SA1 +MDVC0_SA2 +MDVC0_SI2174 +MDVC0_SI2196 +MDVC0_SI936 +MDVC0_SX126 +MDVC0_SX216 +MDVC0_SX306 +MDVC0_SX36 +MDVC0_SX396 +MDWA0_SA1 +MDWA0_SA2 +MDWA0_SI1146 +MDWA0_SI1445 +MDWA0_SI519 +MDWA0_SX185 +MDWA0_SX275 
+MDWA0_SX365 +MDWA0_SX5 +MDWA0_SX95 +MDWK0_SA1 +MDWK0_SA2 +MDWK0_SI1540 +MDWK0_SI2170 +MDWK0_SI910 +MDWK0_SX10 +MDWK0_SX100 +MDWK0_SX190 +MDWK0_SX280 +MDWK0_SX370 +MERS0_SA1 +MERS0_SA2 +MERS0_SI1019 +MERS0_SI1649 +MERS0_SI497 +MERS0_SX119 +MERS0_SX209 +MERS0_SX29 +MERS0_SX299 +MERS0_SX389 +MESD0_SA1 +MESD0_SA2 +MESD0_SI1002 +MESD0_SI1632 +MESD0_SI2262 +MESD0_SX102 +MESD0_SX12 +MESD0_SX192 +MESD0_SX282 +MESD0_SX372 +MFGK0_SA1 +MFGK0_SA2 +MFGK0_SI1451 +MFGK0_SI1744 +MFGK0_SI484 +MFGK0_SX124 +MFGK0_SX214 +MFGK0_SX304 +MFGK0_SX34 +MFGK0_SX394 +MGJF0_SA1 +MGJF0_SA2 +MGJF0_SI1901 +MGJF0_SI641 +MGJF0_SI776 +MGJF0_SX101 +MGJF0_SX11 +MGJF0_SX191 +MGJF0_SX281 +MGJF0_SX371 +MGLB0_SA1 +MGLB0_SA2 +MGLB0_SI1534 +MGLB0_SI2164 +MGLB0_SI904 +MGLB0_SX184 +MGLB0_SX274 +MGLB0_SX364 +MGLB0_SX4 +MGLB0_SX94 +MGMM0_SA1 +MGMM0_SA2 +MGMM0_SI1129 +MGMM0_SI1759 +MGMM0_SI499 +MGMM0_SX139 +MGMM0_SX229 +MGMM0_SX319 +MGMM0_SX409 +MGMM0_SX49 +MGRT0_SA1 +MGRT0_SA2 +MGRT0_SI1450 +MGRT0_SI2080 +MGRT0_SI820 +MGRT0_SX10 +MGRT0_SX100 +MGRT0_SX190 +MGRT0_SX280 +MGRT0_SX370 +MGWT0_SA1 +MGWT0_SA2 +MGWT0_SI1539 +MGWT0_SI2169 +MGWT0_SI909 +MGWT0_SX189 +MGWT0_SX279 +MGWT0_SX369 +MGWT0_SX9 +MGWT0_SX99 +MHPG0_SA1 +MHPG0_SA2 +MHPG0_SI1090 +MHPG0_SI1720 +MHPG0_SI460 +MHPG0_SX10 +MHPG0_SX100 +MHPG0_SX190 +MHPG0_SX280 +MHPG0_SX370 +MJAR0_SA1 +MJAR0_SA2 +MJAR0_SI1988 +MJAR0_SI2247 +MJAR0_SI728 +MJAR0_SX188 +MJAR0_SX278 +MJAR0_SX368 +MJAR0_SX8 +MJAR0_SX98 +MJBR0_SA1 +MJBR0_SA2 +MJBR0_SI1001 +MJBR0_SI1631 +MJBR0_SI2261 +MJBR0_SX101 +MJBR0_SX11 +MJBR0_SX191 +MJBR0_SX281 +MJBR0_SX371 +MJDH0_SA1 +MJDH0_SA2 +MJDH0_SI1354 +MJDH0_SI1984 +MJDH0_SI724 +MJDH0_SX184 +MJDH0_SX274 +MJDH0_SX364 +MJDH0_SX4 +MJDH0_SX94 +MJDM1_SA1 +MJDM1_SA2 +MJDM1_SI1085 +MJDM1_SI1715 +MJDM1_SI455 +MJDM1_SX185 +MJDM1_SX275 +MJDM1_SX365 +MJDM1_SX5 +MJDM1_SX95 +MJES0_SA1 +MJES0_SA2 +MJES0_SI1384 +MJES0_SI2014 +MJES0_SI754 +MJES0_SX124 +MJES0_SX214 +MJES0_SX304 +MJES0_SX34 +MJES0_SX394 +MJFC0_SA1 +MJFC0_SA2 +MJFC0_SI1033 +MJFC0_SI1663 +MJFC0_SI2293 +MJFC0_SX133 +MJFC0_SX223 +MJFC0_SX313 +MJFC0_SX403 +MJFC0_SX43 +MJJG0_SA1 +MJJG0_SA2 +MJJG0_SI1003 +MJJG0_SI1633 +MJJG0_SI2263 +MJJG0_SX103 +MJJG0_SX13 +MJJG0_SX193 +MJJG0_SX283 +MJJG0_SX373 +MJLN0_SA1 +MJLN0_SA2 +MJLN0_SI1449 +MJLN0_SI2079 +MJLN0_SI819 +MJLN0_SX189 +MJLN0_SX279 +MJLN0_SX369 +MJLN0_SX9 +MJLN0_SX99 +MJMP0_SA1 +MJMP0_SA2 +MJMP0_SI1535 +MJMP0_SI1791 +MJMP0_SI905 +MJMP0_SX185 +MJMP0_SX275 +MJMP0_SX365 +MJMP0_SX5 +MJMP0_SX95 +MJRF0_SA1 +MJRF0_SA2 +MJRF0_SI1114 +MJRF0_SI2081 +MJRF0_SI821 +MJRF0_SX101 +MJRF0_SX11 +MJRF0_SX191 +MJRF0_SX281 +MJRF0_SX371 +MJSW0_SA1 +MJSW0_SA2 +MJSW0_SI1010 +MJSW0_SI1640 +MJSW0_SI2270 +MJSW0_SX110 +MJSW0_SX20 +MJSW0_SX200 +MJSW0_SX290 +MJSW0_SX380 +MJTC0_SA1 +MJTC0_SA2 +MJTC0_SI1460 +MJTC0_SI2090 +MJTC0_SI830 +MJTC0_SX110 +MJTC0_SX20 +MJTC0_SX200 +MJTC0_SX290 +MJTC0_SX380 +MJTH0_SA1 +MJTH0_SA2 +MJTH0_SI1296 +MJTH0_SI1926 +MJTH0_SI666 +MJTH0_SX126 +MJTH0_SX216 +MJTH0_SX306 +MJTH0_SX36 +MJTH0_SX396 +MJVW0_SA1 +MJVW0_SA2 +MJVW0_SI1733 +MJVW0_SI1758 +MJVW0_SI473 +MJVW0_SX113 +MJVW0_SX203 +MJVW0_SX23 +MJVW0_SX293 +MJVW0_SX383 +MKCH0_SA1 +MKCH0_SA2 +MKCH0_SI1378 +MKCH0_SI1425 +MKCH0_SI2008 +MKCH0_SX118 +MKCH0_SX208 +MKCH0_SX28 +MKCH0_SX298 +MKCH0_SX388 +MKCL0_SA1 +MKCL0_SA2 +MKCL0_SI1091 +MKCL0_SI1721 +MKCL0_SI461 +MKCL0_SX101 +MKCL0_SX11 +MKCL0_SX191 +MKCL0_SX281 +MKCL0_SX371 +MKDR0_SA1 +MKDR0_SA2 +MKDR0_SI1273 +MKDR0_SI1903 +MKDR0_SI643 +MKDR0_SX103 +MKDR0_SX13 +MKDR0_SX193 +MKDR0_SX283 +MKDR0_SX373 +MKJL0_SA1 +MKJL0_SA2 +MKJL0_SI1100 +MKJL0_SI1730 +MKJL0_SI470 +MKJL0_SX110 +MKJL0_SX20 
+MKJL0_SX200 +MKJL0_SX290 +MKJL0_SX380 +MKLT0_SA1 +MKLT0_SA2 +MKLT0_SI1213 +MKLT0_SI1843 +MKLT0_SI583 +MKLT0_SX133 +MKLT0_SX223 +MKLT0_SX313 +MKLT0_SX403 +MKLT0_SX43 +MLIH0_SA1 +MLIH0_SA2 +MLIH0_SI1183 +MLIH0_SI1813 +MLIH0_SI553 +MLIH0_SX103 +MLIH0_SX13 +MLIH0_SX193 +MLIH0_SX283 +MLIH0_SX373 +MLJB0_SA1 +MLJB0_SA2 +MLJB0_SI1310 +MLJB0_SI1940 +MLJB0_SI680 +MLJB0_SX140 +MLJB0_SX230 +MLJB0_SX320 +MLJB0_SX410 +MLJB0_SX50 +MLLL0_SA1 +MLLL0_SA2 +MLLL0_SI1363 +MLLL0_SI1993 +MLLL0_SI733 +MLLL0_SX103 +MLLL0_SX13 +MLLL0_SX193 +MLLL0_SX283 +MLLL0_SX373 +MLNT0_SA1 +MLNT0_SA2 +MLNT0_SI1574 +MLNT0_SI1902 +MLNT0_SI642 +MLNT0_SX102 +MLNT0_SX12 +MLNT0_SX192 +MLNT0_SX282 +MLNT0_SX372 +MMAB0_SA1 +MMAB0_SA2 +MMAB0_SI1362 +MMAB0_SI1992 +MMAB0_SI732 +MMAB0_SX102 +MMAB0_SX12 +MMAB0_SX192 +MMAB0_SX282 +MMAB0_SX372 +MMDB1_SA1 +MMDB1_SA2 +MMDB1_SI1625 +MMDB1_SI2255 +MMDB1_SI995 +MMDB1_SX185 +MMDB1_SX275 +MMDB1_SX365 +MMDB1_SX5 +MMDB1_SX95 +MMDH0_SA1 +MMDH0_SA2 +MMDH0_SI1656 +MMDH0_SI2118 +MMDH0_SI2286 +MMDH0_SX126 +MMDH0_SX216 +MMDH0_SX306 +MMDH0_SX36 +MMDH0_SX396 +MMDM2_SA1 +MMDM2_SA2 +MMDM2_SI1452 +MMDM2_SI1555 +MMDM2_SI2082 +MMDM2_SX102 +MMDM2_SX12 +MMDM2_SX192 +MMDM2_SX282 +MMDM2_SX372 +MMJR0_SA1 +MMJR0_SA2 +MMJR0_SI1648 +MMJR0_SI2166 +MMJR0_SI2278 +MMJR0_SX118 +MMJR0_SX208 +MMJR0_SX28 +MMJR0_SX298 +MMJR0_SX388 +MMWH0_SA1 +MMWH0_SA2 +MMWH0_SI1089 +MMWH0_SI1301 +MMWH0_SI459 +MMWH0_SX189 +MMWH0_SX279 +MMWH0_SX369 +MMWH0_SX9 +MMWH0_SX99 +MNJM0_SA1 +MNJM0_SA2 +MNJM0_SI1580 +MNJM0_SI2210 +MNJM0_SI950 +MNJM0_SX140 +MNJM0_SX230 +MNJM0_SX320 +MNJM0_SX410 +MNJM0_SX50 +MNLS0_SA1 +MNLS0_SA2 +MNLS0_SI1483 +MNLS0_SI1610 +MNLS0_SI853 +MNLS0_SX133 +MNLS0_SX223 +MNLS0_SX313 +MNLS0_SX403 +MNLS0_SX43 +MPAB0_SA1 +MPAB0_SA2 +MPAB0_SI1103 +MPAB0_SI1128 +MPAB0_SI498 +MPAB0_SX138 +MPAB0_SX228 +MPAB0_SX318 +MPAB0_SX408 +MPAB0_SX48 +MPAM0_SA1 +MPAM0_SA2 +MPAM0_SI1189 +MPAM0_SI1819 +MPAM0_SI1961 +MPAM0_SX109 +MPAM0_SX19 +MPAM0_SX199 +MPAM0_SX289 +MPAM0_SX379 +MPAM1_SA1 +MPAM1_SA2 +MPAM1_SI1029 +MPAM1_SI1836 +MPAM1_SI576 +MPAM1_SX126 +MPAM1_SX216 +MPAM1_SX306 +MPAM1_SX36 +MPAM1_SX396 +MPCS0_SA1 +MPCS0_SA2 +MPCS0_SI1359 +MPCS0_SI1989 +MPCS0_SI729 +MPCS0_SX189 +MPCS0_SX279 +MPCS0_SX369 +MPCS0_SX9 +MPCS0_SX99 +MPDF0_SA1 +MPDF0_SA2 +MPDF0_SI1542 +MPDF0_SI2172 +MPDF0_SI912 +MPDF0_SX102 +MPDF0_SX12 +MPDF0_SX192 +MPDF0_SX282 +MPDF0_SX372 +MPGL0_SA1 +MPGL0_SA2 +MPGL0_SI1099 +MPGL0_SI1729 +MPGL0_SI469 +MPGL0_SX109 +MPGL0_SX19 +MPGL0_SX199 +MPGL0_SX289 +MPGL0_SX379 +MPLB0_SA1 +MPLB0_SA2 +MPLB0_SI1394 +MPLB0_SI2024 +MPLB0_SI764 +MPLB0_SX134 +MPLB0_SX224 +MPLB0_SX314 +MPLB0_SX404 +MPLB0_SX44 +MPWM0_SA1 +MPWM0_SA2 +MPWM0_SI1127 +MPWM0_SI1757 +MPWM0_SI2279 +MPWM0_SX137 +MPWM0_SX227 +MPWM0_SX317 +MPWM0_SX407 +MPWM0_SX47 +MRCS0_SA1 +MRCS0_SA2 +MRCS0_SI1223 +MRCS0_SI1853 +MRCS0_SI593 +MRCS0_SX143 +MRCS0_SX233 +MRCS0_SX323 +MRCS0_SX413 +MRCS0_SX53 +MRCZ0_SA1 +MRCZ0_SA2 +MRCZ0_SI1541 +MRCZ0_SI2171 +MRCZ0_SI911 +MRCZ0_SX101 +MRCZ0_SX11 +MRCZ0_SX191 +MRCZ0_SX281 +MRCZ0_SX371 +MREB0_SA1 +MREB0_SA2 +MREB0_SI1375 +MREB0_SI2005 +MREB0_SI745 +MREB0_SX115 +MREB0_SX205 +MREB0_SX25 +MREB0_SX295 +MREB0_SX385 +MRES0_SA1 +MRES0_SA2 +MRES0_SI1217 +MRES0_SI1847 +MRES0_SI587 +MRES0_SX137 +MRES0_SX227 +MRES0_SX317 +MRES0_SX407 +MRES0_SX47 +MRGG0_SA1 +MRGG0_SA2 +MRGG0_SI1199 +MRGG0_SI1829 +MRGG0_SI569 +MRGG0_SX119 +MRGG0_SX209 +MRGG0_SX29 +MRGG0_SX299 +MRGG0_SX389 +MRJM3_SA1 +MRJM3_SA2 +MRJM3_SI1448 +MRJM3_SI1809 +MRJM3_SI2078 +MRJM3_SX188 +MRJM3_SX278 +MRJM3_SX368 +MRJM3_SX8 +MRJM3_SX98 +MRJM4_SA1 +MRJM4_SA2 +MRJM4_SI1489 +MRJM4_SI2119 +MRJM4_SI859 +MRJM4_SX139 +MRJM4_SX229 
+MRJM4_SX319 +MRJM4_SX409 +MRJM4_SX49 +MRJO0_SA1 +MRJO0_SA2 +MRJO0_SI1364 +MRJO0_SI1624 +MRJO0_SI734 +MRJO0_SX104 +MRJO0_SX14 +MRJO0_SX194 +MRJO0_SX284 +MRJO0_SX374 +MRJR0_SA1 +MRJR0_SA2 +MRJR0_SI1182 +MRJR0_SI1812 +MRJR0_SI2313 +MRJR0_SX102 +MRJR0_SX12 +MRJR0_SX192 +MRJR0_SX282 +MRJR0_SX372 +MRJS0_SA1 +MRJS0_SA2 +MRJS0_SI1444 +MRJS0_SI1523 +MRJS0_SI2074 +MRJS0_SX184 +MRJS0_SX274 +MRJS0_SX364 +MRJS0_SX4 +MRJS0_SX94 +MRKO0_SA1 +MRKO0_SA2 +MRKO0_SI1397 +MRKO0_SI2027 +MRKO0_SI767 +MRKO0_SX137 +MRKO0_SX227 +MRKO0_SX317 +MRKO0_SX407 +MRKO0_SX47 +MRMS1_SA1 +MRMS1_SA2 +MRMS1_SI1487 +MRMS1_SI2117 +MRMS1_SI857 +MRMS1_SX137 +MRMS1_SX227 +MRMS1_SX317 +MRMS1_SX407 +MRMS1_SX47 +MROA0_SA1 +MROA0_SA2 +MROA0_SI1307 +MROA0_SI1970 +MROA0_SI677 +MROA0_SX137 +MROA0_SX227 +MROA0_SX317 +MROA0_SX407 +MROA0_SX47 +MRPC0_SA1 +MRPC0_SA2 +MRPC0_SI1753 +MRPC0_SI493 +MRPC0_SI933 +MRPC0_SX133 +MRPC0_SX223 +MRPC0_SX313 +MRPC0_SX403 +MRPC0_SX43 +MRPP0_SA1 +MRPP0_SA2 +MRPP0_SI1184 +MRPP0_SI1814 +MRPP0_SI554 +MRPP0_SX104 +MRPP0_SX14 +MRPP0_SX194 +MRPP0_SX284 +MRPP0_SX374 +MRRK0_SA1 +MRRK0_SA2 +MRRK0_SI1288 +MRRK0_SI1716 +MRRK0_SI1918 +MRRK0_SX118 +MRRK0_SX208 +MRRK0_SX28 +MRRK0_SX298 +MRRK0_SX388 +MRTK0_SA1 +MRTK0_SA2 +MRTK0_SI1093 +MRTK0_SI1723 +MRTK0_SI1750 +MRTK0_SX103 +MRTK0_SX13 +MRTK0_SX193 +MRTK0_SX283 +MRTK0_SX373 +MRWS1_SA1 +MRWS1_SA2 +MRWS1_SI1130 +MRWS1_SI1496 +MRWS1_SI500 +MRWS1_SX140 +MRWS1_SX230 +MRWS1_SX320 +MRWS1_SX410 +MRWS1_SX50 +MSFH1_SA1 +MSFH1_SA2 +MSFH1_SI1270 +MSFH1_SI1900 +MSFH1_SI640 +MSFH1_SX10 +MSFH1_SX100 +MSFH1_SX190 +MSFH1_SX280 +MSFH1_SX370 +MSJS1_SA1 +MSJS1_SA2 +MSJS1_SI1899 +MSJS1_SI639 +MSJS1_SI869 +MSJS1_SX189 +MSJS1_SX279 +MSJS1_SX369 +MSJS1_SX9 +MSJS1_SX99 +MSLB0_SA1 +MSLB0_SA2 +MSLB0_SI1193 +MSLB0_SI1823 +MSLB0_SI563 +MSLB0_SX113 +MSLB0_SX203 +MSLB0_SX23 +MSLB0_SX293 +MSLB0_SX383 +MSTK0_SA1 +MSTK0_SA2 +MSTK0_SI1024 +MSTK0_SI2222 +MSTK0_SI2284 +MSTK0_SX124 +MSTK0_SX214 +MSTK0_SX304 +MSTK0_SX34 +MSTK0_SX394 +MTAA0_SA1 +MTAA0_SA2 +MTAA0_SI1285 +MTAA0_SI1915 +MTAA0_SI596 +MTAA0_SX115 +MTAA0_SX205 +MTAA0_SX25 +MTAA0_SX295 +MTAA0_SX385 +MTAS1_SA1 +MTAS1_SA2 +MTAS1_SI1473 +MTAS1_SI2098 +MTAS1_SI838 +MTAS1_SX118 +MTAS1_SX208 +MTAS1_SX28 +MTAS1_SX298 +MTAS1_SX388 +MTDT0_SA1 +MTDT0_SA2 +MTDT0_SI1994 +MTDT0_SI2254 +MTDT0_SI994 +MTDT0_SX184 +MTDT0_SX274 +MTDT0_SX364 +MTDT0_SX4 +MTDT0_SX94 +MTEB0_SA1 +MTEB0_SA2 +MTEB0_SI1133 +MTEB0_SI2064 +MTEB0_SI503 +MTEB0_SX143 +MTEB0_SX233 +MTEB0_SX323 +MTEB0_SX413 +MTEB0_SX53 +MTHC0_SA1 +MTHC0_SA2 +MTHC0_SI1015 +MTHC0_SI1645 +MTHC0_SI2275 +MTHC0_SX115 +MTHC0_SX205 +MTHC0_SX25 +MTHC0_SX295 +MTHC0_SX385 +MTLS0_SA1 +MTLS0_SA2 +MTLS0_SI1370 +MTLS0_SI2000 +MTLS0_SI740 +MTLS0_SX110 +MTLS0_SX20 +MTLS0_SX200 +MTLS0_SX290 +MTLS0_SX380 +MTMR0_SA1 +MTMR0_SA2 +MTMR0_SI1303 +MTMR0_SI1933 +MTMR0_SI673 +MTMR0_SX133 +MTMR0_SX223 +MTMR0_SX313 +MTMR0_SX403 +MTMR0_SX43 +MTWH0_SA1 +MTWH0_SA2 +MTWH0_SI1190 +MTWH0_SI1629 +MTWH0_SI1820 +MTWH0_SX110 +MTWH0_SX20 +MTWH0_SX200 +MTWH0_SX290 +MTWH0_SX380 +MWBT0_SA1 +MWBT0_SA2 +MWBT0_SI1553 +MWBT0_SI2183 +MWBT0_SI923 +MWBT0_SX113 +MWBT0_SX203 +MWBT0_SX23 +MWBT0_SX293 +MWBT0_SX383 +MWEW0_SA1 +MWEW0_SA2 +MWEW0_SI1361 +MWEW0_SI1991 +MWEW0_SI731 +MWEW0_SX101 +MWEW0_SX11 +MWEW0_SX191 +MWEW0_SX281 +MWEW0_SX371 +MWJG0_SA1 +MWJG0_SA2 +MWJG0_SI1124 +MWJG0_SI1754 +MWJG0_SI494 +MWJG0_SX134 +MWJG0_SX224 +MWJG0_SX314 +MWJG0_SX404 +MWJG0_SX44 +MWVW0_SA1 +MWVW0_SA2 +MWVW0_SI1476 +MWVW0_SI2106 +MWVW0_SI846 +MWVW0_SX126 +MWVW0_SX216 +MWVW0_SX306 +MWVW0_SX36 +MWVW0_SX396 diff --git a/examples/wav2vec/unsupervised/config/timit_unmatched/train.uid 
b/examples/wav2vec/unsupervised/config/timit_unmatched/train.uid new file mode 100644 index 0000000000..35b02e7f82 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_unmatched/train.uid @@ -0,0 +1,3000 @@ +FAEM0_SA1 +FAEM0_SA2 +FAEM0_SI2022 +FAEM0_SX132 +FAEM0_SX222 +FAEM0_SX312 +FAEM0_SX402 +FAJW0_SA2 +FAJW0_SI1893 +FAJW0_SX183 +FAJW0_SX273 +FAJW0_SX363 +FALK0_SA1 +FALK0_SA2 +FALK0_SI1086 +FALK0_SI456 +FALK0_SX276 +FALK0_SX366 +FALK0_SX96 +FALR0_SA1 +FALR0_SA2 +FALR0_SI1955 +FALR0_SI695 +FALR0_SX155 +FALR0_SX245 +FALR0_SX425 +FALR0_SX65 +FAPB0_SA1 +FAPB0_SA2 +FAPB0_SI1693 +FAPB0_SX163 +FAPB0_SX253 +FAPB0_SX343 +FAPB0_SX73 +FBAS0_SA2 +FBAS0_SI1387 +FBAS0_SX127 +FBAS0_SX307 +FBAS0_SX37 +FBAS0_SX397 +FBCG1_SA2 +FBCG1_SI1612 +FBCG1_SI2242 +FBCG1_SI982 +FBCG1_SX262 +FBCG1_SX82 +FBCH0_SA1 +FBCH0_SA2 +FBCH0_SI1586 +FBCH0_SI956 +FBCH0_SX146 +FBCH0_SX326 +FBCH0_SX56 +FBJL0_SA1 +FBJL0_SA2 +FBJL0_SI1552 +FBJL0_SI2182 +FBJL0_SX112 +FBJL0_SX202 +FBJL0_SX22 +FBJL0_SX292 +FBJL0_SX382 +FBLV0_SA2 +FBLV0_SI2318 +FBLV0_SX158 +FBLV0_SX248 +FBLV0_SX428 +FBMH0_SA2 +FBMH0_SI1766 +FBMH0_SX146 +FBMH0_SX236 +FBMH0_SX326 +FBMH0_SX416 +FBMH0_SX56 +FBMJ0_SA2 +FBMJ0_SX156 +FBMJ0_SX246 +FBMJ0_SX426 +FBMJ0_SX66 +FCAG0_SA2 +FCAG0_SI1503 +FCAG0_SI1641 +FCAG0_SI2133 +FCAG0_SX333 +FCAG0_SX423 +FCAG0_SX63 +FCAJ0_SA1 +FCAJ0_SA2 +FCAJ0_SI1804 +FCAJ0_SI849 +FCAJ0_SX129 +FCAJ0_SX219 +FCAJ0_SX39 +FCAJ0_SX399 +FCDR1_SA1 +FCDR1_SA2 +FCDR1_SX16 +FCDR1_SX376 +FCEG0_SA1 +FCEG0_SI1248 +FCEG0_SI1878 +FCEG0_SI618 +FCEG0_SX168 +FCEG0_SX258 +FCEG0_SX348 +FCEG0_SX438 +FCEG0_SX78 +FCJF0_SA2 +FCJF0_SI1027 +FCJF0_SI1657 +FCJF0_SI648 +FCJF0_SX217 +FCJF0_SX307 +FCJF0_SX37 +FCJF0_SX397 +FCJS0_SA1 +FCJS0_SA2 +FCJS0_SI977 +FCJS0_SX167 +FCJS0_SX347 +FCJS0_SX437 +FCJS0_SX77 +FCKE0_SA1 +FCKE0_SI1111 +FCKE0_SX211 +FCKE0_SX301 +FCKE0_SX31 +FCKE0_SX391 +FCLT0_SA1 +FCLT0_SA2 +FCLT0_SI1438 +FCLT0_SX178 +FCLT0_SX268 +FCLT0_SX358 +FCMG0_SA1 +FCMG0_SI1242 +FCMG0_SX162 +FCMG0_SX252 +FCMG0_SX342 +FCMM0_SI1083 +FCMM0_SI453 +FCMM0_SX273 +FCMM0_SX363 +FCMM0_SX93 +FCRZ0_SA1 +FCRZ0_SA2 +FCRZ0_SI1913 +FCRZ0_SI793 +FCRZ0_SX163 +FCRZ0_SX253 +FCRZ0_SX343 +FCRZ0_SX73 +FCYL0_SA2 +FCYL0_SI1297 +FCYL0_SI1927 +FCYL0_SX127 +FCYL0_SX217 +FCYL0_SX397 +FDAS1_SA1 +FDAS1_SA2 +FDAS1_SX111 +FDAS1_SX21 +FDAS1_SX291 +FDAW0_SA1 +FDAW0_SA2 +FDAW0_SX146 +FDAW0_SX236 +FDAW0_SX326 +FDAW0_SX416 +FDAW0_SX56 +FDFB0_SI1318 +FDFB0_SI1948 +FDFB0_SX148 +FDFB0_SX238 +FDFB0_SX328 +FDFB0_SX418 +FDJH0_SA1 +FDJH0_SA2 +FDJH0_SI1565 +FDJH0_SI2195 +FDJH0_SX125 +FDJH0_SX215 +FDJH0_SX35 +FDJH0_SX395 +FDKN0_SA1 +FDKN0_SA2 +FDKN0_SI1081 +FDKN0_SI1711 +FDKN0_SX271 +FDKN0_SX361 +FDKN0_SX91 +FDML0_SA1 +FDML0_SI1149 +FDML0_SI1779 +FDML0_SI2075 +FDML0_SX339 +FDML0_SX69 +FDMY0_SI1197 +FDMY0_SX117 +FDMY0_SX207 +FDMY0_SX297 +FDNC0_SA1 +FDNC0_SA2 +FDNC0_SI2287 +FDNC0_SX108 +FDNC0_SX18 +FDNC0_SX378 +FDTD0_SA2 +FDTD0_SI1561 +FDTD0_SI2191 +FDTD0_SI931 +FDTD0_SX121 +FDTD0_SX301 +FDTD0_SX391 +FDXW0_SA2 +FDXW0_SI1511 +FDXW0_SI2141 +FDXW0_SI881 +FDXW0_SX161 +FDXW0_SX431 +FEAC0_SA1 +FEAC0_SA2 +FEAC0_SI1245 +FEAC0_SI1875 +FEAC0_SX255 +FEAC0_SX345 +FEAC0_SX435 +FEAR0_SA1 +FEAR0_SA2 +FEAR0_SI1252 +FEAR0_SI1882 +FEAR0_SX172 +FEAR0_SX262 +FEAR0_SX442 +FEAR0_SX82 +FECD0_SA2 +FECD0_SI2048 +FECD0_SX158 +FECD0_SX248 +FECD0_SX338 +FECD0_SX428 +FEEH0_SA2 +FEEH0_SI1112 +FEEH0_SX212 +FEEH0_SX302 +FEEH0_SX32 +FEEH0_SX392 +FEME0_SA2 +FEME0_SI1505 +FEME0_SI2135 +FEME0_SX245 +FEME0_SX425 +FETB0_SA2 +FETB0_SI1778 +FETB0_SI518 +FETB0_SX248 +FETB0_SX338 +FETB0_SX428 +FETB0_SX68 +FEXM0_SA2 +FEXM0_SI1731 +FEXM0_SX111 +FEXM0_SX201 
+FEXM0_SX291 +FEXM0_SX381 +FGCS0_SA1 +FGCS0_SA2 +FGCS0_SI1486 +FGCS0_SI2116 +FGCS0_SI856 +FGCS0_SX46 +FGDP0_SA2 +FGDP0_SI1618 +FGDP0_SI2248 +FGDP0_SX178 +FGDP0_SX268 +FGDP0_SX358 +FGDP0_SX448 +FGMB0_SA1 +FGMB0_SA2 +FGMB0_SI515 +FGMB0_SX155 +FGMB0_SX425 +FGMB0_SX65 +FGRW0_SA2 +FGRW0_SI1782 +FGRW0_SI1990 +FGRW0_SX252 +FGRW0_SX342 +FGRW0_SX72 +FHLM0_SA1 +FHLM0_SA2 +FHLM0_SI1560 +FHLM0_SI2190 +FHLM0_SI930 +FHLM0_SX210 +FHLM0_SX300 +FHXS0_SI2335 +FHXS0_SX265 +FHXS0_SX355 +FHXS0_SX85 +FJDM2_SI1582 +FJDM2_SI1964 +FJDM2_SI2212 +FJDM2_SX322 +FJDM2_SX412 +FJEN0_SA2 +FJEN0_SI1047 +FJEN0_SI1677 +FJEN0_SI2307 +FJEN0_SX147 +FJEN0_SX237 +FJEN0_SX57 +FJHK0_SA1 +FJHK0_SA2 +FJHK0_SI1022 +FJHK0_SI1652 +FJHK0_SX122 +FJHK0_SX212 +FJHK0_SX32 +FJHK0_SX392 +FJKL0_SA1 +FJKL0_SA2 +FJKL0_SI1562 +FJKL0_SI2192 +FJKL0_SX122 +FJKL0_SX302 +FJKL0_SX32 +FJLG0_SA1 +FJLG0_SA2 +FJLG0_SI1506 +FJLG0_SX179 +FJLG0_SX269 +FJLG0_SX359 +FJLG0_SX449 +FJLG0_SX89 +FJLR0_SA2 +FJLR0_SI1861 +FJLR0_SI601 +FJLR0_SX151 +FJLR0_SX241 +FJLR0_SX331 +FJLR0_SX421 +FJLR0_SX61 +FJRB0_SA1 +FJRB0_SA2 +FJRB0_SI1302 +FJRB0_SI1932 +FJRB0_SI672 +FJRB0_SX132 +FJRB0_SX222 +FJRB0_SX312 +FJRB0_SX42 +FJRP1_SA2 +FJRP1_SI802 +FJRP1_SX172 +FJRP1_SX442 +FJSK0_SA2 +FJSK0_SI1682 +FJSK0_SI2312 +FJSK0_SX152 +FJSK0_SX242 +FJSK0_SX332 +FJSK0_SX422 +FJSK0_SX62 +FJSP0_SA1 +FJSP0_SA2 +FJSP0_SI1763 +FJSP0_SI804 +FJSP0_SX174 +FJSP0_SX84 +FJWB1_SA2 +FJWB1_SI2055 +FJWB1_SI795 +FJWB1_SX165 +FJWB1_SX255 +FJWB1_SX75 +FJXM0_SA2 +FJXM0_SI1211 +FJXM0_SI1971 +FJXM0_SX131 +FJXM0_SX221 +FJXP0_SA2 +FJXP0_SI492 +FJXP0_SX222 +FJXP0_SX312 +FJXP0_SX402 +FJXP0_SX42 +FKAA0_SA2 +FKAA0_SI1208 +FKAA0_SI1838 +FKAA0_SI578 +FKAA0_SX218 +FKAA0_SX308 +FKAA0_SX38 +FKDE0_SA2 +FKDE0_SI2221 +FKDE0_SX331 +FKDW0_SA1 +FKDW0_SA2 +FKDW0_SI577 +FKDW0_SX127 +FKDW0_SX217 +FKDW0_SX307 +FKDW0_SX37 +FKFB0_SA1 +FKFB0_SI2238 +FKFB0_SI978 +FKFB0_SX168 +FKFB0_SX258 +FKKH0_SI660 +FKKH0_SX210 +FKKH0_SX30 +FKKH0_SX300 +FKLC0_SA1 +FKLC0_SA2 +FKLC0_SI1615 +FKLC0_SI2245 +FKLC0_SX265 +FKLC0_SX445 +FKLC0_SX85 +FKLC1_SA1 +FKLC1_SA2 +FKLC1_SI1678 +FKLC1_SX148 +FKLC1_SX58 +FKLH0_SA1 +FKLH0_SI1887 +FKLH0_SI627 +FKLH0_SX267 +FKLH0_SX357 +FKLH0_SX447 +FKLH0_SX87 +FKSR0_SI1117 +FKSR0_SX161 +FKSR0_SX37 +FKSR0_SX397 +FLAC0_SA1 +FLAC0_SA2 +FLAC0_SI2161 +FLAC0_SI901 +FLAC0_SX181 +FLAC0_SX271 +FLAC0_SX361 +FLAC0_SX91 +FLAG0_SA1 +FLAG0_SI2094 +FLAG0_SX294 +FLEH0_SA1 +FLEH0_SA2 +FLEH0_SX151 +FLEH0_SX241 +FLEH0_SX421 +FLEH0_SX61 +FLET0_SA2 +FLET0_SI1137 +FLET0_SI1767 +FLET0_SX147 +FLET0_SX237 +FLET0_SX277 +FLET0_SX417 +FLET0_SX57 +FLHD0_SA1 +FLHD0_SA2 +FLHD0_SI1344 +FLHD0_SI1974 +FLHD0_SX174 +FLHD0_SX264 +FLHD0_SX444 +FLHD0_SX84 +FLJA0_SA2 +FLJA0_SI1708 +FLJA0_SX268 +FLJA0_SX358 +FLJA0_SX448 +FLJA0_SX88 +FLJD0_SA1 +FLJD0_SA2 +FLJD0_SI2146 +FLJD0_SX166 +FLJD0_SX256 +FLJD0_SX346 +FLJD0_SX436 +FLJG0_SA1 +FLJG0_SI1611 +FLJG0_SI2241 +FLJG0_SX261 +FLJG0_SX441 +FLJG0_SX81 +FLKM0_SI1880 +FLKM0_SX116 +FLMA0_SA2 +FLMA0_SI1243 +FLMA0_SI1873 +FLMA0_SX163 +FLMA0_SX253 +FLMA0_SX343 +FLMC0_SA1 +FLMC0_SA2 +FLMC0_SI2002 +FLMC0_SI742 +FLMC0_SX112 +FLMC0_SX292 +FLMC0_SX336 +FLMC0_SX382 +FLMK0_SA2 +FLMK0_SI2295 +FLMK0_SX135 +FLMK0_SX225 +FLMK0_SX45 +FLOD0_SA1 +FLOD0_SA2 +FLOD0_SI1287 +FLOD0_SI657 +FLOD0_SX207 +FLOD0_SX387 +FLTM0_SA2 +FLTM0_SI1700 +FLTM0_SX260 +FLTM0_SX80 +FMAH1_SA1 +FMAH1_SI1509 +FMAH1_SI2139 +FMAH1_SX249 +FMAH1_SX339 +FMAH1_SX429 +FMAH1_SX69 +FMBG0_SA1 +FMBG0_SI1790 +FMBG0_SX260 +FMBG0_SX3 +FMBG0_SX350 +FMBG0_SX440 +FMBG0_SX80 +FMEM0_SA2 +FMEM0_SI1377 +FMEM0_SI2007 +FMEM0_SX117 +FMEM0_SX207 +FMEM0_SX297 +FMJB0_SA1 +FMJB0_SA2 +FMJB0_SI1807 
+FMJB0_SX187 +FMJB0_SX277 +FMJB0_SX367 +FMJB0_SX7 +FMJF0_SA1 +FMJF0_SI1254 +FMJF0_SI1884 +FMJF0_SX264 +FMJF0_SX354 +FMJF0_SX444 +FMJU0_SA1 +FMJU0_SA2 +FMJU0_SI2019 +FMJU0_SI759 +FMJU0_SX129 +FMJU0_SX219 +FMJU0_SX39 +FMKC0_SA1 +FMKC0_SA2 +FMKC0_SI1072 +FMKC0_SX172 +FMKC0_SX262 +FMKC0_SX352 +FMKF0_SA1 +FMKF0_SA2 +FMKF0_SI1536 +FMKF0_SI906 +FMKF0_SX276 +FMKF0_SX366 +FMKF0_SX6 +FMKF0_SX96 +FMMH0_SA1 +FMMH0_SA2 +FMMH0_SI1537 +FMMH0_SI2167 +FMMH0_SI907 +FMMH0_SX187 +FMMH0_SX367 +FMMH0_SX420 +FMMH0_SX7 +FMMH0_SX97 +FMPG0_SI1602 +FMPG0_SI2232 +FMPG0_SX252 +FMPG0_SX72 +FNKL0_SA1 +FNKL0_SA2 +FNKL0_SI2152 +FNKL0_SX172 +FNKL0_SX196 +FNKL0_SX262 +FNKL0_SX442 +FNKL0_SX82 +FNTB0_SA1 +FNTB0_SA2 +FNTB0_SX123 +FNTB0_SX213 +FNTB0_SX33 +FNTB0_SX393 +FPAB1_SA2 +FPAB1_SX121 +FPAB1_SX301 +FPAB1_SX31 +FPAB1_SX391 +FPAC0_SA1 +FPAC0_SI2011 +FPAC0_SX121 +FPAC0_SX211 +FPAC0_SX301 +FPAC0_SX31 +FPAC0_SX391 +FPAD0_SA1 +FPAD0_SI1346 +FPAD0_SI1976 +FPAD0_SX266 +FPAD0_SX446 +FPAF0_SI1684 +FPAF0_SI2314 +FPAF0_SX244 +FPAF0_SX334 +FPAF0_SX424 +FPAF0_SX64 +FPAZ0_SI1593 +FPAZ0_SX153 +FPAZ0_SX27 +FPAZ0_SX423 +FPAZ0_SX63 +FPJF0_SA2 +FPJF0_SI1046 +FPJF0_SI1676 +FPJF0_SX236 +FPJF0_SX326 +FPLS0_SA1 +FPLS0_SA2 +FPLS0_SI2220 +FPLS0_SX150 +FPLS0_SX240 +FPLS0_SX3 +FPLS0_SX60 +FPMY0_SA2 +FPMY0_SI1783 +FPMY0_SX163 +FPMY0_SX196 +FPMY0_SX253 +FPMY0_SX73 +FREH0_SI1315 +FREH0_SI685 +FREH0_SX145 +FREH0_SX235 +FREH0_SX325 +FREH0_SX55 +FRJB0_SA1 +FRJB0_SA2 +FRJB0_SI1427 +FRJB0_SI1470 +FRJB0_SI1794 +FRJB0_SX167 +FRJB0_SX257 +FRJB0_SX437 +FRJB0_SX77 +FRLL0_SA1 +FRLL0_SA2 +FRLL0_SI1514 +FRLL0_SI884 +FRLL0_SX164 +FRLL0_SX254 +FRLL0_SX344 +FRLL0_SX74 +FSAG0_SA2 +FSAG0_SI1953 +FSAG0_SI693 +FSAG0_SX63 +FSAH0_SI1244 +FSAH0_SI1874 +FSAH0_SX344 +FSAH0_SX74 +FSAK0_SA1 +FSAK0_SA2 +FSAK0_SI1930 +FSAK0_SI670 +FSAK0_SX130 +FSAK0_SX220 +FSAK0_SX310 +FSAK0_SX40 +FSAK0_SX400 +FSBK0_SA1 +FSBK0_SI1699 +FSBK0_SI2329 +FSBK0_SX259 +FSBK0_SX439 +FSBK0_SX79 +FSCN0_SI1886 +FSCN0_SX356 +FSDC0_SA1 +FSDC0_SI1942 +FSDC0_SI2234 +FSDC0_SX232 +FSDC0_SX412 +FSDJ0_SA1 +FSDJ0_SA2 +FSDJ0_SI1745 +FSDJ0_SX125 +FSDJ0_SX35 +FSGF0_SA1 +FSGF0_SA2 +FSGF0_SI1557 +FSGF0_SX207 +FSGF0_SX27 +FSGF0_SX297 +FSGF0_SX387 +FSJG0_SI1570 +FSJG0_SI2200 +FSJG0_SX310 +FSJK1_SA1 +FSJK1_SI1025 +FSJK1_SI2285 +FSJK1_SI696 +FSJK1_SX215 +FSJK1_SX305 +FSJK1_SX395 +FSJS0_SA2 +FSJS0_SI1171 +FSJS0_SI1801 +FSJS0_SI541 +FSJS0_SX271 +FSJS0_SX361 +FSJS0_SX91 +FSJW0_SA1 +FSJW0_SA2 +FSJW0_SI703 +FSJW0_SX163 +FSJW0_SX253 +FSJW0_SX343 +FSJW0_SX73 +FSKC0_SA1 +FSKC0_SA2 +FSKC0_SI2046 +FSKC0_SX156 +FSKC0_SX336 +FSKC0_SX426 +FSKC0_SX66 +FSKL0_SA1 +FSKL0_SA2 +FSKL0_SI2159 +FSKL0_SI899 +FSKL0_SX179 +FSKL0_SX269 +FSKL0_SX359 +FSKL0_SX89 +FSKP0_SA1 +FSKP0_SI1728 +FSKP0_SI468 +FSKP0_SX108 +FSKP0_SX18 +FSKP0_SX198 +FSKP0_SX288 +FSKP0_SX378 +FSLS0_SA1 +FSLS0_SA2 +FSLS0_SI1056 +FSLS0_SI1686 +FSLS0_SI2316 +FSLS0_SX202 +FSLS0_SX246 +FSLS0_SX66 +FSMA0_SA1 +FSMA0_SI1621 +FSMA0_SI2251 +FSMA0_SX271 +FSMA0_SX361 +FSMA0_SX91 +FSMM0_SA1 +FSMM0_SA2 +FSMM0_SI1314 +FSMM0_SI1944 +FSMM0_SI684 +FSMM0_SX414 +FSMM0_SX54 +FSMS1_SA1 +FSMS1_SA2 +FSMS1_SI1504 +FSMS1_SI2134 +FSMS1_SI874 +FSMS1_SX154 +FSMS1_SX334 +FSMS1_SX64 +FSPM0_SA1 +FSPM0_SI1871 +FSPM0_SI611 +FSPM0_SX341 +FSPM0_SX431 +FSRH0_SA1 +FSRH0_SA2 +FSRH0_SI1719 +FSRH0_SX131 +FSRH0_SX41 +FSSB0_SA1 +FSSB0_SA2 +FSSB0_SI1082 +FSSB0_SI2342 +FSSB0_SX182 +FSSB0_SX272 +FSSB0_SX452 +FSSB0_SX92 +FTAJ0_SA1 +FTAJ0_SA2 +FTAJ0_SI1329 +FTAJ0_SI474 +FTAJ0_SX339 +FTAJ0_SX69 +FTBR0_SA1 +FTBR0_SA2 +FTBR0_SI2181 +FTBR0_SX111 +FTBR0_SX201 +FTBR0_SX291 +FTBR0_SX381 +FTBW0_SA2 +FTBW0_SI1345 +FTBW0_SI1975 +FTBW0_SX265 
+FTBW0_SX355 +FTBW0_SX445 +FTBW0_SX85 +FTLG0_SA1 +FTLG0_SA2 +FTLG0_SI840 +FTLG0_SX123 +FTLG0_SX213 +FTLG0_SX303 +FTLG0_SX33 +FTLG0_SX393 +FTMG0_SA1 +FTMG0_SA2 +FTMG0_SX182 +FTMG0_SX272 +FTMG0_SX362 +FTMG0_SX92 +FVFB0_SA1 +FVFB0_SI1032 +FVFB0_SI2292 +FVFB0_SX222 +FVFB0_SX312 +FVFB0_SX402 +FVKB0_SA2 +FVKB0_SI1159 +FVKB0_SI1789 +FVKB0_SI529 +FVKB0_SX169 +FVKB0_SX259 +FVKB0_SX439 +FVKB0_SX79 +FVMH0_SA1 +FVMH0_SI2096 +FVMH0_SX206 +FVMH0_SX296 +FVMH0_SX386 +MABC0_SA1 +MABC0_SA2 +MABC0_SX151 +MABC0_SX241 +MABC0_SX331 +MABC0_SX421 +MABC0_SX61 +MADC0_SA1 +MADC0_SA2 +MADC0_SI1997 +MADC0_SX17 +MADC0_SX197 +MADC0_SX287 +MADD0_SA1 +MADD0_SI1798 +MADD0_SI538 +MADD0_SX358 +MADD0_SX448 +MAEB0_SA1 +MAEB0_SA2 +MAEB0_SI2250 +MAEB0_SI990 +MAEB0_SX180 +MAEB0_SX270 +MAEB0_SX360 +MAEB0_SX90 +MAEO0_SA2 +MAEO0_SI1655 +MAEO0_SI1956 +MAEO0_SX156 +MAEO0_SX246 +MAEO0_SX336 +MAEO0_SX426 +MAEO0_SX66 +MAFM0_SA1 +MAFM0_SA2 +MAFM0_SI1569 +MAFM0_SI2199 +MAFM0_SX219 +MAFM0_SX39 +MAFM0_SX399 +MAJP0_SA1 +MAJP0_SI1074 +MAJP0_SI2334 +MAJP0_SX264 +MAJP0_SX354 +MAJP0_SX444 +MAJP0_SX84 +MAKB0_SA1 +MAKB0_SX206 +MAKB0_SX296 +MAKR0_SA1 +MAKR0_SA2 +MAKR0_SI1352 +MAKR0_SI1982 +MAKR0_SI722 +MAKR0_SX182 +MAKR0_SX272 +MAKR0_SX452 +MAPV0_SA1 +MAPV0_SA2 +MAPV0_SI1923 +MAPV0_SX123 +MAPV0_SX303 +MAPV0_SX33 +MAPV0_SX393 +MARC0_SA1 +MARC0_SI1188 +MARC0_SI1818 +MARC0_SI558 +MARC0_SX288 +MARC0_SX378 +MARW0_SA1 +MARW0_SA2 +MARW0_SI1276 +MARW0_SI646 +MARW0_SX106 +MARW0_SX16 +MARW0_SX376 +MBAR0_SA2 +MBAR0_SI1319 +MBAR0_SI1949 +MBAR0_SI689 +MBAR0_SX149 +MBAR0_SX239 +MBAR0_SX329 +MBBR0_SA1 +MBBR0_SA2 +MBBR0_SI1685 +MBBR0_SX155 +MBBR0_SX245 +MBBR0_SX425 +MBCG0_SA2 +MBCG0_SI2217 +MBCG0_SX147 +MBCG0_SX237 +MBCG0_SX417 +MBCG0_SX57 +MBEF0_SA1 +MBEF0_SA2 +MBEF0_SX111 +MBEF0_SX201 +MBEF0_SX291 +MBGT0_SA1 +MBGT0_SI1341 +MBGT0_SI711 +MBGT0_SX81 +MBJV0_SA2 +MBJV0_SI1247 +MBJV0_SI1877 +MBJV0_SX167 +MBJV0_SX257 +MBJV0_SX437 +MBJV0_SX77 +MBMA0_SA1 +MBMA0_SA2 +MBMA0_SI1852 +MBMA0_SX142 +MBMA0_SX322 +MBMA0_SX412 +MBMA1_SA1 +MBMA1_SA2 +MBMA1_SI2207 +MBMA1_SX144 +MBMA1_SX234 +MBMA1_SX414 +MBML0_SA1 +MBML0_SI1799 +MBML0_SI539 +MBML0_SX179 +MBML0_SX269 +MBML0_SX359 +MBML0_SX449 +MBOM0_SA1 +MBOM0_SI1014 +MBOM0_SI1644 +MBOM0_SX114 +MBOM0_SX204 +MBOM0_SX311 +MBOM0_SX384 +MBSB0_SA2 +MBSB0_SI1353 +MBSB0_SI1983 +MBSB0_SI723 +MBSB0_SX183 +MBSB0_SX273 +MBSB0_SX363 +MBSB0_SX93 +MBTH0_SA1 +MBTH0_SI505 +MBTH0_SI757 +MBTH0_SX212 +MBTH0_SX302 +MBTH0_SX392 +MBWP0_SA1 +MBWP0_SA2 +MBWP0_SI1531 +MBWP0_SI1969 +MBWP0_SI709 +MBWP0_SX169 +MBWP0_SX259 +MBWP0_SX439 +MBWP0_SX79 +MCAE0_SA1 +MCAE0_SA2 +MCAE0_SX187 +MCAE0_SX367 +MCAE0_SX7 +MCAE0_SX97 +MCAL0_SA1 +MCAL0_SI508 +MCAL0_SX148 +MCAL0_SX238 +MCAL0_SX328 +MCAL0_SX418 +MCAL0_SX58 +MCDC0_SA2 +MCDC0_SI1292 +MCDC0_SI1922 +MCDC0_SI662 +MCDC0_SX122 +MCDC0_SX302 +MCDC0_SX32 +MCDC0_SX392 +MCDD0_SA1 +MCDD0_SI1513 +MCDD0_SI2143 +MCDD0_SX163 +MCDD0_SX343 +MCDD0_SX73 +MCDR0_SA1 +MCDR0_SA2 +MCDR0_SX164 +MCDR0_SX254 +MCDR0_SX344 +MCDR0_SX434 +MCDR0_SX74 +MCEF0_SA1 +MCEF0_SA2 +MCEF0_SI1135 +MCEF0_SI1765 +MCEF0_SX145 +MCEF0_SX325 +MCEF0_SX55 +MCEW0_SI1442 +MCEW0_SX182 +MCEW0_SX272 +MCEW0_SX92 +MCHL0_SA1 +MCHL0_SA2 +MCHL0_SI1977 +MCHL0_SX177 +MCHL0_SX267 +MCHL0_SX357 +MCHL0_SX447 +MCLK0_SA1 +MCLK0_SA2 +MCLK0_SI1660 +MCLK0_SX130 +MCLK0_SX220 +MCLK0_SX40 +MCLK0_SX400 +MCLM0_SA2 +MCLM0_SI1456 +MCLM0_SX106 +MCLM0_SX16 +MCLM0_SX196 +MCLM0_SX286 +MCLM0_SX376 +MCPM0_SA2 +MCPM0_SI1194 +MCPM0_SI564 +MCPM0_SX204 +MCPM0_SX24 +MCRE0_SA1 +MCRE0_SA2 +MCRE0_SI1121 +MCRE0_SI1725 +MCRE0_SI1751 +MCRE0_SX131 +MCRE0_SX221 +MCRE0_SX24 +MCRE0_SX401 +MCRE0_SX41 +MCSS0_SA1 
+MCSS0_SA2 +MCSS0_SX120 +MCSS0_SX210 +MCSS0_SX30 +MCSS0_SX300 +MCSS0_SX390 +MCTH0_SA2 +MCTH0_SI1209 +MCTH0_SI1839 +MCTH0_SI579 +MCTH0_SX129 +MCTH0_SX219 +MCTH0_SX309 +MCTH0_SX399 +MCTM0_SA1 +MCTM0_SA2 +MCTM0_SI720 +MCTM0_SX180 +MCTM0_SX270 +MCTM0_SX360 +MCTM0_SX450 +MCTM0_SX90 +MCXM0_SA1 +MCXM0_SA2 +MCXM0_SI1351 +MCXM0_SI1981 +MCXM0_SI721 +MCXM0_SX181 +MCXM0_SX271 +MCXM0_SX361 +MCXM0_SX451 +MDAC0_SA2 +MDAC0_SI1261 +MDAC0_SI1837 +MDAC0_SX271 +MDAC0_SX451 +MDAC0_SX91 +MDAS0_SA1 +MDAS0_SA2 +MDAS0_SI1266 +MDAS0_SX186 +MDAS0_SX21 +MDAS0_SX276 +MDAS0_SX96 +MDBB1_SA1 +MDBB1_SA2 +MDBB1_SI1006 +MDBB1_SI1636 +MDBB1_SI2056 +MDBB1_SX196 +MDBB1_SX286 +MDBP0_SA1 +MDBP0_SA2 +MDBP0_SI1158 +MDBP0_SI1788 +MDBP0_SX258 +MDBP0_SX348 +MDBP0_SX78 +MDCD0_SA1 +MDCD0_SA2 +MDCD0_SI2045 +MDCD0_SX155 +MDCD0_SX65 +MDCM0_SA1 +MDCM0_SA2 +MDCM0_SI2110 +MDCM0_SI850 +MDCM0_SX130 +MDCM0_SX220 +MDCM0_SX310 +MDDC0_SA1 +MDDC0_SA2 +MDDC0_SX249 +MDDC0_SX339 +MDDC0_SX429 +MDED0_SI1170 +MDED0_SI1800 +MDED0_SX180 +MDED0_SX270 +MDED0_SX360 +MDED0_SX450 +MDED0_SX90 +MDEF0_SA1 +MDEF0_SA2 +MDEF0_SI1563 +MDEF0_SI2193 +MDEF0_SX213 +MDEF0_SX33 +MDEF0_SX393 +MDEM0_SA2 +MDEM0_SI1868 +MDEM0_SX158 +MDEM0_SX248 +MDEM0_SX338 +MDEM0_SX68 +MDHL0_SA1 +MDHL0_SA2 +MDHL0_SI2069 +MDHL0_SI809 +MDHL0_SX179 +MDHL0_SX359 +MDHL0_SX89 +MDHS0_SX180 +MDHS0_SX270 +MDHS0_SX360 +MDHS0_SX450 +MDHS0_SX90 +MDJM0_SA1 +MDJM0_SA2 +MDJM0_SI2085 +MDJM0_SI825 +MDJM0_SX195 +MDJM0_SX285 +MDJM0_SX375 +MDKS0_SA1 +MDKS0_SA2 +MDKS0_SI1066 +MDKS0_SI1696 +MDKS0_SI2326 +MDKS0_SX256 +MDKS0_SX76 +MDLB0_SA1 +MDLB0_SI1936 +MDLB0_SI676 +MDLB0_SX226 +MDLB0_SX316 +MDLB0_SX46 +MDLC0_SA1 +MDLC0_SA2 +MDLC0_SI765 +MDLC0_SX135 +MDLC0_SX225 +MDLC0_SX315 +MDLC0_SX45 +MDLC1_SA1 +MDLC1_SX175 +MDLC1_SX265 +MDLC1_SX355 +MDLC1_SX85 +MDLC2_SA1 +MDLC2_SA2 +MDLC2_SI1614 +MDLC2_SI984 +MDLC2_SX174 +MDLC2_SX264 +MDLC2_SX444 +MDLC2_SX84 +MDLH0_SA1 +MDLH0_SI1960 +MDLH0_SI574 +MDLH0_SI700 +MDLH0_SX250 +MDLH0_SX340 +MDLH0_SX70 +MDLM0_SA1 +MDLM0_SA2 +MDLM0_SX244 +MDLM0_SX334 +MDLM0_SX64 +MDLR0_SI1233 +MDLR0_SX243 +MDLR0_SX423 +MDLR0_SX63 +MDLR1_SI1299 +MDLR1_SI1929 +MDLR1_SX129 +MDLR1_SX219 +MDLR1_SX309 +MDLR1_SX39 +MDLR1_SX399 +MDMA0_SA1 +MDMA0_SA2 +MDMA0_SI1238 +MDMA0_SI2060 +MDMT0_SI2341 +MDMT0_SI572 +MDMT0_SX212 +MDMT0_SX302 +MDMT0_SX392 +MDNS0_SA1 +MDNS0_SX111 +MDNS0_SX291 +MDNS0_SX381 +MDPB0_SA1 +MDPB0_SA2 +MDPB0_SI2126 +MDPB0_SX146 +MDPB0_SX236 +MDPB0_SX326 +MDPB0_SX56 +MDPK0_SA1 +MDPK0_SA2 +MDPK0_SI1683 +MDPK0_SI552 +MDPK0_SX153 +MDPK0_SX243 +MDPK0_SX63 +MDPS0_SA1 +MDPS0_SA2 +MDPS0_SI1651 +MDPS0_SI1979 +MDPS0_SX179 +MDPS0_SX269 +MDPS0_SX449 +MDPS0_SX89 +MDRD0_SA2 +MDRD0_SI1382 +MDRD0_SI2012 +MDRD0_SX122 +MDRD0_SX212 +MDRD0_SX302 +MDRD0_SX392 +MDSJ0_SA1 +MDSJ0_SA2 +MDSJ0_SI832 +MDSJ0_SX112 +MDSJ0_SX22 +MDSJ0_SX292 +MDSJ0_SX382 +MDSS0_SA1 +MDSS0_SI1881 +MDSS0_SI2087 +MDSS0_SI621 +MDSS0_SX171 +MDSS0_SX261 +MDSS0_SX351 +MDSS0_SX81 +MDSS1_SA2 +MDSS1_SI1713 +MDSS1_SX247 +MDSS1_SX337 +MDSS1_SX427 +MDTB0_SA1 +MDTB0_SA2 +MDTB0_SI570 +MDTB0_SX210 +MDTB0_SX300 +MDTB0_SX321 +MDTB0_SX390 +MDWD0_SA1 +MDWD0_SI1890 +MDWD0_SI557 +MDWD0_SX180 +MDWD0_SX360 +MDWD0_SX450 +MDWH0_SA2 +MDWH0_SI1925 +MDWH0_SX125 +MDWH0_SX35 +MDWH0_SX395 +MDWM0_SI1546 +MDWM0_SI2176 +MDWM0_SX106 +MDWM0_SX376 +MDWM0_SX433 +MEAL0_SA1 +MEAL0_SI1547 +MEAL0_SI917 +MEAL0_SX197 +MEAL0_SX287 +MEAL0_SX377 +MEDR0_SI744 +MEDR0_SX114 +MEDR0_SX204 +MEDR0_SX24 +MEDR0_SX294 +MEDR0_SX384 +MEFG0_SA2 +MEFG0_SI465 +MEFG0_SX105 +MEFG0_SX15 +MEFG0_SX195 +MEFG0_SX285 +MEFG0_SX375 +MEGJ0_SI1967 +MEGJ0_SX437 +MEGJ0_SX77 +MEJL0_SA2 +MEJL0_SI1592 +MEJL0_SI1654 
+MEJL0_SI962 +MEJL0_SX332 +MEJL0_SX422 +MEJL0_SX62 +MEJS0_SA1 +MEJS0_SA2 +MEJS0_SI1870 +MEJS0_SX250 +MEJS0_SX430 +MEJS0_SX70 +MESG0_SA1 +MESG0_SA2 +MESG0_SI1332 +MESG0_SI1962 +MESG0_SX162 +MESG0_SX252 +MESG0_SX342 +MESG0_SX72 +MESJ0_SA1 +MESJ0_SA2 +MESJ0_SI2257 +MESJ0_SI997 +MESJ0_SX277 +MESJ0_SX367 +MESJ0_SX7 +MEWM0_SA1 +MEWM0_SA2 +MEWM0_SI1348 +MEWM0_SI1978 +MEWM0_SX268 +MEWM0_SX358 +MEWM0_SX448 +MFER0_SA1 +MFER0_SA2 +MFER0_SI1492 +MFER0_SI2122 +MFER0_SX232 +MFER0_SX322 +MFER0_SX412 +MFER0_SX52 +MFMC0_SA1 +MFMC0_SA2 +MFMC0_SI1132 +MFMC0_SI1762 +MFMC0_SI502 +MFMC0_SX142 +MFMC0_SX232 +MFMC0_SX322 +MFMC0_SX412 +MFMC0_SX52 +MFRM0_SA1 +MFRM0_SA2 +MFRM0_SI1155 +MFRM0_SI1717 +MFRM0_SI1785 +MFRM0_SX165 +MFRM0_SX255 +MFRM0_SX75 +MFWK0_SA1 +MFWK0_SA2 +MFWK0_SI1249 +MFWK0_SI619 +MFWK0_SX259 +MFWK0_SX439 +MFWK0_SX79 +MFXS0_SA1 +MFXS0_SA2 +MFXS0_SI1674 +MFXS0_SI2225 +MFXS0_SI2304 +MFXS0_SX144 +MFXS0_SX234 +MFXS0_SX414 +MFXV0_SA1 +MFXV0_SI1635 +MFXV0_SX15 +MFXV0_SX195 +MFXV0_SX285 +MFXV0_SX375 +MGAF0_SA2 +MGAF0_SI1912 +MGAF0_SI652 +MGAF0_SX112 +MGAF0_SX202 +MGAF0_SX292 +MGAG0_SA1 +MGAG0_SI1321 +MGAG0_SI645 +MGAG0_SX151 +MGAG0_SX241 +MGAG0_SX331 +MGAG0_SX421 +MGAG0_SX61 +MGAK0_SA1 +MGAK0_SA2 +MGAK0_SI1666 +MGAK0_SI2296 +MGAK0_SX316 +MGAK0_SX406 +MGAR0_SA1 +MGAR0_SA2 +MGAR0_SI1212 +MGAR0_SI1694 +MGAR0_SI1842 +MGAR0_SX222 +MGAR0_SX402 +MGAR0_SX42 +MGAW0_SA1 +MGAW0_SA2 +MGAW0_SI1802 +MGAW0_SX265 +MGAW0_SX355 +MGAW0_SX445 +MGAW0_SX85 +MGES0_SA2 +MGES0_SI1481 +MGES0_SX131 +MGES0_SX221 +MGES0_SX401 +MGES0_SX41 +MGJC0_SA1 +MGJC0_SI1256 +MGJC0_SI1335 +MGJC0_SI1965 +MGJC0_SX165 +MGJC0_SX255 +MGJC0_SX345 +MGRL0_SA1 +MGRL0_SA2 +MGRL0_SI1497 +MGRL0_SX237 +MGRL0_SX417 +MGRL0_SX57 +MGRP0_SA1 +MGRP0_SI1947 +MGRP0_SI687 +MGRP0_SX147 +MGRP0_SX237 +MGRP0_SX417 +MGRP0_SX57 +MGSH0_SA1 +MGSH0_SX186 +MGSH0_SX96 +MGSL0_SA2 +MGSL0_SI1164 +MGSL0_SX174 +MGSL0_SX354 +MGSL0_SX444 +MGSL0_SX84 +MGXP0_SA1 +MGXP0_SA2 +MGXP0_SI457 +MGXP0_SX277 +MGXP0_SX367 +MGXP0_SX97 +MHBS0_SA1 +MHBS0_SA2 +MHBS0_SI1575 +MHBS0_SI2205 +MHBS0_SX135 +MHBS0_SX225 +MHBS0_SX405 +MHIT0_SA2 +MHIT0_SI1613 +MHIT0_SI2243 +MHIT0_SX173 +MHIT0_SX263 +MHIT0_SX353 +MHIT0_SX443 +MHIT0_SX83 +MHJB0_SA2 +MHJB0_SI1647 +MHJB0_SI2277 +MHJB0_SX117 +MHJB0_SX207 +MHJB0_SX27 +MHJB0_SX297 +MHJB0_SX387 +MHMG0_SA1 +MHMG0_SA2 +MHMG0_SI1365 +MHMG0_SI1995 +MHMG0_SX105 +MHMG0_SX15 +MHMG0_SX285 +MHMG0_SX375 +MHMR0_SA2 +MHMR0_SI1119 +MHMR0_SX129 +MHMR0_SX219 +MHMR0_SX309 +MHMR0_SX39 +MHMR0_SX399 +MHRM0_SA2 +MHRM0_SI1475 +MHRM0_SI2218 +MHRM0_SX238 +MHRM0_SX328 +MHRM0_SX418 +MHXL0_SA1 +MHXL0_SA2 +MHXL0_SI512 +MHXL0_SI612 +MHXL0_SX152 +MHXL0_SX332 +MHXL0_SX422 +MHXL0_SX62 +MILB0_SA1 +MILB0_SI2163 +MILB0_SI807 +MILB0_SX183 +MILB0_SX273 +MILB0_SX3 +MILB0_SX363 +MILB0_SX93 +MJAC0_SA1 +MJAC0_SA2 +MJAC0_SI1331 +MJAC0_SI2148 +MJAC0_SX341 +MJAC0_SX431 +MJAE0_SA1 +MJAE0_SA2 +MJAE0_SI1524 +MJAE0_SI1999 +MJAE0_SI2154 +MJAE0_SX264 +MJAE0_SX354 +MJAE0_SX444 +MJAI0_SI1604 +MJAI0_SX164 +MJAI0_SX254 +MJAI0_SX344 +MJAI0_SX434 +MJAI0_SX74 +MJBG0_SA1 +MJBG0_SA2 +MJBG0_SI1232 +MJBG0_SI1724 +MJBG0_SI1862 +MJBG0_SX152 +MJBG0_SX242 +MJBG0_SX332 +MJBG0_SX422 +MJDA0_SA1 +MJDA0_SA2 +MJDA0_SI1661 +MJDA0_SI2291 +MJDA0_SX131 +MJDA0_SX221 +MJDA0_SX401 +MJDA0_SX41 +MJDC0_SA1 +MJDC0_SA2 +MJDC0_SI1161 +MJDC0_SI2165 +MJDC0_SX171 +MJDC0_SX261 +MJDC0_SX351 +MJDC0_SX441 +MJDC0_SX81 +MJDE0_SA2 +MJDE0_SX130 +MJDE0_SX310 +MJDE0_SX40 +MJDE0_SX400 +MJDG0_SA1 +MJDG0_SI1672 +MJDG0_SX142 +MJDG0_SX232 +MJDG0_SX322 +MJDG0_SX412 +MJDG0_SX52 +MJDM0_SA2 +MJDM0_SI1937 +MJDM0_SX260 +MJDM0_SX440 +MJDM0_SX80 +MJEB0_SA1 +MJEB0_SA2 
+MJEB0_SI1286 +MJEB0_SI1916 +MJEB0_SX206 +MJEB0_SX26 +MJEB0_SX386 +MJEB1_SA1 +MJEB1_SI2097 +MJEB1_SX117 +MJEB1_SX27 +MJEB1_SX297 +MJEE0_SA2 +MJEE0_SI1237 +MJEE0_SI1867 +MJEE0_SI607 +MJEE0_SX157 +MJEE0_SX427 +MJEE0_SX67 +MJFH0_SA1 +MJFH0_SI1737 +MJFH0_SI477 +MJFH0_SX117 +MJFH0_SX207 +MJFH0_SX27 +MJFH0_SX297 +MJFH0_SX387 +MJFR0_SA2 +MJFR0_SI1605 +MJFR0_SI2235 +MJFR0_SI975 +MJFR0_SX165 +MJFR0_SX255 +MJFR0_SX345 +MJHI0_SA2 +MJHI0_SI555 +MJHI0_SI698 +MJHI0_SX248 +MJHI0_SX338 +MJHI0_SX428 +MJHI0_SX68 +MJJB0_SA2 +MJJB0_SI1139 +MJJB0_SI1277 +MJJB0_SI1769 +MJJB0_SX149 +MJJB0_SX329 +MJJB0_SX419 +MJJB0_SX59 +MJJJ0_SA1 +MJJJ0_SA2 +MJJJ0_SI1793 +MJJJ0_SI533 +MJJJ0_SX173 +MJJJ0_SX263 +MJJJ0_SX353 +MJJJ0_SX83 +MJJM0_SA1 +MJJM0_SI1457 +MJJM0_SX17 +MJJM0_SX197 +MJJM0_SX287 +MJJM0_SX377 +MJKR0_SA2 +MJKR0_SI1201 +MJKR0_SI1831 +MJKR0_SX121 +MJKR0_SX211 +MJKR0_SX301 +MJKR0_SX31 +MJKR0_SX391 +MJLB0_SA1 +MJLB0_SA2 +MJLB0_SI2246 +MJLB0_SI986 +MJLB0_SX266 +MJLB0_SX356 +MJLB0_SX446 +MJLB0_SX86 +MJLG1_SA1 +MJLG1_SA2 +MJLG1_SI1012 +MJLG1_SI1642 +MJLG1_SI2272 +MJLG1_SX112 +MJLG1_SX202 +MJLG1_SX22 +MJLG1_SX382 +MJLS0_SA1 +MJLS0_SA2 +MJLS0_SI1096 +MJLS0_SI466 +MJLS0_SX16 +MJLS0_SX196 +MJLS0_SX286 +MJLS0_SX376 +MJMA0_SI1495 +MJMA0_SI865 +MJMA0_SX145 +MJMA0_SX235 +MJMA0_SX325 +MJMA0_SX415 +MJMA0_SX55 +MJMD0_SA1 +MJMD0_SI1028 +MJMD0_SI1658 +MJMD0_SX128 +MJMD0_SX218 +MJMD0_SX398 +MJMM0_SA1 +MJMM0_SA2 +MJMM0_SI1885 +MJMM0_SI625 +MJMM0_SX265 +MJMM0_SX355 +MJMM0_SX445 +MJPG0_SA1 +MJPG0_SA2 +MJPG0_SI561 +MJPG0_SX291 +MJPG0_SX381 +MJPM0_SA1 +MJPM0_SI1998 +MJPM0_SI738 +MJPM0_SX108 +MJPM0_SX18 +MJPM0_SX198 +MJPM0_SX288 +MJPM1_SA1 +MJPM1_SA2 +MJPM1_SI1897 +MJPM1_SI761 +MJPM1_SX131 +MJPM1_SX221 +MJPM1_SX41 +MJRA0_SI606 +MJRA0_SX156 +MJRA0_SX246 +MJRA0_SX66 +MJRG0_SA1 +MJRG0_SA2 +MJRG0_SX106 +MJRG0_SX16 +MJRG0_SX286 +MJRH0_SA1 +MJRH0_SA2 +MJRH0_SI1125 +MJRH0_SI1755 +MJRH0_SX135 +MJRH0_SX315 +MJRH0_SX405 +MJRH0_SX45 +MJRH1_SA2 +MJRH1_SI1774 +MJRH1_SX334 +MJRH1_SX64 +MJRK0_SI2103 +MJRK0_SX340 +MJRK0_SX70 +MJRP0_SI1835 +MJRP0_SI585 +MJRP0_SX135 +MJRP0_SX315 +MJRP0_SX405 +MJRP0_SX45 +MJSR0_SA2 +MJSR0_SX164 +MJSR0_SX254 +MJSR0_SX434 +MJSR0_SX74 +MJWG0_SA2 +MJWG0_SI2155 +MJWG0_SX355 +MJWG0_SX445 +MJWG0_SX85 +MJWS0_SA1 +MJWS0_SA2 +MJWS0_SI1143 +MJWS0_SI1773 +MJWS0_SX243 +MJWS0_SX423 +MJWT0_SA2 +MJWT0_SI751 +MJXA0_SA1 +MJXA0_SA2 +MJXA0_SI1507 +MJXA0_SI2137 +MJXA0_SI877 +MJXA0_SX157 +MJXA0_SX247 +MJXA0_SX337 +MJXA0_SX67 +MJXL0_SA1 +MJXL0_SA2 +MJXL0_SI1795 +MJXL0_SX182 +MJXL0_SX272 +MJXL0_SX362 +MJXL0_SX452 +MJXL0_SX92 +MKAG0_SA2 +MKAG0_SI1609 +MKAG0_SI2239 +MKAG0_SX169 +MKAG0_SX30 +MKAG0_SX439 +MKAG0_SX79 +MKAH0_SA1 +MKAH0_SA2 +MKAH0_SI1528 +MKAH0_SI2158 +MKAH0_SI898 +MKAH0_SX268 +MKAH0_SX358 +MKAH0_SX448 +MKAH0_SX88 +MKAJ0_SA1 +MKAJ0_SI1414 +MKAJ0_SI2044 +MKAJ0_SI784 +MKAJ0_SX244 +MKAJ0_SX334 +MKAJ0_SX424 +MKAJ0_SX64 +MKAM0_SA2 +MKAM0_SI1316 +MKAM0_SX236 +MKAM0_SX416 +MKDB0_SI2132 +MKDB0_SI588 +MKDB0_SI872 +MKDB0_SX242 +MKDB0_SX332 +MKDB0_SX422 +MKDB0_SX62 +MKDD0_SA1 +MKDD0_SX127 +MKDD0_SX217 +MKDD0_SX307 +MKDD0_SX37 +MKDD0_SX397 +MKDT0_SA1 +MKDT0_SA2 +MKDT0_SI2153 +MKDT0_SI893 +MKDT0_SX173 +MKDT0_SX263 +MKDT0_SX353 +MKDT0_SX443 +MKDT0_SX83 +MKES0_SA2 +MKES0_SX263 +MKES0_SX353 +MKES0_SX443 +MKES0_SX83 +MKJO0_SA1 +MKJO0_SA2 +MKJO0_SI2147 +MKJO0_SX167 +MKJO0_SX257 +MKJO0_SX424 +MKJO0_SX77 +MKLN0_SA1 +MKLN0_SA2 +MKLN0_SI1598 +MKLN0_SI2228 +MKLN0_SX158 +MKLN0_SX338 +MKLN0_SX428 +MKLN0_SX68 +MKLR0_SA1 +MKLR0_SI1059 +MKLR0_SI2319 +MKLR0_SX159 +MKLR0_SX249 +MKLR0_SX339 +MKLR0_SX429 +MKLR0_SX69 +MKLS0_SA2 +MKLS0_SI1533 +MKLS0_SX177 +MKLS0_SX267 
+MKLS0_SX447 +MKLS1_SI1545 +MKLS1_SI2175 +MKLS1_SX105 +MKLS1_SX15 +MKLS1_SX195 +MKLS1_SX285 +MKLW0_SA2 +MKLW0_SI1844 +MKLW0_SI2201 +MKLW0_SX131 +MKLW0_SX221 +MKLW0_SX401 +MKLW0_SX41 +MKRG0_SA1 +MKRG0_SA2 +MKRG0_SI1491 +MKRG0_SI2121 +MKRG0_SX141 +MKRG0_SX231 +MKRG0_SX31 +MKRG0_SX51 +MKXL0_SA1 +MKXL0_SI1185 +MKXL0_SX105 +MKXL0_SX195 +MKXL0_SX285 +MLBC0_SA2 +MLBC0_SI609 +MLBC0_SX159 +MLBC0_SX339 +MLBC0_SX429 +MLBC0_SX69 +MLEL0_SI1876 +MLEL0_SX346 +MLEL0_SX76 +MLJC0_SA1 +MLJC0_SA2 +MLJC0_SI1855 +MLJC0_SI595 +MLJC0_SX235 +MLJC0_SX325 +MLJC0_SX55 +MLJH0_SI1324 +MLJH0_SX154 +MLJH0_SX334 +MLJH0_SX424 +MLNS0_SA1 +MLNS0_SA2 +MLNS0_SI1407 +MLNS0_SI777 +MLNS0_SX147 +MLNS0_SX237 +MLNS0_SX327 +MLNS0_SX417 +MLNS0_SX57 +MLSH0_SA1 +MLSH0_SA2 +MLSH0_SI2047 +MLSH0_SI787 +MLSH0_SX157 +MLSH0_SX337 +MLSH0_SX427 +MLSH0_SX67 +MMAA0_SI2105 +MMAA0_SX125 +MMAA0_SX215 +MMAA0_SX305 +MMAA0_SX395 +MMAB1_SA1 +MMAB1_SA2 +MMAB1_SI2124 +MMAB1_SX144 +MMAB1_SX414 +MMAB1_SX54 +MMAG0_SI496 +MMAG0_SX226 +MMAG0_SX406 +MMAG0_SX46 +MMAM0_SA1 +MMAM0_SA2 +MMAM0_SI1597 +MMAM0_SI1668 +MMAM0_SX247 +MMAM0_SX337 +MMAM0_SX67 +MMAR0_SA1 +MMAR0_SA2 +MMAR0_SI1336 +MMAR0_SI706 +MMAR0_SX436 +MMAR0_SX76 +MMBS0_SA1 +MMBS0_SA2 +MMBS0_SI1151 +MMBS0_SX251 +MMBS0_SX341 +MMBS0_SX431 +MMBS0_SX71 +MMCC0_SA1 +MMCC0_SI1968 +MMCC0_SI708 +MMCC0_SX168 +MMCC0_SX258 +MMCC0_SX348 +MMCC0_SX438 +MMCC0_SX78 +MMDB0_SA1 +MMDB0_SA2 +MMDB0_SI1358 +MMDB0_SI1617 +MMDB0_SX267 +MMDB0_SX357 +MMDB0_SX447 +MMDB0_SX87 +MMDG0_SI2035 +MMDG0_SX340 +MMDG0_SX430 +MMDG0_SX70 +MMDM0_SA1 +MMDM0_SA2 +MMDM0_SX231 +MMDM0_SX321 +MMDM0_SX411 +MMDM0_SX51 +MMDM1_SA1 +MMDM1_SI1650 +MMDM1_SI783 +MMDM1_SX243 +MMDS0_SA2 +MMDS0_SI1343 +MMDS0_SI1973 +MMDS0_SI713 +MMDS0_SX173 +MMDS0_SX263 +MMDS0_SX353 +MMDS0_SX443 +MMDS0_SX83 +MMEA0_SA2 +MMEA0_SI1388 +MMEA0_SI2018 +MMEA0_SI758 +MMEA0_SX218 +MMEA0_SX308 +MMEA0_SX38 +MMEB0_SA1 +MMEB0_SI1357 +MMEB0_SI1987 +MMEB0_SI727 +MMEB0_SX7 +MMEB0_SX97 +MMGC0_SA1 +MMGC0_SI1935 +MMGC0_SI2184 +MMGC0_SX315 +MMGC0_SX405 +MMGC0_SX45 +MMGG0_SA1 +MMGG0_SA2 +MMGG0_SI1709 +MMGG0_SI2339 +MMGG0_SX179 +MMGG0_SX359 +MMGG0_SX89 +MMGK0_SA1 +MMGK0_SA2 +MMGK0_SI1322 +MMGK0_SI1952 +MMGK0_SI692 +MMGK0_SX152 +MMGK0_SX242 +MMGK0_SX422 +MMJB1_SA1 +MMJB1_SI1408 +MMJB1_SI2038 +MMJB1_SI778 +MMJB1_SX148 +MMJB1_SX238 +MMJB1_SX328 +MMJB1_SX418 +MMJB1_SX58 +MMLM0_SA1 +MMLM0_SA2 +MMLM0_SI1527 +MMLM0_SI897 +MMLM0_SX177 +MMLM0_SX267 +MMLM0_SX357 +MMLM0_SX447 +MMLM0_SX87 +MMPM0_SA1 +MMPM0_SA2 +MMPM0_SI1061 +MMPM0_SI1691 +MMPM0_SI2321 +MMPM0_SX251 +MMPM0_SX341 +MMPM0_SX431 +MMPM0_SX71 +MMRP0_SA1 +MMRP0_SI2034 +MMRP0_SI717 +MMRP0_SI774 +MMRP0_SX234 +MMRP0_SX414 +MMRP0_SX54 +MMSM0_SA1 +MMSM0_SA2 +MMSM0_SI1736 +MMSM0_SX26 +MMSM0_SX296 +MMSM0_SX386 +MMVP0_SI1284 +MMVP0_SI1914 +MMVP0_SX114 +MMVP0_SX204 +MMVP0_SX294 +MMVP0_SX384 +MMWB0_SA2 +MMWB0_SI1619 +MMWB0_SX179 +MMWB0_SX269 +MMWS0_SA1 +MMWS0_SI1518 +MMWS0_SI559 +MMWS0_SI888 +MMWS0_SX258 +MMWS0_SX78 +MMWS1_SA1 +MMWS1_SA2 +MMWS1_SI1071 +MMWS1_SI2331 +MMWS1_SX261 +MMWS1_SX27 +MMWS1_SX351 +MMWS1_SX441 +MMWS1_SX81 +MMXS0_SA1 +MMXS0_SA2 +MMXS0_SI629 +MMXS0_SI876 +MMXS0_SX156 +MMXS0_SX336 +MMXS0_SX66 +MNET0_SA1 +MNET0_SA2 +MNET0_SI1446 +MNET0_SI2076 +MNET0_SX186 +MNET0_SX276 +MNET0_SX366 +MNET0_SX96 +MNTW0_SA1 +MNTW0_SI2328 +MNTW0_SX202 +MNTW0_SX258 +MNTW0_SX348 +MPAR0_SA1 +MPAR0_SA2 +MPAR0_SI1576 +MPAR0_SX226 +MPAR0_SX406 +MPAR0_SX46 +MPEB0_SA1 +MPEB0_SA2 +MPEB0_SX150 +MPEB0_SX420 +MPEB0_SX60 +MPFU0_SA1 +MPFU0_SA2 +MPFU0_SI1888 +MPFU0_SX178 +MPFU0_SX268 +MPFU0_SX358 +MPFU0_SX88 +MPGH0_SA1 +MPGH0_SA2 +MPGH0_SI1554 +MPGH0_SI924 +MPGH0_SX204 +MPGH0_SX294 
+MPGH0_SX384 +MPGR0_SA1 +MPGR0_SA2 +MPGR0_SI2040 +MPGR0_SI780 +MPGR0_SX150 +MPGR0_SX420 +MPGR0_SX60 +MPGR1_SA1 +MPGR1_SA2 +MPGR1_SI1269 +MPGR1_SI2129 +MPGR1_SX239 +MPGR1_SX329 +MPGR1_SX419 +MPGR1_SX59 +MPMB0_SX241 +MPPC0_SA2 +MPPC0_SI2042 +MPPC0_SI782 +MPPC0_SX152 +MPPC0_SX242 +MPPC0_SX332 +MPPC0_SX422 +MPPC0_SX62 +MPRB0_SA1 +MPRB0_SA2 +MPRB0_SI1205 +MPRB0_SX125 +MPRB0_SX215 +MPRB0_SX305 +MPRB0_SX35 +MPRB0_SX395 +MPRD0_SA2 +MPRD0_SI1431 +MPRD0_SI2061 +MPRK0_SA2 +MPRK0_SX17 +MPRK0_SX197 +MPRT0_SA2 +MPRT0_SI1210 +MPRT0_SI495 +MPRT0_SI580 +MPRT0_SX130 +MPRT0_SX220 +MPRT0_SX40 +MPRT0_SX400 +MPSW0_SA1 +MPSW0_SA2 +MPSW0_SI1697 +MPSW0_SI2327 +MPSW0_SX24 +MPSW0_SX257 +MPSW0_SX77 +MRAB0_SA1 +MRAB0_SA2 +MRAB0_SI1224 +MRAB0_SI594 +MRAB0_SX144 +MRAB0_SX234 +MRAB0_SX324 +MRAB0_SX414 +MRAB0_SX54 +MRAB1_SA1 +MRAB1_SA2 +MRAB1_SI1478 +MRAB1_SI2108 +MRAB1_SX218 +MRAB1_SX38 +MRAB1_SX398 +MRAI0_SI1954 +MRAI0_SX162 +MRAI0_SX252 +MRAI0_SX342 +MRAM0_SI1275 +MRAM0_SI1905 +MRAM0_SX105 +MRAM0_SX195 +MRAM0_SX285 +MRAM0_SX375 +MRAV0_SA1 +MRAV0_SA2 +MRAV0_SI1008 +MRAV0_SI1638 +MRAV0_SI2268 +MRAV0_SX108 +MRAV0_SX18 +MRAV0_SX198 +MRAV0_SX288 +MRAV0_SX378 +MRBC0_SA1 +MRBC0_SA2 +MRBC0_SI1665 +MRBC0_SI599 +MRBC0_SX149 +MRBC0_SX239 +MRBC0_SX59 +MRCG0_SA1 +MRCG0_SI2058 +MRCG0_SX258 +MRCG0_SX78 +MRCW0_SA2 +MRCW0_SI1371 +MRCW0_SI2001 +MRCW0_SX111 +MRCW0_SX201 +MRCW0_SX21 +MRCW0_SX381 +MRDD0_SA1 +MRDD0_SA2 +MRDD0_SI1050 +MRDD0_SI2310 +MRDD0_SX240 +MRDD0_SX330 +MRDM0_SA1 +MRDM0_SA2 +MRDM0_SI965 +MRDM0_SX155 +MRDM0_SX245 +MRDM0_SX425 +MRDS0_SA2 +MRDS0_SI1167 +MRDS0_SI1797 +MRDS0_SI537 +MRDS0_SX177 +MRDS0_SX267 +MRDS0_SX357 +MRDS0_SX447 +MRDS0_SX87 +MREE0_SA1 +MREE0_SA2 +MREE0_SI1734 +MREE0_SX114 +MREE0_SX204 +MREE0_SX294 +MREE0_SX384 +MREH1_SA2 +MREH1_SI2229 +MREH1_SX159 +MREH1_SX339 +MREH1_SX429 +MREM0_SA1 +MREM0_SI1591 +MREM0_SI961 +MREM0_SX151 +MREM0_SX241 +MREM0_SX331 +MREM0_SX421 +MREM0_SX61 +MREW1_SA1 +MREW1_SA2 +MREW1_SI1500 +MREW1_SI2130 +MREW1_SX150 +MREW1_SX240 +MREW1_SX330 +MREW1_SX420 +MREW1_SX60 +MRFK0_SA1 +MRFK0_SA2 +MRFK0_SI1706 +MRFK0_SI2336 +MRFK0_SX176 +MRFK0_SX266 +MRFK0_SX356 +MRFK0_SX86 +MRFL0_SA2 +MRFL0_SI1786 +MRFL0_SX346 +MRGM0_SA1 +MRGM0_SI1162 +MRGM0_SI1792 +MRGM0_SX416 +MRGM0_SX82 +MRGS0_SA1 +MRGS0_SI1986 +MRGS0_SX276 +MRGS0_SX366 +MRGS0_SX96 +MRHL0_SA1 +MRHL0_SA2 +MRHL0_SI1515 +MRHL0_SI2145 +MRHL0_SX165 +MRHL0_SX255 +MRHL0_SX75 +MRJB1_SI1020 +MRJB1_SX300 +MRJH0_SA1 +MRJH0_SI914 +MRJH0_SX259 +MRJH0_SX439 +MRJM0_SA1 +MRJM0_SA2 +MRJM0_SI1095 +MRJM0_SI1228 +MRJM0_SI1858 +MRJM0_SX238 +MRJM0_SX328 +MRJM0_SX418 +MRJM0_SX58 +MRJM1_SA1 +MRJM1_SI668 +MRJM1_SX218 +MRJM1_SX308 +MRJM1_SX38 +MRJM1_SX398 +MRJT0_SA1 +MRJT0_SI1805 +MRJT0_SX148 +MRJT0_SX238 +MRKM0_SA1 +MRKM0_SX187 +MRKM0_SX277 +MRKM0_SX7 +MRKM0_SX97 +MRLD0_SA1 +MRLD0_SI1594 +MRLD0_SI964 +MRLD0_SX244 +MRLD0_SX334 +MRLD0_SX64 +MRLJ0_SA2 +MRLJ0_SI1420 +MRLJ0_SI2050 +MRLJ0_SX160 +MRLJ0_SX430 +MRLJ0_SX70 +MRLJ1_SI1671 +MRLJ1_SI2332 +MRLJ1_SX141 +MRLJ1_SX231 +MRLJ1_SX411 +MRLJ1_SX51 +MRLK0_SA1 +MRLK0_SA2 +MRLK0_SI2140 +MRLK0_SX303 +MRLK0_SX33 +MRLK0_SX393 +MRLR0_SA1 +MRLR0_SA2 +MRLR0_SI1826 +MRLR0_SI566 +MRLR0_SX116 +MRLR0_SX206 +MRLR0_SX26 +MRLR0_SX296 +MRLR0_SX386 +MRMB0_SA1 +MRMB0_SI2211 +MRMB0_SI951 +MRMB0_SX141 +MRMB0_SX231 +MRMB0_SX321 +MRMB0_SX51 +MRMG0_SA2 +MRMG0_SI1710 +MRMG0_SI2340 +MRMG0_SX180 +MRMG0_SX270 +MRMG0_SX360 +MRMG0_SX90 +MRMH0_SA1 +MRMH0_SA2 +MRMH0_SI1021 +MRMH0_SX211 +MRMH0_SX301 +MRMH0_SX31 +MRMH0_SX391 +MRML0_SI2051 +MRML0_SI791 +MRML0_SX431 +MRML0_SX71 +MRMS0_SA1 +MRMS0_SA2 +MRMS0_SI1113 +MRMS0_SI2100 +MRMS0_SX120 +MRMS0_SX210 
+MRMS0_SX30 +MRMS0_SX300 +MRMS0_SX390 +MRPC1_SA1 +MRPC1_SA2 +MRPC1_SI1482 +MRPC1_SI2026 +MRPC1_SX132 +MRPC1_SX222 +MRPC1_SX312 +MRPC1_SX402 +MRPC1_SX42 +MRRE0_SI704 +MRRE0_SX254 +MRRE0_SX434 +MRSO0_SA1 +MRSO0_SA2 +MRSO0_SI1659 +MRSO0_SI2289 +MRSO0_SX219 +MRSO0_SX309 +MRSO0_SX399 +MRSP0_SA1 +MRSP0_SA2 +MRSP0_SI2059 +MRSP0_SI799 +MRSP0_SX169 +MRSP0_SX196 +MRSP0_SX439 +MRSP0_SX79 +MRTC0_SA1 +MRTC0_SA2 +MRTC0_SI2088 +MRTC0_SI828 +MRTC0_SX108 +MRTC0_SX18 +MRTC0_SX198 +MRTC0_SX288 +MRTJ0_SA2 +MRTJ0_SI1551 +MRTJ0_SI2032 +MRTJ0_SX322 +MRTJ0_SX412 +MRVG0_SA1 +MRVG0_SA2 +MRVG0_SI1770 +MRVG0_SI510 +MRVG0_SX150 +MRVG0_SX330 +MRVG0_SX420 +MRVG0_SX60 +MRWA0_SA1 +MRWA0_SA2 +MRWA0_SI1603 +MRWA0_SI2233 +MRWA0_SX253 +MRWA0_SX343 +MRWA0_SX433 +MRWS0_SA1 +MRWS0_SA2 +MRWS0_SX112 +MRWS0_SX202 +MRWS0_SX292 +MRXB0_SA1 +MRXB0_SI1585 +MRXB0_SX145 +MRXB0_SX235 +MRXB0_SX325 +MRXB0_SX55 +MSAH1_SA1 +MSAH1_SA2 +MSAH1_SI1049 +MSAH1_SI2309 +MSAH1_SX149 +MSAH1_SX239 +MSAH1_SX329 +MSAH1_SX419 +MSAH1_SX59 +MSAS0_SA1 +MSAS0_SA2 +MSAS0_SI2006 +MSAS0_SX26 +MSAS0_SX296 +MSAT0_SA2 +MSAT0_SI1526 +MSAT0_SI2156 +MSAT0_SI896 +MSAT0_SX176 +MSAT0_SX266 +MSAT0_SX356 +MSAT0_SX446 +MSAT0_SX86 +MSAT1_SA1 +MSAT1_SA2 +MSAT1_SI1073 +MSAT1_SI1703 +MSAT1_SI2333 +MSAT1_SX173 +MSAT1_SX353 +MSDB0_SA1 +MSDB0_SA2 +MSDB0_SI1007 +MSDB0_SI1637 +MSDB0_SI2267 +MSDB0_SX107 +MSDB0_SX17 +MSDH0_SA1 +MSDH0_SA2 +MSDH0_SI2113 +MSDH0_SX260 +MSDH0_SX350 +MSDS0_SA2 +MSDS0_SI1707 +MSDS0_SI2337 +MSDS0_SX177 +MSDS0_SX447 +MSDS0_SX87 +MSEM1_SA1 +MSEM1_SA2 +MSEM1_SX360 +MSEM1_SX450 +MSEM1_SX90 +MSES0_SA1 +MSES0_SA2 +MSES0_SI2216 +MSES0_SI2219 +MSES0_SX149 +MSES0_SX329 +MSES0_SX59 +MSFH0_SA2 +MSFH0_SI1216 +MSFH0_SI586 +MSFH0_SX226 +MSFH0_SX46 +MSFV0_SA1 +MSFV0_SA2 +MSFV0_SI1262 +MSFV0_SX182 +MSFV0_SX272 +MSFV0_SX452 +MSJK0_SA1 +MSJK0_SA2 +MSJK0_SI2226 +MSJK0_SI966 +MSJK0_SX156 +MSJK0_SX246 +MSJK0_SX426 +MSJK0_SX66 +MSMC0_SA1 +MSMC0_SA2 +MSMC0_SI1907 +MSMC0_SI647 +MSMC0_SX107 +MSMC0_SX17 +MSMC0_SX197 +MSMC0_SX287 +MSMC0_SX377 +MSMR0_SA1 +MSMR0_SA2 +MSMR0_SI1405 +MSMR0_SI775 +MSMR0_SX145 +MSMR0_SX235 +MSMR0_SX325 +MSMR0_SX55 +MSMS0_SA2 +MSMS0_SI2063 +MSMS0_SI803 +MSMS0_SX263 +MSMS0_SX353 +MSMS0_SX443 +MSRG0_SA2 +MSRG0_SI1851 +MSRG0_SI591 +MSRG0_SX141 +MSRG0_SX231 +MSRG0_SX321 +MSRG0_SX411 +MSRG0_SX51 +MSRR0_SA1 +MSRR0_SA2 +MSRR0_SI1131 +MSRR0_SX141 +MSRR0_SX231 +MSRR0_SX30 +MSRR0_SX411 +MSRR0_SX51 +MSTF0_SA1 +MSTF0_SA2 +MSTF0_SI1396 +MSTF0_SX136 +MSTF0_SX226 +MSTF0_SX406 +MSVS0_SA1 +MSVS0_SI1568 +MSVS0_SX128 +MSVS0_SX218 +MSVS0_SX38 +MTAB0_SA1 +MTAB0_SA2 +MTAB0_SI2202 +MTAB0_SI942 +MTAB0_SX132 +MTAB0_SX222 +MTAB0_SX402 +MTAB0_SX42 +MTAS0_SA1 +MTAS0_SA2 +MTAS0_SI1385 +MTAS0_SI2015 +MTAS0_SI755 +MTAS0_SX125 +MTAS0_SX305 +MTAT0_SA2 +MTAT0_SI1740 +MTAT0_SX120 +MTAT0_SX210 +MTAT0_SX30 +MTAT0_SX300 +MTAT1_SA1 +MTAT1_SA2 +MTAT1_SI1409 +MTAT1_SI1627 +MTAT1_SX239 +MTAT1_SX419 +MTBC0_SA1 +MTBC0_SA2 +MTBC0_SI1173 +MTBC0_SX183 +MTBC0_SX273 +MTBC0_SX347 +MTBC0_SX363 +MTBC0_SX93 +MTCS0_SA1 +MTCS0_SI1972 +MTCS0_SX172 +MTCS0_SX262 +MTCS0_SX352 +MTCS0_SX442 +MTDB0_SA1 +MTDB0_SA2 +MTDB0_SI2031 +MTDB0_SX141 +MTDB0_SX231 +MTDB0_SX321 +MTDB0_SX411 +MTDB0_SX51 +MTDP0_SI1274 +MTDP0_SI2151 +MTDP0_SX261 +MTDP0_SX441 +MTDP0_SX81 +MTER0_SI527 +MTER0_SX167 +MTER0_SX17 +MTER0_SX257 +MTER0_SX77 +MTJG0_SA2 +MTJG0_SI1520 +MTJG0_SI890 +MTJG0_SX350 +MTJG0_SX440 +MTJG0_SX80 +MTJM0_SA1 +MTJM0_SA2 +MTJM0_SI1226 +MTJM0_SI655 +MTJM0_SX236 +MTJM0_SX326 +MTJM0_SX416 +MTJM0_SX56 +MTJS0_SA1 +MTJS0_SI1192 +MTJS0_SX112 +MTJS0_SX202 +MTJS0_SX22 +MTJS0_SX292 +MTJU0_SA1 +MTJU0_SA2 +MTJU0_SI2269 +MTJU0_SI760 
+MTJU0_SX220 +MTJU0_SX310 +MTJU0_SX40 +MTKD0_SA1 +MTKD0_SA2 +MTKD0_SI1187 +MTKD0_SI1817 +MTKD0_SX17 +MTKD0_SX197 +MTKD0_SX377 +MTKP0_SA1 +MTKP0_SA2 +MTKP0_SX123 +MTKP0_SX213 +MTKP0_SX303 +MTKP0_SX33 +MTKP0_SX393 +MTLB0_SA2 +MTLB0_SI1764 +MTLB0_SI504 +MTLB0_SX144 +MTLB0_SX414 +MTLB0_SX54 +MTLC0_SA2 +MTLC0_SI847 +MTLC0_SX127 +MTLC0_SX217 +MTLC0_SX307 +MTLC0_SX37 +MTLC0_SX397 +MTML0_SA1 +MTML0_SA2 +MTML0_SI1065 +MTML0_SI1695 +MTML0_SX255 +MTML0_SX345 +MTML0_SX75 +MTMN0_SA1 +MTMN0_SX164 +MTMN0_SX254 +MTMN0_SX344 +MTMN0_SX74 +MTMT0_SA1 +MTMT0_SI1118 +MTMT0_SX128 +MTMT0_SX218 +MTMT0_SX308 +MTMT0_SX38 +MTMT0_SX398 +MTPF0_SA1 +MTPF0_SA2 +MTPF0_SI1235 +MTPF0_SI1865 +MTPF0_SI605 +MTPF0_SX155 +MTPF0_SX245 +MTPF0_SX335 +MTPF0_SX425 +MTPG0_SA1 +MTPG0_SA2 +MTPG0_SI2013 +MTPG0_SX123 +MTPG0_SX213 +MTPG0_SX33 +MTPG0_SX393 +MTPP0_SA1 +MTPP0_SA2 +MTPP0_SI2138 +MTPP0_SI878 +MTPP0_SX158 +MTPP0_SX248 +MTPP0_SX428 +MTPP0_SX68 +MTPR0_SA1 +MTPR0_SA2 +MTPR0_SI1600 +MTPR0_SI506 +MTPR0_SX250 +MTPR0_SX70 +MTQC0_SA2 +MTQC0_SI2071 +MTQC0_SX271 +MTQC0_SX361 +MTRC0_SA1 +MTRC0_SA2 +MTRC0_SI1623 +MTRC0_SI993 +MTRC0_SX170 +MTRC0_SX183 +MTRC0_SX273 +MTRC0_SX363 +MTRC0_SX93 +MTRR0_SA1 +MTRR0_SA2 +MTRR0_SI1548 +MTRR0_SI2178 +MTRR0_SX108 +MTRR0_SX18 +MTRR0_SX378 +MTRT0_SA1 +MTRT0_SI1857 +MTRT0_SI597 +MTRT0_SX147 +MTRT0_SX237 +MTRT0_SX417 +MTWH1_SA1 +MTWH1_SA2 +MTWH1_SI1512 +MTWH1_SI2142 +MTWH1_SI882 +MTWH1_SX162 +MTWH1_SX252 +MTWH1_SX342 +MTWH1_SX432 +MTXS0_SI1690 +MTXS0_SX250 +MTXS0_SX340 +MTXS0_SX70 +MVJH0_SA1 +MVJH0_SA2 +MVJH0_SI2186 +MVJH0_SX116 +MVJH0_SX26 +MVJH0_SX386 +MVLO0_SA2 +MVLO0_SI1147 +MVLO0_SI1777 +MVLO0_SX157 +MVLO0_SX247 +MVLO0_SX337 +MVLO0_SX427 +MVLO0_SX67 +MVRW0_SA1 +MVRW0_SI1485 +MVRW0_SI2115 +MVRW0_SI855 +MVRW0_SX315 +MVRW0_SX405 +MVRW0_SX45 +MWAC0_SA1 +MWAC0_SI2231 +MWAC0_SI971 +MWAC0_SX71 +MWAD0_SA1 +MWAD0_SA2 +MWAD0_SI1062 +MWAD0_SI1749 +MWAD0_SI2322 +MWAD0_SX162 +MWAD0_SX252 +MWAD0_SX342 +MWAR0_SA2 +MWAR0_SI2305 +MWAR0_SX145 +MWAR0_SX235 +MWAR0_SX325 +MWAR0_SX415 +MWAR0_SX55 +MWCH0_SA1 +MWCH0_SA2 +MWCH0_SI1622 +MWCH0_SX272 +MWCH0_SX362 +MWCH0_SX92 +MWDK0_SX266 +MWDK0_SX356 +MWDK0_SX446 +MWEM0_SA1 +MWEM0_SI1950 +MWEM0_SX240 +MWEM0_SX330 +MWEM0_SX60 +MWGR0_SA1 +MWGR0_SA2 +MWGR0_SI1606 +MWGR0_SI2236 +MWGR0_SI976 +MWGR0_SX166 +MWGR0_SX256 +MWGR0_SX436 +MWGR0_SX76 +MWRE0_SA1 +MWRE0_SI1687 +MWRE0_SI2317 +MWRE0_SX157 +MWRP0_SA2 +MWRP0_SI1525 +MWRP0_SI2073 +MWRP0_SX183 +MWRP0_SX3 +MWRP0_SX93 +MWSB0_SA1 +MWSB0_SA2 +MWSB0_SI1626 +MWSB0_SI2256 +MWSB0_SX186 +MWSB0_SX366 +MWSB0_SX6 +MWSB0_SX96 +MWSH0_SA1 +MWSH0_SA2 +MWSH0_SI2266 +MWSH0_SX346 +MWSH0_SX436 +MZMB0_SA2 +MZMB0_SI1166 +MZMB0_SI1796 +MZMB0_SI536 +MZMB0_SX176 +MZMB0_SX266 +MZMB0_SX356 +MZMB0_SX446 +MZMB0_SX86 diff --git a/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid b/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid new file mode 100644 index 0000000000..0e0c2517c9 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid @@ -0,0 +1,1000 @@ +FAEM0_SI762 +FAEM0_SX42 +FAJW0_SA1 +FAJW0_SX3 +FAJW0_SX93 +FALK0_SX186 +FALK0_SX6 +FALR0_SI1325 +FBAS0_SA1 +FBAS0_SX217 +FBCG1_SA1 +FBCG1_SX172 +FBCG1_SX442 +FBCH0_SX236 +FBCH0_SX416 +FBLV0_SA1 +FBLV0_SI1058 +FBLV0_SX338 +FBLV0_SX68 +FBMH0_SA1 +FBMJ0_SI815 +FCAG0_SA1 +FCAG0_SX153 +FCAG0_SX243 +FCAJ0_SI1479 +FCAJ0_SX309 +FCDR1_SX106 +FCDR1_SX196 +FCEG0_SA2 +FCJF0_SA1 +FCJF0_SX127 +FCJS0_SI1607 +FCJS0_SI2237 +FCJS0_SX257 +FCKE0_SA2 +FCKE0_SX121 +FCLT0_SI2068 +FCLT0_SX448 +FCLT0_SX88 +FCMG0_SA2 +FCMG0_SI1872 +FCMG0_SX72 +FCMM0_SA1 +FCMM0_SA2 
+FCMM0_SX183 +FCRZ0_SI2053 +FCRZ0_SX433 +FCYL0_SA1 +FCYL0_SX37 +FDAS1_SI2091 +FDAS1_SX201 +FDAS1_SX381 +FDAW0_SI1406 +FDFB0_SA1 +FDFB0_SA2 +FDFB0_SI2010 +FDFB0_SX58 +FDJH0_SX305 +FDML0_SA2 +FDML0_SX159 +FDML0_SX249 +FDML0_SX429 +FDMY0_SA2 +FDMY0_SX27 +FDNC0_SX198 +FDNC0_SX288 +FDTD0_SX211 +FDXW0_SA1 +FDXW0_SX251 +FDXW0_SX341 +FDXW0_SX71 +FEAC0_SX165 +FEAC0_SX75 +FEAR0_SI622 +FECD0_SX68 +FEEH0_SA1 +FEEH0_SI1742 +FEEH0_SI471 +FEEH0_SX122 +FEME0_SA1 +FEME0_SX155 +FEME0_SX65 +FETB0_SA1 +FETB0_SI1148 +FETB0_SX158 +FEXM0_SI1101 +FGCS0_SX136 +FGCS0_SX226 +FGCS0_SX316 +FGCS0_SX406 +FGDP0_SA1 +FGMB0_SI1775 +FGMB0_SX245 +FHLM0_SX390 +FHXS0_SA2 +FHXS0_SX445 +FJDM2_SA1 +FJDM2_SX232 +FJDM2_SX52 +FJHK0_SX302 +FJKL0_SX212 +FJKL0_SX392 +FJLG0_SI2306 +FJLR0_SA1 +FJRP1_SI2062 +FJRP1_SX82 +FJSK0_SA1 +FJSP0_SX264 +FJSP0_SX354 +FJSP0_SX444 +FJWB1_SA1 +FJWB1_SX345 +FJWB1_SX435 +FJXM0_SA1 +FJXM0_SI581 +FJXM0_SX401 +FJXP0_SA1 +FJXP0_SI1122 +FJXP0_SX132 +FKAA0_SX128 +FKAA0_SX398 +FKDE0_SA1 +FKDE0_SX151 +FKDE0_SX241 +FKDE0_SX421 +FKDE0_SX61 +FKDW0_SX397 +FKFB0_SA2 +FKFB0_SX348 +FKFB0_SX78 +FKKH0_SA1 +FKKH0_SA2 +FKKH0_SX120 +FKKH0_SX390 +FKLC0_SX355 +FKLC1_SI2308 +FKLC1_SX238 +FKLC1_SX328 +FKLC1_SX418 +FKLH0_SA2 +FKLH0_SX177 +FKSR0_SA1 +FKSR0_SA2 +FKSR0_SI1747 +FKSR0_SI487 +FKSR0_SX217 +FLAC0_SX451 +FLAG0_SA2 +FLAG0_SX114 +FLAG0_SX204 +FLAG0_SX24 +FLAG0_SX384 +FLEH0_SI1681 +FLEH0_SI2311 +FLEH0_SX331 +FLET0_SA1 +FLHD0_SI1827 +FLHD0_SX354 +FLJA0_SA1 +FLJA0_SI2338 +FLJD0_SI886 +FLJD0_SX76 +FLJG0_SA2 +FLKM0_SA2 +FLKM0_SI686 +FLKM0_SX260 +FLKM0_SX80 +FLMA0_SA1 +FLMA0_SI613 +FLMA0_SX433 +FLMA0_SX73 +FLMC0_SX22 +FLMK0_SI1035 +FLMK0_SX315 +FLMK0_SX405 +FLOD0_SI1917 +FLOD0_SX117 +FLOD0_SX171 +FLOD0_SX297 +FLTM0_SA1 +FLTM0_SI1070 +FLTM0_SI2330 +FMAH1_SA2 +FMAH1_SX159 +FMBG0_SA2 +FMBG0_SI2264 +FMEM0_SI747 +FMEM0_SX387 +FMJB0_SI547 +FMJB0_SX97 +FMJF0_SA2 +FMJU0_SX309 +FMJU0_SX399 +FMKC0_SI1702 +FMKC0_SX442 +FMKC0_SX82 +FMKF0_SX186 +FMPG0_SA2 +FNKL0_SI1522 +FNTB0_SI1203 +FNTB0_SI573 +FNTB0_SX303 +FPAB1_SI1471 +FPAB1_SX211 +FPAC0_SA2 +FPAD0_SA2 +FPAD0_SX356 +FPAD0_SX86 +FPAF0_SA2 +FPAF0_SX154 +FPAZ0_SA1 +FPAZ0_SA2 +FPAZ0_SX243 +FPJF0_SA1 +FPJF0_SX146 +FPJF0_SX56 +FPLS0_SI1590 +FPLS0_SX330 +FPMY0_SA1 +FPMY0_SX343 +FREH0_SA1 +FREH0_SA2 +FREH0_SX415 +FRJB0_SX347 +FRLL0_SX434 +FSAG0_SA1 +FSAG0_SX243 +FSAH0_SA1 +FSAH0_SA2 +FSAH0_SX164 +FSAH0_SX434 +FSBK0_SA2 +FSBK0_SI1069 +FSBK0_SX169 +FSCN0_SA2 +FSCN0_SI626 +FSCN0_SX266 +FSCN0_SX446 +FSCN0_SX86 +FSDC0_SA2 +FSDC0_SX142 +FSDC0_SX322 +FSDC0_SX52 +FSDJ0_SI485 +FSDJ0_SX215 +FSDJ0_SX305 +FSDJ0_SX395 +FSGF0_SX117 +FSJG0_SX130 +FSJK1_SA2 +FSJK1_SX125 +FSJK1_SX35 +FSJS0_SX181 +FSJW0_SI1963 +FSJW0_SX433 +FSKC0_SI1416 +FSKC0_SI786 +FSKC0_SX246 +FSKL0_SI1529 +FSKL0_SX449 +FSKP0_SA2 +FSLS0_SX156 +FSLS0_SX426 +FSMA0_SA2 +FSMA0_SX181 +FSMM0_SX144 +FSMM0_SX234 +FSMS1_SX244 +FSMS1_SX347 +FSPM0_SA2 +FSPM0_SX161 +FSPM0_SX71 +FSRH0_SI1931 +FSRH0_SI671 +FSRH0_SX221 +FSRH0_SX401 +FTAJ0_SI699 +FTAJ0_SX159 +FTAJ0_SX249 +FTAJ0_SX429 +FTBR0_SX21 +FTBW0_SA1 +FTMG0_SI1532 +FTMG0_SI2162 +FTMG0_SX452 +FVFB0_SA2 +FVFB0_SX132 +FVFB0_SX42 +FVKB0_SA1 +FVMH0_SA2 +FVMH0_SX116 +FVMH0_SX26 +MABC0_SI1620 +MABC0_SI2041 +MABC0_SI781 +MADC0_SX107 +MADC0_SX377 +MADD0_SA2 +MADD0_SI1295 +MADD0_SX178 +MADD0_SX268 +MADD0_SX88 +MAEB0_SX450 +MAEO0_SA1 +MAFM0_SI939 +MAFM0_SX129 +MAFM0_SX309 +MAJP0_SA2 +MAKB0_SI1646 +MAKB0_SX26 +MAKB0_SX386 +MAKR0_SX362 +MAKR0_SX92 +MAPV0_SX213 +MARC0_SA2 +MARC0_SX108 +MARC0_SX18 +MARC0_SX198 +MARW0_SI1906 +MBAR0_SA1 +MBAR0_SX419 +MBAR0_SX59 +MBBR0_SI2315 +MBBR0_SX65 +MBCG0_SA1 +MBCG0_SI486 
+MBEF0_SI1281 +MBEF0_SI1911 +MBEF0_SI651 +MBEF0_SX21 +MBEF0_SX381 +MBGT0_SA2 +MBGT0_SX261 +MBGT0_SX351 +MBGT0_SX441 +MBJV0_SA1 +MBJV0_SI617 +MBJV0_SX347 +MBMA0_SI592 +MBMA0_SX232 +MBMA0_SX52 +MBMA1_SI2214 +MBMA1_SX54 +MBML0_SA2 +MBML0_SI1169 +MBML0_SX89 +MBOM0_SA2 +MBOM0_SI2274 +MBOM0_SX294 +MBSB0_SA1 +MBSB0_SX3 +MBTH0_SA2 +MBTH0_SX122 +MBTH0_SX32 +MCAE0_SX277 +MCAL0_SA2 +MCAL0_SI1768 +MCDC0_SA1 +MCDC0_SX212 +MCDD0_SA2 +MCDD0_SI883 +MCDD0_SX253 +MCDD0_SX433 +MCDR0_SI1154 +MCEF0_SX235 +MCEF0_SX415 +MCEW0_SA2 +MCHL0_SX87 +MCLK0_SX310 +MCLM0_SA1 +MCLM0_SI2086 +MCLM0_SI826 +MCPM0_SA1 +MCPM0_SX114 +MCPM0_SX294 +MCPM0_SX384 +MCSS0_SI750 +MCTH0_SA1 +MCTH0_SX39 +MCXM0_SX91 +MDAC0_SA1 +MDAC0_SX181 +MDAC0_SX361 +MDAS0_SX6 +MDBB1_SX106 +MDBB1_SX16 +MDBB1_SX376 +MDBP0_SX168 +MDCD0_SI1415 +MDCD0_SX245 +MDCD0_SX425 +MDCM0_SX40 +MDCM0_SX400 +MDDC0_SI2049 +MDDC0_SI789 +MDDC0_SX159 +MDDC0_SX69 +MDED0_SA1 +MDED0_SA2 +MDEF0_SX123 +MDEF0_SX303 +MDHL0_SI1439 +MDHL0_SX269 +MDHL0_SX449 +MDHS0_SA1 +MDHS0_SA2 +MDHS0_SI1530 +MDHS0_SI2160 +MDJM0_SX105 +MDJM0_SX15 +MDKS0_SX436 +MDLB0_SA2 +MDLC0_SX405 +MDLC1_SA2 +MDLC1_SI2065 +MDLC1_SI2144 +MDLC1_SX445 +MDLC2_SI2244 +MDLC2_SX354 +MDLH0_SA2 +MDLM0_SI1234 +MDLM0_SI1864 +MDLM0_SX154 +MDLM0_SX424 +MDLR0_SA1 +MDLR0_SA2 +MDLR0_SI1863 +MDLR0_SI603 +MDLR0_SX153 +MDLR1_SA1 +MDLR1_SA2 +MDMA0_SI1430 +MDMA0_SX260 +MDMA0_SX80 +MDMT0_SA1 +MDMT0_SA2 +MDMT0_SI1832 +MDMT0_SX122 +MDMT0_SX32 +MDNS0_SA2 +MDNS0_SI2271 +MDNS0_SX201 +MDNS0_SX21 +MDPB0_SX416 +MDPK0_SI1053 +MDPK0_SX333 +MDPK0_SX423 +MDPS0_SI719 +MDPS0_SX359 +MDRD0_SA1 +MDRD0_SX32 +MDSJ0_SI2092 +MDSS0_SA2 +MDSS0_SX441 +MDSS1_SA1 +MDSS1_SI1327 +MDSS1_SI697 +MDSS1_SX157 +MDSS1_SX67 +MDTB0_SI1200 +MDTB0_SI1830 +MDTB0_SX120 +MDWD0_SA2 +MDWD0_SX270 +MDWD0_SX90 +MDWH0_SX215 +MDWH0_SX305 +MDWM0_SA1 +MDWM0_SA2 +MDWM0_SX16 +MDWM0_SX286 +MEAL0_SA2 +MEAL0_SI2177 +MEAL0_SX107 +MEAL0_SX347 +MEDR0_SA1 +MEDR0_SA2 +MEDR0_SI1374 +MEFG0_SA1 +MEGJ0_SA2 +MEGJ0_SX257 +MEGJ0_SX3 +MEJL0_SA1 +MEJL0_SX152 +MEJL0_SX242 +MEJS0_SI610 +MEJS0_SX160 +MEJS0_SX340 +MESG0_SX432 +MESJ0_SX187 +MESJ0_SX97 +MEWM0_SI718 +MEWM0_SX178 +MEWM0_SX88 +MFER0_SI862 +MFER0_SX142 +MFRM0_SX345 +MFRM0_SX435 +MFWK0_SI1879 +MFWK0_SX169 +MFXS0_SX54 +MFXV0_SA2 +MFXV0_SX105 +MGAF0_SA1 +MGAF0_SX22 +MGAF0_SX382 +MGAG0_SA2 +MGAK0_SX226 +MGAK0_SX46 +MGAR0_SX132 +MGAW0_SI535 +MGAW0_SX175 +MGES0_SA1 +MGES0_SI2111 +MGES0_SI851 +MGJC0_SA2 +MGJC0_SX75 +MGRL0_SI2127 +MGRL0_SI867 +MGRL0_SX147 +MGRP0_SA2 +MGSH0_SA2 +MGSH0_SI1806 +MGSH0_SX127 +MGSH0_SX276 +MGSH0_SX6 +MGSL0_SA1 +MGSL0_SI534 +MGSL0_SX264 +MGXP0_SX187 +MGXP0_SX7 +MHBS0_SX315 +MHBS0_SX45 +MHIT0_SA1 +MHJB0_SA1 +MHJB0_SI1017 +MHMG0_SX195 +MHMR0_SA1 +MHMR0_SI489 +MHRM0_SA1 +MHRM0_SI958 +MHRM0_SX148 +MHRM0_SX58 +MHXL0_SI1772 +MHXL0_SX242 +MILB0_SA2 +MJAC0_SX307 +MJAC0_SX71 +MJAE0_SX174 +MJAI0_SA1 +MJAI0_SA2 +MJBG0_SX62 +MJDA0_SI1031 +MJDA0_SX311 +MJDE0_SI463 +MJDG0_SA2 +MJDG0_SI1042 +MJDG0_SI1705 +MJDM0_SA1 +MJDM0_SI974 +MJEB0_SI656 +MJEB0_SX296 +MJEB1_SA2 +MJEB1_SX207 +MJEB1_SX387 +MJEE0_SA1 +MJEE0_SX247 +MJEE0_SX337 +MJFH0_SA2 +MJFH0_SI1107 +MJFR0_SX75 +MJHI0_SA1 +MJHI0_SX158 +MJJB0_SA1 +MJJB0_SX239 +MJJJ0_SX443 +MJJM0_SA2 +MJJM0_SI827 +MJJM0_SX107 +MJKR0_SA1 +MJKR0_SI571 +MJLB0_SX176 +MJLG1_SX292 +MJLS0_SX106 +MJMA0_SA1 +MJMA0_SA2 +MJMD0_SA2 +MJMD0_SX308 +MJMD0_SX38 +MJMM0_SX85 +MJPG0_SI1191 +MJPG0_SX111 +MJPG0_SX201 +MJPG0_SX21 +MJPM0_SA2 +MJPM0_SX378 +MJPM1_SI2280 +MJPM1_SX401 +MJRA0_SA1 +MJRA0_SA2 +MJRA0_SI1236 +MJRA0_SI1866 +MJRA0_SX426 +MJRG0_SI1366 +MJRG0_SI1996 +MJRG0_SX376 +MJRH0_SX225 +MJRH1_SA1 +MJRH1_SI514 +MJRH1_SX154 
+MJRH1_SX244 +MJRH1_SX424 +MJRK0_SA1 +MJRK0_SA2 +MJRK0_SI1662 +MJRK0_SX160 +MJRK0_SX250 +MJRK0_SX430 +MJRP0_SA1 +MJRP0_SA2 +MJRP0_SX225 +MJSR0_SA1 +MJSR0_SI1424 +MJSR0_SX344 +MJWG0_SA1 +MJWG0_SX265 +MJWS0_SI513 +MJWS0_SX153 +MJWS0_SX63 +MJWT0_SA1 +MJWT0_SX121 +MJWT0_SX211 +MJWT0_SX301 +MJWT0_SX31 +MJWT0_SX391 +MJXA0_SX427 +MJXL0_SI542 +MKAG0_SA1 +MKAG0_SX259 +MKAJ0_SA2 +MKAJ0_SX154 +MKAM0_SA1 +MKAM0_SX146 +MKAM0_SX326 +MKAM0_SX56 +MKDB0_SA1 +MKDB0_SA2 +MKDB0_SX152 +MKDD0_SA2 +MKES0_SA1 +MKES0_SI1253 +MKES0_SI1883 +MKES0_SX173 +MKJO0_SI1517 +MKJO0_SI887 +MKJO0_SX437 +MKLN0_SI968 +MKLN0_SX248 +MKLR0_SA2 +MKLR0_SI1689 +MKLS0_SA1 +MKLS0_SX357 +MKLS0_SX87 +MKLS1_SA1 +MKLS1_SA2 +MKLS1_SX375 +MKLW0_SA1 +MKRG0_SX411 +MKXL0_SA2 +MKXL0_SX15 +MKXL0_SX375 +MLBC0_SA1 +MLBC0_SI1869 +MLBC0_SX249 +MLEL0_SA1 +MLEL0_SA2 +MLEL0_SI1246 +MLEL0_SX256 +MLEL0_SX436 +MLJC0_SX145 +MLJC0_SX415 +MLJH0_SX64 +MLNS0_SI2037 +MMAA0_SA1 +MMAA0_SA2 +MMAA0_SX35 +MMAB1_SI1494 +MMAB1_SX234 +MMAG0_SA2 +MMAG0_SI1126 +MMAG0_SX316 +MMAM0_SI2227 +MMAM0_SX157 +MMAM0_SX427 +MMAR0_SX256 +MMBS0_SI1781 +MMCC0_SA2 +MMDB0_SX177 +MMDG0_SA1 +MMDG0_SA2 +MMDG0_SI520 +MMDG0_SX160 +MMDG0_SX250 +MMDM0_SI1941 +MMDM0_SI681 +MMDM0_SX141 +MMDM1_SA2 +MMDM1_SI2043 +MMDM1_SX423 +MMDM1_SX63 +MMDS0_SA1 +MMEA0_SA1 +MMEA0_SX128 +MMEA0_SX398 +MMEB0_SA2 +MMEB0_SX187 +MMEB0_SX367 +MMGC0_SA2 +MMGC0_SX135 +MMGC0_SX225 +MMGG0_SX269 +MMGK0_SX332 +MMGK0_SX62 +MMJB1_SA2 +MMRP0_SA2 +MMRP0_SX144 +MMSM0_SX116 +MMSM0_SX206 +MMVP0_SA1 +MMVP0_SA2 +MMWB0_SI989 +MMWB0_SX89 +MMWS0_SA2 +MMWS0_SX168 +MMWS0_SX348 +MMWS0_SX438 +MMWS1_SI1701 +MMXS0_SI2136 +MMXS0_SX246 +MMXS0_SX426 +MNET0_SI816 +MNET0_SX6 +MNTW0_SA2 +MNTW0_SX168 +MNTW0_SX78 +MPAR0_SI2206 +MPAR0_SI946 +MPAR0_SX136 +MPAR0_SX316 +MPEB0_SI1034 +MPEB0_SI1860 +MPEB0_SX240 +MPEB0_SX330 +MPFU0_SI628 +MPFU0_SX448 +MPGH0_SX114 +MPGH0_SX24 +MPGR0_SX240 +MPGR0_SX330 +MPGR1_SX149 +MPPC0_SA1 +MPRD0_SA1 +MPRD0_SX261 +MPRD0_SX351 +MPRD0_SX441 +MPRD0_SX81 +MPRK0_SI1727 +MPRK0_SX107 +MPRK0_SX377 +MPRT0_SA1 +MPRT0_SX310 +MPSW0_SI1067 +MPSW0_SX167 +MPSW0_SX437 +MRAB1_SX128 +MRAB1_SX308 +MRAI0_SA1 +MRAI0_SA2 +MRAI0_SX72 +MRAM0_SA1 +MRAM0_SA2 +MRAM0_SX15 +MRBC0_SI1859 +MRBC0_SX329 +MRBC0_SX419 +MRCG0_SI798 +MRCG0_SX168 +MRCW0_SA1 +MRCW0_SX291 +MRDD0_SI1680 +MRDD0_SX150 +MRDD0_SX277 +MRDD0_SX60 +MRDM0_SI1595 +MRDM0_SX65 +MRDS0_SA1 +MREE0_SX24 +MREH1_SX249 +MREH1_SX69 +MREM0_SA2 +MREW1_SI870 +MRFK0_SX446 +MRFL0_SA1 +MRFL0_SX256 +MRFL0_SX436 +MRFL0_SX76 +MRGM0_SA2 +MRGM0_SX262 +MRGS0_SA2 +MRGS0_SX186 +MRHL0_SI885 +MRHL0_SX345 +MRHL0_SX435 +MRJB1_SA1 +MRJB1_SA2 +MRJB1_SX210 +MRJB1_SX30 +MRJB1_SX390 +MRJH0_SA2 +MRJH0_SX307 +MRJH0_SX79 +MRJM0_SX148 +MRJM1_SA2 +MRJM1_SI1298 +MRJM1_SI1928 +MRJM1_SX128 +MRJT0_SA2 +MRJT0_SI1498 +MRJT0_SX328 +MRJT0_SX418 +MRKM0_SA2 +MRKM0_SX367 +MRLD0_SA2 +MRLD0_SI2224 +MRLD0_SX154 +MRLD0_SX424 +MRLJ0_SA1 +MRLJ0_SX250 +MRLJ0_SX340 +MRLJ1_SA1 +MRLJ1_SA2 +MRLJ1_SX321 +MRLK0_SI843 +MRLK0_SX123 +MRLK0_SX213 +MRMB0_SA2 +MRMB0_SI1581 +MRMB0_SX411 +MRMG0_SA1 +MRMG0_SI1080 +MRMG0_SX450 +MRMH0_SI1349 +MRMH0_SI2281 +MRMH0_SX121 +MRML0_SA2 +MRML0_SX341 +MRPC1_SI2112 +MRRE0_SA2 +MRRE0_SX164 +MRRE0_SX344 +MRRE0_SX74 +MRSO0_SX129 +MRSO0_SX39 +MRSP0_SX259 +MRTC0_SX378 +MRVG0_SI1140 +MRVG0_SX240 +MRWA0_SI973 +MRWA0_SX163 +MRWA0_SX73 +MRWS0_SI1732 +MRWS0_SI472 +MRWS0_SX22 +MRWS0_SX382 +MRXB0_SA2 +MRXB0_SX415 +MSAH1_SI1679 +MSAS0_SX116 +MSAS0_SX206 +MSAS0_SX386 +MSAT0_SA1 +MSAT1_SX263 +MSAT1_SX443 +MSAT1_SX83 +MSDB0_SX197 +MSDB0_SX287 +MSDB0_SX377 +MSDH0_SI2240 +MSDH0_SX440 +MSDH0_SX80 +MSDS0_SA1 +MSEM1_SI1440 +MSEM1_SX180 
+MSEM1_SX270 +MSES0_SI1589 +MSES0_SX239 +MSES0_SX419 +MSFH0_SX316 +MSFV0_SI1892 +MSFV0_SX362 +MSFV0_SX92 +MSMR0_SX415 +MSMS0_SA1 +MSMS0_SX173 +MSMS0_SX83 +MSRG0_SA1 +MSRG0_SI1221 +MSTF0_SI766 +MSTF0_SX316 +MSTF0_SX46 +MSVS0_SA2 +MSVS0_SX308 +MTAS0_SX215 +MTAS0_SX35 +MTAS0_SX395 +MTAT0_SX390 +MTAT1_SX59 +MTBC0_SI1803 +MTCS0_SA2 +MTCS0_SI2265 +MTCS0_SX82 +MTDP0_SA2 +MTER0_SA2 +MTER0_SI1787 +MTJG0_SA1 +MTJG0_SI2157 +MTJG0_SX260 +MTJM0_SI1856 +MTJM0_SX146 +MTJU0_SX130 +MTJU0_SX400 +MTKD0_SX107 +MTKD0_SX287 +MTKP0_SI1023 +MTLB0_SA1 +MTLB0_SX234 +MTLC0_SA1 +MTML0_SI2325 +MTML0_SX165 +MTMN0_SA2 +MTMN0_SI1064 +MTMN0_SI2324 +MTMN0_SX434 +MTMT0_SA2 +MTMT0_SI1748 +MTPF0_SX65 +MTPG0_SI1383 +MTPG0_SI753 +MTPG0_SX303 +MTPP0_SX338 +MTPR0_SX340 +MTQC0_SI480 +MTQC0_SX91 +MTRR0_SX198 +MTRR0_SX288 +MTRT0_SA2 +MTRT0_SX254 +MTRT0_SX57 +MTWH1_SX72 +MTXS0_SA1 +MTXS0_SA2 +MVJH0_SI926 +MVJH0_SX206 +MVJH0_SX296 +MVLO0_SA1 +MVRW0_SA2 +MVRW0_SX135 +MVRW0_SX225 +MWAC0_SA2 +MWAC0_SX341 +MWAC0_SX431 +MWAD0_SX432 +MWAD0_SX72 +MWAR0_SA1 +MWAR0_SI1675 +MWCH0_SI1895 +MWCH0_SI2252 +MWCH0_SX182 +MWCH0_SX452 +MWDK0_SA1 +MWDK0_SA2 +MWDK0_SI2017 +MWDK0_SI806 +MWDK0_SX176 +MWDK0_SX86 +MWEM0_SA2 +MWEM0_SI1320 +MWEM0_SI1393 +MWEM0_SX150 +MWGR0_SX346 +MWRE0_SX247 +MWRE0_SX337 +MWRE0_SX427 +MWRP0_SA1 +MWRP0_SX273 +MWRP0_SX363 +MWSB0_SX276 +MWSH0_SX256 +MWSH0_SX76 +MZMB0_SA1 diff --git a/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid b/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid new file mode 100644 index 0000000000..e99edfe937 --- /dev/null +++ b/examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid @@ -0,0 +1,620 @@ +FAEM0_SI1392 +FAJW0_SI1263 +FAJW0_SI633 +FALK0_SI658 +FALR0_SX335 +FAPB0_SI1063 +FAPB0_SI2323 +FAPB0_SX433 +FBAS0_SI1472 +FBAS0_SI2066 +FBCG1_SX352 +FBCH0_SI959 +FBJL0_SI922 +FBLV0_SI1688 +FBMH0_SI1136 +FBMH0_SI970 +FBMJ0_SA1 +FBMJ0_SI1776 +FBMJ0_SI516 +FBMJ0_SX336 +FCDR1_SI1186 +FCDR1_SI1816 +FCDR1_SI556 +FCDR1_SX286 +FCKE0_SI1741 +FCKE0_SI481 +FCLT0_SI808 +FCMG0_SI1142 +FCMG0_SX432 +FCMM0_SI1957 +FCMM0_SX420 +FCYL0_SI667 +FCYL0_SX349 +FDAS1_SI1461 +FDAS1_SI831 +FDAW0_SI1271 +FDAW0_SI2036 +FDJH0_SI935 +FDKN0_SI1202 +FDKN0_SX181 +FDKN0_SX451 +FDMY0_SA1 +FDMY0_SI567 +FDMY0_SI714 +FDMY0_SX387 +FDNC0_SI1278 +FDNC0_SI1908 +FDTD0_SA1 +FDTD0_SX321 +FEAC0_SI615 +FEAR0_SX352 +FECD0_SA1 +FECD0_SI1418 +FECD0_SI788 +FEME0_SI875 +FEME0_SX335 +FEXM0_SA1 +FEXM0_SI482 +FEXM0_SX366 +FGDP0_SI988 +FGDP0_SX88 +FGMB0_SI1145 +FGMB0_SX335 +FGRW0_SA1 +FGRW0_SI1152 +FGRW0_SX162 +FGRW0_SX432 +FHLM0_SX120 +FHLM0_SX349 +FHXS0_SA1 +FHXS0_SI1075 +FHXS0_SI2302 +FHXS0_SX175 +FJDM2_SA2 +FJDM2_SX142 +FJEN0_SA1 +FJEN0_SX327 +FJEN0_SX417 +FJHK0_SI2282 +FJKL0_SI932 +FJLG0_SI1889 +FJLR0_SI1231 +FJRB0_SX402 +FJRP1_SA1 +FJRP1_SI1432 +FJRP1_SX262 +FJRP1_SX352 +FJSK0_SI1052 +FJSP0_SI1434 +FJWB1_SI748 +FJXM0_SX311 +FJXM0_SX41 +FJXP0_SI1752 +FKAA0_SA1 +FKDE0_SI1141 +FKDE0_SI1771 +FKDW0_SI1207 +FKDW0_SI1891 +FKFB0_SI1608 +FKFB0_SX438 +FKKH0_SI1290 +FKKH0_SI1920 +FKLC0_SI985 +FKLC0_SX175 +FKLC1_SI1048 +FKLH0_SI1257 +FKSR0_SX366 +FLAC0_SI1339 +FLAG0_SI1464 +FLAG0_SI834 +FLEH0_SI1051 +FLET0_SI507 +FLJA0_SI1078 +FLJA0_SX178 +FLJD0_SI1516 +FLJG0_SI981 +FLJG0_SX171 +FLJG0_SX351 +FLKM0_SA1 +FLKM0_SI620 +FLKM0_SX350 +FLKM0_SX440 +FLMC0_SI1372 +FLMK0_SA1 +FLMK0_SI1229 +FLTM0_SX170 +FLTM0_SX350 +FLTM0_SX440 +FMAH1_SI879 +FMBG0_SI1160 +FMEM0_SA1 +FMEM0_SX333 +FMJB0_SI1177 +FMJF0_SI624 +FMJF0_SX174 +FMJF0_SX84 +FMJU0_SI1389 +FMKC0_SI1041 +FMKF0_SI1018 +FMPG0_SA1 +FMPG0_SI972 +FMPG0_SX162 +FMPG0_SX342 +FMPG0_SX432 +FNKL0_SI892 
+FNTB0_SI679 +FPAB1_SA1 +FPAB1_SI2101 +FPAB1_SI841 +FPAC0_SI1921 +FPAC0_SI661 +FPAD0_SI716 +FPAD0_SX176 +FPAF0_SA1 +FPAF0_SI1054 +FPAZ0_SI2223 +FPAZ0_SI963 +FPJF0_SI1259 +FPJF0_SX352 +FPLS0_SI960 +FPMY0_SI1153 +FPMY0_SI523 +FREH0_SI1945 +FRLL0_SI805 +FSAG0_SI1323 +FSAG0_SX153 +FSAG0_SX333 +FSAG0_SX423 +FSAH0_SI614 +FSAH0_SX327 +FSAK0_SI1300 +FSBK0_SX349 +FSCN0_SA1 +FSCN0_SI705 +FSCN0_SX176 +FSDC0_SI1312 +FSDJ0_SI1115 +FSGF0_SI2187 +FSGF0_SI927 +FSJG0_SA1 +FSJG0_SA2 +FSJG0_SI940 +FSJG0_SX220 +FSJG0_SX40 +FSJG0_SX400 +FSJS0_SA1 +FSJS0_SX451 +FSJW0_SI1333 +FSKP0_SI1098 +FSMA0_SI991 +FSMA0_SX451 +FSMM0_SX324 +FSPM0_SI1241 +FSPM0_SX251 +FSRH0_SX311 +FSSB0_SI1712 +FSSB0_SX362 +FTBR0_SI1402 +FTBR0_SI921 +FTBW0_SI715 +FTBW0_SX175 +FTLG0_SI1743 +FTLG0_SI483 +FTMG0_SI902 +FVFB0_SI1510 +FVKB0_SX349 +FVMH0_SI1466 +FVMH0_SI836 +MADC0_SI1367 +MADC0_SI737 +MAEB0_SI1411 +MAEO0_SI1326 +MAJP0_SI1704 +MAJP0_SX174 +MAKB0_SA2 +MAKB0_SI1016 +MAKB0_SI2276 +MAKB0_SX116 +MAPV0_SI1293 +MAPV0_SI663 +MARW0_SX286 +MARW0_SX349 +MBBR0_SI1055 +MBBR0_SX335 +MBCG0_SI957 +MBCG0_SX327 +MBGT0_SI1841 +MBGT0_SX171 +MBMA0_SI1222 +MBMA1_SI954 +MBMA1_SX324 +MBTH0_SI2102 +MBWP0_SX349 +MCAE0_SI1447 +MCAE0_SI2077 +MCAE0_SI817 +MCAL0_SI1138 +MCDR0_SI1784 +MCDR0_SI524 +MCEF0_SI842 +MCEW0_SA1 +MCEW0_SI2072 +MCEW0_SI812 +MCEW0_SX362 +MCEW0_SX452 +MCHL0_SI1347 +MCHL0_SI1404 +MCLK0_SI2290 +MCLK0_SI650 +MCPM0_SI1824 +MCSS0_SI1380 +MCSS0_SI688 +MCTM0_SI1350 +MCTM0_SI1980 +MDAC0_SI631 +MDAS0_SI1896 +MDAS0_SI636 +MDBP0_SI528 +MDBP0_SX438 +MDCD0_SI785 +MDCD0_SX335 +MDCM0_SI1480 +MDDC0_SI1419 +MDED0_SI540 +MDEF0_SI1123 +MDEM0_SA1 +MDEM0_SI608 +MDEM0_SI800 +MDEM0_SX428 +MDHS0_SI900 +MDJM0_SI1455 +MDKS0_SX166 +MDKS0_SX346 +MDLB0_SI1306 +MDLB0_SX136 +MDLB0_SX406 +MDLC0_SI1395 +MDLC0_SI2025 +MDLC1_SI1435 +MDLH0_SX160 +MDLH0_SX430 +MDLM0_SI604 +MDLR0_SX333 +MDLR1_SI669 +MDMA0_SX170 +MDMA0_SX350 +MDMA0_SX440 +MDNS0_SI1011 +MDNS0_SI873 +MDPB0_SI1760 +MDPB0_SI866 +MDRD0_SI752 +MDSJ0_SI1462 +MDSJ0_SX438 +MDWD0_SI1260 +MDWH0_SA1 +MDWH0_SI1168 +MDWH0_SI665 +MDWM0_SI916 +MEDR0_SI2004 +MEFG0_SI491 +MEFG0_SI598 +MEGJ0_SA1 +MEGJ0_SI1337 +MEGJ0_SI707 +MEGJ0_SX167 +MEJS0_SI1240 +MESG0_SI702 +MESJ0_SI2039 +MFWK0_SX349 +MFXS0_SX324 +MFXV0_SI1005 +MFXV0_SI1342 +MGAF0_SI1282 +MGAG0_SI691 +MGAK0_SI1036 +MGAK0_SX136 +MGAR0_SX312 +MGAW0_SI1165 +MGES0_SX311 +MGJC0_SX435 +MGRL0_SX327 +MGRP0_SI1317 +MGRP0_SX327 +MGSH0_SI1176 +MGSH0_SI546 +MGSL0_SI797 +MGXP0_SI1087 +MGXP0_SI525 +MHBS0_SI945 +MHIT0_SI983 +MHMG0_SI735 +MHMR0_SI1692 +MILB0_SI903 +MJAC0_SI701 +MJAC0_SX251 +MJAE0_SX84 +MJAI0_SI682 +MJAI0_SI710 +MJDC0_SI531 +MJDE0_SA1 +MJDE0_SI1120 +MJDE0_SI490 +MJDE0_SX220 +MJDM0_SI1340 +MJDM0_SX170 +MJDM0_SX350 +MJEB0_SX170 +MJEB1_SI1467 +MJEB1_SI837 +MJFR0_SA1 +MJFR0_SX435 +MJHI0_SI1328 +MJJJ0_SI1163 +MJJM0_SI1251 +MJLB0_SI1616 +MJLS0_SI1726 +MJMA0_SI2125 +MJMD0_SI2288 +MJMM0_SI1255 +MJMM0_SX175 +MJPG0_SI1821 +MJPM0_SI1368 +MJPM1_SX311 +MJRA0_SX336 +MJRG0_SI736 +MJRG0_SX352 +MJRH0_SI1840 +MJRH1_SI1558 +MJRK0_SI880 +MJRP0_SI1845 +MJSR0_SI2054 +MJSR0_SI794 +MJWG0_SI813 +MJWG0_SI895 +MJWG0_SX175 +MJWS0_SX333 +MJWT0_SI1291 +MJWT0_SI1381 +MJXL0_SI1172 +MKAG0_SI979 +MKAH0_SX178 +MKAM0_SI1250 +MKAM0_SI1465 +MKDD0_SI1567 +MKDD0_SI2197 +MKDD0_SI937 +MKDT0_SI814 +MKES0_SI623 +MKLS0_SI1437 +MKLS0_SI2067 +MKLS1_SI915 +MKLW0_SI1571 +MKLW0_SX311 +MKRG0_SI861 +MKXL0_SI1815 +MKXL0_SI1958 +MLBC0_SI1239 +MLEL0_SI616 +MLEL0_SX166 +MLJC0_SI1225 +MLJH0_SA1 +MLJH0_SA2 +MLJH0_SI1422 +MLJH0_SI694 +MLJH0_SX244 +MLSH0_SI1417 +MLSH0_SX247 +MMAA0_SI1588 +MMAA0_SI845 +MMAB1_SI864 +MMAB1_SX324 +MMAG0_SA1 
+MMAG0_SI1756 +MMAG0_SX136 +MMAR0_SI1966 +MMAR0_SX166 +MMAR0_SX346 +MMBS0_SI521 +MMBS0_SX161 +MMCC0_SI1338 +MMDB0_SI987 +MMDG0_SI1780 +MMDM0_SI1311 +MMDM1_SX153 +MMDM1_SX333 +MMEB0_SX327 +MMGC0_SI1305 +MMGG0_SI1079 +MMGG0_SX449 +MMLM0_SI2150 +MMPM0_SX161 +MMRP0_SX324 +MMSM0_SI1106 +MMSM0_SI476 +MMVP0_SI654 +MMVP0_SX347 +MMWB0_SA1 +MMWB0_SI2249 +MMWB0_SX359 +MMWB0_SX449 +MNTW0_SI1068 +MNTW0_SI1698 +MPEB0_SI600 +MPFU0_SI1258 +MPGH0_SI675 +MPGR0_SI1410 +MPGR1_SI1499 +MPMB0_SA1 +MPMB0_SA2 +MPMB0_SI1501 +MPMB0_SI2131 +MPMB0_SI871 +MPMB0_SX151 +MPMB0_SX331 +MPMB0_SX421 +MPMB0_SX61 +MPPC0_SI1412 +MPRB0_SI1215 +MPRB0_SI575 +MPRD0_SI801 +MPRD0_SX171 +MPRK0_SA1 +MPRK0_SI1097 +MPRK0_SI467 +MPRK0_SX287 +MRAB0_SI1854 +MRAB1_SI848 +MRAI0_SI2052 +MRAI0_SI792 +MRAI0_SX432 +MRAM0_SI1951 +MRCG0_SA2 +MRCG0_SI1428 +MRCG0_SX348 +MRCG0_SX438 +MRCW0_SI741 +MRDM0_SI1044 +MRDM0_SX335 +MREE0_SI1104 +MREE0_SI1959 +MREH1_SA1 +MREH1_SI1599 +MREH1_SI969 +MREM0_SI511 +MRFK0_SI1076 +MRFL0_SI1156 +MRFL0_SI526 +MRFL0_SX166 +MRGM0_SI532 +MRGM0_SX172 +MRGM0_SX442 +MRGS0_SI1356 +MRGS0_SI726 +MRGS0_SX6 +MRJB1_SI1413 +MRJB1_SI2021 +MRJB1_SX120 +MRJH0_SI1519 +MRJH0_SI889 +MRJH0_SX169 +MRJT0_SI868 +MRJT0_SX58 +MRKM0_SI1267 +MRKM0_SI1391 +MRKM0_SI637 +MRLJ0_SI790 +MRLJ1_SI2301 +MRLK0_SI1468 +MRLR0_SI1196 +MRML0_SA1 +MRML0_SI1421 +MRML0_SX161 +MRML0_SX251 +MRMS0_SI2057 +MRRE0_SA1 +MRRE0_SI1334 +MRRE0_SI952 +MRSO0_SI1206 +MRSP0_SI1429 +MRTC0_SI1458 +MRTJ0_SA1 +MRTJ0_SI772 +MRTJ0_SX142 +MRTJ0_SX232 +MRTJ0_SX52 +MRWS0_SI1102 +MRXB0_SI2215 +MRXB0_SI955 +MSAS0_SI1376 +MSAS0_SI746 +MSDH0_SI980 +MSDH0_SX170 +MSDS0_SI1077 +MSDS0_SX267 +MSDS0_SX357 +MSEM1_SI2070 +MSEM1_SI810 +MSFH0_SA1 +MSFH0_SI1738 +MSFH0_SX136 +MSFH0_SX406 +MSFV0_SI632 +MSJK0_SI1596 +MSJK0_SX336 +MSMC0_SI509 +MSMR0_SI1150 +MSMS0_SI1433 +MSRR0_SI1761 +MSRR0_SI501 +MSTF0_SI852 +MSVS0_SI2198 +MSVS0_SI938 +MSVS0_SX398 +MTAB0_SI1572 +MTAB0_SX312 +MTAT0_SA1 +MTAT0_SI1110 +MTAT0_SI811 +MTAT1_SI779 +MTAT1_SX149 +MTAT1_SX329 +MTBC0_SI543 +MTCS0_SI712 +MTDB0_SI1401 +MTDB0_SI771 +MTDP0_SA1 +MTDP0_SI1521 +MTDP0_SX171 +MTDP0_SX351 +MTER0_SA1 +MTER0_SI1157 +MTER0_SX437 +MTJG0_SX170 +MTJS0_SA2 +MTJS0_SI1822 +MTJS0_SI562 +MTJS0_SX382 +MTJU0_SI2020 +MTKD0_SI630 +MTKP0_SI2283 +MTKP0_SI454 +MTLB0_SI1134 +MTLB0_SX324 +MTLC0_SI1313 +MTLC0_SI1477 +MTML0_SX435 +MTMN0_SI582 +MTMT0_SI488 +MTPP0_SI1508 +MTPR0_SI2230 +MTPR0_SX160 +MTPR0_SX430 +MTQC0_SA1 +MTQC0_SI1441 +MTQC0_SX181 +MTQC0_SX451 +MTRC0_SI589 +MTRR0_SI918 +MTRT0_SI1227 +MTXS0_SI1060 +MTXS0_SI2320 +MTXS0_SX160 +MTXS0_SX430 +MVJH0_SI1556 +MVLO0_SI517 +MWAC0_SI1601 +MWAC0_SX161 +MWAC0_SX251 +MWAR0_SI1045 +MWDK0_SI1436 +MWEM0_SX420 +MWRE0_SA2 +MWRE0_SI1057 +MWRE0_SX67 +MWRP0_SI1443 +MWSB0_SI996 +MWSH0_SI1426 +MWSH0_SI796 +MWSH0_SX166 diff --git a/examples/wav2vec/unsupervised/data/__init__.py b/examples/wav2vec/unsupervised/data/__init__.py new file mode 100644 index 0000000000..d0545627ef --- /dev/null +++ b/examples/wav2vec/unsupervised/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from .extracted_features_dataset import ExtractedFeaturesDataset +from .random_input_dataset import RandomInputDataset + + +__all__ = [ + "ExtractedFeaturesDataset", + "RandomInputDataset", +] diff --git a/examples/wav2vec/unsupervised/data/extracted_features_dataset.py b/examples/wav2vec/unsupervised/data/extracted_features_dataset.py new file mode 100644 index 0000000000..7f7a58c0e5 --- /dev/null +++ b/examples/wav2vec/unsupervised/data/extracted_features_dataset.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import logging +import os +import contextlib + +import numpy as np +import torch + +from fairseq.data import FairseqDataset, data_utils + + +logger = logging.getLogger(__name__) + + +class ExtractedFeaturesDataset(FairseqDataset): + def __init__( + self, + path, + split, + min_length=3, + max_length=None, + labels=None, + label_dict=None, + shuffle=True, + sort_by_length=True, + aux_target_postfix=None, + ): + super().__init__() + + self.min_length = min_length + self.max_length = max_length + self.shuffle = shuffle + self.sort_by_length = sort_by_length + self.label_dict = label_dict + + if labels is not None: + assert label_dict is not None + + self.sizes = [] + self.offsets = [] + self.labels = [] + self.aux_tgt = None + + path = os.path.join(path, split) + data_path = path + self.data = np.load(data_path + ".npy", mmap_mode="r") + + offset = 0 + skipped = 0 + + if not os.path.exists(path + f".{labels}"): + labels = None + + with open(data_path + ".lengths", "r") as len_f, open( + path + f".{labels}", "r" + ) if labels is not None else contextlib.ExitStack() as lbl_f: + for line in len_f: + length = int(line.rstrip()) + lbl = None if labels is None else next(lbl_f).rstrip().split() + if length >= min_length and ( + max_length is None or length <= max_length + ): + self.sizes.append(length) + self.offsets.append(offset) + if lbl is not None: + self.labels.append(lbl) + offset += length + + self.sizes = np.asarray(self.sizes) + self.offsets = np.asarray(self.offsets) + + if aux_target_postfix is not None: + if not os.path.exists(path+f".{aux_target_postfix}"): + logger.info(f"auxaliry target for {split} missing") + else: + with open(path+f".{aux_target_postfix}", "r") as t_f: + self.aux_tgt = [ + torch.LongTensor(list(map(int,seg.strip().split())))\ + for seg in t_f] + + logger.info(f"loaded {len(self.offsets)}, skipped {skipped} samples") + + def __getitem__(self, index): + offset = self.offsets[index] + end = self.sizes[index] + offset + feats = torch.from_numpy(self.data[offset:end].copy()).float() + + res = {"id": index, "features": feats} + if len(self.labels) > 0: + res["target"] = self.label_dict.encode_line( + self.labels[index], + line_tokenizer=lambda x: x, + append_eos=False, + ) + + if self.aux_tgt: + res["aux_target"] = self.aux_tgt[index] + + return res + + def __len__(self): + return len(self.sizes) + + def collater(self, samples): + if len(samples) == 0: + return {} + + features = [s["features"] for s in samples] + sizes = [len(s) for s in features] + + target_size = max(sizes) + + collated_features = features[0].new_zeros( + len(features), target_size, features[0].size(-1) + ) + padding_mask = torch.BoolTensor(collated_features.shape[:-1]).fill_(False) + for i, (f, size) in enumerate(zip(features, sizes)): + collated_features[i, :size] = f + padding_mask[i, size:] = True + + res = { + "id": 
torch.LongTensor([s["id"] for s in samples]), + "net_input": {"features": collated_features, "padding_mask": padding_mask}, + } + + if len(self.labels) > 0: + target = data_utils.collate_tokens( + [s["target"] for s in samples], + pad_idx=self.label_dict.pad(), + left_pad=False, + ) + res["target"] = target + + if self.aux_tgt: + idxs = torch.nn.utils.rnn.pad_sequence( + [s["aux_target"] for s in samples], + batch_first=True, + padding_value=-1, + ) + res["net_input"]["aux_target"] = idxs + + return res + + def num_tokens(self, index): + return self.size(index) + + def size(self, index): + return self.sizes[index] + + def ordered_indices(self): + """Return an ordered list of indices. Batches will be constructed based + on this order.""" + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + if self.sort_by_length: + order.append(self.sizes) + return np.lexsort(order)[::-1] + else: + return order[0] diff --git a/examples/wav2vec/unsupervised/data/random_input_dataset.py b/examples/wav2vec/unsupervised/data/random_input_dataset.py new file mode 100644 index 0000000000..886505616c --- /dev/null +++ b/examples/wav2vec/unsupervised/data/random_input_dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import random +from typing import List + +from fairseq.data import BaseWrapperDataset, data_utils + + +class RandomInputDataset(BaseWrapperDataset): + def __init__( + self, + dataset, + random_input_dataset, + input_key_path: List[str], + add_to_input, + pad_idx, + ): + super().__init__(dataset) + self.random_input_dataset = random_input_dataset + if isinstance(input_key_path, str): + input_key_path = [input_key_path] + assert len(input_key_path) > 0 + self.input_key_path = input_key_path + self.add_to_input = add_to_input + self.pad_idx = pad_idx + + def get_target(self, item): + target_loc = item + for p in self.input_key_path[:-1]: + target_loc = target_loc[p] + return self.input_key_path[-1], target_loc + + def get_target_value(self, item): + k, target_loc = self.get_target(item) + return target_loc[k] + + def __getitem__(self, index): + item = self.dataset[index] + k, target_loc = self.get_target(item) + target_loc[k] = random.choice(self.random_input_dataset) + return item + + def collater(self, samples): + collated = self.dataset.collater(samples) + if len(collated) == 0: + return collated + indices = set(collated["id"].tolist()) + + random_inputs = data_utils.collate_tokens( + [self.get_target_value(s) for s in samples if s["id"] in indices], + pad_idx=self.pad_idx, + left_pad=False, + ) + k, target_loc = self.get_target( + collated if not self.add_to_input else collated["net_input"] + ) + target_loc[k] = random_inputs + + return collated diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/README.md b/examples/wav2vec/unsupervised/kaldi_self_train/README.md new file mode 100644 index 0000000000..314984fcbb --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/README.md @@ -0,0 +1,56 @@ +# Self-Training with Kaldi HMM Models +This folder contains recipes for self-training on pseudo phone transcripts and +decoding into phones or words with [kaldi](https://github.com/kaldi-asr/kaldi). + +To start, download and install kaldi follow its instruction, and place this +folder in `path/to/kaldi/egs`. 
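+
+For example, a minimal setup sketch might look like the following (run from a
+fairseq checkout; the clone location is an assumption, and kaldi itself is built
+by following its own `INSTALL` instructions):
+```
+git clone https://github.com/kaldi-asr/kaldi.git
+# build kaldi (tools/ first, then src/) as described in its INSTALL files,
+# then place this folder under egs/:
+cp -r examples/wav2vec/unsupervised/kaldi_self_train kaldi/egs/
+cd kaldi/egs/kaldi_self_train/st
+```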
+
+## Training
+Assuming the following have been prepared:
+- `w2v_dir`: contains features `{train,valid}.{npy,lengths}`, real transcripts `{train,valid}.${label}`, and dict `dict.${label}.txt`
+- `lab_dir`: contains pseudo labels `{train,valid}.txt`
+- `arpa_lm`: Arpa-format n-gram phone LM for decoding
+- `arpa_lm_bin`: Arpa-format n-gram phone LM for unsupervised model selection to be used with KenLM
+
+Set these variables in `train.sh`, as well as `out_dir`, the output directory,
+and then run it.
+
+The output will be:
+```
+==== WER w.r.t. real transcript (select based on unsupervised metric)
+INFO:root:./out/exp/mono/decode_valid/scoring/14.0.0.tra.txt: score 0.9178 wer 28.71% lm_ppl 24.4500 gt_wer 25.57%
+INFO:root:./out/exp/tri1/decode_valid/scoring/17.1.0.tra.txt: score 0.9257 wer 26.99% lm_ppl 30.8494 gt_wer 21.90%
+INFO:root:./out/exp/tri2b/decode_valid/scoring/8.0.0.tra.txt: score 0.7506 wer 23.15% lm_ppl 25.5944 gt_wer 15.78%
+```
+where `wer` is the word error rate with respect to the pseudo labels, `gt_wer`
+the word error rate with respect to the ground-truth labels, `lm_ppl` the
+language model perplexity of the HMM-predicted transcripts, and `score` the
+unsupervised metric used for model selection. We choose the model and LM
+parameter with the lowest score. In the example above, that is `tri2b`, `8.0.0`.
+
+
+## Decoding into Phones
+In `decode_phone.sh`, set `out_dir` to the same value used in `train.sh`, and set
+`dec_exp` and `dec_lmparam` to the selected model and LM parameter (e.g.,
+`tri2b` and `8.0.0` in the above example). `dec_script` needs to be set
+according to `dec_exp`: for mono/tri1/tri2b, use `decode.sh`; for tri3b, use
+`decode_fmllr.sh`.
+
+The output will be saved to `out_dir/dec_data`.
+
+
+## Decoding into Words
+`decode_word_step1.sh` prepares WFSTs for word decoding. Besides the variables
+mentioned above, set
+- `wrd_arpa_lm`: Arpa-format n-gram word LM for decoding
+- `wrd_arpa_lm_bin`: Arpa-format n-gram word LM for unsupervised model selection
+
+`decode_word_step1.sh` decodes the `train` and `valid` splits into words and runs
+unsupervised model selection using the `valid` split. The output looks like:
+```
+INFO:root:./out/exp/tri2b/decodeword_valid/scoring/17.0.0.tra.txt: score 1.8693 wer 24.97% lm_ppl 1785.5333 gt_wer 31.45%
+```
+
+After determining the LM parameter (`17.0.0` in the example above), set it in
+`decode_word_step2.sh` and run it. The output will be saved to
+`out_dir/dec_data_word`.
diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh
new file mode 100644
index 0000000000..e74953194d
--- /dev/null
+++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh
@@ -0,0 +1,15 @@
+# You can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
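+#
+# For illustration only (an assumption about your cluster, not part of this
+# recipe's defaults): on a GridEngine queue the same variables might instead
+# be set along these lines:
+#   export train_cmd="queue.pl --mem 2G"
+#   export decode_cmd="queue.pl --mem 4G"
+#   export mkgraph_cmd="queue.pl --mem 8G"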
+ +export train_cmd="run.pl --mem 2G" +export decode_cmd="run.pl --mem 4G" +export mkgraph_cmd="run.pl --mem 8G" diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_phone.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_phone.sh new file mode 100644 index 0000000000..947342a0b7 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_phone.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# decode into phones (and prepare a new data directory for HMM outputs) + +. ./path.sh + +set -eu + +out_dir= # same as in train.sh +dec_lmparam= # LM hyperparameters (e.g., 7.0.0) +dec_exp= +dec_script= +dec_splits="train valid" +dec_data_dir=$out_dir/dec_data # where to write HMM output + +data_dir=${out_dir}/data + +local/decode.sh --nj 40 --graph_name graph \ + --val_sets "$dec_splits" --decode_script $dec_script \ + $out_dir/exp/$dec_exp $data_dir $data_dir/lang_test + +if [ ! -z $dec_lmparam ]; then + for x in $dec_splits; do + mkdir -p $dec_data_dir/$x + cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $dec_data_dir/$x/ + + tra=$out_dir/exp/$dec_exp/decode_${x}/scoring/${dec_lmparam}.tra + cat $tra | utils/int2sym.pl -f 2- $data_dir/lang/words.txt | \ + sed 's:<UNK>::g' | sed 's:<SIL>::g' > $dec_data_dir/${x}/text + utils/fix_data_dir.sh $dec_data_dir/${x} + echo "WER on ${x} is" $(compute-wer ark:$data_dir/${x}_gt/text ark:$dec_data_dir/$x/text | cut -d" " -f2-) + done +fi diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step1.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step1.sh new file mode 100644 index 0000000000..c1276bbe4d --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step1.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# prepare word WFSTs, reference data, and decode + +set -eu + +w2v_dir= # same as in train.sh +out_dir= # same as in train.sh +lexicon= # word to phone mapping +wrd_arpa_lm= # word LM +wrd_arpa_lm_bin= # word LM for KenLM, used in unsupervised selection + +dec_exp= # what HMM stage to decode (e.g., tri3b) +dec_script= # what decoding script to use (e.g., steps/decode_fmllr.sh) +phn_label=phnc +wrd_label=wrd +dec_suffix=word +dec_splits="train valid" +valid_split="valid" + +data_dir=$out_dir/data +wrd_data_dir=$out_dir/data_word + +lexicon_clean=$(mktemp) +cat $lexicon | sort | uniq > $lexicon_clean +local/prepare_lang_word.sh $w2v_dir/dict.${phn_label}.txt $data_dir $lexicon_clean && rm $lexicon_clean +local/prepare_lm.sh --langdir $data_dir/lang_word --lmdir $data_dir/lang_test_word $wrd_arpa_lm $data_dir + +for x in $dec_splits; do + x_gt=${x}_gt + mkdir -p $wrd_data_dir/$x_gt + cp $data_dir/$x_gt/{feats.scp,cmvn.scp,utt2spk,spk2utt} $wrd_data_dir/$x_gt/ + python local/copy_aligned_text.py < $w2v_dir/$x.$wrd_label > $wrd_data_dir/$x_gt/text +done + +local/decode.sh --nj 40 --graph_name graph${dec_suffix} --decode_suffix $dec_suffix \ + --val_sets "$dec_splits" --decode_script $dec_script \ + $out_dir/exp/$dec_exp $data_dir $data_dir/lang_test_word + +local/unsup_select_decode_word.sh \ + --split $valid_split --kenlm_path $wrd_arpa_lm_bin \ + --ref_txt $wrd_data_dir/${valid_split}_gt/text \ + --psd_txt $data_dir/${valid_split}/text \ + --dec_name decode${dec_suffix} --graph_name graph${dec_suffix} \ + --phonemize_lexicon $data_dir/local/dict_word/lexicon.txt \ + $out_dir/exp diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step2.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step2.sh new file mode 100644 index 
0000000000..59a6cbb125 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step2.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# prepare a new data directory of HMM word output + +. ./path.sh + +set -eu + +out_dir= # same as in train.sh +dec_lmparam= # LM hyperparameters (e.g., 7.0.0) + +dec_exp=tri3b # what HMM stage to decode (e.g., tri3b) +dec_suffix=word +dec_splits="train valid" +dec_data_dir=$out_dir/dec_data_word # where to write HMM output + +data_dir=$out_dir/data +wrd_data_dir=$out_dir/data_word + +for x in $dec_splits; do + mkdir -p $dec_data_dir/$x + cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $dec_data_dir/$x/ + + tra=$out_dir/exp/$dec_exp/decode${dec_suffix}_${x}/scoring/${dec_lmparam}.tra + cat $tra | utils/int2sym.pl -f 2- $data_dir/lang_word/words.txt | \ + sed 's:<UNK>::g' | sed 's:<SIL>::g' > $dec_data_dir/$x/text + utils/fix_data_dir.sh $dec_data_dir/$x + echo "WER on $x is" $(compute-wer ark:$wrd_data_dir/${x}_gt/text ark:$dec_data_dir/$x/text | cut -d" " -f2-) +done + diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/copy_aligned_text.py b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/copy_aligned_text.py new file mode 100644 index 0000000000..5f4faa9921 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/copy_aligned_text.py @@ -0,0 +1,4 @@ +import sys + +for idx, line in enumerate(sys.stdin): + print(f"utt{idx:010d} {line}", end='') \ No newline at end of file diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/decode.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/decode.sh new file mode 100755 index 0000000000..811cb63c88 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/decode.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -u + +val_sets="dev_other" +graph_name=graph +decode_suffix="" +decode_script="steps/decode_fmllr.sh" +decode_args="" +nj=60 + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +set -x +exp_dir=$1 +data_root=$2 +lang_test=$3 + +graph=$exp_dir/$graph_name + +if [ ! -d $graph ]; then + utils/mkgraph.sh $lang_test $exp_dir $graph +fi + +for part in $val_sets; do + dec_dir=$exp_dir/decode${decode_suffix}_${part} + if [ ! -d $dec_dir ]; then + echo "decoding $part for $exp_dir" + $decode_script --nj $nj --cmd "$decode_cmd" $decode_args \ + $graph $data_root/$part $dec_dir & + else + echo "$dec_dir exists. 
skip" + fi +done + +wait diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_data_from_w2v.py b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_data_from_w2v.py new file mode 100644 index 0000000000..66954ea5c9 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_data_from_w2v.py @@ -0,0 +1,56 @@ +import kaldi_io +import numpy as np +import os + + +def get_parser(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("w2v_dir", help="wav2vec feature and text directory") + parser.add_argument("tar_root", help="output data directory in kaldi's format") + parser.add_argument("split", help="name of the subset") + parser.add_argument("--label", default="", help="if specified, copy labels too") + return parser + +def main(): + parser = get_parser() + args = parser.parse_args() + + tar_dir = os.path.join(args.tar_root, args.split) + os.makedirs(tar_dir, exist_ok=True) + + lengths_path = os.path.join(args.w2v_dir, f"{args.split}.lengths") + with open(lengths_path) as f: + lengths = [int(line.rstrip()) for line in f] + offsets = [0] + np.cumsum(lengths[:-1]).tolist() + feats = np.load( + os.path.join(args.w2v_dir, f"{args.split}.npy"), + mmap_mode="r" + ) + assert feats.shape[0] == sum(lengths), \ + f"lengths mismatch {feats.shape[0]} != {sum(lengths)}" + + ark_path = os.path.join(tar_dir, "feats.ark") + scp_path = os.path.join(tar_dir, "feats.scp") + wspec = f"ark:| copy-feats --compress=true ark:- ark,scp:{ark_path},{scp_path}" + with kaldi_io.open_or_fd(wspec, "wb") as f: + for idx, (offset, length) in enumerate(zip(offsets, lengths)): + feat = feats[offset:offset+length] + kaldi_io.write_mat(f, feat, key=f"utt{idx:010d}") + + u2s_path = os.path.join(tar_dir, "utt2spk") + s2u_path = os.path.join(tar_dir, "spk2utt") + with open(u2s_path, "w") as f_u2s, open(s2u_path, "w") as f_s2u: + for idx in range(len(lengths)): + f_u2s.write(f"utt{idx:010d} utt{idx:010d}\n") + f_s2u.write(f"utt{idx:010d} utt{idx:010d}\n") + + if bool(args.label): + lab_path = os.path.join(args.w2v_dir, f"{args.split}.{args.label}") + txt_path = os.path.join(tar_dir, "text") + with open(lab_path) as f_lab, open(txt_path, "w") as f_txt: + for idx, line in enumerate(f_lab): + f_txt.write(f"utt{idx:010d} {line}") + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh new file mode 100755 index 0000000000..e9a80001eb --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +sil_prob=0.5 +num_sil_states=3 +num_nonsil_states=1 + +. ./cmd.sh +. ./path.sh +. 
parse_options.sh + +set -eux + +dict=$1 +data_dir=$2 + +dict_dir=$data_dir/local/dict +tmplm_dir=$data_dir/local/lang_tmp +lm_dir=$data_dir/lang + +mkdir -p $dict_dir $tmplm_dir $lm_dir + +# prepare dict +echo "SIL" > $dict_dir/silence_phones.txt +echo "SIL" > $dict_dir/optional_silence.txt +awk '{print $1}' $dict > $dict_dir/nonsilence_phones.txt + +echo "SIL SIL" > $dict_dir/lexicon.txt +echo "<UNK> SIL" >> $dict_dir/lexicon.txt +awk '{print $1" "$1}' $dict >> $dict_dir/lexicon.txt + +echo "SIL" > $dict_dir/extra_questions.txt +awk '{printf $1" "} END {printf "\n"}' $dict >> $dict_dir/extra_questions.txt + +# prepare lang +utils/prepare_lang.sh --sil-prob $sil_prob --position-dependent-phones false \ + --num_sil_states $num_sil_states --num_nonsil_states $num_nonsil_states \ + $dict_dir "<UNK>" $tmplm_dir $lm_dir diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang_word.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang_word.sh new file mode 100755 index 0000000000..a7ea3877be --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang_word.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +num_sil_states=3 +num_nonsil_states=1 + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +set -eux + +dict=$1 +data_dir=$2 +lexicon=$3 + +dict_dir=$data_dir/local/dict_word +tmplm_dir=$data_dir/local/lang_tmp_word +lm_dir=$data_dir/lang_word + +mkdir -p $dict_dir $tmplm_dir $lm_dir + +# prepare dict +echo "SIL" > $dict_dir/silence_phones.txt +echo "SIL" > $dict_dir/optional_silence.txt +awk '{print $1}' $dict > $dict_dir/nonsilence_phones.txt + +(echo "!SIL SIL"; echo "<UNK> SIL";) | cat - $lexicon > $dict_dir/lexicon.txt + +echo "SIL" > $dict_dir/extra_questions.txt +awk '{printf $1" "} END {printf "\n"}' $dict >> $dict_dir/extra_questions.txt + +# prepare lang +utils/prepare_lang.sh --position-dependent-phones false \ + --num_sil_states $num_sil_states --num_nonsil_states $num_nonsil_states \ + $dict_dir "<UNK>" $tmplm_dir $lm_dir diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh new file mode 100755 index 0000000000..c2edcefede --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +langdir="" +lmdir="" + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +arpa_lm=$1 +data=$2 + +if [ -z $langdir ]; then + langdir=$data/lang +fi +if [ -z $lmdir ]; then + lmdir=$data/lang_test +fi + +if [ ! -d $langdir ]; then + echo "$langdir not found. run local/prepare_lang.sh first" && exit 1 +fi + +mkdir -p $lmdir +cp -r $langdir/* $lmdir + +if [[ "$arpa_lm" == *.gz ]]; then + gunzip -c $arpa_lm | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt - $lmdir/G.fst +else + arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt $arpa_lm $lmdir/G.fst +fi +fstisstochastic $lmdir/G.fst +utils/validate_lang.pl $lmdir || exit 1 + +echo "done preparing lm ($lmdir)" diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh new file mode 100755 index 0000000000..cb5bbb7277 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014 Guoguo Chen +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. 
+cmd=run.pl +stage=0 +decode_mbr=true +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring " + echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; +done + +# Note: the double level of quoting for the sed command +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ + cat $dir/scoring/LMWT.$wip.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; +done + +exit 0; diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh new file mode 100755 index 0000000000..9ecf1690c6 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +split="dev_other" +ref_data="" +get_best_wer=true +dec_name="decode" +graph_name="graph" + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +exp_root=$1 + +set -eu + +echo "==== WER w.r.t. pseudo transcript" +for x in $exp_root/*/${dec_name}_${split}*; do grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh; done + + +if [ ! -z $ref_data ]; then + echo "==== WER w.r.t. real transcript (select based on pseudo WER)" + ref_txt=$ref_data/$split/text + for x in $exp_root/*/${dec_name}_${split}*; do + lang=$(dirname $x)/$graph_name + + lmwt=$( + grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh | + sed 's/.*wer_\(.*\)$/\1/g' | sed 's/_/./g' + ) + tra=$x/scoring/$lmwt.tra + cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \ + compute-wer --text --mode=present \ + ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra + done +fi + +if [ ! -z $ref_data ] && $get_best_wer; then + echo "==== WER w.r.t. 
real transcript (select based on true WER)" + ref_txt=$ref_data/$split/text + for x in $exp_root/*/${dec_name}_${split}*; do + lang=$(dirname $x)/$graph_name + + for tra in $x/scoring/*.tra; do + cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \ + compute-wer --text --mode=present \ + ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra + done | sort -k2n | head -n1 + done +fi + +exit 0; diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh new file mode 100755 index 0000000000..913c1d8e43 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash + +out_root=/tmp +out_name=train_${RANDOM} +num_nonsil_states=1 + +valid="dev_other" +train="train" +mono_size="-1" # 2000 +tri1_size="-1" # 5000 +tri2b_size="-1" # 10000 +tri3b_size="-1" # 10000 + +# Acoustic model parameters +numLeavesTri1=2000 +numGaussTri1=10000 +numLeavesMLLT=2500 +numGaussMLLT=15000 +numLeavesSAT=2500 +numGaussSAT=15000 + +stage=1 +max_stage=1 + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +data=$1 +lang=$2 +lang_test=$3 + +exp_root=$out_root/$out_name + +# you might not want to do this for interactive shells. +set -e + + +if [ $stage -le 1 ] && [ $max_stage -ge 1 ]; then + # train a monophone system + if [ ! $mono_size -eq -1 ]; then + utils/subset_data_dir.sh $data/$train $mono_size $data/${train}_${mono_size} + mono_train=${train}_${mono_size} + else + mono_train=${train} + fi + + steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ + --initial-beam 40 --regular-beam 60 --retry-beam 120 \ + $data/$mono_train $lang $exp_root/mono + + utils/mkgraph.sh $lang_test $exp_root/mono $exp_root/mono/graph + steps/decode.sh --nj 20 --cmd "$decode_cmd" \ + $exp_root/mono/graph $data/$valid $exp_root/mono/decode_$valid & +fi + + +if [ $stage -le 2 ] && [ $max_stage -ge 2 ]; then + # train a first delta + delta-delta triphone system on a subset of 5000 utterances + if [ ! $tri1_size -eq -1 ]; then + utils/subset_data_dir.sh $data/$train $tri1_size $data/${train}_${tri1_size} + tri1_train=${train}_${tri1_size} + else + tri1_train=${train} + fi + + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + $data/$tri1_train $lang \ + $exp_root/mono $exp_root/mono_ali_${tri1_train} + + steps_gan/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + --num_nonsil_states $num_nonsil_states $numLeavesTri1 $numGaussTri1 \ + $data/$tri1_train $lang \ + $exp_root/mono_ali_${tri1_train} $exp_root/tri1 + + utils/mkgraph.sh $lang_test $exp_root/tri1 $exp_root/tri1/graph + steps/decode.sh --nj 20 --cmd "$decode_cmd" \ + $exp_root/tri1/graph $data/$valid $exp_root/tri1/decode_$valid & +fi + +if [ $stage -le 3 ] && [ $max_stage -ge 3 ]; then + # train an LDA+MLLT system. + if [ ! 
$tri2b_size -eq -1 ]; then + utils/subset_data_dir.sh $data/$train $tri2b_size $data/${train}_${tri2b_size} + tri2b_train=${train}_${tri2b_size} + else + tri2b_train=${train} + fi + + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + $data/$tri2b_train $lang \ + $exp_root/tri1 $exp_root/tri1_ali_${tri2b_train} + + steps_gan/train_lda_mllt.sh --cmd "$train_cmd" \ + --num_nonsil_states $num_nonsil_states \ + --splice-opts "--left-context=3 --right-context=3" $numLeavesMLLT $numGaussMLLT \ + $data/$tri2b_train $lang \ + $exp_root/tri1_ali_${tri2b_train} $exp_root/tri2b + + utils/mkgraph.sh $lang_test $exp_root/tri2b $exp_root/tri2b/graph + steps/decode.sh --nj 20 --cmd "$decode_cmd" \ + $exp_root/tri2b/graph $data/$valid $exp_root/tri2b/decode_$valid & +fi + + +if [ $stage -le 4 ] && [ $max_stage -ge 4 ]; then + # Train tri3b, which is LDA+MLLT+SAT on 10k utts + if [ ! $tri3b_size -eq -1 ]; then + utils/subset_data_dir.sh $data/$train $tri3b_size $data/${train}_${tri3b_size} + tri3b_train=${train}_${tri3b_size} + else + tri3b_train=${train} + fi + + steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + $data/$tri3b_train $lang \ + $exp_root/tri2b $exp_root/tri2b_ali_${tri2b_train} + + steps_gan/train_sat.sh --cmd "$train_cmd" \ + --num_nonsil_states $num_nonsil_states $numLeavesSAT $numGaussSAT \ + $data/$tri3b_train $lang \ + $exp_root/tri2b_ali_${tri2b_train} $exp_root/tri3b + + utils/mkgraph.sh $lang_test $exp_root/tri3b $exp_root/tri3b/graph + steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ + $exp_root/tri3b/graph $data/$valid $exp_root/tri3b/decode_$valid & +fi + +wait diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py new file mode 100644 index 0000000000..1122c88c19 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py @@ -0,0 +1,135 @@ +""" +Implement unsupervised metric for decoding hyperparameter selection: + $$ alpha * LM_PPL + ViterbitUER(%) * 100 $$ +""" +import argparse +import logging +import math +import sys + +import kenlm +import editdistance +from g2p_en import G2p + +logging.root.setLevel(logging.INFO) +logging.basicConfig(stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger(__name__) + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("ref_tra", help="reference pseudo labels") + parser.add_argument("hyp_tra", help="decoded pseudo labels to be assess") + parser.add_argument("--kenlm_path", default="/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o5.bin", help="") + parser.add_argument("--uppercase", action="store_true", help="") + parser.add_argument("--skipwords", default="", help="") + parser.add_argument("--gt_tra", default="", help="ground truth pseudo labels for computing oracle WER") + parser.add_argument("--min_vt_uer", default=0.0, type=float) + parser.add_argument("--phonemize", action="store_true", help="phonemize word hypotheses, used when reference is phone transcript") + parser.add_argument("--phonemize_lexicon", default="", type=str, help="use a lexicon for phonemizing") + return parser + +def load_tra(tra_path): + with open(tra_path, "r") as f: + uid_to_tra = {} + for line in f: + toks = line.rstrip().split() + uid, tra = toks[0], " ".join(toks[1:]) + uid_to_tra[uid] = tra + logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}") + return uid_to_tra + +def load_lex(lex_path): + with open(lex_path, "r") as f: + w2p = {} + for 
line in f: + w, p = line.rstrip().split(None, 1) + w2p[w] = p.split() + return w2p + +def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict): + d_cnt = 0 + w_cnt = 0 + w_cnt_h = 0 + for uid in hyp_uid_to_tra: + ref = ref_uid_to_tra[uid].split() + if g2p_dict is not None: + hyp = [] + for word in hyp_uid_to_tra[uid].split(): + if word in g2p_dict: + hyp = hyp + g2p_dict[word] + else: + logger.warning(f"{word} not in g2p_dict") + elif g2p is not None: + hyp = g2p(hyp_uid_to_tra[uid]) + hyp = [p for p in hyp if p != "'" and p != " "] + hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp] + else: + hyp = hyp_uid_to_tra[uid].split() + logger.debug(( + f"======================\n" + f"HYP: {' '.join(hyp)}\n" + f"REF: {' '.join(ref)}" + )) + d_cnt += editdistance.eval(ref, hyp) + w_cnt += len(ref) + w_cnt_h += len(hyp) + wer = float(d_cnt) / w_cnt + logger.debug(( + f"wer = {wer*100:.2f}%; num. of ref words = {w_cnt}; " + f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}" + )) + return wer + +def compute_lm_ppl(hyp_uid_to_tra, score_fn): + lm_score = 0. + w_cnt = 0 + for hyp in hyp_uid_to_tra.values(): + cur_score = score_fn(hyp) + cur_cnt = len(hyp.split()) + 1 # plus one for </s> + lm_score += cur_score + w_cnt += cur_cnt + logger.debug(( + f"======================\n" + f"score sum/avg = {cur_score:.2f}/{cur_score/cur_cnt:.2f}\n" + f"hyp = {hyp}" + )) + lm_ppl = math.pow(10, -lm_score / w_cnt) + logger.debug(f"lm ppl = {lm_ppl:.2f}; num. of words = {w_cnt}") + return lm_ppl + +def main(): + args = get_parser().parse_args() + logger.debug(f"Args: {args}") + + ref_uid_to_tra = load_tra(args.ref_tra) + hyp_uid_to_tra = load_tra(args.hyp_tra) + assert not bool(set(hyp_uid_to_tra.keys()) - set(ref_uid_to_tra.keys())) + + lm = kenlm.Model(args.kenlm_path) + skipwords = set(args.skipwords.split(",")) + def compute_lm_score(s): + s = " ".join(w for w in s.split() if w not in skipwords) + s = s.upper() if args.uppercase else s + return lm.score(s) + + g2p, g2p_dict = None, None + if args.phonemize: + if args.phonemize_lexicon: + g2p_dict = load_lex(args.phonemize_lexicon) + else: + g2p = G2p() + + wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict) + lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score) + + gt_wer = -math.inf + if args.gt_tra: + gt_uid_to_tra = load_tra(args.gt_tra) + gt_wer = compute_wer(gt_uid_to_tra, hyp_uid_to_tra, None, None) + + score = math.log(lm_ppl) * max(wer, args.min_vt_uer) + logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%") + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh new file mode 100755 index 0000000000..b34c5b6e06 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +split="dev_other" +ref_txt="" # ground truth transcript path +psd_txt="" # pseudo transcript path +get_best_wer=true +dec_name="decode" +graph_name="graph" +kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +exp_root=$1 +unsup_args="" +if [ $# -ge 2 ]; then + unsup_args=$2 +fi + +set -eu + +if [ ! -z $ref_txt ] && $get_best_wer; then + echo "==== WER w.r.t. 
real transcript (select based on unsupervised metric)" + for x in $exp_root/*/${dec_name}_${split}*; do + lang=$(dirname $x)/$graph_name + + ( + for tra in $x/scoring/*.tra; do + cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' > $tra.txt + python local/unsup_select.py $psd_txt $tra.txt --kenlm_path $kenlm_path --gt_tra $ref_txt $unsup_args + done 2>/dev/null | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1 + ) & + done +fi +wait + diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh new file mode 100755 index 0000000000..c10a6b8809 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode_word.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +split="dev_other" +ref_txt="" # ground truth transcript path +psd_txt="" # pseudo transcript path +get_best_wer=true +dec_name="decode" +graph_name="graph" +kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin +phonemize_lexicon="" + +. ./cmd.sh +. ./path.sh +. parse_options.sh +. /private/home/wnhsu/unsup_asr/fairseq-py-unsup/env.sh + +exp_root=$1 + +set -eu + +if [ ! -z $ref_txt ] && $get_best_wer; then + echo "==== WER w.r.t. real transcript (select based on unsupervised metric)" + for x in $exp_root/*/${dec_name}_${split}*; do + lang=$(dirname $x)/$graph_name + + for tra in $x/scoring/*.tra; do + cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:\<UNK\>::g' > $tra.txt + python local/unsup_select.py $psd_txt $tra.txt \ + --kenlm_path $kenlm_path --gt_tra $ref_txt --phonemize \ + --phonemize_lexicon "$phonemize_lexicon" + done | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1 + done +fi + + diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/path.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/path.sh new file mode 100755 index 0000000000..1a6fb5f891 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/steps b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps new file mode 120000 index 0000000000..6e99bf5b5a --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh new file mode 100755 index 0000000000..af68715ab0 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh @@ -0,0 +1,175 @@ +#!/usr/bin/env bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration. +stage=-4 # This allows restarting after partway, when something when wrong. +config= +cmd=run.pl +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +realign_iters="10 20 30"; +num_iters=35 # Number of iterations of training +max_iter_inc=25 # Last iter to increase #Gauss on. 
+beam=10 +careful=false +retry_beam=40 +boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment +power=0.25 # Exponent for number of gaussians according to occurrence counts +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +norm_vars=false # deprecated. Prefer --cmvn-opts "--norm-vars=true" + # use the option --cmvn-opts "--norm-means=false" +cmvn_opts= +delta_opts= +context_opts= # use"--context-width=5 --central-position=2" for quinphone +num_nonsil_states=3 +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh; +. parse_options.sh || exit 1; + +if [ $# != 6 ]; then + echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>" + echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1" + echo "main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --config <config-file> # config containing options" + echo " --stage <stage> # stage to do partial re-run from." + exit 1; +fi + +numleaves=$1 +totgauss=$2 +data=$3 +lang=$4 +alidir=$5 +dir=$6 + +for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do + [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1; +done + +numgauss=$numleaves +incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss +oov=`cat $lang/oov.int` || exit 1; +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +nj=`cat $alidir/num_jobs` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +sdata=$data/split$nj; +split_data.sh $data $nj || exit 1; + + +[ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \ + echo "$0: warning: ignoring CMVN options from source directory $alidir" +$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts" +echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN. +[ ! -z $delta_opts ] && echo $delta_opts > $dir/delta_opts + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |" + +rm $dir/.error 2>/dev/null + +if [ $stage -le -3 ]; then + echo "$0: accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + acc-tree-stats $context_opts \ + --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; + sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -2 ]; then + echo "$0: getting questions for tree-building, via clustering" + # preparing questions, roots file... 
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts \ + $dir/treeacc $lang/phones/sets.int \ + $dir/questions.int 2> $dir/log/questions.log || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + compile-questions $context_opts $lang/topo $dir/questions.int \ + $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + + echo "$0: building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + + $cmd $dir/log/init_model.log \ + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl || exit 1; + if grep 'no stats' $dir/log/init_model.log; then + echo "** The warnings above about 'no stats' generally mean you have phones **" + echo "** (or groups of phones) in your phone set that had no corresponding data. **" + echo "** You should probably figure out whether something went wrong, **" + echo "** or whether your data just doesn't happen to have examples of those **" + echo "** phones. **" + fi + + gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the alignments. + echo "$0: converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 0 ]; then + echo "$0: compiling graphs of transcripts" + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + +x=1 +while [ $x -lt $num_iters ]; do + echo "$0: training pass $x" + if [ $stage -le $x ]; then + if echo $realign_iters | grep -w $x >/dev/null; then + echo "$0: aligning data" + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$feats" \ + "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; + $cmd $dir/log/update.$x.log \ + gmm-est --mix-up=$numgauss --power=$power \ + --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ + "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; + rm $dir/$x.mdl $dir/$x.*.acc + rm $dir/$x.occs + fi + [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; + x=$[$x+1]; +done + +rm $dir/final.mdl $dir/final.occs 2>/dev/null +ln -s $x.mdl $dir/final.mdl +ln -s $x.occs $dir/final.occs + +steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir + +# Summarize warning messages... 
+utils/summarize_warnings.pl $dir/log + +steps/info/gmm_dir_info.pl $dir + +echo "$0: Done training system with delta+delta-delta features in $dir" + +exit 0 diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh new file mode 100755 index 0000000000..9d8c319ce8 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh @@ -0,0 +1,239 @@ +#!/usr/bin/env bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# +# LDA+MLLT refers to the way we transform the features after computing +# the MFCCs: we splice across several frames, reduce the dimension (to 40 +# by default) using Linear Discriminant Analysis), and then later estimate, +# over multiple iterations, a diagonalizing transform known as MLLT or STC. +# See http://kaldi-asr.org/doc/transform.html for more explanation. +# +# Apache 2.0. + +# Begin configuration. +cmd=run.pl +config= +stage=-5 +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +realign_iters="10 20 30"; +mllt_iters="2 4 6 12"; +num_iters=35 # Number of iterations of training +max_iter_inc=25 # Last iter to increase #Gauss on. +dim=40 +beam=10 +retry_beam=40 +careful=false +boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment +power=0.25 # Exponent for number of gaussians according to occurrence counts +randprune=4.0 # This is approximately the ratio by which we will speed up the + # LDA and MLLT calculations via randomized pruning. +splice_opts= +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +norm_vars=false # deprecated. Prefer --cmvn-opts "--norm-vars=false" +cmvn_opts= +context_opts= # use "--context-width=5 --central-position=2" for quinphone. +# End configuration. +train_tree=true # if false, don't actually train the tree. +use_lda_mat= # If supplied, use this LDA[+MLLT] matrix. +num_nonsil_states=3 + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 6 ]; then + echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>" + echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --config <config-file> # config containing options" + echo " --stage <stage> # stage to do partial re-run from." + exit 1; +fi + +numleaves=$1 +totgauss=$2 +data=$3 +lang=$4 +alidir=$5 +dir=$6 + +for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do + [ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1; +done + +numgauss=$numleaves +incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment +oov=`cat $lang/oov.int` || exit 1; +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; + +mkdir -p $dir/log + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options + # so that later stages of system building can know what they were. 
+ + +[ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \ + echo "$0: warning: ignoring CMVN options from source directory $alidir" +$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts" +echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN. + +sdata=$data/split$nj; +split_data.sh $data $nj || exit 1; + +splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |" +# Note: $feats gets overwritten later in the script. +feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |" + + + +if [ $stage -le -5 ]; then + if [ -z "$use_lda_mat" ]; then + echo "$0: Accumulating LDA statistics." + rm $dir/lda.*.acc 2>/dev/null + $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ + acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \ + $dir/lda.JOB.acc || exit 1; + est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \ + 2>$dir/log/lda_est.log || exit 1; + rm $dir/lda.*.acc + else + echo "$0: Using supplied LDA matrix $use_lda_mat" + cp $use_lda_mat $dir/0.mat || exit 1; + [ ! -z "$mllt_iters" ] && \ + echo "$0: Warning: using supplied LDA matrix $use_lda_mat but we will do MLLT," && \ + echo " which you might not want; to disable MLLT, specify --mllt-iters ''" && \ + sleep 5 + fi +fi + +cur_lda_iter=0 + +if [ $stage -le -4 ] && $train_tree; then + echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + acc-tree-stats $context_opts \ + --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; + [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... + cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts $dir/treeacc $lang/phones/sets.int \ + $dir/questions.int 2> $dir/log/questions.log || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + compile-questions $context_opts $lang/topo $dir/questions.int \ + $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + if $train_tree; then + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc + else + cp $alidir/tree $dir/ || exit 1; + $cmd JOB=1 $dir/log/init_model.log \ + gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \ + "$feats subset-feats ark:- ark:-|" || exit 1; + fi +fi + + +if [ $stage -le -1 ]; then + # Convert the alignments. 
+ echo "$0: Converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then + echo "$0: Compiling graphs of transcripts" + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + + +x=1 +while [ $x -lt $num_iters ]; do + echo Training pass $x + if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then + echo Aligning data + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + if echo $mllt_iters | grep -w $x >/dev/null; then + if [ $stage -le $x ]; then + echo "$0: Estimating MLLT" + $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \ + gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \ + || exit 1; + est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1; + gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \ + 2> $dir/log/transform_means.$x.log || exit 1; + compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1; + rm $dir/$x.*.macc + fi + feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |" + cur_lda_iter=$x + fi + + if [ $stage -le $x ]; then + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$feats" \ + "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; + $cmd $dir/log/update.$x.log \ + gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \ + $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; + rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs + fi + [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; + x=$[$x+1]; +done + +rm $dir/final.{mdl,mat,occs} 2>/dev/null +ln -s $x.mdl $dir/final.mdl +ln -s $x.occs $dir/final.occs +ln -s $cur_lda_iter.mat $dir/final.mat + +steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir + +# Summarize warning messages... +utils/summarize_warnings.pl $dir/log + +steps/info/gmm_dir_info.pl $dir + +echo "$0: Done training system with LDA+MLLT features in $dir" + +exit 0 diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh new file mode 100755 index 0000000000..f75afafb1c --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh @@ -0,0 +1,281 @@ +#!/usr/bin/env bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + + +# This does Speaker Adapted Training (SAT), i.e. train on +# fMLLR-adapted features. It can be done on top of either LDA+MLLT, or +# delta and delta-delta features. 
If there are no transforms supplied +# in the alignment directory, it will estimate transforms itself before +# building the tree (and in any case, it estimates transforms a number +# of times during training). + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +fmllr_update_type=full +cmd=run.pl +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +careful=false +boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +realign_iters="10 20 30"; +fmllr_iters="2 4 6 12"; +silence_weight=0.0 # Weight on silence in fMLLR estimation. +num_iters=35 # Number of iterations of training +max_iter_inc=25 # Last iter to increase #Gauss on. +power=0.2 # Exponent for number of gaussians according to occurrence counts +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +phone_map= +train_tree=true +tree_stats_opts= +cluster_phones_opts= +compile_questions_opts= +# End configuration section. +num_nonsil_states=3 + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 6 ]; then + echo "Usage: steps/train_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>" + echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." + echo " --config <config-file> # config containing options" + echo " --stage <stage> # stage to do partial re-run from." + exit 1; +fi + +numleaves=$1 +totgauss=$2 +data=$3 +lang=$4 +alidir=$5 +dir=$6 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +numgauss=$numleaves +incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` +phone_map_opt= +[ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'" + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. + +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +## Set up speaker-independent features. 
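+# The speaker-independent features mirror the setup of the alignment directory:
+# CMVN plus add-deltas when feat_type is "delta", or CMVN, frame splicing and the
+# LDA(+MLLT) transform copied from $alidir/final.mat when feat_type is "lda".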
+case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + cp $alidir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +## Get initial fMLLR transforms (possibly from alignment dir) +if [ -f $alidir/trans.1 ]; then + echo "$0: Using transforms from $alidir" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" + cur_trans_dir=$alidir +else + if [ $stage -le -5 ]; then + echo "$0: obtaining initial fMLLR transforms since not present in $alidir" + # The next line is necessary because of $silphonelist otherwise being incorrect; would require + # old $lang dir which would require another option. Not needed anyway. + [ ! -z "$phone_map" ] && \ + echo "$0: error: you must provide transforms if you use the --phone-map option." && exit 1; + $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl "$sifeats" \ + ark:- ark:$dir/trans.JOB || exit 1; + fi + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" + cur_trans_dir=$dir +fi + +if [ $stage -le -4 ] && $train_tree; then + # Get tree stats. + echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... 
+ cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) \ + $cluster_phones_opts $context_opts \ + $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + if $train_tree; then + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc + else + cp $alidir/tree $dir/ || exit 1; + $cmd JOB=1 $dir/log/init_model.log \ + gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \ + "$feats subset-feats ark:- ark:-|" || exit 1; + fi +fi + +if [ $stage -le -1 ]; then + # Convert the alignments. + echo "$0: Converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali $phone_map_opt $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +[ "$exit_stage" -eq 0 ] && echo "$0: Exiting early: --exit-stage $exit_stage" && exit 0; + +if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then + echo "$0: Compiling graphs of transcripts" + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \ + "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + +x=1 +while [ $x -lt $num_iters ]; do + echo Pass $x + if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then + echo Aligning data + mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" + $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + fi + + if echo $fmllr_iters | grep -w $x >/dev/null; then + if [ $stage -le $x ]; then + echo Estimating fMLLR transforms + # We estimate a transform that's additional to the previous transform; + # we'll compose them. + $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \ + "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1; + for n in `seq $nj`; do + ! 
( compose-transforms --b-is-affine=true \ + ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \ + && mv $dir/composed_trans.$n $dir/trans.$n && \ + rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \ + && echo "$0: Error composing transforms" && exit 1; + done + fi + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + cur_trans_dir=$dir + fi + + if [ $stage -le $x ]; then + $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ + gmm-acc-stats-ali $dir/$x.mdl "$feats" \ + "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1; + [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1; + $cmd $dir/log/update.$x.log \ + gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ + "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; + rm $dir/$x.mdl $dir/$x.*.acc + rm $dir/$x.occs + fi + [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; + x=$[$x+1]; +done + + +if [ $stage -le $x ]; then + # Accumulate stats for "alignment model"-- this model is + # computed with the speaker-independent features, but matches Gaussian-for-Gaussian + # with the final speaker-adapted model. + $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \ + ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ + gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \ + ark,s,cs:- $dir/$x.JOB.acc || exit 1; + [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1; + # Update model. + $cmd $dir/log/est_alimdl.log \ + gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \ + "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1; + rm $dir/$x.*.acc +fi + +rm $dir/final.{mdl,alimdl,occs} 2>/dev/null +ln -s $x.mdl $dir/final.mdl +ln -s $x.occs $dir/final.occs +ln -s $x.alimdl $dir/final.alimdl + + +steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir + +utils/summarize_warnings.pl $dir/log +( + echo "$0: Likelihood evolution:" + for x in `seq $[$num_iters-1]`; do + tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); } + /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} + END{ d /= t2; l /= t; printf("%s ", d+l); } ' + done + echo +) | tee $dir/log/summary.log + + +steps/info/gmm_dir_info.pl $dir + +echo "$0: done training SAT system in $dir" + +exit 0 diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/train.sh b/examples/wav2vec/unsupervised/kaldi_self_train/st/train.sh new file mode 100644 index 0000000000..f3a3d3fc7c --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/train.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -eu + +w2v_dir= # contains features `{train,valid}.{npy,lengths}`, real transcripts `{train,valid}.${label}`, and dict `dict.${label}.txt` +lab_dir= # contains pseudo labels `{train,valid}.txt` +out_dir= # output root +arpa_lm= # phone LM +arpa_lm_bin= # (binary) phone LM for KenLM, used in unsupervised selection + +label=phnc +train_name="train" +valid_name="valid" +data_dir=${out_dir}/data + +mkdir -p ${out_dir}/exp +local/prepare_lang.sh $w2v_dir/dict.${label}.txt $data_dir +local/prepare_lm.sh $arpa_lm $data_dir + +for x in $train_name $valid_name; do + x_gt=${x}_gt + + # prepare pseudo data + python local/prepare_data_from_w2v.py $w2v_dir $data_dir $x + steps/compute_cmvn_stats.sh $data_dir/$x $out_dir/exp/make_feat/$x $out_dir/feats/$x + python local/copy_aligned_text.py < $lab_dir/$x.txt > $data_dir/$x/text + + # prepare ground truth 
data + mkdir $data_dir/$x_gt + cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $data_dir/$x_gt/ + python local/copy_aligned_text.py < $w2v_dir/$x.$label > $data_dir/$x_gt/text +done + +local/train_subset_lgbeam.sh \ + --out_root ${out_dir} --out_name exp --train $train_name --valid $valid_name \ + --mono_size 2000 --tri1_size 5000 --tri2b_size -1 --tri3b_size -1 \ + --stage 1 --max_stage 3 $data_dir $data_dir/lang $data_dir/lang_test + +local/unsup_select_decode.sh \ + --split $valid_name --kenlm_path $arpa_lm_bin \ + --ref_txt $data_dir/${valid_name}_gt/text \ + --psd_txt $data_dir/${valid_name}/text \ + $out_dir/exp diff --git a/examples/wav2vec/unsupervised/kaldi_self_train/st/utils b/examples/wav2vec/unsupervised/kaldi_self_train/st/utils new file mode 120000 index 0000000000..b240885218 --- /dev/null +++ b/examples/wav2vec/unsupervised/kaldi_self_train/st/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/examples/wav2vec/unsupervised/models/__init__.py b/examples/wav2vec/unsupervised/models/__init__.py new file mode 100644 index 0000000000..3e3039b708 --- /dev/null +++ b/examples/wav2vec/unsupervised/models/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .wav2vec_u import Wav2vec_U + + +__all__ = [ + "Wav2vec_U", +] diff --git a/examples/wav2vec/unsupervised/models/wav2vec_u.py b/examples/wav2vec/unsupervised/models/wav2vec_u.py new file mode 100644 index 0000000000..8a1e9055e3 --- /dev/null +++ b/examples/wav2vec/unsupervised/models/wav2vec_u.py @@ -0,0 +1,687 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
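The model file that starts here (wav2vec_u.py) is trained adversarially: a Generator maps segmented speech features to phone-unit logits, a Discriminator scores phone sequences, and updates alternate between the two parameter groups (see discrim_step() and the p.param_group tags assigned in __init__ further down). The following is only a rough, hypothetical sketch of that alternation; the optimizer setup and helper names are assumptions and are not part of this patch, which relies on fairseq's own training loop.

    import torch

    # Hedged sketch: build one optimizer per parameter group and alternate updates,
    # training the discriminator on odd steps and the generator on even steps,
    # mirroring Wav2vec_U.discrim_step()/get_groups_for_update().
    def make_optimizers(model, lr=1e-4):
        groups = {"generator": [], "discriminator": []}
        for p in model.parameters():
            groups[getattr(p, "param_group", "generator")].append(p)
        return {name: torch.optim.Adam(params, lr=lr) for name, params in groups.items()}

    def train_step(model, optimizers, batch, num_updates):
        group = "discriminator" if num_updates % 2 == 1 else "generator"
        opt = optimizers[group]
        opt.zero_grad()
        losses = model(**batch)["losses"]  # forward() returns a dict of (possibly None) loss terms
        loss = sum(v.sum() for v in losses.values() if v is not None)
        loss.backward()
        opt.step()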
+ +from dataclasses import dataclass +from enum import Enum, auto +import math +import numpy as np +from typing import Tuple, List, Optional, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import autograd + +from fairseq import checkpoint_utils, utils +from fairseq.dataclass import FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from fairseq.modules import ( + SamePad, + TransposeLast, +) + + +class SegmentationType(Enum): + NONE = auto() + RANDOM = auto() + UNIFORM_RANDOM = auto() + UNIFORM_RANDOM_JOIN = auto() + JOIN = auto() + + +@dataclass +class SegmentationConfig(FairseqDataclass): + type: SegmentationType = SegmentationType.NONE + subsample_rate: float = 0.25 + mean_pool: bool = True + mean_pool_join: bool = False + remove_zeros: bool = False + + +@dataclass +class Wav2vec_UConfig(FairseqDataclass): + discriminator_kernel: int = 3 + discriminator_dilation: int = 1 + discriminator_dim: int = 256 + discriminator_causal: bool = True + discriminator_linear_emb: bool = False + discriminator_depth: int = 1 + discriminator_max_pool: bool = False + discriminator_act_after_linear: bool = False + discriminator_dropout: float = 0.0 + discriminator_spectral_norm: bool = False + discriminator_weight_norm: bool = False + + generator_kernel: int = 4 + generator_dilation: int = 1 + generator_stride: int = 1 + generator_pad: int = -1 + generator_bias: bool = False + generator_dropout: float = 0.0 + generator_batch_norm: int = 0 + generator_residual: bool = False + + blank_weight: float = 0 + blank_mode: str = "add" + blank_is_sil: bool = False + no_softmax: bool = False + + smoothness_weight: float = 0.0 + smoothing: float = 0.0 + smoothing_one_sided: bool = False + gradient_penalty: float = 0.0 + probabilistic_grad_penalty_slicing: bool = False + code_penalty: float = 0.0 + mmi_weight: float = 0.0 + target_dim: int = 64 + target_downsample_rate: int = 2 + gumbel: bool = False + hard_gumbel: bool = True + temp: Tuple[float, float, float] = (2, 0.1, 0.99995) + input_dim: int = 128 + + segmentation: SegmentationConfig = SegmentationConfig() + + +class Segmenter(nn.Module): + cfg: SegmentationConfig + + def __init__(self, cfg: SegmentationConfig): + super().__init__() + self.cfg = cfg + self.subsample_rate = cfg.subsample_rate + + def pre_segment(self, dense_x, dense_padding_mask): + return dense_x, dense_padding_mask + + def logit_segment(self, logits, padding_mask): + return logits, padding_mask + + +class RandomSegmenter(Segmenter): + def pre_segment(self, dense_x, dense_padding_mask): + target_num = math.ceil(dense_x.size(1) * self.subsample_rate) + ones = torch.ones(dense_x.shape[:-1], device=dense_x.device) + indices, _ = ones.multinomial(target_num).sort(dim=-1) + indices_ld = indices.unsqueeze(-1).expand(-1, -1, dense_x.size(-1)) + dense_x = dense_x.gather(1, indices_ld) + dense_padding_mask = dense_padding_mask.gather(1, index=indices) + return dense_x, dense_padding_mask + + +class UniformRandomSegmenter(Segmenter): + def pre_segment(self, dense_x, dense_padding_mask): + bsz, tsz, fsz = dense_x.shape + + target_num = math.ceil(tsz * self.subsample_rate) + + rem = tsz % target_num + + if rem > 0: + dense_x = F.pad(dense_x, [0, 0, 0, target_num - rem]) + dense_padding_mask = F.pad( + dense_padding_mask, [0, target_num - rem], value=True + ) + + dense_x = dense_x.view(bsz, target_num, -1, fsz) + dense_padding_mask = dense_padding_mask.view(bsz, target_num, -1) + + if self.cfg.mean_pool: + dense_x = dense_x.mean(dim=-2) + 
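+            # A pooled segment is marked as padding only if every frame inside it was padding.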
dense_padding_mask = dense_padding_mask.all(dim=-1) + else: + ones = torch.ones((bsz, dense_x.size(2)), device=dense_x.device) + indices = ones.multinomial(1) + indices = indices.unsqueeze(-1).expand(-1, target_num, -1) + indices_ld = indices.unsqueeze(-1).expand(-1, -1, -1, fsz) + dense_x = dense_x.gather(2, indices_ld).reshape(bsz, -1, fsz) + dense_padding_mask = dense_padding_mask.gather(2, index=indices).reshape( + bsz, -1 + ) + return dense_x, dense_padding_mask + + +class JoinSegmenter(Segmenter): + def logit_segment(self, logits, padding_mask): + preds = logits.argmax(dim=-1) + + if padding_mask.any(): + preds[padding_mask] = -1 # mark pad + uniques = [] + + bsz, tsz, csz = logits.shape + + for p in preds: + uniques.append( + p.cpu().unique_consecutive(return_inverse=True, return_counts=True) + ) + + new_tsz = max(u[0].numel() for u in uniques) + new_logits = logits.new_zeros(bsz, new_tsz, csz) + new_pad = padding_mask.new_zeros(bsz, new_tsz) + + for b in range(bsz): + u, idx, c = uniques[b] + keep = u != -1 + + if self.cfg.remove_zeros: + keep.logical_and_(u != 0) + + if self.training and not self.cfg.mean_pool_join: + u[0] = 0 + u[1:] = c.cumsum(0)[:-1] + m = c > 1 + r = torch.rand(m.sum()) + o = (c[m] * r).long() + u[m] += o + new_logits[b, : u.numel()] = logits[b, u] + else: + new_logits[b].index_add_( + dim=0, index=idx.to(new_logits.device), source=logits[b] + ) + new_logits[b, : c.numel()] /= c.unsqueeze(-1).to(new_logits.device) + + new_sz = keep.sum() + if not keep.all(): + kept_logits = new_logits[b, : c.numel()][keep] + new_logits[b, :new_sz] = kept_logits + + if new_sz < new_tsz: + pad = new_tsz - new_sz + new_logits[b, -pad:] = 0 + new_pad[b, -pad:] = True + + return new_logits, new_pad + + +class UniformRandomJoinSegmenter(UniformRandomSegmenter, JoinSegmenter): + pass + + +SEGMENT_FACTORY = { + SegmentationType.NONE: Segmenter, + SegmentationType.RANDOM: RandomSegmenter, + SegmentationType.UNIFORM_RANDOM: UniformRandomSegmenter, + SegmentationType.UNIFORM_RANDOM_JOIN: UniformRandomJoinSegmenter, + SegmentationType.JOIN: JoinSegmenter, +} + + +class Discriminator(nn.Module): + def __init__(self, dim, cfg: Wav2vec_UConfig): + super().__init__() + + inner_dim = cfg.discriminator_dim + kernel = cfg.discriminator_kernel + dilation = cfg.discriminator_dilation + self.max_pool = cfg.discriminator_max_pool + + if cfg.discriminator_causal: + padding = kernel - 1 + else: + padding = kernel // 2 + + def make_conv(in_d, out_d, k, p=0, has_dilation=True): + conv = nn.Conv1d( + in_d, + out_d, + kernel_size=k, + padding=p, + dilation=dilation if has_dilation else 1, + ) + if cfg.discriminator_spectral_norm: + conv = nn.utils.spectral_norm(conv) + elif cfg.discriminator_weight_norm: + conv = nn.utils.weight_norm(conv) + return conv + + inner_net = [ + nn.Sequential( + make_conv(inner_dim, inner_dim, kernel, padding), + SamePad(kernel_size=kernel, causal=cfg.discriminator_causal), + nn.Dropout(cfg.discriminator_dropout), + nn.GELU(), + ) + for _ in range(cfg.discriminator_depth - 1) + ] + [ + make_conv(inner_dim, 1, kernel, padding, has_dilation=False), + SamePad(kernel_size=kernel, causal=cfg.discriminator_causal), + ] + + if cfg.discriminator_linear_emb: + emb_net = [make_conv(dim, inner_dim, 1)] + else: + emb_net = [ + make_conv(dim, inner_dim, kernel, padding), + SamePad(kernel_size=kernel, causal=cfg.discriminator_causal), + ] + + if cfg.discriminator_act_after_linear: + emb_net.append(nn.GELU()) + + self.net = nn.Sequential( + *emb_net, + nn.Dropout(cfg.discriminator_dropout), + 
*inner_net, + ) + + def forward(self, x, padding_mask): + x = x.transpose(1, 2) # BTC -> BCT + x = self.net(x) + x = x.transpose(1, 2) + x_sz = x.size(1) + if padding_mask is not None and padding_mask.any() and padding_mask.dim() > 1: + padding_mask = padding_mask[:, : x.size(1)] + x[padding_mask] = float("-inf") if self.max_pool else 0 + x_sz = x_sz - padding_mask.sum(dim=-1) + x = x.squeeze(-1) + if self.max_pool: + x, _ = x.max(dim=-1) + else: + x = x.sum(dim=-1) + x = x / x_sz + return x + + +class Generator(nn.Module): + def __init__(self, input_dim, output_dim, cfg: Wav2vec_UConfig): + super().__init__() + + self.cfg = cfg + self.output_dim = output_dim + self.stride = cfg.generator_stride + self.dropout = nn.Dropout(cfg.generator_dropout) + self.batch_norm = cfg.generator_batch_norm != 0 + self.residual = cfg.generator_residual + + padding = ( + cfg.generator_kernel // 2 if cfg.generator_pad < 0 else cfg.generator_pad + ) + self.proj = nn.Sequential( + TransposeLast(), + nn.Conv1d( + input_dim, + output_dim, + kernel_size=cfg.generator_kernel, + stride=cfg.generator_stride, + dilation=cfg.generator_dilation, + padding=padding, + bias=cfg.generator_bias, + ), + TransposeLast(), + ) + + if self.batch_norm: + self.bn = nn.BatchNorm1d(input_dim) + self.bn.weight.data.fill_(cfg.generator_batch_norm) + if self.residual: + self.in_proj = nn.Linear(input_dim, input_dim) + + def forward(self, dense_x, tokens, dense_padding_mask): + result = {} + + if self.batch_norm: + dense_x = self.bn_padded_data(dense_x, dense_padding_mask) + if self.residual: + inter_x = self.in_proj(self.dropout(dense_x)) + dense_x = dense_x + inter_x + result["inter_x"] = inter_x + + dense_x = self.dropout(dense_x) + + dense_x = self.proj(dense_x) + if self.stride > 1: + dense_padding_mask = dense_padding_mask[:, :: self.stride] + + if dense_padding_mask.size(1) != dense_x.size(1): + new_padding = dense_padding_mask.new_zeros(dense_x.shape[:-1]) + diff = new_padding.size(1) - dense_padding_mask.size(1) + + if diff > 0: + new_padding[:, diff:] = dense_padding_mask + else: + assert diff < 0 + new_padding = dense_padding_mask[:, :diff] + + dense_padding_mask = new_padding + + token_x = None + if tokens is not None: + token_x = dense_x.new_zeros(tokens.numel(), self.output_dim) + token_x.scatter_(1, tokens.view(-1, 1).long(), 1) + token_x = token_x.view(tokens.shape + (self.output_dim,)) + + result["dense_x"] = dense_x + result["token_x"] = token_x + result["dense_padding_mask"] = dense_padding_mask + + return result + + def bn_padded_data(self, feature, padding_mask): + normed_feature = feature.clone() + normed_feature[~padding_mask] = self.bn( + feature[~padding_mask].unsqueeze(-1) + ).squeeze(-1) + return normed_feature + + +@register_model("wav2vec_u", dataclass=Wav2vec_UConfig) +class Wav2vec_U(BaseFairseqModel): + def calc_gradient_penalty(self, real_data, fake_data): + + b_size = min(real_data.size(0), fake_data.size(0)) + t_size = min(real_data.size(1), fake_data.size(1)) + + if self.cfg.probabilistic_grad_penalty_slicing: + + def get_slice(data, dim, target_size): + + size = data.size(dim) + diff = size - target_size + if diff <= 0: + return data + + start = np.random.randint(0, diff + 1) + return data.narrow(dim=dim, start=start, length=target_size) + + real_data = get_slice(real_data, 0, b_size) + real_data = get_slice(real_data, 1, t_size) + fake_data = get_slice(fake_data, 0, b_size) + fake_data = get_slice(fake_data, 1, t_size) + + else: + real_data = real_data[:b_size, :t_size] + fake_data = 
fake_data[:b_size, :t_size] + + alpha = torch.rand(real_data.size(0), 1, 1) + alpha = alpha.expand(real_data.size()) + alpha = alpha.to(real_data.device) + + interpolates = alpha * real_data + ((1 - alpha) * fake_data) + + disc_interpolates = self.discriminator(interpolates, None) + + gradients = autograd.grad( + outputs=disc_interpolates, + inputs=interpolates, + grad_outputs=torch.ones(disc_interpolates.size(), device=real_data.device), + create_graph=True, + retain_graph=True, + only_inputs=True, + )[0] + + gradient_penalty = (gradients.norm(2, dim=1) - 1) ** 2 + return gradient_penalty + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self.update_num = num_updates + self.curr_temp = max( + self.max_temp * self.temp_decay ** num_updates, self.min_temp + ) + + def discrim_step(self, num_updates): + return num_updates % 2 == 1 + + def get_groups_for_update(self, num_updates): + return "discriminator" if self.discrim_step(num_updates) else "generator" + + def __init__(self, cfg: Wav2vec_UConfig, target_dict): + super().__init__() + + self.cfg = cfg + self.zero_index = target_dict.index("<SIL>") if "<SIL>" in target_dict else 0 + self.smoothness_weight = cfg.smoothness_weight + + output_size = len(target_dict) + self.pad = target_dict.pad() + self.eos = target_dict.eos() + self.smoothing = cfg.smoothing + self.smoothing_one_sided = cfg.smoothing_one_sided + self.no_softmax = cfg.no_softmax + self.gumbel = cfg.gumbel + self.hard_gumbel = cfg.hard_gumbel + self.last_acc = None + + self.gradient_penalty = cfg.gradient_penalty + self.code_penalty = cfg.code_penalty + self.mmi_weight = cfg.mmi_weight + self.blank_weight = cfg.blank_weight + self.blank_mode = cfg.blank_mode + self.blank_index = target_dict.index("<SIL>") if cfg.blank_is_sil else 0 + assert self.blank_index != target_dict.unk() + + self.discriminator = Discriminator(output_size, cfg) + for p in self.discriminator.parameters(): + p.param_group = "discriminator" + + self.pca_A = self.pca_b = None + d = cfg.input_dim + + self.segmenter = SEGMENT_FACTORY[cfg.segmentation.type](cfg.segmentation) + + self.generator = Generator(d, output_size, cfg) + + for p in self.generator.parameters(): + p.param_group = "generator" + + for p in self.segmenter.parameters(): + p.param_group = "generator" + + self.max_temp, self.min_temp, self.temp_decay = cfg.temp + self.curr_temp = self.max_temp + self.update_num = 0 + + if self.mmi_weight > 0: + self.target_downsample_rate = cfg.target_downsample_rate + self.decoder = nn.Linear(d, cfg.target_dim) + for p in self.decoder.parameters(): + p.param_group = "generator" + + @classmethod + def build_model(cls, cfg, task): + return cls(cfg, task.target_dictionary) + + def get_logits( + self, + net_output: Optional[Dict[str, List[Optional[torch.Tensor]]]], + normalize: bool = False, + ): + logits = net_output["logits"] + + if self.blank_weight != 0: + if self.blank_mode == "add": + logits[..., self.blank_index] += self.blank_weight + elif self.blank_mode == "set": + logits[..., self.blank_index] = self.blank_weight + else: + raise Exception(f"invalid blank mode {self.blank_mode}") + + padding = net_output["padding_mask"] + if padding.any(): + logits[padding] = float("-inf") + logits[padding][..., self.blank_index] = float("inf") + + if normalize: + logits = utils.log_softmax(logits.float(), dim=-1) + + return logits.transpose(0, 1) + + def get_normalized_probs( + self, + net_output: Tuple[ + torch.Tensor, Optional[Dict[str, List[Optional[torch.Tensor]]]] + ], + log_probs: 
bool, + sample: Optional[Dict[str, torch.Tensor]] = None, + ): + logits = self.get_logits(net_output) + + probs = super().get_normalized_probs(logits, log_probs, sample) + # BTC -> TBC for ctc + probs = probs.transpose(0, 1) + return probs + + def normalize(self, dense_x): + + bsz, tsz, csz = dense_x.shape + + if dense_x.numel() == 0: + raise Exception(dense_x.shape) + _, k = dense_x.max(-1) + hard_x = ( + dense_x.new_zeros(bsz * tsz, csz) + .scatter_(-1, k.view(-1, 1), 1.0) + .view(-1, csz) + ) + hard_probs = torch.mean(hard_x.float(), dim=0) + code_perplexity = torch.exp( + -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) + ) + + avg_probs = torch.softmax(dense_x.reshape(-1, csz).float(), dim=-1).mean(dim=0) + prob_perplexity = torch.exp( + -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) + ) + + if not self.no_softmax: + if self.training and self.gumbel: + dense_x = F.gumbel_softmax( + dense_x.float(), tau=self.curr_temp, hard=self.hard_gumbel + ).type_as(dense_x) + else: + dense_x = dense_x.softmax(-1) + + return dense_x, code_perplexity, prob_perplexity + + def forward( + self, + features, + padding_mask, + random_label=None, + dense_x_only=False, + segment=True, + aux_target=None, + ): + if segment: + features, padding_mask = self.segmenter.pre_segment(features, padding_mask) + + orig_size = features.size(0) * features.size(1) - padding_mask.sum() + + gen_result = self.generator(features, random_label, padding_mask) + + orig_dense_x, token_x = gen_result["dense_x"], gen_result["token_x"] + orig_dense_padding_mask = gen_result["dense_padding_mask"] + + if segment: + dense_x, dense_padding_mask = self.segmenter.logit_segment( + orig_dense_x, orig_dense_padding_mask + ) + else: + dense_x = orig_dense_x + dense_padding_mask = orig_dense_padding_mask + + dense_logits = dense_x + prob_perplexity = None + code_perplexity = None + + if not (self.no_softmax and dense_x_only): + dense_x, code_perplexity, prob_perplexity = self.normalize(dense_logits) + + if dense_x_only or self.discriminator is None: + return { + "logits": dense_x, + "padding_mask": dense_padding_mask, + } + + token_padding_mask = random_label == self.pad + + dense_y = self.discriminator(dense_x, dense_padding_mask) + token_y = self.discriminator(token_x, token_padding_mask) + + sample_size = features.size(0) + + d_step = self.discrim_step(self.update_num) + + fake_smooth = self.smoothing + real_smooth = self.smoothing + if self.smoothing_one_sided: + fake_smooth = 0 + + zero_loss = None + smoothness_loss = None + code_pen = None + mmi_loss = None + + if d_step: + loss_dense = F.binary_cross_entropy_with_logits( + dense_y, + dense_y.new_ones(dense_y.shape) - fake_smooth, + reduction="sum", + ) + loss_token = F.binary_cross_entropy_with_logits( + token_y, + token_y.new_zeros(token_y.shape) + real_smooth, + reduction="sum", + ) + if self.training and self.gradient_penalty > 0: + grad_pen = self.calc_gradient_penalty(token_x, dense_x) + grad_pen = grad_pen.sum() * self.gradient_penalty + else: + grad_pen = None + else: + grad_pen = None + loss_token = None + loss_dense = F.binary_cross_entropy_with_logits( + dense_y, + dense_y.new_zeros(dense_y.shape) + fake_smooth, + reduction="sum", + ) + num_vars = dense_x.size(-1) + if prob_perplexity is not None: + code_pen = (num_vars - prob_perplexity) / num_vars + code_pen = code_pen * sample_size * self.code_penalty + + if self.smoothness_weight > 0: + smoothness_loss = F.mse_loss( + dense_logits[:, :-1], dense_logits[:, 1:], reduction="none" + ) + 
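+            # Zero the penalty at padded positions so they do not contribute to the smoothness loss.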
smoothness_loss[dense_padding_mask[:, 1:]] = 0 + smoothness_loss = ( + smoothness_loss.mean() * sample_size * self.smoothness_weight + ) + + if (self.mmi_weight > 0) and (aux_target is not None): + inter_x = self.decoder(gen_result["inter_x"]) + if self.target_downsample_rate > 1: + aux_target = aux_target[:, :: self.target_downsample_rate] + max_t_len = min(aux_target.shape[1], inter_x.shape[1]) + mmi_loss = F.cross_entropy( + inter_x[:, :max_t_len].transpose(1, 2), + aux_target[:, :max_t_len], + ignore_index=-1, + reduction="none", + ) + mmi_loss = mmi_loss.mean() * mmi_loss.shape[0] * self.mmi_weight + + result = { + "losses": { + "grad_pen": grad_pen, + "code_pen": code_pen, + "smoothness": smoothness_loss, + "mmi": mmi_loss, + }, + "temp": self.curr_temp, + "code_ppl": code_perplexity, + "prob_ppl": prob_perplexity, + "d_steps": int(d_step), + "sample_size": sample_size, + } + + suff = "_d" if d_step else "_g" + result["losses"]["dense" + suff] = loss_dense + result["losses"]["token" + suff] = loss_token + + return result diff --git a/examples/wav2vec/unsupervised/scripts/apply_pca.py b/examples/wav2vec/unsupervised/scripts/apply_pca.py new file mode 100644 index 0000000000..10ad6ce47c --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/apply_pca.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import os.path as osp +import math +import numpy as np +import tqdm +import torch +from shutil import copyfile + +from npy_append_array import NpyAppendArray + + +def get_parser(): + parser = argparse.ArgumentParser( + description="transforms features via a given pca and stored them in target dir" + ) + # fmt: off + parser.add_argument('source', help='directory with features') + parser.add_argument('--split', help='which split to read', required=True) + parser.add_argument('--save-dir', help='where to save the output', required=True) + parser.add_argument('--pca-path', type=str, help='pca location. 
will append _A.npy and _b.npy', required=True) + parser.add_argument('--batch-size', type=int, default=2048000, help='batch size') + parser.add_argument('--unfiltered', action='store_true', help='process the unfiltered version') + # fmt: on + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + source_path = osp.join(args.source, args.split) + data_poth = source_path + "_unfiltered" if args.unfiltered else source_path + + print(f"data path: {data_poth}") + + features = np.load(data_poth + ".npy", mmap_mode="r") + pca_A = torch.from_numpy(np.load(args.pca_path + "_A.npy")).cuda() + pca_b = torch.from_numpy(np.load(args.pca_path + "_b.npy")).cuda() + + os.makedirs(args.save_dir, exist_ok=True) + save_path = osp.join(args.save_dir, args.split) + + copyfile(source_path + ".tsv", save_path + ".tsv") + copyfile(data_poth + ".lengths", save_path + ".lengths") + + if osp.exists(source_path + ".phn"): + copyfile(source_path + ".phn", save_path + ".phn") + + if osp.exists(source_path + ".wrd"): + copyfile(source_path + ".wrd", save_path + ".wrd") + + if osp.exists(save_path + ".npy"): + os.remove(save_path + ".npy") + npaa = NpyAppendArray(save_path + ".npy") + + batches = math.ceil(features.shape[0] / args.batch_size) + + with torch.no_grad(): + for b in tqdm.trange(batches): + start = b * args.batch_size + end = start + args.batch_size + x = torch.from_numpy(features[start:end]).cuda() + x = torch.matmul(x, pca_A) + pca_b + npaa.append(x.cpu().numpy()) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/copy_labels.py b/examples/wav2vec/unsupervised/scripts/copy_labels.py new file mode 100644 index 0000000000..989868388e --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/copy_labels.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys + +for idx, line in enumerate(sys.stdin): + print(f"utt{idx:010d} {line}", end="") diff --git a/examples/wav2vec/unsupervised/scripts/filter_lexicon.py b/examples/wav2vec/unsupervised/scripts/filter_lexicon.py new file mode 100644 index 0000000000..5bf3e51e7a --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/filter_lexicon.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
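filter_lexicon.py (whose body follows) keeps only lexicon entries whose pronunciations use units present in the given dictionary. A toy illustration of the same rule, with invented example data:

    # Toy example of the filtering rule implemented by filter_lexicon.py.
    symbols = {"HH", "AH", "L", "OW"}  # units taken from dict.<label>.txt
    lexicon = ["hello HH AH L OW", "aloha AH L OW HH AH", "zebra Z IY B R AH"]
    kept = [
        line for line in lexicon
        if len(line.split()) >= 2 and all(p in symbols for p in line.split()[1:])
    ]
    print(kept)  # -> ['hello HH AH L OW', 'aloha AH L OW HH AH']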
+ +import argparse +import sys + +from fairseq.data import Dictionary + + +def get_parser(): + parser = argparse.ArgumentParser( + description="filters a lexicon given a unit dictionary" + ) + parser.add_argument("-d", "--unit-dict", help="unit dictionary", required=True) + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + d = Dictionary.load(args.unit_dict) + symbols = set(d.symbols) + + for line in sys.stdin: + items = line.rstrip().split() + skip = len(items) < 2 + for x in items[1:]: + if x not in symbols: + skip = True + break + if not skip: + print(line, end="") + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/filter_tsv.py b/examples/wav2vec/unsupervised/scripts/filter_tsv.py new file mode 100644 index 0000000000..a09d79acf3 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/filter_tsv.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import argparse +import sys + + +parser = argparse.ArgumentParser() +parser.add_argument("--tsv", required=True, type=str) +parser.add_argument("--no-skip", action="store_true") +parser.add_argument("--keep", action="store_true") +params = parser.parse_args() + + +def get_fname(line): + p = os.path.basename(line.split("\t")[0]) + p = os.path.splitext(p)[0] + return p + + +# filenames to exclude +seen = set() +with open(params.tsv) as f: + if not params.no_skip: + root = next(f).rstrip() + for line in f: + seen.add(get_fname(line)) + +for i, line in enumerate(sys.stdin): + exists = get_fname(line) in seen + keep = (exists and params.keep) or (not exists and not params.keep) + if i == 0 or keep: + print(line, end="") diff --git a/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py b/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py new file mode 100644 index 0000000000..2e31c307bd --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import sys + +from g2p_en import G2p + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--compact", + action="store_true", + help="if set, compacts phones", + ) + args = parser.parse_args() + + compact = args.compact + + wrd_to_phn = {} + g2p = G2p() + for line in sys.stdin: + words = line.strip().split() + phones = [] + for w in words: + if w not in wrd_to_phn: + wrd_to_phn[w] = g2p(w) + if compact: + wrd_to_phn[w] = [ + p[:-1] if p[-1].isnumeric() else p for p in wrd_to_phn[w] + ] + phones.extend(wrd_to_phn[w]) + try: + print(" ".join(phones)) + except: + print(wrd_to_phn, words, phones, file=sys.stderr) + raise + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py b/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py new file mode 100644 index 0000000000..36c85d1e2f --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
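ltr_to_wrd.py (whose body follows) inverts fairseq's letter-level ("ltr") transcript format, in which characters are space-separated and "|" marks word boundaries. For example:

    # Example of the .ltr -> .wrd conversion performed by ltr_to_wrd.py.
    line = "H E L L O | W O R L D |"
    print(line.replace(" ", "").replace("|", " ").strip())  # -> HELLO WORLD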
+ +import sys + + +def main(): + for line in sys.stdin: + print(line.replace(" ", "").replace("|", " ").strip()) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/mean_pool.py b/examples/wav2vec/unsupervised/scripts/mean_pool.py new file mode 100644 index 0000000000..4eea048ef3 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/mean_pool.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import os.path as osp +import math +import numpy as np +import tqdm +import torch +import torch.nn.functional as F +from shutil import copyfile + +from npy_append_array import NpyAppendArray + + +def get_parser(): + parser = argparse.ArgumentParser( + description="mean pools representations by compressing uniform splits of the data" + ) + # fmt: off + parser.add_argument('source', help='directory with features') + parser.add_argument('--split', help='which split to read', required=True) + parser.add_argument('--save-dir', help='where to save the output', required=True) + parser.add_argument('--subsample-rate', type=float, default=0.5, help='size to subsample data to') + + parser.add_argument('--remove-extra', action='store_true', help='if true, removes extra states that cant be pooled, otherwise pads with 0s') + # fmt: on + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + source_path = osp.join(args.source, args.split) + + print(f"data path: {source_path}") + + features = np.load(source_path + ".npy", mmap_mode="r") + + os.makedirs(args.save_dir, exist_ok=True) + save_path = osp.join(args.save_dir, args.split) + + copyfile(source_path + ".tsv", save_path + ".tsv") + + if os.path.exists(source_path + ".phn"): + copyfile(source_path + ".phn", save_path + ".phn") + if os.path.exists(source_path + ".wrd"): + copyfile(source_path + ".wrd", save_path + ".wrd") + + if os.path.exists(osp.join(args.source, "dict.phn.txt")): + copyfile( + osp.join(args.source, "dict.phn.txt"), + osp.join(args.save_dir, "dict.phn.txt"), + ) + + if osp.exists(save_path + ".npy"): + os.remove(save_path + ".npy") + npaa = NpyAppendArray(save_path + ".npy") + + with open(source_path + ".lengths", "r") as lf: + lengths = lf.readlines() + + fsz = features.shape[-1] + start = 0 + with torch.no_grad(): + with open(save_path + ".lengths", "w") as lengths_out: + for length in tqdm.tqdm(lengths): + length = int(length) + end = start + length + feats = features[start:end] + start += length + x = torch.from_numpy(feats).cuda() + target_num = math.ceil(length * args.subsample_rate) + rem = length % target_num + + if rem > 0: + if args.remove_extra: + to_rem = target_num - rem + target_num -= 1 + x = x[:-to_rem] + else: + to_add = target_num - rem + x = F.pad(x, [0, 0, 0, to_add]) + x[-to_add:] = x[-to_add - 1] + + x = x.view(target_num, -1, fsz) + x = x.mean(dim=-2) + print(target_num, file=lengths_out) + npaa.append(x.cpu().numpy()) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/merge_clusters.py b/examples/wav2vec/unsupervised/scripts/merge_clusters.py new file mode 100644 index 0000000000..2780f9d971 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/merge_clusters.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import os.path as osp +import numpy as np +import tqdm +import torch +import random +from shutil import copyfile + +from npy_append_array import NpyAppendArray + + +def get_parser(): + parser = argparse.ArgumentParser( + description="transforms features via a given pca and stored them in target dir" + ) + # fmt: off + parser.add_argument('source', help='directory with features') + parser.add_argument('--split', help='which split to read', required=True) + parser.add_argument('--save-dir', help='where to save the output', required=True) + parser.add_argument('--cluster-dir', help='where the clusters are') + parser.add_argument('--pooling', type=str, default='mean', choices=['mean', 'sample'], help='how to pool') + # fmt: on + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + source_path = osp.join(args.source, args.split) + cluster_path = osp.join(args.cluster_dir, args.split + ".src") + print(f"data path: {source_path}") + + features = np.load(source_path + ".npy", mmap_mode="r") + sizes = [] + offsets = [] + offset = 0 + with open(source_path + ".lengths", "r") as len_f: + for line in len_f: + length = int(line.rstrip()) + sizes.append(length) + offsets.append(offset) + offset += length + + clusters = [] + with open(cluster_path, "r") as cf: + for line in cf: + line = line.rstrip() + items = line.split() + items = list(map(int, items)) + clusters.append(items) + + os.makedirs(args.save_dir, exist_ok=True) + save_path = osp.join(args.save_dir, args.split) + + copyfile(source_path + ".tsv", save_path + ".tsv") + + if os.path.exists(source_path + ".phn"): + copyfile(source_path + ".phn", save_path + ".phn") + if os.path.exists(osp.join(args.source, "dict.phn.txt")): + copyfile( + osp.join(args.source, "dict.phn.txt"), + osp.join(args.save_dir, "dict.phn.txt"), + ) + if os.path.exists(source_path + ".wrd"): + copyfile(source_path + ".wrd", save_path + ".wrd") + + if osp.exists(save_path + ".npy"): + os.remove(save_path + ".npy") + npaa = NpyAppendArray(save_path + ".npy") + + def merge(feats, clust): + feats = torch.from_numpy(feats.copy()) + clust = torch.LongTensor(clust) + _, counts = clust.unique_consecutive(return_counts=True) + curr = 0 + + merged = [] + for c in counts: + c = c.item() + start = curr + end = curr + c + curr += c + if args.pooling == "mean": + new_x = feats[start:end].mean(dim=0) + elif args.pooling == "sample": + new_x = feats[start + int(random.random() * c)] + else: + raise NotImplementedError() + merged.append(new_x) + + return torch.stack(merged, dim=0).numpy() + + with open(save_path + ".lengths", "w") as l_f: + for size, offset, clust in tqdm.tqdm( + zip(sizes, offsets, clusters), total=len(sizes) + ): + end = size + offset + feats = features[offset:end] + feats = merge(feats, clust) + print(len(feats), file=l_f) + npaa.append(feats) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py b/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py new file mode 100644 index 0000000000..c2bd16efb5 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import fasttext as ft +import os +import regex +import sys + + +def get_parser(): + parser = argparse.ArgumentParser( + description="reads text from stdin and outputs normalized, lid-filtered version to stdout" + ) + parser.add_argument( + "--fasttext-model", + help="path to fasttext model", + default="lid.187.bin", + ) + parser.add_argument("--lang", help="language id", required=True) + parser.add_argument( + "--lid-threshold", + type=float, + help="threshold for this lang id probability", + default=0.4, + ) + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]") + + lg = args.lang.lower() + lg_label = f"__label__{lg}" + thresh = args.lid_threshold + + if os.path.exists(args.fasttext_model): + model = ft.load_model(args.fasttext_model) + else: + print( + f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. " + f"To enable language filtering, please download the latest language id model " + f"from https://fasttext.cc/docs/en/language-identification.html", + file=sys.stderr, + ) + model = None + + for line in sys.stdin: + line = line.strip() + line = filter_r.sub(" ", line) + line = " ".join(line.split()) + + if model is not None: + lid, prob = model.predict(line, k=100) + try: + target_idx = lid.index(lg_label) + except ValueError: + continue + if target_idx == 0 or prob[target_idx] >= thresh: + print(line) + else: + print(line) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/normalize_text.py b/examples/wav2vec/unsupervised/scripts/normalize_text.py new file mode 100644 index 0000000000..9d0ffeb27d --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/normalize_text.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import regex +import sys + + +def main(): + filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]") + + for line in sys.stdin: + line = line.strip() + line = filter_r.sub(" ", line) + line = " ".join(line.split()) + print(line) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/pca.py b/examples/wav2vec/unsupervised/scripts/pca.py new file mode 100644 index 0000000000..948cf5319f --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/pca.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
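pca.py (whose body follows) fits a faiss PCAMatrix and stores the already-transposed projection matrix and bias, so that apply_pca.py earlier in this patch can apply the reduction as a single affine map y = xA + b; passing --eigen-power -0.5 yields a whitening transform. A small NumPy sketch of the downstream usage; the paths assume the layout produced by the prepare_audio script later in this patch and are illustrative only:

    import numpy as np

    # Hedged sketch of how the saved matrices are consumed (compare apply_pca.py):
    A = np.load("pca/512_pca_A.npy")  # shape (d_in, dim): faiss A of shape (dim, d_in), transposed
    b = np.load("pca/512_pca_b.npy")  # shape (dim,)
    x = np.random.randn(8, A.shape[0]).astype(np.float32)  # a batch of wav2vec features
    y = x @ A + b                                           # reduced features, shape (8, dim)
    print(y.shape)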
+ +import argparse +import os +import os.path as osp +import numpy as np + +import faiss + + + +def get_parser(): + parser = argparse.ArgumentParser( + description="compute a pca matrix given an array of numpy features" + ) + # fmt: off + parser.add_argument('data', help='numpy file containing features') + parser.add_argument('--output', help='where to save the pca matrix', required=True) + parser.add_argument('--dim', type=int, help='dim for pca reduction', required=True) + parser.add_argument('--eigen-power', type=float, default=0, help='eigen power, -0.5 for whitening') + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + print("Reading features") + x = np.load(args.data, mmap_mode="r") + + print("Computing PCA") + pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power) + pca.train(x) + b = faiss.vector_to_array(pca.b) + A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in) + + os.makedirs(args.output, exist_ok=True) + + prefix = str(args.dim) + if args.eigen_power != 0: + prefix += f"_{args.eigen_power}" + + np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T) + np.save(osp.join(args.output, f"{prefix}_pca_b"), b) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py b/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py new file mode 100644 index 0000000000..c6512d7322 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import numpy as np +import sys + + +def get_parser(): + parser = argparse.ArgumentParser( + description="converts words to phones adding optional silences around in between words" + ) + parser.add_argument( + "--sil-prob", + "-s", + type=float, + default=0, + help="probability of inserting silence between each word", + ) + parser.add_argument( + "--surround", + action="store_true", + help="if set, surrounds each example with silence", + ) + parser.add_argument( + "--lexicon", + help="lexicon to convert to phones", + required=True, + ) + + return parser + + +def main(): + parser = get_parser() + args = parser.parse_args() + + sil_prob = args.sil_prob + surround = args.surround + sil = "<SIL>" + + wrd_to_phn = {} + + with open(args.lexicon, "r") as lf: + for line in lf: + items = line.rstrip().split() + assert len(items) > 1, line + assert items[0] not in wrd_to_phn, items + wrd_to_phn[items[0]] = items[1:] + + for line in sys.stdin: + words = line.strip().split() + + if not all(w in wrd_to_phn for w in words): + continue + + phones = [] + if surround: + phones.append(sil) + + sample_sil_probs = None + if sil_prob > 0 and len(words) > 1: + sample_sil_probs = np.random.random(len(words) - 1) + + for i, w in enumerate(words): + phones.extend(wrd_to_phn[w]) + if ( + sample_sil_probs is not None + and i < len(sample_sil_probs) + and sample_sil_probs[i] < sil_prob + ): + phones.append(sil) + + if surround: + phones.append(sil) + print(" ".join(phones)) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/prepare_audio.sh b/examples/wav2vec/unsupervised/scripts/prepare_audio.sh new file mode 100644 index 0000000000..013f7a9b05 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/prepare_audio.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env zsh +# Copyright 
(c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +source_dir=$1 +tgt_dir=$2 +model=$3 + +if [ -z "$4" ] + then + dim=512 + else + dim=$4 +fi + +echo "using $dim dim for PCA" + +if [ -z "$5" ] + then + layer=14 + else + layer=$5 +fi + +echo "extracting from layer $layer" + +train_split=train +valid_split=valid +test_split=test + +all_splits=($train_split) + +if [[ -f "$source_dir/valid.tsv" ]]; then + all_splits+=('valid') +fi + +if [[ -f "$source_dir/test.tsv" ]]; then + all_splits+=('test') +fi + +echo "processing splits: $all_splits" + +mkdir -p $tgt_dir + +cp $source_dir/*.tsv $tgt_dir +cp $source_dir/*.wrd $tgt_dir +cp $source_dir/*.ltr $tgt_dir +cp $source_dir/*.phn $tgt_dir +cp $source_dir/dict* $tgt_dir + +setopt shwordsplit + +for split in $all_splits; do + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \ + --save-dir $tgt_dir --checkpoint $model --layer $layer +done + +python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py $tgt_dir/${train_split}.tsv \ +--checkpoint $model --save-dir $tgt_dir -f "CLUS128" --sample-pct 1.0 + +for split in $all_splits; do + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py $tgt_dir \ + --checkpoint $model --path $tgt_dir/CLUS128 --split $split +done + +python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/pca.py $tgt_dir/${train_split}.npy --output $tgt_dir/pca --dim $dim + +for split in $all_splits; do + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/apply_pca.py $tgt_dir --split $split --save-dir $tgt_dir/precompute_pca$dim --pca-path $tgt_dir/pca/${dim}_pca --batch-size 1048000 + + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/merge_clusters.py $tgt_dir/precompute_pca$dim --cluster-dir $tgt_dir/CLUS128 \ + --split $split --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean --pooling mean + + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/mean_pool.py $tgt_dir/precompute_pca${dim}_cls128_mean \ + --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean_pooled --split $split +done diff --git a/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh b/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh new file mode 100644 index 0000000000..96a52c5c83 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/prepare_audio_v2.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env zsh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
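+# Usage sketch (invocation inferred from the positional arguments below; the
+# concrete paths and checkpoint name are illustrative, not part of this patch):
+#   zsh prepare_audio_v2.sh /manifests/librispeech /output/w2vu /models/wav2vec_big.pt 64 14
+# $1 = directory with train.tsv (and optionally valid.tsv/test.tsv) manifests,
+# $2 = output directory, $3 = wav2vec 2.0 checkpoint,
+# $4 = number of MFCC k-means clusters for the auxiliary target (default 64),
+# $5 = model layer to extract features from (default 14).
+# Assumes FAIRSEQ_ROOT points at a fairseq checkout.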
+ +source_dir=$1 +tgt_dir=$2 +model=$3 + +if [ -z "$4" ] + then + dim=64 + else + dim=$4 +fi + +echo "using $dim clusters for auxiliary target" + +if [ -z "$5" ] + then + layer=14 + else + layer=$5 +fi + +echo "extracting from layer $layer" + +train_split=train +valid_split=valid +test_split=test + +all_splits=($train_split) + +if [[ -f "$source_dir/valid.tsv" ]]; then + all_splits+=('valid') +fi + +if [[ -f "$source_dir/test.tsv" ]]; then + all_splits+=('test') +fi + +echo "processing splits: $all_splits" + +mkdir -p $tgt_dir + +cp $source_dir/*.tsv $tgt_dir +cp $source_dir/*.wrd $tgt_dir +cp $source_dir/*.ltr $tgt_dir +cp $source_dir/*.phn $tgt_dir +cp $source_dir/dict* $tgt_dir + +setopt shwordsplit + +for split in $all_splits; do + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \ + --save-dir $tgt_dir --checkpoint $model --layer $layer +done + + +mkdir -p $tgt_dir/mfcc + +# Consider splitting the corpus into chunks for large corpora; see HuBERT preprocessing for more details +python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_mfcc_feature.py \ + $tgt_dir $train_split 1 0 $tgt_dir/mfcc +python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py \ + $tgt_dir/mfcc $train_split $tgt_dir/mfcc/cls$dim 1 0 $tgt_dir/mfcc/cls${dim}_idx +cp $tgt_dir/mfcc/cls${dim}_idx/${train_split}_0_1.km $tgt_dir/$train_split.km diff --git a/examples/wav2vec/unsupervised/scripts/prepare_text.sh b/examples/wav2vec/unsupervised/scripts/prepare_text.sh new file mode 100644 index 0000000000..dbd17a2472 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/prepare_text.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env zsh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +lg=$1 +text_path=$2 +target_dir=$3 +min_phones=$4 +phonemizer=$5 +lid_path=$6 +sil_prob=$7 + +if [ -z "$lid_path" ]; then + lid_path="lid.187.bin" +fi + +ph_lg=${lg:l} +if test "$lg" = 'fr'; then + ph_lg='fr-fr' +elif test "$lg" = 'en'; then + ph_lg='en-us' +elif test "$lg" = 'pt'; then + ph_lg='pt-br' +fi + +ESPEAK_PATH='' +if test "$phonemizer" = 'espeak'; then + ESPEAK_PATH=$(which espeak) +elif test "$phonemizer" = 'espeak-ng'; then + ESPEAK_PATH=$(which espeak-ng) +elif test "$phonemizer" = 'G2P'; then + ESPEAK_PATH='' +else + echo "Unknown phonemizer $phonemizer. Valid options are espeak, espeak-ng and G2P" + exit 1 +fi + +echo $lg +echo $ph_lg +echo $text_path +echo $target_dir +echo "min phone seen threshold is $min_phones" + +mkdir -p $target_dir +python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py --lang $lg --fasttext-model $lid_path < $text_path | grep -v '\-\-\-' >! $target_dir/lm.upper.lid.txt +python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/lm.upper.lid.txt --only-source --destdir $target_dir --thresholdsrc 2 --padding-factor 1 --dict-only +cut -f1 -d' ' $target_dir/dict.txt | grep -v -x '[[:punct:]]*' | grep -Pv '\d\d\d\d\d+' >!
$target_dir/words.txt + + +if [ -z "$ESPEAK_PATH" ]; then + python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py --compact < $target_dir/words.txt > $target_dir/phones.txt +else + # echoing 1 into corpus will prevent the mismatch lines between lexicon and phones in case the phonemizer fails + one=$(echo "1" | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -p ' ' -w '' -l $ph_lg --language-switch remove-flags) + sed 's/$/ 1/' $target_dir/words.txt | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -o $target_dir/phones.txt -p ' ' -w '' -l $ph_lg -j 70 --language-switch remove-flags + echo "one is ${one}" + sed -i "s/${one}$//" $target_dir/phones.txt +fi + +paste $target_dir/words.txt $target_dir/phones.txt >! $target_dir/lexicon.lst + +python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones.txt --only-source --destdir $target_dir/phones --thresholdsrc $min_phones --padding-factor 1 --dict-only + +python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst >! $target_dir/lexicon_filtered.lst +python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py -s $sil_prob --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt >! $target_dir/phones/lm.phones.filtered.txt +cp $target_dir/phones/dict.txt $target_dir/phones/dict.phn.txt +echo "<SIL> 0" >> $target_dir/phones/dict.phn.txt +python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones/lm.phones.filtered.txt --workers 70 --only-source --destdir $target_dir/phones --srcdict $target_dir/phones/dict.phn.txt + +$KENLM_ROOT/lmplz -o 4 < $target_dir/lm.upper.lid.txt --discount_fallback --prune 0 0 0 3 >! $target_dir/kenlm.wrd.o40003.arpa +$KENLM_ROOT/build_binary $target_dir/kenlm.wrd.o40003.arpa $target_dir/kenlm.wrd.o40003.bin + +lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words_sil lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'" +lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn + +$KENLM_ROOT/lmplz -o 4 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.04.arpa +$KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.04.arpa $target_dir/phones/lm.phones.filtered.04.bin +$KENLM_ROOT/lmplz -o 6 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! 
$target_dir/phones/lm.phones.filtered.06.arpa +$KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.06.arpa $target_dir/phones/lm.phones.filtered.06.bin + +lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_phn_sil lm_arpa=$target_dir/phones/lm.phones.filtered.06.arpa data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'" diff --git a/examples/wav2vec/unsupervised/scripts/prepare_timit.sh b/examples/wav2vec/unsupervised/scripts/prepare_timit.sh new file mode 100644 index 0000000000..d8f5d596b4 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/prepare_timit.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +timit_root=$1 # assume it is the upper-cased version +tgt_dir=$2 +model=$3 + +set -eu + +setups="matched unmatched" +splits="test valid train train_text" + +tgt_dir=$(realpath $tgt_dir) +sph2wav=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +wav_dir=$tgt_dir/wav + + +mkdir -p $tgt_dir $wav_dir +find $timit_root/{TRAIN,TEST} -iname "*.WAV" > $tgt_dir/all_sph.flist +cat $tgt_dir/all_sph.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).WAV#\1_\2#g' > $tgt_dir/all.uid +paste -d' ' $tgt_dir/{all_sph.flist,all.uid} | \ + awk -v sph2wav=$sph2wav -v wav_dir=$wav_dir '{print sph2wav " -f wav " $1 " > " wav_dir "/" $2 ".wav"}' \ + > $tgt_dir/sph2wav.sh +bash $tgt_dir/sph2wav.sh +cat $tgt_dir/all.uid | awk -v wav_dir=$(pwd)/$wav_dir '{print $1" "wav_dir"/"$1".wav"}' | sort > $tgt_dir/all_wav.scp +cut -d' ' -f2 $tgt_dir/all_wav.scp | xargs -I{} soxi -s {} > $tgt_dir/all.dur +paste -d' ' $tgt_dir/{all_wav.scp,all.dur} > $tgt_dir/all_wav_dur.scp +rm $tgt_dir/{all.uid,all_sph.flist,sph2wav.sh} + +find $timit_root/{TRAIN,TEST} -iname "*.PHN" > $tgt_dir/all_phn60.flist +while read line; do + if [ ! 
-f $line ]; then + >&2 echo "Cannot find transcription file '$line'" && exit 1; + fi + cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' +done < $tgt_dir/all_phn60.flist > $tgt_dir/all.phn60 +cat $tgt_dir/all_phn60.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).PHN#\1_\2#g' | \ + paste -d' ' - $tgt_dir/all.phn60 | \ + $KALDI_ROOT/egs/timit/s5/local/timit_norm_trans.pl -i - -m $KALDI_ROOT/egs/timit/s5/conf/phones.60-48-39.map -to 39 | \ + sort > $tgt_dir/all.phn +echo "done preparing wav and 39-phone transcripts" + + +for s in $setups; do + mkdir -p $tgt_dir/$s + for x in $splits; do + uid_path=config/timit_${s}/${x}.uid + grep -w -f $uid_path $tgt_dir/all.phn | cut -d' ' -f2- > $tgt_dir/$s/$x.phn + ln -sf $(realpath $tgt_dir/$s/$x.phn) $tgt_dir/$s/$x.wrd + + echo "/" > $tgt_dir/$s/$x.tsv && grep -w -f $uid_path $tgt_dir/all_wav_dur.scp | cut -d' ' -f2- | sed 's# #\t#' >> $tgt_dir/$s/$x.tsv + done + + for x in $splits; do + cat $tgt_dir/$s/$x.phn + done | tr ' ' '\n' | sort -u | awk '{print $1" "1}' > $tgt_dir/$s/dict.phn.txt + ln -sf $(realpath $tgt_dir/$s/dict.phn.txt) $tgt_dir/$s/dict.wrd.txt +done +echo "done preparing unmatched and matched setups for TIMIT" + + +for s in $setups; do + zsh scripts/prepare_audio.sh $tgt_dir/$s $tgt_dir/$s/feat $model + + lm_dir=$tgt_dir/$s/phones + fst_dir=$tgt_dir/$s/fst/phn_to_phn + + python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $tgt_dir/$s/train_text.phn --workers 10 --only-source --destdir $lm_dir --srcdict $tgt_dir/$s/dict.phn.txt + $KENLM_ROOT/lmplz -o 3 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.03.arpa + $KENLM_ROOT/build_binary $lm_dir/train_text_phn.03.arpa $lm_dir/train_text_phn.03.bin + $KENLM_ROOT/lmplz -o 4 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.04.arpa + $KENLM_ROOT/build_binary $lm_dir/train_text_phn.04.arpa $lm_dir/train_text_phn.04.bin + + python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$fst_dir lm_arpa=$lm_dir/train_text_phn.03.arpa data_dir=$tgt_dir/$s in_labels=phn +done +echo "done preprocessing audio and text for wav2vec-U" diff --git a/examples/wav2vec/unsupervised/scripts/remove_silence.py b/examples/wav2vec/unsupervised/scripts/remove_silence.py new file mode 100644 index 0000000000..fac88b9897 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/remove_silence.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
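+# Dependencies (inferred from the imports below): torch, torchaudio, tqdm.
+# The .vads file is expected to hold one line per manifest entry, with
+# space-separated "start:end" sample intervals as produced by vads.py in this directory.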
+ +""" +get intervals from .vads file, specify output data, and this script removes silences and saves the audio data in out path folder +paths=shards/train.tsv +vads=shards/train.vads +python remove_silence.py --paths $paths --vads $vads +""" + +import os +import argparse +import torch +import torchaudio +import tqdm + + +parser = argparse.ArgumentParser() +parser.add_argument("--tsv", default="", type=str) +parser.add_argument("--vads", default="", type=str) +parser.add_argument("--out", type=str) +params = parser.parse_args() + +# load paths +paths = [] +with open(params.tsv) as f: + root = next(f).rstrip() + for line in f: + paths.append(os.path.join(root, line.rstrip().split("\t")[0])) + +# load vads +list_intervals = [] +with open(params.vads) as f: + for line in f: + interval = [ + [int(w.split(":")[0]), int(w.split(":")[1])] for w in line.rstrip().split() + ] + list_intervals.append(interval) + + +# load audio and keep only intervals (i.e. remove silences) +for i in tqdm.trange(len(paths)): + data, _ = torchaudio.load(paths[i]) + if len(list_intervals[i]) > 0: + data_filtered = torch.cat( + [data[0][int(it[0]) : int(it[1])] for it in list_intervals[i]] + ).unsqueeze(0) + else: + data_filtered = data + + # YOU MAY NEED TO MODIFY THIS TO GET THE RIGHT SUBPATH + # outpath = params.out + '/'.join(paths[i].split('/')[-1]) + outpath = params.out + "/" + "/".join(paths[i].split("/")[-2:]) + + if not os.path.isdir("/".join(outpath.split("/")[:-1])): + os.makedirs("/".join(outpath.split("/")[:-1])) + if not os.path.exists(outpath): + torchaudio.save(outpath, data_filtered, sample_rate=16000) + else: + print(outpath, "exists!") diff --git a/examples/wav2vec/unsupervised/scripts/vads.py b/examples/wav2vec/unsupervised/scripts/vads.py new file mode 100644 index 0000000000..2398da97d8 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/vads.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import sys + +from copy import deepcopy +from scipy.signal import lfilter + +import numpy as np +from tqdm import tqdm +import soundfile as sf +import os.path as osp + + +def get_parser(): + parser = argparse.ArgumentParser(description="compute vad segments") + parser.add_argument( + "--rvad-home", + "-r", + help="path to rvad home (see https://github.com/zhenghuatan/rVADfast)", + required=True, + ) + + return parser + + +def rvad(speechproc, path): + winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512 + ftThres = 0.5 + vadThres = 0.4 + opts = 1 + + data, fs = sf.read(path) + assert fs == 16_000, "sample rate must be 16khz" + ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt) + + # --spectral flatness -- + pv01 = np.zeros(ft.shape[0]) + pv01[np.less_equal(ft, ftThres)] = 1 + pitch = deepcopy(ft) + + pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts) + + # --filtering-- + ENERGYFLOOR = np.exp(-50) + b = np.array([0.9770, -0.9770]) + a = np.array([1.0000, -0.9540]) + fdata = lfilter(b, a, data, axis=0) + + # --pass 1-- + noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy( + fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk + ) + + # sets noisy segments to zero + for j in range(n_noise_samp): + fdata[range(int(noise_samp[j, 0]), int(noise_samp[j, 1]) + 1)] = 0 + + vad_seg = speechproc.snre_vad( + fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres + ) + return vad_seg, data + + +def main(): + parser = get_parser() + args = parser.parse_args() + + sys.path.append(args.rvad_home) + import speechproc + + stride = 160 + lines = sys.stdin.readlines() + root = lines[0].rstrip() + for fpath in tqdm(lines[1:]): + path = osp.join(root, fpath.split()[0]) + vads, wav = rvad(speechproc, path) + + start = None + vad_segs = [] + for i, v in enumerate(vads): + if start is None and v == 1: + start = i * stride + elif start is not None and v == 0: + vad_segs.append((start, i * stride)) + start = None + if start is not None: + vad_segs.append((start, len(wav))) + + print(" ".join(f"{v[0]}:{v[1]}" for v in vad_segs)) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py b/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py new file mode 100644 index 0000000000..a5dd7ae6c1 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/wav2vec_apply_cluster_faiss.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
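+# Usage sketch (assumed; directory and checkpoint names are illustrative):
+#   python wav2vec_apply_cluster_faiss.py /output/w2vu --split train \
+#       --checkpoint /models/wav2vec_big.pt --path /output/w2vu/CLUS128
+# The basename of --path is parsed as a faiss spec (e.g. CLUS128), so it should
+# be a directory produced by wav2vec_cluster_faiss.py containing centroids.npy
+# (plus pca_A.npy / pca_b.npy when the spec includes a PCA step).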
+ +import argparse +import os +import os.path as osp +import numpy as np +import tqdm +import torch +import sys + +import faiss +import torch.nn.functional as F + +from wav2vec_cluster_faiss import parse_faiss_specs, Wav2VecFeatureReader + + +def get_parser(): + parser = argparse.ArgumentParser(description="apply clusters") + # fmt: off + parser.add_argument('data', help='location of tsv files') + parser.add_argument('--split', help='split to process', required=True) + parser.add_argument('--labels', help='split to process', default="phn") + parser.add_argument('--path', help='path to pca and centroids', required=True) + parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True) + parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14) + parser.add_argument('--max-tsz', type=int, help='batch kmeans up to this much', default=14) + # fmt: on + + return parser + + +def get_iterator(args): + label_path = osp.join(args.data, f"{args.split}.{args.labels}") + if osp.exists(label_path): + lp = open(label_path, "r") + else: + lp = None + + with open(osp.join(args.data, f"{args.split}.tsv"), "r") as fp: + lines = fp.read().split("\n") + root = lines.pop(0).strip() + files = [line.rstrip() for line in lines if len(line) > 0] + + if lp is not None: + lbls = [line.rstrip() for line in lp] + else: + lbls = [None] * len(files) + + num = len(files) + reader = Wav2VecFeatureReader(args.checkpoint, args.layer) + + def iterate(): + for fname, lbl in zip(files, lbls): + file = osp.join(root, fname.split("\t")[0]) + feats = reader.get_feats(file) + yield feats.data, fname, lbl + + return iterate, num, root + + +def main(): + parser = get_parser() + args = parser.parse_args() + + spec = osp.basename(args.path) + + try: + faiss_spec = parse_faiss_specs(spec.rstrip("/"))[0] + except: + print(spec) + raise + + print("Faiss Spec:", faiss_spec, file=sys.stderr) + + if faiss_spec.pca: + A = torch.from_numpy(np.load(osp.join(args.path, "pca_A.npy"))).cuda() + b = torch.from_numpy(np.load(osp.join(args.path, "pca_b.npy"))).cuda() + print("Loaded PCA", file=sys.stderr) + + centroids = np.load(osp.join(args.path, "centroids.npy")) + print("Loaded centroids", centroids.shape, file=sys.stderr) + + res = faiss.StandardGpuResources() + index_flat = ( + faiss.IndexFlatL2(centroids.shape[1]) + if not faiss_spec.sphere + else faiss.IndexFlatIP(centroids.shape[1]) + ) + faiss_index = faiss.index_cpu_to_gpu(res, 0, index_flat) + faiss_index.add(centroids) + + generator, num, root = get_iterator(args) + iterator = generator() + + had_labels = False + label_path = osp.join(args.path, f"{args.split}.{args.labels}") + + with torch.no_grad(): + with open(osp.join(args.path, f"{args.split}.src"), "w") as fp, open( + osp.join(args.path, f"{args.split}.tsv"), "w" + ) as pp, open(label_path, "w") as lp: + print(root, file=pp) + for f, fname, lbl in tqdm.tqdm(iterator, total=num): + if faiss_spec.pca: + f = torch.mm(f, A) + b + if faiss_spec.norm: + f = F.normalize(f, p=2, dim=-1) + + f = f.cpu().numpy() + + _, z = faiss_index.search(f, 1) + + print(" ".join(str(x.item()) for x in z), file=fp) + print(fname, file=pp) + + if lbl is not None: + print(lbl, file=lp) + had_labels = True + if not had_labels: + os.remove(label_path) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py b/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py new file mode 100644 index 
0000000000..632a69e9f4 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/wav2vec_cluster_faiss.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import gc +import os +import os.path as osp +import random +import numpy as np +import tqdm +import torch + +from collections import namedtuple + +import faiss + +import fairseq +import soundfile as sf + + +def get_parser(): + parser = argparse.ArgumentParser( + description="compute kmeans codebook from kaldi-computed feats" + ) + # fmt: off + parser.add_argument('data', help='location of tsv files') + parser.add_argument('--save-dir', help='where to save the output', required=True) + parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True) + parser.add_argument('--sample-pct', '-r', type=float, help='percentage of timesteps to sample', default=0) + parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14) + parser.add_argument('--faiss-specs', '-f', type=str, + help='faiss index specs; separated by space ' + 'format is: PCAx_NORM_CLUSx_SPHERICAL -> ' + 'PCAx if exists first apply PCA ' + 'NORM if exists, normalize the vector by L2 norm ' + 'CLUSx must exist, cluster to x clusters ' + 'SPEHRICAL if exists, apply spherical kmeans', + default='l2') + # fmt: on + + return parser + + +faiss_spec = namedtuple("faiss_spec", ["pca", "norm", "n_clus", "sphere", "spec_str"]) + + +def parse_faiss_specs(specs_str): + specs = [] + for ss in specs_str.split(): + comps = ss.split("_") + pca = 0 + norm = False + n_clus = 0 + sphere = False + for c in comps: + if c.startswith("PCA"): + pca = int(c[3:]) + elif c == "NORM": + norm = True + elif c.startswith("CLUS"): + n_clus = int(c[4:]) + elif c == "SPHERICAL": + sphere = True + assert n_clus > 0 + specs.append( + faiss_spec(pca=pca, norm=norm, n_clus=n_clus, sphere=sphere, spec_str=ss) + ) + return specs + + +class Wav2VecFeatureReader(object): + def __init__(self, cp_file, layer): + state = fairseq.checkpoint_utils.load_checkpoint_to_cpu(cp_file) + + self.layer = layer + + if "cfg" in state: + w2v_args = state["cfg"] + task = fairseq.tasks.setup_task(w2v_args.task) + model = task.build_model(w2v_args.model) + else: + w2v_args = state["args"] + task = fairseq.tasks.setup_task(w2v_args) + model = task.build_model(w2v_args) + model.load_state_dict(state["model"], strict=True) + model.eval() + model.cuda() + self.model = model + + def read_audio(self, fname): + """Load an audio file and return PCM along with the sample rate""" + wav, sr = sf.read(fname) + assert sr == 16e3 + + return wav + + def get_feats(self, loc): + x = self.read_audio(loc) + with torch.no_grad(): + source = torch.from_numpy(x).view(1, -1).float().cuda() + res = self.model( + source=source, mask=False, features_only=True, layer=self.layer + ) + return res["layer_results"][self.layer][0].squeeze(1) + + +def get_iterator(args): + with open(args.data, "r") as fp: + lines = fp.read().split("\n") + root = lines.pop(0).strip() + files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0] + + if getattr(args, "sample_pct", 0) > 0: + files = random.sample(files, int(args.sample_pct * len(files))) + num = len(files) + reader = Wav2VecFeatureReader(args.checkpoint, args.layer) + + def iterate(): + for fname in files: + feats = 
reader.get_feats(fname) + yield feats.cpu().numpy() + + return iterate, num + + +def main(): + parser = get_parser() + args = parser.parse_args() + + faiss_specs = parse_faiss_specs(args.faiss_specs) + print("Faiss Specs:", faiss_specs) + + feat_path = osp.join(args.save_dir, "features") + if osp.exists(feat_path + ".npy"): + feats = np.load(feat_path + ".npy") + else: + generator, num = get_iterator(args) + iterator = generator() + + feats = [] + for f in tqdm.tqdm(iterator, total=num): + feats.append(f) + + del iterator + del generator + + feats = np.concatenate(feats) + + print(feats.shape) + + os.makedirs(args.save_dir, exist_ok=True) + # np.save(feat_path, feats) + + gc.collect() + torch.cuda.empty_cache() + + reload = False + for spec in faiss_specs: + print("Processing spec", spec) + + if reload: + print("Reloading...") + del feats + gc.collect() + feats = np.load(feat_path + ".npy") + + save_path = osp.join(args.save_dir, spec.spec_str) + os.makedirs(save_path, exist_ok=True) + d = feats.shape[-1] + x = feats + if spec.pca > 0: + print("Computing PCA") + pca = faiss.PCAMatrix(d, spec.pca) + pca.train(x) + d = spec.pca + b = faiss.vector_to_array(pca.b) + A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in) + np.save(osp.join(save_path, "pca_A"), A.T) + np.save(osp.join(save_path, "pca_b"), b) + print("Applying PCA") + x = pca.apply_py(x) + + if spec.norm: + reload = spec.pca <= 0 + print("Normalizing") + faiss.normalize_L2(x) + + print("Computing kmeans") + kmeans = faiss.Kmeans( + d, + spec.n_clus, + niter=50, + verbose=True, + spherical=spec.sphere, + max_points_per_centroid=feats.shape[0], + gpu=True, + nredo=3, + ) + kmeans.train(x) + np.save(osp.join(save_path, "centroids"), kmeans.centroids) + del kmeans + del x + gc.collect() + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py b/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py new file mode 100644 index 0000000000..b07e274d20 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
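+# Usage sketch (assumed; paths are illustrative):
+#   python wav2vec_extract_features.py /manifests --split train \
+#       --save-dir /output/w2vu --checkpoint /models/wav2vec_big.pt --layer 14
+# Appends one feature matrix per utterance to <save-dir>/<split>.npy (via
+# NpyAppendArray), writes per-utterance frame counts to <split>.lengths, and
+# copies the .tsv/.wrd/.phn files next to them.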
+ +import argparse +import os +import os.path as osp +import tqdm +import torch +import torch.nn.functional as F +from shutil import copyfile + +from npy_append_array import NpyAppendArray + +import fairseq +import soundfile as sf + + +def get_parser(): + parser = argparse.ArgumentParser( + description="compute kmeans codebook from kaldi-computed feats" + ) + # fmt: off + parser.add_argument('data', help='location of tsv files') + parser.add_argument('--split', help='which split to read', required=True) + parser.add_argument('--save-dir', help='where to save the output', required=True) + parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec ctc model', required=True) + parser.add_argument('--layer', type=int, default=14, help='which layer to use') + # fmt: on + + return parser + + +class Wav2VecFeatureReader(object): + def __init__(self, cp_file, layer): + model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [cp_file] + ) + model = model[0] + model.eval() + model.cuda() + self.model = model + self.task = task + self.layer = layer + + def read_audio(self, fname): + """Load an audio file and return PCM along with the sample rate""" + wav, sr = sf.read(fname) + assert sr == 16e3 + + return wav + + def get_feats(self, loc): + x = self.read_audio(loc) + with torch.no_grad(): + source = torch.from_numpy(x).float().cuda() + if self.task.cfg.normalize: + assert source.dim() == 1, source.dim() + with torch.no_grad(): + source = F.layer_norm(source, source.shape) + source = source.view(1, -1) + + m_res = self.model(source=source, mask=False, features_only=True, layer=self.layer) + return m_res["x"].squeeze(0).cpu() + + +def get_iterator(args): + with open(osp.join(args.data, args.split) + ".tsv", "r") as fp: + lines = fp.read().split("\n") + root = lines.pop(0).strip() + files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0] + + num = len(files) + reader = Wav2VecFeatureReader(args.checkpoint, args.layer) + + def iterate(): + for fname in files: + w2v_feats = reader.get_feats(fname) + yield w2v_feats + + return iterate, num + + +def main(): + parser = get_parser() + args = parser.parse_args() + + os.makedirs(args.save_dir, exist_ok=True) + + def create_files(dest): + copyfile(osp.join(args.data, args.split) + ".tsv", dest + ".tsv") + if osp.exists(osp.join(args.data, args.split) + ".wrd"): + copyfile(osp.join(args.data, args.split) + ".wrd", dest + ".wrd") + if osp.exists(osp.join(args.data, args.split) + ".phn"): + copyfile(osp.join(args.data, args.split) + ".phn", dest + ".phn") + + if osp.exists(dest + ".npy"): + os.remove(dest + ".npy") + npaa = NpyAppendArray(dest + ".npy") + return npaa + + save_path = osp.join(args.save_dir, args.split) + npaa = create_files(save_path) + + generator, num = get_iterator(args) + iterator = generator() + + with open(save_path + ".lengths", "w") as l_f: + for w2v_feats in tqdm.tqdm(iterator, total=num): + print(len(w2v_feats), file=l_f) + + if len(w2v_feats) > 0: + npaa.append(w2v_feats.numpy()) + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/scripts/wer.py b/examples/wav2vec/unsupervised/scripts/wer.py new file mode 100644 index 0000000000..613ab50d39 --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/wer.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
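+# Usage sketch (assumed; file names are illustrative):
+#   python wer.py -s decode/hypo.units -r data/valid.phn
+# The two files are read line by line in parallel, so hypotheses and references
+# must be aligned one per line; the script reports the resulting unit error rate.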
+ +""" +Implement unsupervised metric for decoding hyperparameter selection: + $$ alpha * LM_PPL + ViterbitUER(%) * 100 $$ +""" +import argparse +import logging +import sys + +import editdistance + +logging.root.setLevel(logging.INFO) +logging.basicConfig(stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger(__name__) + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("-s", "--hypo", help="hypo transcription", required=True) + parser.add_argument( + "-r", "--reference", help="reference transcription", required=True + ) + return parser + + +def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p): + d_cnt = 0 + w_cnt = 0 + w_cnt_h = 0 + for uid in hyp_uid_to_tra: + ref = ref_uid_to_tra[uid].split() + if g2p is not None: + hyp = g2p(hyp_uid_to_tra[uid]) + hyp = [p for p in hyp if p != "'" and p != " "] + hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp] + else: + hyp = hyp_uid_to_tra[uid].split() + d_cnt += editdistance.eval(ref, hyp) + w_cnt += len(ref) + w_cnt_h += len(hyp) + wer = float(d_cnt) / w_cnt + logger.debug( + ( + f"wer = {wer * 100:.2f}%; num. of ref words = {w_cnt}; " + f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}" + ) + ) + return wer + + +def main(): + args = get_parser().parse_args() + + errs = 0 + count = 0 + with open(args.hypo, "r") as hf, open(args.reference, "r") as rf: + for h, r in zip(hf, rf): + h = h.rstrip().split() + r = r.rstrip().split() + errs += editdistance.eval(r, h) + count += len(r) + + logger.info(f"UER: {errs / count * 100:.2f}%") + + +if __name__ == "__main__": + main() + + +def load_tra(tra_path): + with open(tra_path, "r") as f: + uid_to_tra = {} + for line in f: + uid, tra = line.split(None, 1) + uid_to_tra[uid] = tra + logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}") + return uid_to_tra diff --git a/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py b/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py new file mode 100644 index 0000000000..f83471409a --- /dev/null +++ b/examples/wav2vec/unsupervised/scripts/wrd_to_ltr.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys + + +def main(): + for line in sys.stdin: + print(" ".join(list(line.strip().replace(" ", "|"))) + " |") + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec/unsupervised/tasks/__init__.py b/examples/wav2vec/unsupervised/tasks/__init__.py new file mode 100644 index 0000000000..6d7dd625e0 --- /dev/null +++ b/examples/wav2vec/unsupervised/tasks/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .unpaired_audio_text import UnpairedAudioText + + +__all__ = [ + "UnpairedAudioText", +] diff --git a/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py b/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py new file mode 100644 index 0000000000..b6b65d5c49 --- /dev/null +++ b/examples/wav2vec/unsupervised/tasks/unpaired_audio_text.py @@ -0,0 +1,452 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. 
An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +from dataclasses import dataclass, field +import logging +import math +import os +from typing import Optional +import torch + +from fairseq.logging import metrics +from fairseq.tasks import FairseqTask, register_task +from ..data import ExtractedFeaturesDataset, RandomInputDataset + +from fairseq.data import ( + Dictionary, + data_utils, + StripTokenDataset, +) +from fairseq.dataclass import FairseqDataclass +from fairseq.distributed.utils import get_data_parallel_world_size +from omegaconf import MISSING + +from examples.speech_recognition.kaldi.kaldi_decoder import ( + KaldiDecoder, + KaldiDecoderConfig, +) + + +logger = logging.getLogger(__name__) + + +@dataclass +class DecodingConfig(FairseqDataclass): + kenlm_path: Optional[str] = None + lm_weight: float = 0 + blank_weight: float = 0 + + +@dataclass +class UnpairedAudioTextConfig(FairseqDataclass): + data: str = field( + default=MISSING, metadata={"help": "path to data directory containing audio"} + ) + text_data: str = field( + default=MISSING, metadata={"help": "path to data directory containing text"} + ) + max_length: Optional[int] = None + labels: Optional[str] = field( + default=None, + metadata={"help": "extension of the label file to load, used for fine-tuning"}, + ) + aux_target_postfix: Optional[str] = field( + default=None, + metadata={"help": "auxaliry target filename extension"}, + ) + unfiltered: bool = field( + default=False, metadata={"help": "load data with _unfiltered suffix"} + ) + ctc_eval: bool = field( + default=False, metadata={"help": "eval UER as if computed by CTC"} + ) + sort_by_length: bool = field( + default=True, metadata={"help": "sort examples by length of audio timesteps"} + ) + shuffle: bool = field(default=True, metadata={"help": "shuffle examples"}) + append_eos: bool = field(default=False, metadata={"help": "append eos"}) + uppercase: Optional[bool] = field( + default=False, metadata={"help": "uppercase for LM score computation"} + ) + skipwords: Optional[str] = field( + default="", + metadata={ + "help": "comma-separated words to be removed for LM score computation" + }, + ) + kenlm_path: Optional[str] = None + vocab_usage_power: float = 2 + + word_decoder_config: Optional[KaldiDecoderConfig] = None + word_kenlm_path: Optional[str] = None + + decoding_config: DecodingConfig = DecodingConfig() + + +@register_task("unpaired_audio_text", dataclass=UnpairedAudioTextConfig) +class UnpairedAudioText(FairseqTask): + """ """ + + cfg: UnpairedAudioTextConfig + + def __init__( + self, + cfg: UnpairedAudioTextConfig, + source_dictionary=None, + target_dictionary=None, + ): + super().__init__(cfg) + + self._target_dictionary = target_dictionary + self._source_dictionary = source_dictionary + self.num_symbols = ( + len([s for s in target_dictionary.symbols if not s.startswith("madeup")]) + - target_dictionary.nspecial + ) + self.sil_id = ( + target_dictionary.index("<SIL>") if "<SIL>" in target_dictionary else -1 + ) + self.kenlm = None + if cfg.kenlm_path is not None: + import kenlm + + self.kenlm = kenlm.Model(cfg.kenlm_path) + + self.word_kenlm = None + if cfg.word_kenlm_path is not None: + import kenlm + + self.word_kenlm = kenlm.Model(cfg.word_kenlm_path) + + self.uppercase = cfg.uppercase + self.skipwords = set(cfg.skipwords.split(",")) + + def str_postprocess(s): + s = " ".join(w for w in s.split() if w not in self.skipwords) + s = s.upper() if self.uppercase else s + return s + + self.str_postprocess = 
str_postprocess + self.compute_lm_score = lambda s: self.kenlm.score(self.str_postprocess(s)) + + self.compute_word_score = None + if cfg.word_decoder_config is not None: + self.kaldi_decoder = KaldiDecoder(cfg.word_decoder_config, beam=10) + + def compute_word_score(logits, padding): + res = self.kaldi_decoder.decode(logits, padding) + for r in res: + r = r.result() + assert len(r) == 1 + r = r[0] + yield r["score"], r["words"] + + self.compute_word_score = compute_word_score + + @classmethod + def setup_task(cls, cfg: UnpairedAudioTextConfig, **kwargs): + """Setup the task (e.g., load dictionaries). + + Args: + cfg (AudioPretrainingConfig): configuration of this task + """ + + dict_path = os.path.join(cfg.text_data, "dict.txt") + if os.path.exists(dict_path): + target_dictionary = Dictionary.load(dict_path) + else: + dict_path = os.path.join(cfg.data, f"dict.{cfg.labels}.txt") + target_dictionary = Dictionary.load(dict_path) + + return cls(cfg, target_dictionary=target_dictionary) + + def optimizer_step(self, optimizer, model, update_num): + if hasattr(model, "get_groups_for_update"): + groups = model.get_groups_for_update(update_num) + optimizer.step(groups={groups}) + else: + optimizer.step() + + def valid_step(self, sample, model, criterion): + res = model( + **sample["net_input"], + dense_x_only=True, + ) + + dense_x = res["logits"] + padding_mask = res["padding_mask"] + + word_scores = None + if self.compute_word_score is not None: + word_scores = self.compute_word_score(dense_x.cpu(), padding_mask.cpu()) + + z = dense_x.argmax(-1) + z[padding_mask] = self.target_dictionary.pad() + + vocab_seen = torch.zeros(self.num_symbols, dtype=torch.bool) + + import editdistance + + c_err = 0 + c_len = 0 + pred_c_len = 0 + lm_score_sum = 0 + for i, (x, t, id) in enumerate( + zip( + z, + sample["target"] if "target" in sample else [None] * len(z), + sample["id"], + ) + ): + + if t is not None: + t = t[(t >= self.target_dictionary.nspecial)] + x = x[ + (x >= self.target_dictionary.nspecial) + & (x < (self.num_symbols + self.target_dictionary.nspecial)) + ] + if self.sil_id >= 0: + x = x[x != self.sil_id] + + vocab_seen[x - self.target_dictionary.nspecial] = True + + pred_units_arr = x + if self.cfg.ctc_eval: + pred_units_arr = pred_units_arr.unique_consecutive() + pred_units_arr = pred_units_arr[pred_units_arr != 0] + + if id == 0: + if t is not None: + logger.info(f"REF: {self.target_dictionary.string(t)}") + logger.info(f"HYP: {self.target_dictionary.string(pred_units_arr)}") + + if self.kenlm is not None: + if t is not None: + ref_lm_s = self.compute_lm_score( + self.target_dictionary.string(t) + ) + logger.info( + f"LM [REF]: {ref_lm_s}, {math.pow(10, -ref_lm_s / (len(t) + 1))}" + ) + + hyp_lm_s = self.compute_lm_score( + self.target_dictionary.string(pred_units_arr) + ) + logger.info( + f"LM [HYP]: {hyp_lm_s}, {math.pow(10, -hyp_lm_s / (len(pred_units_arr) + 1))}" + ) + + pred_units_arr = pred_units_arr.tolist() + + pred_c_len += len(pred_units_arr) + + if t is not None: + t = t.tolist() + c_err += editdistance.eval(pred_units_arr, t) + c_len += len(t) + else: + c_len = pred_c_len + + if self.kenlm is not None: + pred_str = self.target_dictionary.string(pred_units_arr) + lm_score = self.compute_lm_score(pred_str) + lm_score_sum += lm_score + + kaldi_score_sum = 0 + word_lm_sum = 0 + num_words = 0 + if word_scores is not None: + for score, words in word_scores: + kaldi_score_sum += score + num_words += len(words) + if self.word_kenlm is not None: + word_lm_sum += self.kenlm.score(" 
".join(words)) + + try: + world_size = get_data_parallel_world_size() + except: + world_size = 1 + + logging_output = { + "loss": c_err, + "_num_char_errors": c_err, + "_num_chars": c_len, + "_num_pred_chars": pred_c_len, + "ntokens": c_len, + "nsentences": z.size(0), + "sample_size": c_len, + "_world_size": world_size, + "_lm_score_sum": lm_score_sum, + "_kaldi_score_sum": kaldi_score_sum, + "_word_lm_sum": word_lm_sum, + "_num_words": num_words, + "_vocab_seen": vocab_seen, + } + + return c_err, c_len, logging_output + + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + data_path = self.cfg.data + task_cfg = task_cfg or self.cfg + + has_unpaired_text = os.path.exists( + os.path.join(self.cfg.text_data, f"{split}.idx") + ) + + self.datasets[split] = ExtractedFeaturesDataset( + path=data_path, + split=split, + min_length=3, + max_length=task_cfg.max_length, + labels=None if has_unpaired_text else task_cfg.labels, + label_dict=self.target_dictionary, + shuffle=getattr(task_cfg, "shuffle", True), + sort_by_length=task_cfg.sort_by_length, + aux_target_postfix=task_cfg.aux_target_postfix, + ) + + logger.info(f"split {split} has unpaired text? {has_unpaired_text}") + if has_unpaired_text: + text_dataset = data_utils.load_indexed_dataset( + os.path.join(self.cfg.text_data, split), self.target_dictionary + ) + text_dataset = StripTokenDataset(text_dataset, self.target_dictionary.eos()) + self.datasets[split] = RandomInputDataset( + self.datasets[split], + text_dataset, + ["random_label"], + add_to_input=True, + pad_idx=self.target_dictionary.pad(), + ) + + @property + def source_dictionary(self): + return self._source_dictionary + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self._target_dictionary + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return None + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + zero = torch.scalar_tensor(0.0) + num_char_errors = sum( + log.get("_num_char_errors", zero) for log in logging_outputs + ) + num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs) + num_word_errors = sum( + log.get("_num_word_errors", zero) for log in logging_outputs + ) + num_words = sum(log.get("_num_words", zero) for log in logging_outputs) + num_pred_chars = sum( + log.get("_num_pred_chars", zero) for log in logging_outputs + ) + + lm_score_sum = sum(log.get("_lm_score_sum", zero) for log in logging_outputs) + vocab_seen = ( + sum(log.get("_vocab_seen", zero) for log in logging_outputs) + .bool() + .sum() + .item() + ) + kaldi_score_sum = sum( + log.get("_kaldi_score_sum", zero) for log in logging_outputs + ) + word_lm_sum = sum(log.get("_word_lm_sum", zero) for log in logging_outputs) + + metrics.log_scalar_sum("_num_char_errors", num_char_errors) + metrics.log_scalar_sum("_num_chars", num_chars) + metrics.log_scalar_sum("_num_word_errors", num_word_errors) + metrics.log_scalar_sum("_num_words", num_words) + + metrics.log_scalar_sum("lm_score_sum", lm_score_sum) + metrics.log_scalar_sum("num_pred_chars", num_pred_chars) + + if self.cfg.word_kenlm_path is not None: + metrics.log_scalar_sum("kaldi_score_sum", kaldi_score_sum) + metrics.log_scalar_sum("word_lm_sum", word_lm_sum) + + if num_chars > 0: + metrics.log_derived( + "uer", + lambda meters: meters["_num_char_errors"].sum + * 100.0 + / meters["_num_chars"].sum + if meters["_num_chars"].sum > 0 + else 
float("nan"), + ) + + if lm_score_sum < 0 and vocab_seen > 0: + metrics.log_scalar("vocab_seen_pct", vocab_seen / self.num_symbols) + + metrics.log_derived( + "weighted_lm_ppl", + lambda meters: math.pow( + 10, + -meters["lm_score_sum"].sum + / ( + meters["num_pred_chars"].sum + meters["nsentences"].sum + ), # account for </s> + ) + / meters["vocab_seen_pct"].avg ** self.cfg.vocab_usage_power, + ) + + metrics.log_derived( + "lm_ppl", + lambda meters: math.pow( + 10, + -meters["lm_score_sum"].sum + / ( + meters["num_pred_chars"].sum + meters["nsentences"].sum + ), # account for </s> + ), + ) + else: + metrics.log_derived("weighted_lm_ppl", lambda meters: float("inf")) + + if num_words > 0: + if word_lm_sum != 0: + metrics.log_derived( + "word_lm_ppl", + lambda meters: math.pow( + 10, + -meters["word_lm_sum"].sum + / ( + meters["_num_words"].sum + meters["nsentences"].sum + ), # account for </s> + ), + ) + metrics.log_derived( + "weighted_word_lm_ppl", + lambda meters: math.pow( + 10, + -meters["word_lm_sum"].sum + / ( + meters["_num_words"].sum + meters["nsentences"].sum + ), # account for </s> + ) + / meters["vocab_seen_pct"].avg ** self.cfg.vocab_usage_power, + ) + + if self.cfg.word_kenlm_path is not None: + metrics.log_derived( + "kaldi_score", + lambda meters: meters["kaldi_score_sum"].sum + / meters["nsentences"].sum, + ) + + def build_model(self, cfg: FairseqDataclass, from_checkpoint=False): + model = super().build_model(cfg) + + return model diff --git a/examples/wav2vec/unsupervised/w2vu_generate.py b/examples/wav2vec/unsupervised/w2vu_generate.py new file mode 100644 index 0000000000..0611297a4f --- /dev/null +++ b/examples/wav2vec/unsupervised/w2vu_generate.py @@ -0,0 +1,714 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Run inference for pre-processed data with a trained model. +""" + +import ast +from collections import namedtuple +from dataclasses import dataclass, field +from enum import Enum, auto +import hydra +from hydra.core.config_store import ConfigStore +import logging +import math +import os +from omegaconf import OmegaConf +from typing import Optional +import sys + +import editdistance +import torch + +from hydra.core.hydra_config import HydraConfig + +from fairseq import checkpoint_utils, progress_bar, tasks, utils +from fairseq.data.data_utils import post_process +from fairseq.dataclass.configs import FairseqDataclass, FairseqConfig +from fairseq.logging.meters import StopwatchMeter +from omegaconf import open_dict + +from examples.speech_recognition.kaldi.kaldi_decoder import KaldiDecoderConfig + +logging.root.setLevel(logging.INFO) +logging.basicConfig(stream=sys.stdout, level=logging.INFO) +logger = logging.getLogger(__name__) + + +class DecoderType(Enum): + VITERBI = auto() + KENLM = auto() + FAIRSEQ = auto() + KALDI = auto() + + +@dataclass +class UnsupGenerateConfig(FairseqDataclass): + fairseq: FairseqConfig = FairseqConfig() + lm_weight: float = field( + default=2.0, + metadata={"help": "language model weight"}, + ) + w2l_decoder: DecoderType = field( + default=DecoderType.VITERBI, + metadata={"help": "type of decoder to use"}, + ) + kaldi_decoder_config: Optional[KaldiDecoderConfig] = None + lexicon: Optional[str] = field( + default=None, + metadata={ + "help": "path to lexicon. 
This is also used to 'phonemize' for unsupvised param tuning" + }, + ) + lm_model: Optional[str] = field( + default=None, + metadata={"help": "path to language model (kenlm or fairseq)"}, + ) + decode_stride: Optional[float] = field( + default=None, + metadata={"help": "changing the decoding frequency of the generator"}, + ) + unit_lm: bool = field( + default=False, + metadata={"help": "whether to use unit lm"}, + ) + beam_threshold: float = field( + default=50.0, + metadata={"help": "beam score threshold"}, + ) + beam_size_token: float = field( + default=100.0, + metadata={"help": "max tokens per beam"}, + ) + beam: int = field( + default=5, + metadata={"help": "decoder beam size"}, + ) + nbest: int = field( + default=1, + metadata={"help": "number of results to return"}, + ) + word_score: float = field( + default=1.0, + metadata={"help": "word score to add at end of word"}, + ) + unk_weight: float = field( + default=-math.inf, + metadata={"help": "unknown token weight"}, + ) + sil_weight: float = field( + default=0.0, + metadata={"help": "silence token weight"}, + ) + targets: Optional[str] = field( + default=None, + metadata={"help": "extension of ground truth labels to compute UER"}, + ) + results_path: Optional[str] = field( + default=None, + metadata={"help": "where to store results"}, + ) + post_process: Optional[str] = field( + default=None, + metadata={"help": "how to post process results"}, + ) + vocab_usage_power: float = field( + default=2, + metadata={"help": "for unsupervised param tuning"}, + ) + + viterbi_transcript: Optional[str] = field( + default=None, + metadata={"help": "for unsupervised param tuning"}, + ) + min_lm_ppl: float = field( + default=0, + metadata={"help": "for unsupervised param tuning"}, + ) + min_vt_uer: float = field( + default=0, + metadata={"help": "for unsupervised param tuning"}, + ) + + blank_weight: float = field( + default=0, + metadata={"help": "value to add or set for blank emission"}, + ) + blank_mode: str = field( + default="set", + metadata={ + "help": "can be add or set, how to modify blank emission with blank weight" + }, + ) + sil_is_blank: bool = field( + default=False, + metadata={"help": "if true, <SIL> token is same as blank token"}, + ) + + unsupervised_tuning: bool = field( + default=False, + metadata={ + "help": "if true, returns a score based on unsupervised param selection metric instead of UER" + }, + ) + is_ax: bool = field( + default=False, + metadata={ + "help": "if true, assumes we are using ax for tuning and returns a tuple for ax to consume" + }, + ) + + +def get_dataset_itr(cfg, task): + return task.get_batch_iterator( + dataset=task.dataset(cfg.fairseq.dataset.gen_subset), + max_tokens=cfg.fairseq.dataset.max_tokens, + max_sentences=cfg.fairseq.dataset.batch_size, + max_positions=(sys.maxsize, sys.maxsize), + ignore_invalid_inputs=cfg.fairseq.dataset.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=cfg.fairseq.dataset.required_batch_size_multiple, + num_shards=cfg.fairseq.dataset.num_shards, + shard_id=cfg.fairseq.dataset.shard_id, + num_workers=cfg.fairseq.dataset.num_workers, + data_buffer_size=cfg.fairseq.dataset.data_buffer_size, + ).next_epoch_itr(shuffle=False) + + +def process_predictions( + cfg: UnsupGenerateConfig, + hypos, + tgt_dict, + target_tokens, + res_files, +): + retval = [] + word_preds = [] + transcriptions = [] + dec_scores = [] + + for i, hypo in enumerate(hypos[: min(len(hypos), cfg.nbest)]): + if torch.is_tensor(hypo["tokens"]): + tokens = hypo["tokens"].int().cpu() + tokens = 
tokens[tokens >= tgt_dict.nspecial] + hyp_pieces = tgt_dict.string(tokens) + else: + hyp_pieces = " ".join(hypo["tokens"]) + + if "words" in hypo and len(hypo["words"]) > 0: + hyp_words = " ".join(hypo["words"]) + else: + hyp_words = post_process(hyp_pieces, cfg.post_process) + + to_write = {} + if res_files is not None: + to_write[res_files["hypo.units"]] = hyp_pieces + to_write[res_files["hypo.words"]] = hyp_words + + tgt_words = "" + if target_tokens is not None: + if isinstance(target_tokens, str): + tgt_pieces = tgt_words = target_tokens + else: + tgt_pieces = tgt_dict.string(target_tokens) + tgt_words = post_process(tgt_pieces, cfg.post_process) + + if res_files is not None: + to_write[res_files["ref.units"]] = tgt_pieces + to_write[res_files["ref.words"]] = tgt_words + + if not cfg.fairseq.common_eval.quiet: + logger.info(f"HYPO {i}:" + hyp_words) + if tgt_words: + logger.info("TARGET:" + tgt_words) + + if "am_score" in hypo and "lm_score" in hypo: + logger.info( + f"DECODER AM SCORE: {hypo['am_score']}, DECODER LM SCORE: {hypo['lm_score']}, DECODER SCORE: {hypo['score']}" + ) + elif "score" in hypo: + logger.info(f"DECODER SCORE: {hypo['score']}") + + logger.info("___________________") + + hyp_words_arr = hyp_words.split() + tgt_words_arr = tgt_words.split() + + retval.append( + ( + editdistance.eval(hyp_words_arr, tgt_words_arr), + len(hyp_words_arr), + len(tgt_words_arr), + hyp_pieces, + hyp_words, + ) + ) + word_preds.append(hyp_words_arr) + transcriptions.append(to_write) + dec_scores.append(-hypo.get("score", 0)) # negate cuz kaldi returns NLL + + if len(retval) > 1: + best = None + for r, t in zip(retval, transcriptions): + if best is None or r[0] < best[0][0]: + best = r, t + for dest, tran in best[1].items(): + print(tran, file=dest) + dest.flush() + return best[0] + + assert len(transcriptions) == 1 + for dest, tran in transcriptions[0].items(): + print(tran, file=dest) + + return retval[0] + + +def prepare_result_files(cfg: UnsupGenerateConfig): + def get_res_file(file_prefix): + if cfg.fairseq.dataset.num_shards > 1: + file_prefix = f"{cfg.fairseq.dataset.shard_id}_{file_prefix}" + path = os.path.join( + cfg.results_path, + "{}{}.txt".format( + cfg.fairseq.dataset.gen_subset, + file_prefix, + ), + ) + return open(path, "w", buffering=1) + + if not cfg.results_path: + return None + + return { + "hypo.words": get_res_file(""), + "hypo.units": get_res_file("_units"), + "ref.words": get_res_file("_ref"), + "ref.units": get_res_file("_ref_units"), + "hypo.nbest.words": get_res_file("_nbest_words"), + } + + +def optimize_models(cfg: UnsupGenerateConfig, use_cuda, models): + """Optimize ensemble for generation""" + for model in models: + model.eval() + if cfg.fairseq.common.fp16: + model.half() + if use_cuda: + model.cuda() + + +GenResult = namedtuple( + "GenResult", + [ + "count", + "errs_t", + "gen_timer", + "lengths_hyp_unit_t", + "lengths_hyp_t", + "lengths_t", + "lm_score_t", + "num_feats", + "num_sentences", + "num_symbols", + "vt_err_t", + "vt_length_t", + ], +) + + +def generate(cfg: UnsupGenerateConfig, models, saved_cfg, use_cuda): + task = tasks.setup_task(cfg.fairseq.task) + saved_cfg.task.labels = cfg.fairseq.task.labels + task.load_dataset(cfg.fairseq.dataset.gen_subset, task_cfg=saved_cfg.task) + # Set dictionary + tgt_dict = task.target_dictionary + logger.info( + "| {} {} {} examples".format( + cfg.fairseq.task.data, + cfg.fairseq.dataset.gen_subset, + len(task.dataset(cfg.fairseq.dataset.gen_subset)), + ) + ) + # Load dataset (possibly sharded) + itr = 
get_dataset_itr(cfg, task) + # Initialize generator + gen_timer = StopwatchMeter() + + def build_generator(cfg: UnsupGenerateConfig): + w2l_decoder = cfg.w2l_decoder + if w2l_decoder == DecoderType.VITERBI: + from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder + + return W2lViterbiDecoder(cfg, task.target_dictionary) + elif w2l_decoder == DecoderType.KENLM: + from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder + + return W2lKenLMDecoder(cfg, task.target_dictionary) + elif w2l_decoder == DecoderType.FAIRSEQ: + from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder + + return W2lFairseqLMDecoder(cfg, task.target_dictionary) + elif w2l_decoder == DecoderType.KALDI: + from examples.speech_recognition.kaldi.kaldi_decoder import KaldiDecoder + + assert cfg.kaldi_decoder_config is not None + + return KaldiDecoder( + cfg.kaldi_decoder_config, + cfg.beam, + ) + else: + raise NotImplementedError( + "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment but found " + + str(w2l_decoder) + ) + + generator = build_generator(cfg) + + kenlm = None + fairseq_lm = None + if cfg.lm_model is not None: + import kenlm + + kenlm = kenlm.Model(cfg.lm_model) + + num_sentences = 0 + if cfg.results_path is not None and not os.path.exists(cfg.results_path): + os.makedirs(cfg.results_path) + + res_files = prepare_result_files(cfg) + errs_t = 0 + lengths_hyp_t = 0 + lengths_hyp_unit_t = 0 + lengths_t = 0 + count = 0 + num_feats = 0 + all_hyp_pieces = [] + all_hyp_words = [] + + num_symbols = ( + len([s for s in tgt_dict.symbols if not s.startswith("madeup")]) + - tgt_dict.nspecial + ) + targets = None + if cfg.targets is not None: + tgt_path = os.path.join( + cfg.fairseq.task.data, cfg.fairseq.dataset.gen_subset + "." 
+ cfg.targets + ) + if os.path.exists(tgt_path): + with open(tgt_path, "r") as f: + targets = f.read().splitlines() + viterbi_transcript = None + if cfg.viterbi_transcript is not None and len(cfg.viterbi_transcript) > 0: + logger.info(f"loading viterbi transcript from {cfg.viterbi_transcript}") + with open(cfg.viterbi_transcript, "r") as vf: + viterbi_transcript = vf.readlines() + viterbi_transcript = [v.rstrip().split() for v in viterbi_transcript] + + gen_timer.start() + + start = 0 + end = len(itr) + + hypo_futures = None + if cfg.w2l_decoder == DecoderType.KALDI: + logger.info("Extracting features") + hypo_futures = [] + samples = [] + with progress_bar.build_progress_bar(cfg.fairseq.common, itr) as t: + for i, sample in enumerate(t): + if "net_input" not in sample or i < start or i >= end: + continue + if "padding_mask" not in sample["net_input"]: + sample["net_input"]["padding_mask"] = None + + hypos, num_feats = gen_hypos( + generator, models, num_feats, sample, task, use_cuda + ) + hypo_futures.append(hypos) + samples.append(sample) + itr = list(zip(hypo_futures, samples)) + start = 0 + end = len(itr) + logger.info("Finished extracting features") + + with progress_bar.build_progress_bar(cfg.fairseq.common, itr) as t: + for i, sample in enumerate(t): + if i < start or i >= end: + continue + + if hypo_futures is not None: + hypos, sample = sample + hypos = [h.result() for h in hypos] + else: + if "net_input" not in sample: + continue + + hypos, num_feats = gen_hypos( + generator, models, num_feats, sample, task, use_cuda + ) + + for i, sample_id in enumerate(sample["id"].tolist()): + if targets is not None: + target_tokens = targets[sample_id] + elif "target" in sample or "target_label" in sample: + toks = ( + sample["target"][i, :] + if "target_label" not in sample + else sample["target_label"][i, :] + ) + + target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu() + else: + target_tokens = None + + # Process top predictions + ( + errs, + length_hyp, + length, + hyp_pieces, + hyp_words, + ) = process_predictions( + cfg, + hypos[i], + tgt_dict, + target_tokens, + res_files, + ) + errs_t += errs + lengths_hyp_t += length_hyp + lengths_hyp_unit_t += ( + len(hyp_pieces) if len(hyp_pieces) > 0 else len(hyp_words) + ) + lengths_t += length + count += 1 + all_hyp_pieces.append(hyp_pieces) + all_hyp_words.append(hyp_words) + + num_sentences += ( + sample["nsentences"] if "nsentences" in sample else sample["id"].numel() + ) + + lm_score_sum = 0 + if kenlm is not None: + + if cfg.unit_lm: + lm_score_sum = sum(kenlm.score(w) for w in all_hyp_pieces) + else: + lm_score_sum = sum(kenlm.score(w) for w in all_hyp_words) + elif fairseq_lm is not None: + lm_score_sum = sum(fairseq_lm.score([h.split() for h in all_hyp_words])[0]) + + vt_err_t = 0 + vt_length_t = 0 + if viterbi_transcript is not None: + unit_hyps = [] + if cfg.targets is not None and cfg.lexicon is not None: + lex = {} + with open(cfg.lexicon, "r") as lf: + for line in lf: + items = line.rstrip().split() + lex[items[0]] = items[1:] + for h in all_hyp_pieces: + hyp_ws = [] + for w in h.split(): + assert w in lex, w + hyp_ws.extend(lex[w]) + unit_hyps.append(hyp_ws) + + else: + unit_hyps.extend([h.split() for h in all_hyp_words]) + + vt_err_t = sum( + editdistance.eval(vt, h) for vt, h in zip(viterbi_transcript, unit_hyps) + ) + + vt_length_t = sum(len(h) for h in viterbi_transcript) + + if res_files is not None: + for r in res_files.values(): + r.close() + + gen_timer.stop(lengths_hyp_t) + + return GenResult( + count, + errs_t, 
+ gen_timer, + lengths_hyp_unit_t, + lengths_hyp_t, + lengths_t, + lm_score_sum, + num_feats, + num_sentences, + num_symbols, + vt_err_t, + vt_length_t, + ) + + +def gen_hypos(generator, models, num_feats, sample, task, use_cuda): + sample = utils.move_to_cuda(sample) if use_cuda else sample + + if "features" in sample["net_input"]: + sample["net_input"]["dense_x_only"] = True + num_feats += ( + sample["net_input"]["features"].shape[0] + * sample["net_input"]["features"].shape[1] + ) + hypos = task.inference_step(generator, models, sample, None) + return hypos, num_feats + + +def main(cfg: UnsupGenerateConfig, model=None): + if ( + cfg.fairseq.dataset.max_tokens is None + and cfg.fairseq.dataset.batch_size is None + ): + cfg.fairseq.dataset.max_tokens = 1024000 + + use_cuda = torch.cuda.is_available() and not cfg.fairseq.common.cpu + + task = tasks.setup_task(cfg.fairseq.task) + + overrides = ast.literal_eval(cfg.fairseq.common_eval.model_overrides) + + if cfg.fairseq.task._name == "unpaired_audio_text": + overrides["model"] = { + "blank_weight": cfg.blank_weight, + "blank_mode": cfg.blank_mode, + "blank_is_sil": cfg.sil_is_blank, + "no_softmax": True, + "segmentation": { + "type": "NONE", + }, + } + else: + overrides["model"] = { + "blank_weight": cfg.blank_weight, + "blank_mode": cfg.blank_mode, + } + + if cfg.decode_stride: + overrides["model"]["generator_stride"] = cfg.decode_stride + + if model is None: + # Load ensemble + logger.info("| loading model(s) from {}".format(cfg.fairseq.common_eval.path)) + models, saved_cfg = checkpoint_utils.load_model_ensemble( + cfg.fairseq.common_eval.path.split("\\"), + arg_overrides=overrides, + task=task, + suffix=cfg.fairseq.checkpoint.checkpoint_suffix, + strict=(cfg.fairseq.checkpoint.checkpoint_shard_count == 1), + num_shards=cfg.fairseq.checkpoint.checkpoint_shard_count, + ) + optimize_models(cfg, use_cuda, models) + else: + models = [model] + saved_cfg = cfg.fairseq + + with open_dict(saved_cfg.task): + saved_cfg.task.shuffle = False + saved_cfg.task.sort_by_length = False + + gen_result = generate(cfg, models, saved_cfg, use_cuda) + + wer = None + if gen_result.lengths_t > 0: + wer = gen_result.errs_t * 100.0 / gen_result.lengths_t + logger.info(f"WER: {wer}") + + lm_ppl = float("inf") + + if gen_result.lm_score_t != 0 and gen_result.lengths_hyp_t > 0: + hyp_len = gen_result.lengths_hyp_t + lm_ppl = math.pow( + 10, -gen_result.lm_score_t / (hyp_len + gen_result.num_sentences) + ) + logger.info(f"LM PPL: {lm_ppl}") + + logger.info( + "| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}" + " sentences/s, {:.2f} tokens/s)".format( + gen_result.num_sentences, + gen_result.gen_timer.n, + gen_result.gen_timer.sum, + gen_result.num_sentences / gen_result.gen_timer.sum, + 1.0 / gen_result.gen_timer.avg, + ) + ) + + vt_diff = None + if gen_result.vt_length_t > 0: + vt_diff = gen_result.vt_err_t / gen_result.vt_length_t + vt_diff = max(cfg.min_vt_uer, vt_diff) + + lm_ppl = max(cfg.min_lm_ppl, lm_ppl) + + if not cfg.unsupervised_tuning: + weighted_score = wer + else: + weighted_score = math.log(lm_ppl) * (vt_diff or 1.0) + + res = ( + f"| Generate {cfg.fairseq.dataset.gen_subset} with beam={cfg.beam}, " + f"lm_weight={cfg.kaldi_decoder_config.acoustic_scale if cfg.kaldi_decoder_config else cfg.lm_weight}, " + f"word_score={cfg.word_score}, sil_weight={cfg.sil_weight}, blank_weight={cfg.blank_weight}, " + f"WER: {wer}, LM_PPL: {lm_ppl}, num feats: {gen_result.num_feats}, " + f"length: {gen_result.lengths_hyp_t}, UER to viterbi: {(vt_diff or 0) * 
100}, score: {weighted_score}" + ) + + logger.info(res) + # print(res) + + return task, weighted_score + + +@hydra.main( + config_path=os.path.join("../../..", "fairseq", "config"), config_name="config" +) +def hydra_main(cfg): + with open_dict(cfg): + # make hydra logging work with ddp (see # see https://github.com/facebookresearch/hydra/issues/1126) + cfg.job_logging_cfg = OmegaConf.to_container( + HydraConfig.get().job_logging, resolve=True + ) + + cfg = OmegaConf.create( + OmegaConf.to_container(cfg, resolve=False, enum_to_str=False) + ) + OmegaConf.set_struct(cfg, True) + logger.info(cfg) + + utils.import_user_module(cfg.fairseq.common) + + _, score = main(cfg) + + if cfg.is_ax: + return score, None + return score + + +def cli_main(): + try: + from hydra._internal.utils import get_args + + cfg_name = get_args().config_name or "config" + except: + logger.warning("Failed to get config name from hydra args") + cfg_name = "config" + + cs = ConfigStore.instance() + cs.store(name=cfg_name, node=UnsupGenerateConfig) + hydra_main() + + +if __name__ == "__main__": + cli_main() diff --git a/examples/wav2vec/vq-wav2vec_featurize.py b/examples/wav2vec/vq-wav2vec_featurize.py index baabc1d365..627072ee17 100644 --- a/examples/wav2vec/vq-wav2vec_featurize.py +++ b/examples/wav2vec/vq-wav2vec_featurize.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. """ -Helper script to pre-compute embeddings for a wav2letter++ dataset +Helper script to pre-compute embeddings for a flashlight (previously called wav2letter++) dataset """ import argparse @@ -16,8 +16,7 @@ import soundfile as sf import torch -import tqdm -from fairseq.models.wav2vec.wav2vec import Wav2VecModel +import fairseq from torch import nn from torch.utils.data import DataLoader @@ -211,13 +210,11 @@ def load_data(self, fnames): return loader def load_model(self): - cp = torch.load(self.checkpoint, map_location=lambda x, _: x) + model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([self.checkpoint]) + model = model[0] - model = Wav2VecModel.build_model(cp["args"], None) + self.quantize_location = getattr(cfg.model, "vq", "encoder") - self.quantize_location = getattr(cp["args"], "vq", "encoder") - - model.load_state_dict(cp["model"]) model.eval().float() model.cuda() diff --git a/examples/wav2vec/wav2vec_featurize.py b/examples/wav2vec/wav2vec_featurize.py index 9283930587..588268b708 100644 --- a/examples/wav2vec/wav2vec_featurize.py +++ b/examples/wav2vec/wav2vec_featurize.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. 
""" -Helper script to pre-compute embeddings for a wav2letter++ dataset +Helper script to pre-compute embeddings for a flashlight (previously called wav2letter++) dataset """ import argparse @@ -18,7 +18,7 @@ import soundfile as sf import torch import tqdm -from fairseq.models.wav2vec.wav2vec import Wav2VecModel +import fairseq from torch import nn @@ -35,10 +35,8 @@ class PretrainedWav2VecModel(nn.Module): def __init__(self, fname): super().__init__() - checkpoint = torch.load(fname) - self.args = checkpoint["args"] - model = Wav2VecModel.build_model(self.args, None) - model.load_state_dict(checkpoint["model"]) + model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([fname]) + model = model[0] model.eval() self.model = model @@ -54,7 +52,7 @@ def forward(self, x): class EmbeddingWriterConfig(argparse.ArgumentParser): def __init__(self): - super().__init__("Pre-compute embeddings for wav2letter++ datasets") + super().__init__("Pre-compute embeddings for flashlight datasets") kwargs = {"action": "store", "type": str, "required": True} @@ -69,7 +67,7 @@ def __init__(self): self.add_argument( "--no-copy-labels", action="store_true", - help="Do not copy label files. Useful for large datasets, use --targetdir in wav2letter then.", + help="Do not copy label files. Useful for large datasets, use --targetdir in flashlight then.", ) self.add_argument( "--use-feat", @@ -95,7 +93,7 @@ def __call__(self, x): class H5Writer: - """ Write features as hdf5 file in wav2letter++ compatible format """ + """ Write features as hdf5 file in flashlight compatible format """ def __init__(self, fname): self.fname = fname @@ -111,11 +109,11 @@ def write(self, data): class EmbeddingDatasetWriter(object): - """Given a model and a wav2letter++ dataset, pre-compute and store embeddings + """Given a model and a flashlight dataset, pre-compute and store embeddings Args: input_root, str : - Path to the wav2letter++ dataset + Path to the flashlight dataset output_root, str : Desired output directory. Will be created if non-existent split, str : diff --git a/examples/wav2vec/wav2vec_manifest.py b/examples/wav2vec/wav2vec_manifest.py index 1d27f58afc..9b8aa180e8 100644 --- a/examples/wav2vec/wav2vec_manifest.py +++ b/examples/wav2vec/wav2vec_manifest.py @@ -47,15 +47,24 @@ def get_parser(): def main(args): assert args.valid_percent >= 0 and args.valid_percent <= 1.0 + if not os.path.exists(args.dest): + os.makedirs(args.dest) + dir_path = os.path.realpath(args.root) search_path = os.path.join(dir_path, "**/*." 
+ args.ext) rand = random.Random(args.seed) - with open(os.path.join(args.dest, "train.tsv"), "w") as train_f, open( - os.path.join(args.dest, "valid.tsv"), "w" - ) as valid_f: + valid_f = ( + open(os.path.join(args.dest, "valid.tsv"), "w") + if args.valid_percent > 0 + else None + ) + + with open(os.path.join(args.dest, "train.tsv"), "w") as train_f: print(dir_path, file=train_f) - print(dir_path, file=valid_f) + + if valid_f is not None: + print(dir_path, file=valid_f) for fname in glob.iglob(search_path, recursive=True): file_path = os.path.realpath(fname) @@ -68,6 +77,8 @@ def main(args): print( "{}\t{}".format(os.path.relpath(file_path, dir_path), frames), file=dest ) + if valid_f is not None: + valid_f.close() if __name__ == "__main__": diff --git a/examples/wav2vec/xlsr/README.md b/examples/wav2vec/xlsr/README.md new file mode 100644 index 0000000000..e0a7c4ef3f --- /dev/null +++ b/examples/wav2vec/xlsr/README.md @@ -0,0 +1,95 @@ +# XLS-R + +XLS-R is a set of large-scale models for self-supervised cross-lingual speech representation learning based on wav2vec 2.0. It was pretrained on 128 languages and approximately 436K hours of unlabeled speech data. With finetuning, these models achieve state of the art performance in speech translation, speech recognition and language identification. We evaluate the model across multiple benchmarks such as CoVoST-2 for speech translation, BABEL / MLS / CommonVoice / VoxPopuli for automatic speech recognition, and VoxLingua107 for language identification as we llas VoxCeleb1 for speaker identification. More details about this work can be found in our [paper](https://arxiv.org/pdf/2111.09296.pdf) and download links can be found below. + +Model | Link +|------|------ +XLS-R 300M | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_300m.pt) +XLS-R 1B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_960m_1000k.pt) +XLS-R 2B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_2B_1000k.pt) + +You can also download these models [here](https://huggingface.co/models?other=xls_r) and read more about it in the [blogpost](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) from Hugging Face. + +## Speech Translation Finetuned Models + +We multilingually finetune XLS-R models on [CoVoST 2](https://github.com/facebookresearch/covost), which has 21 +into-English and 15 out-of-English directions. + +Model | Directions | Link +|------|------|------ +XLS-R 300M | 21 langs → En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_21_en.pt) +XLS-R 300M | En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_en_15.pt) +XLS-R 1B | 21 langs → En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_21_en.pt) +XLS-R 1B | En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_en_15.pt) +XLS-R 2B | 21 langs → En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_21_en.pt) +XLS-R 2B | En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_en_15.pt) +XLS-R 2B | 21 langs → En + En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_22_16.pt) + +## ASR Finetuning + +You can refer the original wav2vec documentation on detailed instructions about how to finetune a pretrained model with CTC [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#fine-tune-a-pre-trained-model-with-ctc). 
Below is an example command and you can find the values for different hyperparameters to reproduce the results in our paper. + +```shell script +$ fairseq-hydra-train \ + distributed_training.distributed_port=$PORT \ + task.data=/path/to/data \ + model.w2v_path=/path/to/model.pt \ + --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \ + --config-name finetune +``` + +For finetuning the 300M as well as 1B model, we use the same hyperparameter setting defined in `finetune.yaml`. We vary `optimization.max_update` as described in the below table and the `optimization.lr` is picked from the interval [2e-5, 3e-4] based on dev word error rate. + +Benchmark | Total Number of Updates +|------|------ +Babel | 26000 +Common Voice | 13000 +VoxPopuli | 50000 +MLS 10h | 20000 + +For finetuning the 2B model, we make some additional changes for `finetune.yaml` . We use the fully_sharded `distributed_training.ddp_backend` provided by the [fairscale](https://github.com/facebookresearch/fairscale) library and and set `model.activation_checkpoint` to true. We also increase `dataset.max_tokens` to 2560000 and use a total effective batch size of 2560000*24. We sweep for the best `optimization.lr` within the interval [3e−6,3e−5] using dev error rate. For common voice dataset, we pick the `model.mask_prob` for different languages among {0.30, 0.40} based on best dev error rate. + +## LID Inference + +Model | Link +|------|------ +XLS-R 300M + ft Voxlingua107 | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_300m_voxlingua107_ft.pt) + +How to run inference & calculate accuracy (step-by-step): +1. Download the Voxlingua107 checkpoint from the table above. +1. Use this python script to extract logit/embedding from the XLSR model: https://github.com/fairinternal/fairseq-py/blob/xlsr2/examples/wav2vec/gen_audio_embedding.py +```shell command +CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python3 examples/wav2vec/gen_audio_embedding.py \ + /fsx/data/VoxLingua107/manifest --path "/path/to/checkpoint.pt" \ + --task audio_classification --batch-size 90 --gen-subset test \ + --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \ + --infer-xtimes 10 --infer-max-sample-size 160000 --output-path /tmp/tmp_voxling_infer.npz +``` + +2. Calculate the overall accuracy, 0-5 seconds and 5-20 seconds: +```shell command +PYTHONPATH='.' 
python examples/wav2vec/eval_speaker_clf_task.py \ + --task cls --merge mean_logit --data /tmp/tmp_voxling_infer.npz + +Output: +| run classification evaluation +| acc = 94.34% -- err = 5.66% -- correct=1518 total=1609 +| acc 0to5 = 90.91% -- err = 9.09% -- c_5=230.0 t_5=253 +| acc 5to20 = 94.99% -- err = 5.01% -- c_20=1288.0 t_20=1356 +``` + +## Citation + +Please cite as: + +``` bibtex +@article{babu2021xlsr, + title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale}, + author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli}, + year={2021}, + volume={abs/2111.09296}, + journal={arXiv}, +} +``` + + diff --git a/examples/wav2vec/xlsr/config/finetune.yaml b/examples/wav2vec/xlsr/config/finetune.yaml new file mode 100644 index 0000000000..8736e101c5 --- /dev/null +++ b/examples/wav2vec/xlsr/config/finetune.yaml @@ -0,0 +1,66 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 1000 + save_interval_updates: 1000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: wer + +task: + _name: audio_finetuning + data: ??? + normalize: true + labels: ltr + +dataset: + num_workers: 6 + max_tokens: 1280000 + skip_invalid_size_inputs_valid_test: true + validate_after_updates: 10000 + validate_interval_updates: 1000 + valid_subset: valid + +distributed_training: + ddp_backend: legacy_ddp + distributed_world_size: 4 + +criterion: + _name: ctc + zero_infinity: true + +optimization: + max_update: ??? + lr: [0.0003] + sentence_avg: true + update_freq: [5] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-08 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.1, 0.4, 0.5] + final_lr_scale: 0.05 + +model: + _name: wav2vec_ctc + w2v_path: ??? + apply_mask: true + mask_prob: 0.75 + mask_channel_prob: 0.25 + mask_channel_length: 64 + layerdrop: 0.1 + activation_dropout: 0.1 + feature_grad_mult: 0.0 + freeze_finetune_updates: 10000 + + checkpoint_activations: false diff --git a/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py b/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py new file mode 100644 index 0000000000..16d07516f8 --- /dev/null +++ b/examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py @@ -0,0 +1,173 @@ +""" +Usage: + This scripts it to evaluate the classification accuracy/error rate from the embedding extracted + by gen_audio_embedding.py + Example (LID classification) + + PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \ + --data /fsx/androstj/exps/lid_voxlingua/infer/atj_xlsr2_100pct_300M_mean_fast_upd_100k_new.npz \ + --task cls --merge mean_logit +""" +import numpy as np +import sklearn +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.preprocessing import StandardScaler +from tqdm import tqdm +import ipdb +import logging +import argparse +from scipy.special import softmax + +log=logging.getLogger(__name__) +log.setLevel(logging.INFO) + +def calculate_eer(y_label, y_score): + # y denotes groundtruth scores, + # y_score denotes the prediction scores. + from scipy.optimize import brentq + from sklearn.metrics import roc_curve + from scipy.interpolate import interp1d + + fpr, tpr, thresholds = roc_curve(y_label, y_score, pos_label=1) + eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 
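+    # interp1d(fpr, thresholds) maps a false-positive rate back to the decision
+    # threshold that produces it; evaluating it at the EER (where FPR equals FNR)
+    # recovers the corresponding operating threshold.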
+ optimal_threshold = interp1d(fpr, thresholds)(eer) + return eer, optimal_threshold + +def calculate_minDCF(y_label, y_score, p_target=0.01, c_miss=1, c_fa=1): + # https://github.com/kaldi-asr/kaldi/blob/master/egs/sre08/v1/sid/compute_min_dcf.py + from sklearn.metrics import det_curve + fpr, fnr, thresholds = det_curve(y_label, y_score, pos_label=1) + min_c_det = float("inf") + min_c_det_threshold = thresholds[0] + for i in range(0, len(fpr)): + # See Equation (2). it is a weighted sum of false negative + # and false positive errors. + c_det = c_miss * fnr[i] * p_target + c_fa * fpr[i] * (1 - p_target) + if c_det < min_c_det: + min_c_det = c_det + min_c_det_threshold = thresholds[i] + # See Equations (3) and (4). Now we normalize the cost. + c_def = min(c_miss * p_target, c_fa * (1 - p_target)) + min_dcf = min_c_det / c_def + return min_dcf, min_c_det_threshold + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data', help='npz contains name & latent file') + parser.add_argument('--task', choices=['cls', 'veri', 'cls_voxlingua']) + parser.add_argument('--merge', choices=['mean_logit', 'first_logit', 'mean_latent_sim', 'first_latent_sim', 'mean_logit_sim', 'first_logit_sim']) + parser.add_argument('--veri-pair', help='verification file contains 1/0 utt_x utt_y') + parser.add_argument('--scaler', type=str, choices=['mean_var']) + parser.add_argument('--compress-method', choices=['pca']) + parser.add_argument('--compress-dim', type=int) + args = parser.parse_args() + + if args.task in ['cls', 'cls_voxlingua']: + print('| run classification evaluation') + data = np.load(args.data) + data_logit = data['logit'] + data_target = data['target'] + data_src_len = data['src_len'] + assert data_logit.shape[0] == data_target.shape[0] + B = data_logit.shape[0] + correct = 0 + total = 0 + data_prob = softmax(data_logit, axis=2) + correct_vs_len = np.empty((B, 2)) + for ii in range(B): + _target = data_target[ii] + if args.merge == 'mean_logit': + _prob = np.mean(data_prob[ii], axis=0) + top_1 = np.argmax(_prob) + elif args.merge == 'first_logit': + _prob = data_prob[ii][0] + top_1 = np.argmax(_prob) + else : + raise ValueError() + is_top_1 = (1 if top_1 == _target else 0) + correct += is_top_1 + total += 1 + _src_len = data_src_len[ii] / 16000 + correct_vs_len[ii] = [is_top_1, _src_len] + + acc = correct / total * 100 + t_5 = correct_vs_len[:, 1] <= 5 + t_20 = correct_vs_len[:, 1] > 5 + c_5 = correct_vs_len[t_5, 0].sum() + c_20 = correct_vs_len[t_20, 0].sum() + t_5 = t_5.sum() + t_20 = t_20.sum() + acc_5 = c_5 / t_5 * 100 + acc_20 = c_20 / t_20 * 100 + print(f'| acc = {acc:.2f}% -- err = {100-acc:.2f}% -- {correct=} {total=}') + print(f'| acc 0to5 = {acc_5:.2f}% -- err = {100-acc_5:.2f}% -- {c_5=} {t_5=}') + print(f'| acc 5to20 = {acc_20:.2f}% -- err = {100-acc_20:.2f}% -- {c_20=} {t_20=}') + + + + if args.task == 'veri': + print('| run verification evaluation') + veri_pairs = [] + with open(args.veri_pair) as ff: + for fi in ff: + a,b,c = fi.split() + a = int(a) + veri_pairs.append([a,b,c]) + + data = np.load(args.data) + if 'logit' in args.merge: + data_latent = data['logit'] + elif 'latent' in args.merge: + data_latent = data['latent'] + else : + raise ValueError() + + data_name = data['name'] + assert len(data_name) == len(data_latent) + map_name_latent = {} + + from sklearn.pipeline import make_pipeline + pipe = [] + if args.scaler == 'mean_var': + print(f'| apply StandardScaler') + pipe.append(StandardScaler()) + + if args.compress_method == 'pca': + 
n_comp = args.compress_dim + print(f'| apply PCA with {n_comp=}') + from sklearn.decomposition import PCA + pipe.append(PCA(n_components=n_comp)) + if len(pipe) > 0 : + pipe = make_pipeline(*pipe) + data_latent_2d = data_latent.reshape(-1, data_latent.shape[-1]) + pipe.fit(data_latent_2d) + data_latent_2d = pipe.transform(data_latent_2d) + data_latent = data_latent_2d.reshape(data_latent.shape[0], data_latent.shape[1], -1) + + for ii in range(len(data_name)): + map_name_latent[data_name[ii]] = data_latent[ii] + labels = [] + scores = [] + for lbl, pair_a, pair_b in tqdm(veri_pairs): + labels.append(lbl) + pair_a = map_name_latent[pair_a] + pair_b = map_name_latent[pair_b] + assert pair_a.ndim == pair_b.ndim == 2 + score = cosine_similarity(pair_a, pair_b) + if args.merge.startswith('mean'): + score = np.mean(score) + elif args.merge.startswith('first'): + score = score[0, 0] + else : + raise ValueError() + scores.append(score) + labels = np.array(labels) + scores = np.array(scores) + eer, eer_threshold = calculate_eer(labels, scores) + minDCF, minDCF_threshold = calculate_minDCF(labels, scores) + print('='*40) + print(f'| EER = {eer*100:.2f}%\tthreshold = {eer_threshold:.2f}') + print(f'| minDCF = {minDCF:.2f}\tthreshold = {minDCF_threshold:.2f}') + + diff --git a/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py b/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py new file mode 100644 index 0000000000..e5de1d5efd --- /dev/null +++ b/examples/wav2vec/xlsr/scripts/gen_audio_embedding.py @@ -0,0 +1,222 @@ +""" +Usage: + This script is used to extract the embedding / logit for speech classification task. + 1. Set fdir into your model checkpoint directory + 2. Run the following command (preferrably on GPU machine to speed up the inference process) + + CUDA_VISIBLE_DEVICES=0 python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \ + --task audio_classification --batch-size 90 --gen-subset test \ + --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \ + --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir + + Example: + Case: LID logit extraction + fdir='/fsx/androstj/exps/voxlingua_lid_train_all/ckpt_100pct_300m_voxling-act_linear-pool_mean_fast-lr_1e-4-phase_0.1_0.4_0.5-maxupd_100000-ufreq_1-mprob_0.5-fz_0-cr_softmax/0/checkpoints/checkpoint_best.pt' + python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \ + --task audio_classification --batch-size 90 --gen-subset test \ + --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \ + --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir + +""" +import torch +from fairseq import checkpoint_utils, distributed_utils, options, utils +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.logging import metrics, progress_bar +from fairseq import checkpoint_utils, data, options, tasks +from fairseq.data import FileAudioDataset, AddTargetDataset, Dictionary +from fairseq.tasks.audio_classification import LabelEncoder +import ipdb +import copy +import sys +from tqdm import tqdm +import tempfile +import numpy as np +import sklearn + +def subset_manifest(infer_manifest, veri_pair): + with open(infer_manifest) as ff, open(veri_pair) as gg, \ + tempfile.NamedTemporaryFile('w', delete=False) as ww: + fnames = ff.read().strip().split("\n") + basedir = fnames[0] + needed_fname = [] + for gi in gg.read().strip().split('\n'): + _, x1, x2 = gi.split() + needed_fname.append(x1) + needed_fname.append(x2) + 
needed_fname = set(needed_fname) + + ww.write(basedir+'\n') + for ii in range(1, len(fnames)): + x1,x2 = fnames[ii].split() + if x1 in needed_fname: + ww.write(fnames[ii]+'\n') + print(f'| subset manifest for verification: {ww.name}') + return ww.name + +def wrap_target_dataset(infer_manifest, dataset, task): + label_path = infer_manifest.replace(".tsv", ".label") + with open(label_path, "r") as f: + labels = f.read().strip().split("\n") + assert len(labels) == len(dataset) + process_label = LabelEncoder(task.target_dictionary) + dataset = AddTargetDataset(dataset, labels, + pad=task.target_dictionary.pad(), + eos=task.target_dictionary.eos(), + batch_targets=True, + process_label=process_label, + add_to_input=False) + return dataset + +def resample_data(source, padding_mask, n_sample, max_sample_len): + # source: BxT + # padding_mask: BxT + B = source.shape[0] + T = source.shape[1] + sources = [] + padding_masks = [] + seq_len = (~padding_mask).sum(1) + for jj in range(n_sample): + new_source = source.new_zeros(B, max_sample_len) + new_padding_mask = padding_mask.new_zeros(B, max_sample_len) + for ii in range(B): + if seq_len[ii] > max_sample_len: + start = np.random.randint(0, seq_len[ii]-max_sample_len+1) + end = start + max_sample_len + else : + start = 0 + end = seq_len[ii] + new_source[ii, 0:end-start] = source[ii, start:end] + new_padding_mask[ii, end-start+1:] = True + sources.append(new_source) + padding_masks.append(new_padding_mask) + return sources, padding_masks + +def resample_sample(sample, n_sample, max_sample_len): + new_sources, new_padding_masks = resample_data(sample['net_input']['source'], sample['net_input']['padding_mask'], n_sample, max_sample_len) + new_samples = [] + for ii in range(n_sample): + new_sample = copy.deepcopy(sample) + new_sample['net_input']['source'] = new_sources[ii] + new_sample['net_input']['padding_mask'] = new_padding_masks[ii] + new_samples.append(new_sample) + return new_samples + +if __name__ == '__main__': + np.random.seed(123) + # Parse command-line arguments for generation + parser = options.get_generation_parser(default_task='audio_classification') + # parser.add_argument('--infer-merge', type=str, default='mean') + parser.add_argument('--infer-xtimes', type=int, default=1) + parser.add_argument('--infer-max-sample-size', type=int, default=5*16000) # 5 secs + parser.add_argument('--infer-manifest', type=str) + parser.add_argument('--verification-pair', type=str, required=False, + help=''' + a file that contains pairs of utts to evaluated if they are from same speaker or not + format: (following voxceleb) + 1/0 <wav_pair_a> <wav_pair_b> + ''') + parser.add_argument('--output-path', type=str) + # parser.add_argument('--infer-xtimes', type=int, default=1) + + args = options.parse_args_and_arch(parser) + # Setup task + # task = tasks.setup_task(args) + use_cuda = not args.cpu + + # Load model & task + print('| loading model from {}'.format(args.path)) + arg_overrides = { + 'data': args.data, + # 'mask_prob': 0 + #'max_sample_size': sys.maxsize, + #'min_sample_size': 0, + } + state = checkpoint_utils.load_checkpoint_to_cpu(args.path) + # move to AWS + state['cfg']['model']['w2v_path'] = state['cfg']['model']['w2v_path'].replace('/checkpoint/arbabu/XLSR2/model_versions/', '/fsx/data/model_versions/').replace('/checkpoint/kushall/final_model_checkpoints/wav2vec2/', '/fsx/data/wav2vec_ckpt/') + state['cfg']['task']['data'] = state['cfg']['task']['data'].replace('/checkpoint/kushall/data/', '/fsx/data/') + + models, _model_args, task = 
checkpoint_utils.load_model_ensemble_and_task([args.path], + arg_overrides=arg_overrides, + task=None, + state=state) + model = models[0] + model.eval() + if use_cuda: + model.cuda() + + + # Load dataset + task.load_dataset(args.gen_subset) + dataset = task.dataset(args.gen_subset) + infer_manifest = args.infer_manifest + # only decode needed utts + # infer_manifest = subset_manifest(infer_manifest, + # args.verification_pair) + infer_dataset = FileAudioDataset(infer_manifest, + sample_rate=task.cfg.sample_rate, + max_sample_size=10**10, #task.cfg.max_sample_size, + min_sample_size=1, #task.cfg.min_sample_size, + pad=True, + normalize=task.cfg.normalize) + # add target (if needed) + infer_dataset = wrap_target_dataset(infer_manifest, infer_dataset, task) + itr = task.get_batch_iterator( + dataset=infer_dataset, + max_sentences=args.batch_size, + ).next_epoch_itr(shuffle=False) + + + # correct = 0 + # total = 0 + list_uttname = [] + list_latent = [] + list_logit = [] + list_target = [] + list_src_len = [] + with torch.no_grad(): + for _, sample in tqdm(enumerate(itr)): + # resample if needed + samples = resample_sample(sample, args.infer_xtimes, args.infer_max_sample_size) + list_uttname.extend(sample['name']) + list_target.extend(sample['target'][:, 0].cpu().numpy()) + list_src_len.extend((~sample['net_input']['padding_mask']).sum(1).cpu().numpy()) + latents = [] + logits = [] + for sample in samples: + sample = utils.move_to_cuda(sample) if use_cuda else sample + try: + latent = model.forward_latent(**sample['net_input']) + latents.append(latent.detach().cpu().numpy()) + except: + latent = None + logit = model.forward(**sample['net_input']) + logits.append(logit.detach().cpu().numpy()) + + if len(latents) > 0: + latents = np.stack(latents, 1) # B,X,D + logits = np.stack(logits, 1) # B,X,Cls + list_latent.extend(latents) + list_logit.extend(logits) + + # create big npz + list_uttname = np.array(list_uttname) + list_latent = np.array(list_latent) + list_target = np.array(list_target) + list_logit = np.array(list_logit) + list_src_len = np.array(list_src_len) + # save to npz + output_path = args.output_path + if (output_path is None): + output_path = tempfile.NamedTemporaryFile('wb', delete=False).name + + with open(output_path, 'wb') as ww: + np.savez(ww, name=list_uttname, + latent=list_latent, + target=list_target, + logit=list_logit, + src_len=list_src_len) + + print("="*10 + " REPORT " + "="*10) + print(f'| latent saved in {output_path}') + print(f'| {list_uttname.shape=}, {list_latent.shape=}, {list_target.shape=}, {list_logit.shape=}, {list_src_len.shape=}') diff --git a/examples/wmt20/README.md b/examples/wmt20/README.md new file mode 100644 index 0000000000..b4f2874652 --- /dev/null +++ b/examples/wmt20/README.md @@ -0,0 +1,72 @@ +# WMT 20 + +This page provides pointers to the models of Facebook-FAIR's WMT'20 news translation task submission [(Chen et al., 2020)](https://arxiv.org/abs/2011.08298). 
+ +## Single best MT models (after finetuning on part of WMT20 news dev set) + +Model | Description | Download +---|---|--- +`transformer.wmt20.ta-en` | Ta->En | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta-en.single.tar.gz) +`transformer.wmt20.en-ta` | En->Ta | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-ta.single.tar.gz) +`transformer.wmt20.iu-en.news` | Iu->En (News domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.news.single.tar.gz) +`transformer.wmt20.en-iu.news` | En->Iu (News domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.news.single.tar.gz) +`transformer.wmt20.iu-en.nh` | Iu->En (Nunavut Hansard domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.nh.single.tar.gz) +`transformer.wmt20.en-iu.nh` | En->Iu (Nunavut Hansard domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.nh.single.tar.gz) + +## Language models +Model | Description | Download +---|---|--- +`transformer_lm.wmt20.en` | En Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en.tar.gz) +`transformer_lm.wmt20.ta` | Ta Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta.tar.gz) +`transformer_lm.wmt20.iu.news` | Iu Language Model (News domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu.news.tar.gz) +`transformer_lm.wmt20.iu.nh` | Iu Language Model (Nunavut Hansard domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu.nh.tar.gz) + +## Example usage (torch.hub) + +#### Translation + +```python +import torch + +# English to Tamil translation +en2ta = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.en-ta') +en2ta.translate("Machine learning is great!") # 'இயந்திரக் கற்றல் அருமை!' + +# Tamil to English translation +ta2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.ta-en') +ta2en.translate("இயந்திரக் கற்றல் அருமை!") # 'Machine learning is great!' + +# English to Inuktitut translation +en2iu = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.en-iu.news') +en2iu.translate("machine learning is great!") # 'ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ ᐱᐅᔪᒻᒪᕆᒃ!' + +# Inuktitut to English translation +iu2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.iu-en.news') +iu2en.translate("ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ ᐱᐅᔪᒻᒪᕆᒃ!") # 'Machine learning excellence!' +``` + +#### Language Modeling + +```python +# Sample from the English LM +en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt20.en') +en_lm.sample("Machine learning is") # 'Machine learning is a type of artificial intelligence that uses machine learning to learn from data and make predictions.' + +# Sample from the Tamil LM +ta_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt20.ta') +ta_lm.sample("இயந்திரக் கற்றல் என்பது செயற்கை நுண்ணறிவின்") # 'இயந்திரக் கற்றல் என்பது செயற்கை நுண்ணறிவின் ஒரு பகுதியாகும்.' + +# Sample from the Inuktitut LM +iu_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt20.iu.news') +iu_lm.sample("ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ") # 'ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ, ᐊᒻᒪᓗ ᓯᓚᐅᑉ ᐊᓯᙳᖅᐸᓪᓕᐊᓂᖓᓄᑦ ᖃᓄᐃᓕᐅᕈᑎᒃᓴᑦ, ᐃᓚᖃᖅᖢᑎᒃ ᐅᑯᓂᖓ:' +``` + +## Citation +```bibtex +@inproceedings{chen2020facebook + title={Facebook AI's WMT20 News Translation Task Submission}, + author={Peng-Jen Chen and Ann Lee and Changhan Wang and Naman Goyal and Angela Fan and Mary Williamson and Jiatao Gu}, + booktitle={Proc. 
of WMT}, + year={2020}, +} +``` diff --git a/examples/wmt21/README.md b/examples/wmt21/README.md new file mode 100644 index 0000000000..524fffb724 --- /dev/null +++ b/examples/wmt21/README.md @@ -0,0 +1,25 @@ +# WMT 21 + +This page provides pointers to the models of Facebook AI's WMT'21 news translation task submission [(Tran et al., 2021)](https://arxiv.org/abs/2108.03265). + +## Single best dense models + +Model | Description | Download +---|---|--- +`wmt21.dense-24-wide.X-En` | X-En | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt21.dense-24-wide.X-En.tar.gz) +`wmt21.dense-24-wide.En-X` | En-X | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt21.dense-24-wide.En-X.tar.gz) + +## Example usage + +See eval.sh + + +## Citation +```bibtex +@inproceedings{tran2021facebook + title={Facebook AI’s WMT21 News Translation Task Submission}, + author={Chau Tran and Shruti Bhosale and James Cross and Philipp Koehn and Sergey Edunov and Angela Fan}, + booktitle={Proc. of WMT}, + year={2021}, +} +``` diff --git a/examples/wmt21/eval.sh b/examples/wmt21/eval.sh new file mode 100644 index 0000000000..b36d934c51 --- /dev/null +++ b/examples/wmt21/eval.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +SRC=en +TGT=is +MODEL_NAME=wmt21.dense-24-wide.En-X + +PATH_TO_FAIRSEQ_PY=. +TMP_DIR=generation_tmp +mkdir -p $TMP_DIR + +REPLACE_UNICODE_PUNCT=$PATH_TO_FAIRSEQ_PY/examples/wmt21/scripts/replace-unicode-punctuation.perl +NORM_PUNCT=$PATH_TO_FAIRSEQ_PY/examples/wmt21/scripts/normalize-punctuation.perl +if [ ! -d "${TMP_DIR}/${MODEL_NAME}" ]; then + wget https://dl.fbaipublicfiles.com/fairseq/models/${MODEL_NAME}.tar.gz -P $TMP_DIR/ + tar -xvf $TMP_DIR/${MODEL_NAME}.tar.gz -C $TMP_DIR +fi +MODEL_DIR=$TMP_DIR/${MODEL_NAME} +if [ ! -d "${TMP_DIR}/wmt21-news-systems" ]; then + git clone https://github.com/wmt-conference/wmt21-news-systems $TMP_DIR/wmt21-news-systems +fi + +DOMAIN_TAG="wmtdata newsdomain" +INPUT_FILE=$TMP_DIR/wmt21-news-systems/txt/sources/newstest2021.${SRC}-${TGT}.src.${SRC} +REF_FILE=$TMP_DIR/wmt21-news-systems/txt/references/newstest2021.${SRC}-${TGT}.ref.A.${TGT} + +# Translate +cat ${INPUT_FILE} | sed "s/^/${DOMAIN_TAG} /" | $REPLACE_UNICODE_PUNCT | $NORM_PUNCT -l ${SRC} | python $PATH_TO_FAIRSEQ_PY/fairseq_cli/interactive.py $MODEL_DIR \ + --path ${MODEL_DIR}/checkpoint.pt \ + --task translation_multi_simple_epoch \ + --langs "en,ha,is,ja,cs,ru,zh,de" \ + --lang-pairs $SRC-$TGT \ + --bpe "sentencepiece" \ + --sentencepiece-model ${MODEL_DIR}/sentencepiece.model \ + --buffer-size 1024 \ + --batch-size 10 -s $SRC -t $TGT \ + --decoder-langtok \ + --encoder-langtok src \ + --beam 5 \ + --lenpen 1.0 \ + --fp16 > $TMP_DIR/${SRC}-${TGT}.gen_log + +cat $TMP_DIR/$SRC-$TGT.gen_log | grep -P "^D-" | cut -f3 > $TMP_DIR/$SRC-$TGT.hyp + +# Calculate BLEU score +sacrebleu -l $SRC-$TGT $REF_FILE < $TMP_DIR/$SRC-$TGT.hyp diff --git a/examples/wmt21/scripts/normalize-punctuation.perl b/examples/wmt21/scripts/normalize-punctuation.perl new file mode 100644 index 0000000000..a7c0750f58 --- /dev/null +++ b/examples/wmt21/scripts/normalize-punctuation.perl @@ -0,0 +1,90 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
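+#
+# Normalizes unicode punctuation (curly quotes, dashes, ellipses, pseudo-spaces)
+# and spacing around punctuation. eval.sh pipes source text through this script
+# ($NORM_PUNCT -l $SRC) before decoding.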
+ +use warnings; +use strict; + +my $language = "en"; +my $PENN = 0; + +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) + /^-l$/ && ($language = shift, next); + /^[^\-]/ && ($language = $_, next); + /^-penn$/ && ($PENN = 1, next); +} + +while(<STDIN>) { + s/\r//g; + # remove extra spaces + s/\(/ \(/g; + s/\)/\) /g; s/ +/ /g; + s/\) ([\.\!\:\?\;\,])/\)$1/g; + s/\( /\(/g; + s/ \)/\)/g; + s/(\d) \%/$1\%/g; + s/ :/:/g; + s/ ;/;/g; + # normalize unicode punctuation + if ($PENN == 0) { + s/\`/\'/g; + s/\'\'/ \" /g; + } + + s/„/\"/g; + s/“/\"/g; + s/”/\"/g; + s/–/-/g; + s/—/ - /g; s/ +/ /g; + s/´/\'/g; + s/([a-z])‘([a-z])/$1\'$2/gi; + s/([a-z])’([a-z])/$1\'$2/gi; + s/‘/\'/g; + s/‚/\'/g; + s/’/\"/g; + s/''/\"/g; + s/´´/\"/g; + s/…/.../g; + # French quotes + s/ « / \"/g; + s/« /\"/g; + s/«/\"/g; + s/ » /\" /g; + s/ »/\"/g; + s/»/\"/g; + # handle pseudo-spaces + s/ \%/\%/g; + s/nº /nº /g; + s/ :/:/g; + s/ ºC/ ºC/g; + s/ cm/ cm/g; + s/ \?/\?/g; + s/ \!/\!/g; + s/ ;/;/g; + s/, /, /g; s/ +/ /g; + + # English "quotation," followed by comma, style + if ($language eq "en") { + s/\"([,\.]+)/$1\"/g; + } + # Czech is confused + elsif ($language eq "cs" || $language eq "cz") { + } + # German/Spanish/French "quotation", followed by comma, style + else { + s/,\"/\",/g; + s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence + } + + + if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { + s/(\d) (\d)/$1,$2/g; + } + else { + s/(\d) (\d)/$1.$2/g; + } + print $_; +} diff --git a/examples/wmt21/scripts/replace-unicode-punctuation.perl b/examples/wmt21/scripts/replace-unicode-punctuation.perl new file mode 100644 index 0000000000..faed2cd9d8 --- /dev/null +++ b/examples/wmt21/scripts/replace-unicode-punctuation.perl @@ -0,0 +1,55 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; + +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); # not buffered (flush each line) +} + +#binmode(STDIN, ":utf8"); +#binmode(STDOUT, ":utf8"); + +while(<STDIN>) { + s/,/,/g; + s/。 */. /g; + s/、/,/g; + s/”/"/g; + s/“/"/g; + s/∶/:/g; + s/:/:/g; + s/?/\?/g; + s/《/"/g; + s/》/"/g; + s/)/\)/g; + s/!/\!/g; + s/(/\(/g; + s/;/;/g; + s/1/1/g; + s/」/"/g; + s/「/"/g; + s/0/0/g; + s/3/3/g; + s/2/2/g; + s/5/5/g; + s/6/6/g; + s/9/9/g; + s/7/7/g; + s/8/8/g; + s/4/4/g; + s/. */. /g; + s/~/\~/g; + s/’/\'/g; + s/…/\.\.\./g; + s/━/\-/g; + s/〈/\</g; + s/〉/\>/g; + s/【/\[/g; + s/】/\]/g; + s/%/\%/g; + print $_; +} diff --git a/examples/womens_bios/README.md b/examples/womens_bios/README.md new file mode 100644 index 0000000000..07d0646887 --- /dev/null +++ b/examples/womens_bios/README.md @@ -0,0 +1,81 @@ +# Wikipedia Biographies of Women + + +## Training: + +The training dataset is created based on WikiSum, a dataset created from the paper [Generating Wikipedia by Summarizing Long Sequences](https://arxiv.org/pdf/1801.10198.pdf). The dataset needs to be generated following the instructions in this [Github Repository](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wikisum). + +### How is the WikiSum dataset structured? + +Overall, the task in WikiSum was to generate the entire Wikipedia article based on the contents of the top 10 Google Search Results. The authors provide a way for people to recreate their work. 
In the WikiSum Github, there are two options for the dataset recreation --- the first is to use CommonCrawl (a static, open source crawl of the web) and the second to do Live Web Fetches. The second has higher coverage, but the content is subject to change and difficult to fetch. We used the static, Commoncrawl version. This can be downloaded following the Github repo instructions, though note it will require usage of Google Cloud. + +Note: in our experience, it also requires requesting that the resource limit of the Google Cloud instance be raised, which requires emailing. + +Note: Having higher coverage in the training dataset would be expected to improve the model quality. There are many instances in the dataset where the training input (web evidence) does not contain sufficient content for producing the desired Wikipedia article. This may harm the model's ability to learn to retrieve, look at the input evidence, and overall could contribute to increased challenges in generating verifiable Wikipedia biographies. + +### How do you go from WikiSum dataset to Biography dataset? + +The WikiSum dataset is for Wikipedia in general, not just biographies. We do this by querying WikiData to see if the Wikipedia article has an occupation, with the thought that all articles with occupations are probably biographies. + + +## Evaluation: + +You can download the dataset and baseline model with the following command: + +``` +wget -N 'https://dl.fbaipublicfiles.com/fairseq/womenbios_dataset.zip' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' +``` + +We provide the full text Wikipedia articles split into four categories: +- Women in Africa +- Women in Asia +- Women in Science +- Women +We note that these are not exhaustive intersectional categories and mainly stem from personal interest. + +We also provide the URL of the Wikipedia article. Note that Wikipedia articles are constantly being improved, edited, and changed. Thus, it's completely possible that the Wikipedia article on Wikipedia has been lovingly improved by other Wikipedia editors. + +To get the occupations of each biographical subject, we use WikiData. We provide a sample script to do this. We also provide the raw output of this query. + +The final part of the evaluation dataset is to query web evidence for each of the biographical subjects. This is the part of the evaluation dataset that requires the most improvement. As we discuss in our paper, one of the major reasons why it is difficult to write biographies for sometimes very well qualified women is that there is not information online about them. Further, the search engine may not find it. We encourage others to improve upon this part of the data, as even re-querying again on the internet may find new, updated sources of information as the web is constantly evolving. + +We use the search engine from [Internet-Augmented Dialogue Generation](https://arxiv.org/abs/2107.07566), see [project URL](https://parl.ai/projects/sea/) to do the search queries. Note: we remove wikipedia site sources from our query (or we'd query the data itself). However, it's possible Wikipedia information can be copied around in multiple forms on the web, linked with edits, etc. + + +## Section by Section Generation: + +Wikipedia articles are split into sections, which are usually separated by headings. 
These headings are delimited in the article text by equal signs (==), where the number of equal signs indicates whether you are looking at a top-level heading or a subheading. An example regex that you can use is:
+
+```python
+section_header_re = re.compile(r"(?<!=)==([^=]+)==(?!=)")
+```
+
+
+## List of Notes:
+- People can have multiple occupations, and we keep all occupations that we query from WikiData
+
+
+## List of Possible Improvement Areas:
+Using a larger generative pre-trained model, larger-scale retrieval, a retrieval encoder specialized to Wikipedia (or biographies), exhaustively tuning the training and generation parameters, and similar changes would most likely be very useful. Overall, we hope that this is a starting point for others who might be interested in focusing on how we can help address the gender gap on Wikipedia.
+
+
+## Interested in Wikipedia and the Gender Gap?
+You might want to check out:
+- https://humaniki.wmcloud.org/
+- https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Women_in_Red and https://wikimediafoundation.org/news/2018/10/18/women-in-red-wikiproject/
+- https://meta.wikimedia.org/wiki/Whose_Knowledge%3F/VisibleWikiWomen
+- https://www.ted.com/talks/jess_wade_a_voice_for_diversity_in_science
+
+And thanks again to all of the Wikipedia editors and the entire community that is already working so hard to write amazing articles for diverse groups of people.
+
+
+# LICENSE
+This is licensed under CC-BY-NC; however, portions of the dataset are available under separate license terms: text sourced from Wikipedia is licensed under CC-BY-SA.
+
+
+
+
+
diff --git a/examples/womens_bios/query_occupations_from_wikidata.py b/examples/womens_bios/query_occupations_from_wikidata.py
new file mode 100644
index 0000000000..8028c6eece
--- /dev/null
+++ b/examples/womens_bios/query_occupations_from_wikidata.py
@@ -0,0 +1,34 @@
+import sys
+from SPARQLWrapper import SPARQLWrapper, JSON
+
+endpoint_url = "https://query.wikidata.org/sparql"
+
+with open("/your/urls/here") as f:
+    data = f.readlines()
+urls = [i.strip() for i in data]
+
+def get_results(endpoint_url, URL):
+    query = f"""SELECT ?uriLabel ?occupation ?occupationLabel ?dob ?dobLabel WHERE {{
+    <{URL}> schema:about ?uri .
+    ?uri wdt:P106 ?occupation .
+    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
+    }}"""
+    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
+    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
+    sparql.setQuery(query)
+    sparql.setReturnFormat(JSON)
+    return sparql.query().convert()
+
+all_occupations = []
+for URL in urls:
+    results = get_results(endpoint_url, URL)
+    occupations = []
+    for result in results["results"]["bindings"]:
+        occupations.append(result['occupationLabel']['value'])
+    all_occupations.append(result['uriLabel']['value'] + ", " + ", ".join(occupations))
+
+assert(len(all_occupations) == len(urls))
+
+with open("/your/file/output/here", "w") as o:
+    for line in all_occupations:
+        o.write(line.strip() + "\n")
\ No newline at end of file
diff --git a/examples/xformers/README.md b/examples/xformers/README.md
new file mode 100644
index 0000000000..400a74d536
--- /dev/null
+++ b/examples/xformers/README.md
@@ -0,0 +1,43 @@
+# Using xFormers with FairSeq
+
+[xFormers](https://github.com/facebookresearch/xformers) is a modular library for flexibly generating transformer architectures with interoperable and optimized building blocks.
+The current integration allows FairSeq users to use an attention variant available in the xFormers repository.
+
+In order to enable xFormers, all that needs to be passed in is a string representing an [xFormers attention config](https://github.com/facebookresearch/xformers/blob/5f754129bfb1ea53747b1ab2077261ea762faa47/xformers/components/attention/base.py#L18).
+
+The various attention variants can be found [here](https://github.com/facebookresearch/xformers/tree/main/xformers/components/attention).
+These include sparse attention and blocksparse attention.
+
+For example, you could pass in the following args:
+```python
+decoder_xformers_att_config = '{"name": "scaled_dot_product"}'
+
+encoder_xformers_att_config = '{"name": "linformer", "seq_len": "256"}'
+```
+
+In order to use blocksparse attention, you additionally have to pass in a blocksparse layout and block size. For example:
+
+```python
+xformers_att_config = '{"name": "scaled_dot_product"}'
+xformers_blocksparse_blocksize = 16
+xformers_blocksparse_layout = torch.ones(
+    seq_len // xformers_blocksparse_blocksize,
+    seq_len // xformers_blocksparse_blocksize,
+)
+
+xf_blocksparse_mha = MultiheadAttention(
+    embedding,
+    num_heads,
+    dropout=0.0,
+    add_zero_attn=add_zero_attn,
+    xformers_att_config=xformers_att_config,
+    xformers_blocksparse_layout=xformers_blocksparse_layout,
+    xformers_blocksparse_blocksize=xformers_blocksparse_blocksize,
+)
+```
+
+The xFormers repository currently has benchmarks on the [runtime](https://github.com/facebookresearch/xformers/blob/main/docs/plots/runtime_vs_attention.png)
+and [memory usage](https://github.com/facebookresearch/xformers/blob/main/docs/plots/memory_vs_attention.png) of the various attentions.
diff --git a/examples/xglm/README.md b/examples/xglm/README.md
new file mode 100644
index 0000000000..914e297669
--- /dev/null
+++ b/examples/xglm/README.md
@@ -0,0 +1,195 @@
+# Few-shot Learning with Multilingual Language Models
+
+## Introduction
+
+In this work, we train a family of multilingual generative language models, dubbed XGLM, on a balanced corpus covering a diverse set of languages, and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters sets new state of the art in few-shot learning on more than 20 representative languages, outperforming GPT-3 of comparable size in multilingual commonsense reasoning (+7.4 accuracy points for 0-shot, +9.4 for 4-shot) and natural language inference (+5.4 for 0-shot, +5.4 for 4-shot). We have included a [model card](model_card.md) of XGLM for transparency and accountability.
+
+## Data and Languages
+XGLM models are trained on a new multilingual corpus extracted from CommonCrawl (CC100-XL), a significantly larger multilingual dataset covering 68 Common Crawl (CC) snapshots (from [Summer 2013](http://commoncrawl.org/2013/11/new-crawl-data-available/) to [March/April 2020](https://commoncrawl.org/2020/04/march-april-2020-crawl-archive-now-available/)) and consisting of 134 languages. The detailed languages and data statistics are reported in the paper (Table A.1).
+ +## Pre-trained models + +Model | Layers | Model Dim | FFN Dim | Languages | Download +---|---|---|---|---|--- +`XGLM 564M` | 24 | 1024 | 4096 | trained on 30 languages| [xglm.564M.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.564M.tar.gz) +`XGLM 1.7B` | 24 | 2048 | 8192 | trained on 30 languages| [xglm.1.7B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.1.7B.tar.gz) +`XGLM 2.9B` | 48 | 2048 | 8192 | trained on 30 languages| [xglm.2.9B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.2.9B.tar.gz) +`XGLM 7.5B` | 32 | 4096 | 16384 | trained on 30 languages| [xglm.7.5B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.7.5B.tar.gz) +`XGLM 4.5B` | 48 | 2048 | 16384 | trained on 134 languages| [xglm.4.5B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.4.5B.tar.gz) + +## Pre-training Data Format +Our models were pre-trained with data in the following format (i.e. paragraphs are separated with new lines and documents were separated with double new lines). +``` +<doc0,para0,tok0> ... <doc0,para0,tokX0> # X0: number of tokens in para0 of doc0 +<doc0,para1,tok0> ... <doc0,para1,tokY0> # Y0: number of tokens in para1 of doc0 + +<doc1,para0,tok0> ... <doc1,para0,tokX1> # X1: number of tokens in para0 of doc1 +<doc1,para1,tok0> ... <doc1,para1,tokY1> # Y1: number of tokens in para1 of doc1 + +... +``` +Fairseq's preprocessing replaces newlines with the end-of-sentence symbol (`</s>`). As a result, the models never saw newline characters during pretraining and the same preprocessing should be run prior to few-shot inference to maximize performance. For example, our language model scoring function has `replace_newlines_with_eos` argument to trigger this preprocessing: +```python +from fairseq.models.transformer_lm import TransformerLanguageModel + +model_dir = 'path_to_decompressed_tar_gz_dir' +lm = TransformerLanguageModel.from_pretrained(model_dir, bpe='sentencepiece') + +text = """First paragraph of the first document. +Second paragraph of the first document. + +First paragraph of the second document. +""" +tokens = lm.score(text, replace_newlines_with_eos=True)['tokens'] +assert '\n' not in lm.decode(tokens) # no newlines were encoded +``` + +## Evaluation + +### Example (COPA) + +The following snippet show how to evaluate our models on the Choice of Plausible Alternatives (COPA) task, using examples in English, Chinese and Hindi. 
+ +```python +data_samples = { + 'en': [ + { + "premise": "I wanted to conserve energy.", + "choice1": "I swept the floor in the unoccupied room.", + "choice2": "I shut off the light in the unoccupied room.", + "question": "effect", + "label": "1" + }, + { + "premise": "The flame on the candle went out.", + "choice1": "I blew on the wick.", + "choice2": "I put a match to the wick.", + "question": "cause", + "label": "0" + } + ], + 'zh': [ + { + "premise": "我想节约能源。", + "choice1": "我在空着的房间里扫了地板。", + "choice2": "我把空房间里的灯关了。", + "question": "effect", + "label": "1" + }, + { + "premise": "蜡烛上的火焰熄灭了。", + "choice1": "我吹灭了灯芯。", + "choice2": "我把一根火柴放在灯芯上。", + "question": "cause", + "label": "0" + } + ], + 'hi': [ + { + "premise": "M te vle konsève enèji.", + "choice1": "Mwen te fin baleye chanm lib la.", + "choice2": "Mwen te femen limyè nan chanm lib la.", + "question": "effect", + "label": "1" + }, + { + "premise": "Flam bouji a te etenn.", + "choice1": "Mwen te soufle bouji a.", + "choice2": "Mwen te limen mèch bouji a.", + "question": "cause", + "label": "0" + } + ] +} +``` +In this example, we format the examples use the non-verbal prompts `{premise}\n{choice1}` and `{premise}\n{choice2}`, which are shared by all three languages. +```python +from fairseq.models.transformer_lm import TransformerLanguageModel + +model_dir = 'path_to_decompressed_tar_gz_dir' +lm = TransformerLanguageModel.from_pretrained(model_dir, bpe='sentencepiece') +lm = lm.eval() +lm = lm.half() +lm = lm.cuda() + +def get_logprobs(prompt): + import re + prompt = re.sub('\n+' , '\n', prompt) # collapse repeated newlines, which indicate separate documents + return lm.score(prompt, replace_newlines_with_eos=True)['positional_scores'] + +# Zero-shot evaluation for the Choice of Plausible Alternatives (COPA) task. +# A return value of 0 indicates that the first alternative is more plausible, +# while 1 indicates that the second alternative is more plausible. +def COPA_eval(prompt, alternative1, alternative2): + lprob1 = get_logprobs(prompt + "\n" + alternative1).sum() + lprob2 = get_logprobs(prompt + "\n" + alternative2).sum() + return 0 if lprob1 > lprob2 else 1 + +for lang in ['en', 'zh', 'hi']: + for idx, example in enumerate(data_samples[lang]): + predict = COPA_eval(example["premise"], example["choice1"], example["choice2"]) + print(f'{lang}-{idx}', predict, example['label']) + +# en-0 1 1 +# en-1 0 0 +# zh-0 1 1 +# zh-1 0 0 +# hi-0 1 1 +# hi-1 0 0 +``` + +## XStoryCloze + +We release XStoryCloze, a new multilingual dataset intended for few-shot evaluation, alongside this paper. XStoryCloze consists of professional translation of the validation split of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 other languages. It is opensourced under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode), the same license as the English StoryCloze. + +You can download the dataset via [this link](https://dl.fbaipublicfiles.com/xstorycloze.zip). + +Language | ar | es | eu | hi | id | my | ru | sw | te | zh +---|---|---|---|---|---|---|---|---|---|--- +Train size | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 +Eval size | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 + +Please refer to [the dataset doc](XStoryCloze.md) for more information. + + +## Publication +[Few-shot Learning with Multilingual Generative Language Models](https://arxiv.org/abs/2112.10668). 
+Xi Victoria Lin*, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li* (* Equal Contribution). +EMNLP 2022. + +## Citation +``` +@article{DBLP:journals/corr/abs-2112-10668, + author = {Xi Victoria Lin and + Todor Mihaylov and + Mikel Artetxe and + Tianlu Wang and + Shuohui Chen and + Daniel Simig and + Myle Ott and + Naman Goyal and + Shruti Bhosale and + Jingfei Du and + Ramakanth Pasunuru and + Sam Shleifer and + Punit Singh Koura and + Vishrav Chaudhary and + Brian O'Horo and + Jeff Wang and + Luke Zettlemoyer and + Zornitsa Kozareva and + Mona T. Diab and + Veselin Stoyanov and + Xian Li}, + title = {Few-shot Learning with Multilingual Language Models}, + journal = {CoRR}, + volume = {abs/2112.10668}, + year = {2021}, + url = {https://arxiv.org/abs/2112.10668}, + eprinttype = {arXiv}, + eprint = {2112.10668}, + timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/examples/xglm/XStoryCloze.md b/examples/xglm/XStoryCloze.md new file mode 100644 index 0000000000..9b0fce0715 --- /dev/null +++ b/examples/xglm/XStoryCloze.md @@ -0,0 +1,57 @@ +XStoryCloze consists of professional translation of the validation split of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 other languages. This dataset is released by FAIR (Fundamental Artificial Intelligence Research) alongside the paper [Few-shot Learning with Multilingual Generative Language Models. EMNLP 2022](https://arxiv.org/abs/2112.10668). + +# Languages +ru, zh (Simplified), es (Latin America), ar, hi, id, te, sw, eu, my. + +# Data Splits +This dataset is intended to be used for evaluating the zero- and few-shot learning capabilities of multlingual language models. We split the data for each language into train and test (360 vs. 1510 examples, respectively). The released data files for different languages maintain a line-by-line alignment. + +# Access English StoryCloze +Please request the original English StoryCloze dataset through the [official website](https://cs.rochester.edu/nlp/rocstories/). You can create a split of the en data following our data split scheme using the following commands: +``` +head -361 spring2016.val.tsv > spring2016.val.en.tsv.split_20_80_train.tsv + +head -1 spring2016.val.tsv > spring2016.val.en.tsv.split_20_80_eval.tsv # TSV header +tail -1511 spring2016.val.tsv >> spring2016.val.en.tsv.split_20_80_eval.tsv +``` + +# Licence +XStoryCloze is opensourced under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode), the same license as the original English StoryCloze. + +# Citation +We hope this dataset is helpful for the research and wider NLP community. If you use XStoryCloze in your work, please cite +``` +@article{DBLP:journals/corr/abs-2112-10668, + author = {Xi Victoria Lin and + Todor Mihaylov and + Mikel Artetxe and + Tianlu Wang and + Shuohui Chen and + Daniel Simig and + Myle Ott and + Naman Goyal and + Shruti Bhosale and + Jingfei Du and + Ramakanth Pasunuru and + Sam Shleifer and + Punit Singh Koura and + Vishrav Chaudhary and + Brian O'Horo and + Jeff Wang and + Luke Zettlemoyer and + Zornitsa Kozareva and + Mona T. 
Diab and + Veselin Stoyanov and + Xian Li}, + title = {Few-shot Learning with Multilingual Language Models}, + journal = {CoRR}, + volume = {abs/2112.10668}, + year = {2021}, + url = {https://arxiv.org/abs/2112.10668}, + eprinttype = {arXiv}, + eprint = {2112.10668}, + timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/examples/xglm/model_card.md b/examples/xglm/model_card.md new file mode 100644 index 0000000000..2656ec5d63 --- /dev/null +++ b/examples/xglm/model_card.md @@ -0,0 +1,152 @@ +# XGLM multilingual model +## Version 1.0.0 + +### Model developer +FAIR (Fundamental Artificial Intelligence Research) + +### Model type +A family of multilingual autoregressive language models (ranging from 564 million to 7.5 billion parameters) trained on a balanced corpus of a diverse set of languages. The language model can learn tasks from natural language descriptions and a few examples. + +### Model Feedback Channel +https://github.com/pytorch/fairseq + +## Intended use +### Primary intended use +For research purposes only, e.g. reproducing model evaluation results. Generation is only used in a limited capacity for explanation/justification or for prompting/probing/priming for class labels. + +### Out of scope uses +The primary purpose of the model is not to generate language, although the model is capable of doing that. + +## Potential risks +This section lists the potential risks associated with using the model. + +### Relevant factors +Based on known problems with NLP technology, potential relevant factors include output correctness, robustness, bias (gender, profession, race and religion), etc. + +### Evaluation factors +The model was evaluated on hate speech detection and occupation identification. +* Hate speech detection (Huang et al. (2020)) - A safety task to test language models’ ability to identify hateful and offensive text. +* Occupation identification (De-Arteaga et al., 2019), (Zhao et al., 2020) - A bias task to study language models’ performance divergence between different gender groups on the task of occupation identification. + +## Metrics +### Model performance measures +The XGLM model was primarily evaluated on +1. Zero shot and few shot learning by looking at per-language performance on tasks spanning commonsense reasoning (XCOPA, XWinograd), natural language inference (XNLI) and paraphrasing (PAWS-X). The model is also evaluated on XStoryCloze, a new dataset created by FAIR (Fundamental Artificial Intelligence Research). +2. Cross lingual transfer through templates and few-shot examples. +3. Knowledge probing - Evaluate to what extent the XGLM model can effectively store factual knowledge in different languages using the mLAMA benchmark. +4. Translation - We report machine translation results on WMT benchmarks and a subset of FLORES-101 in the main paper. + +The model was also evaluated on hate speech datasets introduced by Huang et al. (2020) and an occupation identification dataset by De-Arteaga et al. 2019 to identify bias in the model. + +### Approaches to handle uncertainty +Report confidence intervals, variance metrics for the model performance metrics. Few-shot evaluation was conducted with different sampling with 5 seeds. We reported statistical significance. 
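+
+As a purely hypothetical illustration of this reporting scheme (the numbers and code below are made up for this card and are not the analysis scripts used for the paper), the mean accuracy and a normal-approximation 95% confidence interval over the five seeds can be computed as:
+
+```python
+import statistics
+
+# Hypothetical per-seed few-shot accuracies for one task/language (5 seeds).
+seed_accuracies = [61.2, 59.8, 60.5, 62.0, 60.1]
+
+mean = statistics.mean(seed_accuracies)
+stdev = statistics.stdev(seed_accuracies)  # sample standard deviation
+# Normal-approximation 95% confidence interval of the mean over seeds.
+half_width = 1.96 * stdev / len(seed_accuracies) ** 0.5
+print(f"accuracy = {mean:.1f} +/- {half_width:.1f} (95% CI over 5 seeds)")
+```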
+ +## Evaluation data +## Zero Shot and Few Shot evaluation + +### XNLI (Conneau et al., 2018) +#### Description +The Cross-lingual Natural Language Inference (XNLI) corpus is the extension of the Multi-Genre NLI (MultiNLI) corpus to 15 languages. The dataset was created by manually translating the validation and test sets of MultiNLI into each of those 15 languages. + +### XStoryCloze +#### Description +A new dataset created by FAIR along side this work by translating the validation split of the English StoryCloze dataset (Mostafazadeh et al., 2016) (Spring 2016 version) to 10 other typologically diverse languages (ru, zh Simplified, es Latin America, ar, hi, id, te, sw, eu, my). + +### XCOPA (Ponti et al., 2020) +#### Description +The Cross-lingual Choice of Plausible Alternatives (XCOPA) dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages. The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe. + +### XWinograd (Tikhonov and Ryabinin, 2021) +#### Description +XWinograd is a multilingual collection of Winograd Schemas in six languages that can be used for evaluation of cross-lingual commonsense reasoning capabilities. + +### PAWS-X (Yang et al., 2019) +#### Description +PAWS-X contains 23,659 human translated PAWS evaluation pairs and 296,406 machine translated training pairs in six typologically distinct languages: French, Spanish, German, Chinese, Japanese, and Korean. All translated pairs are sourced from examples in PAWS-Wiki. + +## Responsible AI (RAI) evaluation +### Hate speech (Huang et al. 2020) +This is a multilingual Twitter corpus for the task of hate speech detection with inferred four author demographic factors: age, country, gender and race/ethnicity. The corpus covers five languages: English, Italian, Polish, Portuguese and Spanish. + +### Bias dataset (De-Arteaga et al. 2019) +The aim of this dataset is to study the gender bias of models that identify a person’s occupation from their bios. + +---- + +## Training data +### CC100-XL +#### Description +Following the recent success of multilingual self-supervised pre-training (Devlin et al., 2019; Lample and Conneau, 2019; Con; Xue et al., 2020; Goyal et al., 2021a; Liu et al., 2020), we train our language models on a mixture of monolingual text of different languages. We extended the pipeline used for mining the CC100 corpus to generate CC100-XL, a significantly larger multilingual dataset covering 68 Common Crawl snapshots (from Summer 2013 to March/April 2020) and 134 languages. + +More details on the CC100-XL dataset can be found in the Appendix section of the paper. + +## RAI Dimensions +### Fairness (Bias and inclusion) +The XGLM model was evaluated on Hate speech and bias identification datasets. For hate speech, we observe that across the 5 languages in the dataset, in context learning results are only slightly better than random (50%). Another interesting observation is that most few shot results are worse than zero-shot, which indicates that the model is not able to utilize examples using the templates described in the paper. For bias identification, the XGLM (6.7B) English only model achieves the best performance on English and Spanish, while the GPT-3 model of comparable size (6.7B) model achieves the best in French. On certain occupations (e.g. 
model and teacher), XGLM 6.7B En only model and GPT-3 (6.7B) have very significant bias while XGLM 7.5B is much less biased. + +### Privacy and security +The XGLM model did not have any special Privacy and Security considerations. The training data and evaluation data were both public and went through standard Meta privacy and licensing procedures. + +### Transparency and control +In the spirit of transparency and accountability we have created this model card and a data card for the CC100-XL which can be found in the Appendix section of the paper. + +### Efficiency (Green AI) +From an engineering perspective, XGLM pertains to a family of models that represent single unified models catering to many languages which have wide application across many applications. Such a unified single model saves on carbon footprint as well as energy consumption (comparing to the alternative: separate models for different languages) leading to more energy efficiency. A single model, despite having the risk of being a single point of failure, has the powerful incentive of being easier to maintain, access, distribute, and track. + +## References +Edoardo Maria Ponti, Goran Glavas, Olga Majewska, Qianchu Liu, Ivan Vulic, and Anna Korhonen. 2020. XCOPA: A multilingual dataset for causal commonsense reasoning. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, EMNLP 2020, Online, November 16-20, 2020, pages 2362–2376. Association for Computational Linguistics. +XCOPA Dataset | Papers With Code + +Alexey Tikhonov and Max Ryabinin. 2021. It’s all in the heads: Using attention heads as a baseline for cross-lingual transfer in commonsense reasoning. In Findings of the Association for Computational Linguistics: ACL/IJCNLP 2021, Online Event, August 1-6, 2021, volume ACL/IJCNLP 2021 of Findings of ACL, pages 3534–3546. Association for Computational Linguistics. +XWINO Dataset | Papers With Code (XWinograd) + +Yinfei Yang, Yuan Zhang, Chris Tar, and Jason Baldridge. 2019. PAWS-X: A cross-lingual adversarial dataset for paraphrase identification. CoRR, abs/1908.11828. +PAWS-X Dataset | Papers With Code + +Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk, and Veselin Stoyanov. 2018. XNLI: evaluating cross-lingual sentence representations. CoRR, abs/1809.05053. +XNLI Dataset | Papers With Code + +Xiaolei Huang, Linzi Xing, Franck Dernoncourt, and Michael Paul. 2020. Multilingual twitter corpus and baselines for evaluating demographic bias in hate speech recognition. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 1440–1448. + +Maria De-Arteaga, Alexey Romanov, Hanna Wallach, Jennifer Chayes, Christian Borgs, Alexandra Chouldechova, Sahin Geyik, Krishnaram Kenthapadi, and Adam Tauman Kalai. 2019. Bias in bios: A case study of semantic representation bias in a high-stakes setting. In proceedings of the Conference on Fairness, Accountability, and Transparency, pages 120–128. + +Nasrin Mostafazadeh, Nathanael Chambers, Xiaodong He, Devi Parikh, Dhruv Batra, Lucy Vanderwende, Pushmeet Kohli, James F. Allen. A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories. CoRR abs/1604.01696. + +Jieyu Zhao, Subhabrata Mukherjee, Saghar Hosseini, Kai-Wei Chang, and Ahmed Hassan Awadallah. 2020. Gender bias in multilingual embeddings and crosslingual transfer. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 2896–2907. 
+ +## Citation details +``` +@article{DBLP:journals/corr/abs-2112-10668, + author = {Xi Victoria Lin and + Todor Mihaylov and + Mikel Artetxe and + Tianlu Wang and + Shuohui Chen and + Daniel Simig and + Myle Ott and + Naman Goyal and + Shruti Bhosale and + Jingfei Du and + Ramakanth Pasunuru and + Sam Shleifer and + Punit Singh Koura and + Vishrav Chaudhary and + Brian O'Horo and + Jeff Wang and + Luke Zettlemoyer and + Zornitsa Kozareva and + Mona T. Diab and + Veselin Stoyanov and + Xian Li}, + title = {Few-shot Learning with Multilingual Language Models}, + journal = {CoRR}, + volume = {abs/2112.10668}, + year = {2021}, + url = {https://arxiv.org/abs/2112.10668}, + eprinttype = {arXiv}, + eprint = {2112.10668}, + timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/examples/xlmr/README.md b/examples/xlmr/README.md index 65d4be13de..bba7910e30 100644 --- a/examples/xlmr/README.md +++ b/examples/xlmr/README.md @@ -1,9 +1,16 @@ # Unsupervised Cross-lingual Representation Learning at Scale (XLM-RoBERTa) https://arxiv.org/pdf/1911.02116.pdf +# Larger-Scale Transformers for Multilingual Masked Language Modeling +https://arxiv.org/pdf/2105.00572.pdf + + +## What's New: +- June 2021: `XLMR-XL` AND `XLMR-XXL` models released. + ## Introduction -XLM-R (XLM-RoBERTa) is a generic cross lingual sentence encoder that obtains state-of-the-art results on many cross-lingual understanding (XLU) benchmarks. It is trained on 2.5T of filtered CommonCrawl data in 100 languages (list below). +`XLM-R` (`XLM-RoBERTa`) is a generic cross lingual sentence encoder that obtains state-of-the-art results on many cross-lingual understanding (XLU) benchmarks. It is trained on `2.5T` of filtered CommonCrawl data in 100 languages (list below). Language | Language|Language |Language | Language ---|---|---|---|--- @@ -34,8 +41,8 @@ Model | Description | #params | vocab size | Download ---|---|---|---|--- `xlmr.base` | XLM-R using the BERT-base architecture | 250M | 250k | [xlm.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz) `xlmr.large` | XLM-R using the BERT-large architecture | 560M | 250k | [xlm.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz) - -(Note: Above are final model checkpoints. If you were using previously released `v0` version, we recommend using above. They have same architecture and dictionary.) 
+`xlmr.xl` | XLM-R (`layers=36, model_dim=2560`) | 3.5B | 250k | [xlm.xl.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xl.tar.gz) +`xlmr.xxl` | XLM-R (`layers=48, model_dim=4096`) | 10.7B | 250k | [xlm.xxl.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xxl.tar.gz) ## Results @@ -44,7 +51,9 @@ Model | Description | #params | vocab size | Download Model | average | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- `roberta.large.mnli` _(TRANSLATE-TEST)_ | 77.8 | 91.3 | 82.9 | 84.3 | 81.2 | 81.7 | 83.1 | 78.3 | 76.8 | 76.6 | 74.2 | 74.1 | 77.5 | 70.9 | 66.7 | 66.8 -`xlmr.large` _(TRANSLATE-TRAIN-ALL)_ | **83.6** | 89.1 | 85.1 | 86.6 | 85.7 | 85.3 | 85.9 | 83.5 | 83.2 | 83.1 | 83.7 | 81.5 | 83.7 | 81.6 | 78.0 | 78.1 +`xlmr.large` _(TRANSLATE-TRAIN-ALL)_ | 83.6 | 89.1 | 85.1 | 86.6 | 85.7 | 85.3 | 85.9 | 83.5 | 83.2 | 83.1 | 83.7 | 81.5 | 83.7 | 81.6 | 78.0 | 78.1 +`xlmr.xl` _(TRANSLATE-TRAIN-ALL)_ | 85.4 | 91.1 | 87.2 | 88.1 | 87.0 | 87.4 | 87.8 | 85.3 | 85.2 | 85.3 | 86.2 | 83.8 | 85.3 | 83.1 | 79.8 | 78.2 | 85.4 +`xlmr.xxl` _(TRANSLATE-TRAIN-ALL)_ | 86.0 | 91.5 | 87.6 | 88.7 | 87.8 | 87.4 | 88.2 | 85.6 | 85.1 | 85.8 | 86.3 | 83.9 | 85.6 | 84.6 | 81.7 | 80.6 **[MLQA (Lewis et al., 2018)](https://arxiv.org/abs/1910.07475)** @@ -52,7 +61,9 @@ Model | average | en | es | de | ar | hi | vi | zh ---|---|---|---|---|---|---|---|--- `BERT-large` | - | 80.2/67.4 | - | - | - | - | - | - `mBERT` | 57.7 / 41.6 | 77.7 / 65.2 | 64.3 / 46.6 | 57.9 / 44.3 | 45.7 / 29.8| 43.8 / 29.7 | 57.1 / 38.6 | 57.5 / 37.3 -`xlmr.large` | **70.7 / 52.7** | 80.6 / 67.8 | 74.1 / 56.0 | 68.5 / 53.6 | 63.1 / 43.5 | 69.2 / 51.6 | 71.3 / 50.9 | 68.0 / 45.4 +`xlmr.large` | 70.7 / 52.7 | 80.6 / 67.8 | 74.1 / 56.0 | 68.5 / 53.6 | 63.1 / 43.5 | 69.2 / 51.6 | 71.3 / 50.9 | 68.0 / 45.4 +`xlmr.xl` | 73.4 / 55.3 | 85.1 / 72.6 | 66.7 / 46.2 | 70.5 / 55.5 | 74.3 / 56.9 | 72.2 / 54.7 | 74.4 / 52.9 | 70.9 / 48.5 +`xlmr.xxl` | 74.8 / 56.6 | 85.5 / 72.4 | 68.6 / 48.4 | 72.7 / 57.8 | 75.4 / 57.6 | 73.7 / 55.8 | 76.0 / 55.0 | 71.7 / 48.9 ## Example usage @@ -60,7 +71,7 @@ Model | average | en | es | de | ar | hi | vi | zh ##### Load XLM-R from torch.hub (PyTorch >= 1.1): ```python import torch -xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large') +xlmr = torch.hub.load('pytorch/fairseq:main', 'xlmr.large') xlmr.eval() # disable dropout (or leave in train mode to finetune) ``` @@ -121,3 +132,13 @@ assert torch.all(all_layers[-1] == last_layer_features) year={2019} } ``` + + +```bibtex +@article{goyal2021larger, + title={Larger-Scale Transformers for Multilingual Masked Language Modeling}, + author={Goyal, Naman and Du, Jingfei and Ott, Myle and Anantharaman, Giri and Conneau, Alexis}, + journal={arXiv preprint arXiv:2105.00572}, + year={2021} +} +``` diff --git a/examples/xmod/README.md b/examples/xmod/README.md new file mode 100644 index 0000000000..46958b8141 --- /dev/null +++ b/examples/xmod/README.md @@ -0,0 +1,151 @@ +# X-MOD: Lifting the Curse of Multilinguality by Pre-training Modular Transformers + +https://arxiv.org/abs/2205.06266 + + +## Introduction + +X-MOD extends multilingual masked language models like XLM-R to include language-specific modular components, introduced at each transformer layer. Each module is only used by one language. For fine-tuning, the modular components are frozen, and replaced with the target language in cross-lingual transfer settings. 
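+
+To make this concrete, below is a minimal, illustrative PyTorch sketch of the idea: a shared feed-forward sub-layer followed by a small language-specific module selected by a language ID. It is only a sketch under assumed names (`LanguageAdapter`, `ModularBlock`, `lang_id`), not the fairseq X-MOD implementation; see the fine-tuning and inference instructions below for actual usage.
+
+```python
+from typing import List
+
+import torch
+import torch.nn as nn
+
+
+class LanguageAdapter(nn.Module):
+    """Small bottleneck module owned by a single language (hypothetical name)."""
+
+    def __init__(self, dim: int, bottleneck: int = 64):
+        super().__init__()
+        self.down = nn.Linear(dim, bottleneck)
+        self.up = nn.Linear(bottleneck, dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Residual connection, so the module can stay close to the identity.
+        return x + self.up(torch.relu(self.down(x)))
+
+
+class ModularBlock(nn.Module):
+    """Shared feed-forward sub-layer followed by a per-language module."""
+
+    def __init__(self, dim: int, langs: List[str]):
+        super().__init__()
+        self.shared_ffn = nn.Sequential(
+            nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim)
+        )
+        self.adapters = nn.ModuleDict({lang: LanguageAdapter(dim) for lang in langs})
+
+    def forward(self, x: torch.Tensor, lang_id: str) -> torch.Tensor:
+        # Only the module of the current input's language is used.
+        return self.adapters[lang_id](self.shared_ffn(x))
+
+
+block = ModularBlock(dim=768, langs=["en_XX", "de_DE", "es_XX"])
+x = torch.randn(8, 16, 768)  # (batch, seq_len, dim)
+out = block(x, lang_id="de_DE")
+
+# For fine-tuning, the language-specific modules are typically kept frozen.
+for p in block.adapters.parameters():
+    p.requires_grad = False
+```
+
+Keeping the per-language components small and residual means that adding a language mostly adds parameters in these modules rather than in the shared trunk.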
+ + +## Pre-trained models + +Model | Size | # train steps | # langs | Download +---|---|---|---|--- +`xmod.base.13.125k` | BERT-base | 125k | 13 | [xmod.base.13.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.13.125k.tar.gz) +`xmod.base.30.125k` | BERT-base | 125k | 30 | [xmod.base.30.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.125k.tar.gz) +`xmod.base.30.195k` | BERT-base | 195k | 30 | [xmod.base.30.195k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.195k.tar.gz) +`xmod.base.60.125k` | BERT-base | 125k | 60 | [xmod.base.60.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.125k.tar.gz) +`xmod.base.60.265k` | BERT-base | 265k | 60 | [xmod.base.60.265k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.265k.tar.gz) +`xmod.base.75.125k` | BERT-base | 125k | 75 | [xmod.base.75.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.125k.tar.gz) +`xmod.base.75.269k` | BERT-base | 269k | 75 | [xmod.base.75.269k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.269k.tar.gz) +`xmod.base` | BERT-base | 1M | 81 | [xmod.base.81.1M.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.81.1M.tar.gz) +`xmod.large.prenorm` | BERT-large | 500k | 81 | [xmod.large.prenorm.81.500k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.large.prenorm.81.500k.tar.gz) + + +## Fine-tuning on NLI + +We next provide an example of how to fine-tune the pre-trained models above on Natural Language Inference (NLI). We use MNLI for training in English, and show how to run inference in other languages. + +### 1) Download a pre-trained model + +```bash +MODEL=xmod.base.81.1M +wget https://dl.fbaipublicfiles.com/fairseq/models/xmod/$MODEL.tar.gz +tar -xzf $MODEL.tar.gz +``` + +### 2) Download and preprocess [MNLI](https://cims.nyu.edu/~sbowman/multinli/) +```bash +wget https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip +unzip multinli_1.0.zip +python ./examples/xmod/preprocess_nli.py \ + --sentencepiece-model $MODEL/sentencepiece.bpe.model \ + --train multinli_1.0/multinli_1.0_train.jsonl \ + --valid multinli_1.0/multinli_1.0_dev_matched.jsonl \ + --destdir multinli_1.0/fairseq +``` + +### 3) Fine-tune on MNLI: + +```bash +MAX_EPOCH=5 +LR=1e-05 +BATCH_SIZE=32 +DATA_DIR=multinli_1.0/fairseq/bin + +CUDA_VISIBLE_DEVICES=0 fairseq-train $DATA_DIR \ + --restore-file $MODEL/model.pt \ + --save-dir $MODEL/nli \ + --reset-optimizer \ + --reset-dataloader \ + --reset-meters \ + --best-checkpoint-metric accuracy \ + --maximize-best-checkpoint-metric \ + --task sentence_prediction_adapters \ + --num-classes 3 \ + --init-token 0 \ + --separator-token 2 \ + --max-positions 512 \ + --shorten-method "truncate" \ + --arch xmod_base \ + --dropout 0.1 \ + --attention-dropout 0.1 \ + --weight-decay 0.01 \ + --criterion sentence_prediction_adapters \ + --optimizer adam \ + --adam-betas '(0.9, 0.98)' \ + --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler fixed \ + --lr $LR \ + --fp16 \ + --fp16-init-scale 4 \ + --threshold-loss-scale 1 \ + --fp16-scale-window 128 \ + --batch-size $BATCH_SIZE \ + --required-batch-size-multiple 1 \ + --update-freq 1 \ + --max-epoch $MAX_EPOCH +``` + +### 4) Run inference + +After training the model, we can load it and run inference in our target language. The default language is set to English, which is why we were not required to pass a language ID to the model during fine-tuning. 
To run inference in a non-English language, we need to tell the model that the module of the target language should be used instead: + +```python +from fairseq.models.xmod import XMODModel + +MODEL='xmod.base.81.1M/nli' +DATA='multinli_1.0/fairseq/bin' + +# Load model +model = XMODModel.from_pretrained( + model_name_or_path=MODEL, + checkpoint_file='checkpoint_best.pt', + data_name_or_path=DATA, + suffix='', + criterion='cross_entropy', + bpe='sentencepiece', + sentencepiece_model=DATA+'/input0/sentencepiece.bpe.model') +model = model.eval(); # disable dropout +model = model.half(); # use FP16 +model = model.cuda(); # move to GPU + +def predict(premise, hypothesis, lang): + tokens = model.encode(premise, hypothesis) + idx = model.predict('sentence_classification_head', tokens, lang_id=[lang]).argmax().item() + dictionary = model.task.label_dictionary + return dictionary[idx + dictionary.nspecial] + +predict( + premise='X-Mod hat spezifische Module die für jede Sprache existieren.', + hypothesis='X-Mod hat Module.', + lang='de_DE' +) # entailment + +predict( + premise='Londres es la capital del Reino Unido.', + hypothesis='Londres está en Francia.', + lang='es_XX', +) # contradiction + +predict( + premise='Patxik gogoko ditu babarrunak.', + hypothesis='Patxik babarrunak bazkaldu zituen.', + lang='eu_ES', +) # neutral +``` + + +## Citation + +```bibtex +@misc{pfeiffer2022xmod, + doi = {10.48550/ARXIV.2205.06266}, + url = {https://arxiv.org/abs/2205.06266}, + title = {Lifting the Curse of Multilinguality by Pre-training Modular Transformers}, + publisher = {arXiv}, + year = {2022}, +} +``` diff --git a/examples/xmod/preprocess_nli.py b/examples/xmod/preprocess_nli.py new file mode 100644 index 0000000000..e1fb91c5d3 --- /dev/null +++ b/examples/xmod/preprocess_nli.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import json +import collections +import argparse +import shutil +import subprocess +import sys +import tempfile +from multiprocessing import Pool +import sentencepiece as spm + + +def preprocess(spm_model_path, train_path, valid_path, test_path, dest_dir, remove_empty=False, output_format='piece', workers=20): + with tempfile.TemporaryDirectory() as tmp: + # Tokenize with SentencePiece + for split, path in ('train', train_path), ('valid', valid_path), ('test', test_path): + if path is None: + continue + if path == '-': + path = sys.stdin.fileno() + with open(path, encoding='utf-8', errors='surrogateescape') as fin: + with open(f'{tmp}/{split}', mode='w', encoding='utf-8', errors='surrogateescape') as fout: + encoder = MultiprocessingEncoder(model=spm_model_path, remove_empty=remove_empty, output_format=output_format) + pool = Pool(workers, initializer=encoder.initializer) + encoded_lines = pool.imap(encoder.encode, fin, 10000) + for i, line in enumerate(encoded_lines, start=1): + if line is not None: + print(line, file=fout) + if i % 10000 == 0: + print("tokenized {} lines".format(i), file=sys.stderr) + + # Generate dictionary + sp = spm.SentencePieceProcessor(model_file=spm_model_path) + if output_format == 'piece': + vocab = [sp.id_to_piece(i) for i in range(3, sp.vocab_size())] + else: + vocab = map(str, range(sp.vocab_size())) + with open(f'{tmp}/dict.txt', mode='w', encoding='utf-8', errors='surrogateescape') as f: + for word in vocab: + print(word, 1, file=f) + + # Binarize + command = [ + 'python3', '-m', 'fairseq_cli.preprocess', + '--only-source', + '--thresholdsrc', '0', + '--destdir', dest_dir, + '--srcdict', f'{tmp}/dict.txt', + '--workers', '20', + ] + for split, path in ('train', train_path), ('valid', valid_path), ('test', test_path): + if path is not None: + command += [f'--{split}pref', f'{tmp}/{split}'] + subprocess.run(command) + + # Copy SentencePiece model + shutil.copyfile(spm_model_path, f'{dest_dir}/sentencepiece.bpe.model') + + +class MultiprocessingEncoder(object): + def __init__(self, model, remove_empty, output_format): + self.model = model + self.remove_empty = remove_empty + self.output_format = output_format + + def initializer(self): + global sp + sp = spm.SentencePieceProcessor(model_file=self.model) + + def encode(self, line): + global sp + line = line.strip() + if len(line) == 0 and self.remove_empty: + return None + + if self.output_format == 'piece': + return ' '.join(sp.encode_as_pieces(line)) + else: + return ' '.join(map(str, sp.encode(line))) + + +def write_lines(lines, path): + with open(path, mode='x', encoding='utf-8') as f: + for line in lines: + print(line, file=f) + + +def read_jsonl(path): + with open(path, encoding='utf-8') as f: + return [json.loads(line) for line in f.read().splitlines()] + + +def read_nli(path, langs=None): + data = read_jsonl(path) + + if langs is not None: + data = [sample for sample in data if sample.get('language') in langs] + + lang2count = collections.defaultdict(int) + for sample in data: + lang2count[sample.get('language')] += 1 + + if langs: + assert set(lang2count.keys()) == set(langs) + + nlangs = len(lang2count) + assert nlangs > 0 + lens = list(lang2count.values()) + assert all([lens[0] == length for length in lens]) + + print(f'Loaded {lens[0]} samples in {nlangs} languages from {path}', file=sys.stderr) + return data + + +def main(): + parser = argparse.ArgumentParser(description='Tokenize and binarize NLI data') + parser.add_argument('--sentencepiece-model', required=True) + 
parser.add_argument('--train', required=True, help='Training data in jsonl format') + parser.add_argument('--valid', required=True, help='Validation data in jsonl format') + parser.add_argument('--destdir', required=True) + + args = parser.parse_args() + + os.makedirs(args.destdir + '/raw',) + os.makedirs(args.destdir + '/bin', ) + + # Extract input/labels + for split, path in ('train', args.train), ('valid', args.valid): + data = read_nli(path, langs=None) + original_size = len(data) + data = [sample for sample in data if sample['gold_label'] != '-'] + assert all(sample['gold_label'] in ('contradiction', 'entailment', 'neutral') for sample in data) + filtered_size = len(data) + if filtered_size != original_size: + print(f'Filtered {filtered_size}/{original_size} samples from {path}', file=sys.stderr) + for name, field in ('input0', 'sentence1'), ('input1', 'sentence2'), ('label', 'gold_label'): + write_lines([sample[field] for sample in data], f'{args.destdir}/raw/{split}.{name}.txt') + + # Tokenize and binarize input + for field in 'input0', 'input1': + preprocess( + spm_model_path=args.sentencepiece_model, + train_path=f'{args.destdir}/raw/train.{field}.txt', + valid_path=f'{args.destdir}/raw/valid.{field}.txt', + test_path=None, + dest_dir=f'{args.destdir}/bin/{field}', + workers=20, + ) + + # Binarize labels + subprocess.run([ + 'python3', '-m', 'fairseq_cli.preprocess', + '--trainpref', f'{args.destdir}/raw/train.label.txt', + '--validpref', f'{args.destdir}/raw/valid.label.txt', + '--only-source', + '--thresholdsrc', '0', + '--destdir', f'{args.destdir}/bin/label', + '--workers', '20', + ]) + + +if __name__ == '__main__': + main() diff --git a/fairseq/__init__.py b/fairseq/__init__.py index 4ccfc90257..080c988b2d 100644 --- a/fairseq/__init__.py +++ b/fairseq/__init__.py @@ -16,14 +16,22 @@ __all__ = ["pdb"] -# backwards compatibility to support `from fairseq.meters import AverageMeter` +# backwards compatibility to support `from fairseq.X import Y` +from fairseq.distributed import utils as distributed_utils from fairseq.logging import meters, metrics, progress_bar # noqa +sys.modules["fairseq.distributed_utils"] = distributed_utils sys.modules["fairseq.meters"] = meters sys.modules["fairseq.metrics"] = metrics sys.modules["fairseq.progress_bar"] = progress_bar +# initialize hydra +from fairseq.dataclass.initialize import hydra_init + +hydra_init() + import fairseq.criterions # noqa +import fairseq.distributed # noqa import fairseq.models # noqa import fairseq.modules # noqa import fairseq.optim # noqa diff --git a/fairseq/benchmark/__init__.py b/fairseq/benchmark/__init__.py index f6584661bd..0317d5c623 100644 --- a/fairseq/benchmark/__init__.py +++ b/fairseq/benchmark/__init__.py @@ -4,4 +4,4 @@ # LICENSE file in the root directory of this source tree. # import models/tasks to register them -from . import dummy_lm, dummy_masked_lm, dummy_model, dummy_mt # noqa +from . import dummy_dataset, dummy_lm, dummy_masked_lm, dummy_model, dummy_mt # noqa diff --git a/fairseq/benchmark/benchmark_multihead_attention.py b/fairseq/benchmark/benchmark_multihead_attention.py new file mode 100644 index 0000000000..a44847f250 --- /dev/null +++ b/fairseq/benchmark/benchmark_multihead_attention.py @@ -0,0 +1,172 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import itertools +import random + +import torch +from torch.utils import benchmark + +from fairseq.modules.multihead_attention import MultiheadAttention + +BATCH = [20, 41, 97] +SEQ = 64 +EMB = 48 +HEADS = 4 +DROP = 0.1 +DEVICE = torch.device("cuda") +ATTN_MASK_DTYPE = [torch.uint8, torch.bool, torch.float] +KEY_PADDING_MASK_DTYPE = [torch.uint8, torch.bool] + + +def _reset_seeds(): + torch.manual_seed(0) + random.seed(0) + + +def _get_mask(to_dtype: torch.dtype, dim0: int, dim1: int): + if to_dtype == torch.float: + mask = torch.randint(0, 2, (dim0, dim1)).to(dtype=torch.bool) + return mask.to(dtype=to_dtype).masked_fill(mask, -float("inf")) + return torch.randint(0, 2, (dim0, dim1)).to(dtype=to_dtype) + + +def benchmark_multihead_attention( + label="", + attn_dtype=torch.uint8, + key_padding_dtype=torch.uint8, + add_bias_kv=False, + add_zero_attn=False, + static_kv=False, + batch_size=20, + embedding=EMB, + seq_len=SEQ, + num_heads=HEADS, +): + + results = [] + # device = torch.device("cuda") + + xformers_att_config = '{"name": "scaled_dot_product"}' + + attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len) + key_padding_mask = _get_mask( + to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len + ) + + q = torch.rand(seq_len, batch_size, embedding, requires_grad=True) + k = torch.rand(seq_len, batch_size, embedding, requires_grad=True) + v = torch.rand(seq_len, batch_size, embedding, requires_grad=True) + + _reset_seeds() + + original_mha = MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + xformers_att_config=None, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + xformers_mha = MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + xformers_att_config=xformers_att_config, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv): + original_mha( + query=q, + key=k, + value=v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + + def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv): + xformers_mha( + query=q, + key=k, + value=v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + + def original_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv): + output, _ = original_mha( + query=q, + key=k, + value=v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + loss = torch.norm(output) + loss.backward() + + def xformers_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv): + output, _ = xformers_mha( + query=q, + key=k, + value=v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + loss = torch.norm(output) + loss.backward() + + fns = [ + original_bench_fw, + xformers_bench_fw, + original_bench_fw_bw, + xformers_bench_fw_bw, + ] + + for fn in fns: + results.append( + benchmark.Timer( + stmt="fn(q, k, v, key_padding_mask, attn_mask, static_kv)", + globals={ + "q": q, + "k": k, + "v": v, + "key_padding_mask": key_padding_mask, + "attn_mask": attn_mask, + "static_kv": static_kv, + "fn": fn, + }, + label="multihead fw + bw", + sub_label=f"{fn.__name__}", + description=label, + ).blocked_autorange(min_run_time=1) + ) + + compare = benchmark.Compare(results) + compare.print() + + +def run_benchmarks(): + for attn_dtype, key_padding_dtype, add_bias_kv, add_zero_attn in itertools.product( + ATTN_MASK_DTYPE, KEY_PADDING_MASK_DTYPE, [True, False], [True, False] + ): + label = 
f"attn_dtype {attn_dtype}, key_padding_dtype {key_padding_dtype}, \ + add_bias_kv {add_bias_kv}, add_zero_attn {add_zero_attn}" + benchmark_multihead_attention( + label=label, + attn_dtype=attn_dtype, + key_padding_dtype=key_padding_dtype, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + +run_benchmarks() diff --git a/fairseq/benchmark/dummy_dataset.py b/fairseq/benchmark/dummy_dataset.py new file mode 100644 index 0000000000..2f051754af --- /dev/null +++ b/fairseq/benchmark/dummy_dataset.py @@ -0,0 +1,36 @@ +import numpy as np +from fairseq.data import FairseqDataset + + +class DummyDataset(FairseqDataset): + def __init__(self, batch, num_items, item_size): + super().__init__() + self.batch = batch + self.num_items = num_items + self.item_size = item_size + + def __getitem__(self, index): + return index + + def __len__(self): + return self.num_items + + def collater(self, samples): + return self.batch + + @property + def sizes(self): + return np.array([self.item_size] * self.num_items) + + def num_tokens(self, index): + return self.item_size + + def size(self, index): + return self.item_size + + def ordered_indices(self): + return np.arange(self.num_items) + + @property + def supports_prefetch(self): + return False diff --git a/fairseq/benchmark/dummy_lm.py b/fairseq/benchmark/dummy_lm.py index 6429d04de3..c6246a0c0e 100644 --- a/fairseq/benchmark/dummy_lm.py +++ b/fairseq/benchmark/dummy_lm.py @@ -4,76 +4,74 @@ # LICENSE file in the root directory of this source tree. import logging +from dataclasses import dataclass, field +from typing import Optional -import numpy as np import torch -from fairseq.data import Dictionary, FairseqDataset -from fairseq.tasks import LegacyFairseqTask, register_task +from .dummy_dataset import DummyDataset +from fairseq.data import Dictionary +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task +from omegaconf import II logger = logging.getLogger(__name__) -@register_task("dummy_lm") -class DummyLMTask(LegacyFairseqTask): - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("--dict-size", default=49996, type=int) - parser.add_argument("--dataset-size", default=100000, type=int) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) +@dataclass +class DummyLMConfig(FairseqDataclass): + dict_size: int = 49996 + dataset_size: int = 100000 + tokens_per_sample: int = field( + default=512, metadata={"help": "max sequence length"} + ) + add_bos_token: bool = False + batch_size: Optional[int] = II("dataset.batch_size") + max_tokens: Optional[int] = II("dataset.max_tokens") + max_target_positions: int = II("task.tokens_per_sample") + - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed +@register_task("dummy_lm", dataclass=DummyLMConfig) +class DummyLMTask(FairseqTask): + def __init__(self, cfg: DummyLMConfig): + super().__init__(cfg) - dictionary.pad_to_multiple_(8) # often faster if divisible by 8 + # load dictionary + self.dictionary = Dictionary() + for i in range(cfg.dict_size): + self.dictionary.add_symbol("word{}".format(i)) + self.dictionary.pad_to_multiple_(8) # often faster if divisible by 8 + logger.info("dictionary: {} types".format(len(self.dictionary))) - seq = torch.arange(args.tokens_per_sample + 1) + dictionary.pad() + 1 + seq = 
torch.arange(cfg.tokens_per_sample + 1) + self.dictionary.pad() + 1 self.dummy_src = seq[:-1] self.dummy_tgt = seq[1:] - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task. """ - dictionary = Dictionary() - for i in range(args.dict_size): - dictionary.add_symbol("word{}".format(i)) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ - if self.args.batch_size is not None: - bsz = self.args.batch_size + if self.cfg.batch_size is not None: + bsz = self.cfg.batch_size else: - bsz = max(1, self.args.max_tokens // self.args.tokens_per_sample) + bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample) self.datasets[split] = DummyDataset( { "id": 1, "net_input": { "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]), "src_lengths": torch.full( - (bsz,), self.args.tokens_per_sample, dtype=torch.long + (bsz,), self.cfg.tokens_per_sample, dtype=torch.long ), }, "target": torch.stack([self.dummy_tgt for _ in range(bsz)]), "nsentences": bsz, - "ntokens": bsz * self.args.tokens_per_sample, + "ntokens": bsz * self.cfg.tokens_per_sample, }, - num_items=self.args.dataset_size, - item_size=self.args.tokens_per_sample, + num_items=self.cfg.dataset_size, + item_size=self.cfg.tokens_per_sample, ) @property @@ -83,37 +81,3 @@ def source_dictionary(self): @property def target_dictionary(self): return self.dictionary - - -class DummyDataset(FairseqDataset): - def __init__(self, batch, num_items, item_size): - super().__init__() - self.batch = batch - self.num_items = num_items - self.item_size = item_size - - def __getitem__(self, index): - return index - - def __len__(self): - return self.num_items - - def collater(self, samples): - return self.batch - - @property - def sizes(self): - return np.array([self.item_size] * self.num_items) - - def num_tokens(self, index): - return self.item_size - - def size(self, index): - return self.item_size - - def ordered_indices(self): - return np.arange(self.num_items) - - @property - def supports_prefetch(self): - return False diff --git a/fairseq/benchmark/dummy_masked_lm.py b/fairseq/benchmark/dummy_masked_lm.py index ab506fe1d5..12b9c5d0f5 100644 --- a/fairseq/benchmark/dummy_masked_lm.py +++ b/fairseq/benchmark/dummy_masked_lm.py @@ -4,43 +4,53 @@ # LICENSE file in the root directory of this source tree. 
import logging +from dataclasses import dataclass, field +from typing import Optional -import numpy as np import torch -from fairseq.data import Dictionary, FairseqDataset -from fairseq.tasks import LegacyFairseqTask, register_task +from omegaconf import II +from .dummy_dataset import DummyDataset +from fairseq.data import Dictionary +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task logger = logging.getLogger(__name__) -@register_task("dummy_masked_lm") -class DummyMaskedLMTask(LegacyFairseqTask): - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("--dict-size", default=49995, type=int) - parser.add_argument("--dataset-size", default=100000, type=int) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - +@dataclass +class DummyMaskedLMConfig(FairseqDataclass): + dict_size: int = 49996 + dataset_size: int = 100000 + tokens_per_sample: int = field( + default=512, + metadata={ + "help": "max number of total tokens over all" + " segments per sample for BERT dataset" + }, + ) + batch_size: Optional[int] = II("dataset.batch_size") + max_tokens: Optional[int] = II("dataset.max_tokens") + max_target_positions: int = II("task.tokens_per_sample") + + +@register_task("dummy_masked_lm", dataclass=DummyMaskedLMConfig) +class DummyMaskedLMTask(FairseqTask): + def __init__(self, cfg: DummyMaskedLMConfig): + super().__init__(cfg) + + self.dictionary = Dictionary() + for i in range(cfg.dict_size): + self.dictionary.add_symbol("word{}".format(i)) + logger.info("dictionary: {} types".format(len(self.dictionary))) # add mask token - self.mask_idx = dictionary.add_symbol("<mask>") - dictionary.pad_to_multiple_(8) # often faster if divisible by 8 + self.mask_idx = self.dictionary.add_symbol("<mask>") + self.dictionary.pad_to_multiple_(8) # often faster if divisible by 8 mask_idx = 0 pad_idx = 1 - seq = torch.arange(args.tokens_per_sample) + pad_idx + 1 - mask = torch.arange(2, args.tokens_per_sample, 7) # ~15% + seq = torch.arange(cfg.tokens_per_sample) + pad_idx + 1 + mask = torch.arange(2, cfg.tokens_per_sample, 7) # ~15% src = seq.clone() src[mask] = mask_idx tgt = torch.full_like(seq, pad_idx) @@ -49,39 +59,30 @@ def __init__(self, args, dictionary): self.dummy_src = src self.dummy_tgt = tgt - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task. """ - dictionary = Dictionary() - for i in range(args.dict_size): - dictionary.add_symbol("word{}".format(i)) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. 
Args: split (str): name of the split (e.g., train, valid, test) """ - if self.args.batch_size is not None: - bsz = self.args.batch_size + if self.cfg.batch_size is not None: + bsz = self.cfg.batch_size else: - bsz = max(1, self.args.max_tokens // self.args.tokens_per_sample) + bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample) self.datasets[split] = DummyDataset( { "id": 1, "net_input": { "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]), "src_lengths": torch.full( - (bsz,), self.args.tokens_per_sample, dtype=torch.long + (bsz,), self.cfg.tokens_per_sample, dtype=torch.long ), }, "target": torch.stack([self.dummy_tgt for _ in range(bsz)]), "nsentences": bsz, - "ntokens": bsz * self.args.tokens_per_sample, + "ntokens": bsz * self.cfg.tokens_per_sample, }, - num_items=self.args.dataset_size, - item_size=self.args.tokens_per_sample, + num_items=self.cfg.dataset_size, + item_size=self.cfg.tokens_per_sample, ) @property @@ -91,37 +92,3 @@ def source_dictionary(self): @property def target_dictionary(self): return self.dictionary - - -class DummyDataset(FairseqDataset): - def __init__(self, batch, num_items, item_size): - super().__init__() - self.batch = batch - self.num_items = num_items - self.item_size = item_size - - def __getitem__(self, index): - return index - - def __len__(self): - return self.num_items - - def collater(self, samples): - return self.batch - - @property - def sizes(self): - return np.array([self.item_size] * self.num_items) - - def num_tokens(self, index): - return self.item_size - - def size(self, index): - return self.item_size - - def ordered_indices(self): - return np.arange(self.num_items) - - @property - def supports_prefetch(self): - return False diff --git a/fairseq/benchmark/dummy_mt.py b/fairseq/benchmark/dummy_mt.py index 4ca7be93a3..28d78cffdb 100644 --- a/fairseq/benchmark/dummy_mt.py +++ b/fairseq/benchmark/dummy_mt.py @@ -7,10 +7,10 @@ import numpy as np import torch + from fairseq.data import Dictionary, FairseqDataset from fairseq.tasks import LegacyFairseqTask, register_task - logger = logging.getLogger(__name__) @@ -36,7 +36,7 @@ def __init__(self, args, dictionary): @classmethod def setup_task(cls, args, **kwargs): - """Setup the task. """ + """Setup the task.""" dictionary = Dictionary() for i in range(args.dict_size): dictionary.add_symbol("word{}".format(i)) diff --git a/fairseq/binarizer.py b/fairseq/binarizer.py index 0255c084b5..6f03d7a2cb 100644 --- a/fairseq/binarizer.py +++ b/fairseq/binarizer.py @@ -3,103 +3,379 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import logging import os +import typing as tp +from abc import ABC, abstractmethod from collections import Counter +from dataclasses import dataclass +from multiprocessing import Pool import torch + +from fairseq.data import Dictionary, indexed_dataset +from fairseq.file_chunker_utils import Chunker, find_offsets from fairseq.file_io import PathManager from fairseq.tokenizer import tokenize_line +logger = logging.getLogger("binarizer") + + +@dataclass +class BinarizeSummary: + """ + Keep track of what's going on in the binarizer + """ + + num_seq: int = 0 + replaced: tp.Optional[Counter] = None + num_tok: int = 0 + + @property + def num_replaced(self) -> int: + if self.replaced is None: + return 0 + return sum(self.replaced.values()) + + @property + def replaced_percent(self) -> float: + return 100 * self.num_replaced / self.num_tok + + def __str__(self) -> str: + base = f"{self.num_seq} sents, {self.num_tok} tokens" + if self.replaced is None: + return base + + return f"{base}, {self.replaced_percent:.3}% replaced" + + def merge(self, other: "BinarizeSummary"): + replaced = None + if self.replaced is not None: + replaced = self.replaced + if other.replaced is not None: + if replaced is None: + replaced = other.replaced + else: + replaced += other.replaced + self.replaced = replaced + self.num_seq += other.num_seq + self.num_tok += other.num_tok + + +class Binarizer(ABC): + """ + a binarizer describes how to take a string and build a tensor out of it + """ + + @abstractmethod + def binarize_line( + self, + line: str, + summary: BinarizeSummary, + ) -> torch.IntTensor: + ... + -def safe_readline(f): - pos = f.tell() - while True: - try: - return f.readline() - except UnicodeDecodeError: - pos -= 1 - f.seek(pos) # search where this character begins +def _worker_prefix(output_prefix: str, worker_id: int): + return f"{output_prefix}.pt{worker_id}" -class Binarizer: +class FileBinarizer: + """ + An file binarizer can take a file, tokenize it, and binarize each line to a tensor + """ + + @classmethod + def multiprocess_dataset( + cls, + input_file: str, + dataset_impl: str, + binarizer: Binarizer, + output_prefix: str, + vocab_size=None, + num_workers=1, + ) -> BinarizeSummary: + final_summary = BinarizeSummary() + + offsets = find_offsets(input_file, num_workers) + # find_offsets returns a list of position [pos1, pos2, pos3, pos4] but we would want pairs: + # [(pos1, pos2), (pos2, pos3), (pos3, pos4)] to process the chunks with start/end info + # we zip the list with itself shifted by one to get all the pairs. 
+ (first_chunk, *more_chunks) = zip(offsets, offsets[1:]) + pool = None + if num_workers > 1: + pool = Pool(processes=num_workers - 1) + worker_results = [ + pool.apply_async( + cls._binarize_chunk_and_finalize, + args=( + binarizer, + input_file, + start_offset, + end_offset, + _worker_prefix( + output_prefix, + worker_id, + ), + dataset_impl, + ), + kwds={ + "vocab_size": vocab_size, + } + if vocab_size is not None + else {}, + ) + for worker_id, (start_offset, end_offset) in enumerate( + more_chunks, start=1 + ) + ] + + pool.close() + pool.join() + for r in worker_results: + summ = r.get() + final_summary.merge(summ) + + # do not close the bin file as we need to merge the worker results in + final_ds, summ = cls._binarize_file_chunk( + binarizer, + input_file, + offset_start=first_chunk[0], + offset_end=first_chunk[1], + output_prefix=output_prefix, + dataset_impl=dataset_impl, + vocab_size=vocab_size if vocab_size is not None else None, + ) + final_summary.merge(summ) + + if num_workers > 1: + for worker_id in range(1, num_workers): + # merge the worker outputs + worker_output_prefix = _worker_prefix( + output_prefix, + worker_id, + ) + final_ds.merge_file_(worker_output_prefix) + try: + os.remove(indexed_dataset.data_file_path(worker_output_prefix)) + os.remove(indexed_dataset.index_file_path(worker_output_prefix)) + except Exception as e: + logger.error( + f"couldn't remove {worker_output_prefix}.*", exc_info=e + ) + + # now we can close the file + idx_file = indexed_dataset.index_file_path(output_prefix) + final_ds.finalize(idx_file) + return final_summary + @staticmethod - def binarize( - filename, - dict, - consumer, - tokenize=tokenize_line, - append_eos=True, - reverse_order=False, - offset=0, - end=-1, - already_numberized=False, + def _binarize_file_chunk( + binarizer: Binarizer, + filename: str, + offset_start: int, + offset_end: int, + output_prefix: str, + dataset_impl: str, + vocab_size=None, + ) -> tp.Tuple[tp.Any, BinarizeSummary]: # (dataset builder, BinarizeSummary) + """ + creates a dataset builder and append binarized items to it. This function does not + finalize the builder, this is useful if you want to do other things with your bin file + like appending/merging other files + """ + bin_file = indexed_dataset.data_file_path(output_prefix) + ds = indexed_dataset.make_builder( + bin_file, + impl=dataset_impl, + vocab_size=vocab_size, + ) + summary = BinarizeSummary() + + with Chunker( + PathManager.get_local_path(filename), offset_start, offset_end + ) as line_iterator: + for line in line_iterator: + ds.add_item(binarizer.binarize_line(line, summary)) + + return ds, summary + + @classmethod + def _binarize_chunk_and_finalize( + cls, + binarizer: Binarizer, + filename: str, + offset_start: int, + offset_end: int, + output_prefix: str, + dataset_impl: str, + vocab_size=None, ): - nseq, ntok = 0, 0 - replaced = Counter() + """ + same as above, but also finalizes the builder + """ + ds, summ = cls._binarize_file_chunk( + binarizer, + filename, + offset_start, + offset_end, + output_prefix, + dataset_impl, + vocab_size=vocab_size, + ) + + idx_file = indexed_dataset.index_file_path(output_prefix) + ds.finalize(idx_file) + + return summ + + +class VocabularyDatasetBinarizer(Binarizer): + """ + Takes a Dictionary/Vocabulary, assign ids to each + token using the dictionary encode_line function. 
+ """ + + def __init__( + self, + dict: Dictionary, + tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line, + append_eos: bool = True, + reverse_order: bool = False, + already_numberized: bool = False, + ) -> None: + self.dict = dict + self.tokenize = tokenize + self.append_eos = append_eos + self.reverse_order = reverse_order + self.already_numberized = already_numberized + super().__init__() + + def binarize_line( + self, + line: str, + summary: BinarizeSummary, + ): + if summary.replaced is None: + summary.replaced = Counter() def replaced_consumer(word, idx): - if idx == dict.unk_index and word != dict.unk_word: - replaced.update([word]) - - with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f: - f.seek(offset) - # next(f) breaks f.tell(), hence readline() must be used - line = safe_readline(f) - while line: - if end > 0 and f.tell() > end: - break - if already_numberized: - id_strings = line.strip().split() - id_list = [int(id_string) for id_string in id_strings] - if reverse_order: - id_list.reverse() - if append_eos: - id_list.append(dict.eos()) - ids = torch.IntTensor(id_list) - else: - ids = dict.encode_line( - line=line, - line_tokenizer=tokenize, - add_if_not_exist=False, - consumer=replaced_consumer, - append_eos=append_eos, - reverse_order=reverse_order, - ) - nseq += 1 - ntok += len(ids) - consumer(ids) - line = f.readline() - return { - "nseq": nseq, - "nunk": sum(replaced.values()), - "ntok": ntok, - "replaced": replaced, - } + if idx == self.dict.unk_index and word != self.dict.unk_word: + summary.replaced.update([word]) + + if self.already_numberized: + id_strings = line.strip().split() + id_list = [int(id_string) for id_string in id_strings] + if self.reverse_order: + id_list.reverse() + if self.append_eos: + id_list.append(self.dict.eos()) + ids = torch.IntTensor(id_list) + else: + ids = self.dict.encode_line( + line=line, + line_tokenizer=self.tokenize, + add_if_not_exist=False, + consumer=replaced_consumer, + append_eos=self.append_eos, + reverse_order=self.reverse_order, + ) + + summary.num_seq += 1 + summary.num_tok += len(ids) + return ids - @staticmethod - def binarize_alignments(filename, alignment_parser, consumer, offset=0, end=-1): - nseq = 0 - - with open(PathManager.get_local_path(filename), "r") as f: - f.seek(offset) - line = safe_readline(f) - while line: - if end > 0 and f.tell() > end: - break - ids = alignment_parser(line) - nseq += 1 - consumer(ids) - line = f.readline() - return {"nseq": nseq} + +class AlignmentDatasetBinarizer(Binarizer): + """ + binarize by parsing a set of alignments and packing + them in a tensor (see utils.parse_alignment) + """ + + def __init__( + self, + alignment_parser: tp.Callable[[str], torch.IntTensor], + ) -> None: + super().__init__() + self.alignment_parser = alignment_parser + + def binarize_line( + self, + line: str, + summary: BinarizeSummary, + ): + ids = self.alignment_parser(line) + summary.num_seq += 1 + summary.num_tok += len(ids) + return ids + + +class LegacyBinarizer: + @classmethod + def binarize( + cls, + filename: str, + dico: Dictionary, + consumer: tp.Callable[[torch.IntTensor], None], + tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line, + append_eos: bool = True, + reverse_order: bool = False, + offset: int = 0, + end: int = -1, + already_numberized: bool = False, + ) -> tp.Dict[str, int]: + binarizer = VocabularyDatasetBinarizer( + dict=dico, + tokenize=tokenize, + append_eos=append_eos, + reverse_order=reverse_order, + already_numberized=already_numberized, + ) + 
return cls._consume_file( + filename, + binarizer, + consumer, + offset_start=offset, + offset_end=end, + ) + + @classmethod + def binarize_alignments( + cls, + filename: str, + alignment_parser: tp.Callable[[str], torch.IntTensor], + consumer: tp.Callable[[torch.IntTensor], None], + offset: int = 0, + end: int = -1, + ) -> tp.Dict[str, int]: + binarizer = AlignmentDatasetBinarizer(alignment_parser) + return cls._consume_file( + filename, + binarizer, + consumer, + offset_start=offset, + offset_end=end, + ) @staticmethod - def find_offsets(filename, num_chunks): - with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f: - size = os.fstat(f.fileno()).st_size - chunk_size = size // num_chunks - offsets = [0 for _ in range(num_chunks + 1)] - for i in range(1, num_chunks): - f.seek(chunk_size * i) - safe_readline(f) - offsets[i] = f.tell() - return offsets + def _consume_file( + filename: str, + binarizer: Binarizer, + consumer: tp.Callable[[torch.IntTensor], None], + offset_start: int, + offset_end: int, + ) -> tp.Dict[str, int]: + summary = BinarizeSummary() + + with Chunker( + PathManager.get_local_path(filename), offset_start, offset_end + ) as line_iterator: + for line in line_iterator: + consumer(binarizer.binarize_line(line, summary)) + + return { + "nseq": summary.num_seq, + "nunk": summary.num_replaced, + "ntok": summary.num_tok, + "replaced": summary.replaced, + } diff --git a/fairseq/checkpoint_utils.py b/fairseq/checkpoint_utils.py index f8a5855622..e3f316b9e7 100644 --- a/fairseq/checkpoint_utils.py +++ b/fairseq/checkpoint_utils.py @@ -5,32 +5,38 @@ import ast import collections +import contextlib +import inspect import logging import os import re +import time import traceback from collections import OrderedDict -from typing import Optional, Union +from pathlib import Path +from typing import Any, Dict, Optional, Union +import numpy as np import torch +from fairseq.data import data_utils +from fairseq.dataclass.configs import CheckpointConfig from fairseq.dataclass.utils import ( convert_namespace_to_omegaconf, overwrite_args_by_name, ) +from fairseq.distributed.fully_sharded_data_parallel import FSDP, has_FSDP from fairseq.file_io import PathManager from fairseq.models import FairseqDecoder, FairseqEncoder -from omegaconf import DictConfig, open_dict -from torch.serialization import default_restore_location - +from omegaconf import DictConfig, OmegaConf, open_dict logger = logging.getLogger(__name__) -def save_checkpoint(cfg: DictConfig, trainer, epoch_itr, val_loss): +def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss): from fairseq import meters # only one worker should attempt to create the required dir - if cfg.distributed_rank == 0: + if trainer.data_parallel_rank == 0: os.makedirs(cfg.save_dir, exist_ok=True) prev_best = getattr(save_checkpoint, "best", val_loss) @@ -39,15 +45,14 @@ def save_checkpoint(cfg: DictConfig, trainer, epoch_itr, val_loss): save_checkpoint.best = best_function(val_loss, prev_best) if cfg.no_save: - return + return None - trainer.consolidate_optimizer() + trainer.consolidate_optimizer() # TODO(SS): do we need this if no_save_optimizer_state - if not trainer.is_data_parallel_master: - return - - def is_better(a, b): - return a >= b if cfg.maximize_best_checkpoint_metric else a <= b + if not trainer.should_save_checkpoint_on_current_rank: + if trainer.always_call_state_dict_during_save_checkpoint: + trainer.state_dict() + return None write_timer = meters.StopwatchMeter() write_timer.start() @@ -56,7 +61,12 @@ 
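Editor's aside: the binarizer refactor shown earlier replaces the static Binarizer.binarize helpers with composable classes. A hedged sketch of how they might be driven end to end; the file paths, output prefix, and dataset_impl value are hypothetical, and the import path assumes the module above lands in fairseq/binarizer.py.

# Editor's sketch, not part of the patch.
from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer
from fairseq.data import Dictionary

vocab = Dictionary.load("dict.src.txt")                 # assumed dictionary file
binarizer = VocabularyDatasetBinarizer(vocab, append_eos=True)
summary = FileBinarizer.multiprocess_dataset(
    input_file="train.src",                             # assumed raw text file
    dataset_impl="mmap",                                # one of the indexed_dataset impls
    binarizer=binarizer,
    output_prefix="data-bin/train.src",                 # assumed output prefix
    vocab_size=len(vocab),
    num_workers=4,
)
print(summary)   # e.g. "123 sents, 4567 tokens, 0.1% replaced"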
def is_better(a, b): end_of_epoch = epoch_itr.end_of_epoch() updates = trainer.get_num_updates() - suffix = cfg.checkpoint_suffix or "" + logger.info(f"Preparing to save checkpoint for epoch {epoch} @ {updates} updates") + + def is_better(a, b): + return a >= b if cfg.maximize_best_checkpoint_metric else a <= b + + suffix = trainer.checkpoint_suffix checkpoint_conds = collections.OrderedDict() checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = ( end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0 @@ -71,56 +81,117 @@ def is_better(a, b): or is_better(val_loss, save_checkpoint.best) ) if val_loss is not None and cfg.keep_best_checkpoints > 0: - checkpoint_conds[ - "checkpoint.best_{}_{:.2f}.pt".format(cfg.best_checkpoint_metric, val_loss) - ] = not hasattr(save_checkpoint, "best") or is_better( - val_loss, save_checkpoint.best + worst_best = getattr(save_checkpoint, "best", None) + chkpts = checkpoint_paths( + cfg.save_dir, + pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format( + cfg.best_checkpoint_metric, suffix + ), ) + if len(chkpts) > 0: + p = chkpts[-1] if cfg.maximize_best_checkpoint_metric else chkpts[0] + worst_best = float(p.rsplit("_")[-1].replace("{}.pt".format(suffix), "")) + # add random digits to resolve ties + with data_utils.numpy_seed(epoch, updates, val_loss): + rand_sfx = np.random.randint(0, cfg.keep_best_checkpoints) + + checkpoint_conds[ + "checkpoint.best_{}_{:.3f}{}{}.pt".format( + cfg.best_checkpoint_metric, val_loss, rand_sfx, suffix + ) + ] = worst_best is None or is_better(val_loss, worst_best) checkpoint_conds[ "checkpoint_last{}.pt".format(suffix) ] = not cfg.no_last_checkpoints - extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss} + extra_state = { + "train_iterator": epoch_itr.state_dict(), + "val_loss": val_loss, + } + + # Going forward, different tasks could expose an API like this to dump all + # the checkpoint worthy attributes in a dictionary which then will be + # merged with the parent dictionary to create the "extra_state". This + # allows for an extensible yet simple design to checkpoint task level + # attributes + if hasattr(trainer.task, "get_checkpoint_dict"): + extra_state = {**extra_state, **trainer.task.get_checkpoint_dict()} + logger.info(f"State of {trainer.task.__class__.__name__} is ready to be persisted with the checkpoint") + if hasattr(save_checkpoint, "best"): extra_state.update({"best": save_checkpoint.best}) checkpoints = [ os.path.join(cfg.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond ] - if len(checkpoints) > 0: - trainer.save_checkpoint(checkpoints[0], extra_state) + saved_cp = None + if len(checkpoints) > 0 and trainer.should_save_checkpoint_on_current_rank: + saved_cp = trainer.save_checkpoint(checkpoints[0], extra_state) for cp in checkpoints[1:]: - PathManager.copy(checkpoints[0], cp, overwrite=True) + if cfg.write_checkpoints_asynchronously: + # TODO[ioPath]: Need to implement a delayed asynchronous + # file copying/moving feature. + logger.warning( + f"ioPath is not copying {checkpoints[0]} to {cp} " + "since async write mode is on." 
+ ) + else: + assert PathManager.copy( + checkpoints[0], cp, overwrite=True + ), f"Failed to copy {checkpoints[0]} to {cp}" write_timer.stop() logger.info( - "saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format( + "Saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format( checkpoints[0], epoch, updates, val_loss, write_timer.sum ) ) - if not end_of_epoch and cfg.keep_interval_updates > 0: + if ( + not end_of_epoch + and cfg.keep_interval_updates > 0 + and trainer.should_save_checkpoint_on_current_rank + ): # remove old checkpoints; checkpoints are sorted in descending order - checkpoints = checkpoint_paths( - cfg.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt" - ) + if cfg.keep_interval_updates_pattern == -1: + checkpoints = checkpoint_paths( + cfg.save_dir, pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix) + ) + else: + checkpoints = checkpoint_paths( + cfg.save_dir, + pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix), + keep_match=True, + ) + checkpoints = [ + x[0] + for x in checkpoints + if x[1] % cfg.keep_interval_updates_pattern != 0 + ] + for old_chk in checkpoints[cfg.keep_interval_updates :]: if os.path.lexists(old_chk): os.remove(old_chk) + elif PathManager.exists(old_chk): + PathManager.rm(old_chk) - if cfg.keep_last_epochs > 0: + if cfg.keep_last_epochs > 0 and trainer.should_save_checkpoint_on_current_rank: # remove old epoch checkpoints; checkpoints are sorted in descending order - checkpoints = checkpoint_paths(cfg.save_dir, pattern=r"checkpoint(\d+)\.pt") + checkpoints = checkpoint_paths( + cfg.save_dir, pattern=r"checkpoint(\d+){}\.pt".format(suffix) + ) for old_chk in checkpoints[cfg.keep_last_epochs :]: if os.path.lexists(old_chk): os.remove(old_chk) + elif PathManager.exists(old_chk): + PathManager.rm(old_chk) - if cfg.keep_best_checkpoints > 0: + if cfg.keep_best_checkpoints > 0 and trainer.should_save_checkpoint_on_current_rank: # only keep the best N checkpoints according to validation metric checkpoints = checkpoint_paths( cfg.save_dir, - pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format( - cfg.best_checkpoint_metric + pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format( + cfg.best_checkpoint_metric, suffix ), ) if not cfg.maximize_best_checkpoint_metric: @@ -128,9 +199,13 @@ def is_better(a, b): for old_chk in checkpoints[cfg.keep_best_checkpoints :]: if os.path.lexists(old_chk): os.remove(old_chk) + elif PathManager.exists(old_chk): + PathManager.rm(old_chk) + + return saved_cp -def load_checkpoint(cfg: DictConfig, trainer, **passthrough_args): +def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args): """ Load a checkpoint and restore the training iterator. 
@@ -152,7 +227,7 @@ def load_checkpoint(cfg: DictConfig, trainer, **passthrough_args): " or reset_lr_scheduler or reset_meters or reset_dataloader" ) - suffix = cfg.checkpoint_suffix + suffix = trainer.checkpoint_suffix if ( cfg.restore_file == "checkpoint_last.pt" ): # default value of restore_file is 'checkpoint_last.pt' @@ -160,7 +235,9 @@ def load_checkpoint(cfg: DictConfig, trainer, **passthrough_args): cfg.save_dir, "checkpoint_last{}.pt".format(suffix) ) first_launch = not PathManager.exists(checkpoint_path) - if cfg.finetune_from_model is not None and first_launch: + if first_launch and getattr(cfg, "continue_once", None) is not None: + checkpoint_path = cfg.continue_once + elif cfg.finetune_from_model is not None and first_launch: # if there is no last checkpoint to restore, start the finetune from pretrained model # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc. if PathManager.exists(cfg.finetune_from_model): @@ -175,9 +252,9 @@ def load_checkpoint(cfg: DictConfig, trainer, **passthrough_args): ) else: raise ValueError( - f"--funetune-from-model {cfg.finetune_from_model} does not exist" + f"--finetune-from-model {cfg.finetune_from_model} does not exist" ) - elif cfg.model_parallel_size > 1: + elif suffix is not None: checkpoint_path = cfg.restore_file.replace(".pt", suffix + ".pt") else: checkpoint_path = cfg.restore_file @@ -211,6 +288,11 @@ def load_checkpoint(cfg: DictConfig, trainer, **passthrough_args): epoch=itr_state["epoch"], load_dataset=True, **passthrough_args ) epoch_itr.load_state_dict(itr_state) + + # Preload the checkpoint for the task + task_cp_dict = extra_state.get(trainer.task.__class__.__name__, {}) + if task_cp_dict and hasattr(trainer.task, "set_checkpoint_dict"): + trainer.task.set_checkpoint_dict(task_cp_dict) else: epoch_itr = trainer.get_train_iterator( epoch=1, load_dataset=True, **passthrough_args @@ -221,27 +303,80 @@ def load_checkpoint(cfg: DictConfig, trainer, **passthrough_args): return extra_state, epoch_itr -def load_checkpoint_to_cpu(path, arg_overrides=None): - """Loads a checkpoint to CPU (with upgrading for backward compatibility).""" - with open(PathManager.get_local_path(path), "rb") as f: - state = torch.load( - f, map_location=lambda s, l: default_restore_location(s, "cpu") - ) +def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False): + """Loads a checkpoint to CPU (with upgrading for backward compatibility). + + If doing single-GPU training or if the checkpoint is only being loaded by at + most one process on each node (current default behavior is for only rank 0 + to read the checkpoint from disk), load_on_all_ranks should be False to + avoid errors from torch.distributed not having been initialized or + torch.distributed.barrier() hanging. + + If all processes on each node may be loading the checkpoint + simultaneously, load_on_all_ranks should be set to True to avoid I/O + conflicts. + + There's currently no support for > 1 but < all processes loading the + checkpoint on each node. + """ + local_path = PathManager.get_local_path(path) + # The locally cached file returned by get_local_path() may be stale for + # remote files that are periodically updated/overwritten (ex: + # checkpoint_last.pt) - so we remove the local copy, sync across processes + # (if needed), and then download a fresh copy. 
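Editor's aside: a minimal sketch of calling load_checkpoint_to_cpu under the default one-reader-per-node behaviour described in the docstring above; the checkpoint path and the arg_overrides keys are hypothetical.

# Editor's sketch, not part of the patch.
from fairseq import checkpoint_utils

state = checkpoint_utils.load_checkpoint_to_cpu(
    "checkpoints/checkpoint_last.pt",          # assumed path
    arg_overrides={"data": "/path/to/data"},   # overrides applied to the stored cfg/args
    load_on_all_ranks=False,                   # default: only rank 0 reads from disk
)
model_state = state["model"]                   # dict of parameter tensors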
+ if local_path != path and PathManager.path_requires_pathmanager(path): + try: + os.remove(local_path) + except FileNotFoundError: + # With potentially multiple processes removing the same file, the + # file being missing is benign (missing_ok isn't available until + # Python 3.8). + pass + if load_on_all_ranks: + torch.distributed.barrier() + local_path = PathManager.get_local_path(path) + + with open(local_path, "rb") as f: + state = torch.load(f, map_location=torch.device("cpu")) if "args" in state and state["args"] is not None and arg_overrides is not None: args = state["args"] for arg_name, arg_val in arg_overrides.items(): setattr(args, arg_name, arg_val) - if "cfg" in state and state["cfg"] is not None and arg_overrides is not None: - overwrite_args_by_name(state["cfg"], arg_overrides) + if "cfg" in state and state["cfg"] is not None: + + # hack to be able to set Namespace in dict config. this should be removed when we update to newer + # omegaconf version that supports object flags, or when we migrate all existing models + from omegaconf import __version__ as oc_version + from omegaconf import _utils + + if oc_version < "2.2": + old_primitive = _utils.is_primitive_type + _utils.is_primitive_type = lambda _: True + + state["cfg"] = OmegaConf.create(state["cfg"]) + + _utils.is_primitive_type = old_primitive + OmegaConf.set_struct(state["cfg"], True) + else: + state["cfg"] = OmegaConf.create(state["cfg"], flags={"allow_objects": True}) + + if arg_overrides is not None: + overwrite_args_by_name(state["cfg"], arg_overrides) state = _upgrade_state_dict(state) return state def load_model_ensemble( - filenames, arg_overrides=None, task=None, strict=True, suffix="", num_shards=1 + filenames, + arg_overrides: Optional[Dict[str, Any]] = None, + task=None, + strict=True, + suffix="", + num_shards=1, + state=None, ): """Loads an ensemble of models. 
@@ -261,30 +396,58 @@ def load_model_ensemble( strict, suffix, num_shards, + state, ) return ensemble, args +def get_maybe_sharded_checkpoint_filename( + filename: str, suffix: str, shard_idx: int, num_shards: int +) -> str: + orig_filename = filename + filename = filename.replace(".pt", suffix + ".pt") + fsdp_filename = filename[:-3] + f"-shard{shard_idx}.pt" + model_parallel_filename = orig_filename[:-3] + f"_part{shard_idx}.pt" + if PathManager.exists(fsdp_filename): + return fsdp_filename + elif num_shards > 1: + return model_parallel_filename + else: + return filename + + def load_model_ensemble_and_task( - filenames, arg_overrides=None, task=None, strict=True, suffix="", num_shards=1 + filenames, + arg_overrides: Optional[Dict[str, Any]] = None, + task=None, + strict=True, + suffix="", + num_shards=1, + state=None, ): + assert state is None or len(filenames) == 1 + from fairseq import tasks assert not ( strict and num_shards > 1 ), "Cannot load state dict with strict=True and checkpoint shards > 1" ensemble = [] + cfg = None for filename in filenames: orig_filename = filename + model_shard_state = {"shard_weights": [], "shard_metadata": []} + assert num_shards > 0 + st = time.time() for shard_idx in range(num_shards): - if num_shards == 1: - filename = filename.replace(".pt", suffix + ".pt") - else: - filename = orig_filename[:-3] + f"_part{shard_idx}.pt" + filename = get_maybe_sharded_checkpoint_filename( + orig_filename, suffix, shard_idx, num_shards + ) if not PathManager.exists(filename): raise IOError("Model file not found: {}".format(filename)) - state = load_checkpoint_to_cpu(filename, arg_overrides) + if state is None: + state = load_checkpoint_to_cpu(filename, arg_overrides) if "args" in state and state["args"] is not None: cfg = convert_namespace_to_omegaconf(state["args"]) elif "cfg" in state and state["cfg"] is not None: @@ -295,17 +458,102 @@ def load_model_ensemble_and_task( ) if task is None: - task = tasks.setup_task(cfg.task) + task = tasks.setup_task(cfg.task, from_checkpoint=True) + + if "task_state" in state: + task.load_state_dict(state["task_state"]) + + argspec = inspect.getfullargspec(task.build_model) + + if "fsdp_metadata" in state and num_shards > 1: + model_shard_state["shard_weights"].append(state["model"]) + model_shard_state["shard_metadata"].append(state["fsdp_metadata"]) + # check FSDP import before the code goes too far + if not has_FSDP: + raise ImportError( + "Cannot find FullyShardedDataParallel. 
" + "Please install fairscale with: pip install fairscale" + ) + if shard_idx == num_shards - 1: + consolidated_model_state = FSDP.consolidate_shard_weights( + shard_weights=model_shard_state["shard_weights"], + shard_metadata=model_shard_state["shard_metadata"], + ) + if "from_checkpoint" in argspec.args: + model = task.build_model(cfg.model, from_checkpoint=True) + else: + model = task.build_model(cfg.model) + if ( + "optimizer_history" in state + and len(state["optimizer_history"]) > 0 + and "num_updates" in state["optimizer_history"][-1] + ): + model.set_num_updates( + state["optimizer_history"][-1]["num_updates"] + ) + model.load_state_dict( + consolidated_model_state, strict=strict, model_cfg=cfg.model + ) + else: + # model parallel checkpoint or unsharded checkpoint + # support old external tasks + + if "from_checkpoint" in argspec.args: + model = task.build_model(cfg.model, from_checkpoint=True) + else: + model = task.build_model(cfg.model) + if ( + "optimizer_history" in state + and len(state["optimizer_history"]) > 0 + and "num_updates" in state["optimizer_history"][-1] + ): + model.set_num_updates(state["optimizer_history"][-1]["num_updates"]) + model.load_state_dict( + state["model"], strict=strict, model_cfg=cfg.model + ) - # build model for ensemble - model = task.build_model(cfg.model) + # reset state so it gets loaded for the next model in ensemble + state = None + if shard_idx % 10 == 0 and shard_idx > 0: + elapsed = time.time() - st + logger.info( + f"Loaded {shard_idx} shards in {elapsed:.2f}s, {elapsed / (shard_idx+1):.2f}s/shard" + ) - model.load_state_dict(state["model"], strict=strict, model_cfg=cfg.model) + # build model for ensemble ensemble.append(model) return ensemble, cfg, task -def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt"): +def load_model_ensemble_and_task_from_hf_hub( + model_id, + cache_dir: Optional[str] = None, + arg_overrides: Optional[Dict[str, Any]] = None, + **kwargs: Any, +): + try: + from huggingface_hub import snapshot_download + except ImportError: + raise ImportError( + "You need to install huggingface_hub to use `load_from_hf_hub`. " + "See https://pypi.org/project/huggingface-hub/ for installation." + ) + + library_name = "fairseq" + cache_dir = cache_dir or (Path.home() / ".cache" / library_name).as_posix() + cache_dir = snapshot_download( + model_id, cache_dir=cache_dir, library_name=library_name, **kwargs + ) + + _arg_overrides = arg_overrides or {} + _arg_overrides["data"] = cache_dir + return load_model_ensemble_and_task( + [p.as_posix() for p in Path(cache_dir).glob("*.pt")], + arg_overrides=_arg_overrides, + ) + + +def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt", keep_match=False): """Retrieves all checkpoints found in `path` directory. Checkpoints are identified by matching filename to the specified pattern. If @@ -313,7 +561,7 @@ def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt"): descending order. 
""" pt_regexp = re.compile(pattern) - files = os.listdir(path) + files = PathManager.ls(path) entries = [] for i, f in enumerate(files): @@ -321,10 +569,29 @@ def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt"): if m is not None: idx = float(m.group(1)) if len(m.groups()) > 0 else i entries.append((idx, m.group(0))) - return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)] + if keep_match: + return [(os.path.join(path, x[1]), x[0]) for x in sorted(entries, reverse=True)] + else: + return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)] -def torch_persistent_save(obj, f): +def torch_persistent_save(obj, filename, async_write: bool = False): + if async_write: + with PathManager.opena(filename, "wb") as f: + _torch_persistent_save(obj, f) + else: + if PathManager.supports_rename(filename): + # do atomic save + with PathManager.open(filename + ".tmp", "wb") as f: + _torch_persistent_save(obj, f) + PathManager.rename(filename + ".tmp", filename) + else: + # fallback to non-atomic save + with PathManager.open(filename, "wb") as f: + _torch_persistent_save(obj, f) + + +def _torch_persistent_save(obj, f): if isinstance(f, str): with PathManager.open(f, "wb") as h: torch_persistent_save(obj, h) @@ -335,62 +602,13 @@ def torch_persistent_save(obj, f): except Exception: if i == 2: logger.error(traceback.format_exc()) - - -def save_state( - filename, - cfg: DictConfig, - model_state_dict, - criterion, - optimizer, - lr_scheduler, - num_updates, - optim_history=None, - extra_state=None, - **kwargs, -): - from fairseq import utils - - if optim_history is None: - optim_history = [] - if extra_state is None: - extra_state = {} - state_dict = { - "cfg": cfg, - "args": kwargs.get("args", None), - "model": model_state_dict or {}, - "optimizer_history": optim_history - + [ - { - "criterion_name": criterion.__class__.__name__, - "optimizer_name": optimizer.__class__.__name__, - "lr_scheduler_state": lr_scheduler.state_dict(), - "num_updates": num_updates, - } - ], - "extra_state": extra_state, - } - if utils.has_parameters(criterion): - state_dict["criterion"] = criterion.state_dict() - - if cfg is None: - cfg = state_dict["args"] - assert cfg is not None, "must provide cfg or args" - - if isinstance(cfg, DictConfig): - no_save_optimizer_state = cfg.checkpoint.no_save_optimizer_state - else: - no_save_optimizer_state = cfg.no_save_optimizer_state - if not no_save_optimizer_state: - state_dict["last_optimizer_state"] = optimizer.state_dict() - - with PathManager.open(filename, "wb") as f: - torch_persistent_save(state_dict, f) + raise + else: + time.sleep(2.5) def _upgrade_state_dict(state): """Helper for upgrading old model checkpoints.""" - from fairseq import models, registry, tasks # add optimizer_history if "optimizer_history" not in state: @@ -430,13 +648,18 @@ def _upgrade_state_dict(state): # use stateful training data iterator if "train_iterator" not in state["extra_state"]: state["extra_state"]["train_iterator"] = { - "epoch": state["extra_state"]["epoch"], + "epoch": state["extra_state"].get("epoch", 0), "iterations_in_epoch": state["extra_state"].get("batch_offset", 0), } - # old model checkpoints may not have separate source/target positions # backward compatibility, cfg updates if "args" in state and state["args"] is not None: + # old model checkpoints may not have separate source/target positions + if hasattr(state["args"], "max_positions") and not hasattr( + state["args"], "max_source_positions" + ): + state["args"].max_source_positions = 
state["args"].max_positions + state["args"].max_target_positions = state["args"].max_positions # default to translation task if not hasattr(state["args"], "task"): state["args"].task = "translation" @@ -450,24 +673,72 @@ def _upgrade_state_dict(state): state["extra_state"]["train_iterator"]["epoch"] = max( state["extra_state"]["train_iterator"].get("epoch", 1), 1 ) - + # --remove-bpe ==> --postprocess if hasattr(state["args"], "remove_bpe"): state["args"].post_process = state["args"].remove_bpe + # --min-lr ==> --stop-min-lr + if hasattr(state["args"], "min_lr"): + state["args"].stop_min_lr = state["args"].min_lr + del state["args"].min_lr + # binary_cross_entropy / kd_binary_cross_entropy => wav2vec criterion + if hasattr(state["args"], "criterion") and state["args"].criterion in [ + "binary_cross_entropy", + "kd_binary_cross_entropy", + ]: + state["args"].criterion = "wav2vec" + # remove log_keys if it's None (criteria will supply a default value of []) + if hasattr(state["args"], "log_keys") and state["args"].log_keys is None: + delattr(state["args"], "log_keys") + # speech_pretraining => audio pretraining + if ( + hasattr(state["args"], "task") + and state["args"].task == "speech_pretraining" + ): + state["args"].task = "audio_pretraining" + # audio_cpc => wav2vec + if hasattr(state["args"], "arch") and state["args"].arch == "audio_cpc": + state["args"].arch = "wav2vec" + # convert legacy float learning rate to List[float] + if hasattr(state["args"], "lr") and isinstance(state["args"].lr, float): + state["args"].lr = [state["args"].lr] + # convert task data arg to a string instead of List[string] + if ( + hasattr(state["args"], "data") + and isinstance(state["args"].data, list) + and len(state["args"].data) > 0 + ): + state["args"].data = state["args"].data[0] state["cfg"] = convert_namespace_to_omegaconf(state["args"]) if "cfg" in state and state["cfg"] is not None: - with open_dict(state["cfg"]): - if state["cfg"].task is not None: - if hasattr(state["cfg"].task, "max_positions") and not hasattr( - state["cfg"].task, "max_source_positions" - ): - state["cfg"].task.max_source_positions = state[ - "cfg" - ].task.max_positions - state["cfg"].task.max_target_positions = state[ - "cfg" - ].task.max_positions + cfg = state["cfg"] + with open_dict(cfg): + # any upgrades for Hydra-based configs + if ( + "task" in cfg + and "eval_wer_config" in cfg.task + and isinstance(cfg.task.eval_wer_config.print_alignment, bool) + ): + cfg.task.eval_wer_config.print_alignment = "hard" + if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool): + cfg.generation.print_alignment = ( + "hard" if cfg.generation.print_alignment else None + ) + if ( + "model" in cfg + and "w2v_args" in cfg.model + and cfg.model.w2v_args is not None + and ( + hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args + ) + and hasattr(cfg.model.w2v_args.task, "eval_wer_config") + and cfg.model.w2v_args.task.eval_wer_config is not None + and isinstance( + cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool + ) + ): + cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard" return state @@ -552,8 +823,11 @@ def create_pruning_pass(layers_to_keep, layer_name): # Since layers are now pruned, *_layers_to_keep are no longer needed. # This is more of "It would make it work fix" rather than a proper fix. 
- - with open_dict(model_cfg): + if isinstance(model_cfg, DictConfig): + context = open_dict(model_cfg) + else: + context = contextlib.ExitStack() + with context: if hasattr(model_cfg, "encoder_layers_to_keep"): model_cfg.encoder_layers_to_keep = None if hasattr(model_cfg, "decoder_layers_to_keep"): @@ -563,7 +837,9 @@ def create_pruning_pass(layers_to_keep, layer_name): def load_pretrained_component_from_model( - component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str + component: Union[FairseqEncoder, FairseqDecoder], + checkpoint: str, + strict: bool = True, ): """ Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the @@ -589,7 +865,7 @@ def load_pretrained_component_from_model( # encoder.input_layers.0.0.weight --> input_layers.0.0.weight component_subkey = key[len(component_type) + 1 :] component_state_dict[component_subkey] = state["model"][key] - component.load_state_dict(component_state_dict, strict=True) + component.load_state_dict(component_state_dict, strict=strict) return component @@ -607,3 +883,54 @@ def verify_checkpoint_directory(save_dir: str) -> None: raise e else: os.remove(temp_file_path) + + +def save_ema_as_checkpoint(src_path, dst_path): + state = load_ema_from_checkpoint(src_path) + torch_persistent_save(state, dst_path) + + +def load_ema_from_checkpoint(fpath): + """Loads exponential moving averaged (EMA) checkpoint from input and + returns a model with ema weights. + + Args: + fpath: A string path of checkpoint to load from. + + Returns: + A dict of string keys mapping to various values. The 'model' key + from the returned dict should correspond to an OrderedDict mapping + string parameter names to torch Tensors. + """ + params_dict = collections.OrderedDict() + new_state = None + + with PathManager.open(fpath, "rb") as f: + new_state = torch.load( + f, + map_location=( + lambda s, _: torch.serialization.default_restore_location(s, "cpu") + ), + ) + + # EMA model is stored in a separate "extra state" + model_params = new_state["extra_state"]["ema"] + + for key in list(model_params.keys()): + p = model_params[key] + if isinstance(p, torch.HalfTensor): + p = p.float() + if key not in params_dict: + params_dict[key] = p.clone() + # NOTE: clone() is needed in case of p is a shared parameter + else: + raise ValueError("Key {} is repeated in EMA model params.".format(key)) + + if len(params_dict) == 0: + raise ValueError( + f"Input checkpoint path '{fpath}' does not contain " + "ema model weights, is this model trained with EMA?" + ) + + new_state["model"] = params_dict + return new_state diff --git a/fairseq/clib/cuda/ngram_repeat_block_cuda.cpp b/fairseq/clib/cuda/ngram_repeat_block_cuda.cpp new file mode 100644 index 0000000000..707219105a --- /dev/null +++ b/fairseq/clib/cuda/ngram_repeat_block_cuda.cpp @@ -0,0 +1,55 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT License. 
+*/ + +#include <torch/extension.h> +#include <vector> + +/* +CPP Binding for CUDA OP +*/ + +// CUDA forward declarations +torch::Tensor ngram_repeat_block_cuda_forward( + torch::Tensor tokens, + torch::Tensor lprobs, + int bsz, + int step, + int beam_size, + int no_repeat_ngram_size); + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +// Input check and call to CUDA OP +// Backward method not required +torch::Tensor ngram_repeat_block_forward( + torch::Tensor tokens, + torch::Tensor lprobs, + int bsz, + int step, + int beam_size, + int no_repeat_ngram_size) { + CHECK_INPUT(tokens); + CHECK_INPUT(lprobs); + assert(bsz > 0); + assert(step >= 0); + assert(beam_size > 0); + assert(no_repeat_ngram_size > 0); + + return ngram_repeat_block_cuda_forward( + tokens, lprobs, bsz, step, beam_size, no_repeat_ngram_size); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def( + "forward", + &ngram_repeat_block_forward, + "No Repeat Ngram Block forward (CUDA)"); +} diff --git a/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu b/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu new file mode 100644 index 0000000000..bd6106cba0 --- /dev/null +++ b/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu @@ -0,0 +1,82 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT License. +*/ + +/* +Kernel implementation for blocking repeated n-grams. +*/ + +#include <cuda.h> +#include <cuda_runtime.h> +#include <math.h> +#include <torch/extension.h> +#include <vector> + +// Ban repeated ngrams of length = 'no_repeat_ngram_size' +__global__ void banRepeatedTokens( + long* __restrict__ tokens, + float* __restrict__ lprobs, + int max_predict_len, + int vocab_size, + int no_repeat_ngram_size) { + auto row = blockIdx.x; + auto col = threadIdx.x; + auto start = row * (max_predict_len) + col; + // Each thread compares ngram starting from + // thread index with final ngram starting from + // step - no_repeat_ngram_size +2 + auto check_start_pos = blockDim.x; + auto lprob_start = row * vocab_size; + bool is_banned = true; + extern __shared__ long tokens_shm[]; + tokens_shm[col] = tokens[start]; + if (col == blockDim.x - 1) { + for (int i = 1; i < no_repeat_ngram_size; i++) { + if (col + i < max_predict_len) { + tokens_shm[col + i] = tokens[start + i]; + } + } + } + __syncthreads(); + + for (int k = 0; k < no_repeat_ngram_size - 1; k++) { + if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) { + is_banned = false; + } + } + if (is_banned == true) { + auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1]; + lprobs[lprob_start + token_to_be_banned] = -INFINITY; + } +} + +// Allocate blocks and threads based on +// batch size and sequence length and launch +// kernel +torch::Tensor ngram_repeat_block_cuda_forward( + const torch::Tensor tokens, + torch::Tensor lprobs, + int bsz, + int step, + int beam_size, + int no_repeat_ngram_size) { + int threads = step - no_repeat_ngram_size + 2; + if (threads <= 0) + return lprobs; + int max_predict_len = tokens.size(1); + int vocab_size = lprobs.size(1); + auto token_ptr = tokens.data_ptr<long>(); + auto lprob_ptr = lprobs.data_ptr<float>(); + int blocks = bsz * beam_size; + int shared_mem_size = (step + 1) * sizeof(long); + + // Launching N blocks where N is number of samples in a batch (beams*bsz) + // Launching T threads where T is number of 
previous ngrams in a sample + // Allocating shared mem per block for fastser access of input tokens since + // each token will be accessed N times to compare with current Ngram where + // N is Ngram size. + banRepeatedTokens<<<blocks, threads, shared_mem_size>>>( + token_ptr, lprob_ptr, max_predict_len, vocab_size, no_repeat_ngram_size); + return lprobs; +} diff --git a/fairseq/clib/libbase/balanced_assignment.cpp b/fairseq/clib/libbase/balanced_assignment.cpp new file mode 100644 index 0000000000..1a5a1061f3 --- /dev/null +++ b/fairseq/clib/libbase/balanced_assignment.cpp @@ -0,0 +1,109 @@ +/** + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* +C++ code for solving the linear assignment problem. +Based on the Auction Algorithm from +https://dspace.mit.edu/bitstream/handle/1721.1/3265/P-2108-26912652.pdf and the +implementation from: https://github.com/bkj/auction-lap Adapted to be more +efficient when each worker is looking for k jobs instead of 1. +*/ +#include <torch/extension.h> +#include <iostream> +using namespace torch::indexing; +torch::Tensor balanced_assignment(torch::Tensor job_and_worker_to_score) { + int max_iterations = 100; + torch::Tensor epsilon = + (job_and_worker_to_score.max() - job_and_worker_to_score.min()) / 50; + epsilon.clamp_min_(1e-04); + torch::Tensor worker_and_job_to_score = + job_and_worker_to_score.detach().transpose(0, 1).contiguous(); + int num_workers = worker_and_job_to_score.size(0); + int num_jobs = worker_and_job_to_score.size(1); + auto device = worker_and_job_to_score.device(); + int jobs_per_worker = num_jobs / num_workers; + torch::Tensor value = worker_and_job_to_score.clone(); + int counter = 0; + torch::Tensor max_value = worker_and_job_to_score.max(); + + torch::Tensor bid_indices; + torch::Tensor cost = worker_and_job_to_score.new_zeros({1, num_jobs}); + torch::Tensor bids = + worker_and_job_to_score.new_empty({num_workers, num_jobs}); + torch::Tensor bid_increments = + worker_and_job_to_score.new_empty({num_workers, jobs_per_worker}); + torch::Tensor top_values = + worker_and_job_to_score.new_empty({num_workers, jobs_per_worker + 1}); + torch::Tensor high_bids = worker_and_job_to_score.new_empty({num_jobs}); + + torch::Tensor top_index = top_values.to(torch::kLong); + torch::Tensor high_bidders = top_index.new_empty({num_jobs}); + torch::Tensor have_bids = high_bidders.to(torch::kBool); + torch::Tensor jobs_indices = + torch::arange({num_jobs}, torch::dtype(torch::kLong).device(device)); + torch::Tensor true_tensor = + torch::ones({1}, torch::dtype(torch::kBool).device(device)); + + while (true) { + bids.zero_(); + torch::topk_out(top_values, top_index, value, jobs_per_worker + 1, 1); + + // Each worker bids the difference in value between that job and the k+1th + // job + torch::sub_out( + bid_increments, + top_values.index({Slice(None, None), Slice(0, jobs_per_worker)}), + top_values.index({Slice(None, None), jobs_per_worker}).unsqueeze(1)); + + bid_increments.add_(epsilon); + bids.scatter_( + 1, + top_index.index({Slice(None, None), Slice(0, jobs_per_worker)}), + bid_increments); + + if (counter < max_iterations && counter > 0) { + // Put in a minimal bid to retain items from the last round if no-one else + // bids for them this round + bids.view(-1).index_put_({bid_indices}, epsilon); + } + + // Find the highest bidding worker per job + torch::max_out(high_bids, high_bidders, bids, 
0); + torch::gt_out(have_bids, high_bids, 0); + + if (have_bids.all().item<bool>()) { + // All jobs were bid for + break; + } + + // Make popular items more expensive + cost.add_(high_bids); + torch::sub_out(value, worker_and_job_to_score, cost); + + bid_indices = ((high_bidders * num_jobs) + jobs_indices).index({have_bids}); + + if (counter < max_iterations) { + // Make sure that this item will be in the winning worker's top-k next + // time. + value.view(-1).index_put_({bid_indices}, max_value); + } else { + // Suboptimal approximation that converges quickly from current solution + value.view(-1).index_put_( + {bid_indices}, worker_and_job_to_score.view(-1).index({bid_indices})); + } + + counter += 1; + } + + return top_index.index({Slice(None, None), Slice(0, jobs_per_worker)}) + .reshape(-1); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("balanced_assignment", &balanced_assignment, "Balanced Assignment"); +} diff --git a/fairseq/clib/libbleu/libbleu.cpp b/fairseq/clib/libbleu/libbleu.cpp index 3cf2d65b6d..939d9e1174 100644 --- a/fairseq/clib/libbleu/libbleu.cpp +++ b/fairseq/clib/libbleu/libbleu.cpp @@ -6,30 +6,32 @@ * LICENSE file in the root directory of this source tree. */ -#include <map> #include <array> -#include <cstring> #include <cstdio> +#include <cstring> +#include <map> -typedef struct -{ - size_t reflen; - size_t predlen; - size_t match1; - size_t count1; - size_t match2; - size_t count2; - size_t match3; - size_t count3; - size_t match4; - size_t count4; +// NOLINTNEXTLINE +typedef struct { + size_t reflen; + size_t predlen; + size_t match1; + size_t count1; + size_t match2; + size_t count2; + size_t match3; + size_t count3; + size_t match4; + size_t count4; } bleu_stat; // left trim (remove pad) void bleu_ltrim(size_t* len, int** sent, int pad) { size_t start = 0; - while(start < *len) { - if (*(*sent + start) != pad) { break; } + while (start < *len) { + if (*(*sent + start) != pad) { + break; + } start++; } *sent += start; @@ -40,7 +42,9 @@ void bleu_ltrim(size_t* len, int** sent, int pad) { void bleu_rtrim(size_t* len, int** sent, int pad, int eos) { size_t end = *len - 1; while (end > 0) { - if (*(*sent + end) != eos && *(*sent + end) != pad) { break; } + if (*(*sent + end) != eos && *(*sent + end) != pad) { + break; + } end--; } *len = end + 1; @@ -53,10 +57,10 @@ void bleu_trim(size_t* len, int** sent, int pad, int eos) { } size_t bleu_hash(int len, int* data) { - size_t h = 14695981039346656037ul; + size_t h = 14695981039346656037ul; size_t prime = 0x100000001b3; - char* b = (char*) data; - size_t blen = sizeof(int) * len; + char* b = (char*)data; + size_t blen = sizeof(int) * len; while (blen-- > 0) { h ^= *b++; @@ -67,15 +71,23 @@ size_t bleu_hash(int len, int* data) { } void bleu_addngram( - size_t *ntotal, size_t *nmatch, size_t n, - size_t reflen, int* ref, size_t predlen, int* pred) { - - if (predlen < n) { return; } + size_t* ntotal, + size_t* nmatch, + size_t n, + size_t reflen, + int* ref, + size_t predlen, + int* pred) { + if (predlen < n) { + return; + } predlen = predlen - n + 1; (*ntotal) += predlen; - if (reflen < n) { return; } + if (reflen < n) { + return; + } reflen = reflen - n + 1; @@ -90,7 +102,7 @@ void bleu_addngram( size_t w = bleu_hash(n, ref++); if (count[w] > 0) { (*nmatch)++; - count[w] -=1; + count[w] -= 1; } reflen--; } @@ -99,16 +111,16 @@ void bleu_addngram( extern "C" { #ifdef _WIN64 -__declspec(dllexport) +__declspec(dllexport) #endif -void bleu_zero_init(bleu_stat* stat) { + void bleu_zero_init(bleu_stat* stat) { 
std::memset(stat, 0, sizeof(bleu_stat)); } #ifdef _WIN64 -__declspec(dllexport) +__declspec(dllexport) #endif -void bleu_one_init(bleu_stat* stat) { + void bleu_one_init(bleu_stat* stat) { bleu_zero_init(stat); stat->count1 = 0; stat->count2 = 1; @@ -121,11 +133,16 @@ void bleu_one_init(bleu_stat* stat) { } #ifdef _WIN64 -__declspec(dllexport) +__declspec(dllexport) #endif -void bleu_add( - bleu_stat* stat, - size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) { + void bleu_add( + bleu_stat* stat, + size_t reflen, + int* ref, + size_t predlen, + int* pred, + int pad, + int eos) { bleu_trim(&reflen, &ref, pad, eos); bleu_trim(&predlen, &pred, pad, eos); @@ -137,5 +154,4 @@ void bleu_add( bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred); bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred); } - } diff --git a/fairseq/clib/libbleu/module.cpp b/fairseq/clib/libbleu/module.cpp index 8ed9a84b1c..35288b3177 100644 --- a/fairseq/clib/libbleu/module.cpp +++ b/fairseq/clib/libbleu/module.cpp @@ -8,20 +8,16 @@ #include <Python.h> - -static PyMethodDef method_def[] = { - {NULL, NULL, 0, NULL} -}; +static PyMethodDef method_def[] = {{NULL, NULL, 0, NULL}}; // NOLINT static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "libbleu", /* name of module */ - NULL, /* module documentation, may be NULL */ - -1, /* size of per-interpreter state of the module, - or -1 if the module keeps state in global variables. */ - method_def -}; - + PyModuleDef_HEAD_INIT, + "libbleu", /* name of module */ + // NOLINTNEXTLINE + NULL, /* module documentation, may be NULL */ + -1, /* size of per-interpreter state of the module, + or -1 if the module keeps state in global variables. */ + method_def}; // NOLINT #if PY_MAJOR_VERSION == 2 PyMODINIT_FUNC init_libbleu() @@ -29,7 +25,7 @@ PyMODINIT_FUNC init_libbleu() PyMODINIT_FUNC PyInit_libbleu() #endif { - PyObject *m = PyModule_Create(&module_def); + PyObject* m = PyModule_Create(&module_def); if (!m) { return NULL; } diff --git a/fairseq/clib/libnat/edit_dist.cpp b/fairseq/clib/libnat/edit_dist.cpp index 6bc6a937d6..9ffb60569d 100644 --- a/fairseq/clib/libnat/edit_dist.cpp +++ b/fairseq/clib/libnat/edit_dist.cpp @@ -6,10 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include <torch/torch.h> // @manual=//caffe2:torch_extension #include <pybind11/detail/common.h> #include <pybind11/pybind11.h> -#include <vector> +#include <torch/torch.h> // @manual=//caffe2:torch_extension #include <algorithm> #include <cstdint> #include <iosfwd> @@ -17,6 +16,7 @@ #include <new> #include <string> #include <utility> +#include <vector> using namespace ::std; diff --git a/fairseq/clib/libnat_cuda/binding.cpp b/fairseq/clib/libnat_cuda/binding.cpp index aaa6244d5c..ced91c0d0a 100644 --- a/fairseq/clib/libnat_cuda/binding.cpp +++ b/fairseq/clib/libnat_cuda/binding.cpp @@ -7,54 +7,61 @@ */ /* - This code is partially adpoted from https://github.com/1ytic/pytorch-edit-distance + This code is partially adpoted from + https://github.com/1ytic/pytorch-edit-distance */ -#include "edit_dist.h" #include <torch/types.h> +#include "edit_dist.h" #ifndef TORCH_CHECK #define TORCH_CHECK AT_CHECK #endif -#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) torch::Tensor LevenshteinDistance( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length) { - - CHECK_INPUT(source); - CHECK_INPUT(target); - CHECK_INPUT(source_length); - CHECK_INPUT(target_length); - return LevenshteinDistanceCuda(source, target, source_length, target_length); + torch::Tensor source, + torch::Tensor target, + torch::Tensor source_length, + torch::Tensor target_length) { + CHECK_INPUT(source); + CHECK_INPUT(target); + CHECK_INPUT(source_length); + CHECK_INPUT(target_length); + return LevenshteinDistanceCuda(source, target, source_length, target_length); } torch::Tensor GenerateDeletionLabel( - torch::Tensor source, - torch::Tensor operations) { - - CHECK_INPUT(source); - CHECK_INPUT(operations); - return GenerateDeletionLabelCuda(source, operations); + torch::Tensor source, + torch::Tensor operations) { + CHECK_INPUT(source); + CHECK_INPUT(operations); + return GenerateDeletionLabelCuda(source, operations); } std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabel( - torch::Tensor target, - torch::Tensor operations) { - - CHECK_INPUT(target); - CHECK_INPUT(operations); - return GenerateInsertionLabelCuda(target, operations); + torch::Tensor target, + torch::Tensor operations) { + CHECK_INPUT(target); + CHECK_INPUT(operations); + return GenerateInsertionLabelCuda(target, operations); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("levenshtein_distance", &LevenshteinDistance, "Levenshtein distance"); - m.def("generate_deletion_labels", &GenerateDeletionLabel, "Generate Deletion Label"); - m.def("generate_insertion_labels", &GenerateInsertionLabel, "Generate Insertion Label"); + m.def("levenshtein_distance", &LevenshteinDistance, "Levenshtein distance"); + m.def( + "generate_deletion_labels", + &GenerateDeletionLabel, + "Generate Deletion Label"); + m.def( + "generate_insertion_labels", + &GenerateInsertionLabel, + "Generate Insertion Label"); } diff --git a/fairseq/clib/libnat_cuda/edit_dist.cu b/fairseq/clib/libnat_cuda/edit_dist.cu index 22de16b270..1ea5ec7e3c 100644 --- a/fairseq/clib/libnat_cuda/edit_dist.cu +++ 
b/fairseq/clib/libnat_cuda/edit_dist.cu @@ -1,332 +1,344 @@ /** -* Copyright 2017-present, Facebook, Inc. -* All rights reserved. -* -* This source code is licensed under the license found in the -* LICENSE file in the root directory of this source tree. -*/ + * Copyright 2017-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the license found in the + * LICENSE file in the root directory of this source tree. + */ #include "edit_dist.h" -#include <THC/THC.h> + +#include <c10/cuda/CUDAStream.h> #include <cuda.h> #include <cuda_runtime.h> #include <device_launch_parameters.h> -#include <utility> // std::pair +#include <utility> // std::pair template <typename scalar_t> __global__ void generate_deletion_label_kernel( - const scalar_t* __restrict__ source, - const size_t source_size, - const size_t operation_size, - int* __restrict__ operations, - int* __restrict__ labels) { - - const int index = blockIdx.x; - const int offset = index * operation_size; - const int offset_label = index * source_size; - - for (int i = 0; i < source_size; i++) { - labels[offset_label + i] = 0; - } - - int k = 0; - for (int i = 0; i < operation_size; i++){ - if (operations[offset + i] == 0){ - break; - } else if (operations[offset + i] == 1){ - continue; - } else { - labels[offset_label + k] = 3 - operations[offset + i]; - k++; - } + const scalar_t* __restrict__ source, + const size_t source_size, + const size_t operation_size, + int* __restrict__ operations, + int* __restrict__ labels) { + const int index = blockIdx.x; + const int offset = index * operation_size; + const int offset_label = index * source_size; + + for (int i = 0; i < source_size; i++) { + labels[offset_label + i] = 0; + } + + int k = 0; + for (int i = 0; i < operation_size; i++) { + if (operations[offset + i] == 0) { + break; + } else if (operations[offset + i] == 1) { + continue; + } else { + labels[offset_label + k] = 3 - operations[offset + i]; + k++; } + } } template <typename scalar_t> __global__ void generate_insertion_label_kernel( - const scalar_t* __restrict__ target, - const size_t target_size, - const size_t operation_size, - int* __restrict__ operations, - int* __restrict__ labels, - int* __restrict__ masks) { - - const int index = blockIdx.x; - const int offset = index * operation_size; - const int offset_label = index * target_size; - - int k = 0; - int u = 0; - int m = 0; - - for (int i = 0; i < target_size; i++) { - labels[offset_label + i] = 0; - masks[offset_label + i] = 0; - } - - for (int i = 0; i < operation_size-1; i++){ - if (operations[offset + i] == 0){ - break; - } else if (operations[offset + i] == 2){ - continue; - } else if (operations[offset + i] == 1){ - masks[offset_label + m] = 1; - u++; m++; - } else { - labels[offset_label + k] = u; - masks[offset_label + m] = 0; - k++; m++; - u = 0; - } + const scalar_t* __restrict__ target, + const size_t target_size, + const size_t operation_size, + int* __restrict__ operations, + int* __restrict__ labels, + int* __restrict__ masks) { + const int index = blockIdx.x; + const int offset = index * operation_size; + const int offset_label = index * target_size; + + int k = 0; + int u = 0; + int m = 0; + + for (int i = 0; i < target_size; i++) { + labels[offset_label + i] = 0; + masks[offset_label + i] = 0; + } + + for (int i = 0; i < operation_size - 1; i++) { + if (operations[offset + i] == 0) { + break; + } else if (operations[offset + i] == 2) { + continue; + } else if (operations[offset + i] == 1) { + masks[offset_label + m] = 1; + 
u++; + m++; + } else { + labels[offset_label + k] = u; + masks[offset_label + m] = 0; + k++; + m++; + u = 0; } + } } template <typename scalar_t> __global__ void levenshtein_distance_kernel( - const scalar_t* __restrict__ source, - const scalar_t* __restrict__ target, - const int* __restrict__ source_length, - const int* __restrict__ target_length, - const size_t source_size, - const size_t target_size, - int* __restrict__ operations, - int* __restrict__ errors_curr) { - - const int index = blockIdx.x; - const int offset = index * (source_size + target_size); - const int d = index * (source_size + 1) * (target_size + 1); - const int t = target_size + 1; - - auto err_idx = [d, t](int i, int j) { return d + i * t + j; }; - auto opt_idx = [offset](int k) { return offset + k; }; - - const int hyp_len = source_length[index]; - const int ref_len = target_length[index]; - const scalar_t* hyp_begin = source + index * source_size; - const scalar_t* ref_begin = target + index * target_size; - - // dynamic programming - for (int i = 0; i <= hyp_len; i++){ - errors_curr[err_idx(i, 0)] = i; - } - for (int j = 0; j <= ref_len; j++){ - errors_curr[err_idx(0, j)] = j; - } - for (int i = 1; i <= hyp_len; i++){ - for (int j = 1; j <= ref_len; j++){ - errors_curr[err_idx(i, j)] = min( - min( - errors_curr[err_idx(i-1, j)], - errors_curr[err_idx(i, j-1)] - ) + 1, - errors_curr[err_idx(i-1, j-1)] + 2 * ( - *(hyp_begin+i-1) == *(ref_begin+j-1) ? 0 : 1 - ) - ); - } + const scalar_t* __restrict__ source, + const scalar_t* __restrict__ target, + const int* __restrict__ source_length, + const int* __restrict__ target_length, + const size_t source_size, + const size_t target_size, + int* __restrict__ operations, + int* __restrict__ errors_curr) { + const int index = blockIdx.x; + const int offset = index * (source_size + target_size); + const int d = index * (source_size + 1) * (target_size + 1); + const int t = target_size + 1; + + auto err_idx = [d, t](int i, int j) { return d + i * t + j; }; + auto opt_idx = [offset](int k) { return offset + k; }; + + const int hyp_len = source_length[index]; + const int ref_len = target_length[index]; + const scalar_t* hyp_begin = source + index * source_size; + const scalar_t* ref_begin = target + index * target_size; + + // dynamic programming + for (int i = 0; i <= hyp_len; i++) { + errors_curr[err_idx(i, 0)] = i; + } + for (int j = 0; j <= ref_len; j++) { + errors_curr[err_idx(0, j)] = j; + } + for (int i = 1; i <= hyp_len; i++) { + for (int j = 1; j <= ref_len; j++) { + errors_curr[err_idx(i, j)] = min( + min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) + + 1, + errors_curr[err_idx(i - 1, j - 1)] + + 2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 
0 : 1)); } + } - // back-tracing - int i = hyp_len; - int j = ref_len; - int o = hyp_len + ref_len; + // back-tracing + int i = hyp_len; + int j = ref_len; + int o = hyp_len + ref_len; - for (int k = 0; k < source_size + target_size; k++) { - operations[opt_idx(k)] = 0; - } + for (int k = 0; k < source_size + target_size; k++) { + operations[opt_idx(k)] = 0; + } - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && (errors_curr[err_idx(i, j-1)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 1; j--; // insertion - } else if ((i > 0) && (errors_curr[err_idx(i-1, j)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 2; i--; // deletion - } else { - o--; operations[opt_idx(o)] = 3; i--; j--; // do nothing - } + while ((i >= 0) && (j >= 0)) { + if ((i == 0) && (j == 0)) { + break; } - // moving to the left - for (int k = 0; k < hyp_len + ref_len; k++) { - if (k + o < hyp_len + ref_len){ - operations[opt_idx(k)] = operations[opt_idx(k+o)]; - } else{ - operations[opt_idx(k)] = 0; // padding - } + if ((j > 0) && + (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) { + o--; + operations[opt_idx(o)] = 1; + j--; // insertion + } else if ( + (i > 0) && + (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) { + o--; + operations[opt_idx(o)] = 2; + i--; // deletion + } else { + o--; + operations[opt_idx(o)] = 3; + i--; + j--; // do nothing } + } + // moving to the left + for (int k = 0; k < hyp_len + ref_len; k++) { + if (k + o < hyp_len + ref_len) { + operations[opt_idx(k)] = operations[opt_idx(k + o)]; + } else { + operations[opt_idx(k)] = 0; // padding + } + } } template <typename scalar_t> __global__ void faster_levenshtein_distance_kernel( - const scalar_t* __restrict__ source, - const scalar_t* __restrict__ target, - const int* __restrict__ source_length, - const int* __restrict__ target_length, - const size_t source_size, - const size_t target_size, - int* __restrict__ operations) { - - extern __shared__ short errors[]; - auto errors_curr = errors; - - const int index = blockIdx.x; - const int offset = index * (source_size + target_size); - const int t = target_size + 1; - - auto err_idx = [t](int i, int j) { return i * t + j; }; - auto opt_idx = [offset](int k) { return offset + k; }; - - const int hyp_len = source_length[index]; - const int ref_len = target_length[index]; - const scalar_t* hyp_begin = source + index * source_size; - const scalar_t* ref_begin = target + index * target_size; - - // dynamic programming - for (int i = 0; i <= hyp_len; i++){ - errors_curr[err_idx(i, 0)] = i; - } - for (int j = 0; j <= ref_len; j++){ - errors_curr[err_idx(0, j)] = j; - } - for (int i = 1; i <= hyp_len; i++){ - for (int j = 1; j <= ref_len; j++){ - errors_curr[err_idx(i, j)] = min( - min( - errors_curr[err_idx(i-1, j)], - errors_curr[err_idx(i, j-1)] - ) + 1, - errors_curr[err_idx(i-1, j-1)] + 2 * ( - *(hyp_begin+i-1) == *(ref_begin+j-1) ? 
0 : 1 - ) - ); - } + const scalar_t* __restrict__ source, + const scalar_t* __restrict__ target, + const int* __restrict__ source_length, + const int* __restrict__ target_length, + const size_t source_size, + const size_t target_size, + int* __restrict__ operations) { + extern __shared__ short errors[]; + auto errors_curr = errors; + + const int index = blockIdx.x; + const int offset = index * (source_size + target_size); + const int t = target_size + 1; + + auto err_idx = [t](int i, int j) { return i * t + j; }; + auto opt_idx = [offset](int k) { return offset + k; }; + + const int hyp_len = source_length[index]; + const int ref_len = target_length[index]; + const scalar_t* hyp_begin = source + index * source_size; + const scalar_t* ref_begin = target + index * target_size; + + // dynamic programming + for (int i = 0; i <= hyp_len; i++) { + errors_curr[err_idx(i, 0)] = i; + } + for (int j = 0; j <= ref_len; j++) { + errors_curr[err_idx(0, j)] = j; + } + for (int i = 1; i <= hyp_len; i++) { + for (int j = 1; j <= ref_len; j++) { + errors_curr[err_idx(i, j)] = min( + min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) + + 1, + errors_curr[err_idx(i - 1, j - 1)] + + 2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 0 : 1)); } + } - // back-tracing - int i = hyp_len; - int j = ref_len; - int o = hyp_len + ref_len; + // back-tracing + int i = hyp_len; + int j = ref_len; + int o = hyp_len + ref_len; - for (int k = 0; k < source_size + target_size; k++) { - operations[opt_idx(k)] = 0; - } + for (int k = 0; k < source_size + target_size; k++) { + operations[opt_idx(k)] = 0; + } - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && (errors_curr[err_idx(i, j-1)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 1; j--; // insertion - } else if ((i > 0) && (errors_curr[err_idx(i-1, j)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 2; i--; // deletion - } else { - o--; operations[opt_idx(o)] = 3; i--; j--; // do nothing - } + while ((i >= 0) && (j >= 0)) { + if ((i == 0) && (j == 0)) { + break; } - // moving to the left - for (int k = 0; k < hyp_len + ref_len; k++) { - if (k + o < hyp_len + ref_len){ - operations[opt_idx(k)] = operations[opt_idx(k+o)]; - } else{ - operations[opt_idx(k)] = 0; // padding - } + if ((j > 0) && + (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) { + o--; + operations[opt_idx(o)] = 1; + j--; // insertion + } else if ( + (i > 0) && + (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) { + o--; + operations[opt_idx(o)] = 2; + i--; // deletion + } else { + o--; + operations[opt_idx(o)] = 3; + i--; + j--; // do nothing } + } + // moving to the left + for (int k = 0; k < hyp_len + ref_len; k++) { + if (k + o < hyp_len + ref_len) { + operations[opt_idx(k)] = operations[opt_idx(k + o)]; + } else { + operations[opt_idx(k)] = 0; // padding + } + } } - torch::Tensor GenerateDeletionLabelCuda( - torch::Tensor source, - torch::Tensor operations) { - - const auto batch_size = source.size(0); - at::TensorOptions options(source.device()); - options = options.dtype(at::ScalarType::Int); - auto labels = torch::empty({batch_size, source.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); - - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] { - generate_deletion_label_kernel<scalar_t><<<batch_size, 1, 0, stream>>>( - source.data_ptr<scalar_t>(), - source.size(1), - operations.size(1), - 
operations.data_ptr<int>(), - labels.data_ptr<int>()); - })); - - return labels; + torch::Tensor source, + torch::Tensor operations) { + const auto batch_size = source.size(0); + at::TensorOptions options(source.device()); + options = options.dtype(at::ScalarType::Int); + auto labels = torch::empty({batch_size, source.size(1)}, options); + auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); + + AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] { + generate_deletion_label_kernel<scalar_t> + <<<batch_size, 1, 0, stream>>>( + source.data_ptr<scalar_t>(), + source.size(1), + operations.size(1), + operations.data_ptr<int>(), + labels.data_ptr<int>()); + })); + + return labels; } std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabelCuda( torch::Tensor target, torch::Tensor operations) { + const auto batch_size = target.size(0); + at::TensorOptions options(target.device()); + options = options.dtype(at::ScalarType::Int); + auto labels = torch::empty({batch_size, target.size(1)}, options); + auto masks = torch::empty({batch_size, target.size(1)}, options); + auto stream = at::cuda::getCurrentCUDAStream(target.device().index()); + + AT_DISPATCH_ALL_TYPES( + target.scalar_type(), "generate_insertion_labels", ([&] { + generate_insertion_label_kernel<scalar_t><<<batch_size, 1, 0, stream>>>( + target.data_ptr<scalar_t>(), + target.size(1), + operations.size(1), + operations.data_ptr<int>(), + labels.data_ptr<int>(), + masks.data_ptr<int>()); + })); -const auto batch_size = target.size(0); -at::TensorOptions options(target.device()); -options = options.dtype(at::ScalarType::Int); -auto labels = torch::empty({batch_size, target.size(1)}, options); -auto masks = torch::empty({batch_size, target.size(1)}, options); -auto stream = at::cuda::getCurrentCUDAStream(target.device().index()); - -AT_DISPATCH_ALL_TYPES(target.scalar_type(), "generate_insertion_labels", ([&] { - generate_insertion_label_kernel<scalar_t><<<batch_size, 1, 0, stream>>>( - target.data_ptr<scalar_t>(), - target.size(1), - operations.size(1), - operations.data_ptr<int>(), - labels.data_ptr<int>(), - masks.data_ptr<int>()); -})); - -return std::make_pair(labels, masks); + return std::make_pair(labels, masks); } - torch::Tensor LevenshteinDistanceCuda( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length) { - - const auto batch_size = source.size(0); - const auto shared_size = (source.size(1) + 1) * (target.size(1) + 1) * sizeof(short); - - at::TensorOptions options(source.device()); - options = options.dtype(at::ScalarType::Int); - auto operations = torch::empty({batch_size, source.size(1) + target.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); - - if (shared_size > 40000) { - auto distances = torch::empty({batch_size, (source.size(1) + 1) * (target.size(1) + 1)}, options); - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] { - levenshtein_distance_kernel<scalar_t><<<batch_size, 1, 0, stream>>>( - source.data_ptr<scalar_t>(), - target.data_ptr<scalar_t>(), - source_length.data_ptr<int>(), - target_length.data_ptr<int>(), - source.size(1), - target.size(1), - operations.data_ptr<int>(), - distances.data_ptr<int>()); - })); - } else { - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "faster_levenshtein_distance", ([&] { - faster_levenshtein_distance_kernel<scalar_t><<<batch_size, 1, shared_size, stream>>>( - source.data_ptr<scalar_t>(), - target.data_ptr<scalar_t>(), 
- source_length.data_ptr<int>(), - target_length.data_ptr<int>(), - source.size(1), - target.size(1), - operations.data_ptr<int>()); + torch::Tensor source, + torch::Tensor target, + torch::Tensor source_length, + torch::Tensor target_length) { + const auto batch_size = source.size(0); + const auto shared_size = + (source.size(1) + 1) * (target.size(1) + 1) * sizeof(short); + + at::TensorOptions options(source.device()); + options = options.dtype(at::ScalarType::Int); + auto operations = + torch::empty({batch_size, source.size(1) + target.size(1)}, options); + auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); + + if (shared_size > 40000) { + auto distances = torch::empty( + {batch_size, (source.size(1) + 1) * (target.size(1) + 1)}, options); + AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] { + levenshtein_distance_kernel<scalar_t> + <<<batch_size, 1, 0, stream>>>( + source.data_ptr<scalar_t>(), + target.data_ptr<scalar_t>(), + source_length.data_ptr<int>(), + target_length.data_ptr<int>(), + source.size(1), + target.size(1), + operations.data_ptr<int>(), + distances.data_ptr<int>()); + })); + } else { + AT_DISPATCH_ALL_TYPES( + source.scalar_type(), "faster_levenshtein_distance", ([&] { + faster_levenshtein_distance_kernel<scalar_t> + <<<batch_size, 1, shared_size, stream>>>( + source.data_ptr<scalar_t>(), + target.data_ptr<scalar_t>(), + source_length.data_ptr<int>(), + target_length.data_ptr<int>(), + source.size(1), + target.size(1), + operations.data_ptr<int>()); })); - } + } - return operations; + return operations; } diff --git a/fairseq/clib/libnat_cuda/edit_dist.h b/fairseq/clib/libnat_cuda/edit_dist.h index e3506cd34d..5220c52fd8 100644 --- a/fairseq/clib/libnat_cuda/edit_dist.h +++ b/fairseq/clib/libnat_cuda/edit_dist.h @@ -11,15 +11,15 @@ #include <torch/extension.h> torch::Tensor LevenshteinDistanceCuda( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length); + torch::Tensor source, + torch::Tensor target, + torch::Tensor source_length, + torch::Tensor target_length); torch::Tensor GenerateDeletionLabelCuda( - torch::Tensor source, - torch::Tensor operations); + torch::Tensor source, + torch::Tensor operations); std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabelCuda( - torch::Tensor source, - torch::Tensor operations); + torch::Tensor source, + torch::Tensor operations); diff --git a/fairseq/config/__init__.py b/fairseq/config/__init__.py new file mode 100644 index 0000000000..6264236915 --- /dev/null +++ b/fairseq/config/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/fairseq/config/config.yaml b/fairseq/config/config.yaml new file mode 100644 index 0000000000..2ed7168cb7 --- /dev/null +++ b/fairseq/config/config.yaml @@ -0,0 +1,19 @@ +# @package _group_ + +hydra: + run: + dir: . 
+ +defaults: + - _self_ + - task: null + - model: null + - criterion: cross_entropy + - optimizer: null + - lr_scheduler: fixed + - bpe: null + - tokenizer: null + - scoring: null + - generation: null + - common_eval: null + - eval_lm: null diff --git a/fairseq/config/fb_run_config/slurm.yaml b/fairseq/config/fb_run_config/slurm.yaml new file mode 100644 index 0000000000..20cf8f5201 --- /dev/null +++ b/fairseq/config/fb_run_config/slurm.yaml @@ -0,0 +1,29 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '__' + exclude_keys: + - fb_run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + launcher: + cpus_per_task: 60 + gpus_per_node: ??? + tasks_per_node: 1 + nodes: 1 + partition: learnfair + mem_gb: 400 + timeout_min: 4320 + max_num_timeout: 10 + name: ${env:PREFIX}_${hydra.job.config_name} + submitit_folder: ${hydra.sweep.dir} + +distributed_training: + ddp_backend: c10d + distributed_world_size: ??? + distributed_port: ??? diff --git a/config/model/transformer_lm_baevski_gbw.yaml b/fairseq/config/model/transformer_lm/transformer_lm_baevski_gbw.yaml similarity index 100% rename from config/model/transformer_lm_baevski_gbw.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_baevski_gbw.yaml diff --git a/config/model/transformer_lm_baevski_wiki103.yaml b/fairseq/config/model/transformer_lm/transformer_lm_baevski_wiki103.yaml similarity index 100% rename from config/model/transformer_lm_baevski_wiki103.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_baevski_wiki103.yaml diff --git a/config/model/transformer_lm_big.yaml b/fairseq/config/model/transformer_lm/transformer_lm_big.yaml similarity index 100% rename from config/model/transformer_lm_big.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_big.yaml diff --git a/config/model/transformer_lm_gbw.yaml b/fairseq/config/model/transformer_lm/transformer_lm_gbw.yaml similarity index 100% rename from config/model/transformer_lm_gbw.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_gbw.yaml diff --git a/config/model/transformer_lm_gpt.yaml b/fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml similarity index 100% rename from config/model/transformer_lm_gpt.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml diff --git a/config/model/transformer_lm_gpt2_big.yaml b/fairseq/config/model/transformer_lm/transformer_lm_gpt2_big.yaml similarity index 100% rename from config/model/transformer_lm_gpt2_big.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_gpt2_big.yaml diff --git a/config/model/transformer_lm_gpt2_medium.yaml b/fairseq/config/model/transformer_lm/transformer_lm_gpt2_medium.yaml similarity index 100% rename from config/model/transformer_lm_gpt2_medium.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_gpt2_medium.yaml diff --git a/config/model/transformer_lm_gpt2_small.yaml b/fairseq/config/model/transformer_lm/transformer_lm_gpt2_small.yaml similarity index 100% rename from config/model/transformer_lm_gpt2_small.yaml rename to fairseq/config/model/transformer_lm/transformer_lm_gpt2_small.yaml diff --git a/config/model/transformer_lm_wiki103.yaml b/fairseq/config/model/transformer_lm/transformer_lm_wiki103.yaml similarity index 100% rename from config/model/transformer_lm_wiki103.yaml rename to 
fairseq/config/model/transformer_lm/transformer_lm_wiki103.yaml diff --git a/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml b/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml new file mode 100644 index 0000000000..ee1329bf46 --- /dev/null +++ b/fairseq/config/model/wav2vec/vq_wav2vec_gumbel.yaml @@ -0,0 +1,5 @@ +# @package _group_ +activation: gelu +vq_type: gumbel +vq_depth: 2 +combine_groups: true diff --git a/fairseq/config/model/wav2vec2/wav2vec2_base.yaml b/fairseq/config/model/wav2vec2/wav2vec2_base.yaml new file mode 100644 index 0000000000..ce65499b80 --- /dev/null +++ b/fairseq/config/model/wav2vec2/wav2vec2_base.yaml @@ -0,0 +1,8 @@ +# @package _group_ + +quantize_targets: true +final_dim: 256 +encoder_layerdrop: 0.05 +dropout_input: 0.1 +dropout_features: 0.1 +feature_grad_mult: 0.1 diff --git a/fairseq/config/model/wav2vec2/wav2vec2_large.yaml b/fairseq/config/model/wav2vec2/wav2vec2_large.yaml new file mode 100644 index 0000000000..5846f75243 --- /dev/null +++ b/fairseq/config/model/wav2vec2/wav2vec2_large.yaml @@ -0,0 +1,20 @@ +# @package _group_ + +quantize_targets: true +extractor_mode: layer_norm +layer_norm_first: true +final_dim: 768 +latent_temp: [2.0,0.1,0.999995] +encoder_layerdrop: 0.0 +dropout_input: 0.0 +dropout_features: 0.0 +dropout: 0.0 +attention_dropout: 0.0 +conv_bias: true + +encoder_layers: 24 +encoder_embed_dim: 1024 +encoder_ffn_embed_dim: 4096 +encoder_attention_heads: 16 + +feature_grad_mult: 1.0 diff --git a/fairseq/criterions/__init__.py b/fairseq/criterions/__init__.py index 8cc6c0f043..ecd65d34ad 100644 --- a/fairseq/criterions/__init__.py +++ b/fairseq/criterions/__init__.py @@ -25,12 +25,12 @@ ) -def build_criterion(cfg: DictConfig, task): - return build_criterion_(cfg, task) +def build_criterion(cfg: DictConfig, task, from_checkpoint=False): + return build_criterion_(cfg, task, from_checkpoint=from_checkpoint) # automatically import any Python files in the criterions/ directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): file_name = file[: file.find(".py")] importlib.import_module("fairseq.criterions." + file_name) diff --git a/fairseq/criterions/adaptive_loss.py b/fairseq/criterions/adaptive_loss.py index 04832295ec..fc1ac85404 100644 --- a/fairseq/criterions/adaptive_loss.py +++ b/fairseq/criterions/adaptive_loss.py @@ -7,11 +7,12 @@ from dataclasses import dataclass import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import FairseqDataclass from fairseq.dataclass.constants import DDP_BACKEND_CHOICES -from omegaconf import II, DictConfig +from omegaconf import II @dataclass @@ -31,12 +32,12 @@ def __init__(self, task, sentence_avg): self.sentence_avg = sentence_avg @classmethod - def build_criterion(cls, cfg: DictConfig, task): - if cfg.ddp_backend == "c10d": + def build_criterion(cls, cfg: AdaptiveLossConfig, task): + if cfg.ddp_backend in {"c10d", "pytorch_ddp"}: raise Exception( - "AdaptiveLoss is not compatible with the c10d " + "AdaptiveLoss is not compatible with the PyTorch " "version of DistributedDataParallel. Please use " - "`--ddp-backend=no_c10d` instead." + "`--ddp-backend=legacy_ddp` instead." 
) return cls(task, cfg.sentence_avg) diff --git a/fairseq/criterions/cross_entropy.py b/fairseq/criterions/cross_entropy.py index 758e727660..24d6bcd612 100644 --- a/fairseq/criterions/cross_entropy.py +++ b/fairseq/criterions/cross_entropy.py @@ -7,7 +7,8 @@ from dataclasses import dataclass import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import FairseqDataclass from omegaconf import II @@ -64,6 +65,7 @@ def reduce_metrics(logging_outputs) -> None: ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + # we divide by log(2) to convert the loss from base e to base 2 metrics.log_scalar( "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) diff --git a/fairseq/criterions/ctc.py b/fairseq/criterions/ctc.py index 6b77ce47eb..368213cb2b 100644 --- a/fairseq/criterions/ctc.py +++ b/fairseq/criterions/ctc.py @@ -6,39 +6,107 @@ import math from argparse import Namespace +from dataclasses import dataclass, field +from omegaconf import II +from typing import Optional import torch import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import LegacyFairseqCriterion, register_criterion + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass from fairseq.data.data_utils import post_process +from fairseq.tasks import FairseqTask from fairseq.logging.meters import safe_round -@register_criterion("ctc") -class CtcCriterion(LegacyFairseqCriterion): - def __init__(self, args, task): - super().__init__(args, task) - self.blank_idx = task.target_dictionary.bos() +@dataclass +class CtcCriterionConfig(FairseqDataclass): + zero_infinity: bool = field( + default=False, + metadata={"help": "zero inf loss when source length <= target length"}, + ) + sentence_avg: bool = II("optimization.sentence_avg") + post_process: str = field( + default="letter", + metadata={ + "help": "how to post process predictions into words. can be letter, " + "wordpiece, BPE symbols, etc. 
" + "See fairseq.data.data_utils.post_process() for full list of options" + }, + ) + wer_kenlm_model: Optional[str] = field( + default=None, + metadata={ + "help": "if this is provided, use kenlm to compute wer (along with other wer_* args)" + }, + ) + wer_lexicon: Optional[str] = field( + default=None, + metadata={"help": "lexicon to use with wer_kenlm_model"}, + ) + wer_lm_weight: float = field( + default=2.0, + metadata={"help": "lm weight to use with wer_kenlm_model"}, + ) + wer_word_score: float = field( + default=-1.0, + metadata={"help": "lm word score to use with wer_kenlm_model"}, + ) + wer_sil_weight: float = field( + default=0, + metadata={"help": "lm word score to use with wer_kenlm_model"}, + ) + + wer_args: Optional[str] = field( + default=None, + metadata={ + "help": "DEPRECATED: tuple of (wer_kenlm_model, wer_lexicon, wer_lm_weight, wer_word_score)" + }, + ) + + +@register_criterion("ctc", dataclass=CtcCriterionConfig) +class CtcCriterion(FairseqCriterion): + def __init__( + self, cfg: CtcCriterionConfig, task: FairseqTask, rdrop_alpha: int = 0.0 + ): + super().__init__(task) + self.blank_idx = ( + task.target_dictionary.index(task.blank_symbol) + if hasattr(task, "blank_symbol") + else 0 + ) self.pad_idx = task.target_dictionary.pad() self.eos_idx = task.target_dictionary.eos() - self.post_process = args.post_process if args.post_process else "letter" + self.post_process = cfg.post_process - if args.wer_args is not None: - from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder + self.rdrop_alpha = rdrop_alpha - wer_compute_kenlm, wer_lexicon, lm_w, ws_w = eval(args.wer_args) + if cfg.wer_args is not None: + ( + cfg.wer_kenlm_model, + cfg.wer_lexicon, + cfg.wer_lm_weight, + cfg.wer_word_score, + ) = eval(cfg.wer_args) + + if cfg.wer_kenlm_model is not None and cfg.wer_kenlm_model != "": + from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder dec_args = Namespace() dec_args.nbest = 1 dec_args.criterion = "ctc" - dec_args.kenlm_model = wer_compute_kenlm - dec_args.lexicon = wer_lexicon + dec_args.kenlm_model = cfg.wer_kenlm_model + dec_args.lexicon = cfg.wer_lexicon dec_args.beam = 50 dec_args.beam_size_token = min(50, len(task.target_dictionary)) dec_args.beam_threshold = min(50, len(task.target_dictionary)) - dec_args.lm_weight = lm_w - dec_args.word_score = ws_w + dec_args.lm_weight = cfg.wer_lm_weight + dec_args.word_score = cfg.wer_word_score + dec_args.sil_weight = cfg.wer_sil_weight dec_args.unk_weight = -math.inf dec_args.sil_weight = 0 @@ -46,49 +114,53 @@ def __init__(self, args, task): else: self.w2l_decoder = None - self.zero_infinity = args.zero_infinity - self.sentence_avg = args.sentence_avg + self.zero_infinity = cfg.zero_infinity + self.sentence_avg = cfg.sentence_avg - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - parser.add_argument( - "--zero-infinity", action="store_true", help="zero inf loss" - ) - try: - parser.add_argument( - "--post-process", - "--remove-bpe", - default="letter", - help="remove BPE tokens before scoring (can be set to sentencepiece, letter, and more)", - ) - except: - pass # this option might have been added from eval args - parser.add_argument( - "--wer-args", - type=str, - default=None, - help="options for wer computation on valid set using 4 gram lm. 
this should be a tuple of 4 elements: path to 4-gram lm, \ - path to lexicon, lm score, word score", - ) - - def forward(self, model, sample, reduce=True): + def forward(self, model, sample, reduce=True, **kwargs): net_output = model(**sample["net_input"]) lprobs = model.get_normalized_probs( net_output, log_probs=True ).contiguous() # (T, B, C) from the encoder + # CTC loss is calculated over duplicated inputs + # sample is already duplicated for R-Drop + if self.rdrop_alpha > 0: + for k, v in sample.items(): + if k in ["target", "target_lengths"]: + sample[k] = torch.cat([v, v.clone()], dim=0) + elif k == "net_input": + if sample[k]["src_tokens"].size(1) != sample[k]["src_lengths"].size( + 0 + ): + # for decoder CTC loss + sample[k]["src_lengths"] = torch.cat( + [ + sample[k]["src_lengths"], + sample[k]["src_lengths"].clone(), + ], + dim=0, + ) + if "src_lengths" in sample["net_input"]: input_lengths = sample["net_input"]["src_lengths"] else: - non_padding_mask = ~net_output["padding_mask"] - input_lengths = non_padding_mask.long().sum(-1) + if net_output["padding_mask"] is not None: + non_padding_mask = ~net_output["padding_mask"] + input_lengths = non_padding_mask.long().sum(-1) + else: + input_lengths = lprobs.new_full( + (lprobs.size(1),), lprobs.size(0), dtype=torch.long + ) pad_mask = (sample["target"] != self.pad_idx) & ( sample["target"] != self.eos_idx ) targets_flat = sample["target"].masked_select(pad_mask) - target_lengths = sample["target_lengths"] + if "target_lengths" in sample: + target_lengths = sample["target_lengths"] + else: + target_lengths = pad_mask.sum(-1) with torch.backends.cudnn.flags(enabled=False): loss = F.ctc_loss( diff --git a/fairseq/criterions/fairseq_criterion.py b/fairseq/criterions/fairseq_criterion.py index b2eda1a7e4..0b1e64a8e3 100644 --- a/fairseq/criterions/fairseq_criterion.py +++ b/fairseq/criterions/fairseq_criterion.py @@ -6,9 +6,10 @@ import inspect from typing import Any, Dict, List -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics +from fairseq.dataclass import FairseqDataclass from fairseq.dataclass.utils import gen_parser_from_dataclass -from omegaconf import DictConfig from torch.nn.modules.loss import _Loss @@ -28,7 +29,7 @@ def add_args(cls, parser): gen_parser_from_dataclass(parser, dc()) @classmethod - def build_criterion(cls, cfg: DictConfig, task): + def build_criterion(cls, cfg: FairseqDataclass, task): """Construct a criterion from command-line args.""" # arguments in the __init__. init_args = {} @@ -46,6 +47,8 @@ def build_criterion(cls, cfg: DictConfig, task): if p.name == "task": init_args["task"] = task + elif p.name == "cfg": + init_args["cfg"] = cfg elif hasattr(cfg, p.name): init_args[p.name] = getattr(cfg, p.name) elif p.default != p.empty: diff --git a/fairseq/criterions/fastspeech2_loss.py b/fairseq/criterions/fastspeech2_loss.py new file mode 100644 index 0000000000..ab7cd08e3b --- /dev/null +++ b/fairseq/criterions/fastspeech2_loss.py @@ -0,0 +1,137 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
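For context on the CTC criterion refactor above: the deprecated wer_args string is still accepted and eval()'d into the new discrete wer_* fields of CtcCriterionConfig. A small, hypothetical illustration of that mapping (paths are placeholders, not real files; the dict below is only for illustration and is not a fairseq API):

    # Hypothetical values; paths are placeholders.
    wer_args = '("/path/to/4gram.bin", "/path/to/lexicon.txt", 2.0, -1.0)'

    # What the criterion does with the deprecated string:
    kenlm_model, lexicon, lm_weight, word_score = eval(wer_args)

    # Equivalent explicit configuration using the new dataclass fields:
    new_style = dict(
        wer_kenlm_model=kenlm_model,   # "/path/to/4gram.bin"
        wer_lexicon=lexicon,           # "/path/to/lexicon.txt"
        wer_lm_weight=lm_weight,       # 2.0
        wer_word_score=word_score,     # -1.0
    )
    assert new_style["wer_lm_weight"] == 2.0
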
+ +from typing import List, Dict, Any +from dataclasses import dataclass, field + +import torch +import torch.nn.functional as F + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from fairseq.data.data_utils import lengths_to_mask +from fairseq.models.fairseq_model import FairseqEncoderModel + + +@dataclass +class FastSpeech2CriterionConfig(FairseqDataclass): + ctc_weight: float = field(default=0.0, metadata={"help": "weight for CTC loss"}) + + +@register_criterion("fastspeech2", dataclass=FastSpeech2CriterionConfig) +class FastSpeech2Loss(FairseqCriterion): + def __init__(self, task, ctc_weight): + super().__init__(task) + self.ctc_weight = ctc_weight + + def forward(self, model: FairseqEncoderModel, sample, reduction="mean"): + src_tokens = sample["net_input"]["src_tokens"] + src_lens = sample["net_input"]["src_lengths"] + tgt_lens = sample["target_lengths"] + _feat_out, _feat_out_post, _, log_dur_out, pitch_out, energy_out = model( + src_tokens=src_tokens, + src_lengths=src_lens, + prev_output_tokens=sample["net_input"]["prev_output_tokens"], + incremental_state=None, + target_lengths=tgt_lens, + speaker=sample["speaker"], + durations=sample["durations"], + pitches=sample["pitches"], + energies=sample["energies"], + ) + + src_mask = lengths_to_mask(sample["net_input"]["src_lengths"]) + tgt_mask = lengths_to_mask(sample["target_lengths"]) + + pitches, energies = sample["pitches"], sample["energies"] + pitch_out, pitches = pitch_out[src_mask], pitches[src_mask] + energy_out, energies = energy_out[src_mask], energies[src_mask] + + feat_out, feat = _feat_out[tgt_mask], sample["target"][tgt_mask] + l1_loss = F.l1_loss(feat_out, feat, reduction=reduction) + if _feat_out_post is not None: + l1_loss += F.l1_loss(_feat_out_post[tgt_mask], feat, reduction=reduction) + + pitch_loss = F.mse_loss(pitch_out, pitches, reduction=reduction) + energy_loss = F.mse_loss(energy_out, energies, reduction=reduction) + + log_dur_out = log_dur_out[src_mask] + dur = sample["durations"].float() + dur = dur.half() if log_dur_out.type().endswith(".HalfTensor") else dur + log_dur = torch.log(dur + 1)[src_mask] + dur_loss = F.mse_loss(log_dur_out, log_dur, reduction=reduction) + + ctc_loss = torch.tensor(0.0).type_as(l1_loss) + if self.ctc_weight > 0.0: + lprobs = model.get_normalized_probs((_feat_out,), log_probs=True) + lprobs = lprobs.transpose(0, 1) # T x B x C + src_mask = lengths_to_mask(src_lens) + src_tokens_flat = src_tokens.masked_select(src_mask) + ctc_loss = ( + F.ctc_loss( + lprobs, + src_tokens_flat, + tgt_lens, + src_lens, + reduction=reduction, + zero_infinity=True, + ) + * self.ctc_weight + ) + + loss = l1_loss + dur_loss + pitch_loss + energy_loss + ctc_loss + + sample_size = sample["nsentences"] + logging_output = { + "loss": utils.item(loss.data), + "ntokens": sample["ntokens"], + "nsentences": sample["nsentences"], + "sample_size": sample_size, + "l1_loss": utils.item(l1_loss.data), + "dur_loss": utils.item(dur_loss.data), + "pitch_loss": utils.item(pitch_loss.data), + "energy_loss": utils.item(energy_loss.data), + "ctc_loss": utils.item(ctc_loss.data), + } + return loss, sample_size, logging_output + + @classmethod + def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None: + ns = [log.get("sample_size", 0) for log in logging_outputs] + ntot = sum(ns) + ws = [n / (ntot + 1e-8) for n in ns] + for key in [ + "loss", + "l1_loss", + "dur_loss", + 
"pitch_loss", + "energy_loss", + "ctc_loss", + ]: + vals = [log.get(key, 0) for log in logging_outputs] + val = sum(val * w for val, w in zip(vals, ws)) + metrics.log_scalar(key, val, ntot, round=3) + metrics.log_scalar("sample_size", ntot, len(logging_outputs)) + + # inference metrics + if "targ_frames" not in logging_outputs[0]: + return + n = sum(log.get("targ_frames", 0) for log in logging_outputs) + for key, new_key in [ + ("mcd_loss", "mcd_loss"), + ("pred_frames", "pred_ratio"), + ("nins", "ins_rate"), + ("ndel", "del_rate"), + ]: + val = sum(log.get(key, 0) for log in logging_outputs) + metrics.log_scalar(new_key, val / n, n, round=3) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + return False diff --git a/fairseq/criterions/hubert_criterion.py b/fairseq/criterions/hubert_criterion.py new file mode 100644 index 0000000000..262874b582 --- /dev/null +++ b/fairseq/criterions/hubert_criterion.py @@ -0,0 +1,195 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import re +from dataclasses import dataclass, field +from typing import List, Optional + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass + + +@dataclass +class HubertCriterionConfig(FairseqDataclass): + pred_masked_weight: float = field( + default=1.0, + metadata={"help": "weight for predictive loss for masked frames"}, + ) + pred_nomask_weight: float = field( + default=0.0, + metadata={"help": "weight for predictive loss for unmasked frames"}, + ) + loss_weights: Optional[List[float]] = field( + default=None, + metadata={"help": "weights for additional loss terms (not first one)"}, + ) + log_keys: List[str] = field( + default_factory=lambda: [], + metadata={"help": "output keys to log"}, + ) + + +@register_criterion("hubert", dataclass=HubertCriterionConfig) +class HubertCriterion(FairseqCriterion): + def __init__( + self, + task, + pred_masked_weight, + pred_nomask_weight, + loss_weights=None, + log_keys=None, + ): + super().__init__(task) + self.pred_masked_weight = pred_masked_weight + self.pred_nomask_weight = pred_nomask_weight + self.loss_weights = loss_weights + self.log_keys = [] if log_keys is None else log_keys + + def forward(self, model, sample, reduce=True, log_pred=False): + """Compute the loss for the given sample. 
+ Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + net_output = model(target_list=sample["target_list"], **sample["net_input"]) + loss = 0.0 + sample_size = 0 + logging_output = {} + reduction = "sum" if reduce else "none" + + loss_m_list = [] + logp_m_list = model.get_logits(net_output, True) + targ_m_list = model.get_targets(net_output, True) + assert self.pred_masked_weight == 0 or len(logp_m_list) > 0 + for i, (logp_m, targ_m) in enumerate(zip(logp_m_list, targ_m_list)): + loss_m = F.cross_entropy(logp_m, targ_m, reduction=reduction) + loss_m_list.append(loss_m) + logging_output[f"loss_m_{i}"] = loss_m.detach().item() + if self.pred_masked_weight > 0: + loss += self.pred_masked_weight * sum(loss_m_list) + sample_size += targ_m_list[0].numel() + + loss_u_list = [] + logp_u_list = model.get_logits(net_output, False) + targ_u_list = model.get_targets(net_output, False) + assert self.pred_nomask_weight == 0 or len(logp_u_list) > 0 + for i, (logp_u, targ_u) in enumerate(zip(logp_u_list, targ_u_list)): + loss_u = F.cross_entropy(logp_u, targ_u, reduction=reduction) + loss_u_list.append(loss_u) + logging_output[f"loss_u_{i}"] = loss_u.detach().item() + if self.pred_nomask_weight > 0: + loss += self.pred_nomask_weight * sum(loss_u_list) + sample_size += targ_u_list[0].numel() + + if self.loss_weights is not None: + assert hasattr(model, "get_extra_losses") + extra_losses, names = model.get_extra_losses(net_output) + if torch.is_tensor(extra_losses): + extra_losses = [extra_losses] + names = [names] + if len(self.loss_weights) == 1 and len(extra_losses) != 1: + self.loss_weights = [self.loss_weights[0]] * len(extra_losses) + assert len(extra_losses) == len( + self.loss_weights + ), f"{len(extra_losses)}, {len(self.loss_weights)}" + for p, n, coef in zip(extra_losses, names, self.loss_weights): + if coef != 0 and p is not None: + p = coef * p.float() * sample_size + loss += p + logging_output[f"loss_{n}"] = p.item() + + logging_output = { + "loss": loss.item() if reduce else loss, + "ntokens": sample_size, + "nsentences": sample["id"].numel(), + "sample_size": sample_size, + **logging_output, + } + + for lk in self.log_keys: + if lk in net_output: + logging_output[lk] = float((net_output[lk])) + + def compute_correct(logits): + if logits.numel() == 0: + return 0, 0 + else: + assert logits.dim() > 1, logits.shape + max = logits.argmax(-1) == 0 + min = logits.argmin(-1) == 0 + both = max & min + corr = max.long().sum().item() - both.long().sum().item() + count = max.numel() + return corr, count + + with torch.no_grad(): + for i, logp_m in enumerate(logp_m_list): + corr_m, count_m = compute_correct(logp_m) + logging_output[f"correct_m_{i}"] = corr_m + logging_output[f"count_m_{i}"] = count_m + + for i, logp_u in enumerate(logp_u_list): + corr_u, count_u = compute_correct(logp_u) + logging_output[f"correct_u_{i}"] = corr_u + logging_output[f"count_u_{i}"] = count_u + + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training (copied from normal cross entropy).""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + if 
sample_size != ntokens: + metrics.log_scalar( + "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) + ) + else: + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) + ) + + counts = {} + for lk in logging_outputs[0].keys(): + if lk.startswith("count_"): + val = sum(log[lk] for log in logging_outputs) + metrics.log_scalar(lk, val) + counts[lk] = val + + for lk in logging_outputs[0].keys(): + if lk.startswith("loss_"): + val = sum(log[lk] for log in logging_outputs) + metrics.log_scalar(lk, val / sample_size / math.log(2), round=3) + elif lk.startswith("correct_"): + val = sum(log[lk] for log in logging_outputs) + metrics.log_scalar(lk, val / counts[re.sub("correct", "count", lk)]) + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + raise NotImplementedError() + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return False diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py index 2dc7f7a47d..325679bb16 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy.py +++ b/fairseq/criterions/label_smoothed_cross_entropy.py @@ -4,10 +4,31 @@ # LICENSE file in the root directory of this source tree. import math +from dataclasses import dataclass, field import torch -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from omegaconf import II + + +@dataclass +class LabelSmoothedCrossEntropyCriterionConfig(FairseqDataclass): + label_smoothing: float = field( + default=0.0, + metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"}, + ) + report_accuracy: bool = field( + default=False, + metadata={"help": "report accuracy metric"}, + ) + ignore_prefix_size: int = field( + default=0, + metadata={"help": "Ignore first N tokens"}, + ) + sentence_avg: bool = II("optimization.sentence_avg") def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True): @@ -25,12 +46,14 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=T if reduce: nll_loss = nll_loss.sum() smooth_loss = smooth_loss.sum() - eps_i = epsilon / lprobs.size(-1) - loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss + eps_i = epsilon / (lprobs.size(-1) - 1) + loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss return loss, nll_loss -@register_criterion("label_smoothed_cross_entropy") +@register_criterion( + "label_smoothed_cross_entropy", dataclass=LabelSmoothedCrossEntropyCriterionConfig +) class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): def __init__( self, @@ -46,18 +69,6 @@ def __init__( self.ignore_prefix_size = ignore_prefix_size self.report_accuracy = report_accuracy - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', - help='epsilon for label smoothing, 0 means no label smoothing') - parser.add_argument('--report-accuracy', action='store_true', - 
help='report accuracy metric') - parser.add_argument('--ignore-prefix-size', default=0, type=int, - help='Ignore first N tokens') - # fmt: on - def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. @@ -88,12 +99,9 @@ def get_lprobs_and_target(self, model, net_output, sample): lprobs = model.get_normalized_probs(net_output, log_probs=True) target = model.get_targets(sample, net_output) if self.ignore_prefix_size > 0: - if getattr(lprobs, "batch_first", False): - lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous() - target = target[:, self.ignore_prefix_size :].contiguous() - else: - lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous() - target = target[self.ignore_prefix_size :, :].contiguous() + # lprobs: B x T x C + lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous() + target = target[:, self.ignore_prefix_size :].contiguous() return lprobs.view(-1, lprobs.size(-1)), target.view(-1) def compute_loss(self, model, net_output, sample, reduce=True): diff --git a/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py b/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py new file mode 100644 index 0000000000..6eaedab9cf --- /dev/null +++ b/fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py @@ -0,0 +1,221 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field +import torch +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import register_criterion +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, + LabelSmoothedCrossEntropyCriterionConfig, +) + +try: + from simuleval.metrics.latency import ( + AverageLagging, + AverageProportion, + DifferentiableAverageLagging, + ) + + LATENCY_METRICS = { + "average_lagging": AverageLagging, + "average_proportion": AverageProportion, + "differentiable_average_lagging": DifferentiableAverageLagging, + } +except ImportError: + LATENCY_METRICS = None + + +@dataclass +class LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig( + LabelSmoothedCrossEntropyCriterionConfig +): + latency_avg_weight: float = field( + default=0.0, + metadata={"help": "weight fot average latency loss."}, + ) + latency_var_weight: float = field( + default=0.0, + metadata={"help": "weight fot variance latency loss."}, + ) + latency_avg_type: str = field( + default="differentiable_average_lagging", + metadata={"help": "latency type for average loss"}, + ) + latency_var_type: str = field( + default="variance_delay", + metadata={"help": "latency typ for variance loss"}, + ) + latency_gather_method: str = field( + default="weighted_average", + metadata={"help": "method to gather latency loss for all heads"}, + ) + latency_update_after: int = field( + default=0, + metadata={"help": "Add latency loss after certain steps"}, + ) + + +@register_criterion( + "latency_augmented_label_smoothed_cross_entropy", + dataclass=LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig, +) +class LatencyAugmentedLabelSmoothedCrossEntropyCriterion( + LabelSmoothedCrossEntropyCriterion +): + def __init__( + self, + task, + sentence_avg, + label_smoothing, + ignore_prefix_size, + report_accuracy, + latency_avg_weight, + latency_var_weight, + latency_avg_type, + latency_var_type, + latency_gather_method, + latency_update_after, + ): + 
super().__init__( + task, sentence_avg, label_smoothing, ignore_prefix_size, report_accuracy + ) + assert LATENCY_METRICS is not None, "Please make sure SimulEval is installed." + + self.latency_avg_weight = latency_avg_weight + self.latency_var_weight = latency_var_weight + self.latency_avg_type = latency_avg_type + self.latency_var_type = latency_var_type + self.latency_gather_method = latency_gather_method + self.latency_update_after = latency_update_after + + def forward(self, model, sample, reduce=True): + net_output = model(**sample["net_input"]) + # 1. Compute cross entropy loss + loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) + + # 2. Compute cross latency loss + latency_loss, expected_latency, expected_delays_var = self.compute_latency_loss( + model, sample, net_output + ) + + if self.latency_update_after > 0: + num_updates = getattr(model.decoder, "num_updates", None) + assert ( + num_updates is not None + ), "model.decoder doesn't have attribute 'num_updates'" + if num_updates <= self.latency_update_after: + latency_loss = 0 + + loss += latency_loss + + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + + logging_output = { + "loss": loss.data, + "nll_loss": nll_loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + "latency": expected_latency, + "delays_var": expected_delays_var, + "latency_loss": latency_loss, + } + + if self.report_accuracy: + n_correct, total = self.compute_accuracy(model, net_output, sample) + logging_output["n_correct"] = utils.item(n_correct.data) + logging_output["total"] = utils.item(total.data) + return loss, sample_size, logging_output + + def compute_latency_loss(self, model, sample, net_output): + assert ( + net_output[-1].encoder_padding_mask is None + or not net_output[-1].encoder_padding_mask[:, 0].any() + ), "Only right padding on source is supported." + # 1. 
Obtain the expected alignment + alpha_list = [item["alpha"] for item in net_output[1].attn_list] + num_layers = len(alpha_list) + bsz, num_heads, tgt_len, src_len = alpha_list[0].size() + + # bsz * num_layers * num_heads, tgt_len, src_len + alpha_all = torch.cat(alpha_list, dim=1).view(-1, tgt_len, src_len) + + # 2 compute expected delays + # bsz * num_heads * num_layers, tgt_len, src_len for MMA + steps = ( + torch.arange(1, 1 + src_len) + .unsqueeze(0) + .unsqueeze(1) + .expand_as(alpha_all) + .type_as(alpha_all) + ) + + expected_delays = torch.sum(steps * alpha_all, dim=-1) + + target_padding_mask = ( + model.get_targets(sample, net_output) + .eq(self.padding_idx) + .unsqueeze(1) + .expand(bsz, num_layers * num_heads, tgt_len) + .contiguous() + .view(-1, tgt_len) + ) + + src_lengths = ( + sample["net_input"]["src_lengths"] + .unsqueeze(1) + .expand(bsz, num_layers * num_heads) + .contiguous() + .view(-1) + ) + expected_latency = LATENCY_METRICS[self.latency_avg_type]( + expected_delays, src_lengths, None, target_padding_mask=target_padding_mask + ) + + # 2.1 average expected latency of heads + # bsz, num_layers * num_heads + expected_latency = expected_latency.view(bsz, -1) + if self.latency_gather_method == "average": + # bsz * tgt_len + expected_latency = expected_delays.mean(dim=1) + elif self.latency_gather_method == "weighted_average": + weights = torch.nn.functional.softmax(expected_latency, dim=1) + expected_latency = torch.sum(expected_latency * weights, dim=1) + elif self.latency_gather_method == "max": + expected_latency = expected_latency.max(dim=1)[0] + else: + raise NotImplementedError + + expected_latency = expected_latency.sum() + avg_loss = self.latency_avg_weight * expected_latency + + # 2.2 variance of expected delays + expected_delays_var = ( + expected_delays.view(bsz, -1, tgt_len).var(dim=1).mean(dim=1) + ) + expected_delays_var = expected_delays_var.sum() + var_loss = self.latency_avg_weight * expected_delays_var + + # 3. 
Final loss + latency_loss = avg_loss + var_loss + + return latency_loss, expected_latency, expected_delays_var + + @classmethod + def reduce_metrics(cls, logging_outputs) -> None: + super().reduce_metrics(logging_outputs) + latency = sum(log.get("latency", 0) for log in logging_outputs) + delays_var = sum(log.get("delays_var", 0) for log in logging_outputs) + latency_loss = sum(log.get("latency_loss", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + metrics.log_scalar("latency", latency.float() / nsentences, nsentences, round=3) + metrics.log_scalar("delays_var", delays_var / nsentences, nsentences, round=3) + metrics.log_scalar( + "latency_loss", latency_loss / nsentences, nsentences, round=3 + ) diff --git a/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py index 73cfa05310..b55f65e5cc 100644 --- a/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py +++ b/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py @@ -5,13 +5,31 @@ import math -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import register_criterion -from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion +from .label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, + LabelSmoothedCrossEntropyCriterionConfig, +) +from dataclasses import dataclass, field -@register_criterion("label_smoothed_cross_entropy_with_alignment") + +@dataclass +class LabelSmoothedCrossEntropyCriterionWithAlignmentConfig( + LabelSmoothedCrossEntropyCriterionConfig +): + alignment_lambda: float = field( + default=0.05, metadata={"help": "weight for the alignment loss"} + ) + + +@register_criterion( + "label_smoothed_cross_entropy_with_alignment", + dataclass=LabelSmoothedCrossEntropyCriterionWithAlignmentConfig, +) class LabelSmoothedCrossEntropyCriterionWithAlignment( LabelSmoothedCrossEntropyCriterion ): @@ -19,18 +37,6 @@ def __init__(self, task, sentence_avg, label_smoothing, alignment_lambda): super().__init__(task, sentence_avg, label_smoothing) self.alignment_lambda = alignment_lambda - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - LabelSmoothedCrossEntropyCriterion.add_args(parser) - parser.add_argument( - "--alignment-lambda", - default=0.05, - type=float, - metavar="D", - help="weight for the alignment loss", - ) - def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. diff --git a/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py b/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py new file mode 100644 index 0000000000..f2e8cdf3bf --- /dev/null +++ b/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py @@ -0,0 +1,97 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
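To make the expected-delay computation in the latency-augmented criterion above concrete, here is a small self-contained sketch with toy tensors (not fairseq code): each target step's expected delay is the attention-weighted average of the source positions 1..src_len, and these per-step delays are what the SimulEval latency metrics above consume.

    import torch

    # Toy monotonic-attention weights: batch=1, heads=1, tgt_len=2, src_len=3.
    alpha = torch.tensor([[[[0.7, 0.2, 0.1],
                            [0.1, 0.3, 0.6]]]])       # B x H x T x S
    bsz, num_heads, tgt_len, src_len = alpha.size()
    alpha_all = alpha.view(-1, tgt_len, src_len)      # (B*H) x T x S

    # Source positions 1..src_len, broadcast to the attention shape.
    steps = (
        torch.arange(1, src_len + 1)
        .view(1, 1, src_len)
        .expand_as(alpha_all)
        .type_as(alpha_all)
    )

    # Expected delay per target step: sum_j j * alpha[..., j].
    expected_delays = torch.sum(steps * alpha_all, dim=-1)
    print(expected_delays)   # tensor([[1.4000, 2.5000]])
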
+ +import math +from dataclasses import dataclass, field + +import torch +import torch.nn.functional as F + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import register_criterion +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, + LabelSmoothedCrossEntropyCriterionConfig, +) +from fairseq.data.data_utils import lengths_to_mask + + +@dataclass +class LabelSmoothedCrossEntropyWithCtcCriterionConfig( + LabelSmoothedCrossEntropyCriterionConfig +): + ctc_weight: float = field(default=1.0, metadata={"help": "weight for CTC loss"}) + + +@register_criterion( + "label_smoothed_cross_entropy_with_ctc", + dataclass=LabelSmoothedCrossEntropyWithCtcCriterionConfig, +) +class LabelSmoothedCrossEntropyWithCtcCriterion(LabelSmoothedCrossEntropyCriterion): + def __init__( + self, + task, + sentence_avg, + label_smoothing, + ignore_prefix_size, + report_accuracy, + ctc_weight, + ): + super().__init__( + task, sentence_avg, label_smoothing, ignore_prefix_size, report_accuracy + ) + self.ctc_weight = ctc_weight + + def forward(self, model, sample, reduce=True): + net_output = model(**sample["net_input"]) + loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) + + ctc_loss = torch.tensor(0.0).type_as(loss) + if self.ctc_weight > 0.0: + ctc_lprobs, ctc_lens = model.get_ctc_output(net_output, sample) + ctc_tgt, ctc_tgt_lens = model.get_ctc_target(sample) + ctc_tgt_mask = lengths_to_mask(ctc_tgt_lens) + ctc_tgt_flat = ctc_tgt.masked_select(ctc_tgt_mask) + reduction = "sum" if reduce else "none" + ctc_loss = ( + F.ctc_loss( + ctc_lprobs, + ctc_tgt_flat, + ctc_lens, + ctc_tgt_lens, + reduction=reduction, + zero_infinity=True, + ) + * self.ctc_weight + ) + loss += ctc_loss + + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + logging_output = { + "loss": utils.item(loss.data), + "nll_loss": utils.item(nll_loss.data), + "ctc_loss": utils.item(ctc_loss.data), + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + if self.report_accuracy: + n_correct, total = self.compute_accuracy(model, net_output, sample) + logging_output["n_correct"] = utils.item(n_correct.data) + logging_output["total"] = utils.item(total.data) + return loss, sample_size, logging_output + + @classmethod + def reduce_metrics(cls, logging_outputs) -> None: + super().reduce_metrics(logging_outputs) + loss_sum = sum(log.get("ctc_loss", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + metrics.log_scalar( + "ctc_loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) diff --git a/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py b/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py new file mode 100644 index 0000000000..47ee263a8d --- /dev/null +++ b/fairseq/criterions/label_smoothed_cross_entropy_with_rdrop.py @@ -0,0 +1,177 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
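For reference on the auxiliary CTC term used by the criterion above, a minimal standalone example of torch.nn.functional.ctc_loss with the tensor shapes the criterion passes it (random toy tensors here; in the criterion the log-probabilities and targets come from model.get_ctc_output and model.get_ctc_target):

    import torch
    import torch.nn.functional as F

    T, B, C = 20, 2, 10          # encoder frames, batch, vocab (index 0 = blank)
    lprobs = torch.randn(T, B, C).log_softmax(-1)   # T x B x C log-probabilities
    input_lengths = torch.tensor([20, 16])          # valid frames per utterance

    target_lengths = torch.tensor([5, 3])
    # Targets flattened after masking out padding, as in the criterion above.
    targets_flat = torch.randint(1, C, (int(target_lengths.sum()),))

    ctc = F.ctc_loss(
        lprobs, targets_flat, input_lengths, target_lengths,
        reduction="sum", zero_infinity=True,
    )
    ctc_weight = 1.0
    # loss = label_smoothed_ce + ctc_weight * ctc  (as combined above)
    print(ctc.item())
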
+ +import math +from dataclasses import dataclass, field + +import torch + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import register_criterion +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, + LabelSmoothedCrossEntropyCriterionConfig, + label_smoothed_nll_loss, +) + + +@dataclass +class RdropLabelSmoothedCrossEntropyCriterionConfig( + LabelSmoothedCrossEntropyCriterionConfig +): + rdrop_alpha: float = field( + default=0.0, + metadata={"help": "alpha for r-drop, 0 means no r-drop"}, + ) + + +@register_criterion( + "label_smoothed_cross_entropy_with_rdrop", + dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig, +) +class RdropLabelSmoothedCrossEntropyCriterion(LabelSmoothedCrossEntropyCriterion): + def __init__( + self, + task, + sentence_avg, + label_smoothing, + ignore_prefix_size=0, + report_accuracy=False, + rdrop_alpha=0.0, + ): + super().__init__( + task, + sentence_avg, + label_smoothing, + ignore_prefix_size=ignore_prefix_size, + report_accuracy=report_accuracy, + ) + self.sentence_avg = sentence_avg + self.eps = label_smoothing + self.ignore_prefix_size = ignore_prefix_size + self.report_accuracy = report_accuracy + self.rdrop_alpha = rdrop_alpha + + def forward(self, model, sample, reduce=True, net_output=None): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + if net_output is None: + if self.rdrop_alpha > 0 and sample["net_input"]["src_tokens"].size( + 0 + ) == sample["target"].size(0): + sample = duplicate_input(sample) + net_output = model(**sample["net_input"]) + loss, nll_loss, rdrop_kl_loss = self.compute_loss( + model, net_output, sample, reduce=reduce + ) + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + logging_output = { + "loss": loss.data, + "nll_loss": nll_loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + if self.report_accuracy: + n_correct, total = self.compute_accuracy(model, net_output, sample) + logging_output["n_correct"] = utils.item(n_correct.data) + logging_output["total"] = utils.item(total.data) + if self.rdrop_alpha > 0: + logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data) + return loss, sample_size, logging_output + + def get_lprobs_and_target(self, model, net_output, sample): + lprobs = model.get_normalized_probs(net_output, log_probs=True) + target = model.get_targets(sample, net_output) + if self.rdrop_alpha > 0 or target.size(0) != lprobs.size(0): + target = torch.cat([target, target.clone()], dim=0) + + if self.ignore_prefix_size > 0: + # lprobs: B x T x C + lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous() + target = target[:, self.ignore_prefix_size :].contiguous() + return lprobs.view(-1, lprobs.size(-1)), target.view(-1) + + def compute_loss(self, model, net_output, sample, reduce=True): + lprobs, target = self.get_lprobs_and_target(model, net_output, sample) + loss, nll_loss = label_smoothed_nll_loss( + lprobs, + target, + self.eps, + ignore_index=self.padding_idx, + reduce=reduce, + ) + + if self.rdrop_alpha > 0: + pad_mask = target[: target.size(0) // 2].unsqueeze(-1).eq(self.padding_idx) + rdrop_kl_loss = compute_kl_loss(model, net_output, pad_mask) + loss += self.rdrop_alpha * rdrop_kl_loss + else: + rdrop_kl_loss = 
loss.new_zeros(1) + return loss, nll_loss, rdrop_kl_loss + + @classmethod + def reduce_metrics(cls, logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + super().reduce_metrics(logging_outputs) + + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + rdrop_kl_loss = utils.item( + sum(log.get("rdrop_kl_loss", 0) for log in logging_outputs) + / sample_size + / math.log(2) + ) + if rdrop_kl_loss > 0: + metrics.log_scalar("rdrop_kl_loss", rdrop_kl_loss) + + +def duplicate_input(sample): + if "net_input" in sample.keys(): + sample_input = sample["net_input"] + else: + sample_input = sample + + for k, v in sample_input.items(): + if isinstance(v, torch.Tensor): + sample_input[k] = torch.cat([v, v.clone()], dim=0) + if "net_input" in sample.keys(): + sample["net_input"] = sample_input + else: + sample = sample_input + return sample + + +def compute_kl_loss(model, net_output, pad_mask=None, reduce=True): + net_prob = model.get_normalized_probs(net_output, log_probs=True) + net_prob_tec = model.get_normalized_probs(net_output, log_probs=False) + + net_prob = net_prob.view(-1, net_prob.size(-1)) + net_prob_tec = net_prob_tec.view(-1, net_prob_tec.size(-1)) + + p, q = torch.split(net_prob, net_prob.size(0) // 2, dim=0) + p_tec, q_tec = torch.split(net_prob_tec, net_prob_tec.size(0) // 2, dim=0) + + p_loss = torch.nn.functional.kl_div(p, q_tec, reduction="none") + q_loss = torch.nn.functional.kl_div(q, p_tec, reduction="none") + + if pad_mask is not None: + p_loss.masked_fill_(pad_mask, 0.0) + q_loss.masked_fill_(pad_mask, 0.0) + + if reduce: + p_loss = p_loss.sum() + q_loss = q_loss.sum() + + loss = (p_loss + q_loss) / 2 + return loss diff --git a/fairseq/criterions/legacy_masked_lm.py b/fairseq/criterions/legacy_masked_lm.py index c70608c5a1..5cf70df2ab 100644 --- a/fairseq/criterions/legacy_masked_lm.py +++ b/fairseq/criterions/legacy_masked_lm.py @@ -7,7 +7,8 @@ import torch import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion diff --git a/fairseq/criterions/masked_lm.py b/fairseq/criterions/masked_lm.py index b04cfbff6d..09ddd9f3e6 100644 --- a/fairseq/criterions/masked_lm.py +++ b/fairseq/criterions/masked_lm.py @@ -3,23 +3,31 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass import math +from omegaconf import II import torch -import torch.nn.functional as F -from fairseq import metrics, modules, utils +from fairseq import modules, utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass -@register_criterion("masked_lm") +@dataclass +class MaskedLmConfig(FairseqDataclass): + tpu: bool = II("common.tpu") + + +@register_criterion("masked_lm", dataclass=MaskedLmConfig) class MaskedLmLoss(FairseqCriterion): """ Implementation for the loss used in masked language model (MLM) training. """ - def __init__(self, task, tpu=False): + def __init__(self, cfg: MaskedLmConfig, task): super().__init__(task) - self.tpu = tpu + self.tpu = cfg.tpu def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. 
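As a standalone illustration of the R-Drop consistency term implemented above (a toy sketch, not the fairseq module): the batch is duplicated, a single forward pass with dropout active gives the two halves different stochastic perturbations, and the symmetric KL divergence between the two resulting distributions is scaled by rdrop_alpha and added to the main loss.

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    model = torch.nn.Sequential(
        torch.nn.Linear(16, 32), torch.nn.ReLU(),
        torch.nn.Dropout(p=0.3), torch.nn.Linear(32, 8),
    )
    model.train()  # keep dropout active so the two halves see different masks

    x = torch.randn(4, 16)
    x2 = torch.cat([x, x.clone()], dim=0)   # duplicate the batch, as in duplicate_input()
    logits = model(x2)

    log_p = F.log_softmax(logits, dim=-1)
    prob = F.softmax(logits, dim=-1)
    p, q = torch.split(log_p, log_p.size(0) // 2, dim=0)
    p_tec, q_tec = torch.split(prob, prob.size(0) // 2, dim=0)

    # Symmetric KL, averaged over both directions (mirrors compute_kl_loss above).
    kl = (
        F.kl_div(p, q_tec, reduction="sum") + F.kl_div(q, p_tec, reduction="sum")
    ) / 2

    rdrop_alpha = 1.0
    # total_loss = task_loss + rdrop_alpha * kl
    print(kl.item())
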
diff --git a/fairseq/criterions/model_criterion.py b/fairseq/criterions/model_criterion.py new file mode 100644 index 0000000000..4c020ddbd2 --- /dev/null +++ b/fairseq/criterions/model_criterion.py @@ -0,0 +1,177 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass, field +from typing import Dict, List + +import torch + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from fairseq.logging.meters import safe_round + + +logger = logging.getLogger(__name__) + + +@dataclass +class ModelCriterionConfig(FairseqDataclass): + loss_weights: Dict[str, float] = field( + default_factory=dict, + metadata={"help": "weights for the loss terms"}, + ) + log_keys: List[str] = field( + default_factory=list, + metadata={"help": "additional output keys to log"}, + ) + can_sum: bool = True + + +@register_criterion("model", dataclass=ModelCriterionConfig) +class ModelCriterion(FairseqCriterion): + """ + This criterion relies on the model to supply losses. + The losses should be a dictionary of name -> scalar returned by + the model either by including it in the net_output dict or by + implementing a get_losses(net_output, sample) method. The final loss is + a scaled sum of all losses according to weights in loss_weights. + If no weights are provided, then all losses are scaled by 1.0. + + The losses will be automatically logged. Additional keys from + net_output dict can be logged via the log_keys parameter. + """ + + def __init__(self, task, loss_weights=None, log_keys=None, can_sum=True): + super().__init__(task) + self.loss_weights = loss_weights + self.log_keys = log_keys + self.can_sum = can_sum + + def forward(self, model, sample, reduce=True): + net_output = model(**sample["net_input"]) + + scaled_losses = {} + + if hasattr(model, "get_losses"): + losses = model.get_losses(net_output, sample) + elif isinstance(net_output, dict) and "losses" in net_output: + losses = net_output["losses"] + else: + raise Exception("Could not retrieve losses") + + for lk, p in losses.items(): + try: + coef = 1.0 if len(self.loss_weights) == 0 else self.loss_weights[lk] + except KeyError: + logger.error( + f"weight for loss {lk} is not in loss_weights ({self.loss_weights})" + ) + raise + if coef != 0 and p is not None: + scaled_losses[lk] = coef * p.float().sum() + + loss = sum(scaled_losses.values()) + + if "sample_size" in net_output: + sample_size = net_output["sample_size"] + else: + sample_size = loss.numel() + + if reduce and loss.numel() > 1: + loss = loss.sum() + + logging_output = { + "loss": loss.data, + "ntokens": sample_size, + "nsentences": sample["id"].numel(), + "sample_size": sample_size, + "_world_size": 1, + } + + for lk in self.log_keys: + if lk in net_output and net_output[lk] is not None: + if not torch.is_tensor(net_output[lk]) or net_output[lk].numel() == 1: + logging_output[lk] = float(net_output[lk]) + elif lk.startswith("_"): + logging_output[lk] = net_output[lk] + else: + for i, v in enumerate(net_output[lk]): + logging_output[f"{lk}_{i}"] = float(v) + + if len(scaled_losses) > 1: + for lk, l in scaled_losses.items(): + if l.numel() > 1: + l = l.sum() + logging_output[f"loss_{lk}"] = l.item() + + if "logs" in net_output: + for lgw in net_output["logs"]: + logging_output[lgw] = 
net_output["logs"][lgw] + + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) + ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) + nsentences = utils.item( + sum(log.get("nsentences", 0) for log in logging_outputs) + ) + sample_size = utils.item( + sum(log.get("sample_size", 0) for log in logging_outputs) + ) + + metrics.log_scalar("loss", loss_sum / sample_size, sample_size, round=3) + metrics.log_scalar("ntokens", ntokens) + metrics.log_scalar("nsentences", nsentences) + metrics.log_scalar("sample_size", sample_size) + + builtin_keys = { + "loss", + "ntokens", + "nsentences", + "sample_size", + "_world_size", + } + + world_size = utils.item( + sum(log.get("_world_size", 0) for log in logging_outputs) + ) + + for k in logging_outputs[0]: + if k not in builtin_keys and not k.startswith("_"): + val = sum(log.get(k, 0) for log in logging_outputs) + if k.startswith("loss_"): + metrics.log_scalar(k, val / sample_size, sample_size, round=3) + else: + metrics.log_scalar(k, val / world_size, round=3) + + correct = sum(log.get("correct", 0) for log in logging_outputs) + total = sum(log.get("count", 0) for log in logging_outputs) + + if total > 0: + metrics.log_scalar("_correct", correct) + metrics.log_scalar("_total", total) + + metrics.log_derived( + "accuracy", + lambda meters: safe_round( + meters["_correct"].sum / meters["_total"].sum, 5 + ) + if meters["_total"].sum > 0 + else float("nan"), + ) + + def logging_outputs_can_be_summed(self) -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. 
+ """ + return self.can_sum diff --git a/fairseq/criterions/nat_loss.py b/fairseq/criterions/nat_loss.py index cdc7da861d..fc0bdaf851 100644 --- a/fairseq/criterions/nat_loss.py +++ b/fairseq/criterions/nat_loss.py @@ -7,28 +7,29 @@ import torch import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass from torch import Tensor +from dataclasses import dataclass, field -@register_criterion("nat_loss") + +@dataclass +class LabelSmoothedDualImitationCriterionConfig(FairseqDataclass): + label_smoothing: float = field( + default=0.0, + metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"}, + ) + + +@register_criterion("nat_loss", dataclass=LabelSmoothedDualImitationCriterionConfig) class LabelSmoothedDualImitationCriterion(FairseqCriterion): def __init__(self, task, label_smoothing): super().__init__(task) self.label_smoothing = label_smoothing - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - parser.add_argument( - "--label-smoothing", - default=0.0, - type=float, - metavar="D", - help="epsilon for label smoothing, 0 means no label smoothing", - ) - def _compute_loss( self, outputs, targets, masks=None, label_smoothing=0.0, name="loss", factor=1.0 ): diff --git a/fairseq/criterions/sentence_prediction.py b/fairseq/criterions/sentence_prediction.py index 9519fdc56d..298b805768 100644 --- a/fairseq/criterions/sentence_prediction.py +++ b/fairseq/criterions/sentence_prediction.py @@ -4,27 +4,79 @@ # LICENSE file in the root directory of this source tree. import math +from dataclasses import dataclass, field +from itertools import chain +import numpy as np import torch import torch.nn.functional as F -from fairseq import metrics, utils +from sklearn.metrics import f1_score +from sklearn.metrics import matthews_corrcoef as _matthews_corrcoef +from scipy.stats import pearsonr, spearmanr + +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from fairseq.logging.meters import safe_round + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } -@register_criterion("sentence_prediction") +def matthews_corrcoef(preds, labels): + # make it consistent with other metrics taking (preds, labels) as input + mcc = _matthews_corrcoef(labels, preds) + return mcc + + +@dataclass +class SentencePredictionConfig(FairseqDataclass): + classification_head_name: str = field( + default="sentence_classification_head", + metadata={"help": "name of the classification head to use"}, + ) + regression_target: bool = field( + default=False, + ) + report_mcc: bool = False + report_acc_and_f1: bool = False + report_pearson_and_spearman: bool = False + + +@register_criterion("sentence_prediction", dataclass=SentencePredictionConfig) class SentencePredictionCriterion(FairseqCriterion): - def __init__(self, task, 
classification_head_name, regression_target): + def __init__(self, cfg: SentencePredictionConfig, task): super().__init__(task) - self.classification_head_name = classification_head_name - self.regression_target = regression_target - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--classification-head-name', - default='sentence_classification_head', - help='name of the classification head to use') - # fmt: on + self.classification_head_name = cfg.classification_head_name + self.regression_target = cfg.regression_target + self.keep_pred_and_targ = ( + cfg.report_mcc or cfg.report_acc_and_f1 or cfg.report_pearson_and_spearman + ) + self.report_mcc = cfg.report_mcc + self.report_acc_and_f1 = cfg.report_acc_and_f1 + self.report_pearson_and_spearman = cfg.report_pearson_and_spearman + self.label_dict = task.label_dictionary def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. @@ -49,21 +101,62 @@ def forward(self, model, sample, reduce=True): if not self.regression_target: lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) - loss = F.nll_loss(lprobs, targets, reduction="sum") + task_loss = F.nll_loss(lprobs, targets, reduction="sum") else: logits = logits.view(-1).float() targets = targets.float() - loss = F.mse_loss(logits, targets, reduction="sum") - - logging_output = { - "loss": loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample_size, - "sample_size": sample_size, - } + task_loss = F.mse_loss(logits, targets, reduction="sum") + + logging_output = {} + loss = task_loss + # mha & ffn regularization update + if ( + hasattr(model, "args") + and hasattr(model.args, "mha_reg_scale_factor") + and model.args.mha_reg_scale_factor != 0.0 + ): + mha_reg_loss = model._get_adaptive_head_loss() + loss += mha_reg_loss + logging_output.update({"mha_reg_loss": mha_reg_loss}) + if ( + hasattr(model, "args") + and hasattr(model.args, "ffn_reg_scale_factor") + and model.args.ffn_reg_scale_factor != 0.0 + ): + ffn_reg_loss = model._get_adaptive_ffn_loss() + loss += ffn_reg_loss + logging_output.update({"ffn_reg_loss": ffn_reg_loss}) + + logging_output.update( + { + "loss": loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample_size, + "sample_size": sample_size, + } + ) if not self.regression_target: preds = logits.argmax(dim=1) logging_output["ncorrect"] = (preds == targets).sum() + if self.keep_pred_and_targ and not model.training: + if self.regression_target: + logging_output["pred"] = logits.detach().cpu().tolist() + logging_output["targ"] = targets.detach().cpu().tolist() + else: + # remove offset `self.label_dict.nspecial` from OffsetTokensDataset + preds = self.label_dict.string(preds + self.label_dict.nspecial).split() + targets = self.label_dict.string( + targets + self.label_dict.nspecial + ).split() + logging_output["pred"] = list(map(int, preds)) + logging_output["targ"] = list(map(int, targets)) + + if self.report_mcc: + logging_output["report_mcc"] = True + if self.report_acc_and_f1: + logging_output["report_acc_and_f1"] = True + if self.report_pearson_and_spearman: + logging_output["report_pearson_and_spearman"] = True return loss, sample_size, logging_output @@ -74,10 +167,26 @@ def reduce_metrics(logging_outputs) -> None: ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + mha_reg_loss_sum = sum(log.get("mha_reg_loss", 0) for log in 
logging_outputs) + ffn_reg_loss_sum = sum(log.get("ffn_reg_loss", 0) for log in logging_outputs) metrics.log_scalar( "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) + if mha_reg_loss_sum: + metrics.log_scalar( + "mha_reg_loss", + mha_reg_loss_sum / sample_size / math.log(2), + sample_size, + round=3, + ) + if ffn_reg_loss_sum: + metrics.log_scalar( + "ffn_reg_loss", + ffn_reg_loss_sum / sample_size / math.log(2), + sample_size, + round=3, + ) if sample_size != ntokens: metrics.log_scalar( "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 @@ -89,6 +198,86 @@ def reduce_metrics(logging_outputs) -> None: "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 ) + # Metrics used by GLUE + pred = np.array( + list(chain.from_iterable(log.get("pred", []) for log in logging_outputs)) + ) + targ = np.array( + list(chain.from_iterable(log.get("targ", []) for log in logging_outputs)) + ) + if len(pred): + metrics.log_concat_tensor("pred", torch.from_numpy(pred), dim=0) + metrics.log_concat_tensor("targ", torch.from_numpy(targ), dim=0) + if any("report_mcc" in log for log in logging_outputs): + metrics.log_derived( + "mcc", + lambda meters: safe_round( + matthews_corrcoef( + meters["pred"].tensor.numpy(), + meters["targ"].tensor.numpy(), + ) + * 100, + 1, + ), + ) + if any("report_acc_and_f1" in log for log in logging_outputs): + metrics.log_derived( + "acc_and_f1", + lambda meters: safe_round( + acc_and_f1( + meters["pred"].tensor.numpy(), + meters["targ"].tensor.numpy(), + )["acc_and_f1"] + * 100, + 1, + ), + ) + metrics.log_derived( + "f1", + lambda meters: safe_round( + acc_and_f1( + meters["pred"].tensor.numpy(), + meters["targ"].tensor.numpy(), + )["f1"] + * 100, + 1, + ), + ) + if any("report_pearson_and_spearman" in log for log in logging_outputs): + metrics.log_derived( + "pearson_and_spearman", + lambda meters: safe_round( + pearson_and_spearman( + meters["pred"].tensor.numpy(), + meters["targ"].tensor.numpy(), + )["corr"] + * 100, + 1, + ), + ) + metrics.log_derived( + "pearson", + lambda meters: safe_round( + pearson_and_spearman( + meters["pred"].tensor.numpy(), + meters["targ"].tensor.numpy(), + )["pearson"] + * 100, + 1, + ), + ) + metrics.log_derived( + "spearman", + lambda meters: safe_round( + pearson_and_spearman( + meters["pred"].tensor.numpy(), + meters["targ"].tensor.numpy(), + )["spearmanr"] + * 100, + 1, + ), + ) + @staticmethod def logging_outputs_can_be_summed() -> bool: """ diff --git a/fairseq/criterions/sentence_prediction_adapters.py b/fairseq/criterions/sentence_prediction_adapters.py new file mode 100644 index 0000000000..8a873a45b3 --- /dev/null +++ b/fairseq/criterions/sentence_prediction_adapters.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn.functional as F +from fairseq.criterions import register_criterion +from fairseq.criterions.sentence_prediction import ( + SentencePredictionCriterion, + SentencePredictionConfig, +) + + +@register_criterion("sentence_prediction_adapters", dataclass=SentencePredictionConfig) +class SentencePredictionCriterionAdapters(SentencePredictionCriterion): + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + assert ( + hasattr(model, "classification_heads") + and self.classification_head_name in model.classification_heads + ), "model must provide sentence classification head for --criterion=sentence_prediction" + + if not hasattr(sample, "lang_id"): + # If no language ID is given, we fall back to English + lang_id = ["en_XX"] * sample["nsentences"] + else: + lang_id = sample["lang_id"] + + logits, _ = model( + **sample["net_input"], + features_only=True, + classification_head_name=self.classification_head_name, + lang_id=lang_id, + ) + targets = model.get_targets(sample, [logits]).view(-1) + sample_size = targets.numel() + + if not self.regression_target: + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) + loss = F.nll_loss(lprobs, targets, reduction="sum") + else: + logits = logits.view(-1).float() + targets = targets.float() + loss = F.mse_loss(logits, targets, reduction="sum") + + logging_output = { + "loss": loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample_size, + "sample_size": sample_size, + } + if not self.regression_target: + preds = logits.argmax(dim=1) + logging_output["ncorrect"] = (preds == targets).sum() + + return loss, sample_size, logging_output diff --git a/fairseq/criterions/sentence_ranking.py b/fairseq/criterions/sentence_ranking.py index d4c76341d4..bfb9f058f9 100644 --- a/fairseq/criterions/sentence_ranking.py +++ b/fairseq/criterions/sentence_ranking.py @@ -7,7 +7,8 @@ import torch import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion diff --git a/fairseq/criterions/speech_dlm_criterion.py b/fairseq/criterions/speech_dlm_criterion.py new file mode 100644 index 0000000000..8888180114 --- /dev/null +++ b/fairseq/criterions/speech_dlm_criterion.py @@ -0,0 +1,335 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
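+# Weighted multi-stream criterion for the SpeechDLM model: it combines the
+# next-unit, edge-unit and edge-duration prediction losses over every
+# (channel, predicted-channel) pair, with separate main- and cross-channel weights.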
+ +import math +from dataclasses import dataclass, field +from typing import Optional + +import torch.nn.functional as F +from fairseq import metrics, utils +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from omegaconf import II + + +@dataclass +class SpeechDLMCriterionConfig(FairseqDataclass): + sentence_avg: bool = II("optimization.sentence_avg") + main_and_cross_weights: Optional[str] = field( + default="1,0", + metadata={ + "help": "Comma-separated list of weights of Main-channel vs Cross-channel Prediction Losses" + "(default: 1,0)" + }, + ) + general_unit_loss_weight: float = field( + default=0, + metadata={ + "help": "The weight of the General Prediction Loss (Next-step Unit Prediction Loss)" + "(default: 0)" + }, + ) + edge_unit_loss_weight: float = field( + default=1, + metadata={"help": "The weight of the Edge Unit Prediction Loss" "(default: 1)"}, + ) + duration_loss_weight: float = field( + default=1, + metadata={ + "help": "The weight of the Edge Unit Duration Prediction Loss" + "(default: 1)" + }, + ) + + +@register_criterion("speech_dlm_criterion", dataclass=SpeechDLMCriterionConfig) +class SpeechDLMCriterion(FairseqCriterion): + """Criteron for the SpeechDLM model as described in the paper: + https://arxiv.org/pdf/2203.16502.pdf + + There are 3 possible losses depending on the targets of the model: + - general_unit_loss : The next unit prediction loss, corresponding to + 'next' target + - edge_unit_loss : The edge unit prediction loss, corresponding to + 'edge' target + - duration_loss : The duration prediction loss, corresponding to + 'duration' target + """ + + def __init__( + self, + task, + sentence_avg, + main_and_cross_weights, + general_unit_loss_weight, + edge_unit_loss_weight, + duration_loss_weight, + ): + super().__init__(task) + self.sentence_avg = sentence_avg + + self.channels = task.channels + self.targets = task.targets + self.delayed_duration_target = task.delayed_duration_target + + self.main_channel_weight = float(main_and_cross_weights.split(",")[0]) + self.cross_channel_weight = float(main_and_cross_weights.split(",")[1]) + assert self.main_channel_weight >= 0 and self.cross_channel_weight >= 0 + + self.channel_weights = { + channel: weight + for channel, weight in zip(self.channels, task.channel_weights) + } + + self.target_weights = {} + for t in self.targets: + if t == "next": + self.target_weights[t] = general_unit_loss_weight + assert ( + general_unit_loss_weight > 0 + ), "Expect a positive --general-unit-loss-weight for next unit prediction" + elif t == "edge": + self.target_weights[t] = edge_unit_loss_weight + assert ( + edge_unit_loss_weight > 0 + ), "Expect a positive --edge-unit-loss-weight for edge unit prediction" + elif t == "duration": + self.target_weights[t] = duration_loss_weight + assert ( + duration_loss_weight > 0 + ), "Expect a positive --duration-loss-weight for duration prediction" + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + net_output = model(**sample["net_input"]) + loss_dict, stats_dict = self.compute_loss( + model, net_output, sample, reduce=reduce + ) + nsentences = sample["net_input"]["src_tokens"][self.channels[0]].size(0) + + logging_output = { + "nsentences": nsentences, + } + logging_output["nsentences"] = nsentences + + loss_all = {t: 0 for t in self.targets} + correct_all = {t: 0 for t in self.targets} + count_all = {t: 0 for t in self.targets} + ntokens_all = 0 + sample_size_all = 0 + for channel in loss_dict: + for pred_channel in loss_dict[channel]: + # Get ntokens & sample_size + ntokens = sample["net_input"]["src_tokens"][channel].numel() + sample_size = nsentences if self.sentence_avg else ntokens + prefix = "[{}-{}]".format(channel, pred_channel) + log_keys = { + "next": "general_token", + "edge": "edge_token", + "duration": "edge_duration", + } + + # Log & Update the sizes + logging_output["{}ntokens".format(prefix)] = ntokens + logging_output["{}sample_size".format(prefix)] = sample_size + ntokens_all += ntokens + sample_size_all += sample_size + + for t in self.targets: + log_key = log_keys[t] + loss = loss_dict[channel][pred_channel][t] + correct, count = stats_dict[channel][pred_channel][t] + + # Log the statistics + logging_output["{}{}_loss".format(prefix, log_key)] = loss.data + logging_output["{}{}_correct".format(prefix, log_key)] = correct + logging_output["{}{}_count".format(prefix, log_key)] = count + + # Scale the training loss by weights + target_loss = loss * self.channel_weights[channel] + if pred_channel == channel: + target_loss = target_loss * self.main_channel_weight + else: + target_loss = target_loss * self.cross_channel_weight + # Normalize the losses in the training by the number of edges + if t in ["edge", "duration"]: + target_loss = target_loss / count * sample_size + + # Update the statistics + loss_all[t] += target_loss + correct_all[t] += correct + count_all[t] += count + + # Logging the average statistics + logging_output["ntokens"] = ntokens_all + logging_output["sample_size"] = sample_size_all + for t in self.targets: + log_key = { + "next": "general_token", + "edge": "edge_token", + "duration": "edge_duration", + }[t] + logging_output["{}_loss".format(log_key)] = loss_all[t].data + logging_output["{}_correct".format(log_key)] = correct_all[t] + logging_output["{}_count".format(log_key)] = count_all[t] + + # Define the training loss + training_loss = 0 + for t in self.targets: + training_loss += loss_all[t] * self.target_weights[t] + logging_output["loss"] = training_loss.data + + return training_loss, sample_size_all, logging_output + + def compute_loss(self, model, net_output, sample, reduce=True): + # Get the model outputs and target + lprobs_dict = model.get_normalized_probs(net_output, log_probs=True) + target_dict = model.get_targets(sample, net_output) + + # Init the dictionaries + loss_dict, stats_dict = {}, {} + + for channel in lprobs_dict: + # Init the dictionaries + loss_dict[channel], stats_dict[channel] = {}, {} + + for pred_channel in lprobs_dict[channel]: + # Init the dictionaries + loss_dict[channel][pred_channel] = {} + stats_dict[channel][pred_channel] = {} + + # Get token & duration predictions + outputs = lprobs_dict[channel][pred_channel] + if not isinstance(outputs, dict): + token_lprobs = outputs + else: + token_lprobs = outputs["pred_token"] + dur_preds = 
outputs["pred_duration"] + dur_preds = dur_preds.view(-1) + token_lprobs = token_lprobs.view(-1, token_lprobs.size(-1)) + token_preds = token_lprobs.argmax(dim=-1) + + # Get edge indices + if "edge" in self.targets or "duration" in self.targets: + edge_indices = target_dict["edge_indices"][pred_channel] + + # Compute loss and statistics + for t in self.targets: + if t in ["next", "edge"]: + if t == "next": + target = target_dict["next"][pred_channel].view(-1) + lprobs = token_lprobs + preds = token_preds + elif t == "edge": + target = target_dict["edge"][pred_channel] + lprobs = token_lprobs[edge_indices] + preds = token_preds[edge_indices] + + loss = F.nll_loss( + lprobs, + target, + ignore_index=self.padding_idx, + reduction="sum" if reduce else "none", + ) + elif t == "duration": + target = target_dict["duration"][pred_channel] + if self.delayed_duration_target: + duration_indices = edge_indices + 1 + if duration_indices[-1] == len(dur_preds): + duration_indices = duration_indices[:-1] + target = target[:-1] + else: + duration_indices = edge_indices + preds = dur_preds[duration_indices] + + loss = F.l1_loss( + preds, + target, + reduction="sum" if reduce else "none", + ) + preds = preds.round() + + correct = (preds == target).sum().float().cpu().item() + count = float(target.size(0)) + + loss_dict[channel][pred_channel][t] = loss + stats_dict[channel][pred_channel][t] = (correct, count) + + return loss_dict, stats_dict + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + logging_keys = next(iter(logging_outputs)).keys() + channels = [item[:-7] for item in logging_keys if item.endswith("ntokens")] + target_prefixes = set( + [ + item[:-5].split("]")[-1] + for item in logging_keys + if item.endswith("_loss") + ] + ) + for channel_prefix in channels: + for target_prefix in target_prefixes: + prefix = "{}{}".format(channel_prefix, target_prefix) + count_sum = sum( + log.get("{}_count".format(prefix), 0) for log in logging_outputs + ) + correct_sum = sum( + log.get("{}_correct".format(prefix), 0) for log in logging_outputs + ) + loss_sum = sum( + log.get("{}_loss".format(prefix), 0) for log in logging_outputs + ) + + if "duration" not in target_prefix: + # we divide by log(2) to convert the loss from base e to base 2 + metrics.log_scalar( + "{}_loss".format(prefix), + loss_sum / count_sum / math.log(2), + count_sum, + round=3, + ) + metrics.log_derived( + "{}_ppl".format(prefix), + lambda meters, prefix=prefix: utils.get_perplexity( + meters["{}_loss".format(prefix)].avg + ), + ) + else: + # for duration we don't need to divide by log(2) + metrics.log_scalar( + "{}_loss".format(prefix), + loss_sum / count_sum, + count_sum, + round=3, + ) + + accuracy = 100 * correct_sum / count_sum + metrics.log_scalar("{}_pred_acc".format(prefix), accuracy, round=3) + + # Logging training loss + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + + # we divide by log(2) to convert the loss from base e to base 2 + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. 
+ """ + return True diff --git a/fairseq/criterions/speech_to_speech_criterion.py b/fairseq/criterions/speech_to_speech_criterion.py new file mode 100644 index 0000000000..06a8252140 --- /dev/null +++ b/fairseq/criterions/speech_to_speech_criterion.py @@ -0,0 +1,517 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import math +from collections import OrderedDict + +import torch + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import register_criterion +from fairseq.criterions.ctc import CtcCriterion +from fairseq.criterions.label_smoothed_cross_entropy_with_rdrop import ( + RdropLabelSmoothedCrossEntropyCriterion, + RdropLabelSmoothedCrossEntropyCriterionConfig, + duplicate_input, +) +from fairseq.criterions.tacotron2_loss import ( + Tacotron2Criterion, + Tacotron2CriterionConfig, +) + +logger = logging.getLogger(__name__) + + +class MultitaskCriterion: + def __init__(self, multitask_tasks, rdrop_alpha=0.0): + self.rdrop_alpha = rdrop_alpha + self.rdrop_alpha_mtl = rdrop_alpha + + self.multitask_criterion = OrderedDict() + self.multitask_loss_weight = OrderedDict() + for task_name, task_obj in multitask_tasks.items(): + if task_obj.args.get_loss_weight(0) == 0: + logger.info(f"Skip {task_name} loss criterion") + continue + + rdrop_alpha_task = task_obj.args.rdrop_alpha + if rdrop_alpha_task is None: + rdrop_alpha_task = rdrop_alpha + self.rdrop_alpha_mtl = rdrop_alpha_task + logger.info(f"rdrop_alpha is set to {rdrop_alpha_task} for {task_name}") + + if task_obj.args.decoder_type == "ctc": + self.multitask_criterion[task_name] = CtcCriterion( + task_obj.args.criterion_cfg, + task_obj, + rdrop_alpha=rdrop_alpha_task, + ) + else: + self.multitask_criterion[ + task_name + ] = RdropLabelSmoothedCrossEntropyCriterion( + task_obj, + task_obj.args.criterion_cfg.sentence_avg, + label_smoothing=task_obj.args.criterion_cfg.label_smoothing, + rdrop_alpha=rdrop_alpha_task, + ) + + def set_multitask_loss_weight(self, task_name, weight=0.0): + self.multitask_loss_weight[task_name] = weight + + def get_multitask_loss(self, model, sample, model_out): + logging_output = {} + loss = 0.0 + for task_name, task_criterion in self.multitask_criterion.items(): + layer_id = task_criterion.task.args.input_layer + if isinstance(task_criterion, CtcCriterion): + if task_criterion.task.args.input_from == "encoder": + if len(model_out["encoder_padding_mask"]) > 0: + non_padding_mask = ~model_out["encoder_padding_mask"][0] + input_lengths = non_padding_mask.long().sum(-1) + else: + out = model_out["encoder_states"][layer_id] + input_lengths = out.new_full( + (out.shape[1],), out.shape[0] + ).long() + + task_sample = { + "net_input": { + "src_tokens": model_out["encoder_states"][ + layer_id + ], # check batch idx + "src_lengths": input_lengths, + }, + "id": sample["id"], + } + else: + task_sample = { + "net_input": { + "src_tokens": model_out["inner_states"][layer_id], + "src_lengths": sample["target_lengths"], + }, + "id": sample["id"], + } + else: + task_sample = { + "net_input": { + "src_tokens": sample["multitask"][task_name]["net_input"][ + "prev_output_tokens" + ], + "encoder_out": { + "encoder_out": [model_out["encoder_states"][layer_id]], + "encoder_padding_mask": model_out["encoder_padding_mask"], + }, + } + } + + for key in ["target", "target_lengths", "ntokens"]: + task_sample[key] = sample["multitask"][task_name][key] 
+ + if task_name == getattr(model, "mt_task_name", None): + decoder_out = model_out["mt_decoder_out"] + else: + decoder_out = None + task_loss, task_sample_size, task_logging_output = task_criterion( + model.multitask_decoders[task_name], task_sample, net_output=decoder_out + ) + + loss = loss + self.multitask_loss_weight[task_name] * task_loss + task_logging_output["loss_weight"] = self.multitask_loss_weight[task_name] + logging_output[task_name] = task_logging_output + return loss, logging_output + + @classmethod + def reduce_metrics(cls, logging_outputs) -> None: + for task_name in logging_outputs[0]["multitask"].keys(): + # different criterion may return different logging + # currently only reduce on loss, the most common one + # ideally the way that losses are reduced should also depend on the task type + loss_sum = sum( + log["multitask"][task_name].get("loss", 0) for log in logging_outputs + ) + sample_size = sum( + log["multitask"][task_name].get("sample_size", 0) + for log in logging_outputs + ) + + metrics.log_scalar( + f"multitask_{task_name}_loss", + loss_sum / sample_size / math.log(2), + sample_size, + round=3, + ) + + loss_weight = logging_outputs[0]["multitask"][task_name].get( + "loss_weight", 0 + ) + metrics.log_scalar( + f"multitask_{task_name}_loss_weight", + loss_weight, + weight=0, + priority=250, + ) + + +@register_criterion( + "speech_to_unit", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig +) +class SpeechToUnitMultitaskTaskCriterion( + RdropLabelSmoothedCrossEntropyCriterion, MultitaskCriterion +): + def __init__( + self, + task, + sentence_avg, + label_smoothing, + ignore_prefix_size=0, + report_accuracy=False, + rdrop_alpha=0.0, + ): + super().__init__( + task, + sentence_avg, + label_smoothing, + ignore_prefix_size, + report_accuracy, + rdrop_alpha, + ) + MultitaskCriterion.__init__(self, task.multitask_tasks, rdrop_alpha) + + def forward(self, model, sample, reduce=True): + net_input_concat = { + "src_tokens": sample["net_input"]["src_tokens"], + "src_lengths": sample["net_input"]["src_lengths"], + "prev_output_tokens": sample["net_input"]["prev_output_tokens"], + "tgt_speaker": sample["net_input"].get("tgt_speaker", None), + "return_all_hiddens": True, + } + + if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0: + net_input_concat = duplicate_input(net_input_concat) + + net_output, extra = model(**net_input_concat) + loss, nll_loss, rdrop_kl_loss = self.compute_loss( + model, [net_output], sample, reduce=reduce + ) + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + logging_output = { + "loss": loss.data, + "nll_loss": nll_loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + if self.report_accuracy: + n_correct, total = self.compute_accuracy(model, [net_output], sample) + logging_output["n_correct"] = utils.item(n_correct.data) + logging_output["total"] = utils.item(total.data) + if self.rdrop_alpha > 0: + logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data) + + if len(self.multitask_criterion) == 0: + return loss, sample_size, logging_output + + # multitask + multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra) + loss += multitask_loss + logging_output["multitask"] = multitask_log + + return loss, sample_size, logging_output + + @classmethod + def reduce_metrics(cls, logging_outputs) -> None: + super().reduce_metrics(logging_outputs) + + # inference metrics + if "targ_frames" in logging_outputs[0]: + n = 
sum(log.get("norm_frames", 0) for log in logging_outputs) + for key, new_key in [ + ("mcd_loss", "mcd_loss"), + ("pred_frames", "pred_ratio"), + ("nins", "ins_rate"), + ("ndel", "del_rate"), + ]: + val = sum(log.get(key, 0) for log in logging_outputs) + metrics.log_scalar(new_key, val / n, n, round=3) + + if "multitask" not in logging_outputs[0]: + return + + MultitaskCriterion.reduce_metrics(logging_outputs) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return False + + +@register_criterion( + "speech_to_unit_2pass", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig +) +class SpeechToUnit2passMultitaskTaskCriterion(SpeechToUnitMultitaskTaskCriterion): + def __init__( + self, + task, + sentence_avg, + label_smoothing, + ignore_prefix_size=0, + report_accuracy=False, + rdrop_alpha=0.0, + ): + super().__init__( + task, + sentence_avg, + label_smoothing, + ignore_prefix_size, + report_accuracy, + rdrop_alpha, + ) + + def forward(self, model, sample, reduce=True): + net_input_concat = { + "src_tokens": sample["net_input"]["src_tokens"], + "src_lengths": sample["net_input"]["src_lengths"], + "prev_output_tokens": sample["net_input"]["prev_output_tokens"], + "prev_output_tokens_mt": sample["multitask"][model.mt_task_name][ + "net_input" + ]["prev_output_tokens"], + "tgt_speaker": sample["net_input"].get("tgt_speaker", None), + "return_all_hiddens": True, + } + if getattr(model, "asr_task_name", None) is not None: + net_input_concat["prev_output_tokens_asr"] = sample["multitask"][ + model.asr_task_name + ]["net_input"]["prev_output_tokens"] + + if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0: + net_input_concat = duplicate_input(net_input_concat) + + net_output, extra = model(**net_input_concat) + loss, nll_loss, rdrop_kl_loss = self.compute_loss( + model, [net_output], sample, reduce=reduce + ) + + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + logging_output = { + "loss": loss.data, + "nll_loss": nll_loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + if self.report_accuracy: + n_correct, total = self.compute_accuracy(model, [net_output], sample) + logging_output["n_correct"] = utils.item(n_correct.data) + logging_output["total"] = utils.item(total.data) + if self.rdrop_alpha > 0: + logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data) + + if len(self.multitask_criterion) == 0: + return loss, sample_size, logging_output + + # multitask + multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra) + loss += multitask_loss + logging_output["multitask"] = multitask_log + + return loss, sample_size, logging_output + + +@register_criterion("speech_to_spectrogram", dataclass=Tacotron2CriterionConfig) +class SpeechToSpectrogramMultitaskTaskCriterion(Tacotron2Criterion, MultitaskCriterion): + def __init__( + self, + task, + sentence_avg, + use_guided_attention_loss, + guided_attention_loss_sigma, + bce_pos_weight, + ctc_weight, + ): + super().__init__( + task, + sentence_avg, + use_guided_attention_loss, + guided_attention_loss_sigma, + bce_pos_weight, + ctc_weight, + ) + MultitaskCriterion.__init__(self, task.multitask_tasks) + + def forward(self, model, sample, reduction="mean"): + bsz, max_len, _ = sample["target"].size() + feat_tgt = 
sample["target"] + feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len) + eos_tgt = torch.arange(max_len).to(sample["target"].device) + eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1) + eos_tgt = (eos_tgt == (feat_len - 1)).float() + + feat_out, eos_out, extra = model( + src_tokens=sample["net_input"]["src_tokens"], + src_lengths=sample["net_input"]["src_lengths"], + prev_output_tokens=sample["net_input"]["prev_output_tokens"], + tgt_speaker=sample["net_input"]["tgt_speaker"], + target_lengths=sample["target_lengths"], + return_all_hiddens=True, + ) + + l1_loss, mse_loss, eos_loss = self.compute_loss( + extra["feature_out"], + feat_out, + eos_out, + feat_tgt, + eos_tgt, + sample["target_lengths"], + reduction, + ) + attn_loss = torch.tensor(0.0).type_as(l1_loss) + if self.guided_attn is not None: + attn_loss = self.guided_attn( + extra["attn"], + sample["net_input"]["src_lengths"], + sample["target_lengths"], + reduction, + ) + loss = ( + l1_loss + mse_loss + eos_loss + attn_loss + ) # do not include ctc loss as there's no text target + + sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"] + logging_output = { + "loss": utils.item(loss.data), + "ntokens": sample["ntokens"], + "nsentences": sample["nsentences"], + "sample_size": sample_size, + "l1_loss": utils.item(l1_loss.data), + "mse_loss": utils.item(mse_loss.data), + "eos_loss": utils.item(eos_loss.data), + "attn_loss": utils.item(attn_loss.data), + } + + if len(self.multitask_criterion) == 0: + return loss, sample_size, logging_output + + # multitask + multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra) + loss += multitask_loss + logging_output["multitask"] = multitask_log + return loss, sample_size, logging_output + + @classmethod + def reduce_metrics(cls, logging_outputs) -> None: + super().reduce_metrics(logging_outputs) + + # inference metrics + if "targ_frames" in logging_outputs[0]: + n = sum(log.get("norm_frames", 0) for log in logging_outputs) + for key, new_key in [ + ("mcd_loss", "mcd_loss"), + ("pred_frames", "pred_ratio"), + ("nins", "ins_rate"), + ("ndel", "del_rate"), + ]: + val = sum(log.get(key, 0) for log in logging_outputs) + metrics.log_scalar(new_key, val / n, n, round=3) + + if "multitask" not in logging_outputs[0]: + return + + MultitaskCriterion.reduce_metrics(logging_outputs) + + +@register_criterion("speech_to_spectrogram_2pass", dataclass=Tacotron2CriterionConfig) +class SpeechToSpectrogram2passMultitaskTaskCriterion( + SpeechToSpectrogramMultitaskTaskCriterion +): + def __init__( + self, + task, + sentence_avg, + use_guided_attention_loss, + guided_attention_loss_sigma, + bce_pos_weight, + ctc_weight, + ): + super().__init__( + task, + sentence_avg, + use_guided_attention_loss, + guided_attention_loss_sigma, + bce_pos_weight, + ctc_weight, + ) + + def forward(self, model, sample, reduction="mean"): + bsz, max_len, _ = sample["target"].size() + feat_tgt = sample["target"] + feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len) + eos_tgt = torch.arange(max_len).to(sample["target"].device) + eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1) + eos_tgt = (eos_tgt == (feat_len - 1)).float() + + feat_out, eos_out, extra = model( + src_tokens=sample["net_input"]["src_tokens"], + src_lengths=sample["net_input"]["src_lengths"], + prev_output_tokens=sample["net_input"]["prev_output_tokens"], + prev_output_tokens_mt=sample["multitask"][model.mt_task_name]["net_input"][ + "prev_output_tokens" + ], + 
tgt_speaker=sample["net_input"]["tgt_speaker"], + target_lengths=sample["target_lengths"], + return_all_hiddens=True, + ) + + l1_loss, mse_loss, eos_loss = self.compute_loss( + extra["feature_out"], + feat_out, + eos_out, + feat_tgt, + eos_tgt, + sample["target_lengths"], + reduction, + ) + attn_loss = torch.tensor(0.0).type_as(l1_loss) + if self.guided_attn is not None: + attn_loss = self.guided_attn( + extra["attn"], + sample["net_input"]["src_lengths"], + sample["target_lengths"], + reduction, + ) + loss = ( + l1_loss + mse_loss + eos_loss + attn_loss + ) # do not include ctc loss as there's no text target + + sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"] + logging_output = { + "loss": utils.item(loss.data), + "ntokens": sample["ntokens"], + "nsentences": sample["nsentences"], + "sample_size": sample_size, + "l1_loss": utils.item(l1_loss.data), + "mse_loss": utils.item(mse_loss.data), + "eos_loss": utils.item(eos_loss.data), + "attn_loss": utils.item(attn_loss.data), + } + + if len(self.multitask_criterion) == 0: + return loss, sample_size, logging_output + + # multitask + multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra) + loss += multitask_loss + logging_output["multitask"] = multitask_log + return loss, sample_size, logging_output diff --git a/fairseq/criterions/speech_ulm_criterion.py b/fairseq/criterions/speech_ulm_criterion.py new file mode 100644 index 0000000000..eea74bae26 --- /dev/null +++ b/fairseq/criterions/speech_ulm_criterion.py @@ -0,0 +1,126 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from dataclasses import dataclass, field + +import torch.nn.functional as F +from fairseq.logging import metrics +from fairseq.tasks import FairseqTask +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from omegaconf import II + + +@dataclass +class SpeechUnitLmCriterionConfig(FairseqDataclass): + sentence_avg: bool = II("optimization.sentence_avg") + loss_weights: str = field( + default="1.;0.0;0.0", + metadata={ + "help": "Weights of the losses that correspond to token, duration, and F0 streams" + }, + ) + discrete_duration: bool = II("task.discrete_duration") + discrete_f0: bool = II("task.discrete_f0") + + +def mae_loss(pred, targ, mask, reduce=True): + if pred.ndim == 3: + pred = pred.squeeze(2) + else: + assert pred.ndim == 2 + loss = (pred.float() - targ.float()).abs() * (~mask).float() + loss = loss.sum() if reduce else loss.view(-1) + return loss + + +def nll_loss(pred, targ, mask, reduce=True): + lprob = F.log_softmax(pred, dim=-1) + loss = F.nll_loss(lprob.view(-1, lprob.size(-1)), targ.view(-1), reduction="none") + loss = loss * (~mask).float().view(-1) + loss = loss.sum() if reduce else loss.view(-1) + return loss + + +@register_criterion("speech_unit_lm_criterion", dataclass=SpeechUnitLmCriterionConfig) +class SpeechUnitLmCriterion(FairseqCriterion): + def __init__(self, cfg: SpeechUnitLmCriterionConfig, task: FairseqTask): + super().__init__(task) + self.sentence_avg = cfg.sentence_avg + self.weights = torch.tensor([float(w) for w in cfg.loss_weights.split(";")]) + assert self.weights.size(0) == 3 + assert (self.weights >= 0.0).all() + + self.dur_loss_fn = nll_loss if cfg.discrete_duration else mae_loss + self.f0_loss_fn = nll_loss if cfg.discrete_f0 else mae_loss + + def forward(self, model, 
sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + net_output = model(**sample["net_input"]) + + token_loss = nll_loss( + net_output["token"], sample["target"], sample["mask"], reduce + ) + dur_loss = self.dur_loss_fn( + net_output["duration"], + sample["dur_target"], + sample["dur_mask"], + reduce, + ) + f0_loss = self.f0_loss_fn( + net_output["f0"], + sample["f0_target"], + sample["f0_mask"], + reduce, + ) + loss = self.weights.to(token_loss.device) * torch.stack( + [token_loss, dur_loss, f0_loss], dim=-1 + ) + loss = loss.sum() if reduce else loss.sum(-1) + + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + logging_output = { + "loss": loss.detach().sum().item(), + "token_loss": token_loss.detach().sum().item(), + "dur_loss": dur_loss.detach().sum().item(), + "f0_loss": f0_loss.detach().sum().item(), + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + token_loss_sum = sum(log.get("token_loss", 0) for log in logging_outputs) + dur_loss_sum = sum(log.get("dur_loss", 0) for log in logging_outputs) + f0_loss_sum = sum(log.get("f0_loss", 0) for log in logging_outputs) + + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + metrics.log_scalar("loss", loss_sum / sample_size, sample_size, round=3) + + metrics.log_scalar( + "token_loss", token_loss_sum / sample_size, sample_size, round=3 + ) + + metrics.log_scalar("dur_loss", dur_loss_sum / sample_size, sample_size, round=3) + + metrics.log_scalar("f0_loss", f0_loss_sum / sample_size, sample_size, round=3) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + return True diff --git a/fairseq/criterions/tacotron2_loss.py b/fairseq/criterions/tacotron2_loss.py new file mode 100644 index 0000000000..4113fdc548 --- /dev/null +++ b/fairseq/criterions/tacotron2_loss.py @@ -0,0 +1,227 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
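+# Tacotron 2 training loss: L1 and MSE between predicted and target features
+# (the decoder output and the post-net output), binary cross-entropy on the
+# end-of-sequence logits, plus optional guided-attention and CTC auxiliary terms.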
+ +import logging +from dataclasses import dataclass, field +from functools import lru_cache +from typing import Any, Dict, List + +import torch +import torch.nn.functional as F +from omegaconf import II + +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.data.data_utils import lengths_to_mask +from fairseq.dataclass import FairseqDataclass + +logger = logging.getLogger(__name__) + + +@dataclass +class Tacotron2CriterionConfig(FairseqDataclass): + bce_pos_weight: float = field( + default=1.0, + metadata={"help": "weight of positive examples for BCE loss"}, + ) + use_guided_attention_loss: bool = field( + default=False, + metadata={"help": "use guided attention loss"}, + ) + guided_attention_loss_sigma: float = field( + default=0.4, + metadata={"help": "weight of positive examples for BCE loss"}, + ) + ctc_weight: float = field(default=0.0, metadata={"help": "weight for CTC loss"}) + sentence_avg: bool = II("optimization.sentence_avg") + + +class GuidedAttentionLoss(torch.nn.Module): + """ + Efficiently Trainable Text-to-Speech System Based on Deep Convolutional + Networks with Guided Attention (https://arxiv.org/abs/1710.08969) + """ + + def __init__(self, sigma): + super().__init__() + self.sigma = sigma + + @staticmethod + @lru_cache(maxsize=8) + def _get_weight(s_len, t_len, sigma): + grid_x, grid_y = torch.meshgrid(torch.arange(t_len), torch.arange(s_len)) + grid_x = grid_x.to(s_len.device) + grid_y = grid_y.to(s_len.device) + w = (grid_y.float() / s_len - grid_x.float() / t_len) ** 2 + return 1.0 - torch.exp(-w / (2 * (sigma**2))) + + def _get_weights(self, src_lens, tgt_lens): + bsz, max_s_len, max_t_len = len(src_lens), max(src_lens), max(tgt_lens) + weights = torch.zeros((bsz, max_t_len, max_s_len)) + for i, (s_len, t_len) in enumerate(zip(src_lens, tgt_lens)): + weights[i, :t_len, :s_len] = self._get_weight(s_len, t_len, self.sigma) + return weights + + @staticmethod + def _get_masks(src_lens, tgt_lens): + in_masks = lengths_to_mask(src_lens) + out_masks = lengths_to_mask(tgt_lens) + return out_masks.unsqueeze(2) & in_masks.unsqueeze(1) + + def forward(self, attn, src_lens, tgt_lens, reduction="mean"): + weights = self._get_weights(src_lens, tgt_lens).to(attn.device) + masks = self._get_masks(src_lens, tgt_lens).to(attn.device) + loss = (weights * attn.transpose(1, 2)).masked_select(masks) + loss = torch.sum(loss) if reduction == "sum" else torch.mean(loss) + return loss + + +@register_criterion("tacotron2", dataclass=Tacotron2CriterionConfig) +class Tacotron2Criterion(FairseqCriterion): + def __init__( + self, + task, + sentence_avg, + use_guided_attention_loss, + guided_attention_loss_sigma, + bce_pos_weight, + ctc_weight, + ): + super().__init__(task) + self.sentence_avg = sentence_avg + self.bce_pos_weight = bce_pos_weight + + self.guided_attn = None + if use_guided_attention_loss: + self.guided_attn = GuidedAttentionLoss(guided_attention_loss_sigma) + self.ctc_weight = ctc_weight + + def forward(self, model, sample, reduction="mean"): + bsz, max_len, _ = sample["target"].size() + feat_tgt = sample["target"] + feat_len = sample["target_lengths"].view(bsz, 1).expand(-1, max_len) + eos_tgt = torch.arange(max_len).to(sample["target"].device) + eos_tgt = eos_tgt.view(1, max_len).expand(bsz, -1) + eos_tgt = (eos_tgt == (feat_len - 1)).float() + src_tokens = sample["net_input"]["src_tokens"] + src_lens = sample["net_input"]["src_lengths"] + tgt_lens = sample["target_lengths"] + + 
feat_out, eos_out, extra = model( + src_tokens=src_tokens, + src_lengths=src_lens, + prev_output_tokens=sample["net_input"]["prev_output_tokens"], + incremental_state=None, + target_lengths=tgt_lens, + speaker=sample["speaker"], + ) + + l1_loss, mse_loss, eos_loss = self.compute_loss( + extra["feature_out"], + feat_out, + eos_out, + feat_tgt, + eos_tgt, + tgt_lens, + reduction, + ) + attn_loss = torch.tensor(0.0).type_as(l1_loss) + if self.guided_attn is not None: + attn_loss = self.guided_attn(extra["attn"], src_lens, tgt_lens, reduction) + ctc_loss = torch.tensor(0.0).type_as(l1_loss) + if self.ctc_weight > 0.0: + net_output = (feat_out, eos_out, extra) + lprobs = model.get_normalized_probs(net_output, log_probs=True) + lprobs = lprobs.transpose(0, 1) # T x B x C + src_mask = lengths_to_mask(src_lens) + src_tokens_flat = src_tokens.masked_select(src_mask) + ctc_loss = ( + F.ctc_loss( + lprobs, + src_tokens_flat, + tgt_lens, + src_lens, + reduction=reduction, + zero_infinity=True, + ) + * self.ctc_weight + ) + loss = l1_loss + mse_loss + eos_loss + attn_loss + ctc_loss + + sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"] + logging_output = { + "loss": utils.item(loss.data), + "ntokens": sample["ntokens"], + "nsentences": sample["nsentences"], + "sample_size": sample_size, + "l1_loss": utils.item(l1_loss.data), + "mse_loss": utils.item(mse_loss.data), + "eos_loss": utils.item(eos_loss.data), + "attn_loss": utils.item(attn_loss.data), + "ctc_loss": utils.item(ctc_loss.data), + } + return loss, sample_size, logging_output + + def compute_loss( + self, + feat_out, + feat_out_post, + eos_out, + feat_tgt, + eos_tgt, + tgt_lens, + reduction="mean", + ): + mask = lengths_to_mask(tgt_lens) + _eos_out = eos_out[mask].squeeze() + _eos_tgt = eos_tgt[mask] + _feat_tgt = feat_tgt[mask] + _feat_out = feat_out[mask] + _feat_out_post = feat_out_post[mask] + + l1_loss = F.l1_loss(_feat_out, _feat_tgt, reduction=reduction) + F.l1_loss( + _feat_out_post, _feat_tgt, reduction=reduction + ) + mse_loss = F.mse_loss(_feat_out, _feat_tgt, reduction=reduction) + F.mse_loss( + _feat_out_post, _feat_tgt, reduction=reduction + ) + eos_loss = F.binary_cross_entropy_with_logits( + _eos_out, + _eos_tgt, + pos_weight=torch.tensor(self.bce_pos_weight), + reduction=reduction, + ) + return l1_loss, mse_loss, eos_loss + + @classmethod + def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None: + ns = [log.get("sample_size", 0) for log in logging_outputs] + ntot = sum(ns) + ws = [n / (ntot + 1e-8) for n in ns] + for key in ["loss", "l1_loss", "mse_loss", "eos_loss", "attn_loss", "ctc_loss"]: + vals = [log.get(key, 0) for log in logging_outputs] + val = sum(val * w for val, w in zip(vals, ws)) + metrics.log_scalar(key, val, ntot, round=3) + metrics.log_scalar("sample_size", ntot, len(logging_outputs)) + + # inference metrics + if "targ_frames" not in logging_outputs[0]: + return + n = sum(log.get("targ_frames", 0) for log in logging_outputs) + for key, new_key in [ + ("mcd_loss", "mcd_loss"), + ("pred_frames", "pred_ratio"), + ("nins", "ins_rate"), + ("ndel", "del_rate"), + ]: + val = sum(log.get(key, 0) for log in logging_outputs) + metrics.log_scalar(new_key, val / n, n, round=3) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + return False diff --git a/fairseq/criterions/wav2vec_criterion.py b/fairseq/criterions/wav2vec_criterion.py index 6ac7557dcc..3975468487 100644 --- a/fairseq/criterions/wav2vec_criterion.py +++ b/fairseq/criterions/wav2vec_criterion.py @@ 
-4,35 +4,46 @@ # LICENSE file in the root directory of this source tree. import math +from dataclasses import dataclass, field +from typing import List, Optional import torch import torch.nn.functional as F -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass from fairseq.logging.meters import safe_round - - -@register_criterion("wav2vec") +from fairseq.utils import is_xla_tensor + + +@dataclass +class Wav2VecCriterionConfig(FairseqDataclass): + infonce: bool = field( + default=False, + metadata={ + "help": "if set, uses cross entropy instead of binary cross entropy (i.e. InfoNCE loss)" + }, + ) + loss_weights: Optional[List[float]] = field( + default=None, + metadata={"help": "weights for additional loss terms (not first one)"}, + ) + log_keys: List[str] = field( + default_factory=lambda: [], + metadata={"help": "output keys to log"}, + ) + + +@register_criterion("wav2vec", dataclass=Wav2VecCriterionConfig) class Wav2vecCriterion(FairseqCriterion): def __init__(self, task, infonce=False, loss_weights=None, log_keys=None): super().__init__(task) self.infonce = infonce - self.loss_weights = None if loss_weights is None else eval(loss_weights) - self.log_keys = [] if log_keys is None else eval(log_keys) + self.loss_weights = loss_weights + self.log_keys = [] if log_keys is None else log_keys - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--infonce', action='store_true', - help='if set, uses cross entropy instead of binary cross entropy (i.e. InfoNCE loss)') - parser.add_argument('--loss-weights', type=str, default=None, - help='weights for additional loss terms (not first one)') - parser.add_argument('--log-keys', type=str, default=None, - help='output keys to log') - # fmt: on - - def forward(self, model, sample, reduce=True, log_pred=False): + def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: @@ -43,7 +54,9 @@ def forward(self, model, sample, reduce=True, log_pred=False): net_output = model(**sample["net_input"]) logits = model.get_logits(net_output).float() target = model.get_targets(sample, net_output) + self.xla = is_xla_tensor(logits) + # XXX: handle weights on xla. weights = None if hasattr(model, "get_target_weights") and not self.infonce: weights = model.get_target_weights(target, net_output) @@ -52,21 +65,31 @@ def forward(self, model, sample, reduce=True, log_pred=False): losses = [] + reduction = "none" if ((not reduce) or self.xla) else "sum" if self.infonce: - loss = F.cross_entropy( - logits, - target, - reduction="sum" if reduce else "none", - ) + loss = F.cross_entropy(logits, target, reduction=reduction) else: loss = F.binary_cross_entropy_with_logits( - logits, - target.float(), - weights, - reduction="sum" if reduce else "none", + logits, target.float(), weights, reduction=reduction ) - sample_size = target.numel() if self.infonce else target.long().sum().item() + if self.xla: + # tpu-comment: since dynamic shapes lead to recompilations on xla, + # we don't shrink tensors using mask_indices. + # Instead, we use mask indices to adjust loss. 
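+            # Flatten mask_indices so it lines up with the per-position losses
+            # (reduction is "none" on xla above); multiplying by it zeroes out
+            # unmasked positions without changing any tensor shapes.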
+ mi = ( + sample["net_input"]["mask_indices"] + .transpose(0, 1) # logits are transposed in `model.get_logits` + .reshape(logits.size(0)) + ) + loss = (loss * mi).sum() if reduce else (loss * mi) + + if "sample_size" in sample: + sample_size = sample["sample_size"] + elif "mask_indices" in sample["net_input"]: + sample_size = sample["net_input"]["mask_indices"].sum() + else: + sample_size = target.numel() if self.infonce else target.long().sum().item() losses.append(loss.detach().clone()) if self.loss_weights is not None: @@ -86,19 +109,36 @@ def forward(self, model, sample, reduce=True, log_pred=False): losses.append(p) logging_output = { - "loss": loss.item() if reduce else loss, + "loss": loss.item() if (reduce and not self.xla) else loss.detach(), "ntokens": sample_size, "nsentences": sample["id"].numel(), "sample_size": sample_size, } for lk in self.log_keys: - if lk in net_output: - logging_output[lk] = float((net_output[lk])) + # Only store "logits" and "target" for computing MAP and MAUC + # during validation + if lk == "logits": + if not self.training: + logging_output["logits"] = logits.cpu().numpy() + elif lk == "target": + if not self.training: + # If the targets have been mixed with the predictions of + # teacher models, find the original targets + if hasattr(model, "get_original_targets"): + original_target = model.get_original_targets(sample, net_output) + else: + original_target = target + logging_output["target"] = original_target.cpu().numpy() + elif lk in net_output: + value = net_output[lk] + if not is_xla_tensor(value): + value = float(value) + logging_output[lk] = value if len(losses) > 1: for i, l in enumerate(losses): - logging_output[f"loss_{i}"] = l.item() + logging_output[f"loss_{i}"] = l.item() if not self.xla else l.detach() if self.infonce: with torch.no_grad(): @@ -109,16 +149,19 @@ def forward(self, model, sample, reduce=True, log_pred=False): assert logits.dim() > 1, logits.shape max = logits.argmax(-1) == 0 min = logits.argmin(-1) == 0 - both = max & min - corr = max.long().sum().item() - both.long().sum().item() - count = max.numel() + if is_xla_tensor(logits): + max, min = max * mi, min * mi + both = max & min + corr = max.long().sum() - both.long().sum() + count = mi.sum() + else: + both = max & min + corr = max.long().sum().item() - both.long().sum().item() + count = float(max.numel()) logging_output["correct"] = corr logging_output["count"] = count - if log_pred: - logging_output["logits"] = logits.cpu().numpy() - logging_output["target"] = target.cpu().numpy() return loss, sample_size, logging_output @staticmethod @@ -134,7 +177,7 @@ def reduce_metrics(logging_outputs) -> None: ) metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + "loss", loss_sum / (sample_size or 1) / math.log(2), sample_size, round=3 ) metrics.log_scalar("ntokens", ntokens) metrics.log_scalar("nsentences", nsentences) @@ -166,19 +209,23 @@ def reduce_metrics(logging_outputs) -> None: for k in logging_outputs[0]: if k not in builtin_keys: - val = sum(log.get(k, 0) for log in logging_outputs) / len( - logging_outputs - ) + val = sum(log.get(k, 0) for log in logging_outputs) if k.startswith("loss"): - metrics.log_scalar(k, val / sample_size / math.log(2), sample_size) + metrics.log_scalar( + k, val / (sample_size or 1) / math.log(2), sample_size, round=3 + ) else: - metrics.log_scalar(k, val, round=3) + metrics.log_scalar(k, val / len(logging_outputs), round=3) - @staticmethod - def logging_outputs_can_be_summed() -> bool: + # FIXME: revert 
when gather based xla reduction is implemented + # @staticmethod + # def logging_outputs_can_be_summed() -> bool: + def logging_outputs_can_be_summed(self) -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ - return False + # XXX: Gather based reduction not implemented for xla yet. + # So we fall to sum based reduction for xla. + return self.xla diff --git a/fairseq/data/__init__.py b/fairseq/data/__init__.py index 9b30813955..eeaae2b254 100644 --- a/fairseq/data/__init__.py +++ b/fairseq/data/__init__.py @@ -12,7 +12,8 @@ from .add_target_dataset import AddTargetDataset from .append_token_dataset import AppendTokenDataset -from .audio.raw_audio_dataset import FileAudioDataset +from .audio.raw_audio_dataset import BinarizedAudioDataset, FileAudioDataset +from .audio.hubert_dataset import HubertDataset from .backtranslation_dataset import BacktranslationDataset from .bucket_pad_length_dataset import BucketPadLengthDataset from .colorize_dataset import ColorizeDataset @@ -38,6 +39,11 @@ from .numel_dataset import NumelDataset from .num_samples_dataset import NumSamplesDataset from .offset_tokens_dataset import OffsetTokensDataset +from .padding_mask_dataset import ( + LeftPaddingMaskDataset, + PaddingMaskDataset, + RightPaddingMaskDataset, +) from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset from .prepend_dataset import PrependDataset from .prepend_token_dataset import PrependTokenDataset @@ -47,6 +53,7 @@ from .roll_dataset import RollDataset from .round_robin_zip_datasets import RoundRobinZipDatasets from .sort_dataset import SortDataset +from .speech_dlm_dataset import SpeechDLMDataset from .strip_token_dataset import StripTokenDataset from .subsample_dataset import SubsampleDataset from .token_block_dataset import TokenBlockDataset @@ -56,6 +63,7 @@ from .multilingual.sampled_multi_dataset import SampledMultiDataset from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset from .fasta_dataset import FastaDataset, EncodedFastaDataset +from .transform_eos_concat_langpair_dataset import TransformEosConcatLangPairDataset from .iterators import ( CountingIterator, @@ -69,6 +77,7 @@ "AppendTokenDataset", "BacktranslationDataset", "BaseWrapperDataset", + "BinarizedAudioDataset", "BucketPadLengthDataset", "ColorizeDataset", "ConcatDataset", @@ -81,7 +90,9 @@ "FairseqDataset", "FairseqIterableDataset", "FastaDataset", + "FileAudioDataset", "GroupedIterator", + "HubertDataset", "IdDataset", "IndexedCachedDataset", "IndexedDataset", @@ -103,22 +114,24 @@ "PadDataset", "PrependDataset", "PrependTokenDataset", - "ReplaceDataset", - "RollDataset", - "FileAudioDataset", + "RandomCropDataset", "RawLabelDataset", "ResamplingDataset", + "ReplaceDataset", "RightPadDataset", + "RollDataset", "RoundRobinZipDatasets", "SampledMultiDataset", "SampledMultiEpochDataset", "ShardedIterator", "SortDataset", + "SpeechDLMDataset", "StripTokenDataset", "SubsampleDataset", "TokenBlockDataset", "TransformEosDataset", "TransformEosLangPairDataset", + "TransformEosConcatLangPairDataset", "TruncateDataset", "TruncatedDictionary", ] diff --git a/fairseq/data/add_class_target_dataset.py b/fairseq/data/add_class_target_dataset.py new file mode 100644 index 0000000000..bf89f25656 --- /dev/null +++ b/fairseq/data/add_class_target_dataset.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from . import BaseWrapperDataset, data_utils +from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel + + +class AddTargetDataset(BaseWrapperDataset): + def __init__( + self, + dataset, + labels, + pad, + eos, + batch_targets, + process_label=None, + label_len_fn=None, + add_to_input=False, + text_compression_level=TextCompressionLevel.none, + ): + super().__init__(dataset) + self.labels = labels + self.batch_targets = batch_targets + self.pad = pad + self.eos = eos + self.process_label = process_label + self.label_len_fn = label_len_fn + self.add_to_input = add_to_input + self.text_compressor = TextCompressor(level=text_compression_level) + + def get_label(self, index, process_fn=None): + lbl = self.labels[index] + lbl = self.text_compressor.decompress(lbl) + return lbl if process_fn is None else process_fn(lbl) + + def __getitem__(self, index): + item = self.dataset[index] + item["label"] = self.get_label(index, process_fn=self.process_label) + return item + + def size(self, index): + sz = self.dataset.size(index) + own_sz = self.label_len_fn(self.get_label(index)) + return sz, own_sz + + def collater(self, samples): + collated = self.dataset.collater(samples) + if len(collated) == 0: + return collated + indices = set(collated["id"].tolist()) + target = [s["label"] for s in samples if s["id"] in indices] + + if self.batch_targets: + collated["target_lengths"] = torch.LongTensor([len(t) for t in target]) + target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False) + collated["ntokens"] = collated["target_lengths"].sum().item() + else: + collated["ntokens"] = sum([len(t) for t in target]) + + collated["target"] = target + + if self.add_to_input: + eos = target.new_full((target.size(0), 1), self.eos) + collated["target"] = torch.cat([target, eos], dim=-1).long() + collated["net_input"]["prev_output_tokens"] = torch.cat( + [eos, target], dim=-1 + ).long() + collated["ntokens"] += target.size(0) + return collated + + def filter_indices_by_size(self, indices, max_sizes): + indices, ignored = data_utils._filter_by_size_dynamic( + indices, self.size, max_sizes + ) + return indices, ignored diff --git a/fairseq/data/add_target_dataset.py b/fairseq/data/add_target_dataset.py index 9ef467058b..978a5b1903 100644 --- a/fairseq/data/add_target_dataset.py +++ b/fairseq/data/add_target_dataset.py @@ -6,6 +6,7 @@ import torch from . 
import BaseWrapperDataset, data_utils +from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel class AddTargetDataset(BaseWrapperDataset): @@ -17,7 +18,9 @@ def __init__( eos, batch_targets, process_label=None, + label_len_fn=None, add_to_input=False, + text_compression_level=TextCompressionLevel.none, ): super().__init__(dataset) self.labels = labels @@ -25,24 +28,24 @@ def __init__( self.pad = pad self.eos = eos self.process_label = process_label + self.label_len_fn = label_len_fn self.add_to_input = add_to_input + self.text_compressor = TextCompressor(level=text_compression_level) - def get_label(self, index): - return ( - self.labels[index] - if self.process_label is None - else self.process_label(self.labels[index]) - ) + def get_label(self, index, process_fn=None): + lbl = self.labels[index] + lbl = self.text_compressor.decompress(lbl) + return lbl if process_fn is None else process_fn(lbl) def __getitem__(self, index): item = self.dataset[index] - item["label"] = self.get_label(index) + item["label"] = self.get_label(index, process_fn=self.process_label) return item def size(self, index): sz = self.dataset.size(index) - own_sz = len(self.get_label(index)) - return (sz, own_sz) + own_sz = self.label_len_fn(self.get_label(index)) + return sz, own_sz def collater(self, samples): collated = self.dataset.collater(samples) @@ -51,20 +54,30 @@ def collater(self, samples): indices = set(collated["id"].tolist()) target = [s["label"] for s in samples if s["id"] in indices] + if self.add_to_input: + eos = torch.LongTensor([self.eos]) + prev_output_tokens = [torch.cat([eos, t], axis=-1) for t in target] + target = [torch.cat([t, eos], axis=-1) for t in target] + collated["net_input"]["prev_output_tokens"] = prev_output_tokens + if self.batch_targets: collated["target_lengths"] = torch.LongTensor([len(t) for t in target]) target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False) collated["ntokens"] = collated["target_lengths"].sum().item() + if getattr(collated["net_input"], "prev_output_tokens", None): + collated["net_input"]["prev_output_tokens"] = data_utils.collate_tokens( + collated["net_input"]["prev_output_tokens"], + pad_idx=self.pad, + left_pad=False, + ) else: collated["ntokens"] = sum([len(t) for t in target]) collated["target"] = target - - if self.add_to_input: - eos = target.new_full((target.size(0), 1), self.eos) - collated["target"] = torch.cat([target, eos], dim=-1).long() - collated["net_input"]["prev_output_tokens"] = torch.cat( - [eos, target], dim=-1 - ).long() - collated["ntokens"] += target.size(0) return collated + + def filter_indices_by_size(self, indices, max_sizes): + indices, ignored = data_utils._filter_by_size_dynamic( + indices, self.size, max_sizes + ) + return indices, ignored diff --git a/fairseq/data/audio/__init__.py b/fairseq/data/audio/__init__.py index e69de29bb2..dff90fadfc 100644 --- a/fairseq/data/audio/__init__.py +++ b/fairseq/data/audio/__init__.py @@ -0,0 +1,93 @@ +from abc import ABC, abstractmethod +from typing import Dict, Optional +import importlib +import os +import numpy as np + + +class AudioTransform(ABC): + @classmethod + @abstractmethod + def from_config_dict(cls, config: Optional[Dict] = None): + pass + + +class CompositeAudioTransform(AudioTransform): + def _from_config_dict( + cls, + transform_type, + get_audio_transform, + composite_cls, + config=None, + return_empty=False, + ): + _config = {} if config is None else config + _transforms = _config.get(f"{transform_type}_transforms") + + if 
_transforms is None: + if return_empty: + _transforms = [] + else: + return None + + transforms = [ + get_audio_transform(_t).from_config_dict(_config.get(_t)) + for _t in _transforms + ] + return composite_cls(transforms) + + def __init__(self, transforms): + self.transforms = [t for t in transforms if t is not None] + + def __call__(self, x): + for t in self.transforms: + x = t(x) + return x + + def __repr__(self): + format_string = ( + [self.__class__.__name__ + "("] + + [f" {t.__repr__()}" for t in self.transforms] + + [")"] + ) + return "\n".join(format_string) + + +def register_audio_transform(name, cls_type, registry, class_names): + def register_audio_transform_cls(cls): + if name in registry: + raise ValueError(f"Cannot register duplicate transform ({name})") + if not issubclass(cls, cls_type): + raise ValueError( + f"Transform ({name}: {cls.__name__}) must extend " + f"{cls_type.__name__}" + ) + if cls.__name__ in class_names: + raise ValueError( + f"Cannot register audio transform with duplicate " + f"class name ({cls.__name__})" + ) + registry[name] = cls + class_names.add(cls.__name__) + return cls + + return register_audio_transform_cls + + +def import_transforms(transforms_dir, transform_type): + for file in os.listdir(transforms_dir): + path = os.path.join(transforms_dir, file) + if ( + not file.startswith("_") + and not file.startswith(".") + and (file.endswith(".py") or os.path.isdir(path)) + ): + name = file[: file.find(".py")] if file.endswith(".py") else file + importlib.import_module( + f"fairseq.data.audio.{transform_type}_transforms." + name + ) + + +# Utility fn for uniform numbers in transforms +def rand_uniform(a, b): + return np.random.uniform() * (b - a) + a diff --git a/fairseq/data/audio/audio_utils.py b/fairseq/data/audio/audio_utils.py index de08669851..590a7493ae 100644 --- a/fairseq/data/audio/audio_utils.py +++ b/fairseq/data/audio/audio_utils.py @@ -1,39 +1,221 @@ -import os.path as op -from typing import BinaryIO, Optional, Tuple, Union +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
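+
+# Shared audio I/O helpers: waveform loading and conversion (`get_waveform`,
+# `convert_waveform`), mel filter bank extraction via PyKaldi or TorchAudio
+# (`get_fbank`), byte-sliced reads from uncompressed ZIP archives (`parse_path`,
+# `get_features_or_waveform`), plus small spectrogram/mel-scale modules for TTS.
+# Rough usage sketch (the file path is hypothetical):
+#
+#     feats = get_fbank("utt1.wav", n_bins=80)  # T x 80 mel filter bank features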
+ + +import mmap +from pathlib import Path +import io +from typing import BinaryIO, List, Optional, Tuple, Union import numpy as np +import torch +import torch.nn.functional as F + +from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform + +SF_AUDIO_FILE_EXTENSIONS = {".wav", ".flac", ".ogg"} +FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS = {".npy", ".wav", ".flac", ".ogg"} + + +def convert_waveform( + waveform: Union[np.ndarray, torch.Tensor], + sample_rate: int, + normalize_volume: bool = False, + to_mono: bool = False, + to_sample_rate: Optional[int] = None, +) -> Tuple[Union[np.ndarray, torch.Tensor], int]: + """convert a waveform: + - to a target sample rate + - from multi-channel to mono channel + - volume normalization + + Args: + waveform (numpy.ndarray or torch.Tensor): 2D original waveform + (channels x length) + sample_rate (int): original sample rate + normalize_volume (bool): perform volume normalization + to_mono (bool): convert to mono channel if having multiple channels + to_sample_rate (Optional[int]): target sample rate + Returns: + waveform (numpy.ndarray): converted 2D waveform (channels x length) + sample_rate (float): target sample rate + """ + try: + import torchaudio.sox_effects as ta_sox + except ImportError: + raise ImportError("Please install torchaudio: pip install torchaudio") + + effects = [] + if normalize_volume: + effects.append(["gain", "-n"]) + if to_sample_rate is not None and to_sample_rate != sample_rate: + effects.append(["rate", f"{to_sample_rate}"]) + if to_mono and waveform.shape[0] > 1: + effects.append(["channels", "1"]) + if len(effects) > 0: + is_np_input = isinstance(waveform, np.ndarray) + _waveform = torch.from_numpy(waveform) if is_np_input else waveform + converted, converted_sample_rate = ta_sox.apply_effects_tensor( + _waveform, sample_rate, effects + ) + if is_np_input: + converted = converted.numpy() + return converted, converted_sample_rate + return waveform, sample_rate def get_waveform( - path_or_fp: Union[str, BinaryIO], normalization=True + path_or_fp: Union[str, BinaryIO], + normalization: bool = True, + mono: bool = True, + frames: int = -1, + start: int = 0, + always_2d: bool = True, + output_sample_rate: Optional[int] = None, + normalize_volume: bool = False, + waveform_transforms: Optional[CompositeAudioWaveformTransform] = None, ) -> Tuple[np.ndarray, int]: - """Get the waveform and sample rate of a 16-bit mono-channel WAV or FLAC. + """Get the waveform and sample rate of a 16-bit WAV/FLAC/OGG Vorbis audio. Args: path_or_fp (str or BinaryIO): the path or file-like object - normalization (bool): Normalize values to [-1, 1] (Default: True) + normalization (bool): normalize values to [-1, 1] (Default: True) + mono (bool): convert multi-channel audio to mono-channel one + frames (int): the number of frames to read. (-1 for reading all) + start (int): Where to start reading. A negative value counts from the end. 
+ always_2d (bool): always return 2D array even for mono-channel audios + output_sample_rate (Optional[int]): output sample rate + normalize_volume (bool): normalize volume + Returns: + waveform (numpy.ndarray): 1D or 2D waveform (channels x length) + sample_rate (float): sample rate """ if isinstance(path_or_fp, str): - ext = op.splitext(op.basename(path_or_fp))[1] - if ext not in {".flac", ".wav"}: + ext = Path(path_or_fp).suffix + if ext not in SF_AUDIO_FILE_EXTENSIONS: raise ValueError(f"Unsupported audio format: {ext}") try: import soundfile as sf except ImportError: - raise ImportError("Please install soundfile to load WAV/FLAC file") + raise ImportError("Please install soundfile: pip install soundfile") + + waveform, sample_rate = sf.read( + path_or_fp, dtype="float32", always_2d=True, frames=frames, start=start + ) + waveform = waveform.T # T x C -> C x T + waveform, sample_rate = convert_waveform( + waveform, + sample_rate, + normalize_volume=normalize_volume, + to_mono=mono, + to_sample_rate=output_sample_rate, + ) - waveform, sample_rate = sf.read(path_or_fp, dtype="float32") if not normalization: - waveform *= 2 ** 15 # denormalized to 16-bit signed integers + waveform *= 2**15 # denormalized to 16-bit signed integers + + if waveform_transforms is not None: + waveform, sample_rate = waveform_transforms(waveform, sample_rate) + + if not always_2d: + waveform = waveform.squeeze(axis=0) + return waveform, sample_rate -def _get_kaldi_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: +def get_features_from_npy_or_audio(path, waveform_transforms=None): + ext = Path(path).suffix + if ext not in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS: + raise ValueError(f'Unsupported file format for "{path}"') + return ( + np.load(path) + if ext == ".npy" + else get_fbank(path, waveform_transforms=waveform_transforms) + ) + + +def get_features_or_waveform_from_stored_zip( + path, + byte_offset, + byte_size, + need_waveform=False, + use_sample_rate=None, + waveform_transforms=None, +): + assert path.endswith(".zip") + data = read_from_stored_zip(path, byte_offset, byte_size) + f = io.BytesIO(data) + if is_npy_data(data): + features_or_waveform = np.load(f) + elif is_sf_audio_data(data): + features_or_waveform = ( + get_waveform( + f, + always_2d=False, + output_sample_rate=use_sample_rate, + waveform_transforms=waveform_transforms, + )[0] + if need_waveform + else get_fbank(f, waveform_transforms=waveform_transforms) + ) + else: + raise ValueError(f'Unknown file format for "{path}"') + return features_or_waveform + + +def get_features_or_waveform( + path: str, need_waveform=False, use_sample_rate=None, waveform_transforms=None +): + """Get speech features from .npy file or waveform from .wav/.flac file. + The file may be inside an uncompressed ZIP file and is accessed via byte + offset and length. + + Args: + path (str): File path in the format of "<.npy/.wav/.flac path>" or + "<zip path>:<byte offset>:<byte length>". + need_waveform (bool): return waveform instead of features. + use_sample_rate (int): change sample rate for the input wave file + + Returns: + features_or_waveform (numpy.ndarray): speech features or waveform. 
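+
+    Example (paths and byte offsets are illustrative):
+        get_features_or_waveform("fbank80.zip:2048:1024")  # features from a ZIP slice
+        get_features_or_waveform("utt1.wav", need_waveform=True)  # raw waveform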
+ """ + _path, slice_ptr = parse_path(path) + if len(slice_ptr) == 0: + if need_waveform: + return get_waveform( + _path, + always_2d=False, + output_sample_rate=use_sample_rate, + waveform_transforms=waveform_transforms, + )[0] + return get_features_from_npy_or_audio( + _path, waveform_transforms=waveform_transforms + ) + elif len(slice_ptr) == 2: + features_or_waveform = get_features_or_waveform_from_stored_zip( + _path, + slice_ptr[0], + slice_ptr[1], + need_waveform=need_waveform, + use_sample_rate=use_sample_rate, + waveform_transforms=waveform_transforms, + ) + else: + raise ValueError(f"Invalid path: {path}") + + return features_or_waveform + + +def _get_kaldi_fbank( + waveform: np.ndarray, sample_rate: int, n_bins=80 +) -> Optional[np.ndarray]: """Get mel-filter bank features via PyKaldi.""" try: + from kaldi.feat.fbank import Fbank, FbankOptions from kaldi.feat.mel import MelBanksOptions - from kaldi.feat.fbank import FbankOptions, Fbank from kaldi.feat.window import FrameExtractionOptions from kaldi.matrix import Vector @@ -45,19 +227,20 @@ def _get_kaldi_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: opts.mel_opts = mel_opts opts.frame_opts = frame_opts fbank = Fbank(opts=opts) - features = fbank.compute(Vector(waveform), 1.0).numpy() + features = fbank.compute(Vector(waveform.squeeze()), 1.0).numpy() return features except ImportError: return None -def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: +def _get_torchaudio_fbank( + waveform: np.ndarray, sample_rate, n_bins=80 +) -> Optional[np.ndarray]: """Get mel-filter bank features via TorchAudio.""" try: - import torch import torchaudio.compliance.kaldi as ta_kaldi - waveform = torch.from_numpy(waveform).unsqueeze(0) + waveform = torch.from_numpy(waveform) features = ta_kaldi.fbank( waveform, num_mel_bins=n_bins, sample_frequency=sample_rate ) @@ -66,16 +249,20 @@ def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarr return None -def get_fbank(path_or_fp: Union[str, BinaryIO], n_bins=80) -> np.ndarray: +def get_fbank( + path_or_fp: Union[str, BinaryIO], n_bins=80, waveform_transforms=None +) -> np.ndarray: """Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi (faster CPP implementation) to TorchAudio (Python implementation). 
Note that Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the waveform should not be normalized.""" - sound, sample_rate = get_waveform(path_or_fp, normalization=False) + waveform, sample_rate = get_waveform( + path_or_fp, normalization=False, waveform_transforms=waveform_transforms + ) - features = _get_kaldi_fbank(sound, sample_rate, n_bins) + features = _get_kaldi_fbank(waveform, sample_rate, n_bins) if features is None: - features = _get_torchaudio_fbank(sound, sample_rate, n_bins) + features = _get_torchaudio_fbank(waveform, sample_rate, n_bins) if features is None: raise ImportError( "Please install pyKaldi or torchaudio to enable " @@ -83,3 +270,120 @@ def get_fbank(path_or_fp: Union[str, BinaryIO], n_bins=80) -> np.ndarray: ) return features + + +def is_npy_data(data: bytes) -> bool: + return data[0] == 147 and data[1] == 78 + + +def is_sf_audio_data(data: bytes) -> bool: + is_wav = data[0] == 82 and data[1] == 73 and data[2] == 70 + is_flac = data[0] == 102 and data[1] == 76 and data[2] == 97 + is_ogg = data[0] == 79 and data[1] == 103 and data[2] == 103 + return is_wav or is_flac or is_ogg + + +def mmap_read(path: str, offset: int, length: int) -> bytes: + with open(path, "rb") as f: + with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_o: + data = mmap_o[offset : offset + length] + return data + + +def read_from_stored_zip(zip_path: str, offset: int, length: int) -> bytes: + return mmap_read(zip_path, offset, length) + + +def parse_path(path: str) -> Tuple[str, List[int]]: + """Parse data path which is either a path to + 1. a .npy/.wav/.flac/.ogg file + 2. a stored ZIP file with slicing info: "[zip_path]:[offset]:[length]" + + Args: + path (str): the data path to parse + + Returns: + file_path (str): the file path + slice_ptr (list of int): empty in case 1; + byte offset and length for the slice in case 2 + """ + + if Path(path).suffix in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS: + _path, slice_ptr = path, [] + else: + _path, *slice_ptr = path.split(":") + if not Path(_path).is_file(): + raise FileNotFoundError(f"File not found: {_path}") + assert len(slice_ptr) in {0, 2}, f"Invalid path: {path}" + slice_ptr = [int(i) for i in slice_ptr] + return _path, slice_ptr + + +def get_window(window_fn: callable, n_fft: int, win_length: int) -> torch.Tensor: + padding = n_fft - win_length + assert padding >= 0 + return F.pad(window_fn(win_length), (padding // 2, padding - padding // 2)) + + +def get_fourier_basis(n_fft: int) -> torch.Tensor: + basis = np.fft.fft(np.eye(n_fft)) + basis = np.vstack( + [np.real(basis[: n_fft // 2 + 1, :]), np.imag(basis[: n_fft // 2 + 1, :])] + ) + return torch.from_numpy(basis).float() + + +def get_mel_filters( + sample_rate: int, n_fft: int, n_mels: int, f_min: float, f_max: float +) -> torch.Tensor: + try: + import librosa + except ImportError: + raise ImportError("Please install librosa: pip install librosa") + basis = librosa.filters.mel(sample_rate, n_fft, n_mels, f_min, f_max) + return torch.from_numpy(basis).float() + + +class TTSSpectrogram(torch.nn.Module): + def __init__( + self, + n_fft: int, + win_length: int, + hop_length: int, + window_fn: callable = torch.hann_window, + return_phase: bool = False, + ) -> None: + super(TTSSpectrogram, self).__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.return_phase = return_phase + + basis = get_fourier_basis(n_fft).unsqueeze(1) + basis *= get_window(window_fn, n_fft, win_length) + self.register_buffer("basis", basis) + + def forward( + self, waveform: 
torch.Tensor + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + padding = (self.n_fft // 2, self.n_fft // 2) + x = F.pad(waveform.unsqueeze(1), padding, mode="reflect") + x = F.conv1d(x, self.basis, stride=self.hop_length) + real_part = x[:, : self.n_fft // 2 + 1, :] + imag_part = x[:, self.n_fft // 2 + 1 :, :] + magnitude = torch.sqrt(real_part**2 + imag_part**2) + if self.return_phase: + phase = torch.atan2(imag_part, real_part) + return magnitude, phase + return magnitude + + +class TTSMelScale(torch.nn.Module): + def __init__( + self, n_mels: int, sample_rate: int, f_min: float, f_max: float, n_stft: int + ) -> None: + super(TTSMelScale, self).__init__() + basis = get_mel_filters(sample_rate, (n_stft - 1) * 2, n_mels, f_min, f_max) + self.register_buffer("basis", basis) + + def forward(self, specgram: torch.Tensor) -> torch.Tensor: + return torch.matmul(self.basis, specgram) diff --git a/fairseq/data/audio/data_cfg.py b/fairseq/data/audio/data_cfg.py new file mode 100644 index 0000000000..6be6f6521c --- /dev/null +++ b/fairseq/data/audio/data_cfg.py @@ -0,0 +1,387 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from argparse import Namespace +from copy import deepcopy +from pathlib import Path +from typing import Dict, Optional + +from fairseq.data import Dictionary + +logger = logging.getLogger(__name__) + + +def get_config_from_yaml(yaml_path: Path): + try: + import yaml + except ImportError: + print("Please install PyYAML: pip install PyYAML") + config = {} + if yaml_path.is_file(): + try: + with open(yaml_path) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + except Exception as e: + raise Exception(f"Failed to load config from {yaml_path.as_posix()}: {e}") + else: + raise FileNotFoundError(f"{yaml_path.as_posix()} not found") + + return config + + +class S2TDataConfig(object): + """Wrapper class for data config YAML""" + + def __init__(self, yaml_path: Path): + self.config = get_config_from_yaml(yaml_path) + self.root = yaml_path.parent + + def _auto_convert_to_abs_path(self, x): + if isinstance(x, str): + if not Path(x).exists() and (self.root / x).exists(): + return (self.root / x).as_posix() + elif isinstance(x, dict): + return {k: self._auto_convert_to_abs_path(v) for k, v in x.items()} + return x + + @property + def vocab_filename(self): + """fairseq vocabulary file under data root""" + return self.config.get("vocab_filename", "dict.txt") + + @property + def speaker_set_filename(self): + """speaker set file under data root""" + return self.config.get("speaker_set_filename", None) + + @property + def shuffle(self) -> bool: + """Shuffle dataset samples before batching""" + return self.config.get("shuffle", False) + + @property + def pre_tokenizer(self) -> Dict: + """Pre-tokenizer to apply before subword tokenization. Returning + a dictionary with `tokenizer` providing the tokenizer name and + the other items providing the tokenizer-specific arguments. + Tokenizers are defined in `fairseq.data.encoders.*`""" + tokenizer = self.config.get("pre_tokenizer", {"tokenizer": None}) + return self._auto_convert_to_abs_path(tokenizer) + + @property + def bpe_tokenizer(self) -> Dict: + """Subword tokenizer to apply after pre-tokenization. Returning + a dictionary with `bpe` providing the tokenizer name and + the other items providing the tokenizer-specific arguments. 
+ Tokenizers are defined in `fairseq.data.encoders.*`""" + tokenizer = self.config.get("bpe_tokenizer", {"bpe": None}) + return self._auto_convert_to_abs_path(tokenizer) + + @property + def prepend_tgt_lang_tag(self) -> bool: + """Prepend target lang ID token as the target BOS (e.g. for to-many + multilingual setting). During inference, this requires `--prefix-size 1` + to force BOS to be lang ID token.""" + return self.config.get("prepend_tgt_lang_tag", False) + + @property + def prepend_bos_and_append_tgt_lang_tag(self) -> bool: + """Prepend BOS and append target lang ID token to the target (e.g. mBART with language token pretraining).""" + return self.config.get("prepend_bos_and_append_tgt_lang_tag", False) + + @property + def input_feat_per_channel(self): + """The dimension of input features (per audio channel)""" + return self.config.get("input_feat_per_channel", 80) + + @property + def input_channels(self): + """The number of channels in the input audio""" + return self.config.get("input_channels", 1) + + @property + def sample_rate(self): + return self.config.get("sample_rate", 16_000) + + @property + def sampling_alpha(self): + """Hyper-parameter alpha = 1/T for temperature-based resampling. + (alpha = 1 for no resampling)""" + return self.config.get("sampling_alpha", 1.0) + + @property + def use_audio_input(self): + """Needed by the dataset loader to see if the model requires + raw audio as inputs.""" + return self.config.get("use_audio_input", False) + + def standardize_audio(self) -> bool: + return self.use_audio_input and self.config.get("standardize_audio", False) + + @property + def use_sample_rate(self): + """Needed by the dataset loader to see if the model requires + raw audio with specific sample rate as inputs.""" + return self.config.get("use_sample_rate", 16000) + + @property + def audio_root(self): + """Audio paths in the manifest TSV can be relative and this provides + the root path. Set this to empty string when using absolute paths.""" + return self.config.get("audio_root", "") + + def get_transforms(self, transform_type, split, is_train): + """Split-specific feature transforms. Allowing train set + wildcard `_train`, evaluation set wildcard `_eval` and general + wildcard `*` for matching.""" + from copy import deepcopy + + cfg = deepcopy(self.config) + _cur = cfg.get(f"{transform_type}transforms", {}) + cur = _cur.get(split) + cur = _cur.get("_train") if cur is None and is_train else cur + cur = _cur.get("_eval") if cur is None and not is_train else cur + cur = _cur.get("*") if cur is None else cur + return cur + + def get_feature_transforms(self, split, is_train): + cfg = deepcopy(self.config) + # TODO: deprecate transforms + cur = self.get_transforms("", split, is_train) + if cur is not None: + logger.warning( + "Auto converting transforms into feature_transforms, " + "but transforms will be deprecated in the future. Please " + "update this in the config." 
+ ) + ft_transforms = self.get_transforms("feature_", split, is_train) + if ft_transforms: + cur.extend(ft_transforms) + else: + cur = self.get_transforms("feature_", split, is_train) + cfg["feature_transforms"] = cur + return cfg + + def get_waveform_transforms(self, split, is_train): + cfg = deepcopy(self.config) + cfg["waveform_transforms"] = self.get_transforms("waveform_", split, is_train) + return cfg + + def get_dataset_transforms(self, split, is_train): + cfg = deepcopy(self.config) + cfg["dataset_transforms"] = self.get_transforms("dataset_", split, is_train) + return cfg + + @property + def global_cmvn_stats_npz(self) -> Optional[str]: + path = self.config.get("global_cmvn", {}).get("stats_npz_path", None) + return self._auto_convert_to_abs_path(path) + + @property + def vocoder(self) -> Dict[str, str]: + vocoder = self.config.get("vocoder", {"type": "griffin_lim"}) + return self._auto_convert_to_abs_path(vocoder) + + @property + def hub(self) -> Dict[str, str]: + return self.config.get("hub", {}) + + +class S2SDataConfig(S2TDataConfig): + """Wrapper class for data config YAML""" + + @property + def vocab_filename(self): + """fairseq vocabulary file under data root""" + return self.config.get("vocab_filename", None) + + @property + def pre_tokenizer(self) -> Dict: + return None + + @property + def bpe_tokenizer(self) -> Dict: + return None + + @property + def input_transformed_channels(self): + """The number of channels in the audio after feature transforms""" + # TODO: move this into individual transforms + # TODO: deprecate transforms + _cur = self.config.get("transforms", {}) + ft_transforms = self.config.get("feature_transforms", {}) + if _cur and ft_transforms: + _cur.update(ft_transforms) + else: + _cur = self.config.get("feature_transforms", {}) + cur = _cur.get("_train", []) + + _channels = self.input_channels + if "delta_deltas" in cur: + _channels *= 3 + + return _channels + + @property + def output_sample_rate(self): + """The audio sample rate of output target speech""" + return self.config.get("output_sample_rate", 22050) + + @property + def target_speaker_embed(self): + """Target speaker embedding file (one line per target audio sample)""" + return self.config.get("target_speaker_embed", None) + + @property + def prepend_tgt_lang_tag_as_bos(self) -> bool: + """Prepend target lang ID token as the target BOS.""" + return self.config.get("prepend_tgt_lang_tag_as_bos", False) + + +class MultitaskConfig(object): + """Wrapper class for data config YAML""" + + def __init__(self, yaml_path: Path): + config = get_config_from_yaml(yaml_path) + self.config = {} + for k, v in config.items(): + self.config[k] = SingleTaskConfig(k, v) + + def get_all_tasks(self): + return self.config + + def get_single_task(self, name): + assert name in self.config, f"multitask '{name}' does not exist!" + return self.config[name] + + @property + def first_pass_decoder_task_index(self): + """Return the task index of the first-pass text decoder. + If there are multiple 'is_first_pass_decoder: True' in the config file, + the last task is used for the first-pass decoder. + If there is no 'is_first_pass_decoder: True' in the config file, + the last task whose task_name includes 'target' and decoder_type is not ctc. 
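+
+        A task entry that would be selected as the first pass might look like
+        this in the multitask YAML (key names follow `SingleTaskConfig`; the
+        task name is illustrative):
+
+            target_decoder:
+                decoder_type: transformer
+                is_first_pass_decoder: True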
+ """ + idx = -1 + for i, (k, v) in enumerate(self.config.items()): + if v.is_first_pass_decoder: + idx = i + if idx < 0: + for i, (k, v) in enumerate(self.config.items()): + if k.startswith("target") and v.decoder_type == "transformer": + idx = i + return idx + + +class SingleTaskConfig(object): + def __init__(self, name, config): + self.task_name = name + self.config = config + dict_path = config.get("dict", "") + self.tgt_dict = Dictionary.load(dict_path) if Path(dict_path).exists() else None + + @property + def data(self): + return self.config.get("data", "") + + @property + def decoder_type(self): + return self.config.get("decoder_type", "transformer") + + @property + def decoder_args(self): + """Decoder arch related args""" + args = self.config.get("decoder_args", {}) + return Namespace(**args) + + @property + def criterion_cfg(self): + """cfg for the multitask criterion""" + if self.decoder_type == "ctc": + from fairseq.criterions.ctc import CtcCriterionConfig + + cfg = CtcCriterionConfig + cfg.zero_infinity = self.config.get("zero_infinity", True) + else: + from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterionConfig, + ) + + cfg = LabelSmoothedCrossEntropyCriterionConfig + cfg.label_smoothing = self.config.get("label_smoothing", 0.2) + return cfg + + @property + def input_from(self): + """Condition on encoder/decoder of the main model""" + return "decoder" if "decoder_layer" in self.config else "encoder" + + @property + def input_layer(self): + if self.input_from == "decoder": + return self.config["decoder_layer"] - 1 + else: + # default using the output from the last encoder layer (-1) + return self.config.get("encoder_layer", 0) - 1 + + @property + def loss_weight_schedule(self): + return ( + "decay" + if "loss_weight_max" in self.config + and "loss_weight_decay_steps" in self.config + else "fixed" + ) + + def get_loss_weight(self, num_updates): + if self.loss_weight_schedule == "fixed": + weight = self.config.get("loss_weight", 1.0) + else: # "decay" + assert ( + self.config.get("loss_weight_decay_steps", 0) > 0 + ), "loss_weight_decay_steps must be greater than 0 for a decay schedule" + loss_weight_min = self.config.get("loss_weight_min", 0.0001) + loss_weight_decay_stepsize = ( + self.config["loss_weight_max"] - loss_weight_min + ) / self.config["loss_weight_decay_steps"] + weight = max( + self.config["loss_weight_max"] + - loss_weight_decay_stepsize * num_updates, + loss_weight_min, + ) + return weight + + @property + def prepend_bos_and_append_tgt_lang_tag(self) -> bool: + """Prepend BOS and append target lang ID token to the target (e.g. mBART with language token pretraining).""" + return self.config.get("prepend_bos_and_append_tgt_lang_tag", False) + + @property + def eos_token(self): + """EOS token during generation""" + return self.config.get("eos_token", "<eos>") + + @property + def rdrop_alpha(self): + return self.config.get("rdrop_alpha", 0.0) + + @property + def is_first_pass_decoder(self): + flag = self.config.get("is_first_pass_decoder", False) + if flag: + if self.decoder_type == "ctc": + raise ValueError( + "First-pass decoder in the multi-decoder model must not be CTC." + ) + if "target" not in self.task_name: + raise Warning( + 'The name of the first-pass decoder does not include "target".' 
+ ) + return flag + + @property + def get_lang_tag_mapping(self): + return self.config.get("lang_tag_mapping", {}) diff --git a/fairseq/data/audio/dataset_transforms/__init__.py b/fairseq/data/audio/dataset_transforms/__init__.py new file mode 100644 index 0000000000..b24c6f731f --- /dev/null +++ b/fairseq/data/audio/dataset_transforms/__init__.py @@ -0,0 +1,53 @@ +import os +from fairseq.data.audio import ( + AudioTransform, + CompositeAudioTransform, + import_transforms, + register_audio_transform, +) + + +class AudioDatasetTransform(AudioTransform): + pass + + +AUDIO_DATASET_TRANSFORM_REGISTRY = {} +AUDIO_DATASET_TRANSFORM_CLASS_NAMES = set() + + +def get_audio_dataset_transform(name): + return AUDIO_DATASET_TRANSFORM_REGISTRY[name] + + +def register_audio_dataset_transform(name): + return register_audio_transform( + name, + AudioDatasetTransform, + AUDIO_DATASET_TRANSFORM_REGISTRY, + AUDIO_DATASET_TRANSFORM_CLASS_NAMES, + ) + + +import_transforms(os.path.dirname(__file__), "dataset") + + +class CompositeAudioDatasetTransform(CompositeAudioTransform): + @classmethod + def from_config_dict(cls, config=None): + return super()._from_config_dict( + cls, + "dataset", + get_audio_dataset_transform, + CompositeAudioDatasetTransform, + config, + return_empty=True, + ) + + def get_transform(self, cls): + for t in self.transforms: + if isinstance(t, cls): + return t + return None + + def has_transform(self, cls): + return self.get_transform(cls) is not None diff --git a/fairseq/data/audio/dataset_transforms/concataugment.py b/fairseq/data/audio/dataset_transforms/concataugment.py new file mode 100644 index 0000000000..0b632ccf2b --- /dev/null +++ b/fairseq/data/audio/dataset_transforms/concataugment.py @@ -0,0 +1,61 @@ +from typing import List +import numpy as np + +from fairseq.data.audio.dataset_transforms import ( + AudioDatasetTransform, + register_audio_dataset_transform, +) + +_DEFAULTS = {"rate": 0.25, "max_tokens": 3000, "attempts": 5} + + +@register_audio_dataset_transform("concataugment") +class ConcatAugment(AudioDatasetTransform): + @classmethod + def from_config_dict(cls, config=None): + _config = {} if config is None else config + return ConcatAugment( + _config.get("rate", _DEFAULTS["rate"]), + _config.get("max_tokens", _DEFAULTS["max_tokens"]), + _config.get("attempts", _DEFAULTS["attempts"]), + ) + + def __init__( + self, + rate=_DEFAULTS["rate"], + max_tokens=_DEFAULTS["max_tokens"], + attempts=_DEFAULTS["attempts"], + ): + self.rate, self.max_tokens, self.attempts = rate, max_tokens, attempts + + def __repr__(self): + return ( + self.__class__.__name__ + + "(" + + ", ".join( + [ + f"rate={self.rate}", + f"max_tokens={self.max_tokens}", + f"attempts={self.attempts}", + ] + ) + + ")" + ) + + def find_indices(self, index: int, n_frames: List[int], n_samples: int): + # skip conditions: application rate, max_tokens limit exceeded + if np.random.random() > self.rate: + return [index] + if self.max_tokens and n_frames[index] > self.max_tokens: + return [index] + + # pick second sample to concatenate + for _ in range(self.attempts): + index2 = np.random.randint(0, n_samples) + if index2 != index and ( + not self.max_tokens + or n_frames[index] + n_frames[index2] < self.max_tokens + ): + return [index, index2] + + return [index] diff --git a/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py b/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py new file mode 100644 index 0000000000..e9ebec2388 --- /dev/null +++ 
b/fairseq/data/audio/dataset_transforms/noisyoverlapaugment.py @@ -0,0 +1,105 @@ +import numpy as np +import torch + +from fairseq.data.audio import rand_uniform +from fairseq.data.audio.dataset_transforms import ( + AudioDatasetTransform, + register_audio_dataset_transform, +) +from fairseq.data.audio.waveform_transforms.noiseaugment import ( + NoiseAugmentTransform, +) + +_DEFAULTS = { + "rate": 0.25, + "mixing_noise_rate": 0.1, + "noise_path": "", + "noise_snr_min": -5, + "noise_snr_max": 5, + "utterance_snr_min": -5, + "utterance_snr_max": 5, +} + + +@register_audio_dataset_transform("noisyoverlapaugment") +class NoisyOverlapAugment(AudioDatasetTransform): + @classmethod + def from_config_dict(cls, config=None): + _config = {} if config is None else config + return NoisyOverlapAugment( + _config.get("rate", _DEFAULTS["rate"]), + _config.get("mixing_noise_rate", _DEFAULTS["mixing_noise_rate"]), + _config.get("noise_path", _DEFAULTS["noise_path"]), + _config.get("noise_snr_min", _DEFAULTS["noise_snr_min"]), + _config.get("noise_snr_max", _DEFAULTS["noise_snr_max"]), + _config.get("utterance_snr_min", _DEFAULTS["utterance_snr_min"]), + _config.get("utterance_snr_max", _DEFAULTS["utterance_snr_max"]), + ) + + def __init__( + self, + rate=_DEFAULTS["rate"], + mixing_noise_rate=_DEFAULTS["mixing_noise_rate"], + noise_path=_DEFAULTS["noise_path"], + noise_snr_min=_DEFAULTS["noise_snr_min"], + noise_snr_max=_DEFAULTS["noise_snr_max"], + utterance_snr_min=_DEFAULTS["utterance_snr_min"], + utterance_snr_max=_DEFAULTS["utterance_snr_max"], + ): + self.rate = rate + self.mixing_noise_rate = mixing_noise_rate + self.noise_shaper = NoiseAugmentTransform(noise_path) + self.noise_snr_min = noise_snr_min + self.noise_snr_max = noise_snr_max + self.utterance_snr_min = utterance_snr_min + self.utterance_snr_max = utterance_snr_max + + def __repr__(self): + return ( + self.__class__.__name__ + + "(" + + ", ".join( + [ + f"rate={self.rate}", + f"mixing_noise_rate={self.mixing_noise_rate}", + f"noise_snr_min={self.noise_snr_min}", + f"noise_snr_max={self.noise_snr_max}", + f"utterance_snr_min={self.utterance_snr_min}", + f"utterance_snr_max={self.utterance_snr_max}", + ] + ) + + ")" + ) + + def __call__(self, sources): + for i, source in enumerate(sources): + if np.random.random() > self.rate: + continue + + pri = source.numpy() + + if np.random.random() > self.mixing_noise_rate: + sec = sources[np.random.randint(0, len(sources))].numpy() + snr = rand_uniform(self.utterance_snr_min, self.utterance_snr_max) + else: + sec = self.noise_shaper.pick_sample(source.shape) + snr = rand_uniform(self.noise_snr_min, self.noise_snr_max) + + L1 = pri.shape[-1] + L2 = sec.shape[-1] + l = np.random.randint(0, min(round(L1 / 2), L2)) # mix len + s_source = np.random.randint(0, L1 - l) + s_sec = np.random.randint(0, L2 - l) + + get_power = lambda x: np.mean(x**2) + if get_power(sec) == 0: + continue + + scl = np.sqrt(get_power(pri) / (np.power(10, snr / 10) * get_power(sec))) + + pri[s_source : s_source + l] = np.add( + pri[s_source : s_source + l], np.multiply(scl, sec[s_sec : s_sec + l]) + ) + sources[i] = torch.from_numpy(pri).float() + + return sources diff --git a/fairseq/data/audio/feature_transforms/__init__.py b/fairseq/data/audio/feature_transforms/__init__.py index 359fa06971..d295013b90 100644 --- a/fairseq/data/audio/feature_transforms/__init__.py +++ b/fairseq/data/audio/feature_transforms/__init__.py @@ -1,82 +1,43 @@ -import importlib import os -from abc import ABC, abstractmethod -from typing import Dict, 
Optional +from fairseq.data.audio import ( + AudioTransform, + CompositeAudioTransform, + import_transforms, + register_audio_transform, +) -class AudioFeatureTransform(ABC): - @classmethod - @abstractmethod - def from_config_dict(cls, config: Optional[Dict] = None): - pass +class AudioFeatureTransform(AudioTransform): + pass AUDIO_FEATURE_TRANSFORM_REGISTRY = {} AUDIO_FEATURE_TRANSFORM_CLASS_NAMES = set() -def register_audio_feature_transform(name): - def register_audio_feature_transform_cls(cls): - if name in AUDIO_FEATURE_TRANSFORM_REGISTRY: - raise ValueError(f"Cannot register duplicate transform ({name})") - if not issubclass(cls, AudioFeatureTransform): - raise ValueError( - f"Transform ({name}: {cls.__name__}) must extend " - "AudioFeatureTransform" - ) - if cls.__name__ in AUDIO_FEATURE_TRANSFORM_CLASS_NAMES: - raise ValueError( - f"Cannot register audio feature transform with duplicate " - f"class name ({cls.__name__})" - ) - AUDIO_FEATURE_TRANSFORM_REGISTRY[name] = cls - AUDIO_FEATURE_TRANSFORM_CLASS_NAMES.add(cls.__name__) - return cls - - return register_audio_feature_transform_cls - - def get_audio_feature_transform(name): return AUDIO_FEATURE_TRANSFORM_REGISTRY[name] -transforms_dir = os.path.dirname(__file__) -for file in os.listdir(transforms_dir): - path = os.path.join(transforms_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - name = file[: file.find(".py")] if file.endswith(".py") else file - importlib.import_module("fairseq.data.audio.feature_transforms." + name) - +def register_audio_feature_transform(name): + return register_audio_transform( + name, + AudioFeatureTransform, + AUDIO_FEATURE_TRANSFORM_REGISTRY, + AUDIO_FEATURE_TRANSFORM_CLASS_NAMES, + ) -class CompositeAudioFeatureTransform(AudioFeatureTransform): - @classmethod - def from_config_dict(cls, config=None): - _config = {} if config is None else config - _transforms = _config.get("transforms") - if _transforms is None: - return None - transforms = [ - get_audio_feature_transform(_t).from_config_dict(_config.get(_t)) - for _t in _transforms - ] - return CompositeAudioFeatureTransform(transforms) - def __init__(self, transforms): - self.transforms = [t for t in transforms if t is not None] +import_transforms(os.path.dirname(__file__), "feature") - def __call__(self, x): - for t in self.transforms: - x = t(x) - return x - def __repr__(self): - format_string = ( - [self.__class__.__name__ + "("] - + [f" {t.__repr__()}" for t in self.transforms] - + [")"] +class CompositeAudioFeatureTransform(CompositeAudioTransform): + @classmethod + def from_config_dict(cls, config=None): + return super()._from_config_dict( + cls, + "feature", + get_audio_feature_transform, + CompositeAudioFeatureTransform, + config, ) - return "\n".join(format_string) diff --git a/fairseq/data/audio/feature_transforms/delta_deltas.py b/fairseq/data/audio/feature_transforms/delta_deltas.py new file mode 100644 index 0000000000..49d090b11e --- /dev/null +++ b/fairseq/data/audio/feature_transforms/delta_deltas.py @@ -0,0 +1,37 @@ +import numpy as np +import torch +from fairseq.data.audio.feature_transforms import ( + AudioFeatureTransform, + register_audio_feature_transform, +) + + +@register_audio_feature_transform("delta_deltas") +class DeltaDeltas(AudioFeatureTransform): + """Expand delta-deltas features from spectrum.""" + + @classmethod + def from_config_dict(cls, config=None): + _config = {} if config is None else config + return 
DeltaDeltas(_config.get("win_length", 5)) + + def __init__(self, win_length=5): + self.win_length = win_length + + def __repr__(self): + return self.__class__.__name__ + + def __call__(self, spectrogram): + from torchaudio.functional import compute_deltas + + assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor." + # spectrogram is T x F, while compute_deltas takes (…, F, T) + spectrogram = torch.from_numpy(spectrogram).transpose(0, 1) + delta = compute_deltas(spectrogram) + delta_delta = compute_deltas(delta) + + out_feat = np.concatenate( + [spectrogram, delta.numpy(), delta_delta.numpy()], axis=0 + ) + out_feat = np.transpose(out_feat) + return out_feat diff --git a/fairseq/data/audio/feature_transforms/global_cmvn.py b/fairseq/data/audio/feature_transforms/global_cmvn.py index d512fed300..e457ff176f 100644 --- a/fairseq/data/audio/feature_transforms/global_cmvn.py +++ b/fairseq/data/audio/feature_transforms/global_cmvn.py @@ -16,9 +16,13 @@ def from_config_dict(cls, config=None): return GlobalCMVN(_config.get("stats_npz_path")) def __init__(self, stats_npz_path): + self.stats_npz_path = stats_npz_path stats = np.load(stats_npz_path) self.mean, self.std = stats["mean"], stats["std"] + def __repr__(self): + return self.__class__.__name__ + f'(stats_npz_path="{self.stats_npz_path}")' + def __call__(self, x): x = np.subtract(x, self.mean) x = np.divide(x, self.std) diff --git a/fairseq/data/audio/feature_transforms/specaugment.py b/fairseq/data/audio/feature_transforms/specaugment.py index 2ef4778b85..ce5802b41a 100644 --- a/fairseq/data/audio/feature_transforms/specaugment.py +++ b/fairseq/data/audio/feature_transforms/specaugment.py @@ -98,7 +98,7 @@ def __call__(self, spectrogram): import cv2 w0 = np.random.randint(self.time_warp_w, num_frames - self.time_warp_w) - w = np.random.randint(0, self.time_warp_w) + w = np.random.randint(-self.time_warp_w + 1, self.time_warp_w) upper, lower = distorted[:w0, :], distorted[w0:, :] upper = cv2.resize( upper, dsize=(num_freqs, w0 + w), interpolation=cv2.INTER_LINEAR diff --git a/fairseq/data/audio/feature_transforms/utterance_cmvn.py b/fairseq/data/audio/feature_transforms/utterance_cmvn.py index 6bbd0ae821..37637bc09a 100644 --- a/fairseq/data/audio/feature_transforms/utterance_cmvn.py +++ b/fairseq/data/audio/feature_transforms/utterance_cmvn.py @@ -1,4 +1,5 @@ import numpy as np + from fairseq.data.audio.feature_transforms import ( AudioFeatureTransform, register_audio_feature_transform, @@ -28,12 +29,12 @@ def __repr__(self): def __call__(self, x): mean = x.mean(axis=0) - square_sums = (x ** 2).sum(axis=0) + square_sums = (x**2).sum(axis=0) if self.norm_means: x = np.subtract(x, mean) if self.norm_vars: - var = square_sums / x.shape[0] - mean ** 2 + var = square_sums / x.shape[0] - mean**2 std = np.sqrt(np.maximum(var, 1e-10)) x = np.divide(x, std) diff --git a/fairseq/data/audio/frm_text_to_speech_dataset.py b/fairseq/data/audio/frm_text_to_speech_dataset.py new file mode 100644 index 0000000000..b54654d492 --- /dev/null +++ b/fairseq/data/audio/frm_text_to_speech_dataset.py @@ -0,0 +1,205 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. 
An additional grant of patent rights +# can be found in the PATENTS file in the same directory.abs + +import csv +import logging +import os.path as op +from typing import List, Optional + +import numpy as np +import torch +from fairseq.data import Dictionary +from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig +from fairseq.data.audio.text_to_speech_dataset import ( + TextToSpeechDataset, + TextToSpeechDatasetCreator, +) + +logger = logging.getLogger(__name__) + + +class FrmTextToSpeechDataset(TextToSpeechDataset): + def __init__( + self, + split: str, + is_train_split: bool, + data_cfg: S2TDataConfig, + audio_paths: List[str], + n_frames: List[int], + src_texts: Optional[List[str]] = None, + tgt_texts: Optional[List[str]] = None, + speakers: Optional[List[str]] = None, + src_langs: Optional[List[str]] = None, + tgt_langs: Optional[List[str]] = None, + ids: Optional[List[str]] = None, + tgt_dict: Optional[Dictionary] = None, + pre_tokenizer=None, + bpe_tokenizer=None, + n_frames_per_step=1, + speaker_to_id=None, + do_chunk=False, + chunk_bound=-1, + chunk_init=50, + chunk_incr=5, + add_eos=True, + dedup=True, + ref_fpu=-1, + ): + # It assumes texts are encoded at a fixed frame-rate + super().__init__( + split=split, + is_train_split=is_train_split, + data_cfg=data_cfg, + audio_paths=audio_paths, + n_frames=n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + n_frames_per_step=n_frames_per_step, + speaker_to_id=speaker_to_id, + ) + + self.do_chunk = do_chunk + self.chunk_bound = chunk_bound + self.chunk_init = chunk_init + self.chunk_incr = chunk_incr + self.add_eos = add_eos + self.dedup = dedup + self.ref_fpu = ref_fpu + + self.chunk_size = -1 + + if do_chunk: + assert self.chunk_incr >= 0 + assert self.pre_tokenizer is None + + def __getitem__(self, index): + index, source, target, speaker_id, _, _, _ = super().__getitem__(index) + if target[-1].item() == self.tgt_dict.eos_index: + target = target[:-1] + + fpu = source.size(0) / target.size(0) # frame-per-unit + fps = self.n_frames_per_step + assert ( + self.ref_fpu == -1 or abs((fpu * fps - self.ref_fpu) / self.ref_fpu) < 0.1 + ), f"{fpu*fps} != {self.ref_fpu}" + + # only chunk training split + if self.is_train_split and self.do_chunk and self.chunk_size > 0: + lang = target[: int(self.data_cfg.prepend_tgt_lang_tag)] + text = target[int(self.data_cfg.prepend_tgt_lang_tag) :] + size = len(text) + chunk_size = min(self.chunk_size, size) + chunk_start = np.random.randint(size - chunk_size + 1) + text = text[chunk_start : chunk_start + chunk_size] + target = torch.cat((lang, text), 0) + + f_size = int(np.floor(chunk_size * fpu)) + f_start = int(np.floor(chunk_start * fpu)) + assert f_size > 0 + source = source[f_start : f_start + f_size, :] + + if self.dedup: + target = torch.unique_consecutive(target) + + if self.add_eos: + eos_idx = self.tgt_dict.eos_index + target = torch.cat((target, torch.LongTensor([eos_idx])), 0) + + return index, source, target, speaker_id + + def set_epoch(self, epoch): + if self.is_train_split and self.do_chunk: + old = self.chunk_size + self.chunk_size = self.chunk_init + epoch * self.chunk_incr + if self.chunk_bound > 0: + self.chunk_size = min(self.chunk_size, self.chunk_bound) + logger.info( + ( + f"{self.split}: setting chunk size " + f"from {old} to {self.chunk_size}" + ) + ) + + +class 
FrmTextToSpeechDatasetCreator(TextToSpeechDatasetCreator): + # inherit for key names + @classmethod + def from_tsv( + cls, + root: str, + data_cfg: S2TDataConfig, + split: str, + tgt_dict, + pre_tokenizer, + bpe_tokenizer, + is_train_split: bool, + n_frames_per_step: int, + speaker_to_id, + do_chunk: bool = False, + chunk_bound: int = -1, + chunk_init: int = 50, + chunk_incr: int = 5, + add_eos: bool = True, + dedup: bool = True, + ref_fpu: float = -1, + ) -> FrmTextToSpeechDataset: + tsv_path = op.join(root, f"{split}.tsv") + if not op.isfile(tsv_path): + raise FileNotFoundError(f"Dataset not found: {tsv_path}") + with open(tsv_path) as f: + reader = csv.DictReader( + f, + delimiter="\t", + quotechar=None, + doublequote=False, + lineterminator="\n", + quoting=csv.QUOTE_NONE, + ) + s = [dict(e) for e in reader] + assert len(s) > 0 + + ids = [ss[cls.KEY_ID] for ss in s] + audio_paths = [op.join(data_cfg.audio_root, ss[cls.KEY_AUDIO]) for ss in s] + n_frames = [int(ss[cls.KEY_N_FRAMES]) for ss in s] + tgt_texts = [ss[cls.KEY_TGT_TEXT] for ss in s] + src_texts = [ss.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for ss in s] + speakers = [ss.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for ss in s] + src_langs = [ss.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for ss in s] + tgt_langs = [ss.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for ss in s] + + return FrmTextToSpeechDataset( + split=split, + is_train_split=is_train_split, + data_cfg=data_cfg, + audio_paths=audio_paths, + n_frames=n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + n_frames_per_step=n_frames_per_step, + speaker_to_id=speaker_to_id, + do_chunk=do_chunk, + chunk_bound=chunk_bound, + chunk_init=chunk_init, + chunk_incr=chunk_incr, + add_eos=add_eos, + dedup=dedup, + ref_fpu=ref_fpu, + ) diff --git a/fairseq/data/audio/hubert_dataset.py b/fairseq/data/audio/hubert_dataset.py new file mode 100644 index 0000000000..f09b065fdc --- /dev/null +++ b/fairseq/data/audio/hubert_dataset.py @@ -0,0 +1,356 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
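The FrmTextToSpeechDataset added above grows its training chunk size with the epoch: set_epoch sets chunk_size = chunk_init + epoch * chunk_incr and caps it at chunk_bound when that bound is positive. A minimal sketch of that schedule, using the defaults visible in from_tsv (chunk_init=50, chunk_incr=5) and an assumed chunk_bound of 80 chosen purely for illustration:

    # Sketch of the chunk-size curriculum applied by FrmTextToSpeechDataset.set_epoch.
    # chunk_bound=80 is an illustrative value, not a project default.
    def chunk_size_for_epoch(epoch, chunk_init=50, chunk_incr=5, chunk_bound=80):
        size = chunk_init + epoch * chunk_incr
        if chunk_bound > 0:
            size = min(size, chunk_bound)
        return size

    # epochs 0..7 -> 50, 55, 60, 65, 70, 75, 80, 80 (capped at chunk_bound)

Chunking only applies to the training split; each __getitem__ call selects a random window of chunk_size target units plus the matching source frames (scaled by the frames-per-unit ratio), so later epochs train on progressively longer segments.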
+ +import itertools +import logging +import os +import sys +from typing import Any, List, Optional, Union + +import numpy as np + +import torch +import torch.nn.functional as F +from fairseq.data import data_utils +from fairseq.data.fairseq_dataset import FairseqDataset +from fairseq.data.audio.audio_utils import ( + parse_path, + read_from_stored_zip, +) +import io + +logger = logging.getLogger(__name__) + + +def load_audio(manifest_path, max_keep, min_keep): + n_long, n_short = 0, 0 + names, inds, sizes = [], [], [] + with open(manifest_path) as f: + root = f.readline().strip() + for ind, line in enumerate(f): + items = line.strip().split("\t") + assert len(items) == 2, line + sz = int(items[1]) + if min_keep is not None and sz < min_keep: + n_short += 1 + elif max_keep is not None and sz > max_keep: + n_long += 1 + else: + names.append(items[0]) + inds.append(ind) + sizes.append(sz) + tot = ind + 1 + logger.info( + ( + f"max_keep={max_keep}, min_keep={min_keep}, " + f"loaded {len(names)}, skipped {n_short} short and {n_long} long, " + f"longest-loaded={max(sizes)}, shortest-loaded={min(sizes)}" + ) + ) + return root, names, inds, tot, sizes + + +def load_label(label_path, inds, tot): + with open(label_path) as f: + labels = [line.rstrip() for line in f] + assert ( + len(labels) == tot + ), f"number of labels does not match ({len(labels)} != {tot})" + labels = [labels[i] for i in inds] + return labels + + +def load_label_offset(label_path, inds, tot): + with open(label_path) as f: + code_lengths = [len(line.encode("utf-8")) for line in f] + assert ( + len(code_lengths) == tot + ), f"number of labels does not match ({len(code_lengths)} != {tot})" + offsets = list(itertools.accumulate([0] + code_lengths)) + offsets = [(offsets[i], offsets[i + 1]) for i in inds] + return offsets + + +def verify_label_lengths( + audio_sizes, + audio_rate, + label_path, + label_rate, + inds, + tot, + tol=0.1, # tolerance in seconds +): + if label_rate < 0: + logger.info(f"{label_path} is sequence label. skipped") + return + + with open(label_path) as f: + lengths = [len(line.rstrip().split()) for line in f] + assert len(lengths) == tot + lengths = [lengths[i] for i in inds] + num_invalid = 0 + for i, ind in enumerate(inds): + dur_from_audio = audio_sizes[i] / audio_rate + dur_from_label = lengths[i] / label_rate + if abs(dur_from_audio - dur_from_label) > tol: + logger.warning( + ( + f"audio and label duration differ too much " + f"(|{dur_from_audio} - {dur_from_label}| > {tol}) " + f"in line {ind+1} of {label_path}. Check if `label_rate` " + f"is correctly set (currently {label_rate}). " + f"num. 
of samples = {audio_sizes[i]}; " + f"label length = {lengths[i]}" + ) + ) + num_invalid += 1 + if num_invalid > 0: + logger.warning( + f"total {num_invalid} (audio, label) pairs with mismatched lengths" + ) + + +class HubertDataset(FairseqDataset): + def __init__( + self, + manifest_path: str, + sample_rate: float, + label_paths: List[str], + label_rates: Union[List[float], float], # -1 for sequence labels + pad_list: List[str], + eos_list: List[str], + label_processors: Optional[List[Any]] = None, + max_keep_sample_size: Optional[int] = None, + min_keep_sample_size: Optional[int] = None, + max_sample_size: Optional[int] = None, + shuffle: bool = True, + pad_audio: bool = False, + normalize: bool = False, + store_labels: bool = True, + random_crop: bool = False, + single_target: bool = False, + ): + self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio( + manifest_path, max_keep_sample_size, min_keep_sample_size + ) + self.sample_rate = sample_rate + self.shuffle = shuffle + self.random_crop = random_crop + + self.num_labels = len(label_paths) + self.pad_list = pad_list + self.eos_list = eos_list + self.label_processors = label_processors + self.single_target = single_target + self.label_rates = ( + [label_rates for _ in range(len(label_paths))] + if isinstance(label_rates, float) + else label_rates + ) + self.store_labels = store_labels + if store_labels: + self.label_list = [load_label(p, inds, tot) for p in label_paths] + else: + self.label_paths = label_paths + self.label_offsets_list = [ + load_label_offset(p, inds, tot) for p in label_paths + ] + assert label_processors is None or len(label_processors) == self.num_labels + for label_path, label_rate in zip(label_paths, self.label_rates): + verify_label_lengths( + self.sizes, sample_rate, label_path, label_rate, inds, tot + ) + + self.max_sample_size = ( + max_sample_size if max_sample_size is not None else sys.maxsize + ) + self.pad_audio = pad_audio + self.normalize = normalize + logger.info( + f"pad_audio={pad_audio}, random_crop={random_crop}, " + f"normalize={normalize}, max_sample_size={self.max_sample_size}" + ) + + def get_audio(self, index): + import soundfile as sf + + wav_path = os.path.join(self.audio_root, self.audio_names[index]) + _path, slice_ptr = parse_path(wav_path) + if len(slice_ptr) == 0: + wav, cur_sample_rate = sf.read(_path) + else: + assert _path.endswith(".zip") + data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1]) + f = io.BytesIO(data) + wav, cur_sample_rate = sf.read(f) + wav = torch.from_numpy(wav).float() + wav = self.postprocess(wav, cur_sample_rate) + return wav + + def get_label(self, index, label_idx): + if self.store_labels: + label = self.label_list[label_idx][index] + else: + with open(self.label_paths[label_idx]) as f: + offset_s, offset_e = self.label_offsets_list[label_idx][index] + f.seek(offset_s) + label = f.read(offset_e - offset_s) + + if self.label_processors is not None: + label = self.label_processors[label_idx](label) + return label + + def get_labels(self, index): + return [self.get_label(index, i) for i in range(self.num_labels)] + + def __getitem__(self, index): + wav = self.get_audio(index) + labels = self.get_labels(index) + return {"id": index, "source": wav, "label_list": labels} + + def __len__(self): + return len(self.sizes) + + def crop_to_max_size(self, wav, target_size): + size = len(wav) + diff = size - target_size + if diff <= 0: + return wav, 0 + + start, end = 0, target_size + if self.random_crop: + start = np.random.randint(0, diff + 1) + 
end = size - diff + start + return wav[start:end], start + + def collater(self, samples): + # target = max(sizes) -> random_crop not used + # target = max_sample_size -> random_crop used for long + samples = [s for s in samples if s["source"] is not None] + if len(samples) == 0: + return {} + + audios = [s["source"] for s in samples] + audio_sizes = [len(s) for s in audios] + if self.pad_audio: + audio_size = min(max(audio_sizes), self.max_sample_size) + else: + audio_size = min(min(audio_sizes), self.max_sample_size) + collated_audios, padding_mask, audio_starts = self.collater_audio( + audios, audio_size + ) + + targets_by_label = [ + [s["label_list"][i] for s in samples] for i in range(self.num_labels) + ] + targets_list, lengths_list, ntokens_list = self.collater_label( + targets_by_label, audio_size, audio_starts + ) + + net_input = {"source": collated_audios, "padding_mask": padding_mask} + batch = { + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": net_input, + } + + if self.single_target: + batch["target_lengths"] = lengths_list[0] + batch["ntokens"] = ntokens_list[0] + batch["target"] = targets_list[0] + else: + batch["target_lengths_list"] = lengths_list + batch["ntokens_list"] = ntokens_list + batch["target_list"] = targets_list + return batch + + def collater_audio(self, audios, audio_size): + collated_audios = audios[0].new_zeros(len(audios), audio_size) + padding_mask = ( + torch.BoolTensor(collated_audios.shape).fill_(False) + # if self.pad_audio else None + ) + audio_starts = [0 for _ in audios] + for i, audio in enumerate(audios): + diff = len(audio) - audio_size + if diff == 0: + collated_audios[i] = audio + elif diff < 0: + assert self.pad_audio + collated_audios[i] = torch.cat([audio, audio.new_full((-diff,), 0.0)]) + padding_mask[i, diff:] = True + else: + collated_audios[i], audio_starts[i] = self.crop_to_max_size( + audio, audio_size + ) + return collated_audios, padding_mask, audio_starts + + def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad): + assert label_rate > 0 + s2f = label_rate / self.sample_rate + frm_starts = [int(round(s * s2f)) for s in audio_starts] + frm_size = int(round(audio_size * s2f)) + if not self.pad_audio: + rem_size = [len(t) - s for t, s in zip(targets, frm_starts)] + frm_size = min(frm_size, *rem_size) + targets = [t[s : s + frm_size] for t, s in zip(targets, frm_starts)] + logger.debug(f"audio_starts={audio_starts}") + logger.debug(f"frame_starts={frm_starts}") + logger.debug(f"frame_size={frm_size}") + + lengths = torch.LongTensor([len(t) for t in targets]) + ntokens = lengths.sum().item() + targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False) + return targets, lengths, ntokens + + def collater_seq_label(self, targets, pad): + lengths = torch.LongTensor([len(t) for t in targets]) + ntokens = lengths.sum().item() + targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False) + return targets, lengths, ntokens + + def collater_label(self, targets_by_label, audio_size, audio_starts): + targets_list, lengths_list, ntokens_list = [], [], [] + itr = zip(targets_by_label, self.label_rates, self.pad_list) + for targets, label_rate, pad in itr: + if label_rate == -1.0: + targets, lengths, ntokens = self.collater_seq_label(targets, pad) + else: + targets, lengths, ntokens = self.collater_frm_label( + targets, audio_size, audio_starts, label_rate, pad + ) + targets_list.append(targets) + lengths_list.append(lengths) + ntokens_list.append(ntokens) + return targets_list, 
lengths_list, ntokens_list + + def num_tokens(self, index): + return self.size(index) + + def size(self, index): + if self.pad_audio: + return self.sizes[index] + return min(self.sizes[index], self.max_sample_size) + + def ordered_indices(self): + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + order.append(self.sizes) + return np.lexsort(order)[::-1] + + def postprocess(self, wav, cur_sample_rate): + if wav.dim() == 2: + wav = wav.mean(-1) + assert wav.dim() == 1, wav.dim() + + if cur_sample_rate != self.sample_rate: + raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}") + + if self.normalize: + with torch.no_grad(): + wav = F.layer_norm(wav, wav.shape) + return wav diff --git a/fairseq/data/audio/multi_modality_dataset.py b/fairseq/data/audio/multi_modality_dataset.py new file mode 100644 index 0000000000..0a42c10611 --- /dev/null +++ b/fairseq/data/audio/multi_modality_dataset.py @@ -0,0 +1,267 @@ +# Copyright (c) 2021-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import logging +import math +from typing import List, Optional, NamedTuple + +import numpy as np +import torch +from fairseq.data import ( + ConcatDataset, + LanguagePairDataset, + FileAudioDataset, + data_utils, +) +from fairseq.data import FairseqDataset + +logger = logging.getLogger(__name__) + + +class ModalityDatasetItem(NamedTuple): + datasetname: str + dataset: any + max_positions: List[int] + max_tokens: Optional[int] = None + max_sentences: Optional[int] = None + + +# MultiModalityDataset: it concate multiple datasets with different modalities. +# Compared with ConcatDataset it can 1) sample data given the ratios for different datasets +# 2) it adds mode to indicate what type of the data samples come from. 
+# It will be used with GroupedEpochBatchIterator together to generate mini-batch with samples +# from the same type of dataset +# If only one dataset is used, it will perform like the original dataset with mode added +class MultiModalityDataset(ConcatDataset): + def __init__(self, datasets: List[ModalityDatasetItem]): + id_to_mode = [] + dsets = [] + max_tokens = [] + max_sentences = [] + max_positions = [] + for dset in datasets: + id_to_mode.append(dset.datasetname) + dsets.append(dset.dataset) + max_tokens.append(dset.max_tokens) + max_positions.append(dset.max_positions) + max_sentences.append(dset.max_sentences) + weights = [1.0 for s in dsets] + super().__init__(dsets, weights) + self.max_tokens = max_tokens + self.max_positions = max_positions + self.max_sentences = max_sentences + self.id_to_mode = id_to_mode + self.raw_sub_batch_samplers = [] + self._cur_epoch = 0 + + def set_epoch(self, epoch): + super().set_epoch(epoch) + self._cur_epoch = epoch + + def __getitem__(self, idx): + dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) + sample = self.datasets[dataset_idx][sample_idx] + return (dataset_idx, sample) + + def collater(self, samples): + if len(samples) == 0: + return {} + dataset_idx = samples[0][0] + # make sure all samples in samples are from same dataset + assert sum([0 if dataset_idx == s[0] else 1 for s in samples]) == 0 + samples = self.datasets[dataset_idx].collater([x[1] for x in samples]) + # add mode + samples["net_input"]["mode"] = self.id_to_mode[dataset_idx] + + return samples + + def size(self, index: int): + if len(self.datasets) == 1: + return self.datasets[0].size(index) + return super().size(index) + + @property + def sizes(self): + if len(self.datasets) == 1: + return self.datasets[0].sizes + return super().sizes + + def ordered_indices(self): + """ + Returns indices sorted by length. So less padding is needed. + """ + if len(self.datasets) == 1: + return [self.datasets[0].ordered_indices()] + indices_group = [] + for d_idx, ds in enumerate(self.datasets): + sample_num = self.cumulative_sizes[d_idx] + if d_idx > 0: + sample_num = sample_num - self.cumulative_sizes[d_idx - 1] + assert sample_num == len(ds) + indices_group.append(ds.ordered_indices()) + return indices_group + + def get_raw_batch_samplers(self, required_batch_size_multiple, seed): + if len(self.raw_sub_batch_samplers) > 0: + logger.info(" raw_sub_batch_samplers exists. 
No action is taken") + return + with data_utils.numpy_seed(seed): + indices = self.ordered_indices() + + for i, ds in enumerate(self.datasets): + indices[i] = ds.filter_indices_by_size( + indices[i], + self.max_positions[i], + )[0] + sub_batch_sampler = ds.batch_by_size( + indices[i], + max_tokens=self.max_tokens[i], + max_sentences=self.max_sentences[i], + required_batch_size_multiple=required_batch_size_multiple, + ) + self.raw_sub_batch_samplers.append(sub_batch_sampler) + + def get_batch_samplers(self, mult_ratios, required_batch_size_multiple, seed): + self.get_raw_batch_samplers(required_batch_size_multiple, seed) + batch_samplers = [] + for i, _ in enumerate(self.datasets): + if i > 0: + sub_batch_sampler = [ + [y + self.cumulative_sizes[i - 1] for y in x] + for x in self.raw_sub_batch_samplers[i] + ] + else: + sub_batch_sampler = list(self.raw_sub_batch_samplers[i]) + smp_r = mult_ratios[i] + if smp_r != 1: + is_increase = "increased" if smp_r > 1 else "decreased" + logger.info( + "number of batch for the dataset {} is {} from {} to {}".format( + self.id_to_mode[i], + is_increase, + len(sub_batch_sampler), + int(len(sub_batch_sampler) * smp_r), + ) + ) + mul_samplers = [] + for _ in range(math.floor(smp_r)): + mul_samplers = mul_samplers + sub_batch_sampler + if math.floor(smp_r) != smp_r: + with data_utils.numpy_seed(seed + self._cur_epoch): + np.random.shuffle(sub_batch_sampler) + smp_num = int( + (smp_r - math.floor(smp_r)) * len(sub_batch_sampler) + ) + mul_samplers = mul_samplers + sub_batch_sampler[:smp_num] + sub_batch_sampler = mul_samplers + else: + logger.info( + "dataset {} batch number is {} ".format( + self.id_to_mode[i], len(sub_batch_sampler) + ) + ) + batch_samplers.append(sub_batch_sampler) + + return batch_samplers + + +class LangPairMaskDataset(FairseqDataset): + def __init__( + self, + dataset: LanguagePairDataset, + src_eos: int, + src_bos: Optional[int] = None, + noise_id: Optional[int] = -1, + mask_ratio: Optional[float] = 0, + mask_type: Optional[str] = "random", + ): + self.dataset = dataset + self.src_eos = src_eos + self.src_bos = src_bos + self.noise_id = noise_id + self.mask_ratio = mask_ratio + self.mask_type = mask_type + assert mask_type in ("random", "tail") + + @property + def src_sizes(self): + return self.dataset.src_sizes + + @property + def tgt_sizes(self): + return self.dataset.tgt_sizes + + @property + def sizes(self): + # dataset.sizes can be a dynamically computed sizes: + return self.dataset.sizes + + def get_batch_shapes(self): + if hasattr(self.dataset, "get_batch_shapes"): + return self.dataset.get_batch_shapes() + return self.dataset.buckets + + def num_tokens_vec(self, indices): + return self.dataset.num_tokens_vec(indices) + + def __len__(self): + return len(self.dataset) + + def num_tokens(self, index): + return self.dataset.num_tokens(index) + + def size(self, index): + return self.dataset.size(index) + + def ordered_indices(self): + return self.dataset.ordered_indices() + + @property + def supports_prefetch(self): + return getattr(self.dataset, "supports_prefetch", False) + + def prefetch(self, indices): + return self.dataset.prefetch(indices) + + def mask_src_tokens(self, sample): + src_item = sample["source"] + mask = None + if self.mask_type == "random": + mask = torch.rand(len(src_item)).le(self.mask_ratio) + else: + mask = torch.ones(len(src_item)) + mask[: int(len(src_item) * (1 - self.mask_ratio))] = 0 + mask = mask.eq(1) + if src_item[0] == self.src_bos: + mask[0] = False + if src_item[-1] == self.src_eos: + mask[-1] = 
False + mask_src_item = src_item.masked_fill(mask, self.noise_id) + smp = {"id": sample["id"], "source": mask_src_item, "target": sample["target"]} + return smp + + def __getitem__(self, index): + sample = self.dataset[index] + if self.mask_ratio > 0: + sample = self.mask_src_tokens(sample) + return sample + + def collater(self, samples, pad_to_length=None): + return self.dataset.collater(samples, pad_to_length) + + +class FileAudioDatasetWrapper(FileAudioDataset): + def collater(self, samples): + samples = super().collater(samples) + if len(samples) == 0: + return {} + samples["net_input"]["src_tokens"] = samples["net_input"]["source"] + samples["net_input"]["prev_output_tokens"] = None + del samples["net_input"]["source"] + samples["net_input"]["src_lengths"] = None + samples["net_input"]["alignment"] = None + return samples diff --git a/fairseq/data/audio/raw_audio_dataset.py b/fairseq/data/audio/raw_audio_dataset.py index 8d6ce85ecc..ec202d5574 100644 --- a/fairseq/data/audio/raw_audio_dataset.py +++ b/fairseq/data/audio/raw_audio_dataset.py @@ -7,12 +7,21 @@ import logging import os import sys +import time +import io import numpy as np import torch import torch.nn.functional as F from .. import FairseqDataset +from ..data_utils import compute_block_mask_1d, get_buckets, get_bucketed_sizes +from fairseq.data.audio.audio_utils import ( + parse_path, + read_from_stored_zip, + is_sf_audio_data, +) +from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel logger = logging.getLogger(__name__) @@ -23,11 +32,22 @@ def __init__( self, sample_rate, max_sample_size=None, - min_sample_size=None, + min_sample_size=0, shuffle=True, - min_length=0, pad=False, normalize=False, + compute_mask=False, + feature_encoder_spec: str = "None", + mask_prob: float = 0.75, + mask_prob_adjust: float = 0, + mask_length: int = 1, + inverse_mask: bool = False, + require_same_masks: bool = True, + clone_batch: int = 1, + expand_adjacent: bool = False, + mask_dropout: float = 0, + non_overlapping: bool = False, + corpus_key=None, ): super().__init__() @@ -37,11 +57,24 @@ def __init__( max_sample_size if max_sample_size is not None else sys.maxsize ) self.min_sample_size = min_sample_size - self.min_length = min_length self.pad = pad self.shuffle = shuffle self.normalize = normalize + self.is_compute_mask = compute_mask + self.feature_encoder_spec = eval(feature_encoder_spec) + self._features_size_map = {} + self.mask_prob = mask_prob + self.mask_prob_adjust = mask_prob_adjust + self.mask_length = mask_length + self.inverse_mask = inverse_mask + self.require_same_masks = require_same_masks + self.clone_batch = clone_batch + self.expand_adjacent = expand_adjacent + self.mask_dropout = mask_dropout + self.non_overlapping = non_overlapping + self.corpus_key = corpus_key + def __getitem__(self, index): raise NotImplementedError() @@ -62,15 +95,25 @@ def postprocess(self, feats, curr_sample_rate): feats = F.layer_norm(feats, feats.shape) return feats - def crop_to_max_size(self, wav, target_size): - size = len(wav) + def crop_to_max_size(self, t, target_size, dim=0): + size = t.size(dim) diff = size - target_size if diff <= 0: - return wav + return t start = np.random.randint(0, diff + 1) end = size - diff + start - return wav[start:end] + + slices = [] + for d in range(dim): + slices.append(slice(None)) + slices.append(slice(start, end)) + + return t[slices] + + @staticmethod + def _bucket_tensor(tensor, num_pad, value): + return F.pad(tensor, (0, num_pad), value=value) def collater(self, samples): 
samples = [s for s in samples if s["source"] is not None] @@ -103,9 +146,43 @@ def collater(self, samples): collated_sources[i] = self.crop_to_max_size(source, target_size) input = {"source": collated_sources} + if self.corpus_key is not None: + input["corpus_key"] = [self.corpus_key] * len(sources) + out = {"id": torch.LongTensor([s["id"] for s in samples])} if self.pad: input["padding_mask"] = padding_mask - return {"id": torch.LongTensor([s["id"] for s in samples]), "net_input": input} + + if hasattr(self, "num_buckets") and self.num_buckets > 0: + assert self.pad, "Cannot bucket without padding first." + bucket = max(self._bucketed_sizes[s["id"]] for s in samples) + num_pad = bucket - collated_sources.size(-1) + if num_pad: + input["source"] = self._bucket_tensor(collated_sources, num_pad, 0) + input["padding_mask"] = self._bucket_tensor(padding_mask, num_pad, True) + + if "precomputed_mask" in samples[0]: + target_size = self._get_mask_indices_dims(target_size) + collated_mask = torch.cat( + [ + self.crop_to_max_size(s["precomputed_mask"], target_size, dim=1) + for s in samples + ], + dim=0, + ) + input["precomputed_mask"] = collated_mask + + out["net_input"] = input + return out + + def _get_mask_indices_dims(self, size, padding=0, dilation=1): + if size not in self.feature_encoder_spec: + L_in = size + for (_, kernel_size, stride) in self.feature_encoder_spec: + L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1 + L_out = 1 + L_out // stride + L_in = L_out + self._features_size_map[size] = L_out + return self._features_size_map[size] def num_tokens(self, index): return self.size(index) @@ -123,11 +200,37 @@ def ordered_indices(self): if self.shuffle: order = [np.random.permutation(len(self))] + order.append( + np.minimum( + np.array(self.sizes), + self.max_sample_size, + ) + ) + return np.lexsort(order)[::-1] else: - order = [np.arange(len(self))] - - order.append(self.sizes) - return np.lexsort(order)[::-1] + return np.arange(len(self)) + + def set_bucket_info(self, num_buckets): + self.num_buckets = num_buckets + if self.num_buckets > 0: + self._collated_sizes = np.minimum( + np.array(self.sizes), + self.max_sample_size, + ) + self.buckets = get_buckets( + self._collated_sizes, + self.num_buckets, + ) + self._bucketed_sizes = get_bucketed_sizes( + self._collated_sizes, self.buckets + ) + logger.info( + f"{len(self.buckets)} bucket(s) for the audio dataset: " + f"{self.buckets}" + ) + + def filter_indices_by_size(self, indices, max_sizes): + return indices, [] class FileAudioDataset(RawAudioDataset): @@ -136,43 +239,193 @@ def __init__( manifest_path, sample_rate, max_sample_size=None, - min_sample_size=None, + min_sample_size=0, shuffle=True, - min_length=0, pad=False, normalize=False, + num_buckets=0, + compute_mask=False, + text_compression_level=TextCompressionLevel.none, + **mask_compute_kwargs, ): super().__init__( sample_rate=sample_rate, max_sample_size=max_sample_size, min_sample_size=min_sample_size, shuffle=shuffle, - min_length=min_length, pad=pad, normalize=normalize, + compute_mask=compute_mask, + **mask_compute_kwargs, ) - self.fnames = [] + self.text_compressor = TextCompressor(level=text_compression_level) skipped = 0 + self.fnames = [] + sizes = [] + self.skipped_indices = set() + with open(manifest_path, "r") as f: self.root_dir = f.readline().strip() - for line in f: + for i, line in enumerate(f): items = line.strip().split("\t") assert len(items) == 2, line sz = int(items[1]) - if min_length is not None and sz < min_length: + if min_sample_size is 
not None and sz < min_sample_size: skipped += 1 + self.skipped_indices.add(i) continue - self.fnames.append(items[0]) - self.sizes.append(sz) + self.fnames.append(self.text_compressor.compress(items[0])) + sizes.append(sz) logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples") + self.sizes = np.array(sizes, dtype=np.int64) + + try: + import pyarrow + + self.fnames = pyarrow.array(self.fnames) + except: + logger.debug( + "Could not create a pyarrow array. Please install pyarrow for better performance" + ) + pass + + self.set_bucket_info(num_buckets) + + def __getitem__(self, index): + import soundfile as sf + + fn = self.fnames[index] + fn = fn if isinstance(self.fnames, list) else fn.as_py() + fn = self.text_compressor.decompress(fn) + path_or_fp = os.path.join(self.root_dir, fn) + _path, slice_ptr = parse_path(path_or_fp) + if len(slice_ptr) == 2: + byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1]) + assert is_sf_audio_data(byte_data) + path_or_fp = io.BytesIO(byte_data) + + retry = 3 + wav = None + for i in range(retry): + try: + wav, curr_sample_rate = sf.read(path_or_fp, dtype="float32") + break + except Exception as e: + logger.warning( + f"Failed to read {path_or_fp}: {e}. Sleeping for {1 * i}" + ) + time.sleep(1 * i) + + if wav is None: + raise Exception(f"Failed to load {path_or_fp}") + + feats = torch.from_numpy(wav).float() + feats = self.postprocess(feats, curr_sample_rate) + + v = {"id": index, "source": feats} + + if self.is_compute_mask: + T = self._get_mask_indices_dims(feats.size(-1)) + mask = compute_block_mask_1d( + shape=(self.clone_batch, T), + mask_prob=self.mask_prob, + mask_length=self.mask_length, + mask_prob_adjust=self.mask_prob_adjust, + inverse_mask=self.inverse_mask, + require_same_masks=True, + expand_adjcent=self.expand_adjacent, + mask_dropout=self.mask_dropout, + non_overlapping=self.non_overlapping, + ) + + v["precomputed_mask"] = mask + + return v + + +class BinarizedAudioDataset(RawAudioDataset): + def __init__( + self, + data_dir, + split, + sample_rate, + max_sample_size=None, + min_sample_size=0, + shuffle=True, + pad=False, + normalize=False, + num_buckets=0, + compute_mask=False, + **mask_compute_kwargs, + ): + super().__init__( + sample_rate=sample_rate, + max_sample_size=max_sample_size, + min_sample_size=min_sample_size, + shuffle=shuffle, + pad=pad, + normalize=normalize, + compute_mask=compute_mask, + **mask_compute_kwargs, + ) + + from fairseq.data import data_utils, Dictionary + + self.fnames_dict = Dictionary.load(os.path.join(data_dir, "dict.txt")) + + root_path = os.path.join(data_dir, f"{split}.root") + if os.path.exists(root_path): + with open(root_path, "r") as f: + self.root_dir = next(f).strip() + else: + self.root_dir = None + + fnames_path = os.path.join(data_dir, split) + self.fnames = data_utils.load_indexed_dataset(fnames_path, self.fnames_dict) + lengths_path = os.path.join(data_dir, f"{split}.lengths") + + with open(lengths_path, "r") as f: + for line in f: + sz = int(line.rstrip()) + assert ( + sz >= min_sample_size + ), f"Min sample size is not supported for binarized dataset, but found a sample with size {sz}" + self.sizes.append(sz) + + self.sizes = np.array(self.sizes, dtype=np.int64) + + self.set_bucket_info(num_buckets) + logger.info(f"loaded {len(self.fnames)} samples") + def __getitem__(self, index): import soundfile as sf - fname = os.path.join(self.root_dir, self.fnames[index]) + fname = self.fnames_dict.string(self.fnames[index], separator="") + if self.root_dir: + fname = 
os.path.join(self.root_dir, fname) + wav, curr_sample_rate = sf.read(fname) feats = torch.from_numpy(wav).float() feats = self.postprocess(feats, curr_sample_rate) - return {"id": index, "source": feats} + v = {"id": index, "source": feats} + + if self.is_compute_mask: + T = self._get_mask_indices_dims(feats.size(-1)) + mask = compute_block_mask_1d( + shape=(self.clone_batch, T), + mask_prob=self.mask_prob, + mask_length=self.mask_length, + mask_prob_adjust=self.mask_prob_adjust, + inverse_mask=self.inverse_mask, + require_same_masks=True, + expand_adjcent=self.expand_adjacent, + mask_dropout=self.mask_dropout, + non_overlapping=self.non_overlapping, + ) + + v["precomputed_mask"] = mask + + return v diff --git a/fairseq/data/audio/speech_to_speech_dataset.py b/fairseq/data/audio/speech_to_speech_dataset.py new file mode 100644 index 0000000000..fe4b61f831 --- /dev/null +++ b/fairseq/data/audio/speech_to_speech_dataset.py @@ -0,0 +1,379 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import torch + +from fairseq.data import ConcatDataset, Dictionary +from fairseq.data import data_utils as fairseq_data_utils +from fairseq.data.audio.audio_utils import get_features_or_waveform +from fairseq.data.audio.data_cfg import S2SDataConfig +from fairseq.data.audio.speech_to_text_dataset import ( + SpeechToTextDataset, + SpeechToTextDatasetCreator, + TextTargetMultitaskData, + _collate_frames, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class SpeechToSpeechDatasetItem(object): + index: int + source: torch.Tensor + target: Optional[torch.Tensor] = None + target_speaker: Optional[torch.Tensor] = None + tgt_lang_tag: Optional[int] = None + + +class SpeechToSpeechDataset(SpeechToTextDataset): + def __init__( + self, + split: str, + is_train_split: bool, + data_cfg: S2SDataConfig, + src_audio_paths: List[str], + src_n_frames: List[int], + tgt_audio_paths: List[str], + tgt_n_frames: List[int], + src_langs: Optional[List[str]] = None, + tgt_langs: Optional[List[str]] = None, + ids: Optional[List[str]] = None, + target_is_code: bool = False, + tgt_dict: Dictionary = None, + n_frames_per_step: int = 1, + ): + tgt_texts = tgt_audio_paths if target_is_code else None + super().__init__( + split=split, + is_train_split=is_train_split, + cfg=data_cfg, + audio_paths=src_audio_paths, + n_frames=src_n_frames, + ids=ids, + tgt_dict=tgt_dict, + tgt_texts=tgt_texts, + src_langs=src_langs, + tgt_langs=tgt_langs, + n_frames_per_step=n_frames_per_step, + ) + + self.tgt_audio_paths = tgt_audio_paths + self.tgt_lens = [t // self.n_frames_per_step for t in tgt_n_frames] + + assert not target_is_code or tgt_dict is not None + self.target_is_code = target_is_code + + assert len(tgt_audio_paths) == self.n_samples + assert len(tgt_n_frames) == self.n_samples + + self.tgt_speakers = None + if self.cfg.target_speaker_embed: + samples = SpeechToTextDatasetCreator._load_samples_from_tsv( + self.cfg.target_speaker_embed, split + ) + spk_emb_dict = {s["id"]: s["speaker_embed"] for s in samples} + self.tgt_speakers = [spk_emb_dict[id] for id in self.ids] + assert len(self.tgt_speakers) == self.n_samples + + logger.info(self.__repr__()) + + def pack_units(self, input: torch.Tensor) -> torch.Tensor: + if self.n_frames_per_step <= 1: + return input + + offset = 4 + 
vocab_size = ( + len(self.tgt_dict) - offset + ) # remove offset from <bos>, <pad>, <eos>, <unk>, which is specific to fairseq dictionary + + assert input.dim() == 1 + stacked_input = ( + input[:-1].view(-1, self.n_frames_per_step) - offset + ) # remove <eos> + scale = [ + pow(vocab_size, self.n_frames_per_step - 1 - i) + for i in range(self.n_frames_per_step) + ] + scale = torch.LongTensor(scale).squeeze(0) + res = input.new((len(input) - 1) // self.n_frames_per_step + 1).fill_(input[-1]) + res[:-1] = (stacked_input * scale).sum(dim=1) + offset + + return res + + def __getitem__(self, index: int) -> SpeechToSpeechDatasetItem: + source = self._get_source_audio(index) + + tgt_lang_tag = None + if self.cfg.prepend_tgt_lang_tag_as_bos: + # prepend_tgt_lang_tag_as_bos: put tgt_lang_tag as bos of target + tgt_lang_tag = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict) + + if not self.target_is_code: + target = get_features_or_waveform(self.tgt_audio_paths[index]) + target = torch.from_numpy(target).float() + target = self.pack_frames(target) + else: + target = self.tgt_dict.encode_line( + self.tgt_audio_paths[index], + add_if_not_exist=False, + append_eos=True, + ).long() + if self.n_frames_per_step > 1: + n_tgt_frame = target.size(0) - 1 # exclude <eos> + keep_n_tgt_frame = n_tgt_frame - n_tgt_frame % self.n_frames_per_step + target = torch.cat( + ( + target[:keep_n_tgt_frame], + target.new_full((1,), self.tgt_dict.eos()), + ), + dim=0, + ) + + if self.tgt_speakers: + tgt_spk = get_features_or_waveform(self.tgt_speakers[index]) + tgt_spk = torch.from_numpy(tgt_spk).float() + else: + tgt_spk = torch.FloatTensor([]) + + return SpeechToSpeechDatasetItem( + index=index, + source=source, + target=target, + target_speaker=tgt_spk, + tgt_lang_tag=tgt_lang_tag, + ) + + def _collate_target(self, samples: List[SpeechToSpeechDatasetItem]) -> torch.Tensor: + if self.target_is_code: + target = fairseq_data_utils.collate_tokens( + [x.target for x in samples], + self.tgt_dict.pad(), + self.tgt_dict.eos(), + left_pad=False, + move_eos_to_beginning=False, + ) + # convert stacked units to a single id + pack_targets = [self.pack_units(x.target) for x in samples] + prev_output_tokens = fairseq_data_utils.collate_tokens( + pack_targets, + self.tgt_dict.pad(), + self.tgt_dict.eos(), + left_pad=False, + move_eos_to_beginning=True, + ) + target_lengths = torch.tensor( + [x.size(0) for x in pack_targets], dtype=torch.long + ) + else: + target = _collate_frames([x.target for x in samples], is_audio_input=False) + bsz, _, d = target.size() + prev_output_tokens = torch.cat( + (target.new_full((bsz, 1, d), 0.0), target[:, :-1, :]), dim=1 + ) + target_lengths = torch.tensor( + [x.target.size(0) for x in samples], dtype=torch.long + ) + + return target, prev_output_tokens, target_lengths + + def collater( + self, samples: List[SpeechToSpeechDatasetItem], return_order: bool = False + ) -> Dict: + if len(samples) == 0: + return {} + indices = torch.tensor([x.index for x in samples], dtype=torch.long) + frames = _collate_frames([x.source for x in samples], self.cfg.use_audio_input) + # sort samples by descending number of frames + n_frames = torch.tensor([x.source.size(0) for x in samples], dtype=torch.long) + n_frames, order = n_frames.sort(descending=True) + indices = indices.index_select(0, order) + frames = frames.index_select(0, order) + + target, prev_output_tokens, target_lengths = self._collate_target(samples) + target = target.index_select(0, order) + target_lengths = target_lengths.index_select(0, order) + 
prev_output_tokens = prev_output_tokens.index_select(0, order) + ntokens = sum(x.target.size(0) for x in samples) + + tgt_speakers = None + if self.cfg.target_speaker_embed: + tgt_speakers = _collate_frames( + [x.target_speaker for x in samples], is_audio_input=True + ).index_select(0, order) + + net_input = { + "src_tokens": frames, + "src_lengths": n_frames, + "prev_output_tokens": prev_output_tokens, + "tgt_speaker": tgt_speakers, # TODO: unify "speaker" and "tgt_speaker" + } + if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None: + for i in range(len(samples)): + net_input["prev_output_tokens"][i][0] = samples[order[i]].tgt_lang_tag + out = { + "id": indices, + "net_input": net_input, + "speaker": tgt_speakers, # to support Tacotron2 loss for speech-to-spectrogram model + "target": target, + "target_lengths": target_lengths, + "ntokens": ntokens, + "nsentences": len(samples), + } + if return_order: + out["order"] = order + return out + + +class SpeechToSpeechMultitaskDataset(SpeechToSpeechDataset): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.multitask_data = {} + + def add_multitask_dataset(self, task_name, task_data): + self.multitask_data[task_name] = task_data + + def __getitem__( + self, index: int + ) -> Tuple[SpeechToSpeechDatasetItem, Dict[str, torch.Tensor]]: + s2s_data = super().__getitem__(index) + + multitask_target = {} + sample_id = self.ids[index] + tgt_lang = self.tgt_langs[index] + for task_name, task_dataset in self.multitask_data.items(): + multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang) + + return s2s_data, multitask_target + + def collater( + self, samples: List[Tuple[SpeechToSpeechDatasetItem, Dict[str, torch.Tensor]]] + ) -> Dict: + if len(samples) == 0: + return {} + + out = super().collater([s for s, _ in samples], return_order=True) + order = out["order"] + del out["order"] + + for task_name, task_dataset in self.multitask_data.items(): + if "multitask" not in out: + out["multitask"] = {} + d = [s[task_name] for _, s in samples] + task_target = task_dataset.collater(d) + out["multitask"][task_name] = { + "target": task_target["target"].index_select(0, order), + "target_lengths": task_target["target_lengths"].index_select(0, order), + "ntokens": task_target["ntokens"], + } + out["multitask"][task_name]["net_input"] = { + "prev_output_tokens": task_target["prev_output_tokens"].index_select( + 0, order + ), + } + + return out + + +class SpeechToSpeechDatasetCreator(object): + # mandatory columns + KEY_ID, KEY_SRC_AUDIO, KEY_SRC_N_FRAMES = "id", "src_audio", "src_n_frames" + KEY_TGT_AUDIO, KEY_TGT_N_FRAMES = "tgt_audio", "tgt_n_frames" + # optional columns + KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang" + # default values + DEFAULT_LANG = "" + + @classmethod + def _from_list( + cls, + split_name: str, + is_train_split, + samples: List[Dict], + data_cfg: S2SDataConfig, + target_is_code: bool = False, + tgt_dict: Dictionary = None, + n_frames_per_step: int = 1, + multitask: Optional[Dict] = None, + ) -> SpeechToSpeechDataset: + audio_root = Path(data_cfg.audio_root) + ids = [s[cls.KEY_ID] for s in samples] + src_audio_paths = [ + (audio_root / s[cls.KEY_SRC_AUDIO]).as_posix() for s in samples + ] + tgt_audio_paths = [ + s[cls.KEY_TGT_AUDIO] + if target_is_code + else (audio_root / s[cls.KEY_TGT_AUDIO]).as_posix() + for s in samples + ] + src_n_frames = [int(s[cls.KEY_SRC_N_FRAMES]) for s in samples] + tgt_n_frames = [int(s[cls.KEY_TGT_N_FRAMES]) for s in samples] + src_langs = [s.get(cls.KEY_SRC_LANG, 
cls.DEFAULT_LANG) for s in samples] + tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples] + + has_multitask = multitask is not None and len(multitask.keys()) > 0 + dataset_cls = ( + SpeechToSpeechMultitaskDataset if has_multitask else SpeechToSpeechDataset + ) + + ds = dataset_cls( + split=split_name, + is_train_split=is_train_split, + data_cfg=data_cfg, + src_audio_paths=src_audio_paths, + src_n_frames=src_n_frames, + tgt_audio_paths=tgt_audio_paths, + tgt_n_frames=tgt_n_frames, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + target_is_code=target_is_code, + tgt_dict=tgt_dict, + n_frames_per_step=n_frames_per_step, + ) + + if has_multitask: + for task_name, task_obj in multitask.items(): + task_data = TextTargetMultitaskData( + task_obj.args, split_name, task_obj.target_dictionary + ) + ds.add_multitask_dataset(task_name, task_data) + return ds + + @classmethod + def from_tsv( + cls, + root: str, + data_cfg: S2SDataConfig, + splits: str, + is_train_split: bool, + epoch: int, + seed: int, + target_is_code: bool = False, + tgt_dict: Dictionary = None, + n_frames_per_step: int = 1, + multitask: Optional[Dict] = None, + ) -> SpeechToSpeechDataset: + datasets = [] + for split in splits.split(","): + samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split) + ds = cls._from_list( + split_name=split, + is_train_split=is_train_split, + samples=samples, + data_cfg=data_cfg, + target_is_code=target_is_code, + tgt_dict=tgt_dict, + n_frames_per_step=n_frames_per_step, + multitask=multitask, + ) + datasets.append(ds) + return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0] diff --git a/fairseq/data/audio/speech_to_text_dataset.py b/fairseq/data/audio/speech_to_text_dataset.py index 39d22c7a5e..cdf71558fd 100644 --- a/fairseq/data/audio/speech_to_text_dataset.py +++ b/fairseq/data/audio/speech_to_text_dataset.py @@ -4,193 +4,34 @@ # LICENSE file in the root directory of this source tree. 
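For discrete-unit targets with n_frames_per_step greater than 1, SpeechToSpeechDataset.pack_units above folds every group of n_frames_per_step dictionary indices into a single id in base vocab_size, after removing the offset of the 4 reserved fairseq symbols (<bos>, <pad>, <eos>, <unk>). A minimal standalone sketch of that arithmetic; vocab_size=100, n_frames_per_step=2 and the example indices are assumptions used only for illustration:

    import torch

    # Sketch of the packing arithmetic in SpeechToSpeechDataset.pack_units;
    # the numbers in the usage example below are illustrative only.
    def pack_units_sketch(units, n_frames_per_step, vocab_size, offset=4):
        # units: 1-D LongTensor of dictionary indices, ending in <eos>
        stacked = units[:-1].view(-1, n_frames_per_step) - offset
        scale = torch.LongTensor(
            [vocab_size ** (n_frames_per_step - 1 - i) for i in range(n_frames_per_step)]
        )
        packed = units.new((len(units) - 1) // n_frames_per_step + 1).fill_(units[-1])
        packed[:-1] = (stacked * scale).sum(dim=1) + offset
        return packed

    # e.g. units = torch.LongTensor([14, 41, 2]) with 2 = <eos>, vocab_size=100, n_frames_per_step=2:
    # (14 - 4) * 100 + (41 - 4) = 1037, so the packed sequence is [1037 + 4, 2] = [1041, 2]

In the collater, prev_output_tokens is built from these packed ids while the unpacked sequence is kept as the training target.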
import csv -import io import logging -import os.path as op import re -from typing import Dict, List, Optional, Tuple +from argparse import Namespace +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch -from fairseq.data import ( - ConcatDataset, - Dictionary, - FairseqDataset, - ResamplingDataset, - data_utils as fairseq_data_utils, +import torch.nn.functional as F + +from fairseq.data import ConcatDataset, Dictionary, FairseqDataset, ResamplingDataset +from fairseq.data import data_utils as fairseq_data_utils +from fairseq.data import encoders +from fairseq.data.audio.audio_utils import get_features_or_waveform +from fairseq.data.audio.data_cfg import S2TDataConfig +from fairseq.data.audio.dataset_transforms import CompositeAudioDatasetTransform +from fairseq.data.audio.dataset_transforms.concataugment import ConcatAugment +from fairseq.data.audio.dataset_transforms.noisyoverlapaugment import ( + NoisyOverlapAugment, ) -from fairseq.data.audio.audio_utils import get_fbank, get_waveform from fairseq.data.audio.feature_transforms import CompositeAudioFeatureTransform - +from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform logger = logging.getLogger(__name__) -class S2TDataConfig(object): - """Wrapper class for data config YAML""" - - def __init__(self, yaml_path): - try: - import yaml - except ImportError: - print("Please install PyYAML to load YAML files for " "S2T data config") - self.config = {} - if op.isfile(yaml_path): - try: - with open(yaml_path) as f: - self.config = yaml.load(f, Loader=yaml.FullLoader) - except Exception as e: - logger.info(f"Failed to load config from {yaml_path}: {e}") - else: - logger.info(f"Cannot find {yaml_path}") - - @property - def vocab_filename(self): - """fairseq vocabulary file under data root""" - return self.config.get("vocab_filename", "dict.txt") - - @property - def shuffle(self) -> bool: - """Shuffle dataset samples before batching""" - return self.config.get("shuffle", False) - - @property - def pre_tokenizer(self) -> Dict: - """Pre-tokenizer to apply before subword tokenization. Returning - a dictionary with `tokenizer` providing the tokenizer name and - the other items providing the tokenizer-specific arguments. - Tokenizers are defined in `fairseq.data.encoders.*`""" - return self.config.get("pre_tokenizer", {"tokenizer": None}) - - @property - def bpe_tokenizer(self) -> Dict: - """Subword tokenizer to apply after pre-tokenization. Returning - a dictionary with `bpe` providing the tokenizer name and - the other items providing the tokenizer-specific arguments. - Tokenizers are defined in `fairseq.data.encoders.*`""" - return self.config.get("bpe_tokenizer", {"bpe": None}) - - @property - def prepend_tgt_lang_tag(self) -> bool: - """Prepend target lang ID token as the target BOS (e.g. for to-many - multilingual setting). During inference, this requires `--prefix-size 1` - to force BOS to be lang ID token.""" - return self.config.get("prepend_tgt_lang_tag", False) - - @property - def input_feat_per_channel(self): - """The dimension of input features (per audio channel)""" - return self.config.get("input_feat_per_channel", 80) - - @property - def input_channels(self): - """The number of channels in the input audio""" - return self.config.get("input_channels", 1) - - @property - def sampling_alpha(self): - """Hyper-parameter alpha = 1/T for temperature-based resampling. 
- (alpha = 1 for no resampling)""" - return self.config.get("sampling_alpha", 1.0) - - @property - def use_audio_input(self): - """Needed by the dataset loader to see if the model requires - raw audio as inputs.""" - return self.config.get("use_audio_input", False) - - @property - def audio_root(self): - """Audio paths in the manifest TSV can be relative and this provides - the root path. Set this to empty string when using absolute paths.""" - return self.config.get("audio_root", "") - - def get_feature_transforms(self, split, is_train): - """Split-specific feature transforms. Allowing train set wildcard `_train`, - evaluation set wildcard `_eval` and general wildcard `*` for matching.""" - from copy import deepcopy - - cfg = deepcopy(self.config) - _cur = cfg.get("transforms", {}) - cur = _cur.get(split) - cur = _cur.get("_train") if cur is None and is_train else cur - cur = _cur.get("_eval") if cur is None and not is_train else cur - cur = _cur.get("*") if cur is None else cur - cfg["transforms"] = cur - return cfg - - -def is_npy_data(data: bytes) -> bool: - return data[0] == 147 and data[1] == 78 - - -def is_flac_or_wav_data(data: bytes) -> bool: - is_flac = data[0] == 102 and data[1] == 76 - is_wav = data[0] == 82 and data[1] == 73 - return is_flac or is_wav - - -def read_from_uncompressed_zip(file_path, offset, file_size) -> bytes: - with open(file_path, "rb") as f: - f.seek(offset) - data = f.read(file_size) - return data - - -def get_features_from_npy_or_audio(path): - ext = op.splitext(op.basename(path))[1] - if ext not in {".npy", ".flac", ".wav"}: - raise ValueError(f'Unsupported file format for "{path}"') - return np.load(path) if ext == ".npy" else get_fbank(path) - - -def get_features_or_waveform_from_uncompressed_zip( - path, byte_offset, byte_size, need_waveform=False -): - assert path.endswith(".zip") - data = read_from_uncompressed_zip(path, byte_offset, byte_size) - f = io.BytesIO(data) - if is_npy_data(data): - features_or_waveform = np.load(f) - elif is_flac_or_wav_data(data): - features_or_waveform = get_waveform(f)[0] if need_waveform else get_fbank(f) - else: - raise ValueError(f'Unknown file format for "{path}"') - return features_or_waveform - - -def get_features_or_waveform(path: str, need_waveform=False): - """Get speech features from .npy file or waveform from .wav/.flac file. - The file may be inside an uncompressed ZIP file and is accessed via byte - offset and length. - - Args: - path (str): File path in the format of "<.npy/.wav/.flac path>" or - "<zip path>:<byte offset>:<byte length>". - need_waveform (bool): return waveform instead of features. - - Returns: - features_or_waveform (numpy.ndarray): speech features or waveform. 
- """ - _path, *extra = path.split(":") - if not op.exists(_path): - raise FileNotFoundError(f"File not found: {_path}") - - if len(extra) == 0: - if need_waveform: - return get_waveform(_path) - return get_features_from_npy_or_audio(_path) - elif len(extra) == 2: - extra = [int(i) for i in extra] - features_or_waveform = get_features_or_waveform_from_uncompressed_zip( - _path, extra[0], extra[1], need_waveform=need_waveform - ) - else: - raise ValueError(f"Invalid path: {path}") - - return features_or_waveform - - def _collate_frames( frames: List[torch.Tensor], is_audio_input: bool = False ) -> torch.Tensor: @@ -212,6 +53,20 @@ def _collate_frames( return out +def _is_int_or_np_int(n): + return isinstance(n, int) or ( + isinstance(n, np.generic) and isinstance(n.item(), int) + ) + + +@dataclass +class SpeechToTextDatasetItem(object): + index: int + source: torch.Tensor + target: Optional[torch.Tensor] = None + speaker_id: Optional[int] = None + + class SpeechToTextDataset(FairseqDataset): LANG_TAG_TEMPLATE = "<lang:{}>" @@ -219,7 +74,7 @@ def __init__( self, split: str, is_train_split: bool, - data_cfg: S2TDataConfig, + cfg: S2TDataConfig, audio_paths: List[str], n_frames: List[int], src_texts: Optional[List[str]] = None, @@ -231,9 +86,12 @@ def __init__( tgt_dict: Optional[Dictionary] = None, pre_tokenizer=None, bpe_tokenizer=None, + n_frames_per_step=1, + speaker_to_id=None, + append_eos=True, ): self.split, self.is_train_split = split, is_train_split - self.data_cfg = data_cfg + self.cfg = cfg self.audio_paths, self.n_frames = audio_paths, n_frames self.n_samples = len(audio_paths) assert len(n_frames) == self.n_samples > 0 @@ -248,26 +106,68 @@ def __init__( ) self.src_texts, self.tgt_texts = src_texts, tgt_texts self.src_langs, self.tgt_langs = src_langs, tgt_langs + self.speakers = speakers self.tgt_dict = tgt_dict self.check_tgt_lang_tag() self.ids = ids - self.shuffle = data_cfg.shuffle if is_train_split else False + self.shuffle = cfg.shuffle if is_train_split else False self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict( - self.data_cfg.get_feature_transforms(split, is_train_split) + self.cfg.get_feature_transforms(split, is_train_split) + ) + self.waveform_transforms = CompositeAudioWaveformTransform.from_config_dict( + self.cfg.get_waveform_transforms(split, is_train_split) + ) + # TODO: add these to data_cfg.py + self.dataset_transforms = CompositeAudioDatasetTransform.from_config_dict( + self.cfg.get_dataset_transforms(split, is_train_split) ) + # check proper usage of transforms + if self.feature_transforms and self.cfg.use_audio_input: + logger.warning( + "Feature transforms will not be applied. To use feature transforms, " + "set use_audio_input as False in config." 
+ ) + self.pre_tokenizer = pre_tokenizer self.bpe_tokenizer = bpe_tokenizer + self.n_frames_per_step = n_frames_per_step + self.speaker_to_id = speaker_to_id + + self.tgt_lens = self.get_tgt_lens_and_check_oov() + self.append_eos = append_eos logger.info(self.__repr__()) + def get_tgt_lens_and_check_oov(self): + if self.tgt_texts is None: + return [0 for _ in range(self.n_samples)] + tgt_lens = [] + n_tokens, n_oov_tokens = 0, 0 + for i in range(self.n_samples): + tokenized = self.get_tokenized_tgt_text(i).split(" ") + oov_tokens = [ + t + for t in tokenized + if self.tgt_dict.index(t) == self.tgt_dict.unk_index + ] + n_tokens += len(tokenized) + n_oov_tokens += len(oov_tokens) + tgt_lens.append(len(tokenized)) + logger.info(f"'{self.split}' has {n_oov_tokens / n_tokens * 100:.2f}% OOV") + return tgt_lens + def __repr__(self): return ( self.__class__.__name__ - + f'(split="{self.split}", n_samples={self.n_samples}, ' - f"prepend_tgt_lang_tag={self.data_cfg.prepend_tgt_lang_tag}, " - f"shuffle={self.shuffle}, transforms={self.feature_transforms})" + + f'(split="{self.split}", n_samples={self.n_samples:_}, ' + f"prepend_tgt_lang_tag={self.cfg.prepend_tgt_lang_tag}, " + f"n_frames_per_step={self.n_frames_per_step}, " + f"shuffle={self.shuffle}, " + f"feature_transforms={self.feature_transforms}, " + f"waveform_transforms={self.waveform_transforms}, " + f"dataset_transforms={self.dataset_transforms})" ) @classmethod @@ -276,55 +176,136 @@ def is_lang_tag(cls, token): return re.match(pattern, token) def check_tgt_lang_tag(self): - if self.data_cfg.prepend_tgt_lang_tag: + if self.cfg.prepend_tgt_lang_tag: assert self.tgt_langs is not None and self.tgt_dict is not None tgt_lang_tags = [ self.LANG_TAG_TEMPLATE.format(t) for t in set(self.tgt_langs) ] assert all(t in self.tgt_dict for t in tgt_lang_tags) - def tokenize_text(self, text: str): - if self.pre_tokenizer is not None: - text = self.pre_tokenizer.encode(text) - if self.bpe_tokenizer is not None: - text = self.bpe_tokenizer.encode(text) + @classmethod + def tokenize(cls, tokenizer, text: str): + return text if tokenizer is None else tokenizer.encode(text) + + def get_tokenized_tgt_text(self, index: Union[int, List[int]]): + if _is_int_or_np_int(index): + text = self.tgt_texts[index] + else: + text = " ".join([self.tgt_texts[i] for i in index]) + + text = self.tokenize(self.pre_tokenizer, text) + text = self.tokenize(self.bpe_tokenizer, text) return text - def __getitem__( - self, index: int - ) -> Tuple[int, torch.Tensor, Optional[torch.Tensor]]: - source = get_features_or_waveform( - self.audio_paths[index], need_waveform=self.data_cfg.use_audio_input - ) - if self.feature_transforms is not None: - assert not self.data_cfg.use_audio_input - source = self.feature_transforms(source) - source = torch.from_numpy(source).float() + def pack_frames(self, feature: torch.Tensor): + if self.n_frames_per_step == 1: + return feature + n_packed_frames = feature.shape[0] // self.n_frames_per_step + feature = feature[: self.n_frames_per_step * n_packed_frames] + return feature.reshape(n_packed_frames, -1) + + @classmethod + def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary): + lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang)) + assert lang_tag_idx != dictionary.unk() + return lang_tag_idx + + def _get_source_audio(self, index: Union[int, List[int]]) -> torch.Tensor: + """ + Gives source audio for given index with any relevant transforms + applied. 
For ConcatAug, source audios for given indices are + concatenated in given order. + Args: + index (int or List[int]): index—or in the case of ConcatAug, + indices—to pull the source audio for + Returns: + source audios concatenated for given indices with + relevant transforms appplied + """ + if _is_int_or_np_int(index): + source = get_features_or_waveform( + self.audio_paths[index], + need_waveform=self.cfg.use_audio_input, + use_sample_rate=self.cfg.use_sample_rate, + waveform_transforms=self.waveform_transforms, + ) + else: + source = np.concatenate( + [ + get_features_or_waveform( + self.audio_paths[i], + need_waveform=self.cfg.use_audio_input, + use_sample_rate=self.cfg.use_sample_rate, + waveform_transforms=self.waveform_transforms, + ) + for i in index + ] + ) + if self.cfg.use_audio_input: + source = torch.from_numpy(source).float() + if self.cfg.standardize_audio: + with torch.no_grad(): + source = F.layer_norm(source, source.shape) + else: + if self.feature_transforms is not None: + source = self.feature_transforms(source) + source = torch.from_numpy(source).float() + return source + + def __getitem__(self, index: int) -> SpeechToTextDatasetItem: + has_concat = self.dataset_transforms.has_transform(ConcatAugment) + if has_concat: + concat = self.dataset_transforms.get_transform(ConcatAugment) + indices = concat.find_indices(index, self.n_frames, self.n_samples) + + source = self._get_source_audio(indices if has_concat else index) + source = self.pack_frames(source) target = None if self.tgt_texts is not None: - tokenized = self.tokenize_text(self.tgt_texts[index]) + tokenized = self.get_tokenized_tgt_text(indices if has_concat else index) target = self.tgt_dict.encode_line( - tokenized, add_if_not_exist=False, append_eos=True + tokenized, add_if_not_exist=False, append_eos=self.append_eos ).long() - if self.data_cfg.prepend_tgt_lang_tag: - lang_tag = self.LANG_TAG_TEMPLATE.format(self.tgt_langs[index]) - lang_tag_idx = self.tgt_dict.index(lang_tag) + if self.cfg.prepend_tgt_lang_tag: + lang_tag_idx = self.get_lang_tag_idx( + self.tgt_langs[index], self.tgt_dict + ) target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0) - return index, source, target + + if self.cfg.prepend_bos_and_append_tgt_lang_tag: + bos = torch.LongTensor([self.tgt_dict.bos()]) + lang_tag_idx = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict) + assert lang_tag_idx != self.tgt_dict.unk() + lang_tag_idx = torch.LongTensor([lang_tag_idx]) + target = torch.cat((bos, target, lang_tag_idx), 0) + + speaker_id = None + if self.speaker_to_id is not None: + speaker_id = self.speaker_to_id[self.speakers[index]] + return SpeechToTextDatasetItem( + index=index, source=source, target=target, speaker_id=speaker_id + ) def __len__(self): return self.n_samples - def collater(self, samples: List[Tuple[int, torch.Tensor, torch.Tensor]]) -> Dict: + def collater( + self, samples: List[SpeechToTextDatasetItem], return_order: bool = False + ) -> Dict: if len(samples) == 0: return {} - indices = torch.tensor([i for i, _, _ in samples], dtype=torch.long) - frames = _collate_frames( - [s for _, s, _ in samples], self.data_cfg.use_audio_input - ) + indices = torch.tensor([x.index for x in samples], dtype=torch.long) + + sources = [x.source for x in samples] + has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment) + if has_NOAug and self.cfg.use_audio_input: + NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment) + sources = NOAug(sources) + + frames = _collate_frames(sources, 
self.cfg.use_audio_input) # sort samples by descending number of frames - n_frames = torch.tensor([s.size(0) for _, s, _ in samples], dtype=torch.long) + n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long) n_frames, order = n_frames.sort(descending=True) indices = indices.index_select(0, order) frames = frames.index_select(0, order) @@ -334,7 +315,7 @@ def collater(self, samples: List[Tuple[int, torch.Tensor, torch.Tensor]]) -> Dic ntokens = None if self.tgt_texts is not None: target = fairseq_data_utils.collate_tokens( - [t for _, _, t in samples], + [x.target for x in samples], self.tgt_dict.pad(), self.tgt_dict.eos(), left_pad=False, @@ -342,41 +323,49 @@ def collater(self, samples: List[Tuple[int, torch.Tensor, torch.Tensor]]) -> Dic ) target = target.index_select(0, order) target_lengths = torch.tensor( - [t.size(0) for _, _, t in samples], dtype=torch.long + [x.target.size(0) for x in samples], dtype=torch.long ).index_select(0, order) prev_output_tokens = fairseq_data_utils.collate_tokens( - [t for _, _, t in samples], + [x.target for x in samples], self.tgt_dict.pad(), - self.tgt_dict.eos(), + eos_idx=None, left_pad=False, move_eos_to_beginning=True, ) prev_output_tokens = prev_output_tokens.index_select(0, order) - ntokens = sum(t.size(0) for _, _, t in samples) + ntokens = sum(x.target.size(0) for x in samples) + + speaker = None + if self.speaker_to_id is not None: + speaker = ( + torch.tensor([s.speaker_id for s in samples], dtype=torch.long) + .index_select(0, order) + .view(-1, 1) + ) + net_input = { + "src_tokens": frames, + "src_lengths": n_frames, + "prev_output_tokens": prev_output_tokens, + } out = { "id": indices, - "net_input": { - "src_tokens": frames, - "src_lengths": n_frames, - "prev_output_tokens": prev_output_tokens, - }, + "net_input": net_input, + "speaker": speaker, "target": target, "target_lengths": target_lengths, "ntokens": ntokens, "nsentences": len(samples), } + if return_order: + out["order"] = order return out def num_tokens(self, index): return self.n_frames[index] def size(self, index): - t_len = 0 - if self.tgt_texts is not None: - tokenized = self.tokenize_text(self.tgt_texts[index]) - t_len = len(tokenized.split(" ")) - return self.n_frames[index], t_len + return self.n_frames[index], self.tgt_lens[index] @property def sizes(self): @@ -399,6 +388,160 @@ def prefetch(self, indices): raise False +class TextTargetMultitaskData(object): + # mandatory columns + KEY_ID, KEY_TEXT = "id", "tgt_text" + LANG_TAG_TEMPLATE = "<lang:{}>" + + def __init__(self, args, split, tgt_dict): + samples = SpeechToTextDatasetCreator._load_samples_from_tsv(args.data, split) + self.data = {s[self.KEY_ID]: s[self.KEY_TEXT] for s in samples} + self.dict = tgt_dict + self.append_eos = args.decoder_type != "ctc" + self.pre_tokenizer = self.build_tokenizer(args) + self.bpe_tokenizer = self.build_bpe(args) + self.prepend_bos_and_append_tgt_lang_tag = ( + args.prepend_bos_and_append_tgt_lang_tag + ) + self.eos_token = args.eos_token + self.lang_tag_mapping = args.get_lang_tag_mapping + + @classmethod + def is_lang_tag(cls, token): + pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)") + return re.match(pattern, token) + + @classmethod + def tokenize(cls, tokenizer, text: str): + return text if tokenizer is None else tokenizer.encode(text) + + def get_tokenized_tgt_text(self, index: int): + text = self.tokenize(self.pre_tokenizer, self.data[index]) + text = self.tokenize(self.bpe_tokenizer, text) + return text + + def get_lang_tag_idx(self, lang: str, 
dictionary: Dictionary): + lang_tag = self.LANG_TAG_TEMPLATE.format(lang) + lang_tag = self.lang_tag_mapping.get(lang_tag, lang_tag) + lang_tag_idx = dictionary.index(lang_tag) + assert lang_tag_idx != dictionary.unk(), (lang, lang_tag) + return lang_tag_idx + + def build_tokenizer(self, args): + pre_tokenizer = args.config.get("pre_tokenizer") + if pre_tokenizer is not None: + logger.info(f"pre-tokenizer: {pre_tokenizer}") + return encoders.build_tokenizer(Namespace(**pre_tokenizer)) + else: + return None + + def build_bpe(self, args): + bpe_tokenizer = args.config.get("bpe_tokenizer") + if bpe_tokenizer is not None: + logger.info(f"tokenizer: {bpe_tokenizer}") + return encoders.build_bpe(Namespace(**bpe_tokenizer)) + else: + return None + + def get(self, sample_id, tgt_lang=None): + if sample_id in self.data: + tokenized = self.get_tokenized_tgt_text(sample_id) + target = self.dict.encode_line( + tokenized, + add_if_not_exist=False, + append_eos=self.append_eos, + ) + if self.prepend_bos_and_append_tgt_lang_tag: + bos = torch.LongTensor([self.dict.bos()]) + lang_tag_idx = self.get_lang_tag_idx(tgt_lang, self.dict) + assert lang_tag_idx != self.dict.unk() + lang_tag_idx = torch.LongTensor([lang_tag_idx]) + target = torch.cat((bos, target, lang_tag_idx), 0) + return target + else: + logger.warning(f"no target for {sample_id}") + return torch.IntTensor([]) + + def collater(self, samples: List[torch.Tensor]) -> torch.Tensor: + out = fairseq_data_utils.collate_tokens( + samples, + self.dict.pad(), + eos_idx=None, + left_pad=False, + move_eos_to_beginning=False, + ).long() + + prev_out = fairseq_data_utils.collate_tokens( + samples, + self.dict.pad(), + eos_idx=None, + left_pad=False, + move_eos_to_beginning=True, + ).long() + + target_lengths = torch.tensor([t.size(0) for t in samples], dtype=torch.long) + ntokens = sum(t.size(0) for t in samples) + + output = { + "prev_output_tokens": prev_out, + "target": out, + "target_lengths": target_lengths, + "ntokens": ntokens, + } + + return output + + +class SpeechToTextMultitaskDataset(SpeechToTextDataset): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.multitask_data = {} + + def add_multitask_dataset(self, task_name, task_data): + self.multitask_data[task_name] = task_data + + def __getitem__( + self, index: int + ) -> Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]: + s2t_data = super().__getitem__(index) + + multitask_target = {} + sample_id = self.ids[index] + tgt_lang = self.tgt_langs[index] + for task_name, task_dataset in self.multitask_data.items(): + multitask_target[task_name] = task_dataset.get(sample_id, tgt_lang) + + return s2t_data, multitask_target + + def collater( + self, samples: List[Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]] + ) -> Dict: + if len(samples) == 0: + return {} + + out = super().collater([s for s, _ in samples], return_order=True) + order = out["order"] + del out["order"] + + for task_name, task_dataset in self.multitask_data.items(): + if "multitask" not in out: + out["multitask"] = {} + d = [s[task_name] for _, s in samples] + task_target = task_dataset.collater(d) + out["multitask"][task_name] = { + "target": task_target["target"].index_select(0, order), + "target_lengths": task_target["target_lengths"].index_select(0, order), + "ntokens": task_target["ntokens"], + } + out["multitask"][task_name]["net_input"] = { + "prev_output_tokens": task_target["prev_output_tokens"].index_select( + 0, order + ), + } + + return out + + class SpeechToTextDatasetCreator(object): # 
mandatory columns KEY_ID, KEY_AUDIO, KEY_N_FRAMES = "id", "audio", "n_frames" @@ -414,67 +557,142 @@ def _from_list( cls, split_name: str, is_train_split, - samples: List[List[Dict]], - data_cfg: S2TDataConfig, + samples: List[Dict], + cfg: S2TDataConfig, tgt_dict, pre_tokenizer, bpe_tokenizer, + n_frames_per_step, + speaker_to_id, + multitask: Optional[Dict] = None, ) -> SpeechToTextDataset: - audio_paths, n_frames, src_texts, tgt_texts, ids = [], [], [], [], [] - speakers, src_langs, tgt_langs = [], [], [] - for s in samples: - ids.extend([ss[cls.KEY_ID] for ss in s]) - audio_paths.extend( - [op.join(data_cfg.audio_root, ss[cls.KEY_AUDIO]) for ss in s] - ) - n_frames.extend([int(ss[cls.KEY_N_FRAMES]) for ss in s]) - tgt_texts.extend([ss[cls.KEY_TGT_TEXT] for ss in s]) - src_texts.extend( - [ss.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for ss in s] + audio_root = Path(cfg.audio_root) + ids = [s[cls.KEY_ID] for s in samples] + audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples] + n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples] + tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples] + src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples] + speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples] + src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples] + tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples] + + has_multitask = multitask is not None and len(multitask.keys()) > 0 + dataset_cls = ( + SpeechToTextMultitaskDataset if has_multitask else SpeechToTextDataset + ) + + ds = dataset_cls( + split=split_name, + is_train_split=is_train_split, + cfg=cfg, + audio_paths=audio_paths, + n_frames=n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + n_frames_per_step=n_frames_per_step, + speaker_to_id=speaker_to_id, + ) + + if has_multitask: + for task_name, task_obj in multitask.items(): + task_data = TextTargetMultitaskData( + task_obj.args, split_name, task_obj.target_dictionary + ) + ds.add_multitask_dataset(task_name, task_data) + return ds + + @classmethod + def get_size_ratios( + cls, datasets: List[SpeechToTextDataset], alpha: float = 1.0 + ) -> List[float]: + """Size ratios for temperature-based sampling + (https://arxiv.org/abs/1907.05019)""" + + id_to_lp, lp_to_sz = {}, defaultdict(int) + for ds in datasets: + lang_pairs = {f"{s}->{t}" for s, t in zip(ds.src_langs, ds.tgt_langs)} + assert len(lang_pairs) == 1 + lang_pair = list(lang_pairs)[0] + id_to_lp[ds.split] = lang_pair + lp_to_sz[lang_pair] += sum(ds.n_frames) + + sz_sum = sum(v for v in lp_to_sz.values()) + lp_to_prob = {k: v / sz_sum for k, v in lp_to_sz.items()} + lp_to_tgt_prob = {k: v**alpha for k, v in lp_to_prob.items()} + prob_sum = sum(v for v in lp_to_tgt_prob.values()) + lp_to_tgt_prob = {k: v / prob_sum for k, v in lp_to_tgt_prob.items()} + lp_to_sz_ratio = { + k: (lp_to_tgt_prob[k] * sz_sum) / v for k, v in lp_to_sz.items() + } + size_ratio = [lp_to_sz_ratio[id_to_lp[ds.split]] for ds in datasets] + + p_formatted = { + k: f"{lp_to_prob[k]:.3f}->{lp_to_tgt_prob[k]:.3f}" for k in lp_to_sz + } + logger.info(f"sampling probability balancing: {p_formatted}") + sr_formatted = {ds.split: f"{r:.3f}" for ds, r in zip(datasets, size_ratio)} + logger.info(f"balanced sampling size ratio: {sr_formatted}") + return size_ratio + + @classmethod + def 
_load_samples_from_tsv(cls, root: str, split: str): + tsv_path = Path(root) / f"{split}.tsv" + if not tsv_path.is_file(): + raise FileNotFoundError(f"Dataset not found: {tsv_path}") + with open(tsv_path) as f: + reader = csv.DictReader( + f, + delimiter="\t", + quotechar=None, + doublequote=False, + lineterminator="\n", + quoting=csv.QUOTE_NONE, ) - speakers.extend([ss.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for ss in s]) - src_langs.extend([ss.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for ss in s]) - tgt_langs.extend([ss.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for ss in s]) - return SpeechToTextDataset( - split_name, + samples = [dict(e) for e in reader] + if len(samples) == 0: + raise ValueError(f"Empty manifest: {tsv_path}") + return samples + + @classmethod + def _from_tsv( + cls, + root: str, + cfg: S2TDataConfig, + split: str, + tgt_dict, + is_train_split: bool, + pre_tokenizer, + bpe_tokenizer, + n_frames_per_step, + speaker_to_id, + multitask: Optional[Dict] = None, + ) -> SpeechToTextDataset: + samples = cls._load_samples_from_tsv(root, split) + return cls._from_list( + split, is_train_split, - data_cfg, - audio_paths, - n_frames, - src_texts, - tgt_texts, - speakers, - src_langs, - tgt_langs, - ids, + samples, + cfg, tgt_dict, pre_tokenizer, bpe_tokenizer, + n_frames_per_step, + speaker_to_id, + multitask, ) - @classmethod - def _get_size_ratios(cls, ids: List[str], sizes: List[int], alpha: float = 1.0): - """Size ratios for temperature-based sampling - (https://arxiv.org/abs/1907.05019)""" - _sizes = np.array(sizes) - prob = _sizes / _sizes.sum() - smoothed_prob = prob ** alpha - smoothed_prob = smoothed_prob / smoothed_prob.sum() - size_ratio = (smoothed_prob * _sizes.sum()) / _sizes - - o_str = str({_i: f"{prob[i]:.3f}" for i, _i in enumerate(ids)}) - logger.info(f"original sampling probability: {o_str}") - p_str = str({_i: f"{smoothed_prob[i]:.3f}" for i, _i in enumerate(ids)}) - logger.info(f"balanced sampling probability: {p_str}") - sr_str = str({_id: f"{size_ratio[i]:.3f}" for i, _id in enumerate(ids)}) - logger.info(f"balanced sampling size ratio: {sr_str}") - return size_ratio.tolist() - @classmethod def from_tsv( cls, root: str, - data_cfg: S2TDataConfig, + cfg: S2TDataConfig, splits: str, tgt_dict, pre_tokenizer, @@ -482,47 +700,34 @@ def from_tsv( is_train_split: bool, epoch: int, seed: int, + n_frames_per_step: int = 1, + speaker_to_id=None, + multitask: Optional[Dict] = None, ) -> SpeechToTextDataset: - samples = [] - _splits = splits.split(",") - for split in _splits: - tsv_path = op.join(root, f"{split}.tsv") - if not op.isfile(tsv_path): - raise FileNotFoundError(f"Dataset not found: {tsv_path}") - with open(tsv_path) as f: - reader = csv.DictReader( - f, - delimiter="\t", - quotechar=None, - doublequote=False, - lineterminator="\n", - quoting=csv.QUOTE_NONE, - ) - samples.append([dict(e) for e in reader]) - assert len(samples) > 0 - datasets = [ - cls._from_list( - name, - is_train_split, - [s], - data_cfg, - tgt_dict, - pre_tokenizer, - bpe_tokenizer, + cls._from_tsv( + root=root, + cfg=cfg, + split=split, + tgt_dict=tgt_dict, + is_train_split=is_train_split, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + n_frames_per_step=n_frames_per_step, + speaker_to_id=speaker_to_id, + multitask=multitask, ) - for name, s in zip(_splits, samples) + for split in splits.split(",") ] - if is_train_split and len(_splits) > 1 and data_cfg.sampling_alpha != 1.0: + if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0: # temperature-based 
sampling - size_ratios = cls._get_size_ratios( - _splits, [len(s) for s in samples], alpha=data_cfg.sampling_alpha - ) + size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha) datasets = [ ResamplingDataset( d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0) ) - for d, r in zip(datasets, size_ratios) + for r, d in zip(size_ratios, datasets) ] - return ConcatDataset(datasets) + + return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0] diff --git a/fairseq/data/audio/speech_to_text_joint_dataset.py b/fairseq/data/audio/speech_to_text_joint_dataset.py new file mode 100644 index 0000000000..06922ea083 --- /dev/null +++ b/fairseq/data/audio/speech_to_text_joint_dataset.py @@ -0,0 +1,359 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from pathlib import Path +from typing import Dict, List, NamedTuple, Optional + +import torch + +from fairseq.data import ConcatDataset, Dictionary, ResamplingDataset +from fairseq.data import data_utils as fairseq_data_utils +from fairseq.data.audio.speech_to_text_dataset import ( + S2TDataConfig, + SpeechToTextDataset, + SpeechToTextDatasetCreator, +) + +logger = logging.getLogger(__name__) + + +class S2TJointDataConfig(S2TDataConfig): + """Wrapper class for data config YAML""" + + @property + def src_vocab_filename(self): + """fairseq vocabulary file under data root""" + return self.config.get("src_vocab_filename", "src_dict.txt") + + @property + def src_pre_tokenizer(self) -> Dict: + """Pre-tokenizer to apply before subword tokenization. Returning + a dictionary with `tokenizer` providing the tokenizer name and + the other items providing the tokenizer-specific arguments. + Tokenizers are defined in `fairseq.data.encoders.*`""" + return self.config.get("src_pre_tokenizer", {"tokenizer": None}) + + @property + def src_bpe_tokenizer(self) -> Dict: + """Subword tokenizer to apply on source text after pre-tokenization. + Returning a dictionary with `bpe` providing the tokenizer name and + the other items providing the tokenizer-specific arguments. + Tokenizers are defined in `fairseq.data.encoders.*`""" + return self.config.get("src_bpe_tokenizer", {"bpe": None}) + + @property + def prepend_tgt_lang_tag_no_change(self) -> bool: + """Prepend target lang ID token as the prev_output_tokens BOS (e.g. for + to-many multilingual setting). No change needed during inference. + This option is deprecated and replaced by prepend_tgt_lang_tag_as_bos. + """ + value = self.config.get("prepend_tgt_lang_tag_no_change", None) + if value is None: + return self.config.get("prepend_tgt_lang_tag_as_bos", False) + return value + + @property + def sampling_text_alpha(self): + """Hyper-parameter alpha = 1/T for temperature-based resampling. 
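+        With temperature-based sampling, a dataset's target probability is
+        proportional to its size raised to the power alpha.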
(text + input only) (alpha = 1 for no resampling)""" + return self.config.get("sampling_text_alpha", 1.0) + + +class SpeechToTextJointDatasetItem(NamedTuple): + index: int + source: torch.Tensor + target: Optional[torch.Tensor] = None + src_txt_tokens: Optional[torch.Tensor] = None + tgt_lang_tag: Optional[int] = None + src_lang_tag: Optional[int] = None + tgt_alignment: Optional[torch.Tensor] = None + + +# use_src_lang_id: +# 0: don't use src_lang_id +# 1: attach src_lang_id to the src_txt_tokens as eos +class SpeechToTextJointDataset(SpeechToTextDataset): + def __init__( + self, + split: str, + is_train_split: bool, + cfg: S2TJointDataConfig, + audio_paths: List[str], + n_frames: List[int], + src_texts: Optional[List[str]] = None, + tgt_texts: Optional[List[str]] = None, + speakers: Optional[List[str]] = None, + src_langs: Optional[List[str]] = None, + tgt_langs: Optional[List[str]] = None, + ids: Optional[List[str]] = None, + tgt_dict: Optional[Dictionary] = None, + src_dict: Optional[Dictionary] = None, + pre_tokenizer=None, + bpe_tokenizer=None, + src_pre_tokenizer=None, + src_bpe_tokenizer=None, + append_eos: Optional[bool] = True, + alignment: Optional[List[str]] = None, + use_src_lang_id: Optional[int] = 0, + ): + super().__init__( + split, + is_train_split, + cfg, + audio_paths, + n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + append_eos=append_eos, + ) + + self.src_dict = src_dict + self.src_pre_tokenizer = src_pre_tokenizer + self.src_bpe_tokenizer = src_bpe_tokenizer + self.alignment = None + self.use_src_lang_id = use_src_lang_id + if alignment is not None: + self.alignment = [ + [float(s) for s in sample.split()] for sample in alignment + ] + + def get_tokenized_src_text(self, index: int): + text = self.tokenize(self.src_pre_tokenizer, self.src_texts[index]) + text = self.tokenize(self.src_bpe_tokenizer, text) + return text + + def __getitem__(self, index: int) -> SpeechToTextJointDatasetItem: + s2t_dataset_item = super().__getitem__(index) + src_tokens = None + src_lang_tag = None + if self.src_texts is not None and self.src_dict is not None: + src_tokens = self.get_tokenized_src_text(index) + src_tokens = self.src_dict.encode_line( + src_tokens, add_if_not_exist=False, append_eos=True + ).long() + if self.use_src_lang_id > 0: + src_lang_tag = self.get_lang_tag_idx( + self.src_langs[index], self.src_dict + ) + tgt_lang_tag = None + if self.cfg.prepend_tgt_lang_tag_no_change: + # prepend_tgt_lang_tag_no_change: modify prev_output_tokens instead + tgt_lang_tag = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict) + ali = None + if self.alignment is not None: + ali = torch.Tensor(self.alignment[index]).float() + + return SpeechToTextJointDatasetItem( + index=index, + source=s2t_dataset_item.source, + target=s2t_dataset_item.target, + src_txt_tokens=src_tokens, + tgt_lang_tag=tgt_lang_tag, + src_lang_tag=src_lang_tag, + tgt_alignment=ali, + ) + + def __len__(self): + return self.n_samples + + def collater(self, samples: List[SpeechToTextJointDatasetItem]) -> Dict: + s2t_out = super().collater(samples, return_order=True) + if s2t_out == {}: + return s2t_out + net_input, order = s2t_out["net_input"], s2t_out["order"] + + if self.src_texts is not None and self.src_dict is not None: + src_txt_tokens = fairseq_data_utils.collate_tokens( + [x.src_txt_tokens for x in samples], + self.src_dict.pad(), + 
self.src_dict.eos(), + left_pad=False, + move_eos_to_beginning=False, + ) + src_txt_lengths = torch.tensor( + [x.src_txt_tokens.size()[0] for x in samples], dtype=torch.long + ) + if self.use_src_lang_id > 0: + src_lang_idxs = torch.tensor( + [s.src_lang_tag for s in samples], dtype=src_txt_tokens.dtype + ) + if self.use_src_lang_id == 1: # replace eos with lang_id + eos_idx = src_txt_lengths - 1 + src_txt_tokens.scatter_( + 1, eos_idx.view(-1, 1), src_lang_idxs.view(-1, 1) + ) + else: + raise NotImplementedError("Implementation is required") + + src_txt_tokens = src_txt_tokens.index_select(0, order) + src_txt_lengths = src_txt_lengths.index_select(0, order) + net_input["src_txt_tokens"] = src_txt_tokens + net_input["src_txt_lengths"] = src_txt_lengths + + net_input["alignment"] = None + if self.alignment is not None: + max_len = max([s.tgt_alignment.size(0) for s in samples]) + alignment = torch.ones(len(samples), max_len).float() + for i, s in enumerate(samples): + cur_len = s.tgt_alignment.size(0) + alignment[i][:cur_len].copy_(s.tgt_alignment) + net_input["alignment"] = alignment.index_select(0, order) + + if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None: + for i in range(len(samples)): + net_input["prev_output_tokens"][i][0] = samples[order[i]].tgt_lang_tag + + out = { + "id": s2t_out["id"], + "net_input": net_input, + "target": s2t_out["target"], + "target_lengths": s2t_out["target_lengths"], + "ntokens": s2t_out["ntokens"], + "nsentences": len(samples), + } + return out + + +class SpeechToTextJointDatasetCreator(SpeechToTextDatasetCreator): + KEY_ALIGN = "align" + + @classmethod + def _from_list( + cls, + split_name: str, + is_train_split, + samples: List[Dict], + cfg: S2TJointDataConfig, + tgt_dict, + src_dict, + pre_tokenizer, + bpe_tokenizer, + src_pre_tokenizer, + src_bpe_tokenizer, + append_eos, + use_src_lang_id, + ) -> SpeechToTextJointDataset: + audio_root = Path(cfg.audio_root) + ids = [s[cls.KEY_ID] for s in samples] + audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples] + n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples] + tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples] + src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples] + speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples] + src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples] + tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples] + tgt_alignment = None + if cls.KEY_ALIGN in samples[0].keys(): + tgt_alignment = [s[cls.KEY_ALIGN] for s in samples] + return SpeechToTextJointDataset( + split_name, + is_train_split, + cfg, + audio_paths, + n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + src_dict=src_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + src_pre_tokenizer=src_pre_tokenizer, + src_bpe_tokenizer=src_bpe_tokenizer, + append_eos=append_eos, + alignment=tgt_alignment, + use_src_lang_id=use_src_lang_id, + ) + + @classmethod + def _from_tsv( + cls, + root: str, + cfg: S2TJointDataConfig, + split: str, + tgt_dict, + src_dict, + is_train_split: bool, + pre_tokenizer, + bpe_tokenizer, + src_pre_tokenizer, + src_bpe_tokenizer, + append_eos: bool, + use_src_lang_id: int, + ) -> SpeechToTextJointDataset: + samples = cls._load_samples_from_tsv(root, split) + return cls._from_list( + split, + is_train_split, + samples, + cfg, + tgt_dict, + src_dict, + 
pre_tokenizer, + bpe_tokenizer, + src_pre_tokenizer, + src_bpe_tokenizer, + append_eos, + use_src_lang_id, + ) + + @classmethod + def from_tsv( + cls, + root: str, + cfg: S2TJointDataConfig, + splits: str, + tgt_dict, + src_dict, + pre_tokenizer, + bpe_tokenizer, + src_pre_tokenizer, + src_bpe_tokenizer, + is_train_split: bool, + epoch: int, + seed: int, + append_eos: Optional[bool] = True, + use_src_lang_id: Optional[int] = 0, + ) -> SpeechToTextJointDataset: + datasets = [ + cls._from_tsv( + root, + cfg, + split, + tgt_dict, + src_dict, + is_train_split, + pre_tokenizer, + bpe_tokenizer, + src_pre_tokenizer, + src_bpe_tokenizer, + append_eos=append_eos, + use_src_lang_id=use_src_lang_id, + ) + for split in splits.split(",") + ] + + if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0: + # temperature-based sampling + size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha) + datasets = [ + ResamplingDataset( + d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0) + ) + for r, d in zip(size_ratios, datasets) + ] + + return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0] diff --git a/fairseq/data/audio/text_to_speech_dataset.py b/fairseq/data/audio/text_to_speech_dataset.py new file mode 100644 index 0000000000..13612b458b --- /dev/null +++ b/fairseq/data/audio/text_to_speech_dataset.py @@ -0,0 +1,250 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory.abs + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional + +import numpy as np +import torch + +from fairseq.data import Dictionary +from fairseq.data import data_utils as fairseq_data_utils +from fairseq.data.audio.audio_utils import get_features_or_waveform +from fairseq.data.audio.speech_to_text_dataset import ( + S2TDataConfig, + SpeechToTextDataset, + SpeechToTextDatasetCreator, + _collate_frames, +) + + +@dataclass +class TextToSpeechDatasetItem(object): + index: int + source: torch.Tensor + target: Optional[torch.Tensor] = None + speaker_id: Optional[int] = None + duration: Optional[torch.Tensor] = None + pitch: Optional[torch.Tensor] = None + energy: Optional[torch.Tensor] = None + + +class TextToSpeechDataset(SpeechToTextDataset): + def __init__( + self, + split: str, + is_train_split: bool, + cfg: S2TDataConfig, + audio_paths: List[str], + n_frames: List[int], + src_texts: Optional[List[str]] = None, + tgt_texts: Optional[List[str]] = None, + speakers: Optional[List[str]] = None, + src_langs: Optional[List[str]] = None, + tgt_langs: Optional[List[str]] = None, + ids: Optional[List[str]] = None, + tgt_dict: Optional[Dictionary] = None, + pre_tokenizer=None, + bpe_tokenizer=None, + n_frames_per_step=1, + speaker_to_id=None, + durations: Optional[List[List[int]]] = None, + pitches: Optional[List[str]] = None, + energies: Optional[List[str]] = None, + ): + super(TextToSpeechDataset, self).__init__( + split, + is_train_split, + cfg, + audio_paths, + n_frames, + src_texts=src_texts, + tgt_texts=tgt_texts, + speakers=speakers, + src_langs=src_langs, + tgt_langs=tgt_langs, + ids=ids, + tgt_dict=tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, + n_frames_per_step=n_frames_per_step, + speaker_to_id=speaker_to_id, + ) + self.durations = durations + self.pitches = pitches + 
self.energies = energies + + def __getitem__(self, index: int) -> TextToSpeechDatasetItem: + s2t_item = super().__getitem__(index) + + duration, pitch, energy = None, None, None + if self.durations is not None: + duration = torch.tensor( + self.durations[index] + [0], dtype=torch.long # pad 0 for EOS + ) + if self.pitches is not None: + pitch = get_features_or_waveform(self.pitches[index]) + pitch = torch.from_numpy( + np.concatenate((pitch, [0])) # pad 0 for EOS + ).float() + if self.energies is not None: + energy = get_features_or_waveform(self.energies[index]) + energy = torch.from_numpy( + np.concatenate((energy, [0])) # pad 0 for EOS + ).float() + return TextToSpeechDatasetItem( + index=index, + source=s2t_item.source, + target=s2t_item.target, + speaker_id=s2t_item.speaker_id, + duration=duration, + pitch=pitch, + energy=energy, + ) + + def collater(self, samples: List[TextToSpeechDatasetItem]) -> Dict[str, Any]: + if len(samples) == 0: + return {} + + src_lengths, order = torch.tensor( + [s.target.shape[0] for s in samples], dtype=torch.long + ).sort(descending=True) + id_ = torch.tensor([s.index for s in samples], dtype=torch.long).index_select( + 0, order + ) + feat = _collate_frames( + [s.source for s in samples], self.cfg.use_audio_input + ).index_select(0, order) + target_lengths = torch.tensor( + [s.source.shape[0] for s in samples], dtype=torch.long + ).index_select(0, order) + + src_tokens = fairseq_data_utils.collate_tokens( + [s.target for s in samples], + self.tgt_dict.pad(), + self.tgt_dict.eos(), + left_pad=False, + move_eos_to_beginning=False, + ).index_select(0, order) + + speaker = None + if self.speaker_to_id is not None: + speaker = ( + torch.tensor([s.speaker_id for s in samples], dtype=torch.long) + .index_select(0, order) + .view(-1, 1) + ) + + bsz, _, d = feat.size() + prev_output_tokens = torch.cat( + (feat.new_zeros((bsz, 1, d)), feat[:, :-1, :]), dim=1 + ) + + durations, pitches, energies = None, None, None + if self.durations is not None: + durations = fairseq_data_utils.collate_tokens( + [s.duration for s in samples], 0 + ).index_select(0, order) + assert src_tokens.shape[1] == durations.shape[1] + if self.pitches is not None: + pitches = _collate_frames([s.pitch for s in samples], True) + pitches = pitches.index_select(0, order) + assert src_tokens.shape[1] == pitches.shape[1] + if self.energies is not None: + energies = _collate_frames([s.energy for s in samples], True) + energies = energies.index_select(0, order) + assert src_tokens.shape[1] == energies.shape[1] + src_texts = [self.tgt_dict.string(samples[i].target) for i in order] + + return { + "id": id_, + "net_input": { + "src_tokens": src_tokens, + "src_lengths": src_lengths, + "prev_output_tokens": prev_output_tokens, + }, + "speaker": speaker, + "target": feat, + "durations": durations, + "pitches": pitches, + "energies": energies, + "target_lengths": target_lengths, + "ntokens": sum(target_lengths).item(), + "nsentences": len(samples), + "src_texts": src_texts, + } + + +class TextToSpeechDatasetCreator(SpeechToTextDatasetCreator): + KEY_DURATION = "duration" + KEY_PITCH = "pitch" + KEY_ENERGY = "energy" + + @classmethod + def _from_list( + cls, + split_name: str, + is_train_split, + samples: List[Dict], + cfg: S2TDataConfig, + tgt_dict, + pre_tokenizer, + bpe_tokenizer, + n_frames_per_step, + speaker_to_id, + multitask=None, + ) -> TextToSpeechDataset: + audio_root = Path(cfg.audio_root) + ids = [s[cls.KEY_ID] for s in samples] + audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in 
samples] + n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples] + tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples] + src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples] + speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples] + src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples] + tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples] + + durations = [s.get(cls.KEY_DURATION, None) for s in samples] + durations = [ + None if dd is None else [int(d) for d in dd.split(" ")] for dd in durations + ] + durations = None if any(dd is None for dd in durations) else durations + + pitches = [s.get(cls.KEY_PITCH, None) for s in samples] + pitches = [ + None if pp is None else (audio_root / pp).as_posix() for pp in pitches + ] + pitches = None if any(pp is None for pp in pitches) else pitches + + energies = [s.get(cls.KEY_ENERGY, None) for s in samples] + energies = [ + None if ee is None else (audio_root / ee).as_posix() for ee in energies + ] + energies = None if any(ee is None for ee in energies) else energies + + return TextToSpeechDataset( + split_name, + is_train_split, + cfg, + audio_paths, + n_frames, + src_texts, + tgt_texts, + speakers, + src_langs, + tgt_langs, + ids, + tgt_dict, + pre_tokenizer, + bpe_tokenizer, + n_frames_per_step, + speaker_to_id, + durations, + pitches, + energies, + ) diff --git a/fairseq/data/audio/waveform_transforms/__init__.py b/fairseq/data/audio/waveform_transforms/__init__.py new file mode 100644 index 0000000000..57f8bb571b --- /dev/null +++ b/fairseq/data/audio/waveform_transforms/__init__.py @@ -0,0 +1,48 @@ +import os +from fairseq.data.audio import ( + AudioTransform, + CompositeAudioTransform, + import_transforms, + register_audio_transform, +) + + +class AudioWaveformTransform(AudioTransform): + pass + + +AUDIO_WAVEFORM_TRANSFORM_REGISTRY = {} +AUDIO_WAVEFORM_TRANSFORM_CLASS_NAMES = set() + + +def get_audio_waveform_transform(name): + return AUDIO_WAVEFORM_TRANSFORM_REGISTRY[name] + + +def register_audio_waveform_transform(name): + return register_audio_transform( + name, + AudioWaveformTransform, + AUDIO_WAVEFORM_TRANSFORM_REGISTRY, + AUDIO_WAVEFORM_TRANSFORM_CLASS_NAMES, + ) + + +import_transforms(os.path.dirname(__file__), "waveform") + + +class CompositeAudioWaveformTransform(CompositeAudioTransform): + @classmethod + def from_config_dict(cls, config=None): + return super()._from_config_dict( + cls, + "waveform", + get_audio_waveform_transform, + CompositeAudioWaveformTransform, + config, + ) + + def __call__(self, x, sample_rate): + for t in self.transforms: + x, sample_rate = t(x, sample_rate) + return x, sample_rate diff --git a/fairseq/data/audio/waveform_transforms/noiseaugment.py b/fairseq/data/audio/waveform_transforms/noiseaugment.py new file mode 100644 index 0000000000..401ce30943 --- /dev/null +++ b/fairseq/data/audio/waveform_transforms/noiseaugment.py @@ -0,0 +1,201 @@ +from pathlib import Path +import numpy as np +from math import ceil + +from fairseq.data.audio import rand_uniform +from fairseq.data.audio.waveform_transforms import ( + AudioWaveformTransform, + register_audio_waveform_transform, +) + +SNR_MIN = 5.0 +SNR_MAX = 15.0 +RATE = 0.25 + +NOISE_RATE = 1.0 +NOISE_LEN_MEAN = 0.2 +NOISE_LEN_STD = 0.05 + + +class NoiseAugmentTransform(AudioWaveformTransform): + @classmethod + def from_config_dict(cls, config=None): + _config = {} if config is None else config + return cls( + _config.get("samples_path", None), + _config.get("snr_min", SNR_MIN), + 
_config.get("snr_max", SNR_MAX), + _config.get("rate", RATE), + ) + + def __init__( + self, + samples_path: str, + snr_min: float = SNR_MIN, + snr_max: float = SNR_MAX, + rate: float = RATE, + ): + # Sanity checks + assert ( + samples_path + ), "need to provide path to audio samples for noise augmentation" + assert snr_max >= snr_min, f"empty signal-to-noise range ({snr_min}, {snr_max})" + assert rate >= 0 and rate <= 1, "rate should be a float between 0 to 1" + + self.paths = list(Path(samples_path).glob("**/*.wav")) # load music + self.n_samples = len(self.paths) + assert self.n_samples > 0, f"no audio files found in {samples_path}" + + self.snr_min = snr_min + self.snr_max = snr_max + self.rate = rate + + def __repr__(self): + return ( + self.__class__.__name__ + + "(" + + ", ".join( + [ + f"n_samples={self.n_samples}", + f"snr={self.snr_min}-{self.snr_max}dB", + f"rate={self.rate}", + ] + ) + + ")" + ) + + def pick_sample(self, goal_shape, always_2d=False, use_sample_rate=None): + from fairseq.data.audio.audio_utils import get_waveform + + path = self.paths[np.random.randint(0, self.n_samples)] + sample = get_waveform( + path, always_2d=always_2d, output_sample_rate=use_sample_rate + )[0] + + # Check dimensions match, else silently skip adding noise to sample + # NOTE: SHOULD THIS QUIT WITH AN ERROR? + is_2d = len(goal_shape) == 2 + if len(goal_shape) != sample.ndim or ( + is_2d and goal_shape[0] != sample.shape[0] + ): + return np.zeros(goal_shape) + + # Cut/repeat sample to size + len_dim = len(goal_shape) - 1 + n_repeat = ceil(goal_shape[len_dim] / sample.shape[len_dim]) + repeated = np.tile(sample, [1, n_repeat] if is_2d else n_repeat) + start = np.random.randint(0, repeated.shape[len_dim] - goal_shape[len_dim] + 1) + return ( + repeated[:, start : start + goal_shape[len_dim]] + if is_2d + else repeated[start : start + goal_shape[len_dim]] + ) + + def _mix(self, source, noise, snr): + get_power = lambda x: np.mean(x**2) + if get_power(noise): + scl = np.sqrt( + get_power(source) / (np.power(10, snr / 10) * get_power(noise)) + ) + else: + scl = 0 + return 1 * source + scl * noise + + def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None): + return self.pick_sample(goal_shape, always_2d, use_sample_rate) + + def __call__(self, source, sample_rate): + if np.random.random() > self.rate: + return source, sample_rate + + noise = self._get_noise( + source.shape, always_2d=True, use_sample_rate=sample_rate + ) + + return ( + self._mix(source, noise, rand_uniform(self.snr_min, self.snr_max)), + sample_rate, + ) + + +@register_audio_waveform_transform("musicaugment") +class MusicAugmentTransform(NoiseAugmentTransform): + pass + + +@register_audio_waveform_transform("backgroundnoiseaugment") +class BackgroundNoiseAugmentTransform(NoiseAugmentTransform): + pass + + +@register_audio_waveform_transform("babbleaugment") +class BabbleAugmentTransform(NoiseAugmentTransform): + def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None): + for i in range(np.random.randint(3, 8)): + speech = self.pick_sample(goal_shape, always_2d, use_sample_rate) + if i == 0: + agg_noise = speech + else: # SNR scaled by i (how many noise signals already in agg_noise) + agg_noise = self._mix(agg_noise, speech, i) + return agg_noise + + +@register_audio_waveform_transform("sporadicnoiseaugment") +class SporadicNoiseAugmentTransform(NoiseAugmentTransform): + @classmethod + def from_config_dict(cls, config=None): + _config = {} if config is None else config + return cls( + 
_config.get("samples_path", None), + _config.get("snr_min", SNR_MIN), + _config.get("snr_max", SNR_MAX), + _config.get("rate", RATE), + _config.get("noise_rate", NOISE_RATE), + _config.get("noise_len_mean", NOISE_LEN_MEAN), + _config.get("noise_len_std", NOISE_LEN_STD), + ) + + def __init__( + self, + samples_path: str, + snr_min: float = SNR_MIN, + snr_max: float = SNR_MAX, + rate: float = RATE, + noise_rate: float = NOISE_RATE, # noises per second + noise_len_mean: float = NOISE_LEN_MEAN, # length of noises in seconds + noise_len_std: float = NOISE_LEN_STD, + ): + super().__init__(samples_path, snr_min, snr_max, rate) + self.noise_rate = noise_rate + self.noise_len_mean = noise_len_mean + self.noise_len_std = noise_len_std + + def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None): + agg_noise = np.zeros(goal_shape) + len_dim = len(goal_shape) - 1 + is_2d = len(goal_shape) == 2 + + n_noises = round(self.noise_rate * goal_shape[len_dim] / use_sample_rate) + start_pointers = [ + round(rand_uniform(0, goal_shape[len_dim])) for _ in range(n_noises) + ] + + for start_pointer in start_pointers: + noise_shape = list(goal_shape) + len_seconds = np.random.normal(self.noise_len_mean, self.noise_len_std) + noise_shape[len_dim] = round(max(0, len_seconds) * use_sample_rate) + end_pointer = start_pointer + noise_shape[len_dim] + if end_pointer >= goal_shape[len_dim]: + continue + + noise = self.pick_sample(noise_shape, always_2d, use_sample_rate) + if is_2d: + agg_noise[:, start_pointer:end_pointer] = ( + agg_noise[:, start_pointer:end_pointer] + noise + ) + else: + agg_noise[start_pointer:end_pointer] = ( + agg_noise[start_pointer:end_pointer] + noise + ) + + return agg_noise diff --git a/fairseq/data/bucket_pad_length_dataset.py b/fairseq/data/bucket_pad_length_dataset.py index cda8834ac8..0f94100148 100644 --- a/fairseq/data/bucket_pad_length_dataset.py +++ b/fairseq/data/bucket_pad_length_dataset.py @@ -6,6 +6,7 @@ import numpy as np import torch.nn.functional as F from fairseq.data import BaseWrapperDataset +from fairseq.data.data_utils import get_buckets, get_bucketed_sizes class BucketPadLengthDataset(BaseWrapperDataset): @@ -29,42 +30,43 @@ def __init__( num_buckets, pad_idx, left_pad, + tensor_key=None, ): super().__init__(dataset) self.pad_idx = pad_idx self.left_pad = left_pad assert num_buckets > 0 - self.buckets = np.unique( - np.percentile( - sizes, - np.linspace(0, 100, num_buckets + 1), - interpolation="lower", - )[1:] - ) + self.buckets = get_buckets(sizes, num_buckets) + self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets) + self._tensor_key = tensor_key - def get_bucketed_sizes(orig_sizes, buckets): - sizes = np.copy(orig_sizes) - assert np.min(sizes) >= 0 - start_val = -1 - for end_val in buckets: - mask = (sizes > start_val) & (sizes <= end_val) - sizes[mask] = end_val - start_val = end_val - return sizes + def _set_tensor(self, item, val): + if self._tensor_key is None: + return val + item[self._tensor_key] = val + return item - self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets) + def _get_tensor(self, item): + if self._tensor_key is None: + return item + return item[self._tensor_key] - def __getitem__(self, index): - item = self.dataset[index] - bucket_size = self._bucketed_sizes[index] - num_pad = bucket_size - item.size(-1) + def _pad(self, tensor, bucket_size, dim=-1): + num_pad = bucket_size - tensor.size(dim) return F.pad( - item, + tensor, (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad), value=self.pad_idx, ) + def 
__getitem__(self, index): + item = self.dataset[index] + bucket_size = self._bucketed_sizes[index] + tensor = self._get_tensor(item) + padded = self._pad(tensor, bucket_size) + return self._set_tensor(item, padded) + @property def sizes(self): return self._bucketed_sizes diff --git a/fairseq/data/codedataset.py b/fairseq/data/codedataset.py new file mode 100644 index 0000000000..a433091956 --- /dev/null +++ b/fairseq/data/codedataset.py @@ -0,0 +1,576 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import json +import logging +import os +import random +from pathlib import Path + +import numpy as np +import torch +import torch.utils.data + +from . import data_utils +from fairseq.data.fairseq_dataset import FairseqDataset + +F0_FRAME_SPACE = 0.005 # sec + + +logger = logging.getLogger(__name__) + + +class ExpressiveCodeDataConfig(object): + def __init__(self, json_path): + with open(json_path, "r") as f: + self.config = json.load(f) + self._manifests = self.config["manifests"] + + @property + def manifests(self): + return self._manifests + + @property + def n_units(self): + return self.config["n_units"] + + @property + def sampling_rate(self): + return self.config["sampling_rate"] + + @property + def code_hop_size(self): + return self.config["code_hop_size"] + + @property + def f0_stats(self): + """pre-computed f0 statistics path""" + return self.config.get("f0_stats", None) + + @property + def f0_vq_type(self): + """naive or precomp""" + return self.config["f0_vq_type"] + + @property + def f0_vq_name(self): + return self.config["f0_vq_name"] + + def get_f0_vq_naive_quantizer(self, log, norm_mean, norm_std): + key = "log" if log else "linear" + if norm_mean and norm_std: + key += "_mean_std_norm" + elif norm_mean: + key += "_mean_norm" + else: + key += "_none_norm" + return self.config["f0_vq_naive_quantizer"][key] + + @property + def f0_vq_n_units(self): + return self.config["f0_vq_n_units"] + + @property + def multispkr(self): + """how to parse speaker label from audio path""" + return self.config.get("multispkr", None) + + +def get_f0(audio, rate=16000): + try: + import amfm_decompy.basic_tools as basic + import amfm_decompy.pYAAPT as pYAAPT + from librosa.util import normalize + except ImportError: + raise "Please install amfm_decompy (`pip install AMFM-decompy`) and librosa (`pip install librosa`)." 
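+    # pYAAPT pitch tracking runs on a normalized waveform padded by half a
+    # frame on each side; frame spacing follows F0_FRAME_SPACE (in seconds).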
+ + assert audio.ndim == 1 + frame_length = 20.0 # ms + to_pad = int(frame_length / 1000 * rate) // 2 + + audio = normalize(audio) * 0.95 + audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0) + audio = basic.SignalObj(audio, rate) + pitch = pYAAPT.yaapt( + audio, + frame_length=frame_length, + frame_space=F0_FRAME_SPACE * 1000, + nccf_thresh1=0.25, + tda_frame_length=25.0, + ) + f0 = pitch.samp_values + return f0 + + +def interpolate_f0(f0): + try: + from scipy.interpolate import interp1d + except ImportError: + raise "Please install scipy (`pip install scipy`)" + + orig_t = np.arange(f0.shape[0]) + f0_interp = f0[:] + ii = f0_interp != 0 + if ii.sum() > 1: + f0_interp = interp1d( + orig_t[ii], f0_interp[ii], bounds_error=False, kind="linear", fill_value=0 + )(orig_t) + f0_interp = torch.Tensor(f0_interp).type_as(f0).to(f0.device) + return f0_interp + + +def naive_quantize(x, edges): + bin_idx = (x.view(-1, 1) > edges.view(1, -1)).long().sum(dim=1) + return bin_idx + + +def load_wav(full_path): + try: + import soundfile as sf + except ImportError: + raise "Please install soundfile (`pip install SoundFile`)" + data, sampling_rate = sf.read(full_path) + return data, sampling_rate + + +def parse_code(code_str, dictionary, append_eos): + code, duration = torch.unique_consecutive( + torch.ShortTensor(list(map(int, code_str.split()))), return_counts=True + ) + code = " ".join(map(str, code.tolist())) + code = dictionary.encode_line(code, append_eos).short() + + if append_eos: + duration = torch.cat((duration, duration.new_zeros((1,))), dim=0) # eos + duration = duration.short() + return code, duration + + +def parse_manifest(manifest, dictionary): + audio_files = [] + codes = [] + durations = [] + speakers = [] + + with open(manifest) as info: + for line in info.readlines(): + sample = eval(line.strip()) + if "cpc_km100" in sample: + k = "cpc_km100" + elif "hubert_km100" in sample: + k = "hubert_km100" + elif "phone" in sample: + k = "phone" + else: + assert False, "unknown format" + code = sample[k] + code, duration = parse_code(code, dictionary, append_eos=True) + + codes.append(code) + durations.append(duration) + audio_files.append(sample["audio"]) + speakers.append(sample.get("speaker", None)) + + return audio_files, codes, durations, speakers + + +def parse_speaker(path, method): + if type(path) == str: + path = Path(path) + + if method == "parent_name": + return path.parent.name + elif method == "parent_parent_name": + return path.parent.parent.name + elif method == "_": + return path.name.split("_")[0] + elif method == "single": + return "A" + elif callable(method): + return method(path) + else: + raise NotImplementedError() + + +def get_f0_by_filename(filename, tgt_sampling_rate): + audio, sampling_rate = load_wav(filename) + if sampling_rate != tgt_sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format(sampling_rate, tgt_sampling_rate) + ) + + # compute un-interpolated f0, and use Ann's interp in __getitem__ if set + f0 = get_f0(audio, rate=tgt_sampling_rate) + f0 = torch.from_numpy(f0.astype(np.float32)) + return f0 + + +def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1): + code_len = durations.sum() + targ_len = int(f0_code_ratio * code_len) + diff = f0.size(0) - targ_len + assert abs(diff) <= tol, ( + f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|" + f" > {tol} (dur=\n{durations})" + ) + if diff > 0: + f0 = f0[:targ_len] + elif diff < 0: + f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0) + + f0_offset = 
0.0 + seg_f0s = [] + for dur in durations: + f0_dur = dur.item() * f0_code_ratio + seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)] + seg_f0 = seg_f0[seg_f0 != 0] + if len(seg_f0) == 0: + seg_f0 = torch.tensor(0).type(seg_f0.type()) + else: + seg_f0 = seg_f0.mean() + seg_f0s.append(seg_f0) + f0_offset += f0_dur + + assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}" + return torch.tensor(seg_f0s) + + +class Paddings(object): + def __init__(self, code_val, dur_val=0, f0_val=-2.0): + self.code = code_val + self.dur = dur_val + self.f0 = f0_val + + +class Shifts(object): + def __init__(self, shifts_str, pads): + self._shifts = list(map(int, shifts_str.split(","))) + assert len(self._shifts) == 2, self._shifts + assert all(s >= 0 for s in self._shifts) + self.extra_length = max(s for s in self._shifts) + self.pads = pads + + @property + def dur(self): + return self._shifts[0] + + @property + def f0(self): + return self._shifts[1] + + @staticmethod + def shift_one(seq, left_pad_num, right_pad_num, pad): + assert seq.ndim == 1 + bos = seq.new_full((left_pad_num,), pad) + eos = seq.new_full((right_pad_num,), pad) + seq = torch.cat([bos, seq, eos]) + mask = torch.ones_like(seq).bool() + mask[left_pad_num : len(seq) - right_pad_num] = 0 + return seq, mask + + def __call__(self, code, dur, f0): + if self.extra_length == 0: + code_mask = torch.zeros_like(code).bool() + dur_mask = torch.zeros_like(dur).bool() + f0_mask = torch.zeros_like(f0).bool() + return code, code_mask, dur, dur_mask, f0, f0_mask + + code, code_mask = self.shift_one(code, 0, self.extra_length, self.pads.code) + dur, dur_mask = self.shift_one( + dur, self.dur, self.extra_length - self.dur, self.pads.dur + ) + f0, f0_mask = self.shift_one( + f0, self.f0, self.extra_length - self.f0, self.pads.f0 + ) + return code, code_mask, dur, dur_mask, f0, f0_mask + + +class CodeDataset(FairseqDataset): + def __init__( + self, + manifest, + dictionary, + dur_dictionary, + f0_dictionary, + config, + discrete_dur, + discrete_f0, + log_f0, + normalize_f0_mean, + normalize_f0_std, + interpolate_f0, + return_filename=False, + strip_filename=True, + shifts="0,0", + return_continuous_f0=False, + ): + random.seed(1234) + self.dictionary = dictionary + self.dur_dictionary = dur_dictionary + self.f0_dictionary = f0_dictionary + self.config = config + + # duration config + self.discrete_dur = discrete_dur + + # pitch config + self.discrete_f0 = discrete_f0 + self.log_f0 = log_f0 + self.normalize_f0_mean = normalize_f0_mean + self.normalize_f0_std = normalize_f0_std + self.interpolate_f0 = interpolate_f0 + + self.return_filename = return_filename + self.strip_filename = strip_filename + self.f0_code_ratio = config.code_hop_size / ( + config.sampling_rate * F0_FRAME_SPACE + ) + + # use lazy loading to avoid sharing file handlers across workers + self.manifest = manifest + self._codes = None + self._durs = None + self._f0s = None + with open(f"{manifest}.leng.txt", "r") as f: + lengs = [int(line.rstrip()) for line in f] + edges = np.cumsum([0] + lengs) + self.starts, self.ends = edges[:-1], edges[1:] + with open(f"{manifest}.path.txt", "r") as f: + self.file_names = [line.rstrip() for line in f] + logger.info(f"num entries: {len(self.starts)}") + + if os.path.exists(f"{manifest}.f0_stat.pt"): + self.f0_stats = torch.load(f"{manifest}.f0_stat.pt") + elif config.f0_stats: + self.f0_stats = torch.load(config.f0_stats) + + self.multispkr = config.multispkr + if config.multispkr: + with open(f"{manifest}.speaker.txt", "r") as f: 
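+                # one speaker label per line, in the same order as the samples in the manifest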
+ self.spkrs = [line.rstrip() for line in f] + self.id_to_spkr = sorted(self.spkrs) + self.spkr_to_id = {k: v for v, k in enumerate(self.id_to_spkr)} + + self.pads = Paddings( + dictionary.pad(), + 0, # use 0 for duration padding + f0_dictionary.pad() if discrete_f0 else -5.0, + ) + self.shifts = Shifts(shifts, pads=self.pads) + self.return_continuous_f0 = return_continuous_f0 + + def get_data_handlers(self): + logging.info(f"loading data for {self.manifest}") + self._codes = np.load(f"{self.manifest}.code.npy", mmap_mode="r") + self._durs = np.load(f"{self.manifest}.dur.npy", mmap_mode="r") + + if self.discrete_f0: + if self.config.f0_vq_type == "precomp": + self._f0s = np.load( + f"{self.manifest}.{self.config.f0_vq_name}.npy", mmap_mode="r" + ) + elif self.config.f0_vq_type == "naive": + self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r") + quantizers_path = self.config.get_f0_vq_naive_quantizer( + self.log_f0, self.normalize_f0_mean, self.normalize_f0_std + ) + quantizers = torch.load(quantizers_path) + n_units = self.config.f0_vq_n_units + self._f0_quantizer = torch.from_numpy(quantizers[n_units]) + else: + raise ValueError(f"f0_vq_type {self.config.f0_vq_type} not supported") + else: + self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r") + + def preprocess_f0(self, f0, stats): + """ + 1. interpolate + 2. log transform (keep unvoiced frame 0) + """ + # TODO: change this to be dependent on config for naive quantizer + f0 = f0.clone() + if self.interpolate_f0: + f0 = interpolate_f0(f0) + + mask = f0 != 0 # only process voiced frames + if self.log_f0: + f0[mask] = f0[mask].log() + if self.normalize_f0_mean: + mean = stats["logf0_mean"] if self.log_f0 else stats["f0_mean"] + f0[mask] = f0[mask] - mean + if self.normalize_f0_std: + std = stats["logf0_std"] if self.log_f0 else stats["f0_std"] + f0[mask] = f0[mask] / std + return f0 + + def _get_raw_item(self, index): + start, end = self.starts[index], self.ends[index] + if self._codes is None: + self.get_data_handlers() + code = torch.from_numpy(np.array(self._codes[start:end])).long() + dur = torch.from_numpy(np.array(self._durs[start:end])) + f0 = torch.from_numpy(np.array(self._f0s[start:end])) + return code, dur, f0 + + def __getitem__(self, index): + code, dur, f0 = self._get_raw_item(index) + code = torch.cat([code.new([self.dictionary.bos()]), code]) + + # use 0 for eos and bos + dur = torch.cat([dur.new([0]), dur]) + if self.discrete_dur: + dur = self.dur_dictionary.encode_line( + " ".join(map(str, dur.tolist())), append_eos=False + ).long() + else: + dur = dur.float() + + # TODO: find a more elegant approach + raw_f0 = None + if self.discrete_f0: + if self.config.f0_vq_type == "precomp": + f0 = self.f0_dictionary.encode_line( + " ".join(map(str, f0.tolist())), append_eos=False + ).long() + else: + f0 = f0.float() + f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]]) + if self.return_continuous_f0: + raw_f0 = f0 + raw_f0 = torch.cat([raw_f0.new([self.f0_dictionary.bos()]), raw_f0]) + f0 = naive_quantize(f0, self._f0_quantizer) + f0 = torch.cat([f0.new([self.f0_dictionary.bos()]), f0]) + else: + f0 = f0.float() + if self.multispkr: + f0 = self.preprocess_f0(f0, self.f0_stats[self.spkrs[index]]) + else: + f0 = self.preprocess_f0(f0, self.f0_stats) + f0 = torch.cat([f0.new([0]), f0]) + + if raw_f0 is not None: + *_, raw_f0, raw_f0_mask = self.shifts(code, dur, raw_f0) + else: + raw_f0_mask = None + + code, code_mask, dur, dur_mask, f0, f0_mask = self.shifts(code, dur, f0) + if raw_f0_mask is not None: + 
assert (raw_f0_mask == f0_mask).all() + + # is a padded frame if either input or output is padded + feats = { + "source": code[:-1], + "target": code[1:], + "mask": code_mask[1:].logical_or(code_mask[:-1]), + "dur_source": dur[:-1], + "dur_target": dur[1:], + "dur_mask": dur_mask[1:].logical_or(dur_mask[:-1]), + "f0_source": f0[:-1], + "f0_target": f0[1:], + "f0_mask": f0_mask[1:].logical_or(f0_mask[:-1]), + } + + if raw_f0 is not None: + feats["raw_f0"] = raw_f0[1:] + + if self.return_filename: + fname = self.file_names[index] + feats["filename"] = ( + fname if not self.strip_filename else Path(fname).with_suffix("").name + ) + return feats + + def __len__(self): + return len(self.starts) + + def size(self, index): + return self.ends[index] - self.starts[index] + self.shifts.extra_length + + def num_tokens(self, index): + return self.size(index) + + def collater(self, samples): + pad_idx, eos_idx = self.dictionary.pad(), self.dictionary.eos() + if len(samples) == 0: + return {} + + src_tokens = data_utils.collate_tokens( + [s["source"] for s in samples], pad_idx, eos_idx, left_pad=False + ) + + tgt_tokens = data_utils.collate_tokens( + [s["target"] for s in samples], + pad_idx=pad_idx, + eos_idx=pad_idx, # appending padding, eos is there already + left_pad=False, + ) + + src_durs, tgt_durs = [ + data_utils.collate_tokens( + [s[k] for s in samples], + pad_idx=self.pads.dur, + eos_idx=self.pads.dur, + left_pad=False, + ) + for k in ["dur_source", "dur_target"] + ] + + src_f0s, tgt_f0s = [ + data_utils.collate_tokens( + [s[k] for s in samples], + pad_idx=self.pads.f0, + eos_idx=self.pads.f0, + left_pad=False, + ) + for k in ["f0_source", "f0_target"] + ] + + mask, dur_mask, f0_mask = [ + data_utils.collate_tokens( + [s[k] for s in samples], + pad_idx=1, + eos_idx=1, + left_pad=False, + ) + for k in ["mask", "dur_mask", "f0_mask"] + ] + + src_lengths = torch.LongTensor([s["source"].numel() for s in samples]) + n_tokens = sum(len(s["source"]) for s in samples) + + result = { + "nsentences": len(samples), + "ntokens": n_tokens, + "net_input": { + "src_tokens": src_tokens, + "src_lengths": src_lengths, + "dur_src": src_durs, + "f0_src": src_f0s, + }, + "target": tgt_tokens, + "dur_target": tgt_durs, + "f0_target": tgt_f0s, + "mask": mask, + "dur_mask": dur_mask, + "f0_mask": f0_mask, + } + + if "filename" in samples[0]: + result["filename"] = [s["filename"] for s in samples] + + # TODO: remove this hack into the inference dataset + if "prefix" in samples[0]: + result["prefix"] = [s["prefix"] for s in samples] + + if "raw_f0" in samples[0]: + raw_f0s = data_utils.collate_tokens( + [s["raw_f0"] for s in samples], + pad_idx=self.pads.f0, + eos_idx=self.pads.f0, + left_pad=False, + ) + result["raw_f0"] = raw_f0s + return result diff --git a/fairseq/data/colorize_dataset.py b/fairseq/data/colorize_dataset.py index 6ef097bff1..7a6d271379 100644 --- a/fairseq/data/colorize_dataset.py +++ b/fairseq/data/colorize_dataset.py @@ -9,7 +9,7 @@ class ColorizeDataset(BaseWrapperDataset): - """ Adds 'colors' property to net input that is obtained from the provided color getter for use by models """ + """Adds 'colors' property to net input that is obtained from the provided color getter for use by models""" def __init__(self, dataset, color_getter): super().__init__(dataset) diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py index 81f457365a..9a19cc3c18 100644 --- a/fairseq/data/data_utils.py +++ b/fairseq/data/data_utils.py @@ -10,13 +10,17 @@ import contextlib import itertools import logging 
-import os +import re import warnings from typing import Optional, Tuple +import math import numpy as np import torch +from fairseq.file_io import PathManager +from fairseq import utils +import os logger = logging.getLogger(__name__) @@ -24,7 +28,7 @@ def infer_language_pair(path): """Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx""" src, dst = None, None - for filename in os.listdir(path): + for filename in PathManager.ls(path): parts = filename.split(".") if len(parts) >= 3 and len(parts[1].split("-")) == 2: return parts[1].split("-") @@ -39,13 +43,16 @@ def collate_tokens( move_eos_to_beginning=False, pad_to_length=None, pad_to_multiple=1, + pad_to_bsz=None, ): """Convert a list of 1d tensors into a padded 2d tensor.""" size = max(v.size(0) for v in values) size = size if pad_to_length is None else max(size, pad_to_length) if pad_to_multiple != 1 and size % pad_to_multiple != 0: size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple) - res = values[0].new(len(values), size).fill_(pad_idx) + + batch_size = len(values) if pad_to_bsz is None else max(len(values), pad_to_bsz) + res = values[0].new(batch_size, size).fill_(pad_idx) def copy_tensor(src, dst): assert dst.numel() == src.numel() @@ -80,13 +87,19 @@ def load_indexed_dataset( combine 'data-bin/train', 'data-bin/train1', ... and return a single ConcatDataset instance. """ - from fairseq.data.concat_dataset import ConcatDataset import fairseq.data.indexed_dataset as indexed_dataset + from fairseq.data.concat_dataset import ConcatDataset datasets = [] for k in itertools.count(): path_k = path + (str(k) if k > 0 else "") - path_k = indexed_dataset.get_indexed_dataset_to_local(path_k) + try: + path_k = indexed_dataset.get_indexed_dataset_to_local(path_k) + except Exception as e: + if "StorageException: [404] Path not found" in str(e): + logger.warning(f"path_k: {e} not found") + else: + raise e dataset_impl_k = dataset_impl if dataset_impl_k is None: @@ -99,7 +112,7 @@ def load_indexed_dataset( ) if dataset is None: break - logger.info("loaded {} examples from: {}".format(len(dataset), path_k)) + logger.info("loaded {:,} examples from: {}".format(len(dataset), path_k)) datasets.append(dataset) if not combine: break @@ -164,12 +177,6 @@ def check_size(idx): for key in intersect_keys ) else: - # Hacky as heck, for the specific case of multilingual training with RoundRobin. - if isinstance(size_fn(idx), dict) and isinstance(max_positions, tuple): - return all( - a is None or b is None or compare_leq(a, b) - for a, b in zip(size_fn(idx).values(), max_positions) - ) # For MultiCorpusSampledDataset, will generalize it later if not isinstance(size_fn(idx), Iterable): return all(size_fn(idx) <= b for b in max_positions) @@ -276,6 +283,7 @@ def filter_paired_dataset_indices_by_size(src_sizes, tgt_sizes, indices, max_siz def batch_by_size( indices, num_tokens_fn, + num_tokens_vec=None, max_tokens=None, max_sentences=None, required_batch_size_multiple=1, @@ -289,6 +297,8 @@ def batch_by_size( indices (List[int]): ordered list of dataset indices num_tokens_fn (callable): function that returns the number of tokens at a given index + num_tokens_vec (List[int], optional): precomputed vector of the number + of tokens for each index in indices (to enable faster batch generation) max_tokens (int, optional): max number of tokens in each batch (default: None). 
max_sentences (int, optional): max number of sentences in each @@ -301,30 +311,54 @@ def batch_by_size( """ try: from fairseq.data.data_utils_fast import ( - batch_by_size_fast, + batch_by_size_fn, + batch_by_size_vec, batch_fixed_shapes_fast, ) except ImportError: raise ImportError( - "Please build Cython components with: `pip install --editable .` " - "or `python setup.py build_ext --inplace`" + "Please build Cython components with: " + "`python setup.py build_ext --inplace`" + ) + except ValueError: + raise ValueError( + "Please build (or rebuild) Cython components with `python setup.py build_ext --inplace`." ) - max_tokens = max_tokens if max_tokens is not None else -1 + # added int() to avoid TypeError: an integer is required + max_tokens = int(max_tokens) if max_tokens is not None else -1 max_sentences = max_sentences if max_sentences is not None else -1 bsz_mult = required_batch_size_multiple if not isinstance(indices, np.ndarray): indices = np.fromiter(indices, dtype=np.int64, count=-1) + if num_tokens_vec is not None and not isinstance(num_tokens_vec, np.ndarray): + num_tokens_vec = np.fromiter(num_tokens_vec, dtype=np.int64, count=-1) + if fixed_shapes is None: - return batch_by_size_fast( - indices, - num_tokens_fn, - max_tokens, - max_sentences, - bsz_mult, - ) + if num_tokens_vec is None: + b = batch_by_size_fn( + indices, + num_tokens_fn, + max_tokens, + max_sentences, + bsz_mult, + ) + else: + b = batch_by_size_vec( + indices, + num_tokens_vec, + max_tokens, + max_sentences, + bsz_mult, + ) + + if bsz_mult > 1 and len(b[-1]) % bsz_mult != 0: + b = b[:-1] + + return b + else: fixed_shapes = np.array(fixed_shapes, dtype=np.int64) sort_order = np.lexsort( @@ -344,10 +378,21 @@ def post_process(sentence: str, symbol: str): sentence = sentence.replace(" ", "").replace("_", " ").strip() elif symbol == "letter": sentence = sentence.replace(" ", "").replace("|", " ").strip() + elif symbol == "silence": + import re + + sentence = sentence.replace("<SIL>", "") + sentence = re.sub(" +", " ", sentence).strip() elif symbol == "_EOW": sentence = sentence.replace(" ", "").replace("_EOW", " ").strip() - elif symbol is not None and symbol != "none": + elif symbol in {"subword_nmt", "@@ ", "@@"}: + if symbol == "subword_nmt": + symbol = "@@ " sentence = (sentence + " ").replace(symbol, "").rstrip() + elif symbol == "none": + pass + elif symbol is not None: + raise NotImplementedError(f"Unknown post_process option: {symbol}") return sentence @@ -361,6 +406,14 @@ def compute_mask_indices( min_masks: int = 0, no_overlap: bool = False, min_space: int = 0, + require_same_masks: bool = True, + mask_dropout: float = 0.0, + add_masks: bool = False, + seed: Optional[int] = None, + epoch: Optional[int] = None, + indices: Optional[torch.Tensor] = None, + idc_select_ver: int = 1, # 2 to reproduce mask_tokens_dataset + num_mask_ver: int = 2, # 2 to reproduce mask_tokens_dataset ) -> np.ndarray: """ Computes random mask spans for a given shape @@ -380,60 +433,86 @@ def compute_mask_indices( min_masks: minimum number of masked spans no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans + require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample + mask_dropout: randomly dropout this percentage of masks in each example """ bsz, all_sz = shape mask = np.full((bsz, all_sz), False) - all_num_mask = int( - # 
add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() - ) - - all_num_mask = max(min_masks, all_num_mask) + if num_mask_ver == 1: + all_num_mask = int( + # add a random number for probabilistic rounding + mask_prob * all_sz / float(mask_length) + + np.random.rand() + ) + all_num_mask = max(min_masks, all_num_mask) mask_idcs = [] for i in range(bsz): + if seed is not None and epoch is not None and indices is not None: + seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6) + else: + seed_i = None + + rng = np.random.default_rng(seed_i) + if padding_mask is not None: sz = all_sz - padding_mask[i].long().sum().item() + assert sz >= 0, sz + else: + sz = all_sz + + if num_mask_ver == 1: + if padding_mask is not None: + num_mask = int( + # add a random number for probabilistic rounding + mask_prob * sz / float(mask_length) + + np.random.rand() + ) + num_mask = max(min_masks, num_mask) + else: + num_mask = all_num_mask + elif num_mask_ver == 2: num_mask = int( # add a random number for probabilistic rounding mask_prob * sz / float(mask_length) - + np.random.rand() + + rng.random() ) num_mask = max(min_masks, num_mask) else: - sz = all_sz - num_mask = all_num_mask + raise ValueError() if mask_type == "static": lengths = np.full(num_mask, mask_length) elif mask_type == "uniform": - lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) + lengths = rng.randint(mask_other, mask_length * 2 + 1, size=num_mask) elif mask_type == "normal": - lengths = np.random.normal(mask_length, mask_other, size=num_mask) + lengths = rng.normal(mask_length, mask_other, size=num_mask) lengths = [max(1, int(round(x))) for x in lengths] elif mask_type == "poisson": - lengths = np.random.poisson(mask_length, size=num_mask) + lengths = rng.poisson(mask_length, size=num_mask) lengths = [int(round(x)) for x in lengths] else: raise Exception("unknown mask selection " + mask_type) if sum(lengths) == 0: - lengths[0] = min(mask_length, sz - 1) + if mask_type == "static": + raise ValueError(f"this should never happens") + else: + lengths = [min(mask_length, sz - 1)] if no_overlap: mask_idc = [] def arrange(s, e, length, keep_length): - span_start = np.random.randint(s, e - length) + span_start = rng.randint(s, e - length) mask_idc.extend(span_start + i for i in range(length)) new_parts = [] if span_start - s - min_space >= keep_length: new_parts.append((s, span_start - min_space + 1)) - if e - span_start - keep_length - min_space > keep_length: + if e - span_start - length - min_space > keep_length: new_parts.append((span_start + length + min_space, e)) return new_parts @@ -448,16 +527,20 @@ def arrange(s, e, length, keep_length): if l_sum == 0: break probs = lens / np.sum(lens) - c = np.random.choice(len(parts), p=probs) + c = rng.choice(len(parts), p=probs) s, e = parts.pop(c) parts.extend(arrange(s, e, length, min_length)) mask_idc = np.asarray(mask_idc) else: - min_len = min(lengths) - if sz - min_len <= num_mask: - min_len = sz - num_mask - 1 - - mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) + if idc_select_ver == 1: + min_len = min(lengths) + if sz - min_len <= num_mask: + min_len = sz - num_mask - 1 + mask_idc = rng.choice(sz - min_len, num_mask, replace=False) + elif idc_select_ver == 2: + mask_idc = rng.choice(sz, num_mask, replace=False) + else: + raise ValueError() mask_idc = np.asarray( [ @@ -467,14 +550,300 @@ def arrange(s, e, length, keep_length): ] ) - mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) + 
mask_idc = np.unique(mask_idc[mask_idc < sz]) + if len(mask_idc) >= sz: + raise ValueError( + ( + f"the entire sequence is masked. " + f"sz={sz}; mask_idc[mask_idc]; " + f"index={indices[i] if indices is not None else None}" + ) + ) + mask_idcs.append(mask_idc) + + target_len = None + if require_same_masks: + if add_masks: + target_len = max([len(m) for m in mask_idcs]) + else: + target_len = min([len(m) for m in mask_idcs]) - min_len = min([len(m) for m in mask_idcs]) for i, mask_idc in enumerate(mask_idcs): - if len(mask_idc) > min_len: - mask_idc = np.random.choice(mask_idc, min_len, replace=False) + if target_len is not None and len(mask_idc) > target_len: + mask_idc = rng.choice(mask_idc, target_len, replace=False) + mask[i, mask_idc] = True + if target_len is not None and len(mask_idc) < target_len: + unmasked = np.flatnonzero(~mask[i]) + to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False) + mask[i, to_mask] = True + + if mask_dropout > 0: + masked = np.flatnonzero(mask[i]) + num_holes = np.rint(len(masked) * mask_dropout).astype(int) + to_drop = rng.choice(masked, num_holes, replace=False) + mask[i, to_drop] = False + + return mask + + +def compute_block_mask_2d( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + mask_prob_adjust: float = 0, + inverse_mask: bool = False, + require_same_masks: bool = True, + expand_adjcent: bool = False, + mask_dropout: float = 0, + non_overlapping: bool = False, +) -> torch.Tensor: + + assert mask_length > 1 + + B, L = shape + + d = int(L**0.5) + + if inverse_mask: + mask_prob = 1 - mask_prob + + if non_overlapping: + sz = math.ceil(d / mask_length) + inp_len = sz * sz + + inp = torch.zeros((B, 1, sz, sz)) + w = torch.ones((1, 1, mask_length, mask_length)) + + mask_inds = torch.multinomial( + 1 - inp.view(B, -1), + int(inp_len * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)), + replacement=False, + ) + inp.view(B, -1).scatter_(1, mask_inds, 1) + + mask = torch.nn.functional.conv_transpose2d(inp, w, stride=mask_length).squeeze( + 1 + ) + if mask.size(-1) > d: + mask = mask[..., :d, :d] + else: + mask = torch.zeros((B, d, d)) + mask_inds = torch.randint( + 0, + L, + size=( + B, + int( + L + * ((mask_prob + mask_prob_adjust) / mask_length**2) + * (1 + mask_dropout) + ), + ), + ) + mask.view(B, -1).scatter_(1, mask_inds, 1) + centers = mask.nonzero(as_tuple=True) + + inds = ([], [], []) + + offset = mask_length // 2 + for i in range(mask_length): + for j in range(mask_length): + k1 = i - offset + k2 = j - offset + inds[0].append(centers[0]) + inds[1].append(centers[1] + k1) + inds[2].append(centers[2] + k2) + + i0 = torch.cat(inds[0]) + i1 = torch.cat(inds[1]).clamp_(min=0, max=d - 1) + i2 = torch.cat(inds[2]).clamp_(min=0, max=d - 1) + + mask[(i0, i1, i2)] = 1 + + def get_nbs(b, m, w): + all_nbs = torch.nn.functional.conv2d(m.unsqueeze(1), w, padding="same") + all_nbs = all_nbs.clamp_max_(1).view(b, -1) + return all_nbs + + if require_same_masks and expand_adjcent: + w = torch.zeros((1, 1, 3, 3)) + w[..., 0, 1] = 1 + w[..., 2, 1] = 1 + w[..., 1, 0] = 1 + w[..., 1, 2] = 1 + + all_nbs = get_nbs(B, mask, w) + + mask = mask.reshape(B, -1) + + if require_same_masks: + n_masks = mask.sum(dim=-1) + final_target_len = int(L * (mask_prob)) + target_len = int(final_target_len * (1 + mask_dropout)) + + for i in range(len(mask)): + n = n_masks[i] + m = mask[i] + r = 0 + while expand_adjcent and n < target_len: + if r == 0: + nbs = all_nbs[i] + else: + nbs = get_nbs(1, m.view(1, d, d), w).flatten() + + cands = (1 - m + 
nbs) > 1 + cand_sz = int(cands.sum().item()) + + assert cand_sz > 0, f"{nbs} {cand_sz}" + + to_mask = torch.multinomial( + cands.float(), min(cand_sz, int(target_len - n)), replacement=False + ) + m[to_mask] = 1 + assert to_mask.numel() > 0 + n += to_mask.numel() + r += 1 + + if n > final_target_len: + to_unmask = torch.multinomial( + m, int(n - final_target_len), replacement=False + ) + m[to_unmask] = 0 + elif n < final_target_len: + to_mask = torch.multinomial( + (1 - m), int(final_target_len - n), replacement=False + ) + m[to_mask] = 1 + + if inverse_mask: + mask = 1 - mask + + return mask + + +def compute_block_mask_1d( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + mask_prob_adjust: float = 0, + inverse_mask: bool = False, + require_same_masks: bool = True, + expand_adjcent: bool = False, + mask_dropout: float = 0, + non_overlapping: bool = False, +) -> torch.Tensor: + + B, L = shape + + if inverse_mask: + mask_prob = 1 - mask_prob + + if non_overlapping: + sz = math.ceil(L / mask_length) + + inp = torch.zeros((B, 1, sz)) + w = torch.ones((1, 1, mask_length)) + + mask_inds = torch.multinomial( + 1 - inp.view(B, -1), + int(sz * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)), + replacement=False, + ) + inp.view(B, -1).scatter_(1, mask_inds, 1) + + mask = torch.nn.functional.conv_transpose1d(inp, w, stride=mask_length).squeeze( + 1 + ) + if mask.size(-1) > L: + mask = mask[..., :L] + + else: + mask = torch.zeros((B, L)) + mask_inds = torch.randint( + 0, + L, + size=( + B, + int( + L + * ((mask_prob + mask_prob_adjust) / mask_length) + * (1 + mask_dropout) + ), + ), + ) + + mask.view(B, -1).scatter_(1, mask_inds, 1) + centers = mask.nonzero(as_tuple=True) + + inds = ([], []) + + offset = mask_length // 2 + for i in range(mask_length): + k1 = i - offset + inds[0].append(centers[0]) + inds[1].append(centers[1] + k1) + + i0 = torch.cat(inds[0]) + i1 = torch.cat(inds[1]).clamp_(min=0, max=L - 1) + + mask[(i0, i1)] = 1 + + def get_nbs(b, m, w): + all_nbs = torch.nn.functional.conv1d(m.unsqueeze(1), w, padding="same") + all_nbs = all_nbs.clamp_max_(1).view(b, -1) + return all_nbs + + if require_same_masks and expand_adjcent: + w = torch.ones((1, 1, 3)) + w[..., 1] = 0 + all_nbs = get_nbs(B, mask, w) + + mask = mask.view(B, -1) + + if require_same_masks: + n_masks = mask.sum(dim=-1) + final_target_len = int(L * (mask_prob)) + target_len = int(final_target_len * (1 + mask_dropout)) + + for i in range(len(mask)): + n = n_masks[i] + m = mask[i] + r = 0 + while expand_adjcent and n < target_len: + if r == 0: + nbs = all_nbs[i] + else: + nbs = get_nbs(1, m.unsqueeze(0), w).squeeze(0) + + cands = (1 - m + nbs) > 1 + cand_sz = int(cands.sum().item()) + + assert cand_sz > 0, f"{nbs} {cand_sz}" + + to_mask = torch.multinomial( + cands.float(), min(cand_sz, int(target_len - n)), replacement=False + ) + m[to_mask] = 1 + assert to_mask.numel() > 0 + n += to_mask.numel() + r += 1 + + if n > final_target_len: + to_unmask = torch.multinomial( + m, int(n - final_target_len), replacement=False + ) + m[to_unmask] = 0 + elif n < final_target_len: + to_mask = torch.multinomial( + (1 - m), int(final_target_len - n), replacement=False + ) + m[to_mask] = 1 + + if inverse_mask: + mask = 1 - mask + return mask @@ -488,12 +857,288 @@ def get_mem_usage(): return "N/A" -def lengths_to_padding_mask(lens: torch.LongTensor) -> torch.BoolTensor: +# lens: torch.LongTensor +# returns: torch.BoolTensor +def lengths_to_padding_mask(lens): bsz, max_lens = lens.size(0), torch.max(lens).item() mask = 
torch.arange(max_lens).to(lens.device).view(1, max_lens) mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens) return mask -def lengths_to_mask(lens: torch.LongTensor) -> torch.BoolTensor: +# lens: torch.LongTensor +# returns: torch.BoolTensor +def lengths_to_mask(lens): return ~lengths_to_padding_mask(lens) + + +def get_buckets(sizes, num_buckets): + buckets = np.unique( + np.percentile( + sizes, + np.linspace(0, 100, num_buckets + 1), + interpolation="lower", + )[1:] + ) + return buckets + + +def get_bucketed_sizes(orig_sizes, buckets): + sizes = np.copy(orig_sizes) + assert np.min(sizes) >= 0 + start_val = -1 + for end_val in buckets: + mask = (sizes > start_val) & (sizes <= end_val) + sizes[mask] = end_val + start_val = end_val + return sizes + + +def _find_extra_valid_paths(dataset_path: str) -> set: + paths = utils.split_paths(dataset_path) + all_valid_paths = set() + for sub_dir in paths: + contents = PathManager.ls(sub_dir) + valid_paths = [c for c in contents if re.match("valid*[0-9].*", c) is not None] + all_valid_paths |= {os.path.basename(p) for p in valid_paths} + # Remove .bin, .idx etc + roots = {os.path.splitext(p)[0] for p in all_valid_paths} + return roots + + +def raise_if_valid_subsets_unintentionally_ignored(train_cfg) -> None: + """Raises if there are paths matching 'valid*[0-9].*' which are not combined or ignored.""" + if ( + train_cfg.dataset.ignore_unused_valid_subsets + or train_cfg.dataset.combine_valid_subsets + or train_cfg.dataset.disable_validation + or not hasattr(train_cfg.task, "data") + ): + return + other_paths = _find_extra_valid_paths(train_cfg.task.data) + specified_subsets = train_cfg.dataset.valid_subset.split(",") + ignored_paths = [p for p in other_paths if p not in specified_subsets] + if ignored_paths: + advice = "Set --combine-val to combine them or --ignore-unused-valid-subsets to ignore them." + msg = f"Valid paths {ignored_paths} will be ignored. 
{advice}" + raise ValueError(msg) + + +def compute_mask_indices_for_one( + sz, + mask_prob: float, + mask_length: int, + seed=None, + epoch=None, + index=None, + min_masks=0, +): + """ + set seed, epoch, index for deterministic masking + """ + seed = int(hash((seed, epoch, index)) % 1e6) if seed else None + rng = np.random.default_rng(seed) + + # decide elements to mask + mask = np.full(sz, False) + num_mask = int( + # add a random number for probabilistic rounding + mask_prob * sz / float(mask_length) + + rng.random() + ) + num_mask = max(min_masks, num_mask) + + # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453) + mask_idc = rng.choice(sz, num_mask, replace=False) + mask_idc = np.concatenate([mask_idc + i for i in range(mask_length)]) + mask_idc = mask_idc[mask_idc < len(mask)] + try: + mask[mask_idc] = True + except: # something wrong + print(f"Assigning mask indexes {mask_idc} to mask {mask} failed!") + raise + + return mask + + +def compute_mask_indices_v2( + shape: Tuple[int, int], + padding_mask: Optional[torch.Tensor], + mask_prob: float, + mask_length: int, + min_masks: int = 0, + require_same_masks: bool = True, + seed: Optional[int] = None, + epoch: Optional[int] = None, + indices: Optional[torch.Tensor] = None, +) -> np.ndarray: + bsz, all_sz = shape + mask = np.full((bsz, all_sz), False) + for i in range(bsz): + if padding_mask is not None: + sz = all_sz - padding_mask[i].long().sum().item() + else: + sz = all_sz + index = indices[i].item() if indices is not None else None + mask_for_one = compute_mask_indices_for_one( + sz, mask_prob, mask_length, seed, epoch, index, min_masks + ) + mask[i, :sz] = mask_for_one + + if require_same_masks: + index_sum = indices.sum().item() if indices is not None else None + seed = int(hash((seed, epoch, index_sum)) % 1e6) if seed else None + rng = np.random.default_rng(seed) + + num_mask = mask.sum(-1).min() + for i in range(bsz): + extra = mask[i].sum() - num_mask + if extra > 0: + to_unmask = rng.choice(np.nonzero(mask[i])[0], extra, replace=False) + mask[i, to_unmask] = False + + return mask + + +# TODO: a copy of the original compute_mask_indices +def compute_mask_indices_v3( + shape: Tuple[int, int], + padding_mask: Optional[torch.Tensor], + mask_prob: float, + mask_length: int, + mask_type: str = "static", + mask_other: float = 0.0, + min_masks: int = 0, + no_overlap: bool = False, + min_space: int = 0, + require_same_masks: bool = True, + mask_dropout: float = 0.0, + seed: Optional[int] = None, + epoch: Optional[int] = None, + indices: Optional[torch.Tensor] = None, +) -> np.ndarray: + """ + Computes random mask spans for a given shape + + Args: + shape: the the shape for which to compute masks. + should be of size 2 where first element is batch size and 2nd is timesteps + padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements + mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by + number of timesteps divided by length of mask span to mask approximately this percentage of all elements. + however due to overlaps, the actual number will be smaller (unless no_overlap is True) + mask_type: how to compute mask lengths + static = fixed size + uniform = sample from uniform distribution [mask_other, mask_length*2] + normal = sample from normal distribution with mean mask_length and stdev mask_other. 
mask is min 1 element + poisson = sample from possion distribution with lambda = mask length + min_masks: minimum number of masked spans + no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping + min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans + require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample + mask_dropout: randomly dropout this percentage of masks in each example + """ + bsz, all_sz = shape + mask = np.full((bsz, all_sz), False) + + all_num_mask = int( + # add a random number for probabilistic rounding + mask_prob * all_sz / float(mask_length) + + np.random.rand() + ) + + all_num_mask = max(min_masks, all_num_mask) + + mask_idcs = [] + for i in range(bsz): + if seed is not None and epoch is not None and indices is not None: + seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6) + else: + seed_i = None + rng = np.random.default_rng(seed_i) + + if padding_mask is not None: + sz = all_sz - padding_mask[i].long().sum().item() + num_mask = int( + # add a random number for probabilistic rounding + mask_prob * sz / float(mask_length) + + rng.random() + ) + num_mask = max(min_masks, num_mask) + else: + sz = all_sz + num_mask = all_num_mask + + if mask_type == "static": + lengths = np.full(num_mask, mask_length) + elif mask_type == "uniform": + lengths = rng.randint(mask_other, mask_length * 2 + 1, size=num_mask) + elif mask_type == "normal": + lengths = rng.normal(mask_length, mask_other, size=num_mask) + lengths = [max(1, int(round(x))) for x in lengths] + elif mask_type == "poisson": + lengths = rng.poisson(mask_length, size=num_mask) + lengths = [int(round(x)) for x in lengths] + else: + raise Exception("unknown mask selection " + mask_type) + + if sum(lengths) == 0: + lengths[0] = min(mask_length, sz - 1) + + if no_overlap: + mask_idc = [] + + def arrange(s, e, length, keep_length): + span_start = rng.randint(s, e - length) + mask_idc.extend(span_start + i for i in range(length)) + + new_parts = [] + if span_start - s - min_space >= keep_length: + new_parts.append((s, span_start - min_space + 1)) + if e - span_start - length - min_space > keep_length: + new_parts.append((span_start + length + min_space, e)) + return new_parts + + parts = [(0, sz)] + min_length = min(lengths) + for length in sorted(lengths, reverse=True): + lens = np.fromiter( + (e - s if e - s >= length + min_space else 0 for s, e in parts), + np.int, + ) + l_sum = np.sum(lens) + if l_sum == 0: + break + probs = lens / np.sum(lens) + c = rng.choice(len(parts), p=probs) + s, e = parts.pop(c) + parts.extend(arrange(s, e, length, min_length)) + mask_idc = np.asarray(mask_idc) + else: + min_len = min(lengths) + if sz - min_len <= num_mask: + min_len = sz - num_mask - 1 + + mask_idc = rng.choice(sz - min_len, num_mask, replace=False) + + mask_idc = np.asarray( + [ + mask_idc[j] + offset + for j in range(len(mask_idc)) + for offset in range(lengths[j]) + ] + ) + + mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) + + min_len = min([len(m) for m in mask_idcs]) + for i, mask_idc in enumerate(mask_idcs): + if len(mask_idc) > min_len and require_same_masks: + mask_idc = rng.choice(mask_idc, min_len, replace=False) + if mask_dropout > 0: + num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int) + mask_idc = rng.choice(mask_idc, len(mask_idc) - num_holes, replace=False) + + mask[i, mask_idc] = True + + return mask diff --git 
a/fairseq/data/data_utils_fast.pyx b/fairseq/data/data_utils_fast.pyx index 38b4aa67dd..c61f31d6b2 100644 --- a/fairseq/data/data_utils_fast.pyx +++ b/fairseq/data/data_utils_fast.pyx @@ -10,63 +10,118 @@ cimport cython cimport numpy as np from libc.stdint cimport int32_t, int64_t +from libcpp cimport bool as bool_t ctypedef int64_t DTYPE_t - -cdef _is_batch_full(int64_t num_sentences, int64_t num_tokens, int64_t max_tokens, int64_t max_sentences): - if num_sentences == 0: - return 0 - if max_sentences > 0 and num_sentences == max_sentences: - return 1 - if max_tokens > 0 and num_tokens > max_tokens: - return 1 - return 0 - - @cython.cdivision(True) -cpdef list batch_by_size_fast( +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef list batch_by_size_vec( + np.ndarray[int64_t, ndim=1] indices, + np.ndarray[int64_t, ndim=1] num_tokens_vec, + int64_t max_tokens, + int64_t max_sentences, + int32_t bsz_mult, +): + if indices.shape[0] == 0: + return [] + + assert max_tokens <= 0 or np.max(num_tokens_vec) <= max_tokens, ( + f"Sentences lengths should not exceed max_tokens={max_tokens}" + ) + + cdef int32_t indices_len = indices.shape[0] + cdef np.ndarray[int32_t, ndim=1] batches_ends = \ + np.zeros(indices_len, dtype=np.int32) + cdef int32_t[:] batches_ends_view = batches_ends + cdef int64_t[:] num_tokens_view = num_tokens_vec + + cdef int32_t pos = 0 + cdef int32_t new_batch_end = 0 + + cdef int64_t new_batch_max_tokens = 0 + cdef int32_t new_batch_sentences = 0 + cdef int64_t new_batch_num_tokens = 0 + + cdef bool_t overflow = False + cdef bool_t size_matches_with_bsz_mult = False + + cdef int32_t batches_count = 0 + cdef int32_t batch_start = 0 + cdef int64_t tail_max_tokens = 0 + cdef int64_t batch_max_tokens = 0 + + for pos in range(indices_len): + # At every pos we keep stats about the last complete batch [batch_start:batch_end), + # and tail [batch_end:pos]. + # 1) Every time when (batch + tail) forms a valid batch + # (according to max_tokens, max_sentences and bsz_mult) we append tail to batch. + # 2) When (batch+tail) violates max_tokens or max_sentences constraints + # we finalize running batch, and tail becomes a new batch. + # 3) There is a corner case when tail also violates constraints. + # In that situation [batch_end:pos-1] (tail without the current pos) + # gets added to the finalized batches, while [pos:pos] becomes a new tail. + # + # Important: For the sake of performance try to avoid using function calls within this loop. 
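The Python-level `data_utils.batch_by_size` dispatches to this kernel when a precomputed `num_tokens_vec` is passed in. A rough usage sketch with toy lengths (not taken from the repo), assuming the Cython extension has been built:

```python
import numpy as np
from fairseq.data import data_utils

indices = np.arange(6, dtype=np.int64)
num_tokens_vec = np.array([4, 6, 3, 7, 2, 5], dtype=np.int64)

batches = data_utils.batch_by_size(
    indices,
    num_tokens_fn=lambda i: int(num_tokens_vec[i]),  # slower per-index fallback
    num_tokens_vec=num_tokens_vec,                   # fast vectorized path
    max_tokens=12,
    max_sentences=None,
    required_batch_size_multiple=1,
)

# every batch is padded to its longest sentence, so the invariant below holds
for b in batches:
    assert len(b) * num_tokens_vec[b].max() <= 12
```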
+ + tail_max_tokens = tail_max_tokens \ + if tail_max_tokens > num_tokens_view[pos] \ + else num_tokens_view[pos] + new_batch_end = pos + 1 + new_batch_max_tokens = batch_max_tokens \ + if batch_max_tokens > tail_max_tokens \ + else tail_max_tokens + new_batch_sentences = new_batch_end - batch_start + new_batch_num_tokens = new_batch_sentences * new_batch_max_tokens + + overflow = (new_batch_sentences > max_sentences > 0 or + new_batch_num_tokens > max_tokens > 0) + size_matches_with_bsz_mult = (new_batch_sentences < bsz_mult or + new_batch_sentences % bsz_mult == 0) + + if overflow: + tail_num_tokens = tail_max_tokens * \ + (new_batch_end - batches_ends_view[batches_count]) + tail_overflow = tail_num_tokens > max_tokens > 0 + # In case of a tail overflow finalize two batches + if tail_overflow: + batches_count += 1 + batches_ends_view[batches_count] = pos + tail_max_tokens = num_tokens_view[pos] + batch_start = batches_ends_view[batches_count] + batches_count += 1 + new_batch_max_tokens = tail_max_tokens + + if overflow or size_matches_with_bsz_mult: + batches_ends_view[batches_count] = new_batch_end + batch_max_tokens = new_batch_max_tokens + tail_max_tokens = 0 + if batches_ends_view[batches_count] != indices_len: + batches_count += 1 + # Memory and time-efficient split + return np.split(indices, batches_ends[:batches_count]) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef list batch_by_size_fn( np.ndarray[DTYPE_t, ndim=1] indices, num_tokens_fn, int64_t max_tokens, int64_t max_sentences, int32_t bsz_mult, ): - cdef int64_t sample_len = 0 - cdef list sample_lens = [] - cdef list batch = [] - cdef list batches = [] - cdef int64_t mod_len - cdef int64_t i - cdef int64_t idx - cdef int64_t num_tokens + cdef int32_t indices_len = indices.shape[0] + cdef np.ndarray[int64_t, ndim=1] num_tokens_vec = np.zeros(indices_len, + dtype=np.int64) cdef DTYPE_t[:] indices_view = indices - - for i in range(len(indices_view)): - idx = indices_view[i] - num_tokens = num_tokens_fn(idx) - sample_lens.append(num_tokens) - sample_len = max(sample_len, num_tokens) - - assert max_tokens <= 0 or sample_len <= max_tokens, ( - "sentence at index {} of size {} exceeds max_tokens " - "limit of {}!".format(idx, sample_len, max_tokens) - ) - num_tokens = (len(batch) + 1) * sample_len - - if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences): - mod_len = max( - bsz_mult * (len(batch) // bsz_mult), - len(batch) % bsz_mult, - ) - batches.append(batch[:mod_len]) - batch = batch[mod_len:] - sample_lens = sample_lens[mod_len:] - sample_len = max(sample_lens) if len(sample_lens) > 0 else 0 - batch.append(idx) - if len(batch) > 0: - batches.append(batch) - return batches + cdef DTYPE_t[:] num_tokens_vec_view = num_tokens_vec + cdef int64_t pos + for pos in range(indices_len): + num_tokens_vec[pos] = num_tokens_fn(indices_view[pos]) + return batch_by_size_vec(indices, num_tokens_vec, max_tokens, + max_sentences, bsz_mult,) cdef _find_valid_shape( diff --git a/fairseq/data/denoising_dataset.py b/fairseq/data/denoising_dataset.py index bdb62c8d5d..a900fc6f96 100644 --- a/fairseq/data/denoising_dataset.py +++ b/fairseq/data/denoising_dataset.py @@ -107,7 +107,6 @@ class DenoisingDataset(FairseqDataset): shuffle (bool, optional): shuffle the elements before batching. Default: ``True`` seed: Seed for random number generator for reproducibility. - args: argparse arguments. 
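With this refactor the noising hyperparameters are passed explicitly instead of through an argparse namespace. A hypothetical call site, with illustrative values and with `dataset`, `sizes`, `vocab`, `mask_idx` and `mask_whole_words` assumed to exist already:

```python
denoising = DenoisingDataset(
    dataset,
    sizes,
    vocab,
    mask_idx,
    mask_whole_words,
    shuffle=True,
    seed=1,
    mask=0.3,                    # was args.mask
    mask_random=0.1,             # was args.mask_random
    insert=0.0,                  # was args.insert
    rotate=0.0,                  # was args.rotate
    permute_sentences=1.0,       # was args.permute_sentences
    bpe="gpt2",                  # was args.bpe
    replace_length=1,            # was args.replace_length
    mask_length="span-poisson",  # was args.mask_length
    poisson_lambda=3.5,          # was args.poisson_lambda
)
```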
""" def __init__( @@ -119,7 +118,15 @@ def __init__( mask_whole_words, shuffle, seed, - args, + mask, + mask_random, + insert, + rotate, + permute_sentences, + bpe, + replace_length, + mask_length, + poisson_lambda, eos=None, item_transform_func=None, ): @@ -132,31 +139,31 @@ def __init__( self.seed = seed self.mask_idx = mask_idx self.mask_whole_word = mask_whole_words - self.mask_ratio = args.mask - self.random_ratio = args.mask_random - self.insert_ratio = args.insert - self.rotate_ratio = args.rotate - self.permute_sentence_ratio = args.permute_sentences + self.mask_ratio = mask + self.random_ratio = mask_random + self.insert_ratio = insert + self.rotate_ratio = rotate + self.permute_sentence_ratio = permute_sentences self.eos = eos if eos is not None else vocab.eos() self.item_transform_func = item_transform_func - if args.bpe != "gpt2": + if bpe != "gpt2": self.full_stop_index = self.vocab.eos() else: - assert args.bpe == "gpt2" + assert bpe == "gpt2" self.full_stop_index = self.vocab.index("13") - self.replace_length = args.replace_length + self.replace_length = replace_length if self.replace_length not in [-1, 0, 1]: raise ValueError(f"invalid arg: replace_length={self.replace_length}") - if args.mask_length not in ["subword", "word", "span-poisson"]: - raise ValueError(f"invalid arg: mask-length={args.mask_length}") - if args.mask_length == "subword" and args.replace_length not in [0, 1]: + if mask_length not in ["subword", "word", "span-poisson"]: + raise ValueError(f"invalid arg: mask-length={mask_length}") + if mask_length == "subword" and replace_length not in [0, 1]: raise ValueError(f"if using subwords, use replace-length=1 or 0") self.mask_span_distribution = None - if args.mask_length == "span-poisson": - _lambda = args.poisson_lambda + if mask_length == "span-poisson": + _lambda = poisson_lambda lambda_to_the_k = 1 e_to_the_minus_lambda = math.exp(-_lambda) diff --git a/fairseq/data/dictionary.py b/fairseq/data/dictionary.py index e2df08e092..7ad590a19b 100644 --- a/fairseq/data/dictionary.py +++ b/fairseq/data/dictionary.py @@ -9,13 +9,13 @@ import torch from fairseq import utils -from fairseq.binarizer import safe_readline from fairseq.data import data_utils +from fairseq.file_chunker_utils import Chunker, find_offsets from fairseq.file_io import PathManager from fairseq.tokenizer import tokenize_line -class Dictionary(object): +class Dictionary: """A mapping from symbols to consecutive integers""" def __init__( @@ -26,19 +26,21 @@ def __init__( eos="</s>", unk="<unk>", extra_special_symbols=None, + add_special_symbols=True, ): self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos self.symbols = [] self.count = [] self.indices = {} - self.bos_index = self.add_symbol(bos) - self.pad_index = self.add_symbol(pad) - self.eos_index = self.add_symbol(eos) - self.unk_index = self.add_symbol(unk) - if extra_special_symbols: - for s in extra_special_symbols: - self.add_symbol(s) - self.nspecial = len(self.symbols) + if add_special_symbols: + self.bos_index = self.add_symbol(bos) + self.pad_index = self.add_symbol(pad) + self.eos_index = self.add_symbol(eos) + self.unk_index = self.add_symbol(unk) + if extra_special_symbols: + for s in extra_special_symbols: + self.add_symbol(s) + self.nspecial = len(self.symbols) def __eq__(self, other): return self.indices == other.indices @@ -48,6 +50,9 @@ def __getitem__(self, idx): return self.symbols[idx] return self.unk_word + def get_count(self, idx): + return self.count[idx] + def __len__(self): """Returns the 
number of symbols in the dictionary""" return len(self.symbols) @@ -69,6 +74,8 @@ def string( escape_unk=False, extra_symbols_to_ignore=None, unk_string=None, + include_eos=False, + separator=" ", ): """Helper for converting a tensor of token indices to a string. @@ -76,12 +83,19 @@ def string( """ if torch.is_tensor(tensor) and tensor.dim() == 2: return "\n".join( - self.string(t, bpe_symbol, escape_unk, extra_symbols_to_ignore) + self.string( + t, + bpe_symbol, + escape_unk, + extra_symbols_to_ignore, + include_eos=include_eos, + ) for t in tensor ) extra_symbols_to_ignore = set(extra_symbols_to_ignore or []) - extra_symbols_to_ignore.add(self.eos()) + if not include_eos: + extra_symbols_to_ignore.add(self.eos()) def token_string(i): if i == self.unk(): @@ -95,7 +109,7 @@ def token_string(i): if hasattr(self, "bos_index"): extra_symbols_to_ignore.add(self.bos()) - sent = " ".join( + sent = separator.join( token_string(i) for i in tensor if utils.item(i) not in extra_symbols_to_ignore @@ -201,7 +215,7 @@ def unk(self): return self.unk_index @classmethod - def load(cls, f): + def load(cls, f, add_special_symbols=True): """Loads the dictionary from a text file with the format: ``` @@ -210,7 +224,7 @@ def load(cls, f): ... ``` """ - d = cls() + d = cls(add_special_symbols=add_special_symbols) d.add_from_file(f) return d @@ -256,7 +270,7 @@ def add_from_file(self, f): self.add_symbol(word, n=count, overwrite=overwrite) except ValueError: raise ValueError( - "Incorrect dictionary format, expected '<token> <cnt> [flags]'" + f"Incorrect dictionary format, expected '<token> <cnt> [flags]': \"{line}\"" ) def _save(self, f, kv_iterator): @@ -297,7 +311,7 @@ def encode_line( consumer=None, append_eos=True, reverse_order=False, - ): + ) -> torch.IntTensor: words = line_tokenizer(line) if reverse_order: words = list(reversed(words)) @@ -318,25 +332,18 @@ def encode_line( @staticmethod def _add_file_to_dictionary_single_worker( - filename, tokenize, eos_word, worker_id=0, num_workers=1 + filename, + tokenize, + eos_word, + start_offset, + end_offset, ): counter = Counter() - with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f: - size = os.fstat(f.fileno()).st_size - chunk_size = size // num_workers - offset = worker_id * chunk_size - end = offset + chunk_size - f.seek(offset) - if offset > 0: - safe_readline(f) # drop first incomplete line - line = f.readline() - while line: + with Chunker(filename, start_offset, end_offset) as line_iterator: + for line in line_iterator: for word in tokenize(line): counter.update([word]) counter.update([eos_word]) - if f.tell() > end: - break - line = f.readline() return counter @staticmethod @@ -345,14 +352,23 @@ def merge_result(counter): for w, c in sorted(counter.items()): dict.add_symbol(w, c) + local_file = PathManager.get_local_path(filename) + offsets = find_offsets(local_file, num_workers) if num_workers > 1: + chunks = zip(offsets, offsets[1:]) pool = Pool(processes=num_workers) results = [] - for worker_id in range(num_workers): + for (start_offset, end_offset) in chunks: results.append( pool.apply_async( Dictionary._add_file_to_dictionary_single_worker, - (filename, tokenize, dict.eos_word, worker_id, num_workers), + ( + local_file, + tokenize, + dict.eos_word, + start_offset, + end_offset, + ), ) ) pool.close() @@ -362,7 +378,7 @@ def merge_result(counter): else: merge_result( Dictionary._add_file_to_dictionary_single_worker( - filename, tokenize, dict.eos_word + local_file, tokenize, dict.eos_word, offsets[0], offsets[1] ) ) diff 
--git a/fairseq/data/encoders/__init__.py b/fairseq/data/encoders/__init__.py index 2e807d8ae7..7cbe00a105 100644 --- a/fairseq/data/encoders/__init__.py +++ b/fairseq/data/encoders/__init__.py @@ -23,7 +23,7 @@ # automatically import any Python files in the encoders/ directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): module = file[: file.find(".py")] importlib.import_module("fairseq.data.encoders." + module) diff --git a/fairseq/data/encoders/gpt2_bpe_utils.py b/fairseq/data/encoders/gpt2_bpe_utils.py index 688d4e36e3..996d3d4a11 100644 --- a/fairseq/data/encoders/gpt2_bpe_utils.py +++ b/fairseq/data/encoders/gpt2_bpe_utils.py @@ -27,10 +27,10 @@ def bytes_to_unicode(): ) cs = bs[:] n = 0 - for b in range(2 ** 8): + for b in range(2**8): if b not in bs: bs.append(b) - cs.append(2 ** 8 + n) + cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) diff --git a/fairseq/data/encoders/hf_byte_bpe.py b/fairseq/data/encoders/hf_byte_bpe.py index 92d2c3922c..c508578d41 100644 --- a/fairseq/data/encoders/hf_byte_bpe.py +++ b/fairseq/data/encoders/hf_byte_bpe.py @@ -7,6 +7,7 @@ from fairseq.data.encoders import register_bpe from fairseq.dataclass import FairseqDataclass +from fairseq import file_utils @dataclass @@ -28,9 +29,12 @@ def __init__(self, cfg): "Please install huggingface/tokenizers with: " "pip install tokenizers" ) + bpe_vocab = file_utils.cached_path(cfg.bpe_vocab) + bpe_merges = file_utils.cached_path(cfg.bpe_merges) + self.bpe = ByteLevelBPETokenizer( - cfg.bpe_vocab, - cfg.bpe_merges, + bpe_vocab, + bpe_merges, add_prefix_space=cfg.bpe_add_prefix_space, ) diff --git a/fairseq/data/encoders/moses_tokenizer.py b/fairseq/data/encoders/moses_tokenizer.py index fa004dd4af..e236dad167 100644 --- a/fairseq/data/encoders/moses_tokenizer.py +++ b/fairseq/data/encoders/moses_tokenizer.py @@ -24,7 +24,7 @@ class MosesTokenizerConfig(FairseqDataclass): @register_tokenizer("moses", dataclass=MosesTokenizerConfig) class MosesTokenizer(object): - def __init__(self, cfg): + def __init__(self, cfg: MosesTokenizerConfig): self.cfg = cfg try: diff --git a/fairseq/data/encoders/nltk_tokenizer.py b/fairseq/data/encoders/nltk_tokenizer.py index ee164710a0..0ab92377b3 100644 --- a/fairseq/data/encoders/nltk_tokenizer.py +++ b/fairseq/data/encoders/nltk_tokenizer.py @@ -4,9 +4,10 @@ # LICENSE file in the root directory of this source tree. from fairseq.data.encoders import register_tokenizer +from fairseq.dataclass import FairseqDataclass -@register_tokenizer("nltk") +@register_tokenizer("nltk", dataclass=FairseqDataclass) class NLTKTokenizer(object): def __init__(self, *unused): try: diff --git a/fairseq/data/encoders/sentencepiece_bpe.py b/fairseq/data/encoders/sentencepiece_bpe.py index a76d46a201..0aa6cd7681 100644 --- a/fairseq/data/encoders/sentencepiece_bpe.py +++ b/fairseq/data/encoders/sentencepiece_bpe.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. 
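The `Dictionary` changes earlier in this patch are easiest to see in use; a small sketch (the dictionary path and sample line are placeholders):

```python
from fairseq.data import Dictionary

# new: skip <s>/<pad>/</s>/<unk> when loading, e.g. for plain label inventories
labels = Dictionary.load("dict.txt", add_special_symbols=False)

d = Dictionary.load("dict.txt")
ids = d.encode_line("a b c", append_eos=True)           # -> torch.IntTensor
print(d.string(ids))                                    # eos stripped by default
print(d.string(ids, include_eos=True, separator="|"))   # new keyword arguments
```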
from dataclasses import dataclass, field +from typing import Optional from fairseq import file_utils from fairseq.data.encoders import register_bpe @@ -15,11 +16,23 @@ class SentencepieceConfig(FairseqDataclass): sentencepiece_model: str = field( default="???", metadata={"help": "path to sentencepiece model"} ) + sentencepiece_enable_sampling: bool = field( + default=False, metadata={"help": "enable sampling"} + ) + sentencepiece_alpha: Optional[float] = field( + default=None, + metadata={ + "help": "soothing parameter for unigram sampling, " + "and merge probability for BPE-dropout" + }, + ) @register_bpe("sentencepiece", dataclass=SentencepieceConfig) class SentencepieceBPE(object): def __init__(self, cfg): + self.enable_sampling = cfg.sentencepiece_enable_sampling + self.alpha = cfg.sentencepiece_alpha sentencepiece_model = file_utils.cached_path(cfg.sentencepiece_model) try: import sentencepiece as spm @@ -32,7 +45,11 @@ def __init__(self, cfg): ) def encode(self, x: str) -> str: - return " ".join(self.sp.EncodeAsPieces(x)) + return " ".join( + self.sp.Encode( + x, out_type=str, enable_sampling=self.enable_sampling, alpha=self.alpha + ) + ) def decode(self, x: str) -> str: return x.replace(" ", "").replace("\u2581", " ").strip() diff --git a/fairseq/data/encoders/space_tokenizer.py b/fairseq/data/encoders/space_tokenizer.py index 7c7f644d5c..925ad41b7c 100644 --- a/fairseq/data/encoders/space_tokenizer.py +++ b/fairseq/data/encoders/space_tokenizer.py @@ -6,9 +6,10 @@ import re from fairseq.data.encoders import register_tokenizer +from fairseq.dataclass import FairseqDataclass -@register_tokenizer("space") +@register_tokenizer("space", dataclass=FairseqDataclass) class SpaceTokenizer(object): def __init__(self, *unused): self.space_tok = re.compile(r"\s+") diff --git a/fairseq/data/fairseq_dataset.py b/fairseq/data/fairseq_dataset.py index ed08c1ba20..2bde7fc57b 100644 --- a/fairseq/data/fairseq_dataset.py +++ b/fairseq/data/fairseq_dataset.py @@ -3,10 +3,13 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import logging import numpy as np import torch.utils.data from fairseq.data import data_utils +logger = logging.getLogger(__name__) + class EpochListening: """Mixin for receiving updates whenever the epoch increments.""" @@ -54,6 +57,11 @@ def num_tokens(self, index): enforce ``--max-tokens`` during batching.""" raise NotImplementedError + def num_tokens_vec(self, indices): + """Return the number of tokens for a set of positions defined by indices. + This value is used to enforce ``--max-tokens`` during batching.""" + raise NotImplementedError + def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``.""" @@ -129,9 +137,15 @@ def adjust_bsz(bsz, num_tokens): ] ) + try: + num_tokens_vec = self.num_tokens_vec(indices).astype("int64") + except NotImplementedError: + num_tokens_vec = None + return data_utils.batch_by_size( indices, num_tokens_fn=self.num_tokens, + num_tokens_vec=num_tokens_vec, max_tokens=max_tokens, max_sentences=max_sentences, required_batch_size_multiple=required_batch_size_multiple, diff --git a/fairseq/data/huffman/__init__.py b/fairseq/data/huffman/__init__.py new file mode 100644 index 0000000000..9b61fafadb --- /dev/null +++ b/fairseq/data/huffman/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
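The new sampling options on `SentencepieceBPE` enable subword regularization (BPE-dropout); the underlying sentencepiece call looks roughly like this, with a placeholder model path:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spm.model")  # placeholder path

# deterministic segmentation (previous behaviour)
print(sp.Encode("machine translation", out_type=str))

# sampled segmentation, as used when sentencepiece_enable_sampling is set
print(sp.Encode("machine translation", out_type=str,
                enable_sampling=True, alpha=0.1))
```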
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .huffman_coder import HuffmanCodeBuilder, HuffmanCoder +from .huffman_mmap_indexed_dataset import ( + HuffmanMMapIndex, + HuffmanMMapIndexedDataset, + HuffmanMMapIndexedDatasetBuilder, + vocab_file_path, +) + +__all__ = [ + "HuffmanCoder", + "HuffmanCodeBuilder", + "HuffmanMMapIndexedDatasetBuilder", + "HuffmanMMapIndexedDataset", + "HuffmanMMapIndex", + "vocab_file_path", +] diff --git a/fairseq/data/huffman/huffman_coder.py b/fairseq/data/huffman/huffman_coder.py new file mode 100644 index 0000000000..c04f84564e --- /dev/null +++ b/fairseq/data/huffman/huffman_coder.py @@ -0,0 +1,267 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import re +import typing as tp +from collections import Counter, deque +from dataclasses import dataclass + +from bitarray import bitarray, util +from fairseq.data import Dictionary + +# basically we have to write to addressable bytes for the memory mapped +# dataset loader. Sentences that get encoded to a length that is not a +# multiple of BLOCKSIZE (a byte) will be padded to fit. (see _pad in the coder) +BLOCKSIZE = 8 + + +class HuffmanCoder: + def __init__( + self, root: "HuffmanNode", bos="<s>", pad="<pad>", eos="</s>", unk="<unk>" + ): + self.root = root + self.table = root.code_table() + self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos + + def _pad(self, a: bitarray) -> bitarray: + """ + bitpadding, 1 then 0. + + If the array is already a multiple of blocksize, we add a full block. + """ + pad_len = BLOCKSIZE - (len(a) % BLOCKSIZE) - 1 + padding = bitarray("1" + "0" * pad_len) + return a + padding + + def _unpad(self, a: bitarray) -> bitarray: + """ + remove the bitpadding. + + There will be a set of 0s preceded by a 1 at the end of the bitarray, we remove that + """ + # count the 0 padding at the end until we find the first 1 + # we want to remove the one too + remove_cnt = util.rindex(a, 1) + return a[:remove_cnt] + + def encode(self, iter: tp.List[str]) -> bytes: + """ + encode a list of tokens a return bytes. We use bitpadding to make sure the encoded bits fit in bytes. + """ + a = bitarray() + for token in iter: + code = self.get_code(token) + if code is None: + if self.unk_word is None: + raise Exception(f"unknown token {token} cannot be encoded.") + else: + token = self.unk_word + a = a + self.get_code(token) + return self._pad(a).tobytes() + + def decode(self, bits: bytes) -> tp.Iterator["HuffmanNode"]: + """ + take bitpadded bytes and decode it to a set of leaves. 
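The bit-padding in `_pad`/`_unpad` is the usual "append a 1, then 0s up to the byte boundary" scheme; a quick check with bitarray directly:

```python
from bitarray import bitarray, util

a = bitarray("10110")             # 5 code bits
pad_len = 8 - (len(a) % 8) - 1    # same arithmetic as HuffmanCoder._pad
padded = a + bitarray("1" + "0" * pad_len)
assert len(padded) % 8 == 0       # now exactly one byte

unpadded = padded[: util.rindex(padded, 1)]  # drop the marker 1 and trailing 0s
assert unpadded == a
```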
You can then use each node to find the symbol/id + """ + a = bitarray() + a.frombytes(bits) + return self.root.decode(self._unpad(a)) + + def get_code(self, symbol: str) -> tp.Optional[bitarray]: + node = self.get_node(symbol) + return None if node is None else node.code + + def get_node(self, symbol: str) -> "HuffmanNode": + return self.table.get(symbol) + + @classmethod + def from_file( + cls, + filename: str, + bos="<s>", + pad="<pad>", + eos="</s>", + unk="<unk>", + ) -> "HuffmanCoder": + builder = HuffmanCodeBuilder.from_file(filename) + return builder.build_code(bos=bos, pad=pad, eos=eos, unk=unk) + + def to_file(self, filename, sep="\t"): + nodes = list(self.table.values()) + nodes.sort(key=lambda n: n.id) + with open(filename, "w", encoding="utf-8") as output: + for n in nodes: + output.write(f"{n.symbol}{sep}{n.count}\n") + + def __iter__(self): + for n in self.table.values(): + yield n + + def merge(self, other_coder: "HuffmanCoder") -> "HuffmanCoder": + builder = HuffmanCodeBuilder() + for n in self: + builder.increment(n.symbol, n.count) + for n in other_coder: + builder.increment(n.symbol, n.count) + return builder.build_code() + + def __eq__(self, other: "HuffmanCoder") -> bool: + return self.table == other.table + + def __len__(self) -> int: + return len(self.table) + + def __contains__(self, sym: str) -> bool: + return sym in self.table + + def to_dictionary(self) -> Dictionary: + dictionary = Dictionary(bos=self.bos, unk=self.unk, pad=self.pad, eos=self.eos) + for n in self: + dictionary.add_symbol(n.symbol, n=n.count) + dictionary.finalize() + return dictionary + + +@dataclass +class HuffmanNode: + """ + a node in a Huffman tree + """ + + id: int + count: int + symbol: tp.Optional[str] = None + left: tp.Optional["HuffmanNode"] = None + right: tp.Optional["HuffmanNode"] = None + code: tp.Optional[bitarray] = None + + def is_leaf(self) -> bool: + return self.left is None and self.right is None + + def code_table( + self, prefix: tp.Optional[bitarray] = None + ) -> tp.Dict[str, "HuffmanNode"]: + defaulted_prefix = prefix if prefix is not None else bitarray() + if self.is_leaf(): + self.code = ( + defaulted_prefix if len(defaulted_prefix) > 0 else bitarray("0") + ) # leaf could be the root if there is only one symbol + return {self.symbol: self} + + codes_right = self.right.code_table(defaulted_prefix + bitarray([0])) + codes_left = self.left.code_table(defaulted_prefix + bitarray([1])) + return {**codes_left, **codes_right} + + def decode(self, bits: bitarray) -> tp.Iterator["HuffmanNode"]: + current_node = self + for bit in bits: + if bit == 0: # go right + current_node = current_node.right + else: # go left + current_node = current_node.left + if current_node is None: + # we shouldn't be on a leaf here + raise Exception("fell off a leaf") + if current_node.is_leaf(): + yield current_node + current_node = self + if current_node != self: + raise Exception("couldn't decode all the bits") + + +class HuffmanCodeBuilder: + """ + build a dictionary with occurence count and then build the Huffman code for it. 
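Putting the builder and the coder together, an encode/decode round trip looks roughly like this:

```python
from fairseq.data.huffman import HuffmanCodeBuilder

builder = HuffmanCodeBuilder()
builder.add_symbols("the", "cat", "sat", "on", "the", "mat")
builder.increment("the", 100)            # frequent symbols get shorter codes

coder = builder.build_code()             # adds <s>/<pad>/</s>/<unk> if missing
payload = coder.encode(["the", "cat", "sat"])          # bit-padded bytes
assert [n.symbol for n in coder.decode(payload)] == ["the", "cat", "sat"]
```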
+ """ + + def __init__(self): + self.symbols = Counter() + + def add_symbols(self, *syms) -> None: + self.symbols.update(syms) + + def increment(self, symbol: str, cnt: int) -> None: + self.symbols[symbol] += cnt + + @classmethod + def from_file(cls, filename): + c = cls() + with open(filename, "r", encoding="utf-8") as input: + for line in input: + split = re.split(r"[\s]+", line) + c.increment(split[0], int(split[1])) + return c + + def to_file(self, filename, sep="\t"): + with open(filename, "w", encoding="utf-8") as output: + for (tok, cnt) in self.symbols.most_common(): + output.write(f"{tok}{sep}{cnt}\n") + + def _smallest(self, q1: deque, q2: deque) -> HuffmanNode: + if len(q1) == 0: + return q2.pop() + + if len(q2) == 0: + return q1.pop() + + if q1[-1].count < q2[-1].count: + return q1.pop() + + return q2.pop() + + def __add__(self, c: "HuffmanCodeBuilder") -> "HuffmanCodeBuilder": + new_c = self.symbols + c.symbols + new_b = HuffmanCodeBuilder() + new_b.symbols = new_c + return new_b + + def build_code( + self, + bos="<s>", + pad="<pad>", + eos="</s>", + unk="<unk>", + ) -> HuffmanCoder: + assert len(self.symbols) > 0, "cannot build code from empty list of symbols" + + if self.symbols[bos] == 0: + self.add_symbols(bos) + if self.symbols[pad] == 0: + self.add_symbols(pad) + if self.symbols[eos] == 0: + self.add_symbols(eos) + if self.symbols[unk] == 0: + self.add_symbols(unk) + + node_id = 0 + leaves_queue = deque( + [ + HuffmanNode(symbol=symbol, count=count, id=idx) + for idx, (symbol, count) in enumerate(self.symbols.most_common()) + ] + ) # left are the most common, right are the least common + + if len(leaves_queue) == 1: + root = leaves_queue.pop() + root.id = 0 + return HuffmanCoder(root) + + nodes_queue = deque() + + while len(leaves_queue) > 0 or len(nodes_queue) != 1: + # get the lowest two nodes at the head of each queue + node1 = self._smallest(leaves_queue, nodes_queue) + node2 = self._smallest(leaves_queue, nodes_queue) + + # add new node + nodes_queue.appendleft( + HuffmanNode( + count=node1.count + node2.count, left=node1, right=node2, id=node_id + ) + ) + node_id += 1 + + # we are left with the root + return HuffmanCoder(nodes_queue.pop(), bos=bos, pad=pad, eos=eos, unk=unk) diff --git a/fairseq/data/huffman/huffman_mmap_indexed_dataset.py b/fairseq/data/huffman/huffman_mmap_indexed_dataset.py new file mode 100644 index 0000000000..9b098f2c2b --- /dev/null +++ b/fairseq/data/huffman/huffman_mmap_indexed_dataset.py @@ -0,0 +1,287 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import mmap +import os +import shutil +import struct +import typing as tp +from functools import lru_cache + +import numpy as np +import torch +from fairseq.data import indexed_dataset +from fairseq.data.huffman import HuffmanCoder +from fairseq.file_io import PathManager + + +class HuffmanMMapIndex: + """ + keep an index of the offsets in the huffman binary file. + First a header, then the list of sizes (num tokens) for each instance and finally + the addresses of each instance. 
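The resulting index file has a fixed layout that is easy to inspect by hand; a sketch of reading just the header (the path is a placeholder):

```python
import struct

with open("train.idx", "rb") as f:                  # placeholder path (prefix + ".idx")
    assert f.read(9) == b"HUFFIDX\x00\x00"          # magic
    (version,) = struct.unpack("<Q", f.read(8))     # format version, currently 1
    (data_len,) = struct.unpack("<Q", f.read(8))    # length of the .bin payload
    (n_items,) = struct.unpack("<Q", f.read(8))     # number of sentences
    # followed by n_items int32 sizes, then n_items int64 end pointers
```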
+ """ + + _HDR_MAGIC = b"HUFFIDX\x00\x00" + _VERSION = 1 + + @classmethod + def writer(cls, path: str, data_len: int): + class _Writer: + def __enter__(self): + self._file = open(path, "wb") + + # write header (magic + version) + self._file.write(cls._HDR_MAGIC) + self._file.write(struct.pack("<Q", cls._VERSION)) + self._file.write(struct.pack("<Q", data_len)) + + return self + + def write(self, sizes, pointers): + # add number of items in the index to the header + self._file.write(struct.pack("<Q", len(sizes))) + + # write sizes + sizes = np.array(sizes, dtype=np.int32) + self._file.write(sizes.tobytes(order="C")) + del sizes + + # write address pointers + pointers = np.array(pointers, dtype=np.int64) + self._file.write(pointers.tobytes(order="C")) + del pointers + + def __exit__(self, exc_type, exc_val, exc_tb): + self._file.close() + + return _Writer() + + def __init__(self, path): + with open(path, "rb") as stream: + # read headers + magic_test = stream.read(9) + assert self._HDR_MAGIC == magic_test, ( + "Index file doesn't match expected format. " + "Make sure that --dataset-impl is configured properly." + ) + (version,) = struct.unpack("<Q", stream.read(8)) + assert ( + self._VERSION == version + ), f"Unexpected file version{version} != code version {self._VERSION}" + + # read length of data file + (self._data_len,) = struct.unpack("<Q", stream.read(8)) + # read number of items in data file/index + (self._len,) = struct.unpack("<Q", stream.read(8)) + offset = stream.tell() + + indexed_dataset._warmup_mmap_file(path) + + self._bin_buffer_mmap = np.memmap(path, mode="r", order="C") + self._bin_buffer = memoryview(self._bin_buffer_mmap) + self._sizes = np.frombuffer( + self._bin_buffer, dtype=np.int32, count=self._len, offset=offset + ) + self._pointers = np.frombuffer( + self._bin_buffer, + dtype=np.int64, + count=self._len, + offset=offset + self._sizes.nbytes, + ) + + def __del__(self): + self._bin_buffer_mmap._mmap.close() + del self._bin_buffer_mmap + + def __iter__(self): + for i in range(self._len): + yield self[i] + + @property + def data_len(self): + return self._data_len + + @property + def sizes(self): + return self._sizes + + @lru_cache(maxsize=8) + def __getitem__(self, i): + return self._pointers[i], self._sizes[i] + + def __len__(self): + return self._len + + +def vocab_file_path(prefix_path): + return prefix_path + ".vocab" + + +class HuffmanMMapIndexedDataset(torch.utils.data.Dataset): + """ + an indexed dataset that use mmap and memoryview to access data from disk + that was compressed with a HuffmanCoder. 
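Reading the dataset back is just pointer arithmetic over the mmap; in outline, with a placeholder prefix:

```python
from fairseq.data.huffman import HuffmanMMapIndexedDataset

dataset = HuffmanMMapIndexedDataset("data-bin/train")   # placeholder prefix
print(len(dataset))                      # number of sentences in the index
ids = dataset[0]                         # torch.int64 tensor of Huffman node ids
symbols = list(dataset.get_symbols(0))   # same item, decoded back to strings
# internally, item i's bytes end at pointers[i] and start where item i-1 ended
```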
+ """ + + def __init__(self, prefix_path): + super().__init__() + + self._prefix_path = None + self._index = None + self._bin_buffer = None + self._coder = None + self._file = None + + self._bin_buffer_mmap = None + + self._do_init(prefix_path) + + def __getstate__(self): + return self._prefix_path + + def __setstate__(self, state): + self._do_init(state) + + def _do_init(self, prefix_path): + self._prefix_path = prefix_path + self._index = HuffmanMMapIndex( + indexed_dataset.index_file_path(self._prefix_path) + ) + self._coder = HuffmanCoder.from_file(vocab_file_path(self._prefix_path)) + + indexed_dataset._warmup_mmap_file( + indexed_dataset.data_file_path(self._prefix_path) + ) + self._file = os.open( + indexed_dataset.data_file_path(self._prefix_path), os.O_RDONLY + ) + self._bin_buffer_mmap = mmap.mmap( + self._file, + self._index.data_len, + access=mmap.ACCESS_READ, + ) + self._bin_buffer = memoryview(self._bin_buffer_mmap) + + def __del__(self): + del self._bin_buffer + if self._file: + os.close(self._file) + del self._index + + def __len__(self): + return len(self._index) + + def _decode(self, i): + ptr, _ = self._index[i] + if i == 0: + raw_bytes = self._bin_buffer[:ptr] + else: + (prev_ptr, _) = self._index[i - 1] + raw_bytes = self._bin_buffer[prev_ptr:ptr] + + return self._coder.decode(raw_bytes.tobytes()) + + @lru_cache(maxsize=8) + def __getitem__(self, i): + nodes = self._decode(i) + return torch.tensor([n.id for n in nodes], dtype=torch.int64) + + def __iter__(self): + for idx in range(len(self)): + yield self[idx] + + def get_symbols(self, i): + nodes = self._decode(i) + for n in nodes: + yield n.symbol + + @property + def sizes(self): + return self._index.sizes + + @property + def supports_prefetch(self): + return False + + @property + def coder(self): + return self._coder + + @staticmethod + def exists(prefix_path): + return ( + PathManager.exists(indexed_dataset.index_file_path(prefix_path)) + and PathManager.exists(indexed_dataset.data_file_path(prefix_path)) + and PathManager.exists(vocab_file_path(prefix_path)) + ) + + +class HuffmanMMapIndexedDatasetBuilder: + """ + Helper to build a memory mapped datasets with a huffman encoder. + You can either open/close this manually or use it as a ContextManager. + Provide your own coder, it will then be stored alongside the dataset. + The builder will first write the vocab file, then open the binary file so you can stream + into it, finally the index will be written when the builder is closed (your index should fit in memory). + """ + + def __init__(self, path_prefix: str, coder: HuffmanCoder) -> None: + self._path_prefix = path_prefix + self._coder = coder + self._sizes = [] + self._ptrs = [] + self._data_len = 0 + + def open(self): + self._coder.to_file(vocab_file_path(self._path_prefix)) + self._data_file = open(indexed_dataset.data_file_path(self._path_prefix), "wb") + + def __enter__(self) -> "HuffmanMMapIndexedDatasetBuilder": + self.open() + return self + + def add_item(self, tokens: tp.List[str]) -> None: + """ + add a list of tokens to the dataset, they will compressed with the + provided coder before being written to file. + """ + encoded = self._coder.encode(tokens) + code_len = len(encoded) + last_ptr = 0 + if len(self._ptrs) > 0: + last_ptr = self._ptrs[-1] + self._sizes.append(len(tokens)) + self._ptrs.append(last_ptr + code_len) + self._data_len += code_len + self._data_file.write(encoded) + + def append(self, other_dataset_path_prefix: str) -> None: + """ + append an existing dataset. 
+ Beware, if it wasn't built with the same coder, you are in trouble. + """ + other_index = HuffmanMMapIndex( + indexed_dataset.index_file_path(other_dataset_path_prefix) + ) + for (ptr, size) in other_index: + self._ptrs.append(ptr + self._data_len) + self._sizes.append(size) + + # Concatenate data + with open(indexed_dataset.data_file_path(other_dataset_path_prefix), "rb") as f: + shutil.copyfileobj(f, self._data_file) + + self._data_len += other_index.data_len + + def close(self): + self._data_file.close() + with HuffmanMMapIndex.writer( + indexed_dataset.index_file_path(self._path_prefix), self._data_len + ) as index: + index.write(self._sizes, self._ptrs) + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.close() diff --git a/fairseq/data/indexed_dataset.py b/fairseq/data/indexed_dataset.py index 3efecab3a6..1947d99408 100644 --- a/fairseq/data/indexed_dataset.py +++ b/fairseq/data/indexed_dataset.py @@ -3,28 +3,40 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import os import shutil import struct from functools import lru_cache import numpy as np import torch +from fairseq.dataclass.constants import DATASET_IMPL_CHOICES from fairseq.data.fasta_dataset import FastaDataset from fairseq.file_io import PathManager +from fairseq.data.huffman import HuffmanMMapIndexedDataset, HuffmanMMapIndex from . import FairseqDataset +from typing import Union -def __best_fitting_dtype(vocab_size=None): - if vocab_size is not None and vocab_size < 65500: + +def best_fitting_int_dtype( + max_int_to_represent, +) -> Union[np.uint16, np.uint32, np.int64]: + + if max_int_to_represent is None: + return np.uint32 # Safe guess + elif max_int_to_represent < 65500: return np.uint16 + elif max_int_to_represent < 4294967295: + return np.uint32 else: - return np.int32 + return np.int64 + # we avoid np.uint64 because it doesn't save space and its type promotion behaves unexpectedly + # https://github.com/numpy/numpy/issues/5745 def get_available_dataset_impl(): - return ["raw", "lazy", "cached", "mmap", "fasta"] + return list(map(str, DATASET_IMPL_CHOICES)) def infer_dataset_impl(path): @@ -37,6 +49,8 @@ def infer_dataset_impl(path): return "cached" elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: return "mmap" + elif magic == HuffmanMMapIndex._HDR_MAGIC[:8]: + return "huffman" else: return None elif FastaDataset.exists(path): @@ -48,10 +62,14 @@ def infer_dataset_impl(path): def make_builder(out_file, impl, vocab_size=None): if impl == "mmap": return MMapIndexedDatasetBuilder( - out_file, dtype=__best_fitting_dtype(vocab_size) + out_file, dtype=best_fitting_int_dtype(vocab_size) ) elif impl == "fasta": raise NotImplementedError + elif impl == "huffman": + raise ValueError( + "Use HuffmanCodeBuilder directly as it has a different interface." 
+ ) else: return IndexedDatasetBuilder(out_file) @@ -70,6 +88,8 @@ def make_dataset(path, impl, fix_lua_indexing=False, dictionary=None): from fairseq.data.fasta_dataset import EncodedFastaDataset return EncodedFastaDataset(path, dictionary) + elif impl == "huffman" and HuffmanMMapIndexedDataset.exists(path): + return HuffmanMMapIndexedDataset(path) return None @@ -78,6 +98,8 @@ def dataset_exists(path, impl): return IndexedRawTextDataset.exists(path) elif impl == "mmap": return MMapIndexedDataset.exists(path) + elif impl == "huffman": + return HuffmanMMapIndexedDataset.exists(path) else: return IndexedDataset.exists(path) @@ -92,21 +114,23 @@ def write_longs(f, a): f.write(np.array(a, dtype=np.int64)) -dtypes = { +_code_to_dtype = { 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, - 6: np.float, + 6: np.float64, 7: np.double, 8: np.uint16, + 9: np.uint32, + 10: np.uint64, } -def code(dtype): - for k in dtypes.keys(): - if dtypes[k] == dtype: +def _dtype_header_code(dtype) -> int: + for k in _code_to_dtype.keys(): + if _code_to_dtype[k] == dtype: return k raise ValueError(dtype) @@ -141,7 +165,7 @@ def read_index(self, path): version = f.read(8) assert struct.unpack("<Q", version) == (1,) code, self.element_size = struct.unpack("<QQ", f.read(16)) - self.dtype = dtypes[code] + self.dtype = _code_to_dtype[code] self._len, self.s = struct.unpack("<QQ", f.read(16)) self.dim_offsets = read_longs(f, self._len + 1) self.data_offsets = read_longs(f, self._len + 1) @@ -159,7 +183,7 @@ def __del__(self): self.data_file.close() @lru_cache(maxsize=8) - def __getitem__(self, i): + def __getitem__(self, i) -> torch.Tensor: if not self.data_file: self.read_data(self.path) self.check_index(i) @@ -296,14 +320,14 @@ def exists(path): return PathManager.exists(path) -class IndexedDatasetBuilder(object): +class IndexedDatasetBuilder: element_sizes = { np.uint8: 1, np.int8: 1, np.int16: 2, np.int32: 4, np.int64: 8, - np.float: 4, + np.float64: 4, np.double: 8, } @@ -348,7 +372,9 @@ def finalize(self, index_file): index = open(index_file, "wb") index.write(b"TNTIDX\x00\x00") index.write(struct.pack("<Q", 1)) - index.write(struct.pack("<QQ", code(self.dtype), self.element_size)) + index.write( + struct.pack("<QQ", _dtype_header_code(self.dtype), self.element_size) + ) index.write(struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes))) write_longs(index, self.dim_offsets) write_longs(index, self.data_offsets) @@ -363,18 +389,18 @@ def _warmup_mmap_file(path): class MMapIndexedDataset(torch.utils.data.Dataset): - class Index(object): + class Index: _HDR_MAGIC = b"MMIDIDX\x00\x00" @classmethod def writer(cls, path, dtype): - class _Writer(object): + class _Writer: def __enter__(self): self._file = open(path, "wb") self._file.write(cls._HDR_MAGIC) self._file.write(struct.pack("<Q", 1)) - self._file.write(struct.pack("<B", code(dtype))) + self._file.write(struct.pack("<B", _dtype_header_code(dtype))) return self @@ -419,7 +445,7 @@ def __init__(self, path): assert (1,) == version (dtype_code,) = struct.unpack("<B", stream.read(1)) - self._dtype = dtypes[dtype_code] + self._dtype = _code_to_dtype[dtype_code] self._dtype_size = self._dtype().itemsize self._len = struct.unpack("<Q", stream.read(8))[0] @@ -516,8 +542,13 @@ def exists(path): data_file_path(path) ) + @property + def can_reuse_epoch_itr_across_epochs(self): + # TODO: a quick fix. make it a child class of FairseqDataset instead? 
+ return True + -def get_indexed_dataset_to_local(path): +def get_indexed_dataset_to_local(path) -> str: local_index_path = PathManager.get_local_path(index_file_path(path)) local_data_path = PathManager.get_local_path(data_file_path(path)) @@ -531,7 +562,7 @@ def get_indexed_dataset_to_local(path): return local_path -class MMapIndexedDatasetBuilder(object): +class MMapIndexedDatasetBuilder: def __init__(self, out_file, dtype=np.int64): self._data_file = open(out_file, "wb") self._dtype = dtype diff --git a/fairseq/data/iterators.py b/fairseq/data/iterators.py index 15796234db..6a5a42a9cf 100644 --- a/fairseq/data/iterators.py +++ b/fairseq/data/iterators.py @@ -11,6 +11,7 @@ import queue import time from threading import Thread +from typing import Iterator, List import numpy as np import torch @@ -31,74 +32,54 @@ class CountingIterator(object): iterable (iterable): iterable to wrap start (int): starting iteration count. Note that this doesn't actually advance the iterator. - total (int): override the iterator length returned by - ``__len__``. This can be used to truncate *iterator*. + total (int): override the iterator length returned by ``__len``. + This can be used to truncate *iterator*. Attributes: n (int): number of elements consumed from this iterator """ def __init__(self, iterable, start=None, total=None): - self.iterable = iterable - self.itr = iter(self) - - if start is None: - self.n = getattr(iterable, "n", 0) - else: - self.n = start - - if total is None: - self.total = self.n + len(iterable) - else: - self.total = total + self._itr = iter(iterable) + self.n = start or getattr(iterable, "n", 0) + self.total = total if total is not None else self.n + len(iterable) def __len__(self): return self.total def __iter__(self): - for x in self.iterable: - if self.n >= self.total: - raise RuntimeError( - "Mismatch between actual and expected iterable length. " - "This may be caused by resuming training from a checkpoint using " - "a different number of GPUs, in which case you can try the " - "--reset-dataloader option. Alternatively you may have a train or " - "validation set that is smaller than the number of GPUs. If none " - "of these apply, please report this to the fairseq developers." - ) - self.n += 1 - yield x + return self def __next__(self): - return next(self.itr) + if not self.has_next(): + raise StopIteration + try: + x = next(self._itr) + except StopIteration: + raise IndexError( + f"Iterator expected to have length {self.total}, " + f"but exhausted at position {self.n}." + ) + self.n += 1 + return x def has_next(self): """Whether the iterator has been exhausted.""" - return self.n < len(self) + return self.n < self.total - def skip(self, num_to_skip): - """Fast-forward the iterator by skipping *num_to_skip* elements.""" - next(itertools.islice(self.itr, num_to_skip, num_to_skip), None) + def skip(self, n): + """Fast-forward the iterator by skipping n elements.""" + for _ in range(n): + next(self) return self def take(self, n): - """ - Truncates the iterator to n elements at most. - """ + """Truncate the iterator to n elements at most.""" self.total = min(self.total, n) - # Propagate this change to the underlying iterator - # Only take after what we have already consumed (i.e. 
after restarting - # from checkpoint mid epoch, we have to subtract self.n which is the - # starting point) - # - # This to maintain the invariant self.total = self.n + len(iterable), - # before calling __next__ or __iter__ - propagated_take = max(n - self.n, 0) - if hasattr(self.iterable, "take"): - self.iterable.take(propagated_take) - else: - self.iterable = itertools.islice(self.iterable, propagated_take) + if hasattr(self._itr, "take"): + self._itr.take(max(n - self.n, 0)) + return self class EpochBatchIterating(object): @@ -109,15 +90,19 @@ def __len__(self) -> int: def next_epoch_idx(self): raise NotImplementedError - def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): + def next_epoch_itr( + self, shuffle=True, fix_batches_to_gpus=False, set_dataset_epoch=True + ): """Return a new iterator over the dataset. Args: shuffle (bool, optional): shuffle batches before returning the iterator (default: True). - fix_batches_to_gpus: ensure that batches are always + fix_batches_to_gpus (bool, optional): ensure that batches are always allocated to the same shards across epochs. Requires that :attr:`dataset` supports prefetching (default: False). + set_dataset_epoch (bool, optional): update the wrapped Dataset with + the new epoch number (default: True). """ raise NotImplementedError @@ -138,21 +123,54 @@ def load_state_dict(self, state_dict): """Copies the state of the iterator from the given *state_dict*.""" raise NotImplementedError + @property + def first_batch(self): + return "DUMMY" + class StreamingEpochBatchIterator(EpochBatchIterating): + """A steaming-style iterator over a :class:`torch.utils.data.IterableDataset`. + + Args: + dataset (~torch.utils.data.Dataset): dataset from which to load the data + max_sentences: batch size + collate_fn (callable): merges a list of samples to form a mini-batch + num_workers (int, optional): how many subprocesses to use for data + loading. 0 means the data will be loaded in the main process + (default: 0). + epoch (int, optional): the epoch to start the iterator from + (default: 1). + buffer_size (int, optional): the number of batches to keep ready in the + queue. Helps speeding up dataloading. When buffer_size is zero, the + default torch.utils.data.DataLoader preloading is used. + timeout (int, optional): if positive, the timeout value for collecting a batch + from workers. Should always be non-negative (default: ``0``). + """ + def __init__( self, dataset, + max_sentences=1, + collate_fn=None, epoch=1, - num_shards=1, - shard_id=0, + num_workers=0, + buffer_size=0, + timeout=0, + persistent_workers=True, ): assert isinstance(dataset, torch.utils.data.IterableDataset) self.dataset = dataset + self.max_sentences = max_sentences + self.collate_fn = collate_fn self.epoch = max(epoch, 1) # we use 1-based indexing for epochs + self.num_workers = num_workers + self.persistent_workers = persistent_workers and num_workers > 0 + # This upper limit here is to prevent people from abusing this feature + # in a shared computing environment. 
+ self.buffer_size = min(buffer_size, 20) + self.timeout = timeout + self._current_epoch_iterator = None - self.num_shards = num_shards - self.shard_id = shard_id @property def next_epoch_idx(self): @@ -162,16 +180,13 @@ def next_epoch_idx(self): else: return self.epoch - def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): + def next_epoch_itr( + self, shuffle=True, fix_batches_to_gpus=False, set_dataset_epoch=True + ): self.epoch = self.next_epoch_idx - self.dataset.set_epoch(self.epoch) - self._current_epoch_iterator = CountingIterator( - iterable=ShardedIterator( - iterable=self.dataset, - num_shards=self.num_shards, - shard_id=self.shard_id, - ), - ) + if set_dataset_epoch and hasattr(self.dataset, "set_epoch"): + self.dataset.set_epoch(self.epoch) + self._current_epoch_iterator = self._get_iterator_for_epoch(self.epoch, shuffle) return self._current_epoch_iterator def end_of_epoch(self) -> bool: @@ -191,6 +206,60 @@ def state_dict(self): def load_state_dict(self, state_dict): self.epoch = state_dict["epoch"] + def _get_iterator_for_epoch(self, epoch, shuffle, offset=0): + if self.num_workers > 0: + os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" + + # Create data loader + worker_init_fn = getattr(self.dataset, "worker_init_fn", None) + itr = torch.utils.data.DataLoader( + self.dataset, + batch_size=self.max_sentences, + collate_fn=self.collate_fn, + num_workers=self.num_workers, + timeout=self.timeout, + worker_init_fn=worker_init_fn, + pin_memory=True, + persistent_workers=self.persistent_workers, + ) + + # Wrap with a BufferedIterator if needed + if self.buffer_size > 0: + itr = BufferedIterator(self.buffer_size, itr) + + # Wrap with CountingIterator + itr = CountingIterator(itr, start=offset) + + return itr + + +class FrozenBatchSampler: + def __init__( + self, + ordered_batches, + epoch, + fix_batches_to_gpus, + shuffle, + initial_offset, + ): + self.ordered_batches = ordered_batches + self.fix_batches_to_gpus = fix_batches_to_gpus + self.shuffle = shuffle + self.make_batches_for_epoch(epoch, initial_offset) + + def make_batches_for_epoch(self, epoch, offset=0): + self.batches = self.ordered_batches( + epoch, self.fix_batches_to_gpus, self.shuffle + ) + if offset > 0: + self.batches = self.batches[offset:] + + def __iter__(self) -> Iterator[List[int]]: + return iter(self.batches) + + def __len__(self) -> int: + return len(self.batches) + class EpochBatchIterator(EpochBatchIterating): """A multi-epoch iterator over a :class:`torch.utils.data.Dataset`. @@ -225,7 +294,15 @@ class EpochBatchIterator(EpochBatchIterating): queue. Helps speeding up dataloading. When buffer_size is zero, the default torch.utils.data.DataLoader preloading is used. timeout (int, optional): if positive, the timeout value for collecting a batch - from workers. Should always be non-negative. (default: ``0``) + from workers. Should always be non-negative (default: ``0``). + disable_shuffling (bool, optional): force disable shuffling + (default: ``False``). + skip_remainder_batch (bool, optional): if set, discard the last batch in an epoch + for the sake of training stability, as the last batch is usually smaller than + local_batch_size * distributed_word_size (default: ``False``). + grouped_shuffling (bool, optional): enable shuffling batches in groups + of num_shards. Ensures that each GPU receives similar length sequences when + batches are sorted by length. 
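[Editor's note, not part of the patch] A toy, self-contained illustration of what grouped_shuffling does, mirroring the shuffle_batches logic added later in this file; the batch contents and num_shards=2 are made up:

    import itertools
    import numpy as np

    batches = [["b0"], ["b1"], ["b2"], ["b3"], ["b4"], ["b5"]]   # assume sorted by length
    num_shards = 2
    groups = [
        batches[i * num_shards:(i + 1) * num_shards]
        for i in range(len(batches) // num_shards)
    ]
    np.random.shuffle(groups)                  # shuffle groups, not individual batches
    batches = list(itertools.chain(*groups))
    # b0/b1, b2/b3 and b4/b5 stay adjacent, so the two shards still receive
    # batches of similar length after shuffling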
""" def __init__( @@ -240,6 +317,11 @@ def __init__( epoch=1, buffer_size=0, timeout=0, + disable_shuffling=False, + skip_remainder_batch=False, + grouped_shuffling=False, + reuse_dataloader=False, + persistent_workers=True, ): assert isinstance(dataset, torch.utils.data.Dataset) self.dataset = dataset @@ -252,17 +334,24 @@ def __init__( self.num_shards = num_shards self.shard_id = shard_id self.num_workers = num_workers + self.persistent_workers = persistent_workers and num_workers > 0 # This upper limit here is to prevent people from abusing this feature # in a shared computing environment. self.buffer_size = min(buffer_size, 20) self.timeout = timeout + self.disable_shuffling = disable_shuffling + self.skip_remainder_batch = skip_remainder_batch + self.grouped_shuffling = grouped_shuffling self.epoch = max(epoch, 1) # we use 1-based indexing for epochs - self.shuffle = True + self.shuffle = not disable_shuffling self._cur_epoch_itr = None self._next_epoch_itr = None self._supports_prefetch = getattr(dataset, "supports_prefetch", False) + self.dataloader = None + self.reuse_dataloader = reuse_dataloader + @property def frozen_batches(self): if self._frozen_batches is None: @@ -279,7 +368,7 @@ def first_batch(self): "a larger dataset." ) - if self.dataset.supports_fetch_outside_dataloader: + if getattr(self.dataset, "supports_fetch_outside_dataloader", True): return self.collate_fn([self.dataset[i] for i in self.frozen_batches[0]]) else: return "DUMMY" @@ -301,23 +390,31 @@ def next_epoch_idx(self): else: return self.epoch - def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): + def next_epoch_itr( + self, shuffle=True, fix_batches_to_gpus=False, set_dataset_epoch=True + ): """Return a new iterator over the dataset. Args: shuffle (bool, optional): shuffle batches before returning the iterator (default: True). - fix_batches_to_gpus: ensure that batches are always + fix_batches_to_gpus (bool, optional): ensure that batches are always allocated to the same shards across epochs. Requires that :attr:`dataset` supports prefetching (default: False). + set_dataset_epoch (bool, optional): update the wrapped Dataset with + the new epoch number (default: True). 
""" + if self.disable_shuffling: + shuffle = False + prev_epoch = self.epoch self.epoch = self.next_epoch_idx - self.dataset.set_epoch(self.epoch) + if set_dataset_epoch and hasattr(self.dataset, "set_epoch"): + self.dataset.set_epoch(self.epoch) if self._next_epoch_itr is not None: self._cur_epoch_itr = self._next_epoch_itr self._next_epoch_itr = None else: - if callable(self.batch_sampler): + if callable(self.batch_sampler) and prev_epoch != self.epoch: # reset _frozen_batches to refresh the next epoch self._frozen_batches = None self._cur_epoch_itr = self._get_iterator_for_epoch( @@ -384,9 +481,69 @@ def load_state_dict(self, state_dict): def _get_iterator_for_epoch( self, epoch, shuffle, fix_batches_to_gpus=False, offset=0 ): + if self.reuse_dataloader and self.dataloader is not None: + self.epoch_batch_sampler.make_batches_for_epoch(epoch, offset) + itr = self.dataloader + else: + self.epoch_batch_sampler = FrozenBatchSampler( + self.ordered_batches, + epoch, + fix_batches_to_gpus, + shuffle, + initial_offset=offset, + ) + + if offset > 0 and len(self.epoch_batch_sampler) == 0: + return None + + if self.num_workers > 0: + os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" + + # Create data loader + itr = torch.utils.data.DataLoader( + self.dataset, + collate_fn=self.collate_fn, + batch_sampler=self.epoch_batch_sampler, + num_workers=self.num_workers, + timeout=self.timeout, + pin_memory=True, + persistent_workers=self.persistent_workers, + ) + + if self.reuse_dataloader: + self.dataloader = itr + + # Wrap with a BufferedIterator if needed + if self.buffer_size > 0: + itr = BufferedIterator(self.buffer_size, itr) + + # Wrap with CountingIterator + itr = CountingIterator(itr, start=offset) + + if self.skip_remainder_batch: + # TODO: Below is a lazy implementation which discard the final batch regardless + # of whether it is a full batch or not. 
+ + total_num_itrs = len(itr) - 1 + itr.take(total_num_itrs) + logger.info(f"skip final residual batch, total_num_itrs = {total_num_itrs}") + + return itr + + def ordered_batches(self, epoch, fix_batches_to_gpus, shuffle): def shuffle_batches(batches, seed): with data_utils.numpy_seed(seed): - np.random.shuffle(batches) + + if self.grouped_shuffling: + grouped_batches = [ + batches[(i * self.num_shards) : ((i + 1) * self.num_shards)] + for i in range((len(batches) // self.num_shards)) + ] + np.random.shuffle(grouped_batches) + batches = list(itertools.chain(*grouped_batches)) + else: + np.random.shuffle(batches) + return batches if self._supports_prefetch: @@ -410,29 +567,7 @@ def shuffle_batches(batches, seed): batches = list( ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[]) ) - - if offset > 0 and offset >= len(batches): - return None - - if self.num_workers > 0: - os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" - - # Create data loader - itr = torch.utils.data.DataLoader( - self.dataset, - collate_fn=self.collate_fn, - batch_sampler=batches[offset:], - num_workers=self.num_workers, - timeout=self.timeout, - ) - - # Wrap with a BufferedIterator if needed - if self.buffer_size > 0: - itr = BufferedIterator(self.buffer_size, itr) - - # Wrap with CoutingIterator - itr = CountingIterator(itr, start=offset) - return itr + return batches class GroupedIterator(CountingIterator): @@ -441,29 +576,47 @@ class GroupedIterator(CountingIterator): Args: iterable (iterable): iterable to wrap chunk_size (int): size of each chunk - + skip_remainder_batch (bool, optional): if set, discard the last grouped batch in + each training epoch, as the last grouped batch is usually smaller than + local_batch_size * distributed_word_size * chunk_size (default: ``False``). Attributes: n (int): number of elements consumed from this iterator """ - def __init__(self, iterable, chunk_size): - itr = _chunk_iterator(iterable, chunk_size) + def __init__(self, iterable, chunk_size, skip_remainder_batch=False): + if skip_remainder_batch: + total_num_itrs = int(math.floor(len(iterable) / float(chunk_size))) + logger.info( + f"skip final residual batch, grouped total_num_itrs = {total_num_itrs}" + ) + else: + total_num_itrs = int(math.ceil(len(iterable) / float(chunk_size))) + logger.info(f"grouped total_num_itrs = {total_num_itrs}") + + itr = _chunk_iterator(iterable, chunk_size, skip_remainder_batch) super().__init__( itr, start=int(math.ceil(getattr(iterable, "n", 0) / float(chunk_size))), - total=int(math.ceil(len(iterable) / float(chunk_size))), + total=total_num_itrs, ) self.chunk_size = chunk_size + if skip_remainder_batch: + self.take(total_num_itrs) + # TODO: [Hack] Here the grouped iterator modifies the base iterator size so that + # training can move into the next epoch once the grouped iterator is exhausted. + # Double-check this implementation in case unexpected behavior occurs. 
+ iterable.take(total_num_itrs * chunk_size) + -def _chunk_iterator(itr, chunk_size): +def _chunk_iterator(itr, chunk_size, skip_remainder_batch=False): chunk = [] for x in itr: chunk.append(x) if len(chunk) == chunk_size: yield chunk chunk = [] - if len(chunk) > 0: + if not skip_remainder_batch and len(chunk) > 0: yield chunk @@ -481,7 +634,12 @@ class ShardedIterator(CountingIterator): n (int): number of elements consumed from this iterator """ - def __init__(self, iterable, num_shards, shard_id, fill_value=None): + def __init__( + self, iterable, num_shards, shard_id, fill_value=None, skip_remainder_batch=None + ): + """ + Args: + skip_remainder_batch: ignored""" if shard_id < 0 or shard_id >= num_shards: raise ValueError("shard_id must be between 0 and num_shards") sharded_len = int(math.ceil(len(iterable) / float(num_shards))) @@ -501,15 +659,20 @@ def __init__(self, iterable, num_shards, shard_id, fill_value=None): class BackgroundConsumer(Thread): - def __init__(self, queue, source, max_len): + def __init__(self, queue, source, max_len, cuda_device): Thread.__init__(self) self._queue = queue self._source = source self._max_len = max_len self.count = 0 + self.cuda_device = cuda_device def run(self): + # set_device to avoid creation of GPU0 context when using pin_memory + if self.cuda_device is not None: + torch.cuda.set_device(self.cuda_device) + try: for item in self._source: self._queue.put(item) @@ -541,6 +704,7 @@ def _create_consumer(self): self._queue, self._iterable, self.total, + torch.cuda.current_device() if torch.cuda.is_available() else None, ) self._consumer.daemon = True self._consumer.start() @@ -553,10 +717,10 @@ def __len__(self): def take(self, n): self.total = min(self.total, n) - # Propagate this change to the underlying iterator if hasattr(self._iterable, "take"): self._iterable.take(n) + return self def __next__(self): # Create consumer if not created yet @@ -584,3 +748,132 @@ def __next__(self): if item is _sentinel: raise StopIteration() return item + + +class GroupedEpochBatchIterator(EpochBatchIterator): + """Grouped version of EpochBatchIterator + It takes several samplers from different datasets. + Each epoch shuffle the dataset wise sampler individually with different + random seed. The those sub samplers are combined with into + one big samplers with deterministic permutation to mix batches from + different datasets. 
It will act like EpochBatchIterator but make sure + 1) data from one data set each time + 2) for different workers, they use the same order to fetch the data + so they will use data from the same dataset everytime + mult_rate is used for update_freq > 1 case where we want to make sure update_freq + mini-batches come from same source + """ + + def __init__( + self, + dataset, + collate_fn, + batch_samplers, + seed=1, + num_shards=1, + shard_id=0, + num_workers=0, + epoch=0, + mult_rate=1, + buffer_size=0, + skip_remainder_batch=False, + ): + super().__init__( + dataset, + collate_fn, + batch_samplers, + seed, + num_shards, + shard_id, + num_workers, + epoch, + buffer_size, + skip_remainder_batch=skip_remainder_batch, + ) + # level 0: sub-samplers 1: batch_idx 2: batches + self._frozen_batches = tuple([tuple(sub_batch) for sub_batch in batch_samplers]) + self.step_size = mult_rate * num_shards + + self.lengths = [ + (len(x) // self.step_size) * self.step_size for x in self.frozen_batches + ] + + def __len__(self): + return sum(self.lengths) + + @property + def first_batch(self): + if len(self.frozen_batches) == 0: + raise Exception( + "The dataset is empty. This could indicate " + "that all elements in the dataset have been skipped. " + "Try increasing the max number of allowed tokens or using " + "a larger dataset." + ) + + if self.dataset.supports_fetch_outside_dataloader: + return self.collate_fn([self.dataset[i] for i in self.frozen_batches[0][0]]) + else: + return "DUMMY" + + def _get_iterator_for_epoch( + self, epoch, shuffle, fix_batches_to_gpus=False, offset=0 + ): + def shuffle_batches(batches, seed): + with data_utils.numpy_seed(seed): + np.random.shuffle(batches) + return batches + + def return_full_batches(batch_sets, seed, shuffle): + if shuffle: + batch_sets = [shuffle_batches(list(x), seed) for x in batch_sets] + + batch_sets = [ + batch_sets[i][: self.lengths[i]] for i in range(len(batch_sets)) + ] + batches = list(itertools.chain.from_iterable(batch_sets)) + + if shuffle: + with data_utils.numpy_seed(seed): + idx = np.random.permutation(len(batches) // self.step_size) + if len(idx) * self.step_size != len(batches): + raise ValueError( + "ERROR: %d %d %d %d" + % (len(idx), self.step_size, len(batches), self.shard_id), + ":".join(["%d" % x for x in self.lengths]), + ) + mini_shards = [ + batches[i * self.step_size : (i + 1) * self.step_size] + for i in idx + ] + batches = list(itertools.chain.from_iterable(mini_shards)) + + return batches + + if self._supports_prefetch: + raise NotImplementedError("To be implemented") + else: + batches = return_full_batches( + self.frozen_batches, self.seed + epoch, shuffle + ) + batches = list( + ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[]) + ) + + if offset > 0 and offset >= len(batches): + return None + + if self.num_workers > 0: + os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" + + itr = torch.utils.data.DataLoader( + self.dataset, + collate_fn=self.collate_fn, + batch_sampler=batches[offset:], + num_workers=self.num_workers, + persistent_workers=self.persistent_workers, + ) + if self.buffer_size > 0: + itr = BufferedIterator(self.buffer_size, itr) + + return CountingIterator(itr, start=offset) diff --git a/fairseq/data/language_pair_dataset.py b/fairseq/data/language_pair_dataset.py index 62e7109b33..fd356ddd04 100644 --- a/fairseq/data/language_pair_dataset.py +++ b/fairseq/data/language_pair_dataset.py @@ -160,7 +160,7 @@ def compute_alignment_weights(alignments): constraints = 
torch.zeros((len(samples), max(lens))).long() for i, sample in enumerate(samples): constraints[i, 0 : lens[i]] = samples[i].get("constraints") - batch["constraints"] = constraints + batch["constraints"] = constraints.index_select(0, sort_order) return batch @@ -289,7 +289,7 @@ def __init__( # determine bucket sizes using self.num_tokens, which will return # the padded lengths (thanks to BucketPadLengthDataset) - num_tokens = np.vectorize(self.num_tokens, otypes=[np.long]) + num_tokens = np.vectorize(self.num_tokens, otypes=[np.compat.long]) self.bucketed_num_tokens = num_tokens(np.arange(len(self.src))) self.buckets = [ (None, num_tokens) for num_tokens in np.unique(self.bucketed_num_tokens) @@ -408,6 +408,14 @@ def num_tokens(self, index): self.tgt_sizes[index] if self.tgt_sizes is not None else 0, ) + def num_tokens_vec(self, indices): + """Return the number of tokens for a set of positions defined by indices. + This value is used to enforce ``--max-tokens`` during batching.""" + sizes = self.src_sizes[indices] + if self.tgt_sizes is not None: + sizes = np.maximum(sizes, self.tgt_sizes[indices]) + return sizes + def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``.""" diff --git a/fairseq/data/lm_context_window_dataset.py b/fairseq/data/lm_context_window_dataset.py index 29ad887b7d..1a945927cf 100644 --- a/fairseq/data/lm_context_window_dataset.py +++ b/fairseq/data/lm_context_window_dataset.py @@ -5,16 +5,35 @@ import numpy as np import torch +from typing import Dict + from fairseq.data.monolingual_dataset import MonolingualDataset from . import FairseqDataset class LMContextWindowDataset(FairseqDataset): - """Wraps a MonolingualDataset and provides more context for evaluation.""" - - def __init__(self, dataset, tokens_per_sample, context_window, pad_idx): - assert isinstance(dataset, MonolingualDataset) + """ + Wraps a MonolingualDataset and provides more context for evaluation. + + Each item in the new dataset will have a maximum size of + ``tokens_per_sample + context_window``. + + Args: + dataset: dataset to wrap + tokens_per_sample (int): the max number of tokens in each dataset item + context_window (int): the number of accumulated tokens to add to each + dataset item + pad_idx (int): padding symbol + """ + + def __init__( + self, + dataset: MonolingualDataset, + tokens_per_sample: int, + context_window: int, + pad_idx: int, + ): assert context_window > 0 self.dataset = dataset self.tokens_per_sample = tokens_per_sample @@ -28,7 +47,7 @@ def __getitem__(self, index): def __len__(self): return len(self.dataset) - def collater(self, samples): + def collater(self, samples) -> Dict: sample = self.dataset.collater(samples) pad = self.pad_idx @@ -58,7 +77,6 @@ def collater(self, samples): sample["net_input"]["src_tokens"] = torch.from_numpy(new_toks) sample["target"] = torch.from_numpy(new_tgt) sample["start_indices"] = start_idxs - return sample def num_tokens(self, index): diff --git a/fairseq/data/mask_tokens_dataset.py b/fairseq/data/mask_tokens_dataset.py index 9e2c7119d8..0ca9051c9a 100644 --- a/fairseq/data/mask_tokens_dataset.py +++ b/fairseq/data/mask_tokens_dataset.py @@ -39,6 +39,10 @@ class MaskTokensDataset(BaseWrapperDataset): over vocab indices, indicating whether it is the beginning of a word. We will extend any mask to encompass the whole word. bpe: BPE to use for whole-word masking. + mask_multiple_length : repeat each mask index multiple times. Default + value is 1. 
+ mask_stdev : standard deviation of masks distribution in case of + multiple masking. Default value is 0. """ @classmethod @@ -63,11 +67,16 @@ def __init__( random_token_prob: float = 0.1, freq_weighted_replacement: bool = False, mask_whole_words: torch.Tensor = None, + mask_multiple_length: int = 1, + mask_stdev: float = 0.0, + skip_masking: bool = False, ): assert 0.0 < mask_prob < 1.0 assert 0.0 <= random_token_prob <= 1.0 assert 0.0 <= leave_unmasked_prob <= 1.0 assert random_token_prob + leave_unmasked_prob <= 1.0 + assert mask_multiple_length >= 1 + assert mask_stdev >= 0.0 self.dataset = dataset self.vocab = vocab @@ -79,6 +88,9 @@ def __init__( self.leave_unmasked_prob = leave_unmasked_prob self.random_token_prob = random_token_prob self.mask_whole_words = mask_whole_words + self.mask_multiple_length = mask_multiple_length + self.mask_stdev = mask_stdev + self.skip_masking = skip_masking if random_token_prob > 0.0: if freq_weighted_replacement: @@ -98,81 +110,117 @@ def set_epoch(self, epoch, **unused): super().set_epoch(epoch) self.epoch = epoch - @lru_cache(maxsize=8) def __getitem__(self, index: int): - with data_utils.numpy_seed(self.seed, self.epoch, index): - item = self.dataset[index] - sz = len(item) - - assert ( - self.mask_idx not in item - ), "Dataset contains mask_idx (={}), this is not expected!".format( - self.mask_idx, - ) - - if self.mask_whole_words is not None: - word_begins_mask = self.mask_whole_words.gather(0, item) - word_begins_idx = word_begins_mask.nonzero(as_tuple=False).view(-1) - sz = len(word_begins_idx) - words = np.split(word_begins_mask, word_begins_idx)[1:] - assert len(words) == sz - word_lens = list(map(len, words)) - - # decide elements to mask - mask = np.full(sz, False) - num_mask = int( - # add a random number for probabilistic rounding - self.mask_prob * sz - + np.random.rand() - ) - mask[np.random.choice(sz, num_mask, replace=False)] = True + return self.__getitem_cached__(self.seed, self.epoch, index) - if self.return_masked_tokens: - # exit early if we're just returning the masked tokens - # (i.e., the targets for masked LM training) - if self.mask_whole_words is not None: - mask = np.repeat(mask, word_lens) - new_item = np.full(len(mask), self.pad_idx) - new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) == 1] - return torch.from_numpy(new_item) - - # decide unmasking and random replacement - rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob - if rand_or_unmask_prob > 0.0: - rand_or_unmask = mask & (np.random.rand(sz) < rand_or_unmask_prob) - if self.random_token_prob == 0.0: - unmask = rand_or_unmask - rand_mask = None - elif self.leave_unmasked_prob == 0.0: - unmask = None - rand_mask = rand_or_unmask - else: - unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob - decision = np.random.rand(sz) < unmask_prob - unmask = rand_or_unmask & decision - rand_mask = rand_or_unmask & (~decision) - else: - unmask = rand_mask = None - - if unmask is not None: - mask = mask ^ unmask + @lru_cache(maxsize=8) + def __getitem_cached__(self, seed: int, epoch: int, index: int): + seed = int(hash((seed, epoch, index)) % 1e6) + rng = np.random.default_rng(seed) + item = self.dataset[index] + sz = len(item) + + assert ( + self.mask_idx not in item + ), "Dataset contains mask_idx (={}), this is not expected!".format( + self.mask_idx, + ) + if self.skip_masking: + return torch.from_numpy(np.copy(item)) + + if self.mask_whole_words is not None: + word_begins_mask = self.mask_whole_words.gather(0, item) + 
word_begins_idx = word_begins_mask.nonzero().view(-1) + sz = len(word_begins_idx) + words = np.split(word_begins_mask, word_begins_idx)[1:] + assert len(words) == sz + word_lens = list(map(len, words)) + + # decide elements to mask + mask = np.full(sz, False) + num_mask = int( + # add a random number for probabilistic rounding + self.mask_prob * sz / float(self.mask_multiple_length) + + rng.random() + ) + # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453) + mask_idc = rng.choice(sz, num_mask, replace=False) + if self.mask_stdev > 0.0: + lengths = rng.normal( + self.mask_multiple_length, self.mask_stdev, size=num_mask + ) + lengths = [max(0, int(round(x))) for x in lengths] + mask_idc = np.asarray( + [ + mask_idc[j] + offset + for j in range(len(mask_idc)) + for offset in range(lengths[j]) + ], + dtype=np.int64, + ) + else: + mask_idc = np.concatenate( + [mask_idc + i for i in range(self.mask_multiple_length)] + ) + mask_idc = mask_idc[mask_idc < len(mask)] + try: + mask[mask_idc] = True + except: # something wrong + print("Assigning mask indexes {} to mask {} failed!".format(mask_idc, mask)) + raise + + # if self.return_masked_tokens: + # print(( + # f"IDX={index}; seed={seed}; epoch={epoch}; is_tgt={self.return_masked_tokens}: " + # f"{np.nonzero(mask)[0].sum()}" + # )) + if self.return_masked_tokens: + # exit early if we're just returning the masked tokens + # (i.e., the targets for masked LM training) if self.mask_whole_words is not None: mask = np.repeat(mask, word_lens) + new_item = np.full(len(mask), self.pad_idx) + new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) == 1] + return torch.from_numpy(new_item) - new_item = np.copy(item) - new_item[mask] = self.mask_idx - if rand_mask is not None: - num_rand = rand_mask.sum() - if num_rand > 0: - if self.mask_whole_words is not None: - rand_mask = np.repeat(rand_mask, word_lens) - num_rand = rand_mask.sum() - - new_item[rand_mask] = np.random.choice( - len(self.vocab), - num_rand, - p=self.weights, - ) + # decide unmasking and random replacement + rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob + if rand_or_unmask_prob > 0.0: + rand_or_unmask = mask & (rng.random(sz) < rand_or_unmask_prob) + if self.random_token_prob == 0.0: + unmask = rand_or_unmask + rand_mask = None + elif self.leave_unmasked_prob == 0.0: + unmask = None + rand_mask = rand_or_unmask + else: + unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob + decision = rng.random(sz) < unmask_prob + unmask = rand_or_unmask & decision + rand_mask = rand_or_unmask & (~decision) + else: + unmask = rand_mask = None + + if unmask is not None: + mask = mask ^ unmask + + if self.mask_whole_words is not None: + mask = np.repeat(mask, word_lens) + + new_item = np.copy(item) + new_item[mask] = self.mask_idx + if rand_mask is not None: + num_rand = rand_mask.sum() + if num_rand > 0: + if self.mask_whole_words is not None: + rand_mask = np.repeat(rand_mask, word_lens) + num_rand = rand_mask.sum() - return torch.from_numpy(new_item) + new_item[rand_mask] = rng.choice( + len(self.vocab), + num_rand, + p=self.weights, + ) + + return torch.from_numpy(new_item) diff --git a/fairseq/data/monolingual_dataset.py b/fairseq/data/monolingual_dataset.py index ec73f1fda8..54fd583b64 100644 --- a/fairseq/data/monolingual_dataset.py +++ b/fairseq/data/monolingual_dataset.py @@ -9,7 +9,7 @@ from . 
import FairseqDataset, data_utils -def collate(samples, pad_idx, eos_idx): +def collate(samples, pad_idx, eos_idx, fixed_pad_length=None, pad_to_bsz=None): if len(samples) == 0: return {} @@ -23,6 +23,8 @@ def merge(key, is_list=False): pad_idx, eos_idx, left_pad=False, + pad_to_length=fixed_pad_length, + pad_to_bsz=pad_to_bsz, ) ) return res @@ -32,6 +34,8 @@ def merge(key, is_list=False): pad_idx, eos_idx, left_pad=False, + pad_to_length=fixed_pad_length, + pad_to_bsz=pad_to_bsz, ) src_tokens = merge("source") @@ -70,19 +74,27 @@ def __init__( dataset, sizes, src_vocab, - tgt_vocab, - add_eos_for_other_targets, - shuffle, + tgt_vocab=None, + add_eos_for_other_targets=False, + shuffle=False, targets=None, add_bos_token=False, + fixed_pad_length=None, + pad_to_bsz=None, + src_lang_idx=None, + tgt_lang_idx=None, ): self.dataset = dataset self.sizes = np.array(sizes) self.vocab = src_vocab - self.tgt_vocab = tgt_vocab + self.tgt_vocab = tgt_vocab or src_vocab self.add_eos_for_other_targets = add_eos_for_other_targets self.shuffle = shuffle self.add_bos_token = add_bos_token + self.fixed_pad_length = fixed_pad_length + self.pad_to_bsz = pad_to_bsz + self.src_lang_idx = src_lang_idx + self.tgt_lang_idx = tgt_lang_idx assert targets is None or all( t in {"self", "future", "past"} for t in targets @@ -165,6 +177,11 @@ def _maybe_add_bos(self, source, target): target = torch.cat([target.new([self.tgt_vocab.bos()]), target]) return source, target + def num_tokens_vec(self, indices): + """Return the number of tokens for a set of positions defined by indices. + This value is used to enforce ``--max-tokens`` during batching.""" + return self.sizes[indices] + def _filter_vocab(self, target): if len(self.tgt_vocab) != len(self.vocab): @@ -200,7 +217,13 @@ def collater(self, samples): target sentence of shape `(bsz, tgt_len)`. Padding will appear on the right. """ - return collate(samples, self.vocab.pad(), self.vocab.eos()) + return collate( + samples, + self.vocab.pad(), + self.vocab.eos(), + self.fixed_pad_length, + self.pad_to_bsz, + ) def num_tokens(self, index): """Return the number of tokens in a sample. This value is used to diff --git a/fairseq/data/multi_corpus_dataset.py b/fairseq/data/multi_corpus_dataset.py index d2457666d6..6f2fe074b2 100644 --- a/fairseq/data/multi_corpus_dataset.py +++ b/fairseq/data/multi_corpus_dataset.py @@ -3,28 +3,32 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import asyncio import logging +import time from collections import OrderedDict -from typing import Dict, List +from typing import Dict, List, Optional import numpy as np + from fairseq.data import data_utils from . import FairseqDataset - logger = logging.getLogger(__name__) class MultiCorpusDataset(FairseqDataset): """ - Stores multiple instances of FairseqDataset together. Requires each instance + Stores multiple instances of FairseqDataset together. + Unless batch_sample=True, requires each instance to be the same dataset, as the collate method needs to work on batches with samples from each dataset. Allows specifying a distribution over the datasets to use. Note that unlike MultiCorpusSampledDataset, this distribution allows sampling for each item, - rather than on a batch level. + rather than on a batch level. Note that datasets with sampling probabilty + of 0 will be skipped. Each time ordered_indices() is called, a new sample is generated with the specified distribution. 
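[Editor's note, not part of the patch] A construction sketch for the MultiCorpusDataset behaviour described above; news_ds and web_ds are placeholders for any two FairseqDataset instances of the same type, and the numbers are made up:

    from collections import OrderedDict

    dataset = MultiCorpusDataset(
        datasets=OrderedDict([("news", news_ds), ("web", web_ds)]),
        distribution=[0.75, 0.25],   # must sum to 1; a 0 entry skips that corpus
        seed=42,
        sort_indices=True,
        batch_sample=True,           # keep every batch within a single corpus
    )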
@@ -35,6 +39,7 @@ class MultiCorpusDataset(FairseqDataset): corresponding dataset seed: random seed for sampling the datsets sort_indices: if true, will sort the ordered indices by size + batch_sample: if true, will ensure each batch is from a single dataset """ def __init__( @@ -43,65 +48,99 @@ def __init__( distribution: List[float], seed: int, sort_indices: bool = False, + batch_sample: bool = False, + distributed_rank: Optional[int] = None, ): super().__init__() assert isinstance(datasets, OrderedDict) assert len(datasets) == len(distribution) + assert sum(distribution) == 1 self.datasets = datasets self.distribution = distribution self.seed = seed self.sort_indices = sort_indices + self.batch_sample = batch_sample + self.distributed_rank = distributed_rank # Avoid repeated conversions to list later self.dataset_list = list(datasets.values()) self.total_num_instances = 0 - first_dataset = list(self.datasets.values())[0] + first_dataset = self.dataset_list[0] + self.num_instances_per_dataset = [] self.dataset_offsets = [] - for dataset in datasets.values(): + for i, dataset in enumerate(self.dataset_list): assert isinstance(dataset, FairseqDataset) assert type(dataset) is type(first_dataset) + self.num_instances_per_dataset.append( + 0 if self.distribution[i] == 0 else len(dataset) + ) self.dataset_offsets.append(self.total_num_instances) - self.total_num_instances += len(dataset) + self.total_num_instances += self.num_instances_per_dataset[i] def ordered_indices(self): + start = time.time() with data_utils.numpy_seed(self.seed, self.epoch): - # Used to store the order of indices of each dataset to use - indices = [ - np.random.permutation(len(dataset)) - for dataset in self.datasets.values() - ] - # Keep track of which samples we've used for each dataset - counters = [0 for _ in self.datasets] - - sampled_indices = [ - self._sample(indices, counters) for _ in range(self.total_num_instances) - ] + logger.info( + f"sampling new dataset with seed {self.seed} epoch {self.epoch}" + ) + sampled_indices = [] + num_selected_instances = 0 + + # For each dataset i, sample self.distribution[i] * self.total_num_instances + for i, key in enumerate(self.datasets): + if self.distribution[i] == 0: + # skip dataset if sampling probability is 0 + continue + + if i < len(self.datasets) - 1: + num_instances = int(self.distribution[i] * self.total_num_instances) + high = self.dataset_offsets[i + 1] + else: + num_instances = self.total_num_instances - num_selected_instances + high = self.total_num_instances + + logger.info(f"sampling {num_instances} from {key} dataset") + num_selected_instances += num_instances + + # First, add k copies of the dataset where k = num_instances // len(dataset). + # This ensures an equal distribution of the data points as much as possible. 
+ # For the remaining entries randomly sample them + dataset_size = len(self.datasets[key]) + num_copies = num_instances // dataset_size + dataset_indices = ( + np.random.permutation(high - self.dataset_offsets[i]) + + self.dataset_offsets[i] + )[: num_instances - num_copies * dataset_size] + if num_copies > 0: + sampled_indices += list( + np.concatenate( + ( + np.repeat( + np.arange(self.dataset_offsets[i], high), num_copies + ), + dataset_indices, + ) + ) + ) + else: + sampled_indices += list(dataset_indices) + + assert ( + len(sampled_indices) == self.total_num_instances + ), f"{len(sampled_indices)} vs {self.total_num_instances}" + + np.random.shuffle(sampled_indices) if self.sort_indices: sampled_indices.sort(key=lambda i: self.num_tokens(i)) - return np.array(sampled_indices, dtype=np.int64) - - def _sample(self, indices, counters): - # First pick dataset - dataset_idx = np.random.choice(len(self.distribution), p=self.distribution) - - # Then get dataset internal index - idx = indices[dataset_idx][counters[dataset_idx]] - - # Convert to multi-datasets index - idx += self.dataset_offsets[dataset_idx] - counters[dataset_idx] += 1 - - # Reset if we reach end - if counters[dataset_idx] == len(self.dataset_list[dataset_idx]): - counters[dataset_idx] = 0 - indices[dataset_idx] = np.random.permutation( - len(self.dataset_list[dataset_idx]) + logger.info( + "multi_corpus_dataset ordered_indices took {}s".format( + time.time() - start + ) ) - - return idx + return np.array(sampled_indices, dtype=np.int64) def _map_index(self, index: int): """ @@ -110,10 +149,10 @@ def _map_index(self, index: int): maps to index 1 of B. """ counter = 0 - for key, dataset in self.datasets.items(): - if index < counter + len(dataset): + for num_instances, key in zip(self.num_instances_per_dataset, self.datasets): + if index < counter + num_instances: return index - counter, key - counter += len(dataset) + counter += num_instances raise ValueError( "Invalid index: {}, max: {}".format(index, self.total_num_instances) ) @@ -124,19 +163,62 @@ def __len__(self): """ return self.total_num_instances + async def getitem(self, index): + new_index, key = self._map_index(index) + try: + if hasattr(self.datasets[key], "getitem"): + item = await self.datasets[key].getitem(new_index) + else: + item = self.datasets[key][new_index] + item["full_id"] = index + return item + except Exception as e: + e.args = (f"Error from {key} dataset", *e.args) + raise + def __getitem__(self, index): - index, key = self._map_index(index) - return self.datasets[key][index] + return asyncio.run(self.getitem(index)) + + async def getitems(self, indices): + # initialize a bunch of everstore read operations + # wait in the end to reduce overhead + # very helpful if io is latency bounded + + max_concurrency = 32 + sem = asyncio.Semaphore(max_concurrency) + + async def controlled_getitem(index): + async with sem: + return await self.getitem(index) + + coroutines = [] + for index in indices: + coroutines.append(controlled_getitem(index)) + results = await asyncio.gather(*coroutines) + return results + + def __getitems__(self, indices): + return asyncio.run(self.getitems(indices)) def collater(self, samples): """ - Since we enforce all datsets to be the same, collating is just - picking the first one and doing collate. + If we are doing batch sampling, then pick the right collater to use. + + Otherwise we assume all collaters are the same. 
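[Editor's note, not part of the patch] The semaphore-gated asyncio.gather used in getitems above is a general pattern for capping the concurrency of latency-bound reads; a self-contained sketch (nothing here is fairseq-specific, fetch is a stand-in):

    import asyncio

    async def fetch(i):
        await asyncio.sleep(0)            # stand-in for a slow IO read
        return i * i

    async def fetch_many(indices, max_concurrency=32):
        sem = asyncio.Semaphore(max_concurrency)

        async def bounded(i):
            async with sem:
                return await fetch(i)

        return await asyncio.gather(*(bounded(i) for i in indices))

    print(asyncio.run(fetch_many(range(8))))   # [0, 1, 4, 9, 16, 25, 36, 49]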
""" if len(samples) == 0: return None - - return list(self.datasets.values())[0].collater(samples) + if "full_id" in samples[0]: + _, key = self._map_index(samples[0]["full_id"]) + try: + batch = self.datasets[key].collater(samples) + except Exception: + print(f"Collating failed for key {key}", flush=True) + raise + return batch + else: + # Subclasses may override __getitem__ to not specify full_id + return list(self.datasets.values())[0].collater(samples) def num_tokens(self, index: int): index, key = self._map_index(index) @@ -152,8 +234,52 @@ def can_reuse_epoch_itr_across_epochs(self): def set_epoch(self, epoch, **unused): super().set_epoch(epoch) + logger.info(f"setting epoch of multi_corpus_dataset to {epoch}") self.epoch = epoch @property def supports_prefetch(self): return False + + @property + def supports_fetch_outside_dataloader(self): + return all( + self.datasets[key].supports_fetch_outside_dataloader + for key in self.datasets + ) + + def batch_by_size( + self, + indices, + max_tokens=None, + max_sentences=None, + required_batch_size_multiple=1, + ): + if not self.batch_sample: + return super().batch_by_size( + indices, max_tokens, max_sentences, required_batch_size_multiple + ) + + dataset_indices = {key: [] for key in self.datasets} + for i in indices: + _, key = self._map_index(i) + dataset_indices[key].append(i) + + batches = [] + for key in dataset_indices: + cur_batches = super().batch_by_size( + np.array(dataset_indices[key], dtype=np.int64), + max_tokens, + max_sentences, + required_batch_size_multiple, + ) + logger.info(f"Created {len(cur_batches)} batches for dataset {key}") + batches += cur_batches + + # If this dataset is used in a distributed training setup, + # then shuffle such that the order is seeded by the distributed rank + # as well + if self.distributed_rank is not None: + with data_utils.numpy_seed(self.seed, self.epoch, self.distributed_rank): + np.random.shuffle(batches) + return batches diff --git a/fairseq/data/multi_corpus_sampled_dataset.py b/fairseq/data/multi_corpus_sampled_dataset.py index ad8e951cc9..e2e9fdf004 100644 --- a/fairseq/data/multi_corpus_sampled_dataset.py +++ b/fairseq/data/multi_corpus_sampled_dataset.py @@ -143,3 +143,10 @@ def prefetch(self, indices): dataset.prefetch( [self._map_index_to_dataset(key, index) for index in indices] ) + + @property + def supports_fetch_outside_dataloader(self): + return all( + self.datasets[key].supports_fetch_outside_dataloader + for key in self.datasets + ) diff --git a/fairseq/data/multilingual/multilingual_data_manager.py b/fairseq/data/multilingual/multilingual_data_manager.py index 8c14f4e3ad..876dfcec36 100644 --- a/fairseq/data/multilingual/multilingual_data_manager.py +++ b/fairseq/data/multilingual/multilingual_data_manager.py @@ -9,6 +9,7 @@ import math import os from collections import OrderedDict, defaultdict +from argparse import ArgumentError from fairseq import utils from fairseq.data import ( @@ -39,6 +40,9 @@ logger = logging.getLogger(__name__) +SRC_DICT_NAME = "src" +TGT_DICT_NAME = "tgt" + def _lang_id(dic: Dictionary, lang: str): """Return language ID index.""" @@ -59,6 +63,17 @@ def __init__(self, args, lang_pairs, langs, dicts, sampling_method): self.args = args self.seed = args.seed self.lang_pairs = lang_pairs + self.extra_lang_pairs = ( + list({p for _, v in args.extra_lang_pairs.items() for p in v.split(",")}) + if args.extra_lang_pairs + else [] + ) + self.src_langs = { + p.split("-")[0] for p in args.lang_pairs + self.extra_lang_pairs + } + self.tgt_langs = { + 
p.split("-")[1] for p in args.lang_pairs + self.extra_lang_pairs + } self.langs = langs self.dicts = dicts self.lang_dict = self.create_lang_dictionary(self.langs) @@ -98,6 +113,18 @@ def add_args(parser): "note that the ordering determines language token IDs; " "--langs and --lang-dict are two exclusive options", ) + parser.add_argument( + "--source-dict", + default=None, + type=str, + help="path to source dictionary; if specified it will override per language dictionary loading", + ) + parser.add_argument( + "--target-dict", + default=None, + type=str, + help="path to target dictionary; if specified it will override per language dictionary loading", + ) parser.add_argument( "--lang-tok-style", default=LangTokStyle.multilingual.value, @@ -125,20 +152,24 @@ def add_args(parser): metavar="BOOL", help="pad the target on the left", ) - parser.add_argument( - "--max-source-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the source sequence", - ) - parser.add_argument( - "--max-target-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the target sequence", - ) + try: + parser.add_argument( + "--max-source-positions", + default=1024, + type=int, + metavar="N", + help="max number of tokens in the source sequence", + ) + parser.add_argument( + "--max-target-positions", + default=1024, + type=int, + metavar="N", + help="max number of tokens in the target sequence", + ) + except ArgumentError: + # this might have already been defined. Once we transition this to hydra it should be fine to add it here. + pass parser.add_argument( "--upsample-primary", default=1, @@ -236,7 +267,7 @@ def add_args(parser): ) parser.add_argument( "--virtual-epoch-size", - default=1000000, + default=None, type=int, help="virtual epoch size to speed up data loading", ) @@ -346,7 +377,30 @@ def check_langs(langs, pairs): ), ) - # load dictionaries + def load_dictionary_and_postproc(path): + d = load_dictionary(path) + augment_dictionary( + dictionary=d, + language_list=language_list, + lang_tok_style=args.lang_tok_style, + langtoks_specs=args.langtoks_specs, + extra_data=args.extra_data, + ) + return d + + dicts = cls.load_all_dictionaries( + args, language_list, load_dictionary_and_postproc, training + ) + return language_list, dicts, training + + @classmethod + def load_all_dictionaries(cls, args, language_list, load_dictionary, training): + dicts = OrderedDict() + if args.source_dict is not None: + dicts[SRC_DICT_NAME] = load_dictionary(args.source_dict) + if args.target_dict is not None: + dicts[TGT_DICT_NAME] = load_dictionary(args.target_dict) + if training: extra_lang_pairs = ( list( @@ -355,35 +409,55 @@ def check_langs(langs, pairs): if args.extra_lang_pairs else [] ) - langs_to_load_dicts = sorted( - {x for p in args.lang_pairs + extra_lang_pairs for x in p.split("-")} + src_langs_to_load_dicts = sorted( + {p.split("-")[0] for p in (args.lang_pairs + extra_lang_pairs)} + ) + tgt_langs_to_load_dicts = sorted( + {p.split("-")[1] for p in (args.lang_pairs + extra_lang_pairs)} ) else: - langs_to_load_dicts = sorted([args.source_lang, args.target_lang]) + src_langs_to_load_dicts = [args.source_lang] + tgt_langs_to_load_dicts = [args.target_lang] - dicts = OrderedDict() paths = utils.split_paths(args.data) assert len(paths) > 0 - for lang in langs_to_load_dicts: - if args.fixed_dictionary is not None: - dicts[lang] = load_dictionary(args.fixed_dictionary) - else: + + def load_dicts(langs_to_load_dicts): + for lang in langs_to_load_dicts: 
dicts[lang] = load_dictionary( os.path.join(paths[0], "dict.{}.txt".format(lang)) ) - augment_dictionary( - dictionary=dicts[lang], - language_list=language_list, - lang_tok_style=args.lang_tok_style, - langtoks_specs=args.langtoks_specs, - extra_data=args.extra_data, - ) if len(dicts) > 0: - assert dicts[lang].pad() == dicts[langs_to_load_dicts[0]].pad() - assert dicts[lang].eos() == dicts[langs_to_load_dicts[0]].eos() - assert dicts[lang].unk() == dicts[langs_to_load_dicts[0]].unk() + dict0 = next(iter(dicts.values())) + assert dicts[lang].pad() == dict0.pad() + assert dicts[lang].eos() == dict0.eos() + assert dicts[lang].unk() == dict0.unk() logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang]))) - return language_list, dicts, training + + if args.fixed_dictionary is not None: + fixed_dict = load_dictionary(args.fixed_dictionary) + dicts = { + lang: fixed_dict + for lang in src_langs_to_load_dicts + tgt_langs_to_load_dicts + } + else: + if args.source_dict is None: + load_dicts(src_langs_to_load_dicts) + if args.target_dict is None: + load_dicts(tgt_langs_to_load_dicts) + return dicts + + def get_source_dictionary(self, lang): + if self.args.source_dict is not None: + return self.dicts[SRC_DICT_NAME] + else: + return self.dicts[lang] + + def get_target_dictionary(self, lang): + if self.args.target_dict is not None: + return self.dicts[TGT_DICT_NAME] + else: + return self.dicts[lang] @classmethod def create_lang_dictionary(cls, langs): @@ -418,7 +492,10 @@ def get_encoder_langtok(self, src_lang, tgt_lang, spec=None): lang=tgt_lang, lang_tok_style=self.args.lang_tok_style, spec=spec ) return self.get_langtok_index( - langtok, self.dicts[src_lang if src_lang else tgt_lang] + langtok, + self.get_source_dictionary(src_lang) + if src_lang + else self.get_target_dictionary(tgt_lang), ) def get_decoder_langtok(self, tgt_lang, spec=None): @@ -427,7 +504,7 @@ def get_decoder_langtok(self, tgt_lang, spec=None): langtok = get_lang_tok( lang=tgt_lang, lang_tok_style=self.args.lang_tok_style, spec=spec ) - return self.get_langtok_index(langtok, self.dicts[tgt_lang]) + return self.get_langtok_index(langtok, self.get_target_dictionary(tgt_lang)) @classmethod def load_data(cls, path, vdict, impl): @@ -760,9 +837,11 @@ def load_a_dataset( if self.args.lang_tok_replacing_bos_eos: ds = self.alter_dataset_langtok( langpair_ds, - src_eos=self.dicts[src if src else tgt].eos(), + src_eos=self.get_source_dictionary(src).eos() + if src + else self.get_target_dictionary(tgt).eos(), src_lang=src, - tgt_eos=self.dicts[tgt].eos(), + tgt_eos=self.get_target_dictionary(tgt).eos(), tgt_lang=tgt, src_langtok_spec=src_langtok_spec, tgt_langtok_spec=tgt_langtok_spec, @@ -840,7 +919,7 @@ def get_split_num_data_shards(self, split): # monolingual data requires tgt only assert src is None or src == tgt, ( f"error: src={src}, " - "tgt={tgt} for data_category={data_category}" + f"tgt={tgt} for data_category={data_category}" ) num_shards_dict[key] = shards_dict[tgt] else: @@ -893,7 +972,7 @@ def get_split_data_param_list(self, split, epoch, shard_epoch=None): lang_dirs = [x if len(x) > 1 else (x[0], x[0]) for x in lang_dirs] for src, tgt in lang_dirs: assert src is not None or data_category == "mono_dae", ( - f"error: src={src}, " "tgt={tgt} for data_category={data_category}" + f"error: src={src}, " f"tgt={tgt} for data_category={data_category}" ) # logger.info(f"preparing param for {data_category}: {src} - {tgt}") key = self.get_dataset_key(data_category, src, tgt) @@ -906,11 +985,11 @@ def 
get_split_data_param_list(self, split, epoch, shard_epoch=None): "data_path": data_path, "split": split, "src": src, - "src_dict": self.dicts[src] + "src_dict": self.get_source_dictionary(src) if src and data_category != "mono_dae" else None, "tgt": tgt, - "tgt_dict": self.dicts[tgt], + "tgt_dict": self.get_target_dictionary(tgt), "data_category": data_category, "langtok_spec": lang_tok_spec, } @@ -1040,3 +1119,38 @@ def load_sampled_multi_epoch_dataset( ) else: return self.load_into_concat_dataset(split, datasets, data_param_list) + + def load_sampled_multi_dataset( + self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs + ): + datasets, data_param_list = self.load_split_datasets( + split, training, epoch, combine, shard_epoch=shard_epoch, **kwargs + ) + if training and split == getattr(self.args, "train_subset", None): + sample_ratios = self.get_sampling_ratios(data_param_list, datasets, epoch) + return SampledMultiDataset( + OrderedDict(datasets), + epoch=epoch, + # valid and test datasets will be degerate to concating datasets: + sampling_ratios=sample_ratios, + eval_key=None, + collate_format=CollateFormat.single, + virtual_size=self.args.virtual_data_size, + split=split, + # if not using lang_tok altering, simplified to use the same collater + shared_collater=self._shared_collater(), + ) + else: + return self.load_into_concat_dataset(split, datasets, data_param_list) + + def load_dataset( + self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs + ): + if self.args.virtual_epoch_size is None: + return self.load_sampled_multi_dataset( + split, training, epoch, combine, shard_epoch, **kwargs + ) + else: + return self.load_sampled_multi_epoch_dataset( + split, training, epoch, combine, shard_epoch, **kwargs + ) diff --git a/fairseq/data/multilingual/sampled_multi_dataset.py b/fairseq/data/multilingual/sampled_multi_dataset.py index 3f544b099f..ece9a9721e 100644 --- a/fairseq/data/multilingual/sampled_multi_dataset.py +++ b/fairseq/data/multilingual/sampled_multi_dataset.py @@ -14,8 +14,9 @@ import numpy as np import torch -from fairseq import distributed_utils + from fairseq.data import FairseqDataset, data_utils +from fairseq.distributed import utils as distributed_utils def get_time_gap(s, e): @@ -160,9 +161,13 @@ def _sync_sample_ratios(self, ratios): ratios = torch.DoubleTensor(ratios) if torch.distributed.is_initialized(): if torch.cuda.is_available(): - distributed_utils.all_reduce(ratios.cuda()) + distributed_utils.all_reduce( + ratios.cuda(), group=distributed_utils.get_data_parallel_group() + ) else: - distributed_utils.all_reduce(ratios) + distributed_utils.all_reduce( + ratios, group=distributed_utils.get_data_parallel_group() + ) ret = ratios.cpu() ret = ret.numpy() return ret @@ -234,6 +239,11 @@ def __getitem__(self, index): def num_tokens(self, index): return self.sizes[index].max() + def num_tokens_vec(self, indices): + sizes_vec = self.sizes[np.array(indices)] + # max across all dimensions but first one + return np.amax(sizes_vec, axis=tuple(range(1, len(sizes_vec.shape)))) + def size(self, index): return self.sizes[index] @@ -397,8 +407,8 @@ def _establish_virtual_datasets(self): ).hexdigest(), 16, ) - % (2 ** 32), - self.seed % (2 ** 32), # global seed + % (2**32), + self.seed % (2**32), # global seed self._cur_epoch, # epoch index, ] ) diff --git a/fairseq/data/multilingual/sampled_multi_epoch_dataset.py b/fairseq/data/multilingual/sampled_multi_epoch_dataset.py index 17387b2f85..bb187a8dc2 100644 --- 
a/fairseq/data/multilingual/sampled_multi_epoch_dataset.py +++ b/fairseq/data/multilingual/sampled_multi_epoch_dataset.py @@ -8,11 +8,11 @@ import math import numpy as np + from fairseq.data import SampledMultiDataset from .sampled_multi_dataset import CollateFormat, default_virtual_size_func - logger = logging.getLogger(__name__) @@ -155,8 +155,8 @@ def _next_global_indices(self, epoch): ).hexdigest(), 16, ) - % (2 ** 32), - self.seed % (2 ** 32), # global seed + % (2**32), + self.seed % (2**32), # global seed epoch, # epoch index, ] ) diff --git a/fairseq/data/noising.py b/fairseq/data/noising.py index 9643d1aa6a..e92e83c2cd 100644 --- a/fairseq/data/noising.py +++ b/fairseq/data/noising.py @@ -296,6 +296,7 @@ def __init__( **kwargs, ) ) + self.sizes = src_dataset.sizes def __getitem__(self, index): """ diff --git a/fairseq/data/pad_dataset.py b/fairseq/data/pad_dataset.py index 8075bba6a9..b512d370f9 100644 --- a/fairseq/data/pad_dataset.py +++ b/fairseq/data/pad_dataset.py @@ -9,13 +9,16 @@ class PadDataset(BaseWrapperDataset): - def __init__(self, dataset, pad_idx, left_pad): + def __init__(self, dataset, pad_idx, left_pad, pad_length=None): super().__init__(dataset) self.pad_idx = pad_idx self.left_pad = left_pad + self.pad_length = pad_length def collater(self, samples): - return data_utils.collate_tokens(samples, self.pad_idx, left_pad=self.left_pad) + return data_utils.collate_tokens( + samples, self.pad_idx, left_pad=self.left_pad, pad_to_length=self.pad_length + ) class LeftPadDataset(PadDataset): diff --git a/fairseq/data/padding_mask_dataset.py b/fairseq/data/padding_mask_dataset.py new file mode 100644 index 0000000000..d7f7b88dbb --- /dev/null +++ b/fairseq/data/padding_mask_dataset.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from fairseq.data import data_utils +from . import BaseWrapperDataset + + +class PaddingMaskDataset(BaseWrapperDataset): + def __init__(self, dataset, left_pad, pad_length=None): + super().__init__(dataset) + self.left_pad = left_pad + self.pad_length = pad_length + + def __getitem__(self, index): + item = self.dataset[index] + return torch.zeros_like(item).bool() + + def __len__(self): + return len(self.dataset) + + def collater(self, samples): + return data_utils.collate_tokens( + samples, True, left_pad=self.left_pad, pad_to_length=self.pad_length + ) + + +class LeftPaddingMaskDataset(PaddingMaskDataset): + def __init__(self, dataset): + super().__init__(dataset, left_pad=True) + + +class RightPaddingMaskDataset(PaddingMaskDataset): + def __init__(self, dataset): + super().__init__(dataset, left_pad=False) diff --git a/fairseq/data/plasma_utils.py b/fairseq/data/plasma_utils.py index 2b12646783..459fb8acd7 100644 --- a/fairseq/data/plasma_utils.py +++ b/fairseq/data/plasma_utils.py @@ -3,11 +3,23 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + +import hashlib +import json import subprocess import tempfile +from typing import Hashable + +try: + import pyarrow.plasma as plasma + + PYARROW_AVAILABLE = True +except ImportError: + plasma = None + PYARROW_AVAILABLE = False -class PlasmaArray(object): +class PlasmaArray: """ Wrapper around numpy arrays that automatically moves the data to shared memory upon serialization. 
This is particularly helpful when passing numpy @@ -31,12 +43,7 @@ def __init__(self, array): @property def plasma(self): if self._plasma is None and not self.disable: - try: - import pyarrow.plasma as plasma - - self._plasma = plasma - except ImportError: - self._plasma = None + self._plasma = plasma return self._plasma def start_server(self): @@ -47,23 +54,18 @@ def start_server(self): self._server_tmp = tempfile.NamedTemporaryFile() self.path = self._server_tmp.name self._server = subprocess.Popen( - [ - "plasma_store", - "-m", - str(int(1.05 * self.array.nbytes)), - "-s", - self.path, - ] + ["plasma_store", "-m", str(int(1.05 * self.array.nbytes)), "-s", self.path] ) @property def client(self): if self._client is None: assert self.path is not None - self._client = self.plasma.connect(self.path) + self._client = self.plasma.connect(self.path, num_retries=200) return self._client def __getstate__(self): + """Called on pickle save""" if self.plasma is None: return self.__dict__ if self.object_id is None: @@ -78,6 +80,7 @@ def __getstate__(self): return state def __setstate__(self, state): + """Called on pickle load""" self.__dict__.update(state) if self.plasma is None: return @@ -89,3 +92,106 @@ def __del__(self): self._server = None self._server_tmp.close() self._server_tmp = None + + +DEFAULT_PLASMA_PATH = "/tmp/plasma" + + +class PlasmaView: + """Interface to write and read from shared memory. Whereas PlasmaArray writes to plasma on serialization, + PlasmaView writes to shared memory on instantiation.""" + + def __init__(self, array, split_path: str, hash_data: Hashable, plasma_path=None): + """ + Args: + array: numpy array to store. This can be read with ``PlasmaView().array`` + split_path: the path whence the data was read, used for hashing + hash_data: other metadata about the array that can be used to create a unique key. + as of writing, the 3 callers in ``TokenBlockDataset`` use:: + + hash_data = ((block_size, document_sep_len, str(break_mode), len(dataset)), 0|1|2) + + + """ + assert PYARROW_AVAILABLE + assert split_path is not None + if plasma_path is None: + plasma_path = DEFAULT_PLASMA_PATH + + self.path = plasma_path + self.split_path = split_path + self._client = None # Initialize lazily for pickle. plasma clients should not be deep copied or serialized. 
+ self._n = None + + self.object_id = self.get_object_id(self.split_path, hash_data) + try: + self.client.put(array, object_id=self.object_id) + except plasma.PlasmaObjectExists: + pass + + @property + def client(self): + if self._client is None: + self._client = plasma.connect(self.path, num_retries=200) + return self._client + + @property + def array(self): + """Fetch a read only view of an np.array, stored in plasma.""" + ret = self.client.get(self.object_id) + return ret + + @staticmethod + def get_object_id(split_path: str, hash_data: Hashable): + """Returns plasma.ObjectID from hashing split_path and object_num.""" + hash = hashlib.blake2b(bytes(split_path, "utf-8"), digest_size=20) + harg = json.dumps(hash_data).encode("utf-8") + hash.update(harg) + return plasma.ObjectID(hash.digest()) + + def __getstate__(self): + """Called on pickle save""" + self.disconnect() + state = self.__dict__.copy() + assert state["_client"] is None + assert "object_id" in state + return state + + def __setstate__(self, state): + """Called on pickle load""" + self.__dict__.update(state) + + def __del__(self): + self.disconnect() + + def disconnect(self): + if self._client is not None: + self._client.disconnect() + self._client = None + + def __len__(self): + """Save reads by caching len""" + if self._n is None: + self._n = len(self.array) + return self._n + + +GB100 = (1024**3) * 100 + + +class PlasmaStore: + def __init__(self, path=DEFAULT_PLASMA_PATH, nbytes: int = GB100): + + self.server = self.start(path, nbytes) + + def __del__(self): + self.server.kill() + + @staticmethod + def start(path=DEFAULT_PLASMA_PATH, nbytes: int = GB100) -> subprocess.Popen: + if not PYARROW_AVAILABLE: + raise ImportError("please run pip install pyarrow to use --use_plasma_view") + # best practice is to allocate more space than we need. The limitation seems to be the size of /dev/shm + _server = subprocess.Popen(["plasma_store", "-m", str(nbytes), "-s", path]) + plasma.connect(path, num_retries=200) # If we can't connect we fail immediately + return _server diff --git a/fairseq/data/resampling_dataset.py b/fairseq/data/resampling_dataset.py index 3d3b993164..2d77ed79d7 100644 --- a/fairseq/data/resampling_dataset.py +++ b/fairseq/data/resampling_dataset.py @@ -6,8 +6,8 @@ import logging import numpy as np -from fairseq.data import BaseWrapperDataset, plasma_utils +from fairseq.data import BaseWrapperDataset, plasma_utils logger = logging.getLogger(__name__) @@ -125,7 +125,7 @@ def set_epoch(self, epoch): rng = np.random.RandomState( [ 42, # magic number - self.seed % (2 ** 32), # global seed + self.seed % (2**32), # global seed self._cur_epoch, # epoch index ] ) diff --git a/fairseq/data/round_robin_zip_datasets.py b/fairseq/data/round_robin_zip_datasets.py index 690823fc86..2cb7447ea9 100644 --- a/fairseq/data/round_robin_zip_datasets.py +++ b/fairseq/data/round_robin_zip_datasets.py @@ -3,11 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import logging from collections import OrderedDict +from typing import Dict, Sequence import numpy as np -from . import FairseqDataset +from . 
import FairseqDataset, LanguagePairDataset + +logger = logging.getLogger(__name__) class RoundRobinZipDatasets(FairseqDataset): @@ -25,25 +29,26 @@ class RoundRobinZipDatasets(FairseqDataset): def __init__(self, datasets, eval_key=None): super().__init__() + if isinstance(datasets, dict): + datasets = OrderedDict(datasets) assert isinstance(datasets, OrderedDict) + assert datasets, "Can't make a RoundRobinZipDatasets out of nothing" + for dataset in datasets.values(): + assert isinstance(dataset, FairseqDataset) + self.datasets = datasets self.eval_key = eval_key - self.longest_dataset = None - self.longest_dataset_key = None - for key, dataset in datasets.items(): - assert isinstance(dataset, FairseqDataset) - if self.longest_dataset is None or len(dataset) > len(self.longest_dataset): - self.longest_dataset = dataset - self.longest_dataset_key = key - - self._ordered_indices = None + self.longest_dataset_key = max(datasets, key=lambda k: len(datasets[k])) + self.longest_dataset = datasets[self.longest_dataset_key] + self._ordered_indices: Dict[str, Sequence[int]] = None def _map_index(self, key, index): assert ( self._ordered_indices is not None ), "Must call RoundRobinZipDatasets.ordered_indices() first" - return self._ordered_indices[key][index % len(self.datasets[key])] + o = self._ordered_indices[key] + return o[index % len(o)] def __getitem__(self, index): if self.eval_key is None: @@ -58,6 +63,8 @@ def __getitem__(self, index): return self.datasets[self.eval_key][self._map_index(self.eval_key, index)] def __len__(self): + if self._ordered_indices is not None: + return len(self._ordered_indices[self.longest_dataset_key]) return len(self.longest_dataset) def collater(self, samples): @@ -96,7 +103,7 @@ def ordered_indices(self): if self._ordered_indices is None: # Call the underlying dataset's ordered_indices() here, so that we # get the same random ordering as we would have from using the - # underlying dataset directly. + # underlying sub-datasets directly. self._ordered_indices = OrderedDict( [ (key, dataset.ordered_indices()) @@ -105,6 +112,42 @@ def ordered_indices(self): ) return np.arange(len(self)) + def filter_indices_by_size(self, indices, max_positions=None): + """ + Filter each sub-dataset independently, then update the round robin to work + on the filtered sub-datasets. + """ + + def _deep_until_language_pair(dataset): + if isinstance(dataset, LanguagePairDataset): + return dataset + if hasattr(dataset, "tgt_dataset"): + return _deep_until_language_pair(dataset.tgt_dataset) + if hasattr(dataset, "dataset"): + return _deep_until_language_pair(dataset.dataset) + raise Exception(f"Don't know how to unwrap this dataset: {dataset}") + + if not isinstance(max_positions, dict): + max_positions = {k: max_positions for k in self.datasets.keys()} + ignored_some = False + for key, dataset in self.datasets.items(): + dataset = _deep_until_language_pair(dataset) + self._ordered_indices[key], ignored = dataset.filter_indices_by_size( + self._ordered_indices[key], max_positions[key] + ) + if len(ignored) > 0: + ignored_some = True + logger.warning( + f"{len(ignored)} samples from {key} have invalid sizes and will be skipped, " + f"max_positions={max_positions[key]}, first few sample ids={ignored[:10]}" + ) + # Since we are modifying in place the _ordered_indices, + # it's not possible anymore to return valid ignored indices. + # Hopefully the extra debug information print above should be enough to debug. 
+ # Ideally we would receive ignore_invalid_inputs so that we could have + # a proper error message. + return (np.arange(len(self)), [0] if ignored_some else []) + @property def supports_prefetch(self): return all( diff --git a/fairseq/data/span_mask_tokens_dataset.py b/fairseq/data/span_mask_tokens_dataset.py new file mode 100644 index 0000000000..72189bd378 --- /dev/null +++ b/fairseq/data/span_mask_tokens_dataset.py @@ -0,0 +1,293 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch + +from . import Dictionary, FairseqDataset, data_utils + + +def collate( + samples, + pad_idx, + eos_idx, + vocab, + left_pad_source=False, + left_pad_target=False, + input_feeding=True, + pad_to_length=None, +): + assert input_feeding + if len(samples) == 0: + return {} + + def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None): + return data_utils.collate_tokens( + [s[key] for s in samples], + pad_idx, + eos_idx=None, # use eos_idx of each sample instead of vocab.eos() + left_pad=left_pad, + move_eos_to_beginning=move_eos_to_beginning, + pad_to_length=pad_to_length, + ) + + id = torch.LongTensor([s["id"] for s in samples]) + src_tokens = merge( + "source", + left_pad=left_pad_source, + pad_to_length=pad_to_length["source"] if pad_to_length is not None else None, + ) + # sort by descending source length + src_lengths = torch.LongTensor([s["source"].numel() for s in samples]) + src_lengths, sort_order = src_lengths.sort(descending=True) + id = id.index_select(0, sort_order) + src_tokens = src_tokens.index_select(0, sort_order) + + prev_output_tokens = None + target = None + if samples[0].get("target", None) is not None: + target = merge( + "target", + left_pad=left_pad_target, + pad_to_length=pad_to_length["target"] + if pad_to_length is not None + else None, + ) + target = target.index_select(0, sort_order) + ntokens = sum(len(s["target"]) for s in samples) + + if input_feeding: + # we create a shifted version of targets for feeding the + # previous output token(s) into the next decoder step + prev_output_tokens = merge( + "target", + left_pad=left_pad_target, + move_eos_to_beginning=True, + pad_to_length=pad_to_length["target"] + if pad_to_length is not None + else None, + ) + prev_output_tokens = prev_output_tokens.index_select(0, sort_order) + else: + ntokens = sum(len(s["source"]) for s in samples) + + batch = { + "id": id, + "ntokens": ntokens, + "net_input": { + "src_tokens": src_tokens, + "src_lengths": src_lengths, + }, + "target": target, + "target_lengths": torch.LongTensor([len(t) for t in target]), + "nsentences": samples[0]["source"].size(0), + "sort_order": sort_order, + } + if prev_output_tokens is not None: + batch["net_input"]["prev_output_tokens"] = prev_output_tokens + + return batch + + +class SpanMaskedTokensDataset(FairseqDataset): + """ + A wrapper around TokenBlockDataset for T5 dataset. + + Args: + dataset (~torch.utils.data.Dataset): dataset to wrap + vocab (~fairseq.data.Dictionary): vocabulary + noise_density (float): fraction of the tokens to select as noise. + mean_noise_span_length (float): mean noise span length. + shuffle (bool, optional): shuffle the elements before batching. + Default: ``True`` + seed: Seed for random number generator for reproducibility. 
+ """ + + def __init__( + self, + dataset: torch.utils.data.Dataset, + vocab: Dictionary, + noise_density: float, + mean_noise_span_length: float, + shuffle: bool, + seed: int = 1, + ): + self.dataset = dataset + self.vocab = vocab + self.seed = seed + self.noise_density = noise_density + self.mean_noise_span_length = mean_noise_span_length + self.shuffle = shuffle + self.epoch = 0 + + @property + def can_reuse_epoch_itr_across_epochs(self): + return True # only the noise changes, not item sizes + + def set_epoch(self, epoch, **unused): + self.epoch = epoch + + def __getitem__(self, index): + with data_utils.numpy_seed(self.seed, self.epoch, index): + item = self.dataset[index] + assert item[-1] == self.vocab.eos() + + noise_mask = self.random_spans_noise_mask(len(item)) + + source_sentinel_ids = self.create_sentinel_ids(noise_mask.astype(np.int8)) + source = self.filter_input_ids(item, source_sentinel_ids) + + target_sentinel_ids = self.create_sentinel_ids( + (~noise_mask).astype(np.int8) + ) + target = self.filter_input_ids(item, target_sentinel_ids) + + return { + "id": index, + "source": torch.from_numpy(source), + "target": torch.from_numpy(target), + } + + def random_spans_noise_mask(self, length): + + """ + This function is copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ . + Noise mask consisting of random spans of noise tokens. + The number of noise tokens and the number of noise spans and non-noise spans + are determined deterministically as follows: + num_noise_tokens = round(length * noise_density) + num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) + Spans alternate between non-noise and noise, beginning with non-noise. + Subject to the above restrictions, all masks are equally likely. + Args: + length: an int32 scalar (length of the incoming token sequence) + Returns: + a boolean tensor with shape [length] + """ + + orig_length = length + + num_noise_tokens = int(np.round(length * self.noise_density)) + # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. + num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) + num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length)) + + # avoid degeneracy by ensuring positive number of noise spans + num_noise_spans = max(num_noise_spans, 1) + num_nonnoise_tokens = length - num_noise_tokens + + # pick the lengths of the noise spans and the non-noise spans + def _random_segmentation(num_items, num_segments): + """ + Partition a sequence of items randomly into non-empty segments. 
+ Args: + num_items: an integer scalar > 0 + num_segments: an integer scalar in [1, num_items] + Returns: + a Tensor with shape [num_segments] containing positive integers that add up to num_items + """ + mask_indices = np.arange(num_items - 1) < (num_segments - 1) + np.random.shuffle(mask_indices) + first_in_segment = np.pad(mask_indices, [[1, 0]]) + segment_id = np.cumsum(first_in_segment) + # count length of subsegments assuming that list is sorted + _, segment_length = np.unique(segment_id, return_counts=True) + return segment_length + + noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans) + nonnoise_span_lengths = _random_segmentation( + num_nonnoise_tokens, num_noise_spans + ) + + interleaved_span_lengths = np.reshape( + np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), + [num_noise_spans * 2], + ) + span_starts = np.cumsum(interleaved_span_lengths)[:-1] + span_start_indicator = np.zeros((length,), dtype=np.int8) + span_start_indicator[span_starts] = True + span_num = np.cumsum(span_start_indicator) + is_noise = np.equal(span_num % 2, 1) + + return is_noise[:orig_length] + + def create_sentinel_ids(self, mask_indices): + """ + Sentinel ids creation given the indices that should be masked. + The start indices of each mask are replaced by the sentinel ids in increasing + order. Consecutive mask indices to be deleted are replaced with `-1`. + """ + start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices + + sentinel_ids = np.where( + start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices + ) + # making sure all sentinel tokens are unique over the example + sentinel_ids = np.where(sentinel_ids != 0, len(self.vocab) - sentinel_ids, 0) + sentinel_ids -= mask_indices - start_indices + return sentinel_ids + + @staticmethod + def filter_input_ids(input_ids, sentinel_ids): + """ + Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. + This will reduce the sequence length from `expanded_inputs_length` to `input_length`. + """ + input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids) + + # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are + # masked tokens coming after sentinel tokens and should be removed + return input_ids_full[input_ids_full >= 0] + + def __len__(self): + return len(self.dataset) + + def collater(self, samples, pad_to_length=None): + """ + Merge a list of samples to form a mini-batch. + Args: + samples (List[dict]): samples to collate + Returns: + dict: a mini-batch of data + """ + return collate( + samples, + self.vocab.pad(), + self.vocab.eos(), + self.vocab, + pad_to_length=pad_to_length, + ) + + def num_tokens(self, index): + """Return the number of tokens in a sample. This value is used to + enforce ``--max-tokens`` during batching.""" + return self.dataset.sizes[index] + + def size(self, index): + """Return an example's size as a float or tuple. This value is used when + filtering a dataset with ``--max-positions``.""" + return self.dataset.sizes[index] + + def ordered_indices(self): + """Return an ordered list of indices. 
Batches will be constructed based + on this order.""" + if self.shuffle: + indices = np.random.permutation(len(self)) + else: + indices = np.arange(len(self)) + return indices[np.argsort(self.dataset.sizes[indices], kind="mergesort")] + + def prefetch(self, indices): + # delegate prefetching to the wrapped dataset + self.dataset.prefetch(indices) + + @property + def supports_prefetch(self): + return ( + hasattr(self.dataset, "supports_prefetch") + and self.dataset.supports_prefetch + ) diff --git a/fairseq/data/speech_dlm_dataset.py b/fairseq/data/speech_dlm_dataset.py new file mode 100644 index 0000000000..06c4808f0a --- /dev/null +++ b/fairseq/data/speech_dlm_dataset.py @@ -0,0 +1,307 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from collections import OrderedDict + +import numpy as np +import torch + +from fairseq.data import FairseqDataset, MonolingualDataset, data_utils + + +class SpeechDLMDataset(FairseqDataset): + """The dataset used to train the SpeechDLM model as described in the paper: + https://arxiv.org/pdf/2203.16502.pdf + + The input datasets is expected to be a dict over channel names with the values + being instances of :class:`~fairseq.data.MonolingualDataset`. + + Each element of SpeechDLMDataset is a dictionary with the following keys: + - `id` (int) : index of the item + - `source` (OrderedDict[str, Tensor of shape (seq_len,)]) : dictionary over + channels with the values containing the input unit tokens + - `target_next` (OrderedDict[str, Tensor of shape (seq_len,)]) : dictionary + over channels with the values containing the next unit tokens (input + tokens shifted by 1). + Its value is None if 'next' not in self.targets + - `target_edge` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : dictionary + over channels with the values containing the edge unit tokens (input tokens + deduplicated). + Its value is None if 'edge' not in self.targets + - `target_duration` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : + dictionary over channels with the values being the durations of the edge units. + Its value is None if 'duration' not in targets. + - `target_edge_indices` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : + dictionary over channels with the values being the indices of the edge units + in the source sequence. + Its value is None if neither 'edge' nor 'duration' in targets. + + Args: + datasets (Dict[str, ~fairseq.data.MonolingualDataset]): a dictionary of + :class:`~fairseq.data.MonolingualDataset` instances. + targets (List[str]): list of the target types that the SpeechDLM model + should predict. Can be one of "next", "edge", "duration". + shuffle (bool, optional): shuffle the elements before batching + (default: True). 
+ """ + + def __init__( + self, datasets, targets=None, max_target_durations=None, shuffle=False + ): + super().__init__() + if isinstance(datasets, dict): + datasets = OrderedDict(datasets) + assert isinstance( + datasets, OrderedDict + ), "datasets is expected to be an instance of Dictionary or OrderedDict" + assert datasets, "datasets is None" + for dataset in datasets.values(): + assert isinstance( + dataset, MonolingualDataset + ), "Each value of datasets is expected to be an instance of MonolingualDataset" + + self.datasets = datasets + self.targets = targets + if max_target_durations is not None and max_target_durations > 0: + self.max_target_durations = max_target_durations + else: + self.max_target_durations = float("inf") + self.sizes = next(iter(datasets.values())).sizes + self.vocab = next(iter(datasets.values())).vocab + self.length = len(next(iter(datasets.values()))) + self.shuffle = shuffle + + for channel, dataset in datasets.items(): + assert ( + len(dataset) == self.length + ), "[{}] length mismatch ({} vs {})".format( + channel, len(dataset), self.length + ) + assert (dataset.sizes == self.sizes).all(), "[{}] sizes mismatch".format( + channel + ) + + assert ( + dataset.vocab.pad() == self.vocab.pad() + ), "pad token is expected to be the same" + assert ( + dataset.vocab.eos() == self.vocab.eos() + ), "eos token is expected to be the same" + assert ( + dataset.vocab.bos() == self.vocab.bos() + ), "bos token is expected to be the same" + assert ( + dataset.vocab.unk() == self.vocab.unk() + ), "unk token is expected to be the same" + + def __getitem__(self, index): + source = OrderedDict( + [ + (key, dataset[index]["source"]) + for (key, dataset) in self.datasets.items() + ] + ) + + item = { + "id": index, + "source": source, + "target_next": None, + "target_edge": None, + "target_duration": None, + "target_edge_indices": None, + } + + if self.targets is not None: + for channel in self.datasets: + target = self._get_target(index, channel) + for t in target: + if item[f"target_{t}"] is None: + item[f"target_{t}"] = OrderedDict() + item[f"target_{t}"][channel] = target[t] + + return item + + def __len__(self): + return self.length + + def _get_target(self, index, channel): + """Get target in one of ['next', 'edge', 'duration'] + - 'next' is the future unit + - 'edge' is the edge unit + - 'duration' is the duration of the edge unit + """ + if self.targets is not None: + target = {} + pad_idx = self.vocab.pad() + max_dur = self.max_target_durations + future_target = self.datasets[channel][index]["target"] + if "edge" in self.targets or "duration" in self.targets: + edge_units, edge_unit_counts = torch.unique_consecutive( + future_target, return_counts=True + ) + padding_end = edge_units[-1] == pad_idx + if padding_end: + edge_units = edge_units[:-1] + edge_unit_counts = edge_unit_counts[:-1] + edge_indices = torch.cumsum(edge_unit_counts, 0) + edge_indices = torch.cat([torch.tensor([0]), edge_indices[:-1]]) + target["edge_indices"] = edge_indices + + for t in self.targets: + if t == "next": + target[t] = future_target + elif t == "edge": + target[t] = edge_units + elif t == "duration": + # count the remaining duration of the last edge indices in the next sentence + if not padding_end and index < len(self.datasets[channel]) - 1: + i = 0 + next_sentence_target = self.datasets[channel][index + 1][ + "target" + ] + while ( + next_sentence_target[i] == edge_units[-1] + and edge_unit_counts[-1] + i < max_dur + ): + i += 1 + edge_unit_counts[-1] += i + + # cut off to the maximal 
threshold + if max_dur: + edge_unit_counts[edge_unit_counts > max_dur] = max_dur + + target[t] = edge_unit_counts + else: + raise Exception("invalid target " + t) + + return target + + def collater(self, samples): + """Merge a list of samples to form a mini-batch. + + Args: + samples (List[dict]): samples to collate + + Returns: + dict: a mini-batch with the following keys: + + - `id` (LongTensor): example IDs in the original input order + - `ntokens` (int): total number of tokens in the batch + - `net_input` (dict): the input to the Model, containing keys: + + - `src_tokens` (OrderedDict[str, LongTensor]): dictionary + over channel with the values being padded 2D Tensor of + samples `source` of shape `(bsz, src_len)`. + Padding will appear on the right. + - `src_lengths` (LongTensor): lengths of source sentences + in the mini-batch + + - `target` (dict): the target of the Model, containing keys: + + - `next` (OrderedDict[str, LongTensor]): dictionary + over channel with the values being padded 2D Tensor of + batch samples' `target_next` of shape `(bsz, tgt_len)`. + Padding will appear on the right. + - `edge` (OrderedDict[str, LongTensor]): dictionary + over channel with the values being the concatenated + 1D Tensor of batch samples' `target_edge` of shape + `(sum of dedup_tgt_len,)` + - `duration` (OrderedDict[str, LongTensor]): dictionary + over channel with the values being the concatenated + 1D Tensor of batch samples' `target_duration` of shape + `(sum of dedup_tgt_len,)` + - `edge_indices` (OrderedDict[str, LongTensor]): dictionary + over channel with the values being the concatenated + 1D Tensor of batch samples' `target_edge_indices` of + shape `(sum of dedup_tgt_len,)`. + The indices are added to multiplies of batch size + such that they are the actual indices in the flatten + `src_tokens` Tensor + """ + if len(samples) == 0: + return {} + + pad_idx = self.vocab.pad() + eos_idx = self.vocab.eos() + + def merge(key, max_size=None): + if samples[0][key] is None: + return None + res = OrderedDict() + for channel in samples[0][key]: + if key in ["source", "target_next"]: + # fill batch of shape: (batch_size, max_size) + res[channel] = data_utils.collate_tokens( + [s[key][channel] for s in samples], + pad_idx, + eos_idx, + left_pad=False, + ) + elif key in ["target_edge", "target_duration"]: + # concatenate the edge units/duration + res[channel] = torch.cat([s[key][channel] for s in samples]) + elif key == "target_edge_indices": + # increase the edge indices to the indices in the flatten batch + res[channel] = torch.cat( + [s[key][channel] + i * max_size for i, s in enumerate(samples)] + ) + + return res + + src_tokens = merge("source") + tgt_next = merge("target_next") + tgt_edge = merge("target_edge") + tgt_duration = merge("target_duration") + tgt_edge_indices = merge( + "target_edge_indices", max_size=next(iter(src_tokens.values())).size(-1) + ) + return { + "id": torch.LongTensor([s["id"] for s in samples]), + "nsentences": len(samples), + "ntokens": sum(len(item) for s in samples for item in s["source"].values()), + "net_input": { + "src_tokens": src_tokens, + "src_lengths": torch.LongTensor( + [next(iter(s["source"].values())).numel() for s in samples] + ), + }, + "target": { + "next": tgt_next, + "edge": tgt_edge, + "duration": tgt_duration, + "edge_indices": tgt_edge_indices, + }, + } + + def num_tokens(self, index): + """Return the number of tokens in a sample. 
This value is used to + enforce ``--max-tokens`` during batching.""" + return self.sizes[index] + + def size(self, index): + """Return an example's size as a float or tuple. This value is used when + filtering a dataset with ``--max-positions``.""" + return self.sizes[index] + + def ordered_indices(self): + """Return an ordered list of indices. Batches will be constructed based + on this order.""" + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + order.append(self.sizes) + return np.lexsort(order) + + @property + def supports_prefetch(self): + return all( + getattr(dataset, "supports_prefetch", False) + for dataset in self.datasets.values() + ) + + def prefetch(self, indices): + for key, dataset in self.datasets.items(): + dataset.prefetch(indices) diff --git a/fairseq/data/subsample_dataset.py b/fairseq/data/subsample_dataset.py index 48feaf883f..fe5c7e2ac8 100644 --- a/fairseq/data/subsample_dataset.py +++ b/fairseq/data/subsample_dataset.py @@ -3,9 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import contextlib import logging import numpy as np +from fairseq.data.data_utils import numpy_seed from . import BaseWrapperDataset @@ -21,13 +23,14 @@ class SubsampleDataset(BaseWrapperDataset): size_ratio(float): the ratio to subsample to. must be between 0 and 1 (exclusive) """ - def __init__(self, dataset, size_ratio, shuffle=False): + def __init__(self, dataset, size_ratio, shuffle=False, seed=None): super().__init__(dataset) assert size_ratio < 1 self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int) - self.indices = np.random.choice( - list(range(len(self.dataset))), self.actual_size, replace=False - ) + with numpy_seed(seed) if seed is not None else contextlib.ExitStack(): + self.indices = np.random.choice( + list(range(len(self.dataset))), self.actual_size, replace=False + ) self.shuffle = shuffle logger.info( "subsampled dataset from {} to {} (ratio={})".format( diff --git a/fairseq/data/text_compressor.py b/fairseq/data/text_compressor.py new file mode 100644 index 0000000000..d699f2ea29 --- /dev/null +++ b/fairseq/data/text_compressor.py @@ -0,0 +1,58 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from enum import Enum + + +class TextCompressionLevel(Enum): + none = 0 + low = 1 + high = 2 + + +class TextCompressor(object): + def __init__( + self, level: TextCompressionLevel, max_input_byte_length: int = 2**16 + ): + self.level = level + self.max_input_length = max_input_byte_length + + def compress(self, text: str) -> bytes: + if self.level == TextCompressionLevel.low: + import zlib + + # zlib: built-in, fast + return zlib.compress(text.encode(), level=0) + elif self.level == TextCompressionLevel.high: + try: + import unishox2 + + # unishox2: optimized for short text but slower + except ImportError: + raise ImportError( + "Please install unishox2 for the text compression feature: " + "pip install unishox2-py3" + ) + assert len(text.encode()) <= self.max_input_length + return unishox2.compress(text)[0] + else: + return text.encode() + + def decompress(self, compressed: bytes) -> str: + if self.level == TextCompressionLevel.low: + import zlib + + return zlib.decompress(compressed).decode() + elif self.level == TextCompressionLevel.high: + try: + import unishox2 + except ImportError: + raise ImportError( + "Please install unishox2 for the text compression feature: " + "pip install unishox2-py3" + ) + return unishox2.decompress(compressed, self.max_input_length) + else: + return compressed.decode() diff --git a/fairseq/data/token_block_dataset.py b/fairseq/data/token_block_dataset.py index aa33f9d06f..a414e7ef64 100644 --- a/fairseq/data/token_block_dataset.py +++ b/fairseq/data/token_block_dataset.py @@ -6,6 +6,8 @@ import numpy as np import torch from fairseq.data import FairseqDataset, plasma_utils +from fairseq.data.indexed_dataset import best_fitting_int_dtype +from typing import Tuple class TokenBlockDataset(FairseqDataset): @@ -41,7 +43,49 @@ def __init__( break_mode=None, include_targets=False, document_sep_len=1, + use_plasma_view=False, + split_path=None, + plasma_path=None, ): + + super().__init__() + self.dataset = dataset + self.pad = pad + self.eos = eos + self.include_targets = include_targets + + assert len(dataset) > 0 + + assert len(dataset) == len(sizes) + _sizes, block_to_dataset_index, slice_indices = self._build_slice_indices( + sizes, break_mode, document_sep_len, block_size + ) + if use_plasma_view: + plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset)) + self._slice_indices = plasma_utils.PlasmaView( + slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path + ) + self._sizes = plasma_utils.PlasmaView( + _sizes, split_path, (plasma_id, 1), plasma_path=plasma_path + ) + self._block_to_dataset_index = plasma_utils.PlasmaView( + block_to_dataset_index, + split_path, + (plasma_id, 2), + plasma_path=plasma_path, + ) + else: + self._slice_indices = plasma_utils.PlasmaArray(slice_indices) + self._sizes = plasma_utils.PlasmaArray(_sizes) + self._block_to_dataset_index = plasma_utils.PlasmaArray( + block_to_dataset_index + ) + + @staticmethod + def _build_slice_indices( + sizes, break_mode, document_sep_len, block_size + ) -> Tuple[np.ndarray]: + """Use token_block_utils_fast to build arrays for indexing into self.dataset""" try: from fairseq.data.token_block_utils_fast import ( _get_slice_indices_fast, @@ -53,15 +97,6 @@ def __init__( "or `python setup.py build_ext --inplace`" ) - super().__init__() - self.dataset = dataset - self.pad = pad - self.eos = eos - self.include_targets = include_targets - - assert len(dataset) == len(sizes) - assert len(dataset) > 0 - if isinstance(sizes, list): sizes = np.array(sizes, dtype=np.int64) else: 
@@ -78,7 +113,7 @@ def __init__( slice_indices = _get_slice_indices_fast( sizes, str(break_mode), block_size, document_sep_len ) - self._sizes = slice_indices[:, 1] - slice_indices[:, 0] + _sizes = slice_indices[:, 1] - slice_indices[:, 0] # build index mapping block indices to the underlying dataset indices if break_mode == "eos": @@ -87,7 +122,7 @@ def __init__( [ np.arange(len(sizes)), # starting index in dataset np.zeros( - len(sizes), dtype=np.long + len(sizes), dtype=np.compat.long ), # starting offset within starting index np.arange(len(sizes)), # ending index in dataset ], @@ -98,9 +133,13 @@ def __init__( sizes, slice_indices, ) - self._slice_indices = plasma_utils.PlasmaArray(slice_indices) - self._sizes = plasma_utils.PlasmaArray(self._sizes) - self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index) + size_dtype = np.uint16 if block_size < 65535 else np.uint32 + num_tokens = slice_indices[-1].max() + slice_indices_dtype = best_fitting_int_dtype(num_tokens) + slice_indices = slice_indices.astype(slice_indices_dtype) + _sizes = _sizes.astype(size_dtype) + block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype) + return _sizes, block_to_dataset_index, slice_indices @property def slice_indices(self): @@ -124,7 +163,6 @@ def __getitem__(self, index): buffer = torch.cat( [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)] ) - slice_s, slice_e = self.slice_indices[index] length = slice_e - slice_s s, e = start_offset, start_offset + length diff --git a/fairseq/data/token_block_utils_fast.pyx b/fairseq/data/token_block_utils_fast.pyx index 5a2f16ec34..08af4f3061 100644 --- a/fairseq/data/token_block_utils_fast.pyx +++ b/fairseq/data/token_block_utils_fast.pyx @@ -170,7 +170,7 @@ cdef class DatasetSearcher(object): self.current_offset += to_consume self.current_i += to_consume else: - assert remaining > 0 + assert remaining >= 0 self.current_i += remaining self.current_index += 1 self.current_offset = 0 diff --git a/fairseq/data/transform_eos_concat_langpair_dataset.py b/fairseq/data/transform_eos_concat_langpair_dataset.py new file mode 100644 index 0000000000..effa127d50 --- /dev/null +++ b/fairseq/data/transform_eos_concat_langpair_dataset.py @@ -0,0 +1,139 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +from torch.utils.data.dataloader import default_collate + +from fairseq.data import ConcatDataset + +logger = logging.getLogger(__name__) + + +class TransformEosConcatLangPairDataset(ConcatDataset): + """ + It is a combination of TransformEosLangPairDataset and ConcatDataset for multiple LangPairDataset datasets. 
+ Assume all datasets share the same src_eos, tgt_bos, left_pad_source and left_pad_target + """ + + def __init__( + self, + datasets, + src_eos, + tgt_bos, + new_src_eos=None, + new_tgt_bos=None, + ): + super().__init__(datasets) + if new_src_eos is not None and new_src_eos != []: + assert len(new_src_eos) == len(datasets) + else: + new_src_eos = [] + if new_tgt_bos is not None and new_tgt_bos != []: + assert len(new_tgt_bos) == len(datasets) + else: + new_tgt_bos = [] + self.src_eos = src_eos + self.tgt_bos = tgt_bos + self.new_src_eos = ( + torch.LongTensor(new_src_eos).cpu() if len(new_src_eos) > 0 else [] + ) + self.new_tgt_bos = ( + torch.LongTensor(new_tgt_bos).cpu() if len(new_tgt_bos) > 0 else [] + ) + self.left_pad_source = self.is_left_pad_source(datasets) + self.left_pad_target = self.is_left_pad_target(datasets) + self.pad_idx = self.src_dict_pad() + + def src_dict_pad(self): + if hasattr(self.datasets[0], "src_dict"): + return self.datasets[0].src_dict.pad() + if hasattr(self.datasets[0], "dataset"): + return self.datasets[0].dataset.src_dict.pad() + raise NotImplementedError("No src_dict is found") + + def __getitem__(self, idx): + dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) + return dataset_idx, self.datasets[dataset_idx][sample_idx] + + def is_left_pad_source(self, datasets): + def _left_pad_source(ds): + if hasattr(ds, "left_pad_source"): + return ds.left_pad_source + if hasattr(ds, "dataset"): + return _left_pad_source(ds.dataset) + logger.warn(f"{type(ds)} has no left_pad_source, using default True") + return True + + left_pad_source = _left_pad_source(datasets[0]) + for ds in datasets: + if left_pad_source != _left_pad_source(ds): + raise ValueError("Different left_pad_source setting detected!") + return left_pad_source + + def is_left_pad_target(self, datasets): + def _left_pad_target(ds): + if hasattr(ds, "left_pad_target"): + return ds.left_pad_target + if hasattr(ds, "dataset"): + return _left_pad_target(ds.dataset) + logger.warn(f"{type(ds)} has no left_pad_target, using default False") + return False + + left_pad_target = _left_pad_target(datasets[0]) + for ds in datasets: + if left_pad_target != _left_pad_target(ds): + raise ValueError("Different left_pad_target setting detected!") + return left_pad_target + + def collater(self, samples, **extra_args): + if len(samples) == 0: + return samples + + dataset_ids = [s[0] for s in samples] + samples = [s[1] for s in samples] + + if hasattr(self.datasets[0], "collater"): + samples = self.datasets[0].collater(samples, **extra_args) + else: + samples = default_collate(samples, **extra_args) + + if len(self.new_src_eos) > 0: + if self.left_pad_source: + assert ( + samples["net_input"]["src_tokens"][:, -1] != self.src_eos + ).sum() == 0 + samples["net_input"]["src_tokens"][:, -1] = self.new_src_eos[ + dataset_ids + ] + + else: + eos_idx = samples["net_input"]["src_lengths"] - 1 + assert ( + samples["net_input"]["src_tokens"][ + torch.arange(eos_idx.size(0)), eos_idx + ] + != self.src_eos + ).sum() == 0 + samples["net_input"]["src_tokens"].scatter_( + 1, eos_idx.view(-1, 1), self.new_src_eos[dataset_ids].view(-1, 1) + ) + + if len(self.new_tgt_bos) > 0 and "prev_output_tokens" in samples["net_input"]: + if self.left_pad_target: + # TODO: support different padding direction on target side + raise NotImplementedError( + "TransformEosLangPairDataset does not implement --left-pad-target True option" + ) + else: + assert ( + samples["net_input"]["prev_output_tokens"][:, 0] != self.tgt_bos + ).sum() == 0 + 
samples["net_input"]["prev_output_tokens"][:, 0] = self.new_tgt_bos[ + dataset_ids + ] + + return samples diff --git a/fairseq/data/transform_eos_lang_pair_dataset.py b/fairseq/data/transform_eos_lang_pair_dataset.py index 1dd3d93d2b..d8b2109014 100644 --- a/fairseq/data/transform_eos_lang_pair_dataset.py +++ b/fairseq/data/transform_eos_lang_pair_dataset.py @@ -49,6 +49,11 @@ def __len__(self): def collater(self, samples, **extra_args): samples = self.dataset.collater(samples, **extra_args) + if len(samples) == 0: + return samples + + if "net_input" not in samples: + return samples if self.new_src_eos is not None: if self.dataset.left_pad_source: diff --git a/fairseq/dataclass/__init__.py b/fairseq/dataclass/__init__.py index 32870814d5..25408d28ec 100644 --- a/fairseq/dataclass/__init__.py +++ b/fairseq/dataclass/__init__.py @@ -3,7 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .utils import ChoiceEnum, FairseqDataclass +from .configs import FairseqDataclass +from .constants import ChoiceEnum -__all__ = ["FairseqDataclass", "ChoiceEnum"] +__all__ = [ + "FairseqDataclass", + "ChoiceEnum", +] diff --git a/fairseq/dataclass/data_class.py b/fairseq/dataclass/configs.py similarity index 63% rename from fairseq/dataclass/data_class.py rename to fairseq/dataclass/configs.py index bcb802e651..af957fec64 100644 --- a/fairseq/dataclass/data_class.py +++ b/fairseq/dataclass/configs.py @@ -3,33 +3,96 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import logging +import os import sys -from argparse import Namespace from dataclasses import _MISSING_TYPE, dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, List, Optional import torch -from fairseq.data.indexed_dataset import get_available_dataset_impl +from omegaconf import II, MISSING + from fairseq.dataclass.constants import ( + DATASET_IMPL_CHOICES, DDP_BACKEND_CHOICES, - DISTRIBUTED_WRAPPER_CHOICES, + DDP_COMM_HOOK_CHOICES, GENERATION_CONSTRAINTS_CHOICES, GENERATION_DECODING_FORMAT_CHOICES, LOG_FORMAT_CHOICES, PIPELINE_CHECKPOINT_CHOICES, + PRINT_ALIGNMENT_CHOICES, ZERO_SHARDING_CHOICES, ) -from fairseq.dataclass.utils import ChoiceEnum, FairseqDataclass -from fairseq.models import ARCH_MODEL_REGISTRY, MODEL_DATACLASS_REGISTRY -from fairseq.optim.bmuf import FairseqBMUFConfig -from fairseq.registry import REGISTRIES -from fairseq.tasks import TASK_DATACLASS_REGISTRY -from hydra.core.config_store import ConfigStore -from omegaconf import II -logger = logging.getLogger(__name__) +@dataclass +class FairseqDataclass: + """fairseq base dataclass that supported fetching attributes and metas""" + + _name: Optional[str] = None + + @staticmethod + def name(): + return None + + def _get_all_attributes(self) -> List[str]: + return [k for k in self.__dataclass_fields__.keys()] + + def _get_meta( + self, attribute_name: str, meta: str, default: Optional[Any] = None + ) -> Any: + return self.__dataclass_fields__[attribute_name].metadata.get(meta, default) + + def _get_name(self, attribute_name: str) -> str: + return self.__dataclass_fields__[attribute_name].name + + def _get_default(self, attribute_name: str) -> Any: + if hasattr(self, attribute_name): + if str(getattr(self, attribute_name)).startswith("${"): + return str(getattr(self, attribute_name)) + elif str(self.__dataclass_fields__[attribute_name].default).startswith( + "${" + ): + 
return str(self.__dataclass_fields__[attribute_name].default) + elif ( + getattr(self, attribute_name) + != self.__dataclass_fields__[attribute_name].default + ): + return getattr(self, attribute_name) + + f = self.__dataclass_fields__[attribute_name] + if not isinstance(f.default_factory, _MISSING_TYPE): + return f.default_factory() + return f.default + + def _get_type(self, attribute_name: str) -> Any: + return self.__dataclass_fields__[attribute_name].type + + def _get_help(self, attribute_name: str) -> Any: + return self._get_meta(attribute_name, "help") + + def _get_argparse_const(self, attribute_name: str) -> Any: + return self._get_meta(attribute_name, "argparse_const") + + def _get_argparse_alias(self, attribute_name: str) -> Any: + return self._get_meta(attribute_name, "argparse_alias") + + def _get_choices(self, attribute_name: str) -> Any: + return self._get_meta(attribute_name, "choices") + + @classmethod + def from_namespace(cls, args): + if isinstance(args, cls): + return args + else: + config = cls() + for k in config.__dataclass_fields__.keys(): + if k.startswith("_"): + # private member, skip + continue + if hasattr(args, k): + setattr(config, k, getattr(args, k)) + + return config @dataclass @@ -48,6 +111,20 @@ class CommonConfig(FairseqDataclass): log_format: Optional[LOG_FORMAT_CHOICES] = field( default=None, metadata={"help": "log format to use"} ) + log_file: Optional[str] = field( + default=None, metadata={"help": "log file to copy metrics to."} + ) + aim_repo: Optional[str] = field( + default=None, + metadata={"help": "path to Aim repository"}, + ) + aim_run_hash: Optional[str] = field( + default=None, + metadata={ + "help": "Aim run hash. If skipped, creates or continues run " + "based on save_dir" + }, + ) tensorboard_logdir: Optional[str] = field( default=None, metadata={ @@ -55,6 +132,14 @@ class CommonConfig(FairseqDataclass): "of running tensorboard (default: no tensorboard logging)" }, ) + wandb_project: Optional[str] = field( + default=None, + metadata={"help": "Weights and Biases project name to use for logging"}, + ) + azureml_logging: Optional[bool] = field( + default=False, + metadata={"help": "Log scalars to AzureML context"}, + ) seed: int = field( default=1, metadata={"help": "pseudo random number generator seed"} ) @@ -78,7 +163,7 @@ class CommonConfig(FairseqDataclass): default=False, metadata={"help": "don't flatten FP16 grads tensor"} ) fp16_init_scale: int = field( - default=2 ** 7, metadata={"help": "default FP16 loss scale"} + default=2**7, metadata={"help": "default FP16 loss scale"} ) fp16_scale_window: Optional[int] = field( default=None, @@ -90,13 +175,36 @@ class CommonConfig(FairseqDataclass): "help": "pct of updates that can overflow before decreasing the loss scale" }, ) + on_cpu_convert_precision: bool = field( + default=False, + metadata={ + "help": "if set, the floating point conversion to fp16/bf16 runs on CPU. " + "This reduces bus transfer time and GPU memory usage." 
+ }, + ) min_loss_scale: float = field( default=1e-4, - metadata={"help": "minimum FP16 loss scale, after which training is stopped"}, + metadata={ + "help": "minimum FP16/AMP loss scale, after which training is stopped" + }, ) threshold_loss_scale: Optional[float] = field( default=None, metadata={"help": "threshold FP16 loss scale from below"} ) + amp: bool = field(default=False, metadata={"help": "use automatic mixed precision"}) + amp_batch_retries: int = field( + default=2, + metadata={ + "help": "number of retries of same batch after reducing loss scale with AMP" + }, + ) + amp_init_scale: int = field( + default=2**7, metadata={"help": "default AMP loss scale"} + ) + amp_scale_window: Optional[int] = field( + default=None, + metadata={"help": "number of updates before increasing AMP loss scale"}, + ) user_dir: Optional[str] = field( default=None, metadata={ @@ -120,6 +228,28 @@ class CommonConfig(FairseqDataclass): profile: bool = field( default=False, metadata={"help": "enable autograd profiler emit_nvtx"} ) + reset_logging: bool = field( + default=False, + metadata={ + "help": "when using Hydra, reset the logging at the beginning of training" + }, + ) + suppress_crashes: bool = field( + default=False, + metadata={ + "help": "suppress crashes when training with the hydra_train entry point so that the " + "main method can return a value (useful for sweeps)" + }, + ) + use_plasma_view: bool = field( + default=False, metadata={"help": "Store indices and sizes in shared memory"} + ) + plasma_path: Optional[str] = field( + default="/tmp/plasma", + metadata={ + "help": "path to run plasma_store, defaults to /tmp/plasma. Paths outside /tmp tend to fail." + }, + ) @dataclass @@ -130,6 +260,12 @@ class DistributedTrainingConfig(FairseqDataclass): "help": "total number of GPUs across all nodes (default: all visible GPUs)" }, ) + distributed_num_procs: Optional[int] = field( + default=max(1, torch.cuda.device_count()), + metadata={ + "help": "total number of processes to fork (default: all visible GPUs)" + }, + ) distributed_rank: Optional[int] = field( default=0, metadata={"help": "rank of the current worker"} ) @@ -150,12 +286,11 @@ class DistributedTrainingConfig(FairseqDataclass): }, ) device_id: int = field( - default=0, - metadata={"help": "which GPU to use (usually configured automatically)"}, - ) - local_rank: int = field( - default=0, - metadata={"help": "which GPU to use (usually configured automatically)"}, + default=os.getenv("LOCAL_RANK", 0), + metadata={ + "help": "which GPU to use (by default looks for $LOCAL_RANK, usually configured automatically)", + "argparse_alias": "--local_rank", + }, ) distributed_no_spawn: bool = field( default=False, @@ -164,7 +299,10 @@ class DistributedTrainingConfig(FairseqDataclass): }, ) ddp_backend: DDP_BACKEND_CHOICES = field( - default="c10d", metadata={"help": "DistributedDataParallel backend"} + default="pytorch_ddp", metadata={"help": "DistributedDataParallel backend"} + ) + ddp_comm_hook: DDP_COMM_HOOK_CHOICES = field( + default="none", metadata={"help": "communication hook"} ) bucket_cap_mb: int = field( default=25, metadata={"help": "bucket size for reduction"} @@ -180,13 +318,27 @@ class DistributedTrainingConfig(FairseqDataclass): default=False, metadata={ "help": "disable unused parameter detection (not applicable to " - "no_c10d ddp-backend" + "--ddp-backend=legacy_ddp)" + }, + ) + gradient_as_bucket_view: bool = field( + default=False, + metadata={ + "help": "when set to True, gradients will be views pointing to different offsets of 
allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. " + "--gradient-as-bucket-view=gradient_as_bucket_view)" }, ) fast_stat_sync: bool = field( default=False, metadata={"help": "[deprecated] this is now defined per Criterion"}, ) + heartbeat_timeout: int = field( + default=-1, + metadata={ + "help": "kill the job if no progress is made in N seconds; " + "set to -1 to disable" + }, + ) broadcast_buffers: bool = field( default=False, metadata={ @@ -194,9 +346,6 @@ class DistributedTrainingConfig(FairseqDataclass): "batchnorm population statistics" }, ) - distributed_wrapper: DISTRIBUTED_WRAPPER_CHOICES = field( - default="DDP", metadata={"help": "DistributedDataParallel backend"} - ) slowmo_momentum: Optional[float] = field( default=None, metadata={ @@ -204,8 +353,14 @@ class DistributedTrainingConfig(FairseqDataclass): "0.2 for 32 GPUs; 0.5 for 64 GPUs, 0.6 for > 64 GPUs" }, ) - slowmo_algorithm: str = field( - default="LocalSGD", metadata={"help": "whether to use LocalSGD or SGP"} + slowmo_base_algorithm: str = field( + default="localsgd", + metadata={ + "help": "Base algorithm. Either 'localsgd' or 'sgp'. Please refer " + "to the documentation of 'slowmo_base_algorithm' parameter in " + "https://fairscale.readthedocs.io/en/latest/api/experimental/nn/slowmo_ddp.html " + "for more details" + }, ) localsgd_frequency: int = field( default=3, metadata={"help": "Local SGD allreduce frequency"} @@ -280,7 +435,29 @@ class DistributedTrainingConfig(FairseqDataclass): zero_sharding: ZERO_SHARDING_CHOICES = field( default="none", metadata={"help": "ZeRO sharding"} ) + fp16: bool = II("common.fp16") + memory_efficient_fp16: bool = II("common.memory_efficient_fp16") tpu: bool = II("common.tpu") + # configuration for --ddp-backend=fully_sharded + no_reshard_after_forward: bool = field( + default=False, + metadata={"help": "don't reshard parameters after forward pass"}, + ) + fp32_reduce_scatter: bool = field( + default=False, + metadata={"help": "reduce-scatter grads in FP32"}, + ) + cpu_offload: bool = field( + default=False, metadata={"help": "offload FP32 params to CPU"} + ) + use_sharded_state: bool = field( + default=False, + metadata={"help": "use sharded checkpoint files"}, + ) + not_fsdp_flatten_parameters: bool = field( + default=False, + metadata={"help": "not flatten parameter param for fsdp"}, + ) @dataclass @@ -311,7 +488,7 @@ class DatasetConfig(FairseqDataclass): "help": "maximum sequence length in batch will be a multiplier of this value" }, ) - dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = field( + dataset_impl: Optional[DATASET_IMPL_CHOICES] = field( default=None, metadata={"help": "output dataset implementation"} ) data_buffer_size: int = field( @@ -328,6 +505,19 @@ class DatasetConfig(FairseqDataclass): " (e.g. train, valid, test)" }, ) + combine_valid_subsets: Optional[bool] = field( + default=None, + metadata={ + "help": "comma separated list of data subsets to use for validation" + " (e.g. 
train, valid, test)", + "argparse_alias": "--combine-val", + }, + ) + ignore_unused_valid_subsets: Optional[bool] = field( + default=False, + metadata={"help": "do not raise error if valid subsets are ignored"}, + ) + validate_interval: int = field( default=1, metadata={"help": "validate every N epochs"} ) @@ -344,19 +534,23 @@ class DatasetConfig(FairseqDataclass): default=False, metadata={"help": "disable validation"} ) max_tokens_valid: Optional[int] = field( - default=None, + default=II("dataset.max_tokens"), metadata={ "help": "maximum number of tokens in a validation batch" " (defaults to --max-tokens)" }, ) batch_size_valid: Optional[int] = field( - default=None, + default=II("dataset.batch_size"), metadata={ - "help": "batch size of the validation batch" " (defaults to --batch-size)", + "help": "batch size of the validation batch (defaults to --batch-size)", "argparse_alias": "--max-sentences-valid", }, ) + max_valid_steps: Optional[int] = field( + default=None, + metadata={"help": "How many batches to evaluate", "argparse_alias": "--nval"}, + ) curriculum: int = field( default=0, metadata={"help": "don't shuffle batches for first N epochs"} ) @@ -370,6 +564,24 @@ class DatasetConfig(FairseqDataclass): shard_id: int = field( default=0, metadata={"help": "id of the shard to generate (id < num_shards)"} ) + grouped_shuffling: bool = field( + default=False, + metadata={ + "help": "shuffle batches in groups of num_shards to enable similar sequence lengths on each GPU worker when batches are sorted by length", + }, + ) + update_epoch_batch_itr: bool = field( + default=II("dataset.grouped_shuffling"), + metadata={ + "help": "if true then prevents the reuse the epoch batch iterator by setting can_reuse_epoch_itr to false, defaults to --grouped-shuffling )", + }, + ) + update_ordered_indices_seed: bool = field( + default=False, + metadata={ + "help": "if true then increment seed with epoch for getting batch iterators, defautls to False.", + }, + ) @dataclass @@ -387,7 +599,7 @@ class OptimizationConfig(FairseqDataclass): }, ) clip_norm: float = field( - default=25.0, metadata={"help": "clip threshold of gradients"} + default=0.0, metadata={"help": "clip threshold of gradients"} ) sentence_avg: bool = field( default=False, @@ -407,7 +619,7 @@ class OptimizationConfig(FairseqDataclass): " (note: this may be interpreted differently depending on --lr-scheduler)" }, ) - min_lr: float = field( + stop_min_lr: float = field( default=-1.0, metadata={"help": "stop training when the learning rate reaches this minimum"}, ) @@ -417,6 +629,14 @@ class OptimizationConfig(FairseqDataclass): "help": "specify global optimizer for syncing models on different GPUs/shards" }, ) + skip_remainder_batch: Optional[bool] = field( + default=False, + metadata={ + "help": "if set, include the last (partial) batch of each epoch in training" + " (default is to skip it)." 
+ }, + ) + debug_param_names: bool = False @dataclass @@ -431,6 +651,12 @@ class CheckpointConfig(FairseqDataclass): "(default: <save-dir>/checkpoint_last.pt" }, ) + continue_once: Optional[str] = field( + default=None, + metadata={ + "help": "continues from this checkpoint, unless a checkpoint indicated in 'restore_file' option is present" + }, + ) finetune_from_model: Optional[str] = field( default=None, metadata={ @@ -475,6 +701,14 @@ class CheckpointConfig(FairseqDataclass): "help": "keep the last N checkpoints saved with --save-interval-updates" }, ) + keep_interval_updates_pattern: int = field( + default=-1, + metadata={ + "help": "when used with --keep-interval-updates, skips deleting " + "any checkpoints with update X where " + "X %% keep_interval_updates_pattern == 0" + }, + ) keep_last_epochs: int = field( default=-1, metadata={"help": "keep last N epoch checkpoints"} ) @@ -525,8 +759,51 @@ class CheckpointConfig(FairseqDataclass): "the checkpoint" }, ) + load_checkpoint_on_all_dp_ranks: bool = field( + default=False, + metadata={ + "help": "load checkpoints on all data parallel devices " + "(default: only load on rank 0 and broadcast to other devices)" + }, + ) + write_checkpoints_asynchronously: bool = field( + default=False, + metadata={ + "help": ( + "Write checkpoints asynchronously in a separate " + "thread. NOTE: This feature is currently being tested." + ), + "argparse_alias": "--save-async", + }, + ) model_parallel_size: int = II("common.model_parallel_size") - distributed_rank: int = II("distributed_training.distributed_rank") + + +@dataclass +class FairseqBMUFConfig(FairseqDataclass): + block_lr: float = field( + default=1, metadata={"help": "block learning rate for bmuf"} + ) + block_momentum: float = field( + default=0.875, metadata={"help": "block momentum for bmuf"} + ) + global_sync_iter: int = field( + default=50, metadata={"help": "Iteration for syncing global model"} + ) + warmup_iterations: int = field( + default=500, metadata={"help": "warmup iterations for model to broadcast"} + ) + use_nbm: bool = field( + default=False, + metadata={"help": "Specify whether you want to use classical BM / Nesterov BM"}, + ) + average_sync: bool = field( + default=False, + metadata={ + "help": "Specify whether you want to average the local momentum after each sync" + }, + ) + distributed_world_size: int = II("distributed_training.distributed_world_size") @dataclass @@ -535,6 +812,10 @@ class GenerationConfig(FairseqDataclass): default=5, metadata={"help": "beam size"}, ) + beam_mt: int = field( + default=0, + metadata={"help": "beam size for the first-pass decoder"}, + ) nbest: int = field( default=1, metadata={"help": "number of hypotheses to output"}, @@ -551,6 +832,18 @@ class GenerationConfig(FairseqDataclass): "help": "generate sequences of maximum length ax + b, where x is the source length" }, ) + max_len_a_mt: float = field( + default=0, + metadata={ + "help": "generate sequences of maximum length ax + b, where x is the source length for the first-pass decoder" + }, + ) + max_len_b_mt: int = field( + default=200, + metadata={ + "help": "generate sequences of maximum length ax + b, where x is the source length for the first-pass decoder" + }, + ) min_len: int = field( default=1, metadata={"help": "minimum generation length"}, @@ -577,6 +870,12 @@ class GenerationConfig(FairseqDataclass): "help": "length penalty: <1.0 favors shorter, >1.0 favors longer sentences" }, ) + lenpen_mt: float = field( + default=1, + metadata={ + "help": "length penalty for the first-pass 
decoder: <1.0 favors shorter, >1.0 favors longer sentences" + }, + ) unkpen: float = field( default=0, metadata={ @@ -645,10 +944,12 @@ class GenerationConfig(FairseqDataclass): default=-1.0, metadata={"help": "strength of diversity penalty for Diverse Siblings Search"}, ) - print_alignment: bool = field( - default=False, + print_alignment: Optional[PRINT_ALIGNMENT_CHOICES] = field( + default=None, metadata={ - "help": "if set, uses attention feedback to compute and print alignment to source tokens" + "help": "if set, uses attention feedback to compute and print alignment to source tokens " + "(valid options are: hard, soft, otherwise treated as hard alignment)", + "argparse_const": "hard", }, ) print_step: bool = field( @@ -701,7 +1002,9 @@ class GenerationConfig(FairseqDataclass): default=False, metadata={"help": "Use dropout at inference time"}, ) - retain_dropout_modules: Optional[List[str]] = field( + # temporarily set to Any until https://github.com/facebookresearch/hydra/issues/1117 is fixed + # retain_dropout_modules: Optional[List[str]] = field( + retain_dropout_modules: Any = field( default=None, metadata={ "help": "if set, only retain dropout for the specified modules; " @@ -717,6 +1020,10 @@ class GenerationConfig(FairseqDataclass): default=False, metadata={"help": "if set, dont use seed for initializing random generators"}, ) + eos_token: Optional[str] = field( + default=None, + metadata={"help": "EOS token"}, + ) @dataclass @@ -728,9 +1035,11 @@ class CommonEvalConfig(FairseqDataclass): post_process: Optional[str] = field( default=None, metadata={ - "help": "post-process text by removing pre-processing such as BPE, letter segmentation, etc " - "(valid options are: sentencepiece, wordpiece, letter, _EOW, none, otherwise treated as BPE symbol)", - "argparse_const": "@@ ", + "help": ( + "post-process text by removing BPE, letter segmentation, etc. " + "Valid options can be found in fairseq.data.utils.post_process." + ), + "argparse_const": "subword_nmt", "argparse_alias": "--remove-bpe", }, ) @@ -788,135 +1097,51 @@ class InteractiveConfig(FairseqDataclass): ) -CONFIGS = { - "common": CommonConfig, - "common_eval": CommonEvalConfig, - "distributed_training": DistributedTrainingConfig, - "dataset": DatasetConfig, - "optimization": OptimizationConfig, - "checkpoint": CheckpointConfig, - "bmuf": FairseqBMUFConfig, - "generation": GenerationConfig, - "eval_lm": EvalLMConfig, - "interactive": InteractiveConfig, -} - - -def register_module_dataclass( - cs: ConfigStore, registry: Dict[str, Any], group: str -) -> None: - """register dataclasses defined in modules in config store, for example, in migrated tasks, models, etc.""" - # note that if `group == model`, we register all model archs, not the model name. 
- for k, v in registry.items(): - node_ = v() - node_._name = k - cs.store(name=k, group=group, node=node_, provider="fairseq") - - -def register_hydra_cfg(cs: ConfigStore, name: str = "default") -> None: - """cs: config store instance, register common training configs""" - - for k, v in CONFIGS.items(): - try: - cs.store(name=k, node=v()) - except BaseException: - logger.error(f"{k} - {v()}") - raise - - register_module_dataclass(cs, TASK_DATACLASS_REGISTRY, "task") - register_module_dataclass(cs, MODEL_DATACLASS_REGISTRY, "model") - - for k, v in REGISTRIES.items(): - register_module_dataclass(cs, v["dataclass_registry"], k) - - -def _override_attr( - sub_node: str, data_class: Type[FairseqDataclass], args: Namespace -) -> List[str]: - overrides = [] - - def get_default(f): - if not isinstance(f.default_factory, _MISSING_TYPE): - return f.default_factory() - return f.default - - for k, v in data_class.__dataclass_fields__.items(): - if k.startswith("_"): - # private member, skip - continue +@dataclass +class EMAConfig(FairseqDataclass): + store_ema: bool = field( + default=False, metadata={help: "store exponential moving average shadow model"} + ) + ema_decay: float = field( + default=0.9999, metadata={"help": "decay for exponential moving average model"} + ) + ema_start_update: int = field( + default=0, metadata={"help": "start EMA update after this many model updates"} + ) + ema_seed_model: Optional[str] = field( + default=None, + metadata={ + "help": "Seed to load EMA model from. " + "Used to load EMA model separately from the actual model." + }, + ) + ema_update_freq: int = field( + default=1, metadata={"help": "Do EMA update every this many model updates"} + ) + ema_fp32: bool = field( + default=False, + metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, + ) - val = get_default(v) if not hasattr(args, k) else getattr(args, k) - if val is None: - overrides.append("{}.{}=null".format(sub_node, k)) - elif val == "": - overrides.append("{}.{}=''".format(sub_node, k)) - elif isinstance(val, str): - overrides.append("{}.{}='{}'".format(sub_node, k, val)) - else: - overrides.append("{}.{}={}".format(sub_node, k, val)) - return overrides - - -def migrate_registry( - name, value, registry, args, overrides, deletes, use_name_as_val=False -): - if value in registry: - overrides.append("{}={}".format(name, value)) - overrides.append("{}._name={}".format(name, value)) - overrides.extend(_override_attr(name, registry[value], args)) - elif use_name_as_val and value is not None: - overrides.append("{}={}".format(name, value)) - else: - deletes.append(name) - - -def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]: - """use the field in args to overrides those in cfg""" - overrides = [] - deletes = [] - - for k, v in CONFIGS.items(): - overrides.extend(_override_attr(k, v, args)) - - if args is not None: - if hasattr(args, "task"): - migrate_registry( - "task", args.task, TASK_DATACLASS_REGISTRY, args, overrides, deletes - ) - else: - deletes.append("task") - - # these options will be set to "None" if they have not yet been migrated - # so we can populate them with the entire flat args - CORE_REGISTRIES = {"criterion", "optimizer", "lr_scheduler"} - - for k, v in REGISTRIES.items(): - if hasattr(args, k): - migrate_registry( - k, - getattr(args, k), - v["dataclass_registry"], - args, - overrides, - deletes, - use_name_as_val=k not in CORE_REGISTRIES, - ) - else: - deletes.append(k) - - no_dc = True - if hasattr(args, "arch"): - if args.arch in 
ARCH_MODEL_REGISTRY: - m_cls = ARCH_MODEL_REGISTRY[args.arch] - dc = getattr(m_cls, "__dataclass", None) - if dc is not None: - overrides.append("model={}".format(args.arch)) - overrides.append("model._name={}".format(args.arch)) - # override model params with those exist in args - overrides.extend(_override_attr("model", dc, args)) - no_dc = False - if no_dc: - deletes.append("model") - - return overrides, deletes +@dataclass +class FairseqConfig(FairseqDataclass): + common: CommonConfig = CommonConfig() + common_eval: CommonEvalConfig = CommonEvalConfig() + distributed_training: DistributedTrainingConfig = DistributedTrainingConfig() + dataset: DatasetConfig = DatasetConfig() + optimization: OptimizationConfig = OptimizationConfig() + checkpoint: CheckpointConfig = CheckpointConfig() + bmuf: FairseqBMUFConfig = FairseqBMUFConfig() + generation: GenerationConfig = GenerationConfig() + eval_lm: EvalLMConfig = EvalLMConfig() + interactive: InteractiveConfig = InteractiveConfig() + model: Any = MISSING + task: Any = None + criterion: Any = None + optimizer: Any = None + lr_scheduler: Any = None + scoring: Any = None + bpe: Any = None + tokenizer: Any = None + ema: EMAConfig = EMAConfig() diff --git a/fairseq/dataclass/constants.py b/fairseq/dataclass/constants.py index 2fd87f5fc4..5af92f2b3a 100644 --- a/fairseq/dataclass/constants.py +++ b/fairseq/dataclass/constants.py @@ -3,13 +3,54 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from fairseq.dataclass.utils import ChoiceEnum +from enum import Enum, EnumMeta +from typing import List + + +class StrEnumMeta(EnumMeta): + # this is workaround for submitit pickling leading to instance checks failing in hydra for StrEnum, see + # https://github.com/facebookresearch/hydra/issues/1156 + @classmethod + def __instancecheck__(cls, other): + return "enum" in str(type(other)) + + +class StrEnum(Enum, metaclass=StrEnumMeta): + def __str__(self): + return self.value + + def __eq__(self, other: str): + return self.value == other + + def __repr__(self): + return self.value + + def __hash__(self): + return hash(str(self)) + + +def ChoiceEnum(choices: List[str]): + """return the Enum class used to enforce list of choices""" + return StrEnum("Choices", {k: k for k in choices}) LOG_FORMAT_CHOICES = ChoiceEnum(["json", "none", "simple", "tqdm"]) -DDP_BACKEND_CHOICES = ChoiceEnum(["c10d", "no_c10d"]) -DISTRIBUTED_WRAPPER_CHOICES = ChoiceEnum(["DDP", "SlowMo"]) +DDP_BACKEND_CHOICES = ChoiceEnum( + [ + "c10d", # alias for pytorch_ddp + "fully_sharded", # FullyShardedDataParallel from fairscale + "legacy_ddp", + "no_c10d", # alias for legacy_ddp + "pytorch_ddp", + "slowmo", + ] +) +DDP_COMM_HOOK_CHOICES = ChoiceEnum(["none", "fp16"]) +DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta", "huffman"]) GENERATION_CONSTRAINTS_CHOICES = ChoiceEnum(["ordered", "unordered"]) -GENERATION_DECODING_FORMAT_CHOICES = ChoiceEnum(["unigram", "ensemble", "vote", "dp", "bs"]) +GENERATION_DECODING_FORMAT_CHOICES = ChoiceEnum( + ["unigram", "ensemble", "vote", "dp", "bs"] +) ZERO_SHARDING_CHOICES = ChoiceEnum(["none", "os"]) PIPELINE_CHECKPOINT_CHOICES = ChoiceEnum(["always", "never", "except_last"]) +PRINT_ALIGNMENT_CHOICES = ChoiceEnum(["hard", "soft"]) diff --git a/fairseq/dataclass/initialize.py b/fairseq/dataclass/initialize.py new file mode 100644 index 0000000000..5a7784bad1 --- /dev/null +++ b/fairseq/dataclass/initialize.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""isort:skip_file""" + +import logging +from hydra.core.config_store import ConfigStore +from fairseq.dataclass.configs import FairseqConfig +from omegaconf import DictConfig, OmegaConf + + +logger = logging.getLogger(__name__) + + +def hydra_init(cfg_name="config") -> None: + + cs = ConfigStore.instance() + cs.store(name=f"{cfg_name}", node=FairseqConfig) + + for k in FairseqConfig.__dataclass_fields__: + v = FairseqConfig.__dataclass_fields__[k].default + try: + cs.store(name=k, node=v) + except BaseException: + logger.error(f"{k} - {v}") + raise + + +def add_defaults(cfg: DictConfig) -> None: + """This function adds default values that are stored in dataclasses that hydra doesn't know about""" + + from fairseq.registry import REGISTRIES + from fairseq.tasks import TASK_DATACLASS_REGISTRY + from fairseq.models import ARCH_MODEL_NAME_REGISTRY, MODEL_DATACLASS_REGISTRY + from fairseq.dataclass.utils import merge_with_parent + from typing import Any + + OmegaConf.set_struct(cfg, False) + + for k, v in FairseqConfig.__dataclass_fields__.items(): + field_cfg = cfg.get(k) + if field_cfg is not None and v.type == Any: + dc = None + + if isinstance(field_cfg, str): + field_cfg = DictConfig({"_name": field_cfg}) + field_cfg.__dict__["_parent"] = field_cfg.__dict__["_parent"] + + name = getattr(field_cfg, "_name", None) + + if k == "task": + dc = TASK_DATACLASS_REGISTRY.get(name) + elif k == "model": + name = ARCH_MODEL_NAME_REGISTRY.get(name, name) + dc = MODEL_DATACLASS_REGISTRY.get(name) + elif k in REGISTRIES: + dc = REGISTRIES[k]["dataclass_registry"].get(name) + + if dc is not None: + cfg[k] = merge_with_parent(dc, field_cfg) diff --git a/fairseq/dataclass/utils.py b/fairseq/dataclass/utils.py index 9c501c5b00..f6467d5f40 100644 --- a/fairseq/dataclass/utils.py +++ b/fairseq/dataclass/utils.py @@ -4,14 +4,22 @@ # LICENSE file in the root directory of this source tree. 
import ast +import inspect +import logging +import os +import re from argparse import ArgumentError, ArgumentParser, Namespace -from dataclasses import _MISSING_TYPE, MISSING, dataclass +from dataclasses import _MISSING_TYPE, MISSING, is_dataclass from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple, Type +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.configs import FairseqConfig from hydra.core.global_hydra import GlobalHydra from hydra.experimental import compose, initialize -from omegaconf import DictConfig, OmegaConf, open_dict +from omegaconf import DictConfig, OmegaConf, open_dict, _utils + +logger = logging.getLogger(__name__) def eval_str_list(x, x_type=float): @@ -27,102 +35,46 @@ def eval_str_list(x, x_type=float): return [x_type(x)] -class StrEnum(Enum): - def __str__(self): - return self.value - - def __eq__(self, other: str): - return self.value == other - - def __repr__(self): - return self.value - - -def ChoiceEnum(choices: List[str]): - """return the Enum class used to enforce list of choices""" - return StrEnum("Choices", {k: k for k in choices}) - - -@dataclass -class FairseqDataclass: - """fairseq base dataclass that supported fetching attributes and metas""" - - _name: Optional[str] = None - - @staticmethod - def name(): - return None - - def _get_all_attributes(self) -> List[str]: - return [k for k in self.__dataclass_fields__.keys()] - - def _get_meta( - self, attribute_name: str, meta: str, default: Optional[Any] = None - ) -> Any: - return self.__dataclass_fields__[attribute_name].metadata.get(meta, default) - - def _get_name(self, attribute_name: str) -> str: - return self.__dataclass_fields__[attribute_name].name - - def _get_default(self, attribute_name: str) -> Any: - if hasattr(self, attribute_name): - if str(getattr(self, attribute_name)).startswith("${"): - return str(getattr(self, attribute_name)) - elif str(self.__dataclass_fields__[attribute_name].default).startswith( - "${" - ): - return str(self.__dataclass_fields__[attribute_name].default) - elif ( - getattr(self, attribute_name) - != self.__dataclass_fields__[attribute_name].default - ): - return getattr(self, attribute_name) - - f = self.__dataclass_fields__[attribute_name] - if not isinstance(f.default_factory, _MISSING_TYPE): - return f.default_factory() - return f.default - - def _get_type(self, attribute_name: str) -> Any: - return self.__dataclass_fields__[attribute_name].type - - def _get_help(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "help") - - def _get_argparse_const(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "argparse_const") +def interpret_dc_type(field_type): + if isinstance(field_type, str): + raise RuntimeError("field should be a type") - def _get_argparse_alias(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "argparse_alias") + if field_type == Any: + return str - def _get_choices(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "choices") + typestring = str(field_type) + if re.match( + r"(typing.|^)Union\[(.*), NoneType\]$", typestring + ) or typestring.startswith("typing.Optional"): + return field_type.__args__[0] + return field_type def gen_parser_from_dataclass( parser: ArgumentParser, dataclass_instance: FairseqDataclass, delete_default: bool = False, + with_prefix: Optional[str] = None, ) -> None: - """convert a dataclass instance to tailing parser arguments""" - import re 
+ """ + convert a dataclass instance to tailing parser arguments. + + If `with_prefix` is provided, prefix all the keys in the resulting parser with it. It means that we are + building a flat namespace from a structured dataclass (see transformer_config.py for example). + """ def argparse_name(name: str): - if name == "data": - # normally data is positional args + if name == "data" and (with_prefix is None or with_prefix == ""): + # normally data is positional args, so we don't add the -- nor the prefix return name if name == "_name": # private member, skip return None - return "--" + name.replace("_", "-") - - def interpret_dc_type(field_type): - if isinstance(field_type, str): - raise RuntimeError("field should be a type") - typestring = str(field_type) - if re.match(r"(typing.|^)Union\[(.*), NoneType\]$", typestring): - return field_type.__args__[0] - return field_type + full_name = "--" + name.replace("_", "-") + if with_prefix is not None and with_prefix != "": + # if a prefix is specified, construct the prefixed arg name + full_name = with_prefix + "-" + full_name[2:] # strip -- when composing + return full_name def get_kwargs_from_dc( dataclass_instance: FairseqDataclass, k: str @@ -151,9 +103,10 @@ def get_kwargs_from_dc( kwargs["required"] = True if field_choices is not None: kwargs["choices"] = field_choices - if (isinstance(inter_type, type) and issubclass(inter_type, List)) or ( - "List" in str(inter_type) - ): + if ( + isinstance(inter_type, type) + and (issubclass(inter_type, List) or issubclass(inter_type, Tuple)) + ) or ("List" in str(inter_type) or "Tuple" in str(inter_type)): if "int" in str(inter_type): kwargs["type"] = lambda x: eval_str_list(x, int) elif "float" in str(inter_type): @@ -161,7 +114,9 @@ def get_kwargs_from_dc( elif "str" in str(inter_type): kwargs["type"] = lambda x: eval_str_list(x, str) else: - raise NotImplementedError() + raise NotImplementedError( + "parsing of type " + str(inter_type) + " is not implemented" + ) if field_default is not MISSING: kwargs["default"] = ( ",".join(map(str, field_default)) @@ -187,6 +142,10 @@ def get_kwargs_from_dc( if field_default is not MISSING: kwargs["default"] = field_default + # build the help with the hierarchical prefix + if with_prefix is not None and with_prefix != "" and field_help is not None: + field_help = with_prefix[2:] + ": " + field_help + kwargs["help"] = field_help if field_const is not None: kwargs["const"] = field_const @@ -196,8 +155,20 @@ def get_kwargs_from_dc( for k in dataclass_instance._get_all_attributes(): field_name = argparse_name(dataclass_instance._get_name(k)) + field_type = dataclass_instance._get_type(k) if field_name is None: continue + elif inspect.isclass(field_type) and issubclass(field_type, FairseqDataclass): + # for fields that are of type FairseqDataclass, we can recursively + # add their fields to the namespace (so we add the args from model, task, etc. to the root namespace) + prefix = None + if with_prefix is not None: + # if a prefix is specified, then we don't want to copy the subfields directly to the root namespace + # but we prefix them with the name of the current field. 
+ prefix = field_name + gen_parser_from_dataclass(parser, field_type(), delete_default, prefix) + continue + kwargs = get_kwargs_from_dc(dataclass_instance, k) field_args = [field_name] @@ -209,8 +180,12 @@ def get_kwargs_from_dc( if isinstance(kwargs["default"], str) and kwargs["default"].startswith( "${" ): - continue - if delete_default: + if kwargs["help"] is None: + # this is a field with a name that will be added elsewhere + continue + else: + del kwargs["default"] + if delete_default and "default" in kwargs: del kwargs["default"] try: parser.add_argument(*field_args, **kwargs) @@ -241,21 +216,193 @@ def _set_legacy_defaults(args, cls): setattr(args, key, default_value) +def _override_attr( + sub_node: str, data_class: Type[FairseqDataclass], args: Namespace +) -> List[str]: + overrides = [] + + if not inspect.isclass(data_class) or not issubclass(data_class, FairseqDataclass): + return overrides + + def get_default(f): + if not isinstance(f.default_factory, _MISSING_TYPE): + return f.default_factory() + return f.default + + for k, v in data_class.__dataclass_fields__.items(): + if k.startswith("_"): + # private member, skip + continue + + val = get_default(v) if not hasattr(args, k) else getattr(args, k) + + field_type = interpret_dc_type(v.type) + if ( + isinstance(val, str) + and not val.startswith("${") # not interpolation + and field_type != str + and ( + not inspect.isclass(field_type) or not issubclass(field_type, Enum) + ) # not choices enum + ): + # upgrade old models that stored complex parameters as string + val = ast.literal_eval(val) + + if isinstance(val, tuple): + val = list(val) + + v_type = getattr(v.type, "__origin__", None) + if ( + (v_type is List or v_type is list or v_type is Optional) + # skip interpolation + and not (isinstance(val, str) and val.startswith("${")) + ): + # if type is int but val is float, then we will crash later - try to convert here + if hasattr(v.type, "__args__"): + t_args = v.type.__args__ + if len(t_args) == 1 and (t_args[0] is float or t_args[0] is int): + val = list(map(t_args[0], val)) + elif val is not None and ( + field_type is int or field_type is bool or field_type is float + ): + try: + val = field_type(val) + except: + pass # ignore errors here, they are often from interpolation args + + if val is None: + overrides.append("{}.{}=null".format(sub_node, k)) + elif val == "": + overrides.append("{}.{}=''".format(sub_node, k)) + elif isinstance(val, str): + val = val.replace("'", r"\'") + overrides.append("{}.{}='{}'".format(sub_node, k, val)) + elif isinstance(val, FairseqDataclass): + overrides += _override_attr(f"{sub_node}.{k}", type(val), args) + elif isinstance(val, Namespace): + sub_overrides, _ = override_module_args(val) + for so in sub_overrides: + overrides.append(f"{sub_node}.{k}.{so}") + else: + overrides.append("{}.{}={}".format(sub_node, k, val)) + + return overrides + + +def migrate_registry( + name, value, registry, args, overrides, deletes, use_name_as_val=False +): + if value in registry: + overrides.append("{}={}".format(name, value)) + overrides.append("{}._name={}".format(name, value)) + overrides.extend(_override_attr(name, registry[value], args)) + elif use_name_as_val and value is not None: + overrides.append("{}={}".format(name, value)) + else: + deletes.append(name) + + +def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]: + """use the field in args to overrides those in cfg""" + overrides = [] + deletes = [] + + for k in FairseqConfig.__dataclass_fields__.keys(): + overrides.extend( + 
_override_attr(k, FairseqConfig.__dataclass_fields__[k].type, args) + ) + + if args is not None: + if hasattr(args, "task"): + from fairseq.tasks import TASK_DATACLASS_REGISTRY + + migrate_registry( + "task", args.task, TASK_DATACLASS_REGISTRY, args, overrides, deletes + ) + else: + deletes.append("task") + + # these options will be set to "None" if they have not yet been migrated + # so we can populate them with the entire flat args + CORE_REGISTRIES = {"criterion", "optimizer", "lr_scheduler"} + + from fairseq.registry import REGISTRIES + + for k, v in REGISTRIES.items(): + if hasattr(args, k): + migrate_registry( + k, + getattr(args, k), + v["dataclass_registry"], + args, + overrides, + deletes, + use_name_as_val=k not in CORE_REGISTRIES, + ) + else: + deletes.append(k) + + no_dc = True + if hasattr(args, "arch"): + from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_MODEL_NAME_REGISTRY + + if args.arch in ARCH_MODEL_REGISTRY: + m_cls = ARCH_MODEL_REGISTRY[args.arch] + dc = getattr(m_cls, "__dataclass", None) + if dc is not None: + m_name = ARCH_MODEL_NAME_REGISTRY[args.arch] + overrides.append("model={}".format(m_name)) + overrides.append("model._name={}".format(args.arch)) + # override model params with those exist in args + overrides.extend(_override_attr("model", dc, args)) + no_dc = False + if no_dc: + deletes.append("model") + + return overrides, deletes + + +class omegaconf_no_object_check: + def __init__(self): + # Changed in https://github.com/omry/omegaconf/pull/911 - both are kept for back compat. + if hasattr(_utils, "is_primitive_type"): + self.old_is_primitive = _utils.is_primitive_type + else: + self.old_is_primitive = _utils.is_primitive_type_annotation + + def __enter__(self): + if hasattr(_utils, "is_primitive_type"): + _utils.is_primitive_type = lambda _: True + else: + _utils.is_primitive_type_annotation = lambda _: True + + def __exit__(self, type, value, traceback): + if hasattr(_utils, "is_primitive_type"): + _utils.is_primitive_type = self.old_is_primitive + else: + _utils.is_primitive_type_annotation = self.old_is_primitive + + def convert_namespace_to_omegaconf(args: Namespace) -> DictConfig: - from fairseq.dataclass.data_class import override_module_args + """Convert a flat argparse.Namespace to a structured DictConfig.""" # Here we are using field values provided in args to override counterparts inside config object overrides, deletes = override_module_args(args) - cfg_name = "config" - cfg_path = f"../../{cfg_name}" + # configs will be in fairseq/config after installation + config_path = os.path.join("..", "config") - if not GlobalHydra().is_initialized(): - initialize(config_path=cfg_path) + GlobalHydra.instance().clear() + + with initialize(config_path=config_path): + try: + composed_cfg = compose("config", overrides=overrides, strict=False) + except: + logger.error("Error when composing. 
Overrides: " + str(overrides)) + raise - composed_cfg = compose(cfg_name, overrides=overrides, strict=False) - for k in deletes: - composed_cfg[k] = None + for k in deletes: + composed_cfg[k] = None cfg = OmegaConf.create( OmegaConf.to_container(composed_cfg, resolve=True, enum_to_str=True) @@ -265,64 +412,99 @@ def convert_namespace_to_omegaconf(args: Namespace) -> DictConfig: # omegaconf version that supports object flags, or when we migrate all existing models from omegaconf import _utils - old_primitive = _utils.is_primitive_type - _utils.is_primitive_type = lambda _: True - - if cfg.task is None and getattr(args, "task", None): - cfg.task = Namespace(**vars(args)) - from fairseq.tasks import TASK_REGISTRY - - _set_legacy_defaults(cfg.task, TASK_REGISTRY[args.task]) - cfg.task._name = args.task - if cfg.model is None and getattr(args, "arch", None): - cfg.model = Namespace(**vars(args)) - from fairseq.models import ARCH_MODEL_REGISTRY - - _set_legacy_defaults(cfg.model, ARCH_MODEL_REGISTRY[args.arch]) - cfg.model._name = args.arch - if cfg.optimizer is None and getattr(args, "optimizer", None): - cfg.optimizer = Namespace(**vars(args)) - from fairseq.optim import OPTIMIZER_REGISTRY - - _set_legacy_defaults(cfg.optimizer, OPTIMIZER_REGISTRY[args.optimizer]) - cfg.optimizer._name = args.optimizer - if cfg.lr_scheduler is None and getattr(args, "lr_scheduler", None): - cfg.lr_scheduler = Namespace(**vars(args)) - from fairseq.optim.lr_scheduler import LR_SCHEDULER_REGISTRY - - _set_legacy_defaults(cfg.lr_scheduler, LR_SCHEDULER_REGISTRY[args.lr_scheduler]) - cfg.lr_scheduler._name = args.lr_scheduler - if cfg.criterion is None and getattr(args, "criterion", None): - cfg.criterion = Namespace(**vars(args)) - from fairseq.criterions import CRITERION_REGISTRY - - _set_legacy_defaults(cfg.criterion, CRITERION_REGISTRY[args.criterion]) - cfg.criterion._name = args.criterion - - _utils.is_primitive_type = old_primitive + with omegaconf_no_object_check(): + if cfg.task is None and getattr(args, "task", None): + cfg.task = Namespace(**vars(args)) + from fairseq.tasks import TASK_REGISTRY + + _set_legacy_defaults(cfg.task, TASK_REGISTRY[args.task]) + cfg.task._name = args.task + if cfg.model is None and getattr(args, "arch", None): + cfg.model = Namespace(**vars(args)) + from fairseq.models import ARCH_MODEL_REGISTRY + + _set_legacy_defaults(cfg.model, ARCH_MODEL_REGISTRY[args.arch]) + cfg.model._name = args.arch + if cfg.optimizer is None and getattr(args, "optimizer", None): + cfg.optimizer = Namespace(**vars(args)) + from fairseq.optim import OPTIMIZER_REGISTRY + + _set_legacy_defaults(cfg.optimizer, OPTIMIZER_REGISTRY[args.optimizer]) + cfg.optimizer._name = args.optimizer + if cfg.lr_scheduler is None and getattr(args, "lr_scheduler", None): + cfg.lr_scheduler = Namespace(**vars(args)) + from fairseq.optim.lr_scheduler import LR_SCHEDULER_REGISTRY + + _set_legacy_defaults( + cfg.lr_scheduler, LR_SCHEDULER_REGISTRY[args.lr_scheduler] + ) + cfg.lr_scheduler._name = args.lr_scheduler + if cfg.criterion is None and getattr(args, "criterion", None): + cfg.criterion = Namespace(**vars(args)) + from fairseq.criterions import CRITERION_REGISTRY + + _set_legacy_defaults(cfg.criterion, CRITERION_REGISTRY[args.criterion]) + cfg.criterion._name = args.criterion + OmegaConf.set_struct(cfg, True) return cfg -def populate_dataclass( - args: Namespace, dataclass: FairseqDataclass -) -> FairseqDataclass: - for k in dataclass.__dataclass_fields__.keys(): - if k.startswith("_"): - # private member, skip - 
continue - if hasattr(args, k): - setattr(dataclass, k, getattr(args, k)) - - return dataclass - - def overwrite_args_by_name(cfg: DictConfig, overrides: Dict[str, any]): # this will be deprecated when we get rid of argparse and model_overrides logic + from fairseq.registry import REGISTRIES + with open_dict(cfg): for k in cfg.keys(): - if isinstance(cfg[k], DictConfig): - overwrite_args_by_name(cfg[k], overrides) + # "k in cfg" will return false if its a "mandatory value (e.g. ???)" + if k in cfg and isinstance(cfg[k], DictConfig): + if k in overrides and isinstance(overrides[k], dict): + for ok, ov in overrides[k].items(): + if isinstance(ov, dict) and cfg[k][ok] is not None: + overwrite_args_by_name(cfg[k][ok], ov) + else: + cfg[k][ok] = ov + else: + overwrite_args_by_name(cfg[k], overrides) + elif k in cfg and isinstance(cfg[k], Namespace): + for override_key, val in overrides.items(): + setattr(cfg[k], override_key, val) elif k in overrides: - cfg[k] = overrides[k] + if ( + k in REGISTRIES + and overrides[k] in REGISTRIES[k]["dataclass_registry"] + ): + cfg[k] = DictConfig( + REGISTRIES[k]["dataclass_registry"][overrides[k]] + ) + overwrite_args_by_name(cfg[k], overrides) + cfg[k]._name = overrides[k] + else: + cfg[k] = overrides[k] + + +def merge_with_parent(dc: FairseqDataclass, cfg: DictConfig, remove_missing=False): + if remove_missing: + + def remove_missing_rec(src_keys, target_cfg): + if is_dataclass(target_cfg): + target_keys = set(target_cfg.__dataclass_fields__.keys()) + else: + target_keys = set(target_cfg.keys()) + + for k in list(src_keys.keys()): + if k not in target_keys: + del src_keys[k] + elif OmegaConf.is_config(src_keys[k]): + tgt = getattr(target_cfg, k) + if tgt is not None and (is_dataclass(tgt) or hasattr(tgt, "keys")): + remove_missing_rec(src_keys[k], tgt) + + with open_dict(cfg): + remove_missing_rec(cfg, dc) + + merged_cfg = OmegaConf.merge(dc, cfg) + merged_cfg.__dict__["_parent"] = cfg.__dict__["_parent"] + OmegaConf.set_struct(merged_cfg, True) + return merged_cfg diff --git a/fairseq/distributed/__init__.py b/fairseq/distributed/__init__.py new file mode 100644 index 0000000000..9130db8f5d --- /dev/null +++ b/fairseq/distributed/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .distributed_timeout_wrapper import DistributedTimeoutWrapper +from .fully_sharded_data_parallel import ( + fsdp_enable_wrap, + fsdp_wrap, + FullyShardedDataParallel, +) +from .legacy_distributed_data_parallel import LegacyDistributedDataParallel +from .module_proxy_wrapper import ModuleProxyWrapper +from .tpu_distributed_data_parallel import TPUDistributedDataParallel + + +__all__ = [ + "DistributedTimeoutWrapper", + "fsdp_enable_wrap", + "fsdp_wrap", + "FullyShardedDataParallel", + "LegacyDistributedDataParallel", + "ModuleProxyWrapper", + "TPUDistributedDataParallel", +] diff --git a/fairseq/distributed/distributed_timeout_wrapper.py b/fairseq/distributed/distributed_timeout_wrapper.py new file mode 100644 index 0000000000..6e06b4b6dd --- /dev/null +++ b/fairseq/distributed/distributed_timeout_wrapper.py @@ -0,0 +1,97 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
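
The new `fairseq/distributed/__init__.py` above collects the DDP-related wrappers (timeout, FSDP, legacy DDP, module proxy, TPU) under a single `fairseq.distributed` package. The sketch below is a hypothetical illustration of how these wrappers compose around a plain `torch.nn.parallel.DistributedDataParallel` instance; the actual wiring lives inside fairseq's Trainer and may differ, and the helper function, its arguments, and the `heartbeat_timeout` default here are invented for illustration only.

```python
# Hypothetical composition of the wrappers exported by fairseq.distributed.
# Not the actual Trainer code path; names and defaults are illustrative.
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from fairseq.distributed import DistributedTimeoutWrapper, ModuleProxyWrapper


def build_wrapped_model(model: nn.Module, process_group, heartbeat_timeout: int = -1):
    # 1) wrap with vanilla pytorch DDP (what --ddp-backend=pytorch_ddp selects)
    wrapped = DistributedDataParallel(model, process_group=process_group)
    # 2) restore transparent attribute and state_dict access to the inner module
    wrapped = ModuleProxyWrapper(wrapped)
    # 3) optionally kill the job if no forward pass happens for N seconds
    #    (mirrors the new --heartbeat-timeout option; <= 0 disables it)
    if heartbeat_timeout > 0:
        wrapped = DistributedTimeoutWrapper(wrapped, timeout=heartbeat_timeout)
    return wrapped
```
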
+ +import logging +import os +import signal +import threading + +from torch import nn + + +logger = logging.getLogger(__name__) + + +class DistributedTimeoutWrapper(nn.Module): + """ + A wrapper that kills the process if no progress is made within a given + *timeout*. The timer is reset every time :func:`forward` is called. + + Usage:: + + module = DistributedTimeoutWrapper(module, timeout=30) + x = module(input) + time.sleep(20) # safe + x = module(input) + time.sleep(45) # job will be killed before this returns + + Args: + module (nn.Module): module to wrap + timeout (int): number of seconds before killing the process + (set to a value <= 0 to disable the timeout) + signal (Optional): signal to send once timeout is triggered + """ + + def __init__(self, module: nn.Module, timeout: int, signal=signal.SIGINT): + super().__init__() + self.module = module + self.timeout = timeout + self.signal = signal + + if timeout > 0: + self._heartbeat = threading.Event() + self._heartbeat_thread = threading.Thread( + target=self._check_heartbeat, + args=(os.getpid(),), + daemon=True, + ) + self._heartbeat_thread.start() + self._terminated = False + else: + self._heartbeat = None + self._heartbeat_thread = None + + def __del__(self): + self.stop_timeout() + + def __getattr__(self, name): + """Forward missing attributes to wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.module, name) + + def stop_timeout(self): + if self._heartbeat_thread is not None: + self._terminated = True + self._heartbeat_thread.join() + + def state_dict(self, *args, **kwargs): + return self.module.state_dict(*args, **kwargs) + + def load_state_dict(self, *args, **kwargs): + return self.module.load_state_dict(*args, **kwargs) + + def forward(self, *args, **kwargs): + if self._heartbeat is not None: + self._heartbeat.set() + return self.module(*args, **kwargs) + + def _check_heartbeat(self, parent_pid): + self._heartbeat.wait() # wait for the first forward pass + while True: + self._heartbeat.clear() + success = self._heartbeat.wait(timeout=self.timeout) + if self._terminated: + break + elif not success: + logger.error( + ( + "Killing job for not making progress in {} seconds. " + "Set --heartbeat-timeout=-1 to disable this timeout." + ).format(int(self.timeout)) + ) + os.kill(parent_pid, self.signal) + return diff --git a/fairseq/distributed/fully_sharded_data_parallel.py b/fairseq/distributed/fully_sharded_data_parallel.py new file mode 100644 index 0000000000..1c508b05dd --- /dev/null +++ b/fairseq/distributed/fully_sharded_data_parallel.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +from typing import Optional + +import torch +from fairseq.dataclass.configs import DistributedTrainingConfig +from fairseq.distributed import utils as dist_utils + + +try: + from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP + + has_FSDP = True +except ImportError: + FSDP = torch.nn.Module + has_FSDP = False + + +class FullyShardedDataParallel(FSDP): + """ + A small wrapper around fairscale's FullyShardedDataParallel (FSDP) with some + fairseq-specific checkpoint saving/loading logic. + + Args: + use_sharded_state (bool): if True, then ``state_dict`` will return + ``FSDP.local_state_dict`` and ``load_state_dict`` will call + ``FSDP.load_local_state_dict``. 
Otherwise, ``state_dict`` will + return the full model weights on data parallel rank 0 (empty on + other ranks) and ``load_state_dict`` will broadcast model weights + from rank 0 to other ranks. + """ + + def __init__(self, *args, use_sharded_state: bool = False, **kwargs): + if not has_FSDP: + raise ImportError( + "Cannot find FullyShardedDataParallel. " + "Please install fairscale with: pip install fairscale" + ) + super().__init__(*args, **kwargs) + self.use_sharded_state = use_sharded_state + + @property + def unwrapped_module(self) -> torch.nn.Module: + if self.flatten_parameters: + return self.module.module + else: + return self.module + + def state_dict(self, destination=None, prefix="", keep_vars=False): + if self.use_sharded_state: + return super().local_state_dict( + destination=destination, prefix=prefix, keep_vars=keep_vars + ) + else: + if self.rank == 0: + return super().state_dict( + destination=destination, prefix=prefix, keep_vars=keep_vars + ) + else: + # We must call state_dict() due to use of communication + # primitives. But we don't use the result. + super().state_dict() + return destination or {} + + def load_state_dict(self, state_dict, strict=True, model_cfg=None): + if self.use_sharded_state: + return super().load_local_state_dict(state_dict, strict=strict) + else: + state_dict = dist_utils.broadcast_object( + state_dict, src_rank=0, group=self.process_group + ) + return super().load_state_dict(state_dict, strict=strict) + + +class DummyProcessGroup: + def __init__(self, rank: int, size: int): + self._rank = rank + self._size = size + + def rank(self) -> int: + return self._rank + + def size(self) -> int: + return self._size + + +@contextlib.contextmanager +def fsdp_enable_wrap(cfg: DistributedTrainingConfig): + try: + from fairscale.nn import enable_wrap + except ImportError: + raise ImportError( + "Cannot find FullyShardedDataParallel. " + "Please install fairscale with: pip install fairscale" + ) + if cfg.memory_efficient_fp16: + assert cfg.fp16 # memory_efficient_fp16 should imply fp16 + group = dist_utils.get_data_parallel_group() + if group is None and cfg.distributed_world_size == 1: + group = DummyProcessGroup(rank=0, size=1) + fsdp_config = { + "process_group": group, + "reshard_after_forward": not cfg.no_reshard_after_forward, + "mixed_precision": cfg.fp16 and not cfg.memory_efficient_fp16, + "fp32_reduce_scatter": cfg.fp32_reduce_scatter, + "flatten_parameters": not cfg.not_fsdp_flatten_parameters, + "cpu_offload": cfg.cpu_offload, + "compute_dtype": torch.float16 if cfg.fp16 else torch.float32, + "bucket_cap_mb": cfg.bucket_cap_mb, + "state_dict_device": torch.device("cpu"), # reduce GPU mem usage + } + with enable_wrap( + wrapper_cls=FullyShardedDataParallel, + use_sharded_state=cfg.use_sharded_state, + **fsdp_config, + ): + yield + + +def fsdp_wrap(module, min_num_params: Optional[int] = None, **kwargs): + """ + Helper to wrap layers/modules in FSDP. This falls back to a no-op if + fairscale is not available. 
+ + Args: + module (nn.Module): module to (maybe) wrap + min_num_params (int, Optional): minimum number of layer params to wrap + """ + try: + from fairscale.nn import wrap + + if min_num_params is not None: + num_params = sum(p.numel() for p in module.parameters()) + if num_params >= min_num_params: + return wrap(module, **kwargs) + else: + return module + else: + return wrap(module, **kwargs) + except ImportError: + return module diff --git a/fairseq/legacy_distributed_data_parallel.py b/fairseq/distributed/legacy_distributed_data_parallel.py similarity index 89% rename from fairseq/legacy_distributed_data_parallel.py rename to fairseq/distributed/legacy_distributed_data_parallel.py index 44f87c7c42..cd434c7372 100644 --- a/fairseq/legacy_distributed_data_parallel.py +++ b/fairseq/distributed/legacy_distributed_data_parallel.py @@ -14,15 +14,13 @@ training with `--update-freq`. """ -import copy from collections import OrderedDict from contextlib import contextmanager import torch from torch import nn -from torch.autograd import Variable -from . import distributed_utils +from fairseq.distributed import utils class LegacyDistributedDataParallel(nn.Module): @@ -34,20 +32,18 @@ class LegacyDistributedDataParallel(nn.Module): Args: module (~torch.nn.Module): module to be parallelized - world_size (int): number of parallel workers - process_group (optional): the c10d process group to be used for - distributed data all-reduction. If None, the default process group - will be used. + process_group: the c10d process group to be used for distributed data + parallel all-reduction. buffer_size (int, optional): number of elements to buffer before performing all-reduce (default: 256M). """ - def __init__(self, module, world_size, process_group=None, buffer_size=2 ** 28): + def __init__(self, module, process_group, buffer_size=2**28): super().__init__() self.module = module - self.world_size = world_size self.process_group = process_group + self.world_size = utils.get_world_size(self.process_group) # Never use a bigger buffer than the number of model params self.buffer_size = min(buffer_size, sum(p.numel() for p in module.parameters())) @@ -66,13 +62,6 @@ def __init__(self, module, world_size, process_group=None, buffer_size=2 ** 28): paramlists[device] += [param] self.per_device_params = list(paramlists.values()) - def __getstate__(self): - attrs = copy.copy(self.__dict__) - return attrs - - def __setstate__(self, state): - super().__setstate__(state) - @contextmanager def no_sync(self): """A context manager to disable gradient synchronization.""" @@ -84,7 +73,7 @@ def no_sync(self): def forward(self, *inputs, **kwargs): return self.module(*inputs, **kwargs) - def all_reduce(self): + def all_reduce_grads(self): """ This function must be called explicitly after backward to reduce gradients. There is no automatic hook like c10d. 
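
Because the legacy wrapper installs no autograd hook, the renamed `all_reduce_grads()` must be invoked explicitly once the accumulated backward passes are done; until then, repeated backward calls only accumulate gradients locally with no communication. A minimal, hypothetical training-loop sketch follows (fairseq's Trainer performs this internally; the criterion call and batch structure below are illustrative only):

```python
# Hypothetical --update-freq style loop around LegacyDistributedDataParallel.
# Nothing is synchronized until all_reduce_grads() is called explicitly.
def accumulate_and_step(ddp_model, optimizer, criterion, micro_batches):
    optimizer.zero_grad()
    for batch in micro_batches:
        # local gradient accumulation, no inter-worker communication here
        loss = criterion(ddp_model(**batch["net_input"]), batch["target"])
        loss.backward()
    ddp_model.all_reduce_grads()  # single buffered all-reduce across workers
    optimizer.step()
```
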
@@ -118,7 +107,7 @@ def all_reduce_params(params): if nonzero_buffer: buffer.div_(self.world_size) - distributed_utils.all_reduce(buffer, self.process_group) + utils.all_reduce(buffer, self.process_group) # copy all-reduced grads back into their original place offset = 0 @@ -147,6 +136,11 @@ def reduction_fn(): continue if param.grad is None: param.grad = torch.zeros_like(param) + + if hasattr(param, "expert"): + # Skip gradient sync for unshared parameters + continue + if param.grad.requires_grad: raise RuntimeError( "DistributedDataParallel only works " diff --git a/fairseq/distributed/module_proxy_wrapper.py b/fairseq/distributed/module_proxy_wrapper.py new file mode 100644 index 0000000000..904dc0c202 --- /dev/null +++ b/fairseq/distributed/module_proxy_wrapper.py @@ -0,0 +1,56 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from torch import nn + + +class ModuleProxyWrapper(nn.Module): + """ + Wrap a DistributedDataParallel module and forward requests for missing + attributes to the module wrapped by DDP (the twice-wrapped module). + Also forward calls to :func:`state_dict` and :func:`load_state_dict`. + + Usage:: + + module.xyz = "hello world" + wrapped_module = DistributedDataParallel(module, **ddp_args) + wrapped_module = ModuleProxyWrapper(wrapped_module) + assert wrapped_module.xyz == "hello world" + assert wrapped_module.state_dict().keys() == module.state_dict().keys() + + Args: + module (nn.Module): module to wrap + """ + + def __init__(self, module: nn.Module): + super().__init__() + assert hasattr( + module, "module" + ), "ModuleProxyWrapper expects input to wrap another module" + self.module = module + + def __getattr__(self, name): + """Forward missing attributes to twice-wrapped module.""" + try: + # defer to nn.Module's logic + return super().__getattr__(name) + except AttributeError: + try: + # forward to the once-wrapped module + return getattr(self.module, name) + except AttributeError: + # forward to the twice-wrapped module + return getattr(self.module.module, name) + + def state_dict(self, *args, **kwargs): + """Forward to the twice-wrapped module.""" + return self.module.module.state_dict(*args, **kwargs) + + def load_state_dict(self, *args, **kwargs): + """Forward to the twice-wrapped module.""" + return self.module.module.load_state_dict(*args, **kwargs) + + def forward(self, *args, **kwargs): + return self.module(*args, **kwargs) diff --git a/fairseq/distributed/tpu_distributed_data_parallel.py b/fairseq/distributed/tpu_distributed_data_parallel.py new file mode 100644 index 0000000000..3b9e103301 --- /dev/null +++ b/fairseq/distributed/tpu_distributed_data_parallel.py @@ -0,0 +1,43 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
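
The `ModuleProxyWrapper` added just above forwards `state_dict()` and `load_state_dict()` to the twice-wrapped (innermost) module, so checkpoints saved from a DDP-wrapped model keep the unwrapped module's keys and load directly into a plain copy, with no `"module."` prefix stripping. A small, hypothetical demonstration, assuming a single-process gloo group purely so that DDP can be constructed, and an arbitrary temp-file path:

```python
import os

import torch
import torch.distributed as dist
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from fairseq.distributed import ModuleProxyWrapper

# single-process "world" just to make DDP constructible for the demo
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = nn.Linear(8, 8)
wrapped = ModuleProxyWrapper(DistributedDataParallel(model))

# state_dict keys match the unwrapped module ("weight", "bias"),
# so the checkpoint loads directly into a plain copy of the model.
torch.save(wrapped.state_dict(), "/tmp/proxy_ckpt.pt")
nn.Linear(8, 8).load_state_dict(torch.load("/tmp/proxy_ckpt.pt"))

dist.destroy_process_group()
```
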
+ +import torch +from torch import nn + +from fairseq.distributed import utils + + +class TPUDistributedDataParallel(nn.Module): + def __init__(self, module, process_group): + super().__init__() + self.module = module + self.process_group = process_group + self.world_size = utils.get_world_size(self.process_group) + + def forward(self, *inputs, **kwargs): + return self.module(*inputs, **kwargs) + + def all_reduce_grads(self): + gradients = [] + for p in self.parameters(): + if not p.requires_grad: + continue + if p.grad is None: + p.grad = torch.zeros_like(p) + if p.grad.requires_grad: + raise RuntimeError( + "TPUDistributedDataParallel only works with gradients that don't " + "require grad" + ) + gradients.append(p.grad) + + import torch_xla.core.xla_model as xm + + xm.all_reduce( + "sum", + gradients, + scale=1.0 / self.world_size, + groups=self.process_group[1], + ) diff --git a/fairseq/distributed/utils.py b/fairseq/distributed/utils.py new file mode 100644 index 0000000000..968830d585 --- /dev/null +++ b/fairseq/distributed/utils.py @@ -0,0 +1,843 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import io +import logging +import os +import pickle +import random +import socket +import struct +import subprocess +import warnings +from argparse import Namespace +from collections import OrderedDict +from dataclasses import dataclass +from typing import Any, Dict, List, Mapping, Optional + +import torch +import torch.distributed as dist +from fairseq.dataclass.configs import DistributedTrainingConfig, FairseqConfig +from omegaconf import open_dict + +try: + import torch_xla.core.xla_model as xm +except ImportError: + xm = None + + +# Flag to indicate if we're using Megatron +# NOTE: this is a temporary hack until we move away from Megatron's model parallel init +_USE_MEGATRON = False + +# Whether to use XLA ops (e.g., on TPUs) instead of CUDA ops. 
+_USE_XLA = False + + +logger = logging.getLogger(__name__) + + +def is_master(cfg: DistributedTrainingConfig): + return cfg.distributed_rank == 0 + + +def infer_init_method(cfg: DistributedTrainingConfig, force_distributed=False): + if cfg.distributed_init_method is not None or cfg.tpu: + return + + num_pipelines_per_node = None + if cfg.pipeline_model_parallel: + num_pipeline_devices, num_pipelines_per_node = _pipeline_parallel_pre_init(cfg) + + if cfg.distributed_world_size == 1: + return + if all( + key in os.environ + for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"] + ): + # support torch.distributed.launch + _infer_torch_distributed_launch_init(cfg) + else: + # we can determine the init method automatically for Slurm + if not _infer_slurm_init(cfg, num_pipelines_per_node): + if cfg.distributed_port <= 0 or force_distributed: + _infer_single_node_init(cfg) + elif cfg.distributed_port <= 0: + _infer_single_node_init(cfg) + + if cfg.pipeline_model_parallel: + _pipeline_parallel_post_init(cfg, num_pipeline_devices, num_pipelines_per_node) + elif not cfg.distributed_no_spawn: + with open_dict(cfg): + cfg.distributed_num_procs = min( + torch.cuda.device_count(), cfg.distributed_world_size + ) + else: + if cfg.device_id > 0: + logger.info( + "setting CUDA device={} on rank {}".format( + cfg.device_id, cfg.distributed_rank + ) + ) + torch.cuda.set_device(cfg.device_id) + + +def _infer_torch_distributed_launch_init(cfg: DistributedTrainingConfig): + cfg.distributed_init_method = "env://" + cfg.distributed_world_size = int(os.environ["WORLD_SIZE"]) + cfg.distributed_rank = int(os.environ["RANK"]) + cfg.device_id = cfg.distributed_rank % torch.cuda.device_count() + # processes are created by torch.distributed.launch + cfg.distributed_no_spawn = True + + +def _infer_slurm_init(cfg: DistributedTrainingConfig, num_pipelines_per_node): + node_list = os.environ.get("SLURM_STEP_NODELIST") + if node_list is None: + node_list = os.environ.get("SLURM_JOB_NODELIST") + if node_list is not None: + try: + hostnames = subprocess.check_output( + ["scontrol", "show", "hostnames", node_list] + ) + cfg.distributed_init_method = "tcp://{host}:{port}".format( + host=hostnames.split()[0].decode("utf-8"), + port=cfg.distributed_port, + ) + nnodes = int(os.environ.get("SLURM_NNODES")) + ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE") + if ntasks_per_node is not None: + ntasks_per_node = int(ntasks_per_node) + else: + ntasks = int(os.environ.get("SLURM_NTASKS")) + nnodes = int(os.environ.get("SLURM_NNODES")) + assert ntasks % nnodes == 0 + ntasks_per_node = int(ntasks / nnodes) + if ntasks_per_node == 1: + gpus_per_node = torch.cuda.device_count() + node_id = int(os.environ.get("SLURM_NODEID")) + cfg.distributed_rank = node_id * gpus_per_node + cfg.distributed_world_size = nnodes * gpus_per_node + elif cfg.pipeline_model_parallel: + assert ntasks_per_node == num_pipelines_per_node, ( + "SLURM --ntasks-per-node must match number of pipelines per " + "node (={})".format(num_pipelines_per_node) + ) + cfg.distributed_no_spawn = True + # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on + # the first node, [1, 2] on the second node, etc. This + # matches torch.distributed.launch. + node_id = int(os.environ.get("SLURM_NODEID")) + local_id = int(os.environ.get("SLURM_LOCALID")) + cfg.distributed_rank = node_id * num_pipelines_per_node + local_id + # In the above example, device_id will always be in [0, 1], + # which also matches torch.distributed.launch. 
+ cfg.device_id = local_id + # We also want to set distributed_world_size to be the total + # number of pipelines across all nodes. + cfg.distributed_world_size = nnodes * num_pipelines_per_node + else: + assert ( + ntasks_per_node == cfg.distributed_world_size // nnodes + ), f"{ntasks_per_node}, {cfg.distributed_world_size}, {nnodes}" + cfg.distributed_no_spawn = True + cfg.distributed_rank = int(os.environ.get("SLURM_PROCID")) + cfg.device_id = int(os.environ.get("SLURM_LOCALID")) + logger.info(f"Rank {cfg.distributed_rank}, device_id: {cfg.device_id}") + return True + except subprocess.CalledProcessError as e: # scontrol failed + raise e + except FileNotFoundError: # Slurm is not installed + pass + + return False + + +def _infer_single_node_init(cfg: DistributedTrainingConfig): + assert ( + cfg.distributed_world_size <= torch.cuda.device_count() + ), f"world size is {cfg.distributed_world_size} but have {torch.cuda.device_count()} available devices" + + if cfg.distributed_port <= 0: + jobid = os.environ.get("SLURM_JOB_ID") + task_id = os.environ.get("SLURM_ARRAY_TASK_ID") + + if jobid is not None: + if task_id is not None: + jobid += str(task_id) + jobid = int(jobid) + rng = random.Random(jobid) + port = rng.randint(10000, 60000) + else: + port = random.randint(10000, 60000) + + cfg.distributed_port = port + cfg.distributed_init_method = "tcp://localhost:{port}".format( + port=cfg.distributed_port + ) + + +def _pipeline_parallel_pre_init(cfg: DistributedTrainingConfig): + from fairseq import utils + + balance_exists = ( + cfg.pipeline_balance is not None + or cfg.pipeline_encoder_balance is not None + or cfg.pipeline_decoder_balance is not None + ) + devices_exist = ( + cfg.pipeline_devices is not None + or cfg.pipeline_encoder_devices is not None + or cfg.pipeline_decoder_devices is not None + ) + if not balance_exists: + raise ValueError( + "--pipeline-balance is currently required for pipeline model parallelism" + ) + if not devices_exist: + raise ValueError( + "--pipeline-devices is currently required for pipeline model parallelism" + ) + + cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int) + if cfg.pipeline_devices is not None: + cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int) + num_pipeline_devices = len(set(cfg.pipeline_devices)) + else: + cfg.pipeline_encoder_devices = utils.eval_str_list( + cfg.pipeline_encoder_devices, type=int + ) + cfg.pipeline_decoder_devices = utils.eval_str_list( + cfg.pipeline_decoder_devices, type=int + ) + num_pipeline_devices = len( + set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices) + ) + gpus_per_node = torch.cuda.device_count() + assert ( + gpus_per_node >= num_pipeline_devices + and gpus_per_node % num_pipeline_devices == 0 + ), ( + "the number of unique device IDs in --pipeline-devices must evenly divide " + "the number of GPUs per node (multi-node pipelining is not yet supported)" + ) + num_pipelines_per_node = gpus_per_node // num_pipeline_devices + return num_pipeline_devices, num_pipelines_per_node + + +def _pipeline_parallel_post_init( + cfg: DistributedTrainingConfig, num_pipeline_devices, num_pipelines_per_node +): + if not cfg.distributed_no_spawn: + # When distributed_no_spawn is False, we expect distributed_rank and + # distributed_world_size to be based on the total number of GPUs, so + # we need to correct them to be based on the number of pipelines. 
+ assert cfg.distributed_world_size % num_pipeline_devices == 0 + cfg.distributed_world_size = cfg.distributed_world_size // num_pipeline_devices + # In the case of 4-way MP on nodes with 8 GPUs, we want + # distributed_rank to be the starting GPU index for each pipeline + # i.e., 0, 2, ... + gpus_per_node = torch.cuda.device_count() + assert cfg.distributed_rank % gpus_per_node == 0 + assert cfg.distributed_rank % num_pipeline_devices == 0 + + with open_dict(cfg): + cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices + # launch one process per pipeline + cfg.distributed_num_procs = num_pipelines_per_node + + # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0 + # and 4, indicating the starting device IDs for each pipeline + cfg.device_id *= num_pipeline_devices + + if cfg.device_id > 0: + # if there's multiple pipelines on a node (e.g., 4-way MP on an 8 + # GPU node), we need to adjust pipeline_devices accordingly + logger.debug( + "setting CUDA device={} on rank {}".format( + cfg.device_id, cfg.distributed_rank + ) + ) + torch.cuda.set_device(cfg.device_id) + with open_dict(cfg): + cfg.pipeline_devices = [cfg.device_id + d for d in cfg.pipeline_devices] + logger.info( + "setting pipeline_devices={} on rank {}".format( + cfg.pipeline_devices, cfg.distributed_rank + ) + ) + + +def distributed_init(cfg: FairseqConfig): + if isinstance(cfg, Namespace): + from fairseq.dataclass.utils import convert_namespace_to_omegaconf + + cfg = convert_namespace_to_omegaconf(cfg) + + if not cfg.common.tpu: + if torch.distributed.is_available() and torch.distributed.is_initialized(): + warnings.warn( + "Distributed is already initialized, cannot initialize twice!" + ) + else: + logger.info( + "distributed init (rank {}): {}".format( + cfg.distributed_training.distributed_rank, + cfg.distributed_training.distributed_init_method, + ) + ) + dist.init_process_group( + backend=cfg.distributed_training.distributed_backend, + init_method=cfg.distributed_training.distributed_init_method, + world_size=cfg.distributed_training.distributed_world_size, + rank=cfg.distributed_training.distributed_rank, + ) + logger.info( + "initialized host {} as rank {}".format( + socket.gethostname(), + cfg.distributed_training.distributed_rank, + ) + ) + + # perform a dummy all-reduce to initialize the NCCL communicator + if torch.cuda.is_available(): + dist.all_reduce(torch.zeros(1).cuda()) + + cfg.distributed_training.distributed_rank = torch.distributed.get_rank() + else: + assert xm.xrt_world_size() == cfg.distributed_training.distributed_world_size + global _USE_XLA + _USE_XLA = True + cfg.distributed_training.device_id = xm.get_local_ordinal() + cfg.distributed_training.distributed_rank = xm.get_ordinal() + xm.rendezvous("distributed_init") # wait for all workers + + if is_master(cfg.distributed_training): + logging.getLogger().setLevel(logging.INFO) + else: + logging.getLogger().setLevel(logging.WARNING) + + if cfg.common.model_parallel_size > 1: + try: + from fairseq.model_parallel.megatron.mpu import ( + initialize_model_parallel, + model_parallel_cuda_manual_seed, + ) + except ImportError: + raise ImportError( + "\n\nPlease install the megatron submodule:" + "\n\n git submodule update --init " + "fairseq/model_parallel/megatron" + ) + global _USE_MEGATRON + _USE_MEGATRON = True + initialize_model_parallel(cfg.common.model_parallel_size) + model_parallel_cuda_manual_seed(cfg.common.seed) + model_part_number = get_model_parallel_rank() + cfg.checkpoint.checkpoint_suffix += 
"-model_part-{0}".format(model_part_number) + + if hasattr(cfg, "model") and getattr(cfg.model, "base_layers", 0) > 0: + cfg.checkpoint.checkpoint_suffix = ( + f"-rank-{cfg.distributed_training.distributed_rank}" + ) + + return cfg.distributed_training.distributed_rank + + +def distributed_main(i, main, cfg: FairseqConfig, kwargs): + cfg.distributed_training.device_id = i + if torch.cuda.is_available() and not cfg.common.cpu and not cfg.common.tpu: + torch.cuda.set_device(cfg.distributed_training.device_id) + if cfg.distributed_training.distributed_rank is None: # torch.multiprocessing.spawn + cfg.distributed_training.distributed_rank = kwargs.pop("start_rank", 0) + i + + cfg.distributed_training.distributed_rank = distributed_init(cfg) + + after_distributed_init_fn = kwargs.pop("after_distributed_init_fn", None) + if after_distributed_init_fn: + cfg = after_distributed_init_fn(cfg) + + main(cfg, **kwargs) + + if torch.distributed.is_initialized(): + torch.distributed.barrier(get_global_group()) + + +def call_main(cfg: FairseqConfig, main, **kwargs): + if cfg.distributed_training.distributed_init_method is None: + infer_init_method(cfg.distributed_training) + + if cfg.distributed_training.distributed_init_method is not None: + # distributed training + if not cfg.distributed_training.distributed_no_spawn: + start_rank = cfg.distributed_training.distributed_rank + cfg.distributed_training.distributed_rank = None # assign automatically + kwargs["start_rank"] = start_rank + + torch.multiprocessing.spawn( + fn=distributed_main, + args=(main, cfg, kwargs), + nprocs=min( + torch.cuda.device_count(), + cfg.distributed_training.distributed_world_size, + ), + join=True, + ) + else: + distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs) + elif cfg.common.tpu and cfg.distributed_training.distributed_world_size > 1: + import torch_xla.distributed.xla_multiprocessing as xmp + + torch.multiprocessing.set_sharing_strategy("file_system") + xmp.spawn( + fn=distributed_main, + args=(main, cfg, kwargs), + # tpu-comment: + # 8 devices in one TPU VM, is the max processes to be spawned. 
+ # The rest is driven by xm.distributed.xla_dist + nprocs=min(cfg.distributed_training.distributed_world_size, 8), + ) + else: + # single GPU main + main(cfg, **kwargs) + + +def use_xla(): + global _USE_XLA + return _USE_XLA + + +def new_groups(grouped_ranks: List[List[int]]): + if use_xla(): + return ("tpu", grouped_ranks) + else: + groups = [dist.new_group(g) for g in grouped_ranks] + my_group_idx = _find_my_group_index(grouped_ranks) + return groups[my_group_idx] + + +def _find_my_group_index(grouped_ranks): + my_rank = get_global_rank() + for i, group in enumerate(grouped_ranks): + if my_rank in group: + return i + raise RuntimeError + + +def _find_my_group(grouped_ranks): + index = _find_my_group_index(grouped_ranks) + return grouped_ranks[index] + + +def get_rank(group): + if use_xla(): + assert group[0] == "tpu" + my_group = _find_my_group(group[1]) + return my_group.index(get_global_rank()) + else: + return dist.get_rank(group=group) + + +def get_world_size(group): + if use_xla(): + assert group[0] == "tpu" + my_group = _find_my_group(group[1]) + return len(my_group) + elif torch.distributed.is_initialized(): + return dist.get_world_size(group=group) + else: + return 1 + + +def get_global_group(): + if use_xla(): + return new_groups([list(range(get_global_world_size()))]) + elif torch.distributed.is_initialized(): + if not hasattr(get_global_group, "_global_group"): + # ideally we could use torch.distributed.group.WORLD, but it seems + # to cause random NCCL hangs in some cases + get_global_group._global_group = dist.new_group() + return get_global_group._global_group + else: + return None + + +def get_global_rank(): + if use_xla(): + return xm.get_ordinal() + elif torch.distributed.is_initialized(): + return torch.distributed.get_rank() + else: + return 0 + + +def get_global_world_size(): + if use_xla(): + return xm.xrt_world_size() + elif torch.distributed.is_initialized(): + return torch.distributed.get_world_size() + else: + return 1 + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + global _USE_MEGATRON + if _USE_MEGATRON: + from fairseq.model_parallel.megatron import mpu + + return mpu.get_data_parallel_group() + else: + return get_global_group() + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return get_rank(get_data_parallel_group()) + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return get_world_size(get_data_parallel_group()) + + +def get_model_parallel_group(): + global _USE_MEGATRON + if _USE_MEGATRON: + from fairseq.model_parallel.megatron import mpu + + return mpu.get_model_parallel_group() + else: + return None + + +def get_model_parallel_rank(): + """Return my rank for the model parallel group.""" + return get_rank(get_model_parallel_group()) + + +def get_model_parallel_world_size(): + """Return world size for the model parallel group.""" + return get_world_size(get_model_parallel_group()) + + +def all_reduce(tensor, group, op="sum"): + if use_xla(): + assert isinstance(group, tuple) and group[0] == "tpu" + tensor = [tensor] # wrap in a list to make xm.all_reduce in-place + return xm.all_reduce(op, tensor, groups=group[1])[0] + else: + if op == "sum": + op = dist.ReduceOp.SUM + elif op == "max": + op = dist.ReduceOp.MAX + else: + raise NotImplementedError + dist.all_reduce(tensor, op=op, group=group) + return tensor + + +def broadcast(tensor, src, group): + if use_xla(): + # XLA doesn't support broadcast, hack it with 
all_reduce + if get_rank(group) != src: + tensor.zero_() + all_reduce(tensor, group) + else: + dist.broadcast(tensor, src=src, group=group) + + +def all_to_all(tensor, group): + """Perform an all-to-all operation on a 1D Tensor.""" + assert tensor.dim() == 1 + split_count = get_world_size(group=group) + assert tensor.numel() % split_count == 0 + if use_xla(): + assert isinstance(group, tuple) and group[0] == "tpu" + return xm.all_to_all( + tensor, + split_dimension=0, + concat_dimension=0, + split_count=split_count, + groups=group[1], + ) + else: + output = torch.zeros_like(tensor) + dist.all_to_all_single(output, tensor, group=group) + return output + + +def all_gather(tensor, group, return_tensor=False): + """Perform an all-gather operation.""" + if use_xla(): + result = xm.all_gather(tensor, groups=group[1]) + world_size = get_world_size(group=group) + result = result.view(world_size, *tensor.size()) + if return_tensor: + return result + else: + return [result[i] for i in range(world_size)] + else: + world_size = get_world_size(group=group) + rank = get_rank(group=group) + tensor_list = [ + tensor if i == rank else torch.empty_like(tensor) for i in range(world_size) + ] + dist.all_gather(tensor_list, tensor, group=group) + if return_tensor: + return torch.stack(tensor_list, dim=0) + else: + return tensor_list + + +def all_gather_list(data, group=None, max_size=16384): + """Gathers arbitrary data from all nodes into a list. + + Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python + data. Note that *data* must be picklable and any CUDA tensors will be moved + to CPU and returned on CPU as well. + + Args: + data (Any): data from the local worker to be gathered on other workers + group: group of the collective + max_size (int, optional): maximum size of the data to be gathered + across workers + """ + from fairseq import utils + + if group is None: + group = get_global_group() + rank = get_rank(group=group) + world_size = get_world_size(group=group) + + buffer_size = max_size * world_size + if ( + not hasattr(all_gather_list, "_buffer") + or all_gather_list._buffer.numel() < buffer_size + ): + all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size) + all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory() + buffer = all_gather_list._buffer + buffer.zero_() + cpu_buffer = all_gather_list._cpu_buffer + + data = utils.move_to_cpu(data) + enc = pickle.dumps(data) + enc_size = len(enc) + header_size = 4 # size of header that contains the length of the encoded data + size = header_size + enc_size + if size > max_size: + raise ValueError( + "encoded data size ({}) exceeds max_size ({})".format(size, max_size) + ) + + header = struct.pack(">I", enc_size) + cpu_buffer[:size] = torch.ByteTensor(list(header + enc)) + start = rank * max_size + buffer[start : start + size].copy_(cpu_buffer[:size]) + + all_reduce(buffer, group=group) + + buffer = buffer.cpu() + try: + result = [] + for i in range(world_size): + out_buffer = buffer[i * max_size : (i + 1) * max_size] + (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist())) + if enc_size > 0: + result.append( + pickle.loads( + bytes(out_buffer[header_size : header_size + enc_size].tolist()) + ) + ) + return result + except pickle.UnpicklingError: + raise Exception( + "Unable to unpickle data from other workers. all_gather_list requires all " + "workers to enter the function together, so this error usually indicates " + "that the workers have fallen out of sync somehow. 
Workers can fall out of " + "sync if one of them runs out of memory, or if there are other conditions " + "in your training script that can cause one worker to finish an epoch " + "while other workers are still iterating over their portions of the data. " + "Try rerunning with --ddp-backend=legacy_ddp and see if that helps." + ) + + +def all_reduce_dict(data: Mapping[str, Any], device, group) -> Dict[str, Any]: + """ + AllReduce a dictionary of values across workers. We separately + reduce items that are already on the device and items on CPU for + better performance. + + Args: + data (Mapping[str, Any]): dictionary of data to all-reduce, but + cannot be a nested dictionary + device (torch.device): device for the reduction + group: group of the collective + """ + data_keys = list(data.keys()) + + # We want to separately reduce items that are already on the + # device and items on CPU for performance reasons. + cpu_data = OrderedDict() + device_data = OrderedDict() + for k in data_keys: + t = data[k] + if not torch.is_tensor(t): + cpu_data[k] = torch.tensor(t, dtype=torch.double) + elif t.device.type != device.type: + cpu_data[k] = t.to(dtype=torch.double) + else: + device_data[k] = t.to(dtype=torch.double) + + def _all_reduce_dict(data: OrderedDict): + if len(data) == 0: + return data + buf = torch.cat([t.view(-1) for t in data.values()]).to(device=device) + all_reduce(buf, group=group) + split_buf = torch.split(buf.clone(), [t.numel() for t in data.values()]) + reduced_data = [t.view_as(orig) for t, orig in zip(split_buf, data.values())] + return OrderedDict(zip(data.keys(), reduced_data)) + + cpu_data = _all_reduce_dict(cpu_data) + device_data = _all_reduce_dict(device_data) + + def get_from_stack(key): + if key in cpu_data: + return cpu_data[key] + elif key in device_data: + return device_data[key] + raise KeyError + + return OrderedDict([(key, get_from_stack(key)) for key in data_keys]) + + +def broadcast_tensors( + tensors: Optional[List[torch.Tensor]], + src_rank: int, + group: object, + dist_device: Optional[torch.device] = None, +) -> List[torch.Tensor]: + """ + Broadcasts a list of tensors without other (non-src) ranks needing to know + the dtypes/shapes of the tensors. 
+ """ + if dist_device is None: + if torch.distributed.get_backend(group) == "nccl": + dist_device = torch.device("cuda") + else: + dist_device = torch.device("cpu") + + # share metadata first to simplify transfer + is_src_rank = get_rank(group) == src_rank + if is_src_rank: + metadata = [ + {"size": t.size(), "dtype": t.dtype, "device": t.device} for t in tensors + ] + metadata = _broadcast_object_slow(metadata, src_rank, group, dist_device) + else: + metadata = _broadcast_object_slow(None, src_rank, group, dist_device) + + out_tensors = [] + for i, meta in enumerate(metadata): + if is_src_rank: + tensor = tensors[i] + broadcast(tensors[i].to(dist_device), src=src_rank, group=group) + else: + tensor = torch.zeros( + [meta["size"].numel()], dtype=meta["dtype"], device=dist_device + ) + broadcast(tensor, src=src_rank, group=group) + tensor = tensor.view(meta["size"]).to(meta["device"]) + out_tensors.append(tensor) + return out_tensors + + +def broadcast_object( + obj: Any, + src_rank: int, + group: object, + dist_device: Optional[torch.device] = None, +) -> Any: + """Broadcast an arbitrary Python object to other workers.""" + if dist_device is None: + if torch.distributed.get_backend(group) == "nccl": + dist_device = torch.device("cuda") + else: + dist_device = torch.device("cpu") + + if get_rank(group) == src_rank: + # split the tensors from the non-tensors so we can broadcast them + # directly, avoiding unnecessary serialization/deserialization + tensors = [] + obj = _split_tensors_from_obj(obj, tensors) + obj = _broadcast_object_slow(obj, src_rank, group, dist_device) + tensors = broadcast_tensors(tensors, src_rank, group, dist_device) + else: + obj = _broadcast_object_slow(None, src_rank, group, dist_device) + tensors = broadcast_tensors(None, src_rank, group, dist_device) + return _put_tensors_in_obj(obj, tensors) + + +def _broadcast_object_slow( + obj: Any, + src_rank: int, + group: object, + dist_device: torch.device, +) -> Any: + if get_rank(group) == src_rank: + # Emit data + buffer = io.BytesIO() + torch.save(obj, buffer) + buffer = torch.ByteTensor(buffer.getbuffer()).to(dist_device) + length = torch.LongTensor([len(buffer)]).to(dist_device) + broadcast(length, src=src_rank, group=group) + broadcast(buffer, src=src_rank, group=group) + else: + # Fetch from the source + length = torch.LongTensor([0]).to(dist_device) + broadcast(length, src=src_rank, group=group) + buffer = torch.ByteTensor(int(length.item())).to(dist_device) + broadcast(buffer, src=src_rank, group=group) + buffer = io.BytesIO(buffer.cpu().numpy()) + obj = torch.load(buffer, map_location="cpu") + return obj + + +@dataclass(frozen=True) +class _TensorPlaceholder: + index: int + + +def _split_tensors_from_obj(obj: Any, tensors: List[torch.Tensor]) -> Any: + if torch.is_tensor(obj): + placeholder = _TensorPlaceholder(index=len(tensors)) + tensors.append(obj) + return placeholder + elif isinstance(obj, dict): + return {k: _split_tensors_from_obj(v, tensors) for k, v in obj.items()} + elif isinstance(obj, list): + return [_split_tensors_from_obj(v, tensors) for v in obj] + elif isinstance(obj, tuple): + return tuple(_split_tensors_from_obj(v, tensors) for v in obj) + elif isinstance(obj, set): + return {_split_tensors_from_obj(v, tensors) for v in obj} + else: + return obj + + +def _put_tensors_in_obj(obj: Any, tensors: List[torch.Tensor]) -> Any: + if isinstance(obj, _TensorPlaceholder): + return tensors[obj.index] + elif isinstance(obj, dict): + return {k: _put_tensors_in_obj(v, tensors) for k, v in obj.items()} + 
elif isinstance(obj, list): + return [_put_tensors_in_obj(v, tensors) for v in obj] + elif isinstance(obj, tuple): + return tuple(_put_tensors_in_obj(v, tensors) for v in obj) + elif isinstance(obj, set): + return {_put_tensors_in_obj(v, tensors) for v in obj} + else: + return obj diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py deleted file mode 100644 index 23cdfc6938..0000000000 --- a/fairseq/distributed_utils.py +++ /dev/null @@ -1,455 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os -import pickle -import random -import socket -import struct -import subprocess -import warnings -from argparse import Namespace -from collections import OrderedDict -from typing import Any, Dict, Mapping - -import torch -import torch.distributed as dist -from fairseq import utils -from fairseq.dataclass.utils import convert_namespace_to_omegaconf -from omegaconf import DictConfig, open_dict - - -logger = logging.getLogger(__name__) - - -def is_master(cfg: DictConfig): - return cfg.distributed_rank == 0 - - -def infer_init_method(cfg: DictConfig, force_distributed=False): - if cfg.distributed_init_method is not None or cfg.tpu: - return - - if cfg.pipeline_model_parallel: - balance_exists = ( - cfg.pipeline_balance is not None - or cfg.pipeline_encoder_balance is not None - or cfg.pipeline_decoder_balance is not None - ) - devices_exist = ( - cfg.pipeline_devices is not None - or cfg.pipeline_encoder_devices is not None - or cfg.pipeline_decoder_devices is not None - ) - if not balance_exists: - raise ValueError( - "--pipeline-balance is currently required for pipeline model parallelism" - ) - if not devices_exist: - raise ValueError( - "--pipeline-devices is currently required for pipeline model parallelism" - ) - - cfg.pipeline_balance = utils.eval_str_list(cfg.pipeline_balance, type=int) - if cfg.pipeline_devices is not None: - cfg.pipeline_devices = utils.eval_str_list(cfg.pipeline_devices, type=int) - num_pipeline_devices = len(set(cfg.pipeline_devices)) - else: - cfg.pipeline_encoder_devices = utils.eval_str_list( - cfg.pipeline_encoder_devices, type=int - ) - cfg.pipeline_decoder_devices = utils.eval_str_list( - cfg.pipeline_decoder_devices, type=int - ) - num_pipeline_devices = len( - set(cfg.pipeline_encoder_devices + cfg.pipeline_decoder_devices) - ) - gpus_per_node = torch.cuda.device_count() - assert ( - gpus_per_node >= num_pipeline_devices - and gpus_per_node % num_pipeline_devices == 0 - ), ( - "the number of unique device IDs in --pipeline-devices must evenly divide " - "the number of GPUs per node (multi-node pipelining is not yet supported)" - ) - num_pipelines_per_node = gpus_per_node // num_pipeline_devices - - # support torch.distributed.launch - if all( - key in os.environ - for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"] - ): - cfg.distributed_init_method = "env://" - cfg.distributed_world_size = int(os.environ["WORLD_SIZE"]) - cfg.distributed_rank = int(os.environ["RANK"]) - # processes are created by torch.distributed.launch - cfg.distributed_no_spawn = True - - # we can determine the init method automatically for Slurm - elif cfg.distributed_port > 0: - node_list = os.environ.get("SLURM_STEP_NODELIST") - if node_list is None: - node_list = os.environ.get("SLURM_JOB_NODELIST") - if node_list is not None: - try: - hostnames = subprocess.check_output( - ["scontrol", "show", 
"hostnames", node_list] - ) - cfg.distributed_init_method = "tcp://{host}:{port}".format( - host=hostnames.split()[0].decode("utf-8"), - port=cfg.distributed_port, - ) - nnodes = int(os.environ.get("SLURM_NNODES")) - ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE") - if ntasks_per_node is not None: - ntasks_per_node = int(ntasks_per_node) - else: - ntasks = int(os.environ.get("SLURM_NTASKS")) - nnodes = int(os.environ.get("SLURM_NNODES")) - assert ntasks % nnodes == 0 - ntasks_per_node = int(ntasks / nnodes) - if ntasks_per_node == 1: - gpus_per_node = torch.cuda.device_count() - node_id = int(os.environ.get("SLURM_NODEID")) - cfg.distributed_rank = node_id * gpus_per_node - cfg.distributed_world_size = nnodes * gpus_per_node - elif cfg.pipeline_model_parallel: - assert ntasks_per_node == num_pipelines_per_node, ( - "SLURM --ntasks-per-node must match number of pipelines per " - "node (={})".format(num_pipelines_per_node) - ) - cfg.distributed_no_spawn = True - # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on - # the first node, [1, 2] on the second node, etc. This - # matches torch.distributed.launch. - node_id = int(os.environ.get("SLURM_NODEID")) - local_id = int(os.environ.get("SLURM_LOCALID")) - cfg.distributed_rank = node_id * num_pipelines_per_node + local_id - # In the above example, device_id will always be in [0, 1], - # which also matches torch.distributed.launch. - cfg.device_id = local_id - # We also want to set distributed_world_size to be the total - # number of pipelines across all nodes. - cfg.distributed_world_size = nnodes * num_pipelines_per_node - else: - assert ntasks_per_node == cfg.distributed_world_size // nnodes - cfg.distributed_no_spawn = True - cfg.distributed_rank = int(os.environ.get("SLURM_PROCID")) - cfg.device_id = int(os.environ.get("SLURM_LOCALID")) - except subprocess.CalledProcessError as e: # scontrol failed - raise e - except FileNotFoundError: # Slurm is not installed - pass - - elif cfg.distributed_world_size > 1 or force_distributed: - # fallback for single node with multiple GPUs - assert cfg.distributed_world_size <= torch.cuda.device_count() - port = random.randint(10000, 20000) - cfg.distributed_init_method = "tcp://localhost:{port}".format(port=port) - - if cfg.pipeline_model_parallel: - if not cfg.distributed_no_spawn: - # When distributed_no_spawn is False, we expect distributed_rank and - # distributed_world_size to be based on the total number of GPUs, so - # we need to correct them to be based on the number of pipelines. - assert cfg.distributed_world_size % num_pipeline_devices == 0 - cfg.distributed_world_size = ( - cfg.distributed_world_size // num_pipeline_devices - ) - # In the case of 4-way MP on nodes with 8 GPUs, we want - # distributed_rank to be the starting GPU index for each pipeline - # i.e., 0, 2, ... 
- assert cfg.distributed_rank % gpus_per_node == 0 - assert cfg.distributed_rank % num_pipeline_devices == 0 - - with open_dict(cfg): - cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices - # launch one process per pipeline - cfg.distributed_num_procs = num_pipelines_per_node - - # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0 - # and 4, indicating the starting device IDs for each pipeline - cfg.device_id *= num_pipeline_devices - - if cfg.device_id > 0: - # if there's multiple pipelines on a node (e.g., 4-way MP on an 8 - # GPU node), we need to adjust pipeline_devices accordingly - logger.debug( - "setting CUDA device={} on rank {}".format( - cfg.device_id, cfg.distributed_rank - ) - ) - torch.cuda.set_device(cfg.device_id) - with open_dict(cfg): - cfg.pipeline_devices = [cfg.device_id + d for d in cfg.pipeline_devices] - logger.info( - "setting pipeline_devices={} on rank {}".format( - cfg.pipeline_devices, cfg.distributed_rank - ) - ) - elif not cfg.distributed_no_spawn: - with open_dict(cfg): - cfg.distributed_num_procs = min( - torch.cuda.device_count(), cfg.distributed_world_size - ) - - -def distributed_init(cfg: DictConfig): - if isinstance(cfg, Namespace): - cfg = convert_namespace_to_omegaconf(cfg) - - if not cfg.common.tpu: - if torch.distributed.is_initialized(): - warnings.warn( - "Distributed is already initialized, cannot initialize twice!" - ) - else: - logger.info( - "distributed init (rank {}): {}".format( - cfg.distributed_training.distributed_rank, - cfg.distributed_training.distributed_init_method, - ) - ) - dist.init_process_group( - backend=cfg.distributed_training.distributed_backend, - init_method=cfg.distributed_training.distributed_init_method, - world_size=cfg.distributed_training.distributed_world_size, - rank=cfg.distributed_training.distributed_rank, - ) - logger.info( - "initialized host {} as rank {}".format( - socket.gethostname(), - cfg.distributed_training.distributed_rank, - ) - ) - - # perform a dummy all-reduce to initialize the NCCL communicator - if torch.cuda.is_available(): - dist.all_reduce(torch.zeros(1).cuda()) - - cfg.distributed_training.distributed_rank = torch.distributed.get_rank() - else: - import torch_xla.core.xla_model as xm - - assert xm.xrt_world_size() == cfg.distributed_training.distributed_world_size - cfg.distributed_training.device_id = xm.get_local_ordinal() - cfg.distributed_training.distributed_rank = xm.get_ordinal() - xm.rendezvous("distributed_init") # wait for all workers - xm.mark_step() - - if is_master(cfg.distributed_training): - logging.getLogger().setLevel(logging.INFO) - else: - logging.getLogger().setLevel(logging.WARNING) - - if cfg.common.model_parallel_size > 1: - try: - from fairseq.model_parallel.megatron.mpu import ( - get_model_parallel_rank, - initialize_model_parallel, - model_parallel_cuda_manual_seed, - ) - except ImportError: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - initialize_model_parallel(cfg.common.model_parallel_size) - model_parallel_cuda_manual_seed(cfg.common.seed) - model_part_number = get_model_parallel_rank() - cfg.checkpoint.checkpoint_suffix += "-model_part-{0}".format(model_part_number) - return cfg.distributed_training.distributed_rank - - -def distributed_main(i, main, cfg: DictConfig, kwargs): - cfg.distributed_training.device_id = i - if torch.cuda.is_available() and not cfg.common.cpu and not cfg.common.tpu: - 
torch.cuda.set_device(cfg.distributed_training.device_id) - if cfg.distributed_training.distributed_rank is None: # torch.multiprocessing.spawn - cfg.distributed_training.distributed_rank = kwargs.pop("start_rank", 0) + i - - cfg.distributed_training.distributed_rank = distributed_init(cfg) - - after_distributed_init_fn = kwargs.pop("after_distributed_init_fn", None) - if after_distributed_init_fn: - cfg = after_distributed_init_fn(cfg) - - main(cfg, **kwargs) - - -def call_main(cfg: DictConfig, main, **kwargs): - if cfg.distributed_training.distributed_init_method is None: - infer_init_method(cfg.distributed_training) - - if cfg.distributed_training.distributed_init_method is not None: - # distributed training - if not cfg.distributed_training.distributed_no_spawn: - start_rank = cfg.distributed_training.distributed_rank - cfg.distributed_training.distributed_rank = None # assign automatically - kwargs["start_rank"] = start_rank - torch.multiprocessing.spawn( - fn=distributed_main, - args=(main, cfg, kwargs), - nprocs=min( - torch.cuda.device_count(), - cfg.distributed_training.distributed_world_size, - ), - ) - else: - distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs) - elif cfg.common.tpu and cfg.distributed_training.distributed_world_size > 1: - import torch_xla.distributed.xla_multiprocessing as xmp - - torch.multiprocessing.set_sharing_strategy("file_system") - xmp.spawn( - fn=distributed_main, - args=(main, cfg, kwargs), - nprocs=8, # use all 8 TPU cores - ) - else: - # single GPU main - main(cfg, **kwargs) - - -def get_rank(): - return dist.get_rank() - - -def get_world_size(): - return dist.get_world_size() - - -def get_default_group(): - return dist.group.WORLD - - -def all_reduce(tensor, group=None): - if isinstance(group, tuple) and group[0] == "tpu": - import torch_xla.core.xla_model as xm - - return xm.all_reduce("sum", [tensor], groups=group[1]) - else: - if group is None: - group = get_default_group() - return dist.all_reduce(tensor, group=group) - - -def all_gather_list(data, group=None, max_size=16384): - """Gathers arbitrary data from all nodes into a list. - - Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python - data. Note that *data* must be picklable. 
- - Args: - data (Any): data from the local worker to be gathered on other workers - group (optional): group of the collective - max_size (int, optional): maximum size of the data to be gathered - across workers - """ - rank = get_rank() - world_size = get_world_size() - - buffer_size = max_size * world_size - if ( - not hasattr(all_gather_list, "_buffer") - or all_gather_list._buffer.numel() < buffer_size - ): - all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size) - all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory() - buffer = all_gather_list._buffer - buffer.zero_() - cpu_buffer = all_gather_list._cpu_buffer - - data = utils.move_to_cpu(data) - enc = pickle.dumps(data) - enc_size = len(enc) - header_size = 4 # size of header that contains the length of the encoded data - size = header_size + enc_size - if size > max_size: - raise ValueError( - "encoded data size ({}) exceeds max_size ({})".format(size, max_size) - ) - - header = struct.pack(">I", enc_size) - cpu_buffer[:size] = torch.ByteTensor(list(header + enc)) - start = rank * max_size - buffer[start : start + size].copy_(cpu_buffer[:size]) - - all_reduce(buffer, group=group) - - buffer = buffer.cpu() - try: - result = [] - for i in range(world_size): - out_buffer = buffer[i * max_size : (i + 1) * max_size] - (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist())) - if enc_size > 0: - result.append( - pickle.loads( - bytes(out_buffer[header_size : header_size + enc_size].tolist()) - ) - ) - return result - except pickle.UnpicklingError: - raise Exception( - "Unable to unpickle data from other workers. all_gather_list requires all " - "workers to enter the function together, so this error usually indicates " - "that the workers have fallen out of sync somehow. Workers can fall out of " - "sync if one of them runs out of memory, or if there are other conditions " - "in your training script that can cause one worker to finish an epoch " - "while other workers are still iterating over their portions of the data. " - "Try rerunning with --ddp-backend=no_c10d and see if that helps." - ) - - -def all_reduce_dict(data: Mapping[str, Any], device, group=None) -> Dict[str, Any]: - """ - AllReduce a dictionary of values across workers. We separately - reduce items that are already on the device and items on CPU for - better performance. - - Args: - data (Mapping[str, Any]): dictionary of data to all-reduce, but - cannot be a nested dictionary - device (torch.device): device for the reduction - group (optional): group of the collective - """ - data_keys = list(data.keys()) - - # We want to separately reduce items that are already on the - # device and items on CPU for performance reasons. 
- cpu_data = OrderedDict() - device_data = OrderedDict() - for k in data_keys: - t = data[k] - if not torch.is_tensor(t): - cpu_data[k] = torch.tensor(t, dtype=torch.double) - elif t.device.type != device.type: - cpu_data[k] = t.to(dtype=torch.double) - else: - device_data[k] = t.to(dtype=torch.double) - - def _all_reduce_dict(data: OrderedDict): - if len(data) == 0: - return data - buf = torch.cat([t.view(-1) for t in data.values()]).to(device=device) - all_reduce(buf, group=group) - split_buf = torch.split(buf, [t.numel() for t in data.values()]) - reduced_data = [t.view_as(orig) for t, orig in zip(split_buf, data.values())] - return OrderedDict(zip(data.keys(), reduced_data)) - - cpu_data = _all_reduce_dict(cpu_data) - device_data = _all_reduce_dict(device_data) - - def get_from_stack(key): - if key in cpu_data: - return cpu_data[key] - elif key in device_data: - return device_data[key] - raise KeyError - - return OrderedDict([(key, get_from_stack(key)) for key in data_keys]) diff --git a/fairseq/file_chunker_utils.py b/fairseq/file_chunker_utils.py new file mode 100644 index 0000000000..3f27549099 --- /dev/null +++ b/fairseq/file_chunker_utils.py @@ -0,0 +1,84 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import typing as tp + + +def _safe_readline(fd) -> str: + pos = fd.tell() + while True: + try: + return fd.readline() + except UnicodeDecodeError: + pos -= 1 + fd.seek(pos) # search where this character begins + + +def find_offsets(filename: str, num_chunks: int) -> tp.List[int]: + """ + given a file and a number of chuncks, find the offsets in the file + to be able to chunk around full lines. + """ + with open(filename, "r", encoding="utf-8") as f: + size = os.fstat(f.fileno()).st_size + chunk_size = size // num_chunks + offsets = [0 for _ in range(num_chunks + 1)] + for i in range(1, num_chunks): + f.seek(chunk_size * i) + _safe_readline(f) + offsets[i] = f.tell() + offsets[-1] = size + return offsets + + +class ChunkLineIterator: + """ + Iterator to properly iterate over lines of a file chunck. + """ + + def __init__(self, fd, start_offset: int, end_offset: int): + self._fd = fd + self._start_offset = start_offset + self._end_offset = end_offset + + def __iter__(self) -> tp.Iterable[str]: + self._fd.seek(self._start_offset) + # next(f) breaks f.tell(), hence readline() must be used + line = _safe_readline(self._fd) + while line: + pos = self._fd.tell() + # f.tell() does not always give the byte position in the file + # sometimes it skips to a very large number + # it is unlikely that through a normal read we go from + # end bytes to end + 2**32 bytes (4 GB) and this makes it unlikely + # that the procedure breaks by the undeterministic behavior of + # f.tell() + if ( + self._end_offset > 0 + and pos > self._end_offset + and pos < self._end_offset + 2**32 + ): + break + yield line + line = self._fd.readline() + + +class Chunker: + """ + contextmanager to read a chunck of a file line by line. 
+ """ + + def __init__(self, path: str, start_offset: int, end_offset: int): + self.path = path + self.start_offset = start_offset + self.end_offset = end_offset + + def __enter__(self) -> ChunkLineIterator: + self.fd = open(self.path, "r", encoding="utf-8") + return ChunkLineIterator(self.fd, self.start_offset, self.end_offset) + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.fd.close() diff --git a/fairseq/file_io.py b/fairseq/file_io.py index d667256922..8eca70a066 100644 --- a/fairseq/file_io.py +++ b/fairseq/file_io.py @@ -5,22 +5,38 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import logging import os import shutil from typing import List, Optional +logger = logging.getLogger(__file__) + + try: - from fvcore.common.file_io import PathManager as FVCorePathManager + from iopath.common.file_io import g_pathmgr as IOPathManager + + try: + # [FB only - for now] AWS PathHandler for PathManager + from .fb_pathhandlers import S3PathHandler + + IOPathManager.register_handler(S3PathHandler()) + except KeyError: + logging.warning("S3PathHandler already registered.") + except ImportError: + logging.debug( + "S3PathHandler couldn't be imported. Either missing fb-only files, or boto3 module." + ) except ImportError: - FVCorePathManager = None + IOPathManager = None class PathManager: """ Wrapper for insulating OSS I/O (using Python builtin operations) from - fvcore's PathManager abstraction (for transparently handling various + iopath's PathManager abstraction (for transparently handling various internal backends). """ @@ -33,8 +49,8 @@ def open( errors: Optional[str] = None, newline: Optional[str] = None, ): - if FVCorePathManager: - return FVCorePathManager.open( + if IOPathManager: + return IOPathManager.open( path=path, mode=mode, buffering=buffering, @@ -53,64 +69,128 @@ def open( @staticmethod def copy(src_path: str, dst_path: str, overwrite: bool = False) -> bool: - if FVCorePathManager: - return FVCorePathManager.copy( + if IOPathManager: + return IOPathManager.copy( src_path=src_path, dst_path=dst_path, overwrite=overwrite ) return shutil.copyfile(src_path, dst_path) @staticmethod def get_local_path(path: str, **kwargs) -> str: - if FVCorePathManager: - return FVCorePathManager.get_local_path(path, **kwargs) + if IOPathManager: + return IOPathManager.get_local_path(path, **kwargs) return path @staticmethod def exists(path: str) -> bool: - if FVCorePathManager: - return FVCorePathManager.exists(path) + if IOPathManager: + return IOPathManager.exists(path) return os.path.exists(path) @staticmethod def isfile(path: str) -> bool: - if FVCorePathManager: - return FVCorePathManager.isfile(path) + if IOPathManager: + return IOPathManager.isfile(path) return os.path.isfile(path) @staticmethod def ls(path: str) -> List[str]: - if FVCorePathManager: - return FVCorePathManager.ls(path) + if IOPathManager: + return IOPathManager.ls(path) return os.listdir(path) @staticmethod def mkdirs(path: str) -> None: - if FVCorePathManager: - return FVCorePathManager.mkdirs(path) + if IOPathManager: + return IOPathManager.mkdirs(path) os.makedirs(path, exist_ok=True) @staticmethod def rm(path: str) -> None: - if FVCorePathManager: - return FVCorePathManager.rm(path) + if IOPathManager: + return IOPathManager.rm(path) os.remove(path) @staticmethod def chmod(path: str, mode: int) -> None: - if "manifold" not in path: + if not PathManager.path_requires_pathmanager(path): os.chmod(path, mode) @staticmethod def 
register_handler(handler) -> None: - if FVCorePathManager: - return FVCorePathManager.register_handler(handler=handler) + if IOPathManager: + return IOPathManager.register_handler(handler=handler) @staticmethod def copy_from_local( local_path: str, dst_path: str, overwrite: bool = False, **kwargs ) -> None: - if FVCorePathManager: - return FVCorePathManager.copy_from_local( + if IOPathManager: + return IOPathManager.copy_from_local( local_path=local_path, dst_path=dst_path, overwrite=overwrite, **kwargs ) return shutil.copyfile(local_path, dst_path) + + @staticmethod + def path_requires_pathmanager(path: str) -> bool: + """Do we require PathManager to access given path?""" + if IOPathManager: + for p in IOPathManager._path_handlers.keys(): + if path.startswith(p): + return True + return False + + @staticmethod + def supports_rename(path: str) -> bool: + # PathManager doesn't yet support renames + return not PathManager.path_requires_pathmanager(path) + + @staticmethod + def rename(src: str, dst: str): + os.rename(src, dst) + + """ + ioPath async PathManager methods: + """ + + @staticmethod + def opena( + path: str, + mode: str = "r", + buffering: int = -1, + encoding: Optional[str] = None, + errors: Optional[str] = None, + newline: Optional[str] = None, + ): + """ + Return file descriptor with asynchronous write operations. + """ + global IOPathManager + if not IOPathManager: + logging.info("ioPath is initializing PathManager.") + try: + from iopath.common.file_io import PathManager + + IOPathManager = PathManager() + except Exception: + logging.exception("Failed to initialize ioPath PathManager object.") + return IOPathManager.opena( + path=path, + mode=mode, + buffering=buffering, + encoding=encoding, + errors=errors, + newline=newline, + ) + + @staticmethod + def async_close() -> bool: + """ + Wait for files to be written and clean up asynchronous PathManager. + NOTE: `PathManager.async_close()` must be called at the end of any + script that uses `PathManager.opena(...)`. + """ + global IOPathManager + if IOPathManager: + return IOPathManager.async_close() + return False diff --git a/fairseq/file_utils.py b/fairseq/file_utils.py index ec6de37f77..b99da2e8cd 100644 --- a/fairseq/file_utils.py +++ b/fairseq/file_utils.py @@ -139,6 +139,20 @@ def filename_to_url(filename, cache_dir=None): return url, etag +def cached_path_from_pm(url_or_filename): + """ + Tries to cache the specified URL using PathManager class. + Returns the cached path if success otherwise failure. + """ + try: + from fairseq.file_io import PathManager + + local_path = PathManager.get_local_path(url_or_filename) + return local_path + except Exception: + return None + + def cached_path(url_or_filename, cache_dir=None): """ Given something that might be a URL (or might be a local path), @@ -165,6 +179,9 @@ def cached_path(url_or_filename, cache_dir=None): # File, but it doesn't exist. 
raise EnvironmentError("file {} not found".format(url_or_filename)) else: + cached_path = cached_path_from_pm(url_or_filename) + if cached_path: + return cached_path # Something unknown raise ValueError( "unable to parse {} as a URL or as a local path".format(url_or_filename) diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 3be7078b7a..b0c2da15bf 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -11,11 +11,11 @@ from typing import Any, Dict, Iterator, List import torch -from fairseq import utils -from fairseq.data import encoders from omegaconf import open_dict from torch import nn +from fairseq import utils +from fairseq.data import encoders logger = logging.getLogger(__name__) @@ -60,6 +60,8 @@ def from_pretrained( "code": "bpe_codes", "bpecodes": "bpe_codes", "sentencepiece.bpe.model": "sentencepiece_model", + "merges.txt": "bpe_merges", + "vocab.json": "bpe_vocab", }.items(): path = os.path.join(model_path, file) if os.path.exists(path): @@ -68,10 +70,22 @@ def from_pretrained( if "user_dir" in kwargs: utils.import_user_module(argparse.Namespace(user_dir=kwargs["user_dir"])) - models, args, task = checkpoint_utils.load_model_ensemble_and_task( - [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep)], - arg_overrides=kwargs, - ) + model_path = [ + os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep) + ] + + if "is_vocoder" in kwargs: + args = {"data": kwargs["data"], "model_path": model_path} + task = None + models = None + else: + models, args, task = checkpoint_utils.load_model_ensemble_and_task( + model_path, + arg_overrides=kwargs, + ) + if "generation_args" in kwargs and kwargs["generation_args"]: + for key in kwargs["generation_args"]: + setattr(args["generation"], key, kwargs["generation_args"][key]) return { "args": args, @@ -130,11 +144,22 @@ def sample( batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs) return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos] - def score(self, sentences: List[str], **kwargs): + def score( + self, sentences: List[str], replace_newline_with_eos: bool = False, **kwargs + ): if isinstance(sentences, str): - return self.score([sentences], **kwargs)[0] + return self.score( + [sentences], replace_newline_with_eos=replace_newline_with_eos, **kwargs + )[0] + + def encode(sentence): + if replace_newline_with_eos: + return torch.cat([self.encode(line) for line in sentence.splitlines()]) + else: + return self.encode(sentence) + # NOTE: this doesn't support translation tasks currently - tokenized_sentences = [self.encode(sentence) for sentence in sentences] + tokenized_sentences = [encode(sentence) for sentence in sentences] return [ hypos[0] for hypos in self.generate( @@ -149,6 +174,7 @@ def generate( verbose: bool = False, skip_invalid_size_inputs=False, inference_step_args=None, + prefix_allowed_tokens_fn=None, **kwargs ) -> List[List[Dict[str, torch.Tensor]]]: if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1: @@ -157,12 +183,16 @@ def generate( )[0] # build generator using current args as well as any kwargs - gen_args = copy.copy(self.cfg) + gen_args = copy.deepcopy(self.cfg.generation) with open_dict(gen_args): gen_args.beam = beam for k, v in kwargs.items(): setattr(gen_args, k, v) - generator = self.task.build_generator(self.models, gen_args) + generator = self.task.build_generator( + self.models, + gen_args, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + ) inference_step_args = inference_step_args or {} 
results = [] @@ -180,7 +210,7 @@ def generate( if verbose: def getarg(name, default): - return getattr(gen_args, name, getattr(self.args, name, default)) + return getattr(gen_args, name, getattr(self.cfg, name, default)) for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs): src_str_with_unk = self.string(source_tokens) diff --git a/fairseq/iterative_refinement_generator.py b/fairseq/iterative_refinement_generator.py index 4fb0946f49..3d32c6bf4d 100644 --- a/fairseq/iterative_refinement_generator.py +++ b/fairseq/iterative_refinement_generator.py @@ -235,7 +235,7 @@ def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): terminated.fill_(1) # collect finalized sentences - finalized_idxs = sent_idxs[terminated] + finalized_idxs = sent_idxs[terminated.to(sent_idxs.device)] finalized_tokens = decoder_out.output_tokens[terminated] finalized_scores = decoder_out.output_scores[terminated] finalized_attn = ( @@ -285,7 +285,7 @@ def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): encoder_out = model.encoder.reorder_encoder_out( encoder_out, not_terminated.nonzero(as_tuple=False).squeeze() ) - sent_idxs = sent_idxs[not_terminated] + sent_idxs = sent_idxs[not_terminated.to(sent_idxs.device)] prev_output_tokens = prev_decoder_out.output_tokens.clone() if self.beam_size > 1: diff --git a/fairseq/logging/meters.py b/fairseq/logging/meters.py index 6793ef54e6..495bd08300 100644 --- a/fairseq/logging/meters.py +++ b/fairseq/logging/meters.py @@ -8,7 +8,6 @@ from collections import OrderedDict from typing import Dict, Optional - try: import torch @@ -18,7 +17,6 @@ def type_as(a, b): else: return a - except ImportError: torch = None @@ -109,6 +107,68 @@ def smoothed_value(self) -> float: return val +class SumMeter(Meter): + """Computes and stores the sum""" + + def __init__(self, round: Optional[int] = None): + self.round = round + self.reset() + + def reset(self): + self.sum = 0 # sum from all updates + + def update(self, val): + if val is not None: + self.sum = type_as(self.sum, val) + val + + def state_dict(self): + return { + "sum": self.sum, + "round": self.round, + } + + def load_state_dict(self, state_dict): + self.sum = state_dict["sum"] + self.round = state_dict.get("round", None) + + @property + def smoothed_value(self) -> float: + val = self.sum + if self.round is not None and val is not None: + val = safe_round(val, self.round) + return val + + +class ConcatTensorMeter(Meter): + """Concatenates tensors""" + + def __init__(self, dim=0): + super().__init__() + self.reset() + self.dim = dim + + def reset(self): + self.tensor = None + + def update(self, val): + if self.tensor is None: + self.tensor = val + else: + self.tensor = torch.cat([self.tensor, val], dim=self.dim) + + def state_dict(self): + return { + "tensor": self.tensor, + } + + def load_state_dict(self, state_dict): + self.tensor = state_dict["tensor"] + + @property + def smoothed_value(self) -> float: + return [] # return a dummy value + + class TimeMeter(Meter): """Computes the average occurrence of some event per second""" diff --git a/fairseq/logging/metrics.py b/fairseq/logging/metrics.py index 7b56e31592..49301f27f8 100644 --- a/fairseq/logging/metrics.py +++ b/fairseq/logging/metrics.py @@ -12,10 +12,9 @@ """ import contextlib -import time import uuid -from collections import OrderedDict, defaultdict -from typing import Callable, Dict, List, Optional +from collections import defaultdict +from typing import Callable, List, Optional from .meters import * @@ -132,6 +131,46 
@@ def log_scalar( agg[key].update(value, weight) +def log_scalar_sum( + key: str, + value: float, + priority: int = 10, + round: Optional[int] = None, +): + """Log a scalar value that is summed for reporting. + + Args: + key (str): name of the field to log + value (float): value to log + priority (int): smaller values are logged earlier in the output + round (Optional[int]): number of digits to round to when displaying + """ + for agg in get_active_aggregators(): + if key not in agg: + agg.add_meter(key, SumMeter(round=round), priority) + agg[key].update(value) + + +def log_concat_tensor( + key: str, + value: torch.Tensor, + priority: int = 10, + dim: int = 0, +): + """Log a scalar value that is summed for reporting. + + Args: + key (str): name of the field to log + value (float): value to log + priority (int): smaller values are logged earlier in the output + round (Optional[int]): number of digits to round to when displaying + """ + for agg in get_active_aggregators(): + if key not in agg: + agg.add_meter(key, ConcatTensorMeter(dim=dim), priority) + agg[key].update(value) + + def log_derived(key: str, fn: Callable[[MetersDict], float], priority: int = 20): """Log a scalar value derived from other meters. @@ -286,3 +325,12 @@ def load_state_dict(state_dict): for name, agg_state in state_dict.items(): _aggregators[name] = MetersDict() _aggregators[name].load_state_dict(agg_state) + + +def xla_metrics_report(): + try: + import torch_xla.debug.metrics as met + + print(met.metrics_report()) + except ImportError: + return diff --git a/fairseq/logging/progress_bar.py b/fairseq/logging/progress_bar.py index 63e5394815..4c64b61bad 100644 --- a/fairseq/logging/progress_bar.py +++ b/fairseq/logging/progress_bar.py @@ -21,7 +21,6 @@ from .meters import AverageMeter, StopwatchMeter, TimeMeter - logger = logging.getLogger(__name__) @@ -29,13 +28,24 @@ def progress_bar( iterator, log_format: Optional[str] = None, log_interval: int = 100, + log_file: Optional[str] = None, epoch: Optional[int] = None, prefix: Optional[str] = None, + aim_repo: Optional[str] = None, + aim_run_hash: Optional[str] = None, + aim_param_checkpoint_dir: Optional[str] = None, tensorboard_logdir: Optional[str] = None, default_log_format: str = "tqdm", + wandb_project: Optional[str] = None, + wandb_run_name: Optional[str] = None, + azureml_logging: Optional[bool] = False, ): if log_format is None: log_format = default_log_format + if log_file is not None: + handler = logging.FileHandler(filename=log_file) + logger.addHandler(handler) + if log_format == "tqdm" and not sys.stderr.isatty(): log_format = "simple" @@ -50,16 +60,31 @@ def progress_bar( else: raise ValueError("Unknown log format: {}".format(log_format)) + if aim_repo: + bar = AimProgressBarWrapper( + bar, + aim_repo=aim_repo, + aim_run_hash=aim_run_hash, + aim_param_checkpoint_dir=aim_param_checkpoint_dir, + ) + if tensorboard_logdir: try: # [FB only] custom wrapper for TensorBoard import palaas # noqa + from .fb_tbmf_wrapper import FbTbmfWrapper bar = FbTbmfWrapper(bar, log_interval) except ImportError: bar = TensorboardProgressBarWrapper(bar, tensorboard_logdir) + if wandb_project: + bar = WandBProgressBarWrapper(bar, wandb_project, run_name=wandb_run_name) + + if azureml_logging: + bar = AzureMLProgressBarWrapper(bar) + return bar @@ -114,7 +139,7 @@ def __init__(self, iterable, epoch=None, prefix=None): if epoch is not None: self.prefix += "epoch {:03d}".format(epoch) if prefix is not None: - self.prefix += " | {}".format(prefix) + self.prefix += (" | " if 
self.prefix != "" else "") + prefix def __len__(self): return len(self.iterable) @@ -136,6 +161,10 @@ def print(self, stats, tag=None, step=None): """Print end-of-epoch stats.""" raise NotImplementedError + def update_config(self, config): + """Log latest configuration.""" + pass + def _str_commas(self, stats): return ", ".join(key + "=" + stats[key].strip() for key in stats.keys()) @@ -292,11 +321,95 @@ def print(self, stats, tag=None, step=None): logger.info("{} | {}".format(self.prefix, postfix)) +try: + import functools + + from aim import Repo as AimRepo + + @functools.lru_cache() + def get_aim_run(repo, run_hash): + from aim import Run + + return Run(run_hash=run_hash, repo=repo) + +except ImportError: + get_aim_run = None + AimRepo = None + + +class AimProgressBarWrapper(BaseProgressBar): + """Log to Aim.""" + + def __init__(self, wrapped_bar, aim_repo, aim_run_hash, aim_param_checkpoint_dir): + self.wrapped_bar = wrapped_bar + + if get_aim_run is None: + self.run = None + logger.warning("Aim not found, please install with: pip install aim") + else: + logger.info(f"Storing logs at Aim repo: {aim_repo}") + + if not aim_run_hash: + # Find run based on save_dir parameter + query = f"run.checkpoint.save_dir == '{aim_param_checkpoint_dir}'" + try: + runs_generator = AimRepo(aim_repo).query_runs(query) + run = next(runs_generator.iter_runs()) + aim_run_hash = run.run.hash + except Exception: + pass + + if aim_run_hash: + logger.info(f"Appending to run: {aim_run_hash}") + + self.run = get_aim_run(aim_repo, aim_run_hash) + + def __iter__(self): + return iter(self.wrapped_bar) + + def log(self, stats, tag=None, step=None): + """Log intermediate stats to Aim.""" + self._log_to_aim(stats, tag, step) + self.wrapped_bar.log(stats, tag=tag, step=step) + + def print(self, stats, tag=None, step=None): + """Print end-of-epoch stats.""" + self._log_to_aim(stats, tag, step) + self.wrapped_bar.print(stats, tag=tag, step=step) + + def update_config(self, config): + """Log latest configuration.""" + if self.run is not None: + for key in config: + self.run.set(key, config[key], strict=False) + self.wrapped_bar.update_config(config) + + def _log_to_aim(self, stats, tag=None, step=None): + if self.run is None: + return + + if step is None: + step = stats["num_updates"] + + if "train" in tag: + context = {"tag": tag, "subset": "train"} + elif "val" in tag: + context = {"tag": tag, "subset": "val"} + else: + context = {"tag": tag} + + for key in stats.keys() - {"num_updates"}: + self.run.track(stats[key], name=key, step=step, context=context) + + try: _tensorboard_writers = {} - from tensorboardX import SummaryWriter + from torch.utils.tensorboard import SummaryWriter except ImportError: - SummaryWriter = None + try: + from tensorboardX import SummaryWriter + except ImportError: + SummaryWriter = None def _close_writers(): @@ -316,7 +429,7 @@ def __init__(self, wrapped_bar, tensorboard_logdir): if SummaryWriter is None: logger.warning( - "tensorboard not found, please install with: pip install tensorboardX" + "tensorboard not found, please install with: pip install tensorboard" ) def _writer(self, key): @@ -341,6 +454,11 @@ def print(self, stats, tag=None, step=None): self._log_to_tensorboard(stats, tag, step) self.wrapped_bar.print(stats, tag=tag, step=step) + def update_config(self, config): + """Log latest configuration.""" + # TODO add hparams to Tensorboard + self.wrapped_bar.update_config(config) + def _log_to_tensorboard(self, stats, tag=None, step=None): writer = self._writer(tag or "") if writer is 
None: @@ -352,4 +470,113 @@ def _log_to_tensorboard(self, stats, tag=None, step=None): writer.add_scalar(key, stats[key].val, step) elif isinstance(stats[key], Number): writer.add_scalar(key, stats[key], step) + elif torch.is_tensor(stats[key]) and stats[key].numel() == 1: + writer.add_scalar(key, stats[key].item(), step) writer.flush() + + +try: + import wandb +except ImportError: + wandb = None + + +class WandBProgressBarWrapper(BaseProgressBar): + """Log to Weights & Biases.""" + + def __init__(self, wrapped_bar, wandb_project, run_name=None): + self.wrapped_bar = wrapped_bar + if wandb is None: + logger.warning("wandb not found, pip install wandb") + return + + # reinit=False to ensure if wandb.init() is called multiple times + # within one process it still references the same run + wandb.init(project=wandb_project, reinit=False, name=run_name) + + def __iter__(self): + return iter(self.wrapped_bar) + + def log(self, stats, tag=None, step=None): + """Log intermediate stats to tensorboard.""" + self._log_to_wandb(stats, tag, step) + self.wrapped_bar.log(stats, tag=tag, step=step) + + def print(self, stats, tag=None, step=None): + """Print end-of-epoch stats.""" + self._log_to_wandb(stats, tag, step) + self.wrapped_bar.print(stats, tag=tag, step=step) + + def update_config(self, config): + """Log latest configuration.""" + if wandb is not None: + wandb.config.update(config) + self.wrapped_bar.update_config(config) + + def _log_to_wandb(self, stats, tag=None, step=None): + if wandb is None: + return + if step is None: + step = stats["num_updates"] + + prefix = "" if tag is None else tag + "/" + + for key in stats.keys() - {"num_updates"}: + if isinstance(stats[key], AverageMeter): + wandb.log({prefix + key: stats[key].val}, step=step) + elif isinstance(stats[key], Number): + wandb.log({prefix + key: stats[key]}, step=step) + + +try: + from azureml.core import Run +except ImportError: + Run = None + + +class AzureMLProgressBarWrapper(BaseProgressBar): + """Log to Azure ML""" + + def __init__(self, wrapped_bar): + self.wrapped_bar = wrapped_bar + if Run is None: + logger.warning("azureml.core not found, pip install azureml-core") + return + self.run = Run.get_context() + + def __exit__(self, *exc): + if Run is not None: + self.run.complete() + return False + + def __iter__(self): + return iter(self.wrapped_bar) + + def log(self, stats, tag=None, step=None): + """Log intermediate stats to AzureML""" + self._log_to_azureml(stats, tag, step) + self.wrapped_bar.log(stats, tag=tag, step=step) + + def print(self, stats, tag=None, step=None): + """Print end-of-epoch stats""" + self._log_to_azureml(stats, tag, step) + self.wrapped_bar.print(stats, tag=tag, step=step) + + def update_config(self, config): + """Log latest configuration.""" + self.wrapped_bar.update_config(config) + + def _log_to_azureml(self, stats, tag=None, step=None): + if Run is None: + return + if step is None: + step = stats["num_updates"] + + prefix = "" if tag is None else tag + "/" + + for key in stats.keys() - {"num_updates"}: + name = prefix + key + if isinstance(stats[key], AverageMeter): + self.run.log_row(name=name, **{"step": step, key: stats[key].val}) + elif isinstance(stats[key], Number): + self.run.log_row(name=name, **{"step": step, key: stats[key]}) diff --git a/fairseq/model_parallel/criterions/__init__.py b/fairseq/model_parallel/criterions/__init__.py index 6239b50362..5fae7bd4c2 100644 --- a/fairseq/model_parallel/criterions/__init__.py +++ b/fairseq/model_parallel/criterions/__init__.py @@ -8,7 +8,7 @@ # 
automatically import any Python files in the criterions/ directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): module = file[: file.find(".py")] importlib.import_module("fairseq.model_parallel.criterions." + module) diff --git a/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py b/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py index 35c50ee152..5ffbaa8764 100644 --- a/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py +++ b/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py @@ -5,7 +5,8 @@ import math -from fairseq import metrics, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion diff --git a/fairseq/model_parallel/megatron_trainer.py b/fairseq/model_parallel/megatron_trainer.py index 258551c933..aedf608bce 100644 --- a/fairseq/model_parallel/megatron_trainer.py +++ b/fairseq/model_parallel/megatron_trainer.py @@ -7,18 +7,16 @@ Train a network across multiple GPUs. """ -from fairseq import distributed_utils +from fairseq.dataclass.configs import FairseqConfig +from fairseq.distributed import utils as distributed_utils from fairseq.trainer import Trainer -from omegaconf import DictConfig - try: from fairseq.model_parallel.megatron.mpu import ( - get_data_parallel_group, get_data_parallel_rank, get_data_parallel_world_size, - get_model_parallel_group, get_model_parallel_src_rank, + get_cuda_rng_tracker, ) has_megatron_submodule = True @@ -29,7 +27,7 @@ class MegatronTrainer(Trainer): """Main class for model parallel with data parallel training.""" - def __init__(self, cfg: DictConfig, task, model, criterion, **kwargs): + def __init__(self, cfg: FairseqConfig, task, model, criterion, **kwargs): if not has_megatron_submodule: raise ImportError( "\n\nPlease install the megatron submodule:" @@ -38,30 +36,40 @@ def __init__(self, cfg: DictConfig, task, model, criterion, **kwargs): ) super().__init__(cfg, task, model, criterion, **kwargs) - @property - def data_parallel_world_size(self): - return get_data_parallel_world_size() - - @property - def data_parallel_process_group(self): - return get_data_parallel_group() - - @property - def data_parallel_rank(self): - return get_data_parallel_rank() - - @property - def is_data_parallel_master(self): - return get_model_parallel_src_rank() == 0 - def clip_grad_norm(self, clip_norm): def _aggregate_model_parallel_grad_norm(total_norm): - total_norm = total_norm ** 2 - distributed_utils.all_reduce(total_norm, group=get_model_parallel_group()) - total_norm = total_norm ** 0.5 + total_norm = total_norm**2 + distributed_utils.all_reduce( + total_norm, group=distributed_utils.get_model_parallel_group() + ) + total_norm = total_norm**0.5 return total_norm return self.optimizer.clip_grad_norm( clip_norm, aggregate_norm_fn=_aggregate_model_parallel_grad_norm, ) + + def save_checkpoint(self, filename, extra_state): + """Save all training state in a checkpoint file.""" + extra_state["rng_tracker_states"] = get_cuda_rng_tracker().get_states() + super().save_checkpoint(filename, extra_state) + + def load_checkpoint( + self, + filename, + reset_optimizer=False, + reset_lr_scheduler=False, + optimizer_overrides=None, + reset_meters=False, + ): + extra_state = super().load_checkpoint( + filename, + reset_optimizer=reset_optimizer, + reset_lr_scheduler=reset_lr_scheduler, + 
optimizer_overrides=optimizer_overrides, + reset_meters=reset_meters, + ) + if extra_state is not None and "rng_tracker_states" in extra_state: + get_cuda_rng_tracker().set_states(extra_state["rng_tracker_states"]) + return extra_state diff --git a/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py b/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py index eb81ded341..85dbd44b3c 100644 --- a/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py +++ b/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py @@ -9,6 +9,7 @@ import torch import torch.nn as nn import torch.nn.functional as F + from fairseq import options, utils from fairseq.modules import ( AdaptiveSoftmax, @@ -17,7 +18,6 @@ PositionalEmbedding, ) - EncoderOut = namedtuple( "TransformerEncoderOut", [ @@ -30,7 +30,7 @@ class TransformerEncoderEmbedding(nn.Module): - """ Encoder Embedding + Positional Embedding """ + """Encoder Embedding + Positional Embedding""" def __init__(self, args, embed_tokens): super().__init__() @@ -109,7 +109,7 @@ def forward(self, input): class TransformerDecoderEmbedding(nn.Module): - """ Decoder Embedding + Positional Embedding """ + """Decoder Embedding + Positional Embedding""" def __init__(self, args, embed_tokens): super().__init__() @@ -242,7 +242,7 @@ def __init__(self, args, embed_tokens, dictionary): torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_( - self.embed_tokens, mean=0, std=self.output_embed_dim ** -0.5 + self.embed_tokens, mean=0, std=self.output_embed_dim**-0.5 ) if args.decoder_normalize_before and not getattr( @@ -587,7 +587,7 @@ def make_generation_fast_(self, need_attn=False, **kwargs): def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) nn.init.constant_(m.weight[padding_idx], 0) return m diff --git a/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py index 76cfe3b0b4..7873ac6791 100644 --- a/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py +++ b/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py @@ -39,15 +39,52 @@ DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 +TORCH_PIPE = False +RPC_INIT = False + + +def import_pipe(): + global TORCH_PIPE + global RPC_INIT + try: + from torch.distributed.pipeline.sync import Pipe # noqa + + global Pipe + from torch.distributed.pipeline.sync.utils import partition_model + + global partition_model + from torch.distributed import rpc + import tempfile + + TORCH_PIPE = True + # Initialize single process RPC agent since TORCH_PIPE requires + # RRef. RRef depends on RPC being initialized and as a result we initialize + # RPC with a single node. 
+ tmpfile = tempfile.NamedTemporaryFile() + if not RPC_INIT: + rpc.init_rpc( + name="worker", + rank=0, + world_size=1, + rpc_backend_options=rpc.TensorPipeRpcBackendOptions( + init_method="file://{}".format(tmpfile.name), + ), + ) + RPC_INIT = True + logger.info("Using torch pipe") + except ImportError: + try: + from fairscale.nn import Pipe # noqa + + logger.info("Using fairscale pipe") + except ImportError: + raise ImportError("Please install fairscale with: pip install fairscale") @register_model("pipeline_parallel_transformer") class PipelineParallelTransformerModel(BaseFairseqModel): def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint): - try: - from fairscale.nn import Pipe - except ImportError: - raise ImportError("Please install fairscale with: pip install fairscale") + import_pipe() super().__init__() assert isinstance(encoder, FairseqEncoder) assert isinstance(decoder, FairseqDecoder) @@ -65,13 +102,20 @@ def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint): self.num_decoder_modules = len(decoder_module_list) module_list = encoder_module_list + decoder_module_list self.devices = devices - self.model = Pipe( - nn.Sequential(*module_list), - balance=balance, - devices=devices, - chunks=chunks, - checkpoint=checkpoint, - ) + if TORCH_PIPE: + self.model = Pipe( + partition_model(nn.Sequential(*module_list), balance, devices), + chunks=chunks, + checkpoint=checkpoint, + ) + else: + self.model = Pipe( + nn.Sequential(*module_list), + balance=balance, + devices=devices, + chunks=chunks, + checkpoint=checkpoint, + ) self.encoder_max_positions = self.max_positions_helper( encoder.embedding_layer, "max_source_positions" ) @@ -87,7 +131,10 @@ def forward(self, src_tokens, src_lengths, prev_output_tokens): if self.training: input_lst = [src_tokens, src_lengths, prev_output_tokens] input = tuple(i.to(self.devices[0], non_blocking=True) for i in input_lst) - return self.model(input) + if TORCH_PIPE: + return self.model(input).local_value() + else: + return self.model(input) else: assert self.encoder is not None and self.decoder is not None, ( "encoder and decoder need to be initialized by " @@ -111,9 +158,14 @@ def prepare_for_inference_(self, cfg): decoder_module_list.append(module) module_count += 1 self.model = None - self.encoder = TransformerEncoder(cfg.model, None, None, encoder_module_list) + self.encoder = TransformerEncoder( + cfg.distributed_training, None, None, encoder_module_list + ) self.decoder = TransformerDecoder( - cfg.model, None, None, decoder_module_list=decoder_module_list + cfg.distributed_training, + None, + None, + decoder_module_list=decoder_module_list, ) @staticmethod @@ -320,7 +372,7 @@ def max_decoder_positions(self): """Maximum length supported by the decoder.""" return self.decoder_max_positions - def load_state_dict(self, state_dict, strict=True, cfg=None): + def load_state_dict(self, state_dict, strict=True, model_cfg=None): """Copies parameters and buffers from *state_dict* into this module and its descendants. 
@@ -389,7 +441,6 @@ def convert_to_pipeline_parallel_state_dict(self, state_dict): # fmt: off if isinstance(module, TransformerEncoderEmbedding): new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['encoder.embed_tokens.weight'] - new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['encoder.embed_positions._float_tensor'] if isinstance(module, TransformerEncoderLayer): for suffix in encoder_key_suffixes: new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'encoder.layers.{encoder_layer_idx}.{suffix}'] @@ -404,7 +455,6 @@ def convert_to_pipeline_parallel_state_dict(self, state_dict): new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.bias'] = state_dict['encoder.layer_norm.bias'] if isinstance(module, TransformerDecoderEmbedding): new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['decoder.embed_tokens.weight'] - new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['decoder.embed_positions._float_tensor'] if isinstance(module, TransformerDecoderOutputLayer): new_state_dict[f'model.partitions.{pid}.{mid}.output_projection.weight'] = state_dict['decoder.output_projection.weight'] # fmt: on @@ -425,21 +475,19 @@ class TransformerEncoder(FairseqEncoder): def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) - try: - from fairscale.nn import Pipe - except ImportError: - raise ImportError("Please install fairscale with: pip install fairscale") - if encoder_module_list is None: - embedding_layer = TransformerEncoderEmbedding(args, embed_tokens) - layers = [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] + import_pipe() + self.use_pipeline = encoder_module_list is not None + if not self.use_pipeline: + self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens) + self.encoder_layers = nn.Sequential( + *[TransformerEncoderLayer(args) for i in range(args.encoder_layers)] + ) if isinstance(embed_tokens, nn.ModuleList): emb_dim = sum(e.embedding_dim for e in embed_tokens) else: emb_dim = embed_tokens.embedding_dim - final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim) - encoder_module_list = [embedding_layer] + layers + [final_layer_norm] - self.use_pipeline = getattr(args, "pipeline_encoder_balance", None) is not None - if self.use_pipeline: + self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim) + else: encoder_balance = utils.eval_str_list( args.pipeline_encoder_balance, type=int ) @@ -450,17 +498,24 @@ def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None): f"Sum of encoder_balance={encoder_balance} is not equal " + f"to num_encoder_modules={len(encoder_module_list)}" ) - self.model = Pipe( - module=nn.Sequential(*encoder_module_list), - balance=encoder_balance, - devices=encoder_devices, - chunks=args.pipeline_chunks, - checkpoint=args.pipeline_checkpoint, - ) - else: - self.embedding_layer = encoder_module_list[0] - self.encoder_layers = nn.Sequential(*encoder_module_list[1:-1]) - self.final_layer_norm = encoder_module_list[-1] + if TORCH_PIPE: + self.model = Pipe( + module=partition_model( + nn.Sequential(*encoder_module_list), + encoder_balance, + encoder_devices, + ), + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) + else: + self.model = Pipe( + module=nn.Sequential(*encoder_module_list), + balance=encoder_balance, + 
devices=encoder_devices, + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) def forward(self, src_tokens, src_lengths): """ @@ -490,7 +545,10 @@ def forward(self, src_tokens, src_lengths): input_tuple = (src_tokens, src_lengths, dummy_prev_output_tokens) if self.use_pipeline: input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple) - encoder_out = self.model(input_tuple) + if TORCH_PIPE: + encoder_out = self.model(input_tuple).local_value() + else: + encoder_out = self.model(input_tuple) else: encoder_embed_output_tuple = self.embedding_layer(input_tuple) encoder_layers_output = self.encoder_layers(encoder_embed_output_tuple) @@ -566,22 +624,20 @@ def __init__( ): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) - try: - from fairscale.nn import Pipe - except ImportError: - raise ImportError("Please install fairscale with: pip install fairscale") - if decoder_module_list is None: - embedding_layer = TransformerDecoderEmbedding(args, embed_tokens) - layers = [ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(args.decoder_layers) - ] - decoder_output_layer = TransformerDecoderOutputLayer( + import_pipe() + self.use_pipeline = decoder_module_list is not None + if not self.use_pipeline: + self.embedding_layer = TransformerDecoderEmbedding(args, embed_tokens) + self.decoder_layers = nn.Sequential( + *[ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(args.decoder_layers) + ] + ) + self.decoder_output_layer = TransformerDecoderOutputLayer( args, embed_tokens, dictionary ) - decoder_module_list = [embedding_layer] + layers + [decoder_output_layer] - self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None - if self.use_pipeline: + else: decoder_balance = utils.eval_str_list( args.pipeline_decoder_balance, type=int ) @@ -592,17 +648,24 @@ def __init__( f"Sum of decoder_balance={decoder_balance} is not equal " + f"to num_decoder_modules={len(decoder_module_list)}" ) - self.model = Pipe( - module=nn.Sequential(*decoder_module_list), - balance=decoder_balance, - devices=decoder_devices, - chunks=args.pipeline_chunks, - checkpoint=args.pipeline_checkpoint, - ) - else: - self.embedding_layer = decoder_module_list[0] - self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1]) - self.decoder_output_layer = decoder_module_list[-1] + if TORCH_PIPE: + self.model = Pipe( + module=partition_model( + nn.Sequential(*decoder_module_list), + decoder_balance, + decoder_devices, + ), + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) + else: + self.model = Pipe( + module=nn.Sequential(*decoder_module_list), + balance=decoder_balance, + devices=decoder_devices, + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) def forward( self, @@ -632,7 +695,10 @@ def forward( ) if self.use_pipeline: input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple) - return (self.model(input_tuple),) + if TORCH_PIPE: + return (self.model(input_tuple).local_value(),) + else: + return (self.model(input_tuple),) else: embed_layer_output = self.embedding_layer(input_tuple) state = self.decoder_layers(embed_layer_output) @@ -673,14 +739,6 @@ def buffered_future_mask(self, tensor): def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" - if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): - weights_key = "{}.embed_positions.weights".format(name) - if weights_key in 
state_dict: - del state_dict[weights_key] - state_dict[ - "{}.embed_positions._float_tensor".format(name) - ] = torch.FloatTensor(1) - for i in range(len(self.layers)): # update layer norms layer_norm_map = { diff --git a/fairseq/model_parallel/models/roberta/model.py b/fairseq/model_parallel/models/roberta/model.py index 68ad88d2a5..77a80ef720 100644 --- a/fairseq/model_parallel/models/roberta/model.py +++ b/fairseq/model_parallel/models/roberta/model.py @@ -12,16 +12,15 @@ import torch.nn as nn import torch.nn.functional as F from fairseq import utils -from fairseq.model_parallel.modules import ModelParallelTransformerSentenceEncoder -from fairseq.models import FairseqEncoder, register_model, register_model_architecture +from fairseq.model_parallel.models.transformer import ModelParallelTransformerEncoder +from fairseq.models import register_model, register_model_architecture from fairseq.models.roberta import ( - RobertaClassificationHead, + roberta_base_architecture, + roberta_prenorm_architecture, RobertaEncoder, - RobertaLMHead, RobertaModel, ) -from fairseq.modules import LayerNorm, TransformerSentenceEncoder -from fairseq.modules.transformer_sentence_encoder import init_bert_params +from fairseq.modules import LayerNorm try: @@ -29,7 +28,7 @@ copy_to_model_parallel_region, gather_from_model_parallel_region, ColumnParallelLinear, - RowParallelLinear, + VocabParallelEmbedding, ) has_megatron_submodule = True @@ -48,7 +47,15 @@ def __init__(self, args, encoder): @staticmethod def add_args(parser): - super(ModelParallelRobertaModel, ModelParallelRobertaModel).add_args(parser) + RobertaModel.add_args(parser) + parser.add_argument( + "--no-final-layer-norm", + action="store_true", + help=( + "don't add final layernorm (only applicable when " + "--encoder-normalize-before=True" + ), + ) @classmethod def build_model(cls, args, task): @@ -165,121 +172,52 @@ def forward(self, features, **kwargs): return x -class ModelParallelRobertaEncoder(FairseqEncoder): - """RoBERTa encoder. - - Implements the :class:`~fairseq.models.FairseqDecoder` interface required - by :class:`~fairseq.models.FairseqLanguageModel`. - """ +class ModelParallelRobertaEncoder(RobertaEncoder): + """RoBERTa encoder.""" def __init__(self, args, dictionary): - super().__init__(dictionary) - self.args = args - - # RoBERTa is a sentence encoder model, so users will intuitively trim - # encoder layers. However, the implementation uses the fairseq decoder, - # so we fix here. 
- if args.encoder_layers_to_keep: - args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) - args.decoder_layers_to_keep = args.encoder_layers_to_keep - args.encoder_layers_to_keep = None - - self.sentence_encoder = ModelParallelTransformerSentenceEncoder( - padding_idx=dictionary.pad(), - vocab_size=len(dictionary), - num_encoder_layers=args.encoder_layers, - embedding_dim=args.encoder_embed_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - layerdrop=args.encoder_layerdrop, - max_seq_len=args.max_positions, - num_segments=0, - encoder_normalize_before=False, - apply_bert_init=False, - activation_fn=args.activation_fn, - ) - self.lm_head = ModelParallelRobertaLMHead( - embed_dim=args.encoder_embed_dim, - output_dim=len(dictionary), - activation_fn=args.activation_fn, - weight=self.sentence_encoder.embed_tokens.weight, - ) - - def forward( - self, - src_tokens, - features_only=False, - return_all_hiddens=False, - masked_tokens=None, - **unused - ): - """ - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - features_only (bool, optional): skip LM head and just return - features. If True, the output will be of shape - `(batch, src_len, embed_dim)`. - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - - Returns: - tuple: - - the LM output of shape `(batch, src_len, vocab)` - - a dictionary of additional data, where 'inner_states' - is a list of hidden states. Note that the hidden - states have shape `(src_len, batch, vocab)`. - """ - x, extra = self.extract_features( - src_tokens, return_all_hiddens=return_all_hiddens - ) - if not features_only: - x = self.output_layer(x, masked_tokens=masked_tokens) - return x, extra + super().__init__(args, dictionary) + assert not self.args.untie_weights_roberta - def extract_features(self, src_tokens, return_all_hiddens=False, **unused): - inner_states, _ = self.sentence_encoder( - src_tokens, - last_state_only=not return_all_hiddens, - ) - features = inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C - return features, {"inner_states": inner_states if return_all_hiddens else None} + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx) - def output_layer(self, features, masked_tokens=None, **unused): - return self.lm_head(features, masked_tokens) + def build_encoder(self, args, dictionary, embed_tokens): + return ModelParallelTransformerEncoder(args, dictionary, embed_tokens) - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.args.max_positions + def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): + return ModelParallelRobertaLMHead(embed_dim, output_dim, activation_fn, weight) @register_model_architecture("model_parallel_roberta", "model_parallel_roberta") def base_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + args.no_final_layer_norm = getattr(args, "no_final_layer_norm", False) + # model parallel RoBERTa defaults to "Pre-LN" formulation + roberta_prenorm_architecture(args) - 
args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) +# earlier versions of model parallel RoBERTa removed the final layer norm +@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_v1") +def model_parallel_roberta_v1_architecture(args): + args.no_final_layer_norm = getattr(args, "no_final_layer_norm", True) + base_architecture(args) + + +@register_model_architecture( + "model_parallel_roberta", "model_parallel_roberta_postnorm" +) +def model_parallel_roberta_postnorm_architecture(args): + # the original BERT/RoBERTa uses the "Post-LN" formulation + roberta_base_architecture(args) @register_model_architecture("model_parallel_roberta", "model_parallel_roberta_base") -def roberta_base_architecture(args): +def model_parallel_roberta_base_architecture(args): base_architecture(args) @register_model_architecture("model_parallel_roberta", "model_parallel_roberta_large") -def roberta_large_architecture(args): +def model_parallel_roberta_large_architecture(args): args.encoder_layers = getattr(args, "encoder_layers", 24) args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) diff --git a/fairseq/model_parallel/models/transformer.py b/fairseq/model_parallel/models/transformer.py index 4f34645226..cf3b2e8baf 100644 --- a/fairseq/model_parallel/models/transformer.py +++ b/fairseq/model_parallel/models/transformer.py @@ -6,7 +6,7 @@ import logging import torch.nn as nn -import torch.nn.functional as F + from fairseq.model_parallel.modules import ( ModelParallelTransformerDecoderLayer, ModelParallelTransformerEncoderLayer, @@ -18,12 +18,11 @@ TransformerModel, ) - try: from fairseq.model_parallel.megatron.mpu import ( + VocabParallelEmbedding, copy_to_model_parallel_region, gather_from_model_parallel_region, - VocabParallelEmbedding, ) has_megatron_submodule = True @@ -53,7 +52,7 @@ def build_embedding(cls, args, dictionary, embed_dim, path=None): padding_idx = dictionary.pad() def _vocab_init(tensor, **kwargs): - nn.init.normal_(tensor, mean=0, std=num_embeddings ** -0.5) + nn.init.normal_(tensor, mean=0, std=num_embeddings**-0.5) nn.init.constant_(tensor[1], 0) emb = VocabParallelEmbedding( @@ -86,6 +85,12 @@ class ModelParallelTransformerEncoder(TransformerEncoder): is a :class:`ModelParallelTransformerEncoderLayer`. """ + def __init__(self, args, dictionary, embed_tokens): + super().__init__(args, dictionary, embed_tokens) + + if args.no_final_layer_norm: + self.layer_norm = None + def build_encoder_layer(self, args): return ModelParallelTransformerEncoderLayer(args) diff --git a/fairseq/model_parallel/models/transformer_lm.py b/fairseq/model_parallel/models/transformer_lm.py index dc52f6e8dd..03e4dbe263 100644 --- a/fairseq/model_parallel/models/transformer_lm.py +++ b/fairseq/model_parallel/models/transformer_lm.py @@ -4,11 +4,11 @@ # LICENSE file in the root directory of this source tree. 
import torch.nn as nn + from fairseq.model_parallel.models.transformer import ModelParallelTransformerDecoder from fairseq.models import register_model, register_model_architecture from fairseq.models.transformer_lm import TransformerLanguageModel - try: from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding @@ -22,7 +22,6 @@ @register_model("model_parallel_transformer_lm") class ModelParallelTransformerLanguageModel(TransformerLanguageModel): - @staticmethod def add_args(parser): TransformerLanguageModel.add_args(parser) @@ -72,14 +71,10 @@ def build_model(cls, args, task): ) return cls(decoder) - @staticmethod - def add_args(parser): - TransformerLanguageModel.add_args(parser) - @classmethod def build_embedding(cls, args, dictionary, embed_dim, path=None): def _vocab_init(tensor, **kwargs): - nn.init.normal_(tensor, mean=0, std=embed_dim ** -0.5) + nn.init.normal_(tensor, mean=0, std=embed_dim**-0.5) nn.init.constant_(tensor[1], 0) embed_tokens = VocabParallelEmbedding( diff --git a/fairseq/model_parallel/modules/__init__.py b/fairseq/model_parallel/modules/__init__.py index fb45b3c9e0..11603217a1 100644 --- a/fairseq/model_parallel/modules/__init__.py +++ b/fairseq/model_parallel/modules/__init__.py @@ -9,15 +9,9 @@ ModelParallelTransformerEncoderLayer, ModelParallelTransformerDecoderLayer, ) -from .transformer_sentence_encoder_layer import ( - ModelParallelTransformerSentenceEncoderLayer, -) -from .transformer_sentence_encoder import ModelParallelTransformerSentenceEncoder __all__ = [ "ModelParallelMultiheadAttention", "ModelParallelTransformerEncoderLayer", "ModelParallelTransformerDecoderLayer", - "ModelParallelTransformerSentenceEncoder", - "ModelParallelTransformerSentenceEncoderLayer", ] diff --git a/fairseq/model_parallel/modules/multihead_attention.py b/fairseq/model_parallel/modules/multihead_attention.py index 4164bf9131..bbea450950 100644 --- a/fairseq/model_parallel/modules/multihead_attention.py +++ b/fairseq/model_parallel/modules/multihead_attention.py @@ -7,18 +7,18 @@ import torch import torch.nn.functional as F +from torch import Tensor, nn + from fairseq import utils from fairseq.incremental_decoding_utils import with_incremental_state from fairseq.modules.fairseq_dropout import FairseqDropout -from torch import Tensor, nn - try: from fairseq.model_parallel.megatron.mpu import ( - get_cuda_rng_tracker, - get_model_parallel_world_size, ColumnParallelLinear, RowParallelLinear, + get_cuda_rng_tracker, + get_model_parallel_world_size, ) has_megatron_submodule = True @@ -71,7 +71,7 @@ def __init__( assert ( self.head_dim * num_heads == self.embed_dim ), "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention @@ -93,11 +93,6 @@ def __init__( embed_dim, embed_dim, bias=bias, input_is_parallel=True ) - self.tpu = False - - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - def forward( self, query, @@ -123,6 +118,8 @@ def forward( assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] + is_tpu = query.device.type == "xla" + if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) if saved_state is not None and "prev_key" in saved_state: @@ -250,7 +247,7 @@ def forward( attn_weights = attn_weights.view( bsz, self.num_heads_partition, tgt_len, src_len ) - if not self.tpu: + if not is_tpu: attn_weights = attn_weights.masked_fill( 
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf"), diff --git a/fairseq/model_parallel/modules/transformer_sentence_encoder.py b/fairseq/model_parallel/modules/transformer_sentence_encoder.py deleted file mode 100644 index a5d50a33c6..0000000000 --- a/fairseq/model_parallel/modules/transformer_sentence_encoder.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import random -from typing import Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq.model_parallel.modules import ModelParallelTransformerSentenceEncoderLayer -from fairseq.modules import ( - LayerNorm, - MultiheadAttention, - PositionalEmbedding, - TransformerSentenceEncoder, -) - - -try: - from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -class ModelParallelTransformerSentenceEncoder(TransformerSentenceEncoder): - """ - Implementation for a Model Parallel Bi-directional Transformer based - Sentence Encoder used in BERT/XLM style pre-trained models. - """ - - def build_embedding(self, vocab_size, embedding_dim, padding_idx): - return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx) - - def build_transformer_sentence_encoder_layer( - self, - embedding_dim, - ffn_embedding_dim, - num_attention_heads, - dropout, - attention_dropout, - activation_dropout, - activation_fn, - export, - **unused, - ): - return ModelParallelTransformerSentenceEncoderLayer( - embedding_dim=embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - ) diff --git a/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py b/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py deleted file mode 100644 index e10bf52332..0000000000 --- a/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.model_parallel.modules import ModelParallelMultiheadAttention -from fairseq.modules import TransformerSentenceEncoderLayer - - -try: - from fairseq.model_parallel.megatron.mpu import ( - ColumnParallelLinear, - RowParallelLinear, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -class ModelParallelTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): - """ - Implements a Model Parallel Transformer Encoder Layer used in - BERT/XLM style pre-trained models. 
- """ - - def build_fc1(self, input_dim, output_dim, **unused): - return ColumnParallelLinear(input_dim, output_dim, gather_output=False) - - def build_fc2(self, input_dim, output_dim, **unused): - return RowParallelLinear(input_dim, output_dim, input_is_parallel=True) - - def build_self_attention( - self, - embed_dim, - num_attention_heads, - dropout, - **kwargs, - ): - return ModelParallelMultiheadAttention( - embed_dim, num_attention_heads, dropout=dropout, self_attention=True - ) - - def forward( - self, - x: torch.Tensor, - self_attn_mask: torch.Tensor = None, - self_attn_padding_mask: torch.Tensor = None, - ): - """ - LayerNorm is applied either before or after the self-attention/ffn - modules similar to the original Transformer imlementation. - """ - residual = x - x = self.self_attn_layer_norm(x) - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = self.dropout_module(x) - x = residual + x - - residual = x - x = self.final_layer_norm(x) - x = self.activation_fn(self.fc1(x)) - x = self.activation_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = residual + x - return x, None diff --git a/fairseq/models/__init__.py b/fairseq/models/__init__.py index 3b4fd51d6c..11cf6ee530 100644 --- a/fairseq/models/__init__.py +++ b/fairseq/models/__init__.py @@ -8,9 +8,12 @@ import importlib import os -import fairseq +from contextlib import ExitStack + from fairseq.dataclass import FairseqDataclass -from omegaconf import DictConfig, OmegaConf +from fairseq.dataclass.utils import merge_with_parent +from hydra.core.config_store import ConfigStore +from omegaconf import open_dict, OmegaConf from .composite_encoder import CompositeEncoder from .distributed_fairseq_model import DistributedFairseqModel @@ -50,10 +53,57 @@ ] -def build_model(cfg: DictConfig, task): - if isinstance(cfg, DictConfig): - return ARCH_MODEL_REGISTRY[cfg._name].build_model(cfg, task) - return ARCH_MODEL_REGISTRY[cfg.arch].build_model(cfg, task) +def build_model(cfg: FairseqDataclass, task, from_checkpoint=False): + + model = None + model_type = getattr(cfg, "_name", None) or getattr(cfg, "arch", None) + + if not model_type and len(cfg) == 1: + # this is hit if config object is nested in directory that is named after model type + + model_type = next(iter(cfg)) + if model_type in MODEL_DATACLASS_REGISTRY: + cfg = cfg[model_type] + else: + raise Exception( + "Could not infer model type from directory. Please add _name field to indicate model type. " + "Available models: " + + str(MODEL_DATACLASS_REGISTRY.keys()) + + " Requested model type: " + + model_type + ) + + if model_type in ARCH_MODEL_REGISTRY: + # case 1: legacy models + model = ARCH_MODEL_REGISTRY[model_type] + elif model_type in MODEL_DATACLASS_REGISTRY: + # case 2: config-driven models + model = MODEL_REGISTRY[model_type] + + if model_type in MODEL_DATACLASS_REGISTRY: + # set defaults from dataclass. note that arch name and model name can be the same + dc = MODEL_DATACLASS_REGISTRY[model_type] + + if isinstance(cfg, argparse.Namespace): + cfg = dc.from_namespace(cfg) + else: + cfg = merge_with_parent(dc(), cfg, from_checkpoint) + else: + if model_type in ARCH_CONFIG_REGISTRY: + with open_dict(cfg) if OmegaConf.is_config(cfg) else ExitStack(): + # this calls the different "arch" functions (like base_architecture()) that you indicate + # if you specify --arch on the command line. 
this is only applicable to the old argparse based models + # hydra models should expose different architectures via different config files + # it will modify the cfg object and default parameters according to the arch + ARCH_CONFIG_REGISTRY[model_type](cfg) + + assert model is not None, ( + f"Could not infer model type from {cfg}. " + "Available models: {}".format(MODEL_DATACLASS_REGISTRY.keys()) + + f" Requested model type: {model_type}" + ) + + return model.build_model(cfg, task) def register_model(name, dataclass=None): @@ -78,7 +128,8 @@ class LSTM(FairseqEncoderDecoderModel): def register_model_cls(cls): if name in MODEL_REGISTRY: - raise ValueError("Cannot register duplicate model ({})".format(name)) + return MODEL_REGISTRY[name] + if not issubclass(cls, BaseFairseqModel): raise ValueError( "Model ({}: {}) must extend BaseFairseqModel".format(name, cls.__name__) @@ -92,6 +143,16 @@ def register_model_cls(cls): cls.__dataclass = dataclass if dataclass is not None: MODEL_DATACLASS_REGISTRY[name] = dataclass + + cs = ConfigStore.instance() + node = dataclass() + node._name = name + cs.store(name=name, group="model", node=node, provider="fairseq") + + @register_model_architecture(name, name) + def noop(_): + pass + return cls return register_model_cls @@ -121,15 +182,6 @@ def lstm_luong_wmt_en_de(cfg): arch_name (str): the name of the model architecture (``--arch``) """ - def arch_override_from_yaml(args, arch): - root_dir = os.path.dirname(os.path.dirname(fairseq.__file__)) - yaml_path = os.path.join(root_dir, "config/model/{}.yaml".format(arch)) - if not os.path.exists(yaml_path): - raise RuntimeError(f"yaml file {yaml_path} does not exist!") - arch_cfg = OmegaConf.load(yaml_path) - for k, v in arch_cfg.items(): - setattr(args, k, getattr(args, k, v)) - def register_model_arch_fn(fn): if model_name not in MODEL_REGISTRY: raise ValueError( @@ -148,39 +200,37 @@ def register_model_arch_fn(fn): ARCH_MODEL_REGISTRY[arch_name] = MODEL_REGISTRY[model_name] ARCH_MODEL_NAME_REGISTRY[arch_name] = model_name ARCH_MODEL_INV_REGISTRY.setdefault(model_name, []).append(arch_name) - if type(fn) is type and issubclass(fn, BaseFairseqModel): - # for model classes migrated with hydra - # in this case, we are using this decorator directly on model class since - # we do not need arch overriding functions. - ARCH_CONFIG_REGISTRY[arch_name] = lambda args: arch_override_from_yaml( - args, arch=arch_name - ) - else: - ARCH_CONFIG_REGISTRY[arch_name] = fn + ARCH_CONFIG_REGISTRY[arch_name] = fn return fn return register_model_arch_fn +def import_models(models_dir, namespace): + for file in os.listdir(models_dir): + path = os.path.join(models_dir, file) + if ( + not file.startswith("_") + and not file.startswith(".") + and (file.endswith(".py") or os.path.isdir(path)) + ): + model_name = file[: file.find(".py")] if file.endswith(".py") else file + importlib.import_module(namespace + "." 
+ model_name) + + # extra `model_parser` for sphinx + if model_name in MODEL_REGISTRY: + parser = argparse.ArgumentParser(add_help=False) + group_archs = parser.add_argument_group("Named architectures") + group_archs.add_argument( + "--arch", choices=ARCH_MODEL_INV_REGISTRY[model_name] + ) + group_args = parser.add_argument_group( + "Additional command-line arguments" + ) + MODEL_REGISTRY[model_name].add_args(group_args) + globals()[model_name + "_parser"] = parser + + # automatically import any Python files in the models/ directory models_dir = os.path.dirname(__file__) -for file in os.listdir(models_dir): - path = os.path.join(models_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - model_name = file[: file.find(".py")] if file.endswith(".py") else file - module = importlib.import_module("fairseq.models." + model_name) - - # extra `model_parser` for sphinx - if model_name in MODEL_REGISTRY: - parser = argparse.ArgumentParser(add_help=False) - group_archs = parser.add_argument_group("Named architectures") - group_archs.add_argument( - "--arch", choices=ARCH_MODEL_INV_REGISTRY[model_name] - ) - group_args = parser.add_argument_group("Additional command-line arguments") - MODEL_REGISTRY[model_name].add_args(group_args) - globals()[model_name + "_parser"] = parser +import_models(models_dir, "fairseq.models") diff --git a/fairseq/models/bart/hub_interface.py b/fairseq/models/bart/hub_interface.py index 819ea8eeda..6b647c9642 100644 --- a/fairseq/models/bart/hub_interface.py +++ b/fairseq/models/bart/hub_interface.py @@ -23,7 +23,7 @@ class BARTHubInterface(GeneratorHubInterface): """A simple PyTorch Hub interface to BART. - Usage: https://github.com/pytorch/fairseq/tree/master/examples/bart + Usage: https://github.com/pytorch/fairseq/tree/main/examples/bart """ def __init__(self, cfg, task, model): @@ -69,7 +69,7 @@ def decode(self, tokens: torch.LongTensor): tokens = tokens[1:] # remove <s> eos_mask = tokens == self.task.source_dictionary.eos() doc_mask = eos_mask[1:] & eos_mask[:-1] - sentences = np.split(tokens, doc_mask.nonzero(as_tuple=False)[0] + 1) + sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) sentences = [ self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences ] @@ -92,22 +92,29 @@ def generate( tokenized_sentences: List[torch.LongTensor], *args, inference_step_args=None, + skip_invalid_size_inputs=False, **kwargs ) -> List[List[Dict[str, torch.Tensor]]]: inference_step_args = inference_step_args or {} if "prefix_tokens" in inference_step_args: raise NotImplementedError("prefix generation not implemented for BART") - else: - bsz = len(tokenized_sentences) - inference_step_args["prefix_tokens"] = tokenized_sentences[0].new_full( - (bsz, 1), fill_value=self.task.source_dictionary.bos() + res = [] + for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs): + src_tokens = batch["net_input"]["src_tokens"] + inference_step_args["prefix_tokens"] = src_tokens.new_full( + (src_tokens.size(0), 1), fill_value=self.task.source_dictionary.bos() ).to(device=self.device) - return super().generate( - tokenized_sentences, - *args, - inference_step_args=inference_step_args, - **kwargs - ) + results = super().generate( + src_tokens, + *args, + inference_step_args=inference_step_args, + skip_invalid_size_inputs=skip_invalid_size_inputs, + **kwargs + ) + for id, hypos in zip(batch["id"].tolist(), results): + res.append((id, hypos)) + res = [hypos for _, hypos in 
sorted(res, key=lambda x: x[0])] + return res def extract_features( self, tokens: torch.LongTensor, return_all_hiddens: bool = False @@ -165,37 +172,40 @@ def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = Fal def fill_mask( self, - masked_input: str, + masked_inputs: List[str], topk: int = 5, match_source_len: bool = True, **generate_kwargs ): - masked_token = '<mask>' - assert masked_token in masked_input, \ - "please add one {} token for the input".format(masked_token) - - text_spans = masked_input.split(masked_token) - text_spans_bpe = (' {0} '.format(masked_token)).join( - [self.bpe.encode(text_span.rstrip()) for text_span in text_spans] - ).strip() - tokens = self.task.source_dictionary.encode_line( - '<s> ' + text_spans_bpe + ' </s>', - append_eos=False, - add_if_not_exist=False, - ).long() - - if tokens.dim() == 1: - tokens = tokens.unsqueeze(0) + masked_token = "<mask>" + batch_tokens = [] + for masked_input in masked_inputs: + assert ( + masked_token in masked_input + ), "please add one {} token for the input".format(masked_token) + + text_spans = masked_input.split(masked_token) + text_spans_bpe = ( + (" {0} ".format(masked_token)) + .join([self.bpe.encode(text_span.rstrip()) for text_span in text_spans]) + .strip() + ) + tokens = self.task.source_dictionary.encode_line( + "<s> " + text_spans_bpe + " </s>", + append_eos=False, + add_if_not_exist=False, + ).long() + batch_tokens.append(tokens) # ensure beam size is at least as big as topk - generate_kwargs['beam'] = max( + generate_kwargs["beam"] = max( topk, - generate_kwargs.get('beam', -1), + generate_kwargs.get("beam", -1), ) - generate_kwargs['match_source_len'] = match_source_len - hypos = self.generate(tokens, **generate_kwargs)[0] + generate_kwargs["match_source_len"] = match_source_len + batch_hypos = self.generate(batch_tokens, **generate_kwargs) return [ - (self.decode(hypo['tokens']), hypo['score']) - for hypo in hypos[:topk] + [(self.decode(hypo["tokens"]), hypo["score"]) for hypo in hypos[:topk]] + for hypos in batch_hypos ] diff --git a/fairseq/models/bart/model.py b/fairseq/models/bart/model.py index 7263a78dc2..e3670c0a2c 100644 --- a/fairseq/models/bart/model.py +++ b/fairseq/models/bart/model.py @@ -6,11 +6,12 @@ BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension """ - import logging +from typing import Optional import torch import torch.nn as nn + from fairseq import utils from fairseq.models import register_model, register_model_architecture from fairseq.models.transformer import TransformerModel @@ -18,12 +19,13 @@ from .hub_interface import BARTHubInterface - logger = logging.getLogger(__name__) @register_model("bart") class BARTModel(TransformerModel): + __jit_unused_properties__ = ["supported_targets"] + @classmethod def hub_models(cls): return { @@ -41,6 +43,8 @@ def __init__(self, args, encoder, decoder): self.apply(init_bert_params) self.classification_heads = nn.ModuleDict() + if hasattr(self.encoder, "dictionary"): + self.eos: int = self.encoder.dictionary.eos() @staticmethod def add_args(parser): @@ -71,10 +75,12 @@ def forward( src_tokens, src_lengths, prev_output_tokens, - features_only=False, - classification_head_name=None, - token_embeddings=None, - **kwargs, + features_only: bool = False, + classification_head_name: Optional[str] = None, + token_embeddings: Optional[torch.Tensor] = None, + return_all_hiddens: bool = True, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, ): 
if classification_head_name is not None: features_only = True @@ -83,22 +89,27 @@ def forward( src_tokens, src_lengths=src_lengths, token_embeddings=token_embeddings, - **kwargs, + return_all_hiddens=return_all_hiddens, ) x, extra = self.decoder( prev_output_tokens, encoder_out=encoder_out, features_only=features_only, - **kwargs, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + src_lengths=src_lengths, + return_all_hiddens=return_all_hiddens, ) - + eos: int = self.eos if classification_head_name is not None: - sentence_representation = x[ - src_tokens.eq(self.encoder.dictionary.eos()), : - ].view(x.size(0), -1, x.size(-1))[:, -1, :] - x = self.classification_heads[classification_head_name]( - sentence_representation - ) + sentence_representation = x[src_tokens.eq(eos), :].view( + x.size(0), -1, x.size(-1) + )[:, -1, :] + for k, head in self.classification_heads.items(): + # for torch script only supports iteration + if k == classification_head_name: + x = head(sentence_representation) + break return x, extra @classmethod @@ -108,6 +119,7 @@ def from_pretrained( checkpoint_file="model.pt", data_name_or_path=".", bpe="gpt2", + sample_break_mode="eos", **kwargs, ): from fairseq import hub_utils @@ -119,6 +131,7 @@ def from_pretrained( archive_map=cls.hub_models(), bpe=bpe, load_checkpoint_heads=True, + sample_break_mode=sample_break_mode, **kwargs, ) return BARTHubInterface(x["args"], x["task"], x["models"][0]) @@ -233,7 +246,7 @@ def truncate_emb(key): embed_dim = state_dict["encoder.embed_tokens.weight"].size(1) new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim) - nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim ** -0.5) + nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim**-0.5) new_lang_embed_to_add = new_lang_embed_to_add.to( dtype=state_dict["encoder.embed_tokens.weight"].dtype, ) @@ -263,9 +276,20 @@ def truncate_emb(key): cur_state = self.classification_heads.state_dict() for k, v in cur_state.items(): if prefix + "classification_heads." + k not in state_dict: - logger.info("Overwriting", prefix + "classification_heads." + k) + logger.info("Overwriting " + prefix + "classification_heads." + k) state_dict[prefix + "classification_heads." + k] = v + def set_beam_size(self, beam): + """Set beam size for efficient beamable enc-dec attention.""" + beamable = False + for layer in self.decoder.layers: + if layer.encoder_attn is not None: + if hasattr(layer.encoder_attn, "set_beam_size"): + layer.encoder_attn.set_beam_size(beam) + beamable = True + if beamable: + self.encoder.reorder_encoder_out = self.encoder._reorder_encoder_out + class BARTClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" diff --git a/fairseq/models/distributed_fairseq_model.py b/fairseq/models/distributed_fairseq_model.py index ece10c6333..fd76bcd4bf 100644 --- a/fairseq/models/distributed_fairseq_model.py +++ b/fairseq/models/distributed_fairseq_model.py @@ -3,20 +3,36 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
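The DistributedFairseqModel rewrite that follows wraps the model in standard DDP classes and then in ModuleProxyWrapper to "forward missing getattr and state_dict/load_state_dict to orig model", replacing the ad-hoc _DistributedFairseqModel.__getattr__ override removed further down. A minimal sketch of that attribute-forwarding idea (illustrative only, not the fairseq implementation):

import torch.nn as nn

class AttrForwardingWrapper(nn.Module):
    """Toy stand-in for the proxy idea: unknown attributes resolve on the wrapped module."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

    def __getattr__(self, name):
        try:
            # parameters/buffers/submodules registered on the wrapper itself
            return super().__getattr__(name)
        except AttributeError:
            # fall back to the wrapped module (the real ModuleProxyWrapper also
            # forwards state_dict/load_state_dict explicitly)
            return getattr(super().__getattr__("module"), name)

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 2)
        self.custom_flag = True

wrapped = AttrForwardingWrapper(Toy())
print(wrapped.custom_flag)  # True, found on the wrapped Toy module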
-import inspect +import logging +import os +import signal +import threading +import torch import torch.nn as nn -from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel +from torch.nn.parallel import DistributedDataParallel +from fairseq.distributed import ( + DistributedTimeoutWrapper, + LegacyDistributedDataParallel, + ModuleProxyWrapper, + TPUDistributedDataParallel, +) -_GOSSIP_DISABLED = False +logger = logging.getLogger(__name__) + + +_SLOWMO_DDP_DISABLED = False try: - import gossip + from fairscale.experimental.nn.data_parallel import ( + SlowMoBaseAlgorithm, + SlowMoDistributedDataParallel, + ) except ImportError: - _GOSSIP_DISABLED = True + _SLOWMO_DDP_DISABLED = True -def DistributedFairseqModel(args, model, process_group=None): +def DistributedFairseqModel(args, model, process_group, device): """ Wrap a *model* to support distributed data parallel training. @@ -28,39 +44,59 @@ def DistributedFairseqModel(args, model, process_group=None): Args: args (argparse.Namespace): fairseq args model (BaseFairseqModel): model to wrap + process_group: the c10d process group to be used for distributed data + parallel all-reduction. + device: device to move model to """ - # determine which DDP class to extend assert isinstance(model, nn.Module) - if args.distributed_wrapper == "DDP" and args.ddp_backend == "c10d": - ddp_class = nn.parallel.DistributedDataParallel - init_kwargs = dict( - module=model, + if args.tpu: + wrapped_model = TPUDistributedDataParallel( + module=model.to(device), + process_group=process_group, + ) + # forward missing getattr and state_dict/load_state_dict to orig model + wrapped_model = ModuleProxyWrapper(wrapped_model) + elif args.ddp_backend in {"c10d", "pytorch_ddp"}: + wrapped_model = DistributedDataParallel( + module=model.to(device), device_ids=[args.device_id], output_device=args.device_id, broadcast_buffers=args.broadcast_buffers, bucket_cap_mb=args.bucket_cap_mb, process_group=process_group, + find_unused_parameters=args.find_unused_parameters, + gradient_as_bucket_view=args.gradient_as_bucket_view, ) - # Maintain backward compatibility - if "check_reduction" in inspect.getargspec(ddp_class)[0]: - init_kwargs["check_reduction"] = True - if "find_unused_parameters" in inspect.getargspec(ddp_class)[0]: - init_kwargs["find_unused_parameters"] = args.find_unused_parameters - elif args.distributed_wrapper == "DDP" and args.ddp_backend == "no_c10d": - ddp_class = LegacyDistributedDataParallel - init_kwargs = dict( - module=model, - world_size=args.distributed_world_size, - buffer_size=2 ** 28, + if args.ddp_comm_hook == "fp16": + logger.info("enable fp16 communication hook in DDP") + try: + from torch.distributed.algorithms.ddp_comm_hooks import ( + DDPCommHookType, + register_ddp_comm_hook, + ) + except: + logger.error( + "Could not import from torch.distributed.algorithms.ddp_comm_hooks; you may need to update your pytorch version" + ) + raise + + register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, wrapped_model) + # forward missing getattr and state_dict/load_state_dict to orig model + wrapped_model = ModuleProxyWrapper(wrapped_model) + elif args.ddp_backend in {"no_c10d", "legacy_ddp"}: + wrapped_model = LegacyDistributedDataParallel( + module=model.to(device), + buffer_size=2**28, process_group=process_group, ) - elif args.distributed_wrapper == "SlowMo": - if _GOSSIP_DISABLED: + # forward missing getattr and state_dict/load_state_dict to orig model + wrapped_model = ModuleProxyWrapper(wrapped_model) + elif args.ddp_backend == 
"slowmo": + if _SLOWMO_DDP_DISABLED: raise ImportError( - "Cannot find gossip library. Please install from: " - "github.com/facebookresearch/stochastic_gradient_push" + "Cannot find SlowMoDistributedDataParallel. " + "Please install fairscale with: pip install fairscale" ) - ddp_class = gossip.GossipDataParallel # The values of slowmo_momentum below were obtained by tuning on the # En-De 16 dataset by training the transformer_wmt_en_de_large model @@ -73,31 +109,39 @@ def DistributedFairseqModel(args, model, process_group=None): args.slowmo_momentum = 0.5 else: args.slowmo_momentum = 0.6 + slowmo_base_algorithm = SlowMoBaseAlgorithm[args.slowmo_base_algorithm.upper()] - init_kwargs = dict( - module=model, - device_ids=[args.device_id], - output_device=args.device_id, + wrapped_model = SlowMoDistributedDataParallel( + module=model.to(device), broadcast_buffers=args.broadcast_buffers, nprocs_per_node=args.nprocs_per_node, slowmo_momentum=args.slowmo_momentum, - localsgd=(args.slowmo_algorithm == "LocalSGD"), + slowmo_base_algorithm=slowmo_base_algorithm, localsgd_frequency=args.localsgd_frequency, ) + # forward missing getattr and state_dict/load_state_dict to orig model + wrapped_model = ModuleProxyWrapper(wrapped_model) + elif args.ddp_backend == "fully_sharded": + try: + from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP + except ImportError: + raise ImportError( + "Cannot find FullyShardedDataParallel. " + "Please install fairscale with: pip install fairscale" + ) + assert isinstance(model, FSDP), "expected model to already be wrapped in FSDP" + wrapped_model = model + if args.memory_efficient_fp16: + wrapped_model = wrapped_model.half() + if not args.cpu_offload: + wrapped_model = wrapped_model.to(device=device) else: raise ValueError("Unknown --ddp-backend: " + args.ddp_backend) - class _DistributedFairseqModel(ddp_class): - """Extend DistributedDataParallel to check for missing - attributes in the wrapped module.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __getattr__(self, name): - wrapped_module = super().__getattr__("module") - if hasattr(wrapped_module, name): - return getattr(wrapped_module, name) - return super().__getattr__(name) + # kill hung distributed jobs after a timeout + if getattr(args, "heartbeat_timeout", -1) > 0: + wrapped_model = DistributedTimeoutWrapper( + wrapped_model, timeout=getattr(args, "heartbeat_timeout", -1) + ) - return _DistributedFairseqModel(**init_kwargs) + return wrapped_model diff --git a/fairseq/models/ema/__init__.py b/fairseq/models/ema/__init__.py new file mode 100644 index 0000000000..503ceaa609 --- /dev/null +++ b/fairseq/models/ema/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +import os + +from .ema import EMA + + +def build_ema(model, cfg, device): + return EMA(model, cfg, device) + + +# automatically import any Python files in the models/ema/ directory +for file in sorted(os.listdir(os.path.dirname(__file__))): + if file.endswith(".py") and not file.startswith("_"): + file_name = file[: file.find(".py")] + importlib.import_module("fairseq.models.ema." 
+ file_name) diff --git a/fairseq/models/ema/ema.py b/fairseq/models/ema/ema.py new file mode 100644 index 0000000000..472d5d5f17 --- /dev/null +++ b/fairseq/models/ema/ema.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 + +""" +This module has the EMA class used to store a copy of the exponentially decayed +model params. + +Typical usage of EMA class involves initializing an object using an existing +model (random or from a seed model) and setting the config like ema_decay, +ema_start_update which determine how the EMA model is updated. After every +update of the model i.e. at the end of the train_step, the EMA should be updated +by passing the new model to the EMA.step function. The EMA model state dict +can be stored in the extra state under the key of "ema" and dumped +into a checkpoint and loaded. The EMA object can be passed to tasks +by setting task.uses_ema property. +EMA is a smoothed/ensemble model which might have better performance +when used for inference or further fine-tuning. EMA class has a +reverse function to load the EMA params into a model and use it +like a regular model. + +This implementation is used for trainer-level ema tracking. For EMA tracking +inside the model, please use fairseq/modules/ema_module.py instead. +""" + +import copy +import logging + +import torch + +from fairseq import checkpoint_utils + + +class EMA(object): + """Exponential Moving Average of Fairseq Models + EMA keeps a copy of the exponentially decayed model params. + The set of params should include both gradient-descent and + non-gradient descent params, such as batch mean/var and buffers. + This is a modified implementation of + the open source code in https://github.com/zhawe01/fairseq-gec.git, + and internal source code in + fbcode/mobile-vision/projects/classification_pytorch/lib/utils/model_ema.py. + + Similar to TF EMA. + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage. + EMA provides a averaged and smoothed set of model weights, and has been shown to + improve vision models. EMA class does all necessary functions to update, reload, + or init EMA methods. + + EMA object is initialized from an arbitrary model. By default, it is stored in + the same device (unless device specified at initialization) and with the + same precision as the model (unless ema_fp32 is True). ema_fp32 is recommended. + This stores the EMA parameters in fp32 only for the EMA update step, and + is used at the default precision otherwise. + EMA is usually enabled using EMAConfig with store_ema=True. Some important + parameters to configure EMA are + 1) ema_decay - The decay of EMA + 2) ema_update_freq - EMA is updated every this many model updates. + 3) ema_start_update - Start EMA update after this many model updates [default 0] + + Key methods: + 1) step - One update of EMA using new model + 2) restore - Update EMA from a state dict + 3) reverse - Load EMA into a model + 4) get_decay, _set_decay - Used to get or set the decay. Note _set_decay is + called from step. + 5) build_fp32_params - Used to initialize or update the fp32 copy of EMA params. + Note this is enabled only when ema_fp32=True + """ + + def __init__(self, model, config, device=None, skip_keys=None): + """ + @param model model to initialize the EMA with + @param config EMAConfig object with configuration like + ema_decay, ema_update_freq, ema_fp32 + @param device If provided, copy EMA to this device (e.g. gpu). + Otherwise EMA is in the same device as the model. 
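The decay settings documented above reduce to the standard exponential moving average update that _step_internal applies further down, ema <- decay * ema + (1 - decay) * param. A tiny self-contained sketch of that rule:

import torch

def ema_update(ema_params, model_params, decay: float):
    # In-place EMA over matching tensors: ema <- decay * ema + (1 - decay) * param
    for key, param in model_params.items():
        ema = ema_params[key]
        ema.mul_(decay)
        ema.add_(param.to(dtype=ema.dtype), alpha=1.0 - decay)

model_p = {"w": torch.ones(3)}
ema_p = {"w": torch.zeros(3)}
ema_update(ema_p, model_p, decay=0.999)
print(ema_p["w"])  # tensor([0.0010, 0.0010, 0.0010])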
+ """ + + self.decay = config.ema_decay + self.model = copy.deepcopy(model) + self.model.requires_grad_(False) + self.config = config + self.skip_keys = skip_keys or set() + self.fp32_params = {} + + if self.config.ema_seed_model is not None: + state = checkpoint_utils.load_ema_from_checkpoint( + self.config.ema_seed_model + ) + self.model.load_state_dict(state["model"], strict=True) + + if device is not None: + logging.info(f"Copying EMA model to device {device}") + self.model = self.model.to(device=device) + + if self.config.ema_fp32: + self.build_fp32_params() + + self.update_freq_counter = 0 + + def get_model(self): + return self.model + + def build_fp32_params(self, state_dict=None): + """ + Store a copy of the EMA params in fp32. + If state dict is passed, the EMA params is copied from + the provided state dict. Otherwise, it is copied from the + current EMA model parameters. + """ + if not self.config.ema_fp32: + raise RuntimeError( + "build_fp32_params should not be called if ema_fp32=False. " + "Use ema_fp32=True if this is really intended." + ) + + if state_dict is None: + state_dict = self.model.state_dict() + + def _to_float(t): + return t.float() if torch.is_floating_point(t) else t + + for param_key in state_dict: + if param_key in self.fp32_params: + self.fp32_params[param_key].copy_(state_dict[param_key]) + else: + self.fp32_params[param_key] = _to_float(state_dict[param_key]) + + def restore(self, state_dict, build_fp32_params=False): + """Load data from a model spec into EMA model""" + self.model.load_state_dict(state_dict, strict=False) + if build_fp32_params: + self.build_fp32_params(state_dict) + + def _set_decay(self, decay): + self.decay = decay + + def get_decay(self): + return self.decay + + def _step_internal(self, new_model, updates=None): + """One update of the EMA model based on new model weights""" + decay = self.decay + + ema_state_dict = {} + ema_params = ( + self.fp32_params if self.config.ema_fp32 else self.model.state_dict() + ) + for key, param in new_model.state_dict().items(): + if isinstance(param, dict): + continue + try: + ema_param = ema_params[key] + except KeyError: + ema_param = ( + param.float().clone() if param.ndim == 1 else copy.deepcopy(param) + ) + + if param.shape != ema_param.shape: + raise ValueError( + "incompatible tensor shapes between model param and ema param" + + "{} vs. {}".format(param.shape, ema_param.shape) + ) + + if "version" in key: + # Do not decay a model.version pytorch param + continue + + if key in self.skip_keys: + ema_param = param.to(dtype=ema_param.dtype).clone() + else: + ema_param.mul_(decay) + ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay) + ema_state_dict[key] = ema_param + self.restore(ema_state_dict, build_fp32_params=False) + + def step(self, new_model, updates=None): + """ + One update of EMA which is done every self.config.ema_update_freq + updates of the model. + + @param updates The current number of model updates done. + Decay is set of 0 if model updates < ema_start_update, which means + the model will be simply copied over to the EMA. + When model updates >= ema_start_updates, then EMA is updated with + a decay of self.config.ema_decay. 
+ """ + if updates is not None: + self._set_decay( + 0 if updates < self.config.ema_start_update else self.config.ema_decay + ) + if self.config.ema_update_freq > 1: + self.update_freq_counter += 1 + if self.update_freq_counter >= self.config.ema_update_freq: + self._step_internal(new_model, updates) + self.update_freq_counter = 0 + else: + self._step_internal(new_model, updates) + + def reverse(self, model): + """ + Load the model parameters from EMA model. + Useful for inference or fine-tuning from the EMA model. + """ + d = self.model.state_dict() + if "_ema" in d: + del d["_ema"] + + model.load_state_dict(d, strict=False) + return model diff --git a/fairseq/models/fairseq_decoder.py b/fairseq/models/fairseq_decoder.py index fb6c52dc7f..13b73d639e 100644 --- a/fairseq/models/fairseq_decoder.py +++ b/fairseq/models/fairseq_decoder.py @@ -17,6 +17,7 @@ def __init__(self, dictionary): super().__init__() self.dictionary = dictionary self.onnx_trace = False + self.adaptive_softmax = None def forward(self, prev_output_tokens, encoder_out=None, **kwargs): """ @@ -62,6 +63,19 @@ def get_normalized_probs( sample: Optional[Dict[str, Tensor]] = None, ): """Get normalized probabilities (or log probs) from a net's output.""" + return self.get_normalized_probs_scriptable(net_output, log_probs, sample) + + # TorchScript doesn't support super() method so that the scriptable Subclass + # can't access the base class model in Torchscript. + # Current workaround is to add a helper function with different name and + # call the helper function from scriptable Subclass. + def get_normalized_probs_scriptable( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: if sample is not None: @@ -82,8 +96,8 @@ def max_positions(self): """Maximum input length supported by the decoder.""" return 1e6 # an arbitrary large number - def upgrade_state_dict(self, state_dict): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade old state dicts to work with newer code.""" return state_dict def prepare_for_onnx_export_(self): diff --git a/fairseq/models/fairseq_encoder.py b/fairseq/models/fairseq_encoder.py index c8873daa28..08cbde15a4 100644 --- a/fairseq/models/fairseq_encoder.py +++ b/fairseq/models/fairseq_encoder.py @@ -78,8 +78,8 @@ def max_positions(self): """Maximum input length supported by the encoder.""" return 1e6 # an arbitrary large number - def upgrade_state_dict(self, state_dict): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade old state dicts to work with newer code.""" return state_dict def set_num_updates(self, num_updates): diff --git a/fairseq/models/fairseq_model.py b/fairseq/models/fairseq_model.py index 15c2c4ab2e..65ead9dcf2 100644 --- a/fairseq/models/fairseq_model.py +++ b/fairseq/models/fairseq_model.py @@ -14,7 +14,6 @@ import torch.nn as nn import torch.nn.functional as F from fairseq import utils -from fairseq.checkpoint_utils import prune_state_dict from fairseq.data import Dictionary from fairseq.dataclass.utils import ( convert_namespace_to_omegaconf, @@ -28,6 +27,15 @@ logger = logging.getLogger(__name__) +def check_type(module, expected_type): + if hasattr(module, 
"unwrapped_module"): + assert isinstance( + module.unwrapped_module, expected_type + ), f"{type(module.unwrapped_module)} != {expected_type}" + else: + assert isinstance(module, expected_type), f"{type(module)} != {expected_type}" + + class BaseFairseqModel(nn.Module): """Base class for fairseq models.""" @@ -40,7 +48,8 @@ def add_args(cls, parser): """Add model-specific arguments to the parser.""" dc = getattr(cls, "__dataclass", None) if dc is not None: - gen_parser_from_dataclass(parser, dc()) + # do not set defaults so that settings defaults from various architectures still works + gen_parser_from_dataclass(parser, dc(), delete_default=True) @classmethod def build_model(cls, args, task): @@ -106,10 +115,15 @@ def load_state_dict( """ if model_cfg is None and args is not None: - logger.warn("using 'args' is deprecated, please update your code to use dataclass config") + logger.warn( + "using 'args' is deprecated, please update your code to use dataclass config" + ) model_cfg = convert_namespace_to_omegaconf(args).model self.upgrade_state_dict(state_dict) + + from fairseq.checkpoint_utils import prune_state_dict + new_state_dict = prune_state_dict(state_dict, model_cfg) return super().load_state_dict(new_state_dict, strict) @@ -142,12 +156,14 @@ def do_upgrade(m, prefix): def set_num_updates(self, num_updates): """State from trainer to pass along to model at every update.""" - - def _apply(m): + for m in self.modules(): if hasattr(m, "set_num_updates") and m != self: m.set_num_updates(num_updates) - self.apply(_apply) + def set_epoch(self, epoch): + for m in self.modules(): + if hasattr(m, "set_epoch") and m != self: + m.set_epoch(epoch) def prepare_for_inference_(self, cfg: DictConfig): """Prepare model for inference.""" @@ -176,7 +192,7 @@ def make_generation_fast_(self, **kwargs): def apply_remove_weight_norm(module): try: nn.utils.remove_weight_norm(module) - except ValueError: # this module didn't have weight norm + except (AttributeError, ValueError): # this module didn't have weight norm return self.apply(apply_remove_weight_norm) @@ -222,21 +238,6 @@ def apply_prepare_for_onnx_export_(module): self.apply(apply_prepare_for_onnx_export_) - def prepare_for_tpu_(self, **kwargs): - """Optionally modify model for use on TPUs.""" - seen = set() - - def apply_prepare_for_tpu_(module): - if ( - module != self - and hasattr(module, "prepare_for_tpu_") - and module not in seen - ): - seen.add(module) - module.prepare_for_tpu_(**kwargs) - - self.apply(apply_prepare_for_tpu_) - @classmethod def from_pretrained( cls, @@ -296,8 +297,9 @@ def __init__(self, encoder, decoder): self.encoder = encoder self.decoder = decoder - assert isinstance(self.encoder, FairseqEncoder) - assert isinstance(self.decoder, FairseqDecoder) + + check_type(self.encoder, FairseqEncoder) + check_type(self.decoder, FairseqDecoder) def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): """ @@ -377,8 +379,8 @@ def __init__(self, encoders, decoders): assert encoders.keys() == decoders.keys() self.keys = list(encoders.keys()) for key in self.keys: - assert isinstance(encoders[key], FairseqEncoder) - assert isinstance(decoders[key], FairseqDecoder) + check_type(encoders[key], FairseqEncoder) + check_type(decoders[key], FairseqDecoder) self.models = nn.ModuleDict( { @@ -460,10 +462,15 @@ def load_state_dict( """ if model_cfg is None and args is not None: - logger.warn("using 'args' is deprecated, please update your code to use dataclass config") + logger.warn( + "using 'args' is deprecated, please update 
your code to use dataclass config" + ) model_cfg = convert_namespace_to_omegaconf(args).model self.upgrade_state_dict(state_dict) + + from fairseq.checkpoint_utils import prune_state_dict + new_state_dict = prune_state_dict(state_dict, model_cfg) return super().load_state_dict(new_state_dict, strict) @@ -478,7 +485,7 @@ class FairseqLanguageModel(BaseFairseqModel): def __init__(self, decoder): super().__init__() self.decoder = decoder - assert isinstance(self.decoder, FairseqDecoder) + check_type(self.decoder, FairseqDecoder) def forward(self, src_tokens, **kwargs): """ @@ -539,7 +546,7 @@ class FairseqEncoderModel(BaseFairseqModel): def __init__(self, encoder): super().__init__() self.encoder = encoder - assert isinstance(self.encoder, FairseqEncoder) + check_type(self.encoder, FairseqEncoder) def forward(self, src_tokens, src_lengths, **kwargs): """ diff --git a/fairseq/models/fconv_lm.py b/fairseq/models/fconv_lm.py index 07391eaa29..4b243d6669 100644 --- a/fairseq/models/fconv_lm.py +++ b/fairseq/models/fconv_lm.py @@ -10,6 +10,7 @@ register_model_architecture, ) from fairseq.models.fconv import FConvDecoder +from fairseq.utils import safe_hasattr @register_model("fconv_lm") @@ -66,7 +67,7 @@ def build_model(cls, args, task): # make sure all arguments are present in older models base_lm_architecture(args) - if hasattr(args, "max_target_positions") and not hasattr( + if safe_hasattr(args, "max_target_positions") and not safe_hasattr( args, "tokens_per_sample" ): args.tokens_per_sample = args.max_target_positions diff --git a/fairseq/models/hubert/__init__.py b/fairseq/models/hubert/__init__.py new file mode 100644 index 0000000000..a1b0eabbdb --- /dev/null +++ b/fairseq/models/hubert/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .hubert import * # noqa +from .hubert_asr import * # noqa diff --git a/fairseq/models/hubert/hubert.py b/fairseq/models/hubert/hubert.py new file mode 100644 index 0000000000..cc3b777efd --- /dev/null +++ b/fairseq/models/hubert/hubert.py @@ -0,0 +1,576 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +from omegaconf import II + +from fairseq import utils +from fairseq.data.data_utils import compute_mask_indices +from fairseq.data.dictionary import Dictionary +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec.wav2vec2 import ( + EXTRACTOR_MODE_CHOICES, + MASKING_DISTRIBUTION_CHOICES, + LAYER_TYPE_CHOICES, + ConvFeatureExtractionModel, + TransformerEncoder, +) +from fairseq.modules import GradMultiply, LayerNorm +from fairseq.tasks.hubert_pretraining import ( + HubertPretrainingConfig, + HubertPretrainingTask, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class HubertConfig(FairseqDataclass): + label_rate: float = II("task.label_rate") + + extractor_mode: EXTRACTOR_MODE_CHOICES = field( + default="default", + metadata={ + "help": "mode for feature extractor. 
default has a single group " + "norm with d groups in the first conv block, whereas layer_norm " + "has layer norms in every block (meant to use with normalize=True)" + }, + ) + encoder_layers: int = field( + default=12, metadata={"help": "num encoder layers in the transformer"} + ) + encoder_embed_dim: int = field( + default=768, metadata={"help": "encoder embedding dimension"} + ) + encoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "encoder embedding dimension for FFN"} + ) + encoder_attention_heads: int = field( + default=12, metadata={"help": "num encoder attention heads"} + ) + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="gelu", metadata={"help": "activation function to use"} + ) + layer_type: LAYER_TYPE_CHOICES = field( + default="transformer", metadata={"help": "layer type in encoder"} + ) + + # dropouts + dropout: float = field( + default=0.1, + metadata={"help": "dropout probability for the transformer"}, + ) + attention_dropout: float = field( + default=0.1, + metadata={"help": "dropout probability for attention weights"}, + ) + activation_dropout: float = field( + default=0.0, + metadata={"help": "dropout probability after activation in FFN"}, + ) + encoder_layerdrop: float = field( + default=0.0, + metadata={"help": "probability of dropping a tarnsformer layer"}, + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + dropout_features: float = field( + default=0.0, + metadata={"help": "dropout to apply to the features (after feat extr)"}, + ) + + final_dim: int = field( + default=0, + metadata={ + "help": "project final representations and targets to this many " + "dimensions. set to encoder_embed_dim is <= 0" + }, + ) + untie_final_proj: bool = field( + default=False, + metadata={"help": "use separate projection for each target"}, + ) + layer_norm_first: bool = field( + default=False, + metadata={"help": "apply layernorm first in the transformer"}, + ) + conv_feature_layers: str = field( + default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", + metadata={ + "help": "string describing convolutional feature extraction " + "layers in form of a python list that contains " + "[(dim, kernel_size, stride), ...]" + }, + ) + conv_bias: bool = field( + default=False, metadata={"help": "include bias in conv encoder"} + ) + logit_temp: float = field( + default=0.1, metadata={"help": "temperature to divide logits by"} + ) + target_glu: bool = field( + default=False, metadata={"help": "adds projection + glu to targets"} + ) + feature_grad_mult: float = field( + default=1.0, + metadata={"help": "multiply feature extractor var grads by this"}, + ) + + # masking + mask_length: int = field(default=10, metadata={"help": "mask length"}) + mask_prob: float = field( + default=0.65, + metadata={"help": "probability of replacing a token with mask"}, + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose mask length"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indicesh" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + mask_min_space: int = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + + # channel masking + mask_channel_length: int = field( + default=10, + 
metadata={"help": "length of the mask for features (channels)"}, + ) + mask_channel_prob: float = field( + default=0.0, + metadata={"help": "probability of replacing a feature with 0"}, + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indicesh" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, + metadata={"help": "whether to allow channel masks to overlap"}, + ) + mask_channel_min_space: int = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + + # positional embeddings + conv_pos: int = field( + default=128, + metadata={"help": "number of filters for convolutional positional embeddings"}, + ) + conv_pos_groups: int = field( + default=16, + metadata={"help": "number of groups for convolutional positional embedding"}, + ) + conv_pos_batch_norm: bool = field( + default=False, + metadata={ + "help": "use batch norm instead of weight norm in conv_pos (for bf16 models)" + }, + ) + + latent_temp: Tuple[float, float, float] = field( + default=(2, 0.5, 0.999995), + metadata={"help": "legacy (to be removed)"}, + ) + + # loss computation + skip_masked: bool = field( + default=False, + metadata={"help": "skip computing losses over masked frames"}, + ) + skip_nomask: bool = field( + default=False, + metadata={"help": "skip computing losses over unmasked frames"}, + ) + + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, + ) + + # FP16 optimization + required_seq_len_multiple: int = field( + default=2, + metadata={ + "help": "pad the input to encoder such that the sequence length is divisible by multiple" + }, + ) + + # Conformer + depthwise_conv_kernel_size: int = field( + default=31, + metadata={ + "help": "depthwise-conv-kernel-size for convolution in conformer layer" + }, + ) + attn_type: str = field( + default="", + metadata={"help": "if espnet use ESPNET MHA"}, + ) + pos_enc_type: str = field( + default="abs", + metadata={"help": "Positional encoding type to use in conformer"}, + ) + fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) + + +@register_model("hubert", dataclass=HubertConfig) +class HubertModel(BaseFairseqModel): + def __init__( + self, + cfg: HubertConfig, + task_cfg: HubertPretrainingConfig, + dictionaries: List[Dictionary], + ) -> None: + super().__init__() + logger.info(f"HubertModel Config: {cfg}") + + feature_enc_layers = eval(cfg.conv_feature_layers) # noqa + self.embed = feature_enc_layers[-1][0] + + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, + ) + feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) + self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate + + self.post_extract_proj = ( + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim + else None + ) + + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space + + self.mask_channel_prob = cfg.mask_channel_prob + 
self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space + + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult + self.logit_temp = cfg.logit_temp + self.skip_masked = cfg.skip_masked + self.skip_nomask = cfg.skip_nomask + + final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim + + self.mask_emb = nn.Parameter( + torch.FloatTensor(cfg.encoder_embed_dim).uniform_() + ) + + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.embed) + + self.target_glu = None + if cfg.target_glu: + self.target_glu = nn.Sequential( + nn.Linear(final_dim, final_dim * 2), nn.GLU() + ) + + self.untie_final_proj = cfg.untie_final_proj + if self.untie_final_proj: + self.final_proj = nn.Linear( + cfg.encoder_embed_dim, final_dim * len(dictionaries) + ) + else: + self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim) + + # modules below are not needed during fine-tuning + if any([d is None for d in dictionaries]): + logger.info("cannot find dictionary. assume will be used for fine-tuning") + else: + self.num_classes = [len(d) for d in dictionaries] + self.label_embs_concat = nn.Parameter( + torch.FloatTensor(sum(self.num_classes), final_dim) + ) + nn.init.uniform_(self.label_embs_concat) + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: HubertConfig, task: HubertPretrainingTask): + """Build a new model instance.""" + + model = HubertModel(cfg, task.cfg, task.dictionaries) + return model + + def apply_mask(self, x, padding_mask, target_list): + B, T, C = x.shape + if self.mask_prob > 0: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x[mask_indices] = self.mask_emb + else: + mask_indices = None + + if self.mask_channel_prob > 0: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + return x, mask_indices + + def compute_nce(self, x, pos, negs): + neg_is_pos = (pos == negs).all(-1) + pos = pos.unsqueeze(0) + targets = torch.cat([pos, negs], dim=0) + + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) + logits /= self.logit_temp + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + logits = logits.transpose(0, 1) # (num_x, num_cls+1) + return logits + + def forward_features(self, source: torch.Tensor) -> torch.Tensor: + if self.feature_grad_mult > 0: + features = self.feature_extractor(source) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: 
+ with torch.no_grad(): + features = self.feature_extractor(source) + return features + + def forward_targets( + self, + features: torch.Tensor, + target_list: List[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Trim features to ensure labels exist and then get aligned labels + feat_tsz = features.size(2) + targ_tsz = min([t.size(1) for t in target_list]) + if self.feat2tar_ratio * feat_tsz > targ_tsz: + feat_tsz = int(targ_tsz / self.feat2tar_ratio) + features = features[..., :feat_tsz] + target_inds = torch.arange(feat_tsz).float() * self.feat2tar_ratio + target_list = [t[:, target_inds.long()] for t in target_list] + return features, target_list + + def forward_padding_mask( + self, + features: torch.Tensor, + padding_mask: torch.Tensor, + ) -> torch.Tensor: + extra = padding_mask.size(1) % features.size(1) + if extra > 0: + padding_mask = padding_mask[:, :-extra] + padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) + padding_mask = padding_mask.all(-1) + return padding_mask + + def forward( + self, + source: torch.Tensor, + target_list: Optional[List[torch.Tensor]] = None, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = True, + features_only: bool = False, + output_layer: Optional[int] = None, + ) -> Dict[str, torch.Tensor]: + """output layer is 1-based""" + features = self.forward_features(source) + if target_list is not None: + features, target_list = self.forward_targets(features, target_list) + + features_pen = features.float().pow(2).mean() + + features = features.transpose(1, 2) + features = self.layer_norm(features) + unmasked_features = features.clone() + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(features, padding_mask) + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + features = self.dropout_input(features) + unmasked_features = self.dropout_features(unmasked_features) + + if mask: + x, mask_indices = self.apply_mask(features, padding_mask, target_list) + else: + x = features + mask_indices = None + + # feature: (B, T, D), float + # target: (B, T), long + # x: (B, T, D), float + # padding_mask: (B, T), bool + # mask_indices: (B, T), bool + x, _ = self.encoder( + x, + padding_mask=padding_mask, + layer=None if output_layer is None else output_layer - 1, + ) + + if features_only: + return {"x": x, "padding_mask": padding_mask, "features": features} + + def compute_pred(proj_x, target, label_embs): + # compute logits for the i-th label set + y = torch.index_select(label_embs, 0, target.long()) + negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) + if self.target_glu: + y = self.target_glu(y) + negs = self.target_glu(negs) + # proj_x: (S, D) + # y: (S, D) + # negs: (Neg, S, D) + return self.compute_nce(proj_x, y, negs) + + label_embs_list = self.label_embs_concat.split(self.num_classes, 0) + + if not self.skip_masked: + masked_indices = torch.logical_and(~padding_mask, mask_indices) + proj_x_m = self.final_proj(x[masked_indices]) + if self.untie_final_proj: + proj_x_m_list = proj_x_m.chunk(len(target_list), dim=-1) + else: + proj_x_m_list = [proj_x_m for _ in range(len(target_list))] + logit_m_list = [ + compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) + for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list)) + ] + else: + logit_m_list = [None for _ in target_list] + + if not self.skip_nomask: + nomask_indices = torch.logical_and(~padding_mask, ~mask_indices) + proj_x_u = self.final_proj(x[nomask_indices]) + if 
self.untie_final_proj: + proj_x_u_list = proj_x_u.chunk(len(target_list), dim=-1) + else: + proj_x_u_list = [proj_x_u for _ in range(len(target_list))] + + logit_u_list = [ + compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i]) + for i, (proj_x_u, t) in enumerate(zip(proj_x_u_list, target_list)) + ] + else: + logit_u_list = [None for _ in target_list] + + result = { + "logit_m_list": logit_m_list, + "logit_u_list": logit_u_list, + "padding_mask": padding_mask, + "features_pen": features_pen, + } + return result + + def extract_features( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + res = self.forward( + source, + padding_mask=padding_mask, + mask=mask, + features_only=True, + output_layer=output_layer, + ) + feature = res["features"] if ret_conv else res["x"] + return feature, res["padding_mask"] + + def get_logits(self, net_output, is_masked=True): + if is_masked: + logits_list = net_output["logit_m_list"] + else: + logits_list = net_output["logit_u_list"] + logits_list = [x.float() for x in logits_list if x is not None] + return logits_list + + def get_targets(self, net_output, is_masked=True): + logits_list = self.get_logits(net_output, is_masked) + targets_list = [x.new_zeros(x.size(0), dtype=torch.long) for x in logits_list] + return targets_list + + def get_extra_losses(self, net_output): + extra_losses = [] + names = [] + + if "features_pen" in net_output: + extra_losses.append(net_output["features_pen"]) + names.append("features_pen") + + return extra_losses, names + + def remove_pretraining_modules(self): + self.target_glu = None + self.final_proj = None diff --git a/fairseq/models/hubert/hubert_asr.py b/fairseq/models/hubert/hubert_asr.py new file mode 100644 index 0000000000..11c85ce7d1 --- /dev/null +++ b/fairseq/models/hubert/hubert_asr.py @@ -0,0 +1,675 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
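extract_features above is the usual entry point when a pretrained checkpoint is used for downstream tasks. A hedged sketch of pulling intermediate activations (the checkpoint path is a placeholder; output_layer is 1-based, as noted in forward):

import torch
from fairseq import checkpoint_utils

models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    ["/path/to/hubert_checkpoint.pt"]   # illustrative path
)
model = models[0].eval()

wav = torch.randn(1, 16000)             # one second of dummy 16 kHz audio
with torch.no_grad():
    features, padding_mask = model.extract_features(
        source=wav,
        padding_mask=None,
        mask=False,
        output_layer=9,                 # activations after the 9th transformer layer
    )
print(features.shape)                   # (1, num_frames, encoder_embed_dim)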
+ +import contextlib +import copy +import logging +import math +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any, Optional +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from omegaconf import II, MISSING, open_dict + +from fairseq import checkpoint_utils, tasks, utils +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models import ( + BaseFairseqModel, + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, + register_model, +) +from fairseq.models.hubert.hubert import MASKING_DISTRIBUTION_CHOICES +from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer +from fairseq.tasks import FairseqTask + +logger = logging.getLogger(__name__) + + +@dataclass +class HubertAsrConfig(FairseqDataclass): + w2v_path: str = field(default=MISSING, metadata={"help": "path to hubert model"}) + no_pretrained_weights: bool = field( + default=False, + metadata={"help": "if true, does not load pretrained weights"}, + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + final_dropout: float = field( + default=0.0, + metadata={"help": "dropout after transformer and before final projection"}, + ) + dropout: float = field( + default=0.0, + metadata={"help": "dropout probability inside hubert model"}, + ) + attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights " "inside hubert model" + }, + ) + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN " "inside hubert model" + }, + ) + encoder_embed_dim: Optional[int] = field( + default=768, metadata={"help": "encoder embedding dimension"} + ) + + # masking + apply_mask: bool = field( + default=False, metadata={"help": "apply masking during fine-tuning"} + ) + mask_length: int = field( + default=10, metadata={"help": "repeat the mask indices multiple times"} + ) + mask_prob: float = field( + default=0.5, + metadata={ + "help": "probability of replacing a token with mask " + "(normalized by length)" + }, + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose masks"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + + # channel masking + mask_channel_length: int = field( + default=10, + metadata={"help": "length of the mask for features (channels)"}, + ) + mask_channel_prob: float = field( + default=0.0, + metadata={"help": "probability of replacing a feature with 0"}, + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, + metadata={"help": "whether to allow channel masks to overlap"}, + ) + freeze_finetune_updates: int = field( + default=0, + metadata={"help": "dont finetune hubert for this many updates"}, + ) + 
feature_grad_mult: float = field( + default=0.0, + metadata={"help": "reset feature grad mult in hubert to this"}, + ) + layerdrop: float = field( + default=0.0, + metadata={"help": "probability of dropping a layer in hubert"}, + ) + normalize: bool = II("task.normalize") + data: str = II("task.data") + + # this holds the loaded hubert args + w2v_args: Any = None + + +@dataclass +class HubertCtcConfig(HubertAsrConfig): + pass + + +@register_model("hubert_ctc", dataclass=HubertCtcConfig) +class HubertCtc(BaseFairseqModel): + def __init__(self, cfg: HubertCtcConfig, w2v_encoder: BaseFairseqModel): + super().__init__() + self.cfg = cfg + self.w2v_encoder = w2v_encoder + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: HubertCtcConfig, task: FairseqTask): + """Build a new model instance.""" + w2v_encoder = HubertEncoder(cfg, task) + return cls(cfg, w2v_encoder) + + def get_normalized_probs(self, net_output, log_probs): + """Get normalized probabilities (or log probs) from a net's output.""" + + logits = net_output["encoder_out"] + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def get_logits(self, net_output): + logits = net_output["encoder_out"] + padding = net_output["encoder_padding_mask"] + if padding is not None and padding.any(): + padding = padding.T + logits[padding][..., 0] = 0 + logits[padding][..., 1:] = float("-inf") + + return logits + + def forward(self, **kwargs): + x = self.w2v_encoder(**kwargs) + return x + + +@dataclass +class HubertSeq2SeqConfig(HubertAsrConfig): + decoder_embed_dim: int = field( + default=768, metadata={"help": "decoder embedding dimension"} + ) + decoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "decoder embedding dimension for FFN"} + ) + decoder_layers: int = field(default=6, metadata={"help": "num of decoder layers"}) + decoder_layerdrop: float = field( + default=0.0, metadata={"help": "decoder layerdrop chance"} + ) + decoder_attention_heads: int = field( + default=4, metadata={"help": "num decoder attention heads"} + ) + decoder_learned_pos: bool = field( + default=False, + metadata={"help": "use learned positional embeddings in the decoder"}, + ) + decoder_normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each decoder block"} + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if set, disables positional embeddings (outside self attention)" + }, + ) + decoder_dropout: float = field( + default=0.0, metadata={"help": "dropout probability in the decoder"} + ) + decoder_attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights inside the decoder" + }, + ) + decoder_activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN inside the decoder" + }, + ) + max_target_positions: int = field( + default=2048, metadata={"help": "max target positions"} + ) + share_decoder_input_output_embed: bool = field( + default=False, metadata={"help": "share decoder input and output embeddings"} + ) + autoregressive: bool = II("task.autoregressive") + seq2seq_path: str = field( + default="", + metadata={"help": "reset_dict"}, + ) + reset_dict: bool = field( + default=False, + metadata={"help": "reset_dict"}, + ) + + +@register_model("hubert_seq2seq", 
dataclass=HubertSeq2SeqConfig) +class HubertSeq2SeqModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def build_model(cls, cfg: HubertSeq2SeqConfig, task: FairseqTask): + """Build a new model instance.""" + + assert ( + cfg.autoregressive + ), "Please set task.autoregressive=true for seq2seq asr models" + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + def build_embedding(dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + emb = Embedding(num_embeddings, embed_dim, padding_idx) + return emb + + decoder_embed_tokens = build_embedding(tgt_dict, cfg.decoder_embed_dim) + + encoder = cls.build_encoder(cfg, task) + decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens) + + model = HubertSeq2SeqModel(encoder, decoder) + + if cfg["seq2seq_path"]: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.seq2seq_path) + state = state["model"] + if cfg["reset_dict"]: + del state["decoder.embed_out"] + del state["decoder.embed_tokens.weight"] + model.load_state_dict(state, strict=False) + return model + + @classmethod + def build_encoder(cls, cfg: HubertAsrConfig, task): + return HubertEncoder(cfg, task) + + @classmethod + def build_decoder(cls, cfg: HubertSeq2SeqConfig, tgt_dict, embed_tokens): + return TransformerDecoder(cfg, tgt_dict, embed_tokens) + + def forward(self, **kwargs): + encoder_out = self.encoder(**kwargs) + decoder_out = self.decoder(encoder_out=encoder_out, **kwargs) + return decoder_out + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + def load_state_dict( + self, + state_dict, + strict=True, + model_cfg=None, + args: Optional[Namespace] = None, + ): + if model_cfg.reset_dict: + logger.warn("Overriding loading strict state dict!") + del state_dict["decoder.embed_out"] + del state_dict["decoder.embed_tokens.weight"] + return super().load_state_dict(state_dict, False, model_cfg, args) + return super().load_state_dict(state_dict, strict, model_cfg, args) + + +class HubertEncoder(FairseqEncoder): + def __init__(self, cfg: HubertAsrConfig, task): + self.apply_mask = cfg.apply_mask + + arg_overrides = { + "dropout": cfg.dropout, + "activation_dropout": cfg.activation_dropout, + "dropout_input": cfg.dropout_input, + "attention_dropout": cfg.attention_dropout, + "mask_length": cfg.mask_length, + "mask_prob": cfg.mask_prob, + "mask_selection": cfg.mask_selection, + "mask_other": cfg.mask_other, + "no_mask_overlap": cfg.no_mask_overlap, + "mask_channel_length": cfg.mask_channel_length, + "mask_channel_prob": cfg.mask_channel_prob, + "mask_channel_selection": cfg.mask_channel_selection, + "mask_channel_other": cfg.mask_channel_other, + "no_mask_channel_overlap": cfg.no_mask_channel_overlap, + "encoder_layerdrop": cfg.layerdrop, + "feature_grad_mult": cfg.feature_grad_mult, + } + + if cfg.w2v_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides) + w2v_args = state.get("cfg", None) + if w2v_args is None: + w2v_args = convert_namespace_to_omegaconf(state["args"]) + cfg.w2v_args = w2v_args + else: + state = None + w2v_args = cfg.w2v_args + if isinstance(w2v_args, Namespace): + cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args) + + assert cfg.normalize == w2v_args.task.normalize, ( + "Fine-tuning works best when data normalization is the same. 
" + "Please check that --normalize is set or unset for " + "both pre-training and here" + ) + + w2v_args.task.data = cfg.data + pretrain_task = tasks.setup_task(w2v_args.task) + if state is not None and "task_state" in state: + # This will load the stored "dictionaries" object + pretrain_task.load_state_dict(state["task_state"]) + else: + pretrain_task.load_state_dict(task.state_dict()) + + model = pretrain_task.build_model(w2v_args.model, from_checkpoint=True) + if state is not None and not cfg.no_pretrained_weights: + # set strict=False because we omit some modules + model.load_state_dict(state["model"], strict=False) + + model.remove_pretraining_modules() + + super().__init__(pretrain_task.source_dictionary) + + d = w2v_args.model.encoder_embed_dim + + self.w2v_model = model + + self.final_dropout = nn.Dropout(cfg.final_dropout) + self.freeze_finetune_updates = cfg.freeze_finetune_updates + self.num_updates = 0 + + if task.target_dictionary is not None and not cfg.autoregressive: + self.proj = Linear(d, len(task.target_dictionary)) + elif getattr(cfg, "decoder_embed_dim", d) != d: + self.proj = Linear(d, cfg.decoder_embed_dim) + else: + self.proj = None + + def set_num_updates(self, num_updates): + """Set the number of parameters updates.""" + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def forward(self, source, padding_mask, tbc=True, **kwargs): + + w2v_args = { + "source": source, + "padding_mask": padding_mask, + "mask": self.apply_mask and self.training, + } + + ft = self.freeze_finetune_updates <= self.num_updates + + with torch.no_grad() if not ft else contextlib.ExitStack(): + x, padding_mask = self.w2v_model.extract_features(**w2v_args) + + if tbc: + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + x = self.final_dropout(x) + + if self.proj: + x = self.proj(x) + + return { + "encoder_out": x, # T x B x C + "encoder_padding_mask": padding_mask, # B x T + "padding_mask": padding_mask, + } + + def reorder_encoder_out(self, encoder_out, new_order): + if encoder_out["encoder_out"] is not None: + encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( + 1, new_order + ) + if encoder_out["encoder_padding_mask"] is not None: + encoder_out["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ].index_select(0, new_order) + if encoder_out["padding_mask"] is not None: + encoder_out["padding_mask"] = encoder_out["padding_mask"].index_select( + 0, new_order + ) + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return None + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +class TransformerDecoder(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). 
+ """ + + def __init__( + self, + cfg: HubertSeq2SeqConfig, + dictionary, + embed_tokens, + no_encoder_attn=False, + ): + super().__init__(dictionary) + + self.dropout = cfg.decoder_dropout + self.share_input_output_embed = cfg.share_decoder_input_output_embed + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = cfg.decoder_embed_dim + self.output_embed_dim = cfg.decoder_embed_dim + + self.layerdrop = cfg.decoder_layerdrop + + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = cfg.max_target_positions + + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim + + self.project_in_dim = ( + Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + + self.embed_positions = ( + PositionalEmbedding( + cfg.max_target_positions, + embed_dim, + self.padding_idx, + learned=cfg.decoder_learned_pos, + ) + if not cfg.no_token_positional_embeddings + else None + ) + + # TODO: update this when transformer gets converted to dataclass configs + transformer_cfg = copy.deepcopy(cfg) + with open_dict(transformer_cfg): + transformer_cfg.dropout = transformer_cfg.decoder_dropout + transformer_cfg.attention_dropout = ( + transformer_cfg.decoder_attention_dropout + ) + transformer_cfg.activation_dropout = ( + transformer_cfg.decoder_activation_dropout + ) + + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + TransformerDecoderLayer(transformer_cfg, no_encoder_attn) + for _ in range(transformer_cfg.decoder_layers) + ] + ) + + if not self.share_input_output_embed: + self.embed_out = nn.Parameter( + torch.Tensor(len(dictionary), self.output_embed_dim) + ) + nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) + + if transformer_cfg.decoder_normalize_before: + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward( + self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (Tensor, optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + if type(prev_output_tokens) == list: + max_len = max((len(x) for x in prev_output_tokens)) + tmp = torch.zeros( + [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device + ) + for (i, p) in enumerate(prev_output_tokens): + tmp[i, : len(p)] = p + prev_output_tokens = tmp + prev_output_tokens = prev_output_tokens.long() + x, extra = self.extract_features( + prev_output_tokens, encoder_out, incremental_state + ) + x = self.output_layer(x) + return x, extra + + def extract_features( + self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused + ): + """ + Similar to *forward* but only return features. 
+ + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + + # embed positions + positions = ( + self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + if self.embed_positions is not None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + attn = None + + inner_states = [x] + + # decoder layers + self_attn_padding_mask = None + if prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + for layer in self.layers: + dropout_probability = np.random.random() + if not self.training or (dropout_probability > self.layerdrop): + x, attn, _ = layer( + x, + encoder_out["encoder_out"] if encoder_out is not None else None, + encoder_out["padding_mask"] if encoder_out is not None else None, + incremental_state, + self_attn_mask=self.buffered_future_mask(x) + if incremental_state is None + else None, + self_attn_padding_mask=self_attn_padding_mask, + ) + inner_states.append(x) + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, {"attn": attn, "inner_states": inner_states} + + def output_layer(self, features, **kwargs): + """Project features to the vocabulary size.""" + # project back to size of vocabulary + if self.share_input_output_embed: + return F.linear(features, self.embed_tokens.weight) + else: + return F.linear(features, self.embed_out) + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if ( + not hasattr(self, "_future_mask") + or self._future_mask is None + or self._future_mask.device != tensor.device + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 + ) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m diff --git a/fairseq/models/huggingface/hf_gpt2.py b/fairseq/models/huggingface/hf_gpt2.py index a823453794..3a8eb78198 100644 --- a/fairseq/models/huggingface/hf_gpt2.py +++ b/fairseq/models/huggingface/hf_gpt2.py @@ -17,20 +17,6 @@ ) -try: - # Prepend the transformers submodule to the path, so that - # it's prioritized over other installations. This allows - # making local changes in the submodule. 
- hf_path = os.path.join(os.path.dirname(__file__), "transformers", "src") - sys.path.insert(0, hf_path) - from transformers import GPT2Config, GPT2LMHeadModel - - sys.path.remove(hf_path) - has_hf = True -except ImportError: - has_hf = False - - logger = logging.getLogger(__name__) @@ -41,14 +27,6 @@ class HuggingFaceGPT2LanguageModel(FairseqLanguageModel): def __init__(self, decoder): super().__init__(decoder) - if not has_hf: - raise ImportError( - "\n\nPlease install huggingface/transformers with:" - "\n\n pip install transformers" - "\n\nOr to make local edits, install the submodule:" - "\n\n git submodule update --init " - "fairseq/models/huggingface/transformers" - ) @staticmethod def add_args(parser): @@ -76,17 +54,16 @@ def build_model(cls, args, task): class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder): def __init__(self, args, task): - super().__init__(task.target_dictionary) - - if not has_hf: + try: + from transformers import GPT2Config, GPT2LMHeadModel + except ImportError: raise ImportError( "\n\nPlease install huggingface/transformers with:" "\n\n pip install transformers" - "\n\nOr to make local edits, install the submodule:" - "\n\n git submodule update --init " - "fairseq/models/huggingface/transformers" ) + super().__init__(task.target_dictionary) + config = GPT2Config( vocab_size=len(task.target_dictionary), n_positions=args.max_target_positions + 1, diff --git a/fairseq/models/huggingface/transformers b/fairseq/models/huggingface/transformers deleted file mode 160000 index 839f8a563c..0000000000 --- a/fairseq/models/huggingface/transformers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 839f8a563cefcb7f2048b310024c217e7829a198 diff --git a/fairseq/models/lightconv.py b/fairseq/models/lightconv.py index b614da3665..7950280e30 100644 --- a/fairseq/models/lightconv.py +++ b/fairseq/models/lightconv.py @@ -4,10 +4,12 @@ # LICENSE file in the root directory of this source tree. import math +from typing import Any, Dict, List, Optional, Tuple import torch import torch.nn as nn import torch.nn.functional as F + from fairseq import utils from fairseq.models import ( FairseqEncoder, @@ -18,13 +20,15 @@ ) from fairseq.modules import ( AdaptiveSoftmax, - DynamicConv, + DynamicConv_scripatable as DynamicConv, FairseqDropout, LayerNorm, LightweightConv, MultiheadAttention, PositionalEmbedding, ) +from fairseq.utils import safe_hasattr +from torch import Tensor @register_model("lightconv") @@ -257,9 +261,9 @@ def build_model(cls, args, task): # make sure all arguments are present in older models base_architecture(args) - if not hasattr(args, "max_source_positions"): + if not safe_hasattr(args, "max_source_positions"): args.max_source_positions = 1024 - if not hasattr(args, "max_target_positions"): + if not safe_hasattr(args, "max_target_positions"): args.max_target_positions = 1024 src_dict, tgt_dict = task.source_dictionary, task.target_dictionary @@ -306,6 +310,42 @@ def build_embedding(dictionary, embed_dim, path=None): decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens) return LightConvModel(encoder, decoder) + def forward( + self, + src_tokens: Tensor, + src_lengths: Tensor, + prev_output_tokens: Tensor, + ): + """ + (The forward method inherited from the base class has a **kwargs + argument in its input, which is not supported in torchscript. This + method overwrites the forward method definition without **kwargs.) + + Run the forward pass for an encoder-decoder model. + + First feed a batch of source tokens through the encoder. 
Then, feed the + encoder output and previous decoder outputs (i.e., teacher forcing) to + the decoder to produce the next outputs:: + + encoder_out = self.encoder(src_tokens, src_lengths) + return self.decoder(prev_output_tokens, encoder_out) + + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + encoder_out = self.encoder(src_tokens, src_lengths) + decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out) + return decoder_out + class LightConvEncoder(FairseqEncoder): """ @@ -354,8 +394,12 @@ def __init__(self, args, dictionary, embed_tokens): self.normalize = args.encoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None - def forward(self, src_tokens, **unused): + def forward( + self, src_tokens: Tensor, src_lengths: Optional[Tensor] = None + ) -> Dict[str, List[Tensor]]: """ Args: src_tokens (LongTensor): tokens in the source language of shape @@ -378,23 +422,32 @@ def forward(self, src_tokens, **unused): x = x.transpose(0, 1) # compute padding mask - encoder_padding_mask = src_tokens.eq(self.padding_idx) + encoder_padding_mask = src_tokens.eq(self.padding_idx) # B x T if not encoder_padding_mask.any(): - encoder_padding_mask = None + encoder_mask = None + else: + encoder_mask = encoder_padding_mask # encoder layers for layer in self.layers: - x = layer(x, encoder_padding_mask) + x = layer(x, encoder_mask) - if self.normalize: + if self.layer_norm is not None: x = self.layer_norm(x) - return { - "encoder_out": x, # T x B x C - "encoder_padding_mask": encoder_padding_mask, # B x T - } + output_dict: Dict[str, List[Tensor]] = {} + if src_lengths is not None: + output_dict["src_lengths"] = [src_lengths] + output_dict["encoder_out"] = [x] # T x B x C + if encoder_mask is not None: + output_dict["encoder_padding_mask"] = [encoder_mask] # B x T - def reorder_encoder_out(self, encoder_out, new_order): + return output_dict + + @torch.jit.export + def reorder_encoder_out( + self, encoder_out: Dict[str, List[Tensor]], new_order: Tensor + ): """ Reorder encoder output according to *new_order*. 
@@ -405,15 +458,22 @@ def reorder_encoder_out(self, encoder_out, new_order): Returns: *encoder_out* rearranged according to *new_order* """ - if encoder_out["encoder_out"] is not None: - encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( - 1, new_order - ) - if encoder_out["encoder_padding_mask"] is not None: - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(0, new_order) - return encoder_out + if len(encoder_out["encoder_out"]) == 0: + encoder = [] + else: + encoder = [encoder_out["encoder_out"][0].index_select(1, new_order)] + output_dict = {"encoder_out": encoder} + + if ("encoder_padding_mask" not in encoder_out) or ( + len(encoder_out["encoder_padding_mask"]) == 0 + ): + encoder_padding_mask = [] + else: + encoder_padding_mask = [ + encoder_out["encoder_padding_mask"][0].index_select(0, new_order) + ] + output_dict["encoder_padding_mask"] = encoder_padding_mask + return output_dict def max_positions(self): """Maximum input length supported by the encoder.""" @@ -475,13 +535,17 @@ def __init__( self.layers.extend( [ LightConvDecoderLayer( - args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i] + args, + no_encoder_attn, + kernel_size=args.decoder_kernel_size_list[i], + dictionary=dictionary, ) for i in range(args.decoder_layers) ] ) self.adaptive_softmax = None + self.output_projection = None self.project_out_dim = ( Linear(embed_dim, output_embed_dim, bias=False) @@ -499,18 +563,34 @@ def __init__( factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) - elif not self.share_input_output_embed: - self.embed_out = nn.Parameter( - torch.Tensor(len(dictionary), output_embed_dim) + elif self.share_input_output_embed: + self.output_projection = nn.Linear( + self.embed_tokens.weight.shape[1], + self.embed_tokens.weight.shape[0], + bias=False, + ) + self.output_projection.weight = self.embed_tokens.weight + + else: + self.output_projection = nn.Linear( + output_embed_dim, len(dictionary), bias=False + ) + nn.init.normal_( + self.output_projection.weight, mean=0, std=output_embed_dim**-0.5 ) - nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) self.register_buffer("version", torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None def forward( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs + self, + prev_output_tokens: Tensor, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + src_lengths: Optional[Any] = None, ): """ Args: @@ -544,7 +624,7 @@ def forward( positions = positions[:, -1:] # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) + x = self.embed_scale * self.embed_tokens(prev_output_tokens.contiguous()) if self.project_in_dim is not None: x = self.project_in_dim(x) @@ -557,21 +637,30 @@ def forward( x = x.transpose(0, 1) attn = None - inner_states = [x] + inner_states: List[Optional[Tensor]] = [x] # decoder layers + attn: Optional[Tensor] = None for layer in self.layers: + encoder: Optional[Tensor] = None + encoder_padding_mask: Optional[Tensor] = None + if encoder_out is not None: + if len(encoder_out["encoder_out"]) > 0: + encoder = encoder_out["encoder_out"][0] + if ( + "encoder_padding_mask" in encoder_out + and len(encoder_out["encoder_padding_mask"]) > 0 + ): + encoder_padding_mask = 
encoder_out["encoder_padding_mask"][0] x, attn = layer( x, - encoder_out["encoder_out"] if encoder_out is not None else None, - encoder_out["encoder_padding_mask"] - if encoder_out is not None - else None, + encoder, + encoder_padding_mask, incremental_state, ) inner_states.append(x) - if self.normalize: + if self.layer_norm is not None: x = self.layer_norm(x) # T x B x C -> B x T x C @@ -582,12 +671,9 @@ def forward( if self.adaptive_softmax is None: # project back to size of vocabulary - if self.share_input_output_embed: - x = F.linear(x, self.embed_tokens.weight) - else: - x = F.linear(x, self.embed_out) + x = self.output_projection(x) - return x, {"attn": attn, "inner_states": inner_states} + return x, {"attn": [attn], "inner_states": inner_states} def max_positions(self): """Maximum output length supported by the decoder.""" @@ -670,9 +756,10 @@ def __init__(self, args, kernel_size=0): self.normalize_before = args.encoder_normalize_before self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) - self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)]) + self.layer_norm1 = LayerNorm(self.embed_dim) + self.layer_norm2 = LayerNorm(self.embed_dim) - def forward(self, x, encoder_padding_mask): + def forward(self, x, encoder_padding_mask: Optional[Tensor] = None) -> Tensor: """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` @@ -683,7 +770,9 @@ def forward(self, x, encoder_padding_mask): encoded output of shape `(batch, src_len, embed_dim)` """ residual = x - x = self.maybe_layer_norm(0, x, before=True) + normalize = self.maybe_layer_norm(before=True) + if normalize: + x = self.layer_norm1(x) x = self.input_dropout_module(x) x = self.linear1(x) if self.act is not None: @@ -694,24 +783,27 @@ def forward(self, x, encoder_padding_mask): x = self.linear2(x) x = self.dropout_module(x) x = residual + x - x = self.maybe_layer_norm(0, x, after=True) + normalize = self.maybe_layer_norm(after=True) + if normalize: + x = self.layer_norm1(x) residual = x - x = self.maybe_layer_norm(1, x, before=True) + normalize = self.maybe_layer_norm(before=True) + if normalize: + x = self.layer_norm2(x) x = F.relu(self.fc1(x)) x = self.relu_dropout_module(x) x = self.fc2(x) x = self.dropout_module(x) x = residual + x - x = self.maybe_layer_norm(1, x, after=True) + normalize = self.maybe_layer_norm(after=True) + if normalize: + x = self.layer_norm2(x) return x - def maybe_layer_norm(self, i, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return self.layer_norms[i](x) - else: - return x + def maybe_layer_norm(self, before: bool = False, after: bool = False): + assert before ^ after, "Incorrect arguments" + return after ^ self.normalize_before def extra_repr(self): return ( @@ -734,7 +826,7 @@ class LightConvDecoderLayer(nn.Module): kernel_size: kernel size of the convolution """ - def __init__(self, args, no_encoder_attn=False, kernel_size=0): + def __init__(self, args, no_encoder_attn=False, kernel_size=0, dictionary=None): super().__init__() self.embed_dim = args.decoder_embed_dim self.conv_dim = args.decoder_conv_dim @@ -788,6 +880,7 @@ def __init__(self, args, no_encoder_attn=False, kernel_size=0): args.decoder_attention_heads, dropout=args.attention_dropout, encoder_decoder_attention=True, + dictionary=dictionary, ) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) @@ -799,14 +892,14 @@ def __init__(self, args, no_encoder_attn=False, 
kernel_size=0): def forward( self, - x, - encoder_out, - encoder_padding_mask, - incremental_state, - prev_conv_state=None, - prev_attn_state=None, - conv_mask=None, - conv_padding_mask=None, + x: Tensor, + encoder_out: Optional[Tensor], + encoder_padding_mask: Optional[Tensor], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + prev_conv_state: Optional[Tensor] = None, + prev_attn_state: Optional[Tuple[Tensor, Tensor]] = None, + conv_mask: Optional[Tensor] = None, + conv_padding_mask: Optional[Tensor] = None, ): """ Args: @@ -818,10 +911,10 @@ def forward( encoded output of shape `(batch, src_len, embed_dim)` """ residual = x - x = self.maybe_layer_norm(self.conv_layer_norm, x, before=True) + normalize = self.maybe_layer_norm(before=True) + if normalize: + x = self.conv_layer_norm(x) if prev_conv_state is not None: - if incremental_state is None: - incremental_state = {} self.conv._set_input_buffer(incremental_state, prev_conv_state) x = self.input_dropout_module(x) x = self.linear1(x) @@ -831,17 +924,22 @@ def forward( x = self.linear2(x) x = self.dropout_module(x) x = residual + x - x = self.maybe_layer_norm(self.conv_layer_norm, x, after=True) + normalize = self.maybe_layer_norm(after=True) + if normalize: + x = self.conv_layer_norm(x) - attn = None + attn: Optional[Tensor] = None if self.encoder_attn is not None: residual = x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) + normalize = self.maybe_layer_norm(before=True) + if normalize: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: - if incremental_state is None: - incremental_state = {} - prev_key, prev_value = prev_attn_state - saved_state = {"prev_key": prev_key, "prev_value": prev_value} + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_attn_state[0], + "prev_value": prev_attn_state[1], + } self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, @@ -854,26 +952,29 @@ def forward( ) x = self.dropout_module(x) x = residual + x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) + normalize = self.maybe_layer_norm(after=True) + if normalize: + x = self.encoder_attn_layer_norm(x) residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + normalize = self.maybe_layer_norm(before=True) + if normalize: + x = self.final_layer_norm(x) x = F.relu(self.fc1(x)) x = self.relu_dropout_module(x) x = self.fc2(x) x = self.dropout_module(x) x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) + normalize = self.maybe_layer_norm(after=True) + if normalize: + x = self.final_layer_norm(x) return x, attn - def maybe_layer_norm(self, layer_norm, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return layer_norm(x) - else: - return x + def maybe_layer_norm(self, before: bool = False, after: bool = False): + assert before ^ after, "Incorrect usage" + return after ^ self.normalize_before - def make_generation_fast_(self, need_attn=False, **kwargs): + def make_generation_fast_(self, need_attn: bool = False, **kwargs): self.need_attn = need_attn def extra_repr(self): @@ -889,7 +990,7 @@ def extra_repr(self): def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) nn.init.constant_(m.weight[padding_idx], 0) 
return m diff --git a/fairseq/models/lstm.py b/fairseq/models/lstm.py index 1a9dca3c75..8a29156270 100644 --- a/fairseq/models/lstm.py +++ b/fairseq/models/lstm.py @@ -225,10 +225,10 @@ def __init__( super().__init__(dictionary) self.num_layers = num_layers self.dropout_in_module = FairseqDropout( - dropout_in, module_name=self.__class__.__name__ + dropout_in * 1.0, module_name=self.__class__.__name__ ) self.dropout_out_module = FairseqDropout( - dropout_out, module_name=self.__class__.__name__ + dropout_out * 1.0, module_name=self.__class__.__name__ ) self.bidirectional = bidirectional self.hidden_size = hidden_size @@ -329,7 +329,9 @@ def combine_bidir(self, outs, bsz: int): out = outs.view(self.num_layers, 2, bsz, -1).transpose(1, 2).contiguous() return out.view(self.num_layers, bsz, -1) - def reorder_encoder_out(self, encoder_out, new_order): + def reorder_encoder_out( + self, encoder_out: Tuple[Tensor, Tensor, Tensor, Tensor], new_order + ): return tuple( ( encoder_out[0].index_select(1, new_order), @@ -402,10 +404,10 @@ def __init__( ): super().__init__(dictionary) self.dropout_in_module = FairseqDropout( - dropout_in, module_name=self.__class__.__name__ + dropout_in * 1.0, module_name=self.__class__.__name__ ) self.dropout_out_module = FairseqDropout( - dropout_out, module_name=self.__class__.__name__ + dropout_out * 1.0, module_name=self.__class__.__name__ ) self.hidden_size = hidden_size self.share_input_output_embed = share_input_output_embed @@ -535,7 +537,7 @@ def extract_features( assert ( srclen > 0 or self.attention is None ), "attention is not supported if there are no encoder outputs" - attn_scores = ( + attn_scores: Optional[Tensor] = ( x.new_zeros(srclen, seqlen, bsz) if self.attention is not None else None ) outs = [] diff --git a/fairseq/models/masked_lm.py b/fairseq/models/masked_lm.py index c786de9125..b71254cef8 100644 --- a/fairseq/models/masked_lm.py +++ b/fairseq/models/masked_lm.py @@ -21,6 +21,7 @@ TransformerSentenceEncoder, ) from fairseq.modules.transformer_sentence_encoder import init_bert_params +from fairseq.utils import safe_hasattr logger = logging.getLogger(__name__) @@ -158,7 +159,7 @@ def build_model(cls, args, task): # make sure all arguments are present in older models base_architecture(args) - if not hasattr(args, "max_positions"): + if not safe_hasattr(args, "max_positions"): args.max_positions = args.tokens_per_sample logger.info(args) @@ -293,12 +294,6 @@ def max_positions(self): return self.max_positions def upgrade_state_dict_named(self, state_dict, name): - if isinstance( - self.sentence_encoder.embed_positions, SinusoidalPositionalEmbedding - ): - state_dict[ - name + ".sentence_encoder.embed_positions._float_tensor" - ] = torch.FloatTensor(1) if not self.load_softmax: for k in list(state_dict.keys()): if ( diff --git a/fairseq/models/multilingual_transformer.py b/fairseq/models/multilingual_transformer.py index 2e1f86f36e..e722b647ed 100644 --- a/fairseq/models/multilingual_transformer.py +++ b/fairseq/models/multilingual_transformer.py @@ -18,6 +18,7 @@ TransformerModel, base_architecture, ) +from fairseq.utils import safe_hasattr @register_model("multilingual_transformer") @@ -75,9 +76,9 @@ def build_model(cls, args, task): # make sure all arguments are present in older models base_multilingual_architecture(args) - if not hasattr(args, "max_source_positions"): + if not safe_hasattr(args, "max_source_positions"): args.max_source_positions = 1024 - if not hasattr(args, "max_target_positions"): + if not safe_hasattr(args, 
"max_target_positions"): args.max_target_positions = 1024 src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs] diff --git a/fairseq/models/multires_hubert/__init__.py b/fairseq/models/multires_hubert/__init__.py new file mode 100644 index 0000000000..ec36505b08 --- /dev/null +++ b/fairseq/models/multires_hubert/__init__.py @@ -0,0 +1,2 @@ +from .multires_hubert import * # noqa +from .multires_hubert_asr import * # noqa diff --git a/fairseq/models/multires_hubert/multires_hubert.py b/fairseq/models/multires_hubert/multires_hubert.py new file mode 100644 index 0000000000..eacb29e5fe --- /dev/null +++ b/fairseq/models/multires_hubert/multires_hubert.py @@ -0,0 +1,1231 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + +import numpy as np +import torch +import math +import torch.nn as nn +from omegaconf import II +from fairseq.models.wav2vec.wav2vec import norm_block + +from fairseq import utils +from fairseq.data.data_utils import compute_mask_indices +from fairseq.data.dictionary import Dictionary +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec.wav2vec2 import ( + EXTRACTOR_MODE_CHOICES, + MASKING_DISTRIBUTION_CHOICES, + LAYER_TYPE_CHOICES, + ConvFeatureExtractionModel, + TransformerEncoder, +) +from omegaconf import II, MISSING, open_dict +from fairseq.modules import GradMultiply, LayerNorm +from fairseq.tasks.multires_hubert_pretraining import ( + MultiresHubertPretrainingConfig, + MultiresHubertPretrainingTask, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class MultiresHubertConfig(FairseqDataclass): + label_rate: float = II("task.label_rate") + # label_rate: 1,2,2,5 + # (imply (1,2), (2,5)) + # if base label_rate = 50 + # (1,2), (2,5) --> label rates 50, 25, 10 + label_rate_ratios: List[int] = field( + default=MISSING, metadata={"help": "tuple for label rates e.g., [(1,2), (2,5)]"} + ) + + extractor_mode: EXTRACTOR_MODE_CHOICES = field( + default="default", + metadata={ + "help": "mode for feature extractor. 
default has a single group " + "norm with d groups in the first conv block, whereas layer_norm " + "has layer norms in every block (meant to use with normalize=True)" + }, + ) + # the blocks for each label rate + encoder_layers: int = field( + default=2, + metadata={ + "help": "num encoder layers in each block (one sub-module of the U-net)" + }, + ) + override_encoder_layers: str = field( + default="", + metadata={ + "help": "specific layer numbers for each block (one sub-module of the U-net) for training" + }, + ) + encoder_embed_dim: int = field( + default=768, metadata={"help": "encoder embedding dimension"} + ) + encoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "encoder embedding dimension for FFN"} + ) + encoder_attention_heads: int = field( + default=12, metadata={"help": "num encoder attention heads"} + ) + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="gelu", metadata={"help": "activation function to use"} + ) + layer_type: LAYER_TYPE_CHOICES = field( + default="transformer", metadata={"help": "layer type in encoder"} + ) + conv_adapator_kernal: int = field( + default=7, metadata={"help": "kernel size for the conv adaptor"} + ) + use_plain_updownsample: bool = field( + default=False, metadata={"help": "whether to use plain up/downsample modules"} + ) + + # dropouts + dropout: float = field( + default=0.1, + metadata={"help": "dropout probability for the transformer"}, + ) + attention_dropout: float = field( + default=0.1, + metadata={"help": "dropout probability for attention weights"}, + ) + activation_dropout: float = field( + default=0.0, + metadata={"help": "dropout probability after activation in FFN"}, + ) + encoder_layerdrop: float = field( + default=0.0, + metadata={"help": "probability of dropping a transformer layer"}, + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + dropout_features: float = field( + default=0.0, + metadata={"help": "dropout to apply to the features (after feat extr)"}, + ) + + final_dim: int = field( + default=0, + metadata={ + "help": "project final representations and targets to this many " + "dimensions. 
set to encoder_embed_dim if <= 0" + }, + ) + untie_final_proj: bool = field( + default=True, + metadata={"help": "use separate projection for each target"}, + ) + layer_norm_first: bool = field( + default=False, + metadata={"help": "apply layernorm first in the transformer"}, + ) + conv_feature_layers: str = field( + default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", + metadata={ + "help": "string describing convolutional feature extraction " + "layers in form of a python list that contains " + "[(dim, kernel_size, stride), ...]" + }, + ) + conv_bias: bool = field( + default=False, metadata={"help": "include bias in conv encoder"} + ) + logit_temp: float = field( + default=0.1, metadata={"help": "temperature to divide logits by"} + ) + target_glu: bool = field( + default=False, metadata={"help": "adds projection + glu to targets"} + ) + feature_grad_mult: float = field( + default=1.0, + metadata={"help": "multiply feature extractor var grads by this"}, + ) + use_single_target: bool = field( + default=False, + metadata={ + "help": "whether to use a single target (in that case, we compute with a fixed label rate)" + }, + ) + use_single_prediction: bool = field( + default=False, + metadata={ + "help": "if true, do not conduct MLM prediction at the lower resolutions in the middle" + }, + ) + use_multi_stream: bool = field( + default=False, + metadata={ + "help": "whether to use the multi-stream setting (multiple streams with the same resolution)" + }, + ) + + # masking + mask_length: int = field(default=10, metadata={"help": "mask length"}) + mask_prob: float = field( + default=0.65, + metadata={"help": "probability of replacing a token with mask"}, + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose mask length"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + mask_min_space: int = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + + # channel masking + mask_channel_length: int = field( + default=10, + metadata={"help": "length of the mask for features (channels)"}, + ) + mask_channel_prob: float = field( + default=0.0, + metadata={"help": "probability of replacing a feature with 0"}, + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, + metadata={"help": "whether to allow channel masks to overlap"}, + ) + mask_channel_min_space: int = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + + # positional embeddings + conv_pos: int = field( + default=128, + metadata={"help": "number of filters for convolutional positional embeddings"}, + ) + conv_pos_groups: int = field( + default=16, + metadata={"help": "number of groups for convolutional positional embedding"}, + ) + + latent_temp: Tuple[float, float, float] = field( + default=(2, 0.5, 0.999995), + metadata={"help": "legacy (to be removed)"}, + ) + + # loss 
computation + skip_masked: bool = field( + default=False, + metadata={"help": "skip computing losses over masked frames"}, + ) + skip_nomask: bool = field( + default=False, + metadata={"help": "skip computing losses over unmasked frames"}, + ) + + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, + ) + + # FP16 optimization + required_seq_len_multiple: int = field( + default=2, + metadata={ + "help": "pad the input to encoder such that the sequence length is divisible by multiple" + }, + ) + + # Conformer + depthwise_conv_kernel_size: int = field( + default=31, + metadata={ + "help": "depthwise-conv-kernel-size for convolution in conformer layer" + }, + ) + attn_type: str = field( + default="", + metadata={"help": "if espnet use ESPNET MHA"}, + ) + pos_enc_type: str = field( + default="abs", + metadata={"help": "Positional encoding type to use in conformer"}, + ) + fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) + + +@register_model("multires_hubert", dataclass=MultiresHubertConfig) +class MultiresHubertModel(BaseFairseqModel): + def __init__( + self, + cfg: MultiresHubertConfig, + task_cfg: MultiresHubertPretrainingConfig, + dictionaries: List[Dictionary], + ) -> None: + super().__init__() + logger.info(f"MultiresHubertModel Config: {cfg}") + + feature_enc_layers = eval(cfg.conv_feature_layers) # noqa + self.embed = feature_enc_layers[-1][0] + + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, + ) + self.post_extract_proj = ( + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim + else None + ) + + # Estimate label rates + assert ( + cfg.label_rate_ratios != "None" + ), "without ratios, the model is exactly as the Hubert model" + self.label_rate_ratios = [] + self.base_rate = cfg.label_rate + self.label_rates = [] + self.downsample_modules = nn.ModuleList() + self.upsample_modules = nn.ModuleList() + self.encoders = nn.ModuleList() + self.decoders = nn.ModuleList() + self.use_single_target = cfg.use_single_target + self.use_single_prediction = cfg.use_single_prediction + self.use_plain_updownsample = cfg.use_plain_updownsample + + # For decide the override encoder layers, so that the layer number is not equally distributed + if cfg.override_encoder_layers != "": + self.override_encoder_layers = eval(cfg.override_encoder_layers) + assert ( + len(self.override_encoder_layers) % 2 == 1 + ), "must be odd number of layers if specify detailed layers" + assert ( + len(self.override_encoder_layers) // 2 + == len(cfg.label_rate_ratios) // 2 + ), "number of override encoder layers must match the label rate ratios information" + self.len_encoder_modules = len(self.override_encoder_layers) + else: + self.override_encoder_layers = None + self.len_encoder_modules = None + + # use different layers instead of equally distributed ones + middle_override_encoder_layer = ( + self.override_encoder_layers[self.len_encoder_modules // 2] + if self.override_encoder_layers is not None + else None + ) + skip_middle_pos_conv = False if len(cfg.label_rate_ratios) < 2 else True + + self.middle_encoder = TransformerEncoder( + cfg, + skip_pos_conv=skip_middle_pos_conv, + override_encoder_layer=middle_override_encoder_layer, + ) + + first_pos_conv = False # only enable pos_conv for the first encoder + raw_label_rate_ratios = cfg.label_rate_ratios + for i in 
range(len(raw_label_rate_ratios) // 2): + # check if have override encoder layers + if self.override_encoder_layers is not None: + override_encoder_layer = self.override_encoder_layers[i] + override_decoder_layer = self.override_encoder_layers[ + self.len_encoder_modules - 1 - i + ] + else: + override_encoder_layer, override_decoder_layer = None, None + + self.label_rate_ratios.append( + (raw_label_rate_ratios[i * 2], raw_label_rate_ratios[i * 2 + 1]) + ) + if self.use_plain_updownsample: + self.downsample_modules.append( + ConvDownsampler( + k=cfg.conv_adapator_kernal, + label_rate=( + ( + raw_label_rate_ratios[i * 2], + raw_label_rate_ratios[i * 2 + 1], + ) + ), + dropout=0.0, + channels=cfg.encoder_embed_dim, + activation=nn.GELU(), + log_compression=False, + skip_connections=True, + highway=True, + residual_scale=0.4, + ) + ) + else: + self.downsample_modules.append( + ConvAdapter( + k=cfg.conv_adapator_kernal, + label_rate=( + ( + raw_label_rate_ratios[i * 2], + raw_label_rate_ratios[i * 2 + 1], + ) + ), + dropout=0.0, + channels=cfg.encoder_embed_dim, + activation=nn.GELU(), + log_compression=False, + skip_connections=True, + highway=True, + residual_scale=0.4, + ) + ) + if not first_pos_conv: + self.encoders.append( + TransformerEncoder( + cfg, override_encoder_layer=override_encoder_layer + ) + ) # TODO(jiatong): add conformer options + first_pos_conv = True + else: + self.encoders.append( + TransformerEncoder( + cfg, + skip_pos_conv=True, + override_encoder_layer=override_encoder_layer, + ) + ) + if self.use_plain_updownsample: + self.upsample_modules.append( + ConvUpsampler( + k=cfg.conv_adapator_kernal, + label_rate=( + ( + raw_label_rate_ratios[i * 2 + 1], + raw_label_rate_ratios[i * 2], + ) + ), + dropout=0.0, + channels=cfg.encoder_embed_dim, + activation=nn.GELU(), + log_compression=False, + skip_connections=True, + highway=True, + residual_scale=0.4, + ) + ) + else: + self.upsample_modules.append( + ConvAdapter( + k=cfg.conv_adapator_kernal, + label_rate=( + ( + raw_label_rate_ratios[i * 2 + 1], + raw_label_rate_ratios[i * 2], + ) + ), + dropout=0.0, + channels=cfg.encoder_embed_dim, + activation=nn.GELU(), + log_compression=False, + skip_connections=True, + highway=True, + residual_scale=0.4, + ) + ) + self.decoders.append( + TransformerEncoder( + cfg, + skip_pos_conv=True, + override_encoder_layer=override_decoder_layer, + ) + ) + + base_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) + self.feature_ds_rates = [base_ds_rate] + running_rate = self.base_rate + + if cfg.use_single_target or cfg.use_multi_stream: + self.label_rates = self.base_rate + else: + self.label_rates.append(self.base_rate) + + for label_rate_ratio in self.label_rate_ratios: + upsample_rate, downsample_rate = label_rate_ratio + if (base_ds_rate * upsample_rate) % downsample_rate != 0: + logger.warning( + "base rate: {} cannot be ideally processed with downsample rate {}".format( + base_ds_rate, downsample_rate + ) + ) + + base_ds_rate = base_ds_rate * downsample_rate // upsample_rate + self.feature_ds_rates.append(base_ds_rate) + + if not cfg.use_single_target and not cfg.use_multi_stream: + running_rate = running_rate * upsample_rate // downsample_rate + self.label_rates.append(running_rate) + self.label_nums = len( + self.feature_ds_rates + ) # the number of labels for prediction (activate at iter 2) + + if type(self.label_rates) == float: + self.feat2tar_ratios = [ + self.feature_ds_rates[i] * self.label_rates / task_cfg.sample_rate + for i in range(len(self.feature_ds_rates)) + ] + else: + 
self.feat2tar_ratios = [ + self.feature_ds_rates[i] * self.label_rates[i] / task_cfg.sample_rate + for i in range(len(self.feature_ds_rates)) + ] + + # self.feat2tar_ratios = self.feat2tar_ratios[::-1] + + # An running example of the label rate: + # base_ds_rate = 320 + # self.label_rate_ratios = [(1, 2)] + # self.feature_ds_rates = [320, 640] + # self.label_rates = [50, 25] + # self.feat2tar_ratios = [1, 1] + + # Another running example of the label rate: + # base_ds_rate = 320 + # self.label_rate_ratios = [(1, 2)] + # self.feature_ds_rates = [320, 640] + # self.label_rates = 100 + # self.feat2tar_ratios = [4, 2] + # self.use_sinlge_target = True + + logging.info( + "ds_rates: {}, label_rates: {}, feat2tar_ratios: {}".format( + self.feature_ds_rates, self.label_rates, self.feat2tar_ratios + ) + ) + + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space + + self.mask_channel_prob = cfg.mask_channel_prob + self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space + + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult + self.logit_temp = cfg.logit_temp + self.skip_masked = cfg.skip_masked + self.skip_nomask = cfg.skip_nomask + + # Note(jiatong): different from hubert, we just set the final dim as encoder_embed_dim + final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim + + self.mask_emb = nn.Parameter( + torch.FloatTensor(cfg.encoder_embed_dim).uniform_() + ) + + self.layer_norm = LayerNorm(self.embed) + + self.predictor_head_num = 1 if self.use_single_prediction else self.label_nums + + self.target_glu = None + if cfg.target_glu: + self.target_glus = nn.ModuleList() + for i in range(self.predictor_head_num): + self.target_glus.append( + nn.Sequential(nn.Linear(final_dim, final_dim * 2), nn.GLU()) + ) + + self.untie_final_proj = cfg.untie_final_proj + self.final_projs = nn.ModuleList() + + # Note(jiatong): we do not have untie cases for multires hubert + for i in range(self.predictor_head_num): + self.final_projs.append(nn.Linear(cfg.encoder_embed_dim, final_dim)) + + # modules below are not needed during fine-tuning + self.multires_classes = [] + self.label_embs_concat = nn.ParameterList() + + for i in range(self.predictor_head_num): + if self.use_single_target: + num_classes = len(dictionaries[0]) + else: + num_classes = len(dictionaries[i]) + self.multires_classes.append(num_classes) + self.label_embs_concat.append( + nn.Parameter(torch.FloatTensor(num_classes, final_dim)) + ) + nn.init.uniform_(self.label_embs_concat[i]) + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model( + cls, cfg: MultiresHubertConfig, task: MultiresHubertPretrainingTask + ): + """Build a new model instance.""" + + model = MultiresHubertModel(cfg, task.cfg, task.dictionaries) + return model + + def apply_mask(self, x, padding_mask, target_list): + B, T, C = x.shape + if self.mask_prob > 0: + mask_indices = 
compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x[mask_indices] = self.mask_emb + else: + mask_indices = None + + if self.mask_channel_prob > 0: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + return x, mask_indices + + def compute_nce(self, x, pos, negs): + neg_is_pos = (pos == negs).all(-1) + pos = pos.unsqueeze(0) + targets = torch.cat([pos, negs], dim=0) + + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) + logits /= self.logit_temp + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + logits = logits.transpose(0, 1) # (num_x, num_cls+1) + return logits + + def forward_features(self, source: torch.Tensor) -> torch.Tensor: + if self.feature_grad_mult > 0: + features = self.feature_extractor(source) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: + with torch.no_grad(): + features = self.feature_extractor(source) + return features + + def forward_targets( + self, + features: torch.Tensor, + target: torch.Tensor, + feat2tar_ratio: float, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Trim features to ensure labels exist and then get aligned labels + + feat_tsz = features.size(1) + + # skip if no target is provided + if target is None: + return features, None, None + targ_tsz = target.size(1) + if feat2tar_ratio * feat_tsz > targ_tsz: + feat_tsz = int(targ_tsz / feat2tar_ratio) + features = features[:, :feat_tsz] + target_inds = torch.arange(feat_tsz).float() * feat2tar_ratio + target = target[:, target_inds.long()] + return features, target + + def forward_padding_mask( + self, + features: torch.Tensor, + padding_mask: torch.Tensor, + ) -> torch.Tensor: + extra = padding_mask.size(1) % features.size(1) + if extra > 0: + padding_mask = padding_mask[:, :-extra] + padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) + padding_mask = padding_mask.all(-1) + return padding_mask + + def forward( + self, + source: torch.Tensor, + target_list: Optional[List[torch.Tensor]] = None, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = True, + features_only: bool = False, + output_layer: Optional[int] = None, + ) -> Dict[str, torch.Tensor]: + """output layer is 1-based""" + features = self.forward_features(source) + + features_pen = features.float().pow(2).mean() + + features = features.transpose(1, 2) + features = self.layer_norm(features) + unmasked_features = features.clone() + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(features, padding_mask) + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + features = self.dropout_input(features) + unmasked_features = self.dropout_features(unmasked_features) + + if mask: + x, mask_indices = self.apply_mask(features, padding_mask, target_list) + else: + x = features + mask_indices = None + + # feature: (B, T, D), float + # target: (B, T), long + # x: (B, T, D), 
float + # padding_mask: (B, T), bool + # mask_indices: (B, T), bool + + def align_size_sum(feat1, pad1, feat2): + assert ( + abs(feat1.size(1) - feat2.size(1)) < 10 + ), "misaligned results for feat1 and feat2 of size {} - {}".format( + feat1.size(1), feat2.size(1) + ) + common_size = min(feat1.size(1), feat2.size(1)) + + return ( + feat1[:, :common_size] + feat2[:, :common_size], + pad1[:, :common_size], + ) + + # process encoders + res_outputs = [] # final output for different resolution + multi_mask_indices = [] # mask indices for different resolution + residuals = [] # record the x in encoders + padding_masks = [] # final padding masks + # The encoder has (self.label_nums - 1) blocks + for i in range(self.label_nums - 1): + x, _ = self.encoders[i](x, padding_mask=padding_mask, layer=None) + residuals.append(x) + x, padding_mask, mask_indices = self.downsample_modules[i]( + x, padding=padding_mask, mask_indices=mask_indices + ) + + residual = self.middle_encoder(x, padding_mask=padding_mask, layer=None)[0] + x = x + residual + res_outputs.append(x) + + # process decoders + # The encoder has (self.label_nums - 1) blocks + padding_masks.append(padding_mask) + multi_mask_indices.append(mask_indices) + residuals.reverse() # NOTE(jiatong): reverse res_output to match corresponding input + for i in range(self.label_nums - 1): + x, padding_mask, mask_indices = self.upsample_modules[ + self.label_nums - 2 - i + ](x, padding=padding_mask, mask_indices=mask_indices) + x, _ = self.decoders[i](x, padding_mask=padding_mask, layer=None) + x, padding_mask = align_size_sum(x, padding_mask, residuals[i]) + res_outputs.append(x) + padding_masks.append(padding_mask) + multi_mask_indices.append(mask_indices) + + # NOTE(jiatong): need reverse of target list to allow matched target-representation + res_outputs.reverse() + padding_masks.reverse() + multi_mask_indices.reverse() + if target_list is not None: + new_target_list = [] + for i in range(self.label_nums): + if self.use_single_target: + res_outputs[i], reformat_target_list = self.forward_targets( + res_outputs[i], target_list[0], self.feat2tar_ratios[i] + ) + new_target_list.append(reformat_target_list) + else: + if target_list[i] is not None: + res_outputs[i], reformat_target_list = self.forward_targets( + res_outputs[i], target_list[i], self.feat2tar_ratios[i] + ) + new_target_list.append(reformat_target_list) + else: + # Append a None target list then it won't be used to calculate loss + new_target_list.append(None) + if padding_masks[i] is not None: + padding_masks[i] = self.forward_padding_mask( + res_outputs[i], padding_masks[i] + ) + if multi_mask_indices[i] is not None: + multi_mask_indices[i] = self.forward_padding_mask( + res_outputs[i], multi_mask_indices[i] + ) + + + if features_only: + # NOTE(jiatong): need to reverse back + res_outputs.reverse() + return { + "x": res_outputs, + "padding_mask": padding_masks[0], + "features": features, + } + + def compute_pred(proj_x, target, label_embs): + # compute logits for the i-th label set + y = torch.index_select(label_embs, 0, target.long()) + negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) + if self.target_glu: + y = self.target_glu(y) + negs = self.target_glu(negs) + # proj_x: (S, D) + # y: (S, D) + # negs: (Neg, S, D) + return self.compute_nce(proj_x, y, negs) + + logit_m_list, logit_u_list = [], [] + for j in range(self.label_nums): + if new_target_list[j] is None: + continue # skip empty targets + label_embs_list = self.label_embs_concat[j].split( + [self.multires_classes[j]], 0 
+ ) + # set the variables (after the set, the procedure is the same as hubert) + # all the elements are list with only one element (to simulate the normal hubert process) + x = res_outputs[j] + target = new_target_list[j] + padding_mask = padding_masks[j] + mask_indices = multi_mask_indices[j] + final_proj = self.final_projs[j] + + if not self.skip_masked: + masked_indices = torch.logical_and(~padding_mask, mask_indices) + proj_x_m = final_proj(x[masked_indices]) + logit_m_list.append( + compute_pred(proj_x_m, target[masked_indices], label_embs_list[0]) + ) + else: + logit_m_list.append(None) + + if not self.skip_nomask: + nomask_indices = torch.logical_and(~padding_mask, ~mask_indices) + proj_x_u = final_proj(x[nomask_indices]) + logit_u_list.append( + compute_pred(proj_x_u, target[nomask_indices], label_embs_list[0]) + ) + else: + logit_u_list.append(None) + + # if we only want one prediction, we can exit now + if self.predictor_head_num == 1: + break + + result = { + "logit_m_list": logit_m_list, + "logit_u_list": logit_u_list, + "padding_mask": padding_mask, + "features_pen": features_pen, + } + return result + + def extract_features( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + last_layer: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + res = self.forward( + source, + padding_mask=padding_mask, + mask=mask, + features_only=True, + output_layer=output_layer, + ) + feature = res["features"] if ret_conv else res["x"] + if last_layer: + feature = feature[-1] + return feature, res["padding_mask"] + + def get_logits(self, net_output, is_masked=True): + if is_masked: + logits_list = net_output["logit_m_list"] + else: + logits_list = net_output["logit_u_list"] + logits_list = [x.float() for x in logits_list if x is not None] + return logits_list + + def get_targets(self, net_output, is_masked=True): + logits_list = self.get_logits(net_output, is_masked) + targets_list = [x.new_zeros(x.size(0), dtype=torch.long) for x in logits_list] + return targets_list + + def get_extra_losses(self, net_output): + extra_losses = [] + names = [] + + if "features_pen" in net_output: + extra_losses.append(net_output["features_pen"]) + names.append("features_pen") + + return extra_losses, names + + def remove_pretraining_modules(self): + self.target_glu = None + self.final_proj = None + + +class ConvAdapter(nn.Module): + """Conv adapter that combines two modules with different label rate with downsample or upsample. 
+ To allow different ratios than integer, two convs are utilized with first to upsample (numerator) + and the second to downsample (denominator)""" + + def __init__( + self, + k, + label_rate, + dropout, + channels, + activation, + log_compression=False, + skip_connections=True, + highway=True, + residual_scale=0.4, + non_affine_group_norm=False, + ): + super().__init__() + + def downsample_block(channel, k, stride): + return nn.Sequential( + # with padding (k - 1) // 2 to keep the same size + nn.Conv1d( + channel, + channel, + k, + stride=stride, + bias=False, + padding=(k - 1) // 2, + ), + nn.Dropout(p=dropout), + norm_block( + is_layer_norm=False, dim=channel, affine=not non_affine_group_norm + ), + activation, + ) + + def upsample_block(channel, k, stride): + return nn.Sequential( + # with padding (k - 1) // 2 to keep the same size + nn.ConvTranspose1d( + channel, + channel, + k, + stride=stride, + bias=False, + padding=0, # padding=(k - 1) // 2, + output_padding=(stride - 1), + ), + nn.Dropout(p=dropout), + norm_block( + is_layer_norm=False, dim=channel, affine=not non_affine_group_norm + ), + activation, + ) + + assert len(label_rate) == 2, "label_rate should be sized two to apply fusion" + # Lout = (Lin - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1 + self.upsample_conv = upsample_block(channels, k, label_rate[0]) + self.downsample_conv = downsample_block(channels, k, label_rate[1]) + + self.upsample_rate, self.downsample_rate = label_rate + self.log_compression = log_compression + self.skip_connections = skip_connections + self.highway = highway + self.residual_scale = math.sqrt(residual_scale) + + def forward(self, x, padding=None, mask_indices=None): + # Assume x1 = (B, T, C) as input + x = x.permute(0, 2, 1) + residual_before_upsample = x + x = self.upsample_conv(x) + upsample_size = x.size(2) + + # conduct upsample + if self.skip_connections: + residual_upsample = torch.repeat_interleave( + residual_before_upsample, self.upsample_rate, dim=2 + ) + upsample_size = min(upsample_size, residual_upsample.size(2)) + x = ( + x[..., :upsample_size] + residual_upsample[..., :upsample_size] + ) * self.residual_scale + + residual_before_downsample = x + x = self.downsample_conv(x) + downsample_size = x.size(2) + + if self.skip_connections: + residual_downsample = residual_before_downsample[ + ..., :: self.downsample_rate + ] + downsample_size = min(x.size(2), residual_downsample.size(2)) + x = ( + x[..., :downsample_size] + residual_downsample[..., :downsample_size] + ) * self.residual_scale + + if self.highway: + residual_after_sample = residual_upsample[..., :: self.downsample_rate] + final_size = min(x.size(2), residual_after_sample.size(2)) + x = ( + x[..., :final_size] + residual_after_sample[..., :final_size] + ) * self.residual_scale + + if self.log_compression: + x = x.abs() + x = x + 1 + x = x.log() + + x = x.permute(0, 2, 1) + + # process padding + if padding is not None: + padding = torch.repeat_interleave(padding, self.upsample_rate, dim=1) + padding = padding[..., :: self.downsample_rate] + padding = padding[..., : x.size(1)] + + # process mask indices + if mask_indices is not None: + mask_indices = torch.repeat_interleave( + mask_indices, self.upsample_rate, dim=1 + ) + mask_indices = mask_indices[..., :: self.downsample_rate] + mask_indices = mask_indices[..., : x.size(1)] + return x, padding, mask_indices + + +class ConvDownsampler(nn.Module): + """Conv downsampler that combines two modules with different label rate with downsample or upsample. 
+class ConvDownsampler(nn.Module):
+    """Conv downsampler that bridges two modules operating at different label rates.
+
+    Downsample-only variant of ConvAdapter: a single strided convolution reduces
+    the label rate by the denominator of the ratio, so the upsample rate must be 1."""
+
+    def __init__(
+        self,
+        k,
+        label_rate,
+        dropout,
+        channels,
+        activation,
+        log_compression=False,
+        skip_connections=True,
+        highway=True,
+        residual_scale=0.4,
+        non_affine_group_norm=False,
+    ):
+        super().__init__()
+
+        def downsample_block(channel, k, stride):
+            return nn.Sequential(
+                # with padding (k - 1) // 2 to keep the same size
+                nn.Conv1d(
+                    channel,
+                    channel,
+                    k,
+                    stride=stride,
+                    bias=False,
+                    padding=(k - 1) // 2,
+                ),
+                nn.Dropout(p=dropout),
+                norm_block(
+                    is_layer_norm=False, dim=channel, affine=not non_affine_group_norm
+                ),
+                activation,
+            )
+
+        assert len(label_rate) == 2, "label_rate should be sized two to apply fusion"
+        self.downsample_conv = downsample_block(channels, k, label_rate[1])
+
+        upsample_rate, self.downsample_rate = label_rate
+        assert upsample_rate == 1, "must be 1 to perform downsample only"
+        self.log_compression = log_compression
+        self.skip_connections = skip_connections
+        self.highway = highway  # unused; kept as a placeholder for API compatibility
+        self.residual_scale = math.sqrt(residual_scale)
+
+    def forward(self, x, padding=None, mask_indices=None):
+        # Assume x = (B, T, C) as input
+        x = x.permute(0, 2, 1)
+
+        residual_before_downsample = x
+        x = self.downsample_conv(x)
+        downsample_size = x.size(2)
+
+        if self.skip_connections:
+            residual_downsample = residual_before_downsample[
+                ..., :: self.downsample_rate
+            ]
+            downsample_size = min(x.size(2), residual_downsample.size(2))
+            x = (
+                x[..., :downsample_size] + residual_downsample[..., :downsample_size]
+            ) * self.residual_scale
+
+        if self.log_compression:
+            x = x.abs()
+            x = x + 1
+            x = x.log()
+
+        x = x.permute(0, 2, 1)
+
+        # process padding
+        if padding is not None:
+            padding = padding[..., :: self.downsample_rate]
+            padding = padding[..., : x.size(1)]
+
+        # process mask indices
+        if mask_indices is not None:
+            mask_indices = mask_indices[..., :: self.downsample_rate]
+            mask_indices = mask_indices[..., : x.size(1)]
+        return x, padding, mask_indices
+
+
+class ConvUpsampler(nn.Module):
+    """Conv upsampler that bridges two modules operating at different label rates.
+
+    Upsample-only variant of ConvAdapter: a single transposed convolution raises
+    the label rate by the numerator of the ratio, so the downsample rate must be 1."""
+
+    def __init__(
+        self,
+        k,
+        label_rate,
+        dropout,
+        channels,
+        activation,
+        log_compression=False,
+        skip_connections=True,
+        highway=True,
+        residual_scale=0.4,
+        non_affine_group_norm=False,
+    ):
+        super().__init__()
+
+        def upsample_block(channel, k, stride):
+            return nn.Sequential(
+                # with padding (k - 1) // 2 to keep the same size
+                nn.ConvTranspose1d(
+                    channel,
+                    channel,
+                    k,
+                    stride=stride,
+                    bias=False,
+                    padding=0,  # padding=(k - 1) // 2,
+                    output_padding=(stride - 1),
+                ),
+                nn.Dropout(p=dropout),
+                norm_block(
+                    is_layer_norm=False, dim=channel, affine=not non_affine_group_norm
+                ),
+                activation,
+            )
+
+        assert len(label_rate) == 2, "label_rate should be sized two to apply fusion"
+        # ConvTranspose1d output length:
+        # L_out = (L_in - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
+        self.upsample_conv = upsample_block(channels, k, label_rate[0])
+
+        self.upsample_rate, downsample_rate = label_rate
+        assert downsample_rate == 1, "must be 1 to perform upsample only"
+        self.log_compression = log_compression
+        self.skip_connections = skip_connections
+        self.highway = highway  # unused; kept as a placeholder for API compatibility
+        self.residual_scale = math.sqrt(residual_scale)
+
+    def forward(self, x, padding=None, mask_indices=None):
+        # Assume x = (B, T, C) as input
+        x = x.permute(0, 2, 1)
+        residual_before_upsample = x
+        x = self.upsample_conv(x)
+        upsample_size = x.size(2)
+
+        # conduct upsample
+        if self.skip_connections:
+            residual_upsample = torch.repeat_interleave(
+                residual_before_upsample, self.upsample_rate, dim=2
+            )
+            upsample_size = min(upsample_size, residual_upsample.size(2))
+            x = (
+                x[..., :upsample_size] + residual_upsample[..., :upsample_size]
+            ) * self.residual_scale
+
+        if self.log_compression:
+            x = x.abs()
+            x = x + 1
+            x = x.log()
+
+        x = x.permute(0, 2, 1)
+
+        # process padding
+        if padding is not None:
+            padding = torch.repeat_interleave(padding, self.upsample_rate, dim=1)
+            padding = padding[..., : x.size(1)]
+
+        # process mask indices
+        if mask_indices is not None:
+            mask_indices = torch.repeat_interleave(
+                mask_indices, self.upsample_rate, dim=1
+            )
+            mask_indices = mask_indices[..., : x.size(1)]
+        return x, padding, mask_indices
diff --git a/fairseq/models/multires_hubert/multires_hubert_asr.py b/fairseq/models/multires_hubert/multires_hubert_asr.py
new file mode 100644
index 0000000000..2e7ad99ce1
--- /dev/null
+++ b/fairseq/models/multires_hubert/multires_hubert_asr.py
@@ -0,0 +1,376 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
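+
+# ASR fine-tuning wrappers around a pretrained multi-resolution HuBERT model:
+# shared dataclass configs, a CTC model, and an encoder that loads the
+# pretrained checkpoint and (optionally) projects its output to the target
+# dictionary.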
+ +import contextlib +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any + +import torch +import torch.nn as nn +from omegaconf import II, MISSING + +from fairseq import checkpoint_utils, tasks, utils +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models import BaseFairseqModel, FairseqEncoder, register_model +from fairseq.models.hubert.hubert import MASKING_DISTRIBUTION_CHOICES +from fairseq.tasks import FairseqTask + + +@dataclass +class MultiresHubertAsrConfig(FairseqDataclass): + multires_hubert_path: str = field( + default=MISSING, metadata={"help": "path to multires_hubert model"} + ) + no_pretrained_weights: bool = field( + default=False, + metadata={"help": "if true, does not load pretrained weights"}, + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + final_dropout: float = field( + default=0.0, + metadata={"help": "dropout after transformer and before final projection"}, + ) + dropout: float = field( + default=0.0, + metadata={"help": "dropout probability inside hubert model"}, + ) + attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights " "inside hubert model" + }, + ) + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN " "inside hubert model" + }, + ) + + # masking + apply_mask: bool = field( + default=False, metadata={"help": "apply masking during fine-tuning"} + ) + mask_length: int = field( + default=10, metadata={"help": "repeat the mask indices multiple times"} + ) + mask_prob: float = field( + default=0.5, + metadata={ + "help": "probability of replacing a token with mask " + "(normalized by length)" + }, + ) + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose masks"} + ) + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow masks to overlap"} + ) + + # channel masking + mask_channel_length: int = field( + default=10, + metadata={"help": "length of the mask for features (channels)"}, + ) + mask_channel_prob: float = field( + default=0.0, + metadata={"help": "probability of replacing a feature with 0"}, + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument " + "(used for more complex distributions), " + "see help in compute_mask_indices" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, + metadata={"help": "whether to allow channel masks to overlap"}, + ) + freeze_finetune_updates: int = field( + default=0, + metadata={"help": "dont finetune hubert for this many updates"}, + ) + feature_grad_mult: float = field( + default=0.0, + metadata={"help": "reset feature grad mult in hubert to this"}, + ) + layerdrop: float = field( + default=0.0, + metadata={"help": "probability of dropping a layer in hubert"}, + ) + normalize: bool = II("task.normalize") + data: str = II("task.data") + + # this holds the loaded hubert args + multires_hubert_args: Any = None + + 
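+# Example of how the fields above are typically set from a Hydra/YAML
+# fine-tuning config (hypothetical values, shown only for illustration):
+#
+#   model:
+#     _name: multires_hubert_ctc
+#     multires_hubert_path: /path/to/pretrained_multires_hubert.pt
+#     apply_mask: true
+#     mask_prob: 0.65
+#     freeze_finetune_updates: 10000
+
+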
+@dataclass +class MultiresHubertCtcConfig(MultiresHubertAsrConfig): + pass + + +@register_model("multires_hubert_ctc", dataclass=MultiresHubertAsrConfig) +class MultiresHubertCtc(BaseFairseqModel): + def __init__( + self, cfg: MultiresHubertAsrConfig, multireshubert_encoder: BaseFairseqModel + ): + super().__init__() + self.cfg = cfg + self.multireshubert_encoder = multireshubert_encoder + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: MultiresHubertAsrConfig, task: FairseqTask): + """Build a new model instance.""" + multireshubert_encoder = MultiresHubertEncoder(cfg, task) + return cls(cfg, multireshubert_encoder) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + + logits = net_output["encoder_out"] + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def get_logits(self, net_output): + logits = net_output["encoder_out"] + padding = net_output["encoder_padding_mask"] + if padding is not None and padding.any(): + padding = padding.T + logits[padding][..., 0] = 0 + logits[padding][..., 1:] = float("-inf") + + return logits + + def forward(self, **kwargs): + x = self.multireshubert_encoder(**kwargs) + return x + + +@dataclass +class MultiresHubertSeq2SeqConfig(MultiresHubertAsrConfig): + decoder_embed_dim: int = field( + default=768, metadata={"help": "decoder embedding dimension"} + ) + decoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "decoder embedding dimension for FFN"} + ) + decoder_layers: int = field(default=6, metadata={"help": "num of decoder layers"}) + decoder_layerdrop: float = field( + default=0.0, metadata={"help": "decoder layerdrop chance"} + ) + decoder_attention_heads: int = field( + default=4, metadata={"help": "num decoder attention heads"} + ) + decoder_learned_pos: bool = field( + default=False, + metadata={"help": "use learned positional embeddings in the decoder"}, + ) + decoder_normalize_before: bool = field( + default=False, + metadata={"help": "apply layernorm before each decoder block"}, + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if set, disables positional embeddings " "(outside self attention)" + }, + ) + decoder_dropout: float = field( + default=0.0, metadata={"help": "dropout probability in the decoder"} + ) + decoder_attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights " "inside the decoder" + }, + ) + decoder_activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN " "inside the decoder" + }, + ) + max_target_positions: int = field( + default=2048, metadata={"help": "max target positions"} + ) + share_decoder_input_output_embed: bool = field( + default=False, + metadata={"help": "share decoder input and output embeddings"}, + ) + + +class MultiresHubertEncoder(FairseqEncoder): + def __init__(self, cfg: MultiresHubertAsrConfig, task): + self.apply_mask = cfg.apply_mask + + arg_overrides = { + "dropout": cfg.dropout, + "activation_dropout": cfg.activation_dropout, + "dropout_input": cfg.dropout_input, + "attention_dropout": cfg.attention_dropout, + "mask_length": cfg.mask_length, + "mask_prob": cfg.mask_prob, + "mask_selection": cfg.mask_selection, + "mask_other": 
cfg.mask_other, + "no_mask_overlap": cfg.no_mask_overlap, + "mask_channel_length": cfg.mask_channel_length, + "mask_channel_prob": cfg.mask_channel_prob, + "mask_channel_selection": cfg.mask_channel_selection, + "mask_channel_other": cfg.mask_channel_other, + "no_mask_channel_overlap": cfg.no_mask_channel_overlap, + "encoder_layerdrop": cfg.layerdrop, + "feature_grad_mult": cfg.feature_grad_mult, + } + + if cfg.multires_hubert_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu( + cfg.multires_hubert_path, arg_overrides + ) + multires_hubert_args = state.get("cfg", None) + if multires_hubert_args is None: + multires_hubert_args = convert_namespace_to_omegaconf(state["args"]) + cfg.multires_hubert_args = multires_hubert_args + else: + state = None + multires_hubert_args = cfg.multires_hubert_args + if isinstance(multires_hubert_args, Namespace): + cfg.multires_hubert_args = ( + multires_hubert_args + ) = convert_namespace_to_omegaconf(multires_hubert_args) + + assert cfg.normalize == multires_hubert_args.task.normalize, ( + "Fine-tuning works best when data normalization is the same. " + "Please check that --normalize is set or unset for " + "both pre-training and here" + ) + + multires_hubert_args.task.data = cfg.data + pretrain_task = tasks.setup_task(multires_hubert_args.task) + if state is not None and "task_state" in state: + # This will load the stored "dictionaries" object + pretrain_task.load_state_dict(state["task_state"]) + else: + pretrain_task.load_state_dict(task.state_dict()) + + model = pretrain_task.build_model( + multires_hubert_args.model, from_checkpoint=True + ) + if state is not None and not cfg.no_pretrained_weights: + # set strict=False because we omit some modules + model.load_state_dict(state["model"], strict=False) + + model.remove_pretraining_modules() + + super().__init__(pretrain_task.source_dictionary) + + d = multires_hubert_args.model.encoder_embed_dim + + self.multires_hubert_model = model + + self.final_dropout = nn.Dropout(cfg.final_dropout) + self.freeze_finetune_updates = cfg.freeze_finetune_updates + self.num_updates = 0 + + if task.target_dictionary is not None: + self.proj = Linear(d, len(task.target_dictionary)) + elif getattr(cfg, "decoder_embed_dim", d) != d: + self.proj = Linear(d, cfg.decoder_embed_dim) + else: + self.proj = None + + def set_num_updates(self, num_updates): + """Set the number of parameters updates.""" + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def forward(self, source, padding_mask, tbc=True, **kwargs): + multires_hubert_args = { + "source": source, + "padding_mask": padding_mask, + "mask": self.apply_mask and self.training, + "last_layer": True, + } + + ft = self.freeze_finetune_updates <= self.num_updates + + with torch.no_grad() if not ft else contextlib.ExitStack(): + x, padding_mask = self.multires_hubert_model.extract_features( + **multires_hubert_args + ) + + if tbc: + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + x = self.final_dropout(x) + + if self.proj: + x = self.proj(x) + + return { + "encoder_out": x, # T x B x C + "encoder_padding_mask": padding_mask, # B x T + "padding_mask": padding_mask, + } + + def reorder_encoder_out(self, encoder_out, new_order): + if encoder_out["encoder_out"] is not None: + encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( + 1, new_order + ) + if encoder_out["encoder_padding_mask"] is not None: + encoder_out["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ].index_select(0, new_order) + return 
encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return None + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m diff --git a/fairseq/models/nat/fairseq_nat_model.py b/fairseq/models/nat/fairseq_nat_model.py index 1dbc29d0f4..a5594a4ed9 100644 --- a/fairseq/models/nat/fairseq_nat_model.py +++ b/fairseq/models/nat/fairseq_nat_model.py @@ -18,18 +18,26 @@ def ensemble_encoder(func): def wrapper(self, *args, **kwargs): if self.ensemble_models is None or len(self.ensemble_models) == 1: return func(self, *args, **kwargs) - encoder_outs = [func(model, *args, **kwargs) for model in self.ensemble_models] - _encoder_out = encoder_outs[0] + encoder_outs = [ + func(model, *args, **kwargs, return_all_hiddens=True) + for model in self.ensemble_models + ] + _encoder_out = encoder_outs[0].copy() def stack(key): - outs = [getattr(e, key) for e in encoder_outs] - return torch.stack(outs, -1) if outs[0] is not None else None + outs = [e[key][0] for e in encoder_outs] + return [torch.stack(outs, -1) if outs[0] is not None else None] - return _encoder_out._replace( - encoder_out=stack("encoder_out"), - encoder_embedding=stack("encoder_embedding"), - encoder_states=stack("encoder_states"), - ) + _encoder_out["encoder_out"] = stack("encoder_out") + _encoder_out["encoder_embedding"] = stack("encoder_embedding") + + num_layers = len(_encoder_out["encoder_states"]) + if num_layers > 0: + _encoder_out["encoder_states"] = [ + torch.stack([e["encoder_states"][i] for e in encoder_outs], -1) + for i in range(num_layers) + ] + return _encoder_out return wrapper @@ -41,12 +49,17 @@ def wrapper(self, normalize=False, encoder_out=None, *args, **kwargs): self, normalize=normalize, encoder_out=encoder_out, *args, **kwargs ) + def _replace(encoder_out, new_val): + new_encoder_out = encoder_out.copy() + new_encoder_out["encoder_out"] = [new_val] + return new_encoder_out + action_outs = [ func( model, normalize=normalize, - encoder_out=encoder_out._replace( - encoder_out=encoder_out.encoder_out[:, :, :, i] + encoder_out=_replace( + encoder_out, encoder_out["encoder_out"][0][:, :, :, i] ), *args, **kwargs diff --git a/fairseq/models/nat/levenshtein_transformer.py b/fairseq/models/nat/levenshtein_transformer.py index f7a3f003ca..d60d3c52d5 100644 --- a/fairseq/models/nat/levenshtein_transformer.py +++ b/fairseq/models/nat/levenshtein_transformer.py @@ -9,7 +9,8 @@ from fairseq.iterative_refinement_generator import DecoderOut from fairseq.models import register_model, register_model_architecture from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder -from fairseq.models.transformer import Embedding, TransformerDecoderLayer +from fairseq.models.transformer import Embedding +from fairseq.modules import TransformerDecoderLayer from fairseq.modules.transformer_sentence_encoder import init_bert_params from .levenshtein_utils import ( @@ -149,11 +150,11 @@ def forward_decoder( if max_ratio is None: max_lens = torch.zeros_like(output_tokens).fill_(255) else: - if 
encoder_out.encoder_padding_mask is None: - max_src_len = encoder_out.encoder_out.size(0) - src_lens = encoder_out.encoder_out.new(bsz).fill_(max_src_len) + if not encoder_out["encoder_padding_mask"]: + max_src_len = encoder_out["encoder_out"].size(0) + src_lens = encoder_out["encoder_out"].new(bsz).fill_(max_src_len) else: - src_lens = (~encoder_out.encoder_padding_mask).sum(1) + src_lens = (~encoder_out["encoder_padding_mask"][0]).sum(1) max_lens = (src_lens * max_ratio).clamp(min=10).long() # delete words @@ -256,7 +257,7 @@ def initialize_output_tokens(self, encoder_out, src_tokens): initial_output_scores = initial_output_tokens.new_zeros( *initial_output_tokens.size() - ).type_as(encoder_out.encoder_out) + ).type_as(encoder_out["encoder_out"][0]) return DecoderOut( output_tokens=initial_output_tokens, @@ -357,8 +358,15 @@ def extract_features( for _, layer in enumerate(layers[:early_exit]): x, attn, _ = layer( x, - encoder_out.encoder_out if encoder_out is not None else None, - encoder_out.encoder_padding_mask if encoder_out is not None else None, + encoder_out["encoder_out"][0] + if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) + else None, + encoder_out["encoder_padding_mask"][0] + if ( + encoder_out is not None + and len(encoder_out["encoder_padding_mask"]) > 0 + ) + else None, self_attn_mask=None, self_attn_padding_mask=decoder_padding_mask, ) diff --git a/fairseq/models/nat/nonautoregressive_ensembles.py b/fairseq/models/nat/nonautoregressive_ensembles.py index 46bb8aac43..0a0221f9c4 100644 --- a/fairseq/models/nat/nonautoregressive_ensembles.py +++ b/fairseq/models/nat/nonautoregressive_ensembles.py @@ -83,14 +83,14 @@ def forward_decoder( if max_ratio is None: max_lens = output_tokens.new().fill_(255) else: - if encoder_outs[0].encoder_padding_mask is None: + if not encoder_outs[0]["encoder_padding_mask"]: src_lens = ( - encoder_outs[0] - .encoder_out.new(bsz) - .fill_(encoder_outs[0].encoder_out.size(1)) + encoder_outs[0]["encoder_out"][0] + .new(bsz) + .fill_(encoder_outs[0]["encoder_out"][0].size(1)) ) else: - src_lens = (~encoder_outs[0].encoder_padding_mask).sum(1) + src_lens = (~encoder_outs[0]["encoder_padding_mask"][0]).sum(1) max_lens = (src_lens * max_ratio).clamp(min=10).long() # delete words diff --git a/fairseq/models/nat/nonautoregressive_transformer.py b/fairseq/models/nat/nonautoregressive_transformer.py index 735297fc29..d114202d25 100644 --- a/fairseq/models/nat/nonautoregressive_transformer.py +++ b/fairseq/models/nat/nonautoregressive_transformer.py @@ -163,7 +163,7 @@ def initialize_output_tokens(self, encoder_out, src_tokens): initial_output_scores = initial_output_tokens.new_zeros( *initial_output_tokens.size() - ).type_as(encoder_out.encoder_out) + ).type_as(encoder_out["encoder_out"][0]) return DecoderOut( output_tokens=initial_output_tokens, @@ -233,8 +233,11 @@ def forward(self, normalize, encoder_out, prev_output_tokens, step=0, **unused): @ensemble_decoder def forward_length(self, normalize, encoder_out): - enc_feats = encoder_out.encoder_out # T x B x C - src_masks = encoder_out.encoder_padding_mask # B x T or None + enc_feats = encoder_out["encoder_out"][0] # T x B x C + if len(encoder_out["encoder_padding_mask"]) > 0: + src_masks = encoder_out["encoder_padding_mask"][0] # B x T + else: + src_masks = None enc_feats = _mean_pooling(enc_feats, src_masks) if self.sg_length_pred: enc_feats = enc_feats.detach() @@ -264,8 +267,11 @@ def extract_features( """ # embedding if embedding_copy: - src_embd = encoder_out.encoder_embedding - 
src_mask = encoder_out.encoder_padding_mask + src_embd = encoder_out["encoder_embedding"][0] + if len(encoder_out["encoder_padding_mask"]) > 0: + src_mask = encoder_out["encoder_padding_mask"][0] + else: + src_mask = None src_mask = ( ~src_mask if src_mask is not None @@ -297,8 +303,15 @@ def extract_features( x, attn, _ = layer( x, - encoder_out.encoder_out if encoder_out is not None else None, - encoder_out.encoder_padding_mask if encoder_out is not None else None, + encoder_out["encoder_out"][0] + if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) + else None, + encoder_out["encoder_padding_mask"][0] + if ( + encoder_out is not None + and len(encoder_out["encoder_padding_mask"]) > 0 + ) + else None, self_attn_mask=None, self_attn_padding_mask=decoder_padding_mask, ) @@ -353,8 +366,11 @@ def forward_copying_source(self, src_embeds, src_masks, tgt_masks): return copied_embedding def forward_length_prediction(self, length_out, encoder_out, tgt_tokens=None): - enc_feats = encoder_out.encoder_out # T x B x C - src_masks = encoder_out.encoder_padding_mask # B x T or None + enc_feats = encoder_out["encoder_out"][0] # T x B x C + if len(encoder_out["encoder_padding_mask"]) > 0: + src_masks = encoder_out["encoder_padding_mask"][0] # B x T + else: + src_masks = None if self.pred_length_offset: if src_masks is None: src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_( diff --git a/fairseq/models/roberta/__init__.py b/fairseq/models/roberta/__init__.py index 56579e5915..4cd723ae96 100644 --- a/fairseq/models/roberta/__init__.py +++ b/fairseq/models/roberta/__init__.py @@ -5,5 +5,7 @@ from .hub_interface import * # noqa from .model import * # noqa +from .enc_dec import * # noqa from .model_camembert import * # noqa +from .model_gottbert import * # noqa from .model_xlmr import * # noqa diff --git a/fairseq/models/roberta/enc_dec.py b/fairseq/models/roberta/enc_dec.py new file mode 100644 index 0000000000..e538dee0aa --- /dev/null +++ b/fairseq/models/roberta/enc_dec.py @@ -0,0 +1,192 @@ +import argparse +import logging + +import torch.nn as nn +import fairseq.checkpoint_utils +from fairseq.models import ( + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import TransformerDecoder +from fairseq.models.roberta import model as roberta + +logger = logging.getLogger(__name__) + + +@register_model("roberta_enc_dec") +class RobertaEncDecModel(FairseqEncoderDecoderModel): + @staticmethod + def add_args(parser): + parser.add_argument( + "--pretrained-mlm-checkpoint", + default=None, + type=str, + metavar="PRETRAINED", + help="path to pretrained mlm checkpoint", + ) + parser.add_argument( + "--pretrained-decoder", action="store_true", help="reload decoder" + ) + parser.add_argument( + "--hack-layernorm-embedding", + action="store_true", + help="hack to reload old models trained with encoder-normalize-before=False (no equivalent to encoder-normalize-before=False and layernorm_embedding=False", + ) + parser.add_argument( + "--share-decoder-input-output-embed", + action="store_true", + help="share decoder input and output embeddings", + ) + parser.add_argument( + "--share-all-embeddings", + action="store_true", + help="share encoder, decoder and output embeddings" + " (requires shared dictionary and embed dim)", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present + base_enc_dec_architecture(args) + if args.pretrained_mlm_checkpoint: + 
arg_overrides = None
+            if args.hack_layernorm_embedding:
+                arg_overrides = {"layernorm_embedding": False}
+            loaded = fairseq.checkpoint_utils.load_model_ensemble_and_task(
+                [args.pretrained_mlm_checkpoint], arg_overrides=arg_overrides
+            )
+            ([roberta_enc], _cfg, _task) = loaded
+        else:
+            # Do we need to edit untie_weights here?
+            share_in_out = (
+                args.share_decoder_input_output_embed or args.share_all_embeddings
+            )
+            args.untie_weights_roberta = not share_in_out
+            if args.hack_layernorm_embedding:
+                args.layernorm_embedding = False
+                args.encoder_normalize_before = False
+            roberta_enc = roberta.RobertaModel.build_model(args, task)
+
+        return cls.from_roberta(roberta_enc, args, task.source_dictionary)
+
+    @staticmethod
+    def from_roberta(roberta_enc: roberta.RobertaModel, args, dictionary):
+        encoder = roberta_enc.encoder.sentence_encoder
+        vocab_size, embed_dim = encoder.embed_tokens.weight.shape
+
+        if args.share_all_embeddings:
+            lm_head = roberta_enc.encoder.lm_head
+            assert encoder.embed_tokens.weight is lm_head.weight, (
+                "Can't use --share-all-embeddings with a model "
+                "that was pretrained with --untie-weights-roberta"
+            )
+        else:
+            lm_head = roberta.RobertaLMHead(
+                embed_dim, vocab_size, roberta_enc.args.activation_fn
+            )
+
+        dec_embs = nn.Embedding(vocab_size, embed_dim, dictionary.pad())
+        if args.share_all_embeddings or args.share_decoder_input_output_embed:
+            # Note: I wasn't able to use the Embedding _weight parameter to achieve this sharing.
+            dec_embs.weight = lm_head.weight
+
+        decoder = TransformerDecoder(
+            RobertaEncDecModel.read_args_from_roberta(roberta_enc.args),
+            dictionary,
+            dec_embs,
+            no_encoder_attn=False,
+            output_projection=lm_head,
+        )
+        if getattr(args, "pretrained_decoder", False):
+            decoder_dict = encoder.state_dict()
+
+            # TODO: hide setting "encoder_attn" layers behind a flag.
+            for k, w in list(decoder_dict.items()):
+                if ".self_attn" in k:
+                    k_enc_attn = k.replace(".self_attn", ".encoder_attn")
+                    decoder_dict[k_enc_attn] = w.detach().clone()
+
+            for k, w in lm_head.state_dict().items():
+                decoder_dict["output_projection." + k] = w
+
+            missing_keys, unexpected_keys = decoder.load_state_dict(
+                decoder_dict, strict=False
+            )
+            # missing_keys = [m for m in missing_keys if ".encoder_attn" not in m]
+            assert not missing_keys and not unexpected_keys, (
+                "Failed to load state dict. "
+                f"Missing keys: {missing_keys}. "
+                f"Unexpected keys: {unexpected_keys}."
+ ) + + if args.share_all_embeddings: + assert decoder.output_projection.weight is decoder.embed_tokens.weight + assert encoder.embed_tokens.weight is decoder.embed_tokens.weight + elif args.share_decoder_input_output_embed: + assert decoder.output_projection.weight is decoder.embed_tokens.weight + assert encoder.embed_tokens.weight is not decoder.embed_tokens.weight + else: + assert decoder.output_projection.weight is not decoder.embed_tokens.weight + assert encoder.embed_tokens.weight is not decoder.embed_tokens.weight + + return RobertaEncDecModel(encoder, decoder) + + @staticmethod + def read_args_from_roberta(roberta_args: argparse.Namespace): + # TODO: this would become easier if encoder/decoder where using a similar + # TransformerConfig object + args = argparse.Namespace(**vars(roberta_args)) + attr_map = [ + ("encoder_attention_heads", "decoder_attention_heads"), + ("encoder_embed_dim", "decoder_embed_dim"), + ("encoder_embed_dim", "decoder_output_dim"), + ("encoder_normalize_before", "decoder_normalize_before"), + ("encoder_layers_to_keep", "decoder_layers_to_keep"), + ("encoder_ffn_embed_dim", "decoder_ffn_embed_dim"), + ("encoder_layerdrop", "decoder_layerdrop"), + ("encoder_layers", "decoder_layers"), + ("encoder_learned_pos", "decoder_learned_pos"), + # should this be set from here ? + ("max_positions", "max_target_positions"), + ] + for k1, k2 in attr_map: + setattr(args, k2, getattr(roberta_args, k1)) + + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = not roberta_args.untie_weights_roberta + return args + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." if name != "" else "" + super().upgrade_state_dict_named(state_dict, name) + old_keys = list(state_dict.keys()) + + # rename decoder -> encoder before upgrading children modules + for k in old_keys: + if k.startswith(prefix + "encoder.lm_head"): + state_dict.pop(k) + continue + new_k = k + new_k = new_k.replace(".sentence_encoder.", ".") + new_k = new_k.replace("decoder.lm_head.", "decoder.output_projection.") + if k == new_k: + continue + # print(k, "->", new_k) + state_dict[new_k] = state_dict.pop(k) + + +@register_model_architecture("roberta_enc_dec", "roberta_enc_dec") +def base_enc_dec_architecture(args): + args.hack_layernorm_embedding = getattr(args, "hack_layernorm_embedding", False) + args.pretrained_mlm_checkpoint = getattr(args, "pretrained_mlm_checkpoint", None) + args.pretrained_decoder = getattr(args, "pretrained_decoder", None) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + + roberta.base_architecture(args) diff --git a/fairseq/models/roberta/hub_interface.py b/fairseq/models/roberta/hub_interface.py index d6322c30e8..ba298d63ba 100644 --- a/fairseq/models/roberta/hub_interface.py +++ b/fairseq/models/roberta/hub_interface.py @@ -14,7 +14,7 @@ class RobertaHubInterface(nn.Module): """A simple PyTorch Hub interface to RoBERTa. 
- Usage: https://github.com/pytorch/fairseq/tree/master/examples/roberta + Usage: https://github.com/pytorch/fairseq/tree/main/examples/roberta """ def __init__(self, cfg, task, model): @@ -71,7 +71,7 @@ def decode(self, tokens: torch.LongTensor): tokens = tokens[1:] # remove <s> eos_mask = tokens == self.task.source_dictionary.eos() doc_mask = eos_mask[1:] & eos_mask[:-1] - sentences = np.split(tokens, doc_mask.nonzero(as_tuple=False)[0] + 1) + sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) sentences = [ self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences ] diff --git a/fairseq/models/roberta/model.py b/fairseq/models/roberta/model.py index 5c9f92a149..d7ced9190c 100644 --- a/fairseq/models/roberta/model.py +++ b/fairseq/models/roberta/model.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn import torch.nn.functional as F + from fairseq import utils from fairseq.models import ( FairseqEncoder, @@ -18,13 +19,14 @@ register_model, register_model_architecture, ) -from fairseq.modules import LayerNorm, TransformerSentenceEncoder +from fairseq.models.transformer import DEFAULT_MIN_PARAMS_TO_WRAP, TransformerEncoder +from fairseq.modules import LayerNorm from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ from fairseq.modules.transformer_sentence_encoder import init_bert_params +from fairseq.utils import safe_getattr, safe_hasattr from .hub_interface import RobertaHubInterface - logger = logging.getLogger(__name__) @@ -87,6 +89,11 @@ def add_args(parser): action="store_true", help="apply layernorm before each encoder block", ) + parser.add_argument( + "--layernorm-embedding", + action="store_true", + help="add layernorm to embedding", + ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) @@ -116,6 +123,11 @@ def add_args(parser): action="store_true", help="(re-)register and load heads when loading checkpoints", ) + parser.add_argument( + "--untie-weights-roberta", + action="store_true", + help="Untie weights between embeddings and classifiers in RoBERTa", + ) # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument( "--encoder-layerdrop", @@ -151,29 +163,82 @@ def add_args(parser): default=0, help="scalar quantization noise and scalar quantization at training time", ) - parser.add_argument( - "--untie-weights-roberta", - action="store_true", - help="Untie weights between embeddings and classifiers in RoBERTa", - ) + # args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020) parser.add_argument( "--spectral-norm-classification-head", action="store_true", default=False, help="Apply spectral normalization on the classification head", ) + # args for Fully Sharded Data Parallel (FSDP) training + parser.add_argument( + "--min-params-to-wrap", + type=int, + metavar="D", + default=DEFAULT_MIN_PARAMS_TO_WRAP, + help=( + "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed." 
+ ), + ) + # args for AdaPruning + # In short, it adds regularizarion for the multihead attention module and feed forward neural nets + # For more details, please refer to the paper https://openreview.net/forum?id=_CMSV7FTzGI + parser.add_argument( + "--mha-reg-scale-factor", + type=float, + metavar="D", + default=0.0, + help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375", + ) + parser.add_argument( + "--ffn-reg-scale-factor", + type=float, + metavar="D", + default=0.0, + help="scaling factor for regularization term in adptive pruning, recommendation is 0.000375", + ) + parser.add_argument( + "--mha-heads-to-keep", + type=int, + metavar="D", + default=-1, + help="number of heads to keep in each multi-head attention module, -1 means keeping all heads", + ) + parser.add_argument( + "--ffn-blocks-to-remove", + type=int, + metavar="D", + default=-1, + help="number of feedforward blocks to remove in each transformer layer, -1 means keeping all ffn blocks", + ) @classmethod def build_model(cls, args, task): """Build a new model instance.""" + from omegaconf import OmegaConf + + if OmegaConf.is_config(args): + OmegaConf.set_struct(args, False) + # make sure all arguments are present base_architecture(args) - if not hasattr(args, "max_positions"): + if not safe_hasattr(args, "max_positions"): + if not safe_hasattr(args, "tokens_per_sample"): + args.tokens_per_sample = task.max_positions() args.max_positions = args.tokens_per_sample encoder = RobertaEncoder(args, task.source_dictionary) + + if OmegaConf.is_config(args): + OmegaConf.set_struct(args, True) + return cls(args, encoder) def forward( @@ -182,7 +247,7 @@ def forward( features_only=False, return_all_hiddens=False, classification_head_name=None, - **kwargs + **kwargs, ): if classification_head_name is not None: features_only = True @@ -193,6 +258,66 @@ def forward( x = self.classification_heads[classification_head_name](x) return x, extra + def _get_adaptive_head_loss(self): + norm_loss = 0 + scaling = float(self.args.mha_reg_scale_factor) + for layer in self.encoder.sentence_encoder.layers: + norm_loss_layer = 0 + for i in range(layer.self_attn.num_heads): + start_idx = i * layer.self_attn.head_dim + end_idx = (i + 1) * layer.self_attn.head_dim + norm_loss_layer += scaling * ( + torch.sum( + torch.abs( + layer.self_attn.q_proj.weight[ + start_idx:end_idx, + ] + ) + ) + + torch.sum( + torch.abs(layer.self_attn.q_proj.bias[start_idx:end_idx]) + ) + ) + norm_loss_layer += scaling * ( + torch.sum( + torch.abs( + layer.self_attn.k_proj.weight[ + start_idx:end_idx, + ] + ) + ) + + torch.sum( + torch.abs(layer.self_attn.k_proj.bias[start_idx:end_idx]) + ) + ) + norm_loss_layer += scaling * ( + torch.sum( + torch.abs( + layer.self_attn.v_proj.weight[ + start_idx:end_idx, + ] + ) + ) + + torch.sum( + torch.abs(layer.self_attn.v_proj.bias[start_idx:end_idx]) + ) + ) + + norm_loss += norm_loss_layer + return norm_loss + + def _get_adaptive_ffn_loss(self): + ffn_scale_factor = float(self.args.ffn_reg_scale_factor) + filter_loss = 0 + for layer in self.encoder.sentence_encoder.layers: + filter_loss += torch.sum( + torch.abs(layer.fc1.weight * ffn_scale_factor) + ) + torch.sum(torch.abs(layer.fc2.weight * ffn_scale_factor)) + filter_loss += torch.sum( + torch.abs(layer.fc1.bias * ffn_scale_factor) + ) + torch.sum(torch.abs(layer.fc2.bias * ffn_scale_factor)) + return filter_loss + def get_normalized_probs(self, net_output, log_probs, sample=None): """Get normalized probabilities (or log probs) from a net's 
output.""" logits = net_output[0].float() @@ -237,7 +362,7 @@ def from_pretrained( checkpoint_file="model.pt", data_name_or_path=".", bpe="gpt2", - **kwargs + **kwargs, ): from fairseq import hub_utils @@ -264,6 +389,13 @@ def upgrade_state_dict_named(self, state_dict, name): state_dict[new_k] = state_dict[k] del state_dict[k] + # rename emb_layer_norm -> layernorm_embedding + for k in list(state_dict.keys()): + if ".emb_layer_norm." in k: + new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") + state_dict[new_k] = state_dict[k] + del state_dict[k] + # upgrade children modules super().upgrade_state_dict_named(state_dict, name) @@ -321,6 +453,19 @@ def upgrade_state_dict_named(self, state_dict, name): logger.info("Overwriting " + prefix + "classification_heads." + k) state_dict[prefix + "classification_heads." + k] = v + # adapt data2vec models + if ( + "encoder._ema" in state_dict + and "encoder.lm_head.weight" not in state_dict + ): + lm_state = self.encoder.lm_head.state_dict() + for k, v in lm_state.items(): + state_dict["encoder.lm_head." + k] = v + + for k in list(state_dict.keys()): + if k.startswith("encoder.regression_head") or k == "encoder._ema": + del state_dict[k] + class RobertaLMHead(nn.Module): """Head for masked language modeling.""" @@ -393,33 +538,21 @@ class RobertaEncoder(FairseqEncoder): def __init__(self, args, dictionary): super().__init__(dictionary) + + # set any missing default values + base_architecture(args) self.args = args if args.encoder_layers_to_keep: args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) - self.sentence_encoder = TransformerSentenceEncoder( - padding_idx=dictionary.pad(), - vocab_size=len(dictionary), - num_encoder_layers=args.encoder_layers, - embedding_dim=args.encoder_embed_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - layerdrop=args.encoder_layerdrop, - max_seq_len=args.max_positions, - num_segments=0, - encoder_normalize_before=True, - apply_bert_init=True, - activation_fn=args.activation_fn, - q_noise=args.quant_noise_pq, - qn_block_size=args.quant_noise_pq_block_size, + embed_tokens = self.build_embedding( + len(dictionary), args.encoder_embed_dim, dictionary.pad() ) - args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False) - self.lm_head = RobertaLMHead( + self.sentence_encoder = self.build_encoder(args, dictionary, embed_tokens) + + self.lm_head = self.build_lm_head( embed_dim=args.encoder_embed_dim, output_dim=len(dictionary), activation_fn=args.activation_fn, @@ -430,13 +563,24 @@ def __init__(self, args, dictionary): ), ) + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return nn.Embedding(vocab_size, embedding_dim, padding_idx) + + def build_encoder(self, args, dictionary, embed_tokens): + encoder = TransformerEncoder(args, dictionary, embed_tokens) + encoder.apply(init_bert_params) + return encoder + + def build_lm_head(self, embed_dim, output_dim, activation_fn, weight): + return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) + def forward( self, src_tokens, features_only=False, return_all_hiddens=False, masked_tokens=None, - **unused + **unused, ): """ Args: @@ -462,13 +606,15 @@ def forward( return x, extra def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): - inner_states, _ = self.sentence_encoder( + encoder_out = self.sentence_encoder( src_tokens, - 
last_state_only=not return_all_hiddens, + return_all_hiddens=return_all_hiddens, token_embeddings=kwargs.get("token_embeddings", None), ) - features = inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C - return features, {"inner_states": inner_states if return_all_hiddens else None} + # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, {"inner_states": inner_states} def output_layer(self, features, masked_tokens=None, **unused): return self.lm_head(features, masked_tokens) @@ -480,26 +626,57 @@ def max_positions(self): @register_model_architecture("roberta", "roberta") def base_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - args.spectral_norm_classification_head = getattr( + args.encoder_layers = safe_getattr(args, "encoder_layers", 12) + args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 768) + args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 3072) + args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 12) + + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_dropout = safe_getattr(args, "activation_dropout", 0.0) + args.pooler_dropout = safe_getattr(args, "pooler_dropout", 0.0) + + args.max_source_positions = safe_getattr(args, "max_positions", 512) + args.no_token_positional_embeddings = safe_getattr( + args, "no_token_positional_embeddings", False + ) + + # BERT has a few structural differences compared to the original Transformer + args.encoder_learned_pos = safe_getattr(args, "encoder_learned_pos", True) + args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", True) + args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", True) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") + args.encoder_normalize_before = safe_getattr( + args, "encoder_normalize_before", False + ) + args.pooler_activation_fn = safe_getattr(args, "pooler_activation_fn", "tanh") + args.untie_weights_roberta = safe_getattr(args, "untie_weights_roberta", False) + + # Adaptive input config + args.adaptive_input = safe_getattr(args, "adaptive_input", False) + + # LayerDrop config + args.encoder_layerdrop = safe_getattr(args, "encoder_layerdrop", 0.0) + args.encoder_layers_to_keep = safe_getattr(args, "encoder_layers_to_keep", None) + + # Quantization noise config + args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0) + args.quant_noise_pq_block_size = safe_getattr(args, "quant_noise_pq_block_size", 8) + args.quant_noise_scalar = 
safe_getattr(args, "quant_noise_scalar", 0) + + # R4F config + args.spectral_norm_classification_head = safe_getattr( args, "spectral_norm_classification_head", False ) +@register_model_architecture("roberta", "roberta_prenorm") +def roberta_prenorm_architecture(args): + args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False) + args.encoder_normalize_before = safe_getattr(args, "encoder_normalize_before", True) + base_architecture(args) + + @register_model_architecture("roberta", "roberta_base") def roberta_base_architecture(args): base_architecture(args) @@ -507,17 +684,17 @@ def roberta_base_architecture(args): @register_model_architecture("roberta", "roberta_large") def roberta_large_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 24) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_layers = safe_getattr(args, "encoder_layers", 24) + args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 16) base_architecture(args) @register_model_architecture("roberta", "xlm") def xlm_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 16) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1280) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1280 * 4) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_layers = safe_getattr(args, "encoder_layers", 16) + args.encoder_embed_dim = safe_getattr(args, "encoder_embed_dim", 1280) + args.encoder_ffn_embed_dim = safe_getattr(args, "encoder_ffn_embed_dim", 1280 * 4) + args.encoder_attention_heads = safe_getattr(args, "encoder_attention_heads", 16) base_architecture(args) diff --git a/fairseq/models/roberta/model_gottbert.py b/fairseq/models/roberta/model_gottbert.py new file mode 100644 index 0000000000..dc7a019b33 --- /dev/null +++ b/fairseq/models/roberta/model_gottbert.py @@ -0,0 +1,49 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+""" +GottBERT: a pure German Language Model +""" + +from fairseq.models import register_model + +from .hub_interface import RobertaHubInterface +from .model import RobertaModel + + +@register_model("gottbert") +class GottbertModel(RobertaModel): + @classmethod + def hub_models(cls): + return { + "gottbert-base": "https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz", + } + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="hf_byte_bpe", + bpe_vocab="vocab.json", + bpe_merges="merges.txt", + bpe_add_prefix_space=False, + **kwargs + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + bpe_vocab=bpe_vocab, + bpe_merges=bpe_merges, + bpe_add_prefix_space=bpe_add_prefix_space, + **kwargs, + ) + return RobertaHubInterface(x["args"], x["task"], x["models"][0]) diff --git a/fairseq/models/roberta/model_xlmr.py b/fairseq/models/roberta/model_xlmr.py index 5886880f73..cf6e354d53 100644 --- a/fairseq/models/roberta/model_xlmr.py +++ b/fairseq/models/roberta/model_xlmr.py @@ -19,6 +19,8 @@ def hub_models(cls): return { "xlmr.base": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz", "xlmr.large": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz", + "xlmr.xl": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xl.tar.gz", + "xlmr.xxl": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xxl.tar.gz", } @classmethod diff --git a/fairseq/models/speech_dlm/__init__.py b/fairseq/models/speech_dlm/__init__.py new file mode 100644 index 0000000000..6ea914d6a5 --- /dev/null +++ b/fairseq/models/speech_dlm/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .speech_dlm import * # noqa +from .hub_interface import * # noqa diff --git a/fairseq/models/speech_dlm/hub_interface.py b/fairseq/models/speech_dlm/hub_interface.py new file mode 100644 index 0000000000..11bc0f50bb --- /dev/null +++ b/fairseq/models/speech_dlm/hub_interface.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import logging +from typing import Any, Dict, Iterator, List + +import torch +from fairseq import utils +from omegaconf import open_dict +from torch import nn + +from tqdm import tqdm + +from fairseq.hub_utils import GeneratorHubInterface + + +logger = logging.getLogger(__name__) + + +class MultichannelGeneratorHubInterface(GeneratorHubInterface): + """Pytorch Hub interface for generating sequences from a pre-trained + multichannel language model. 
+ """ + + def __init__(self, cfg, task, models): + super().__init__(cfg, task, models) + self.cfg = cfg + self.task = task + self.models = nn.ModuleList(models) + self.src_dicts = task.source_dictionaries + self.tgt_dicts = task.target_dictionaries + self.channels = task.channels + + # optimize model for generation + for model in self.models: + model.prepare_for_inference_(cfg) + + def sample( + self, + sentences: List[Dict[str, str]], + beam: int = 1, + verbose: bool = False, + **kwargs + ) -> List[str]: + if isinstance(sentences, dict): + return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0] + tokenized_sentences = [self.encode(sentence) for sentence in sentences] + batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs) + return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos] + + def score(self, sentences: List[Dict[str, str]], **kwargs): + raise NotImplementedError( + "MultichannelGeneratorHubInterface doesn't support score() method" + ) + + def generate( + self, + tokenized_sentences: List[Dict[str, torch.LongTensor]], + beam: int = 5, + verbose: bool = False, + skip_invalid_size_inputs=False, + inference_step_args=None, + **kwargs + ) -> List[List[Dict[str, torch.Tensor]]]: + if isinstance(tokenized_sentences, dict): + return self.generate( + [tokenized_sentences], beam=beam, verbose=verbose, **kwargs + )[0] + + # build generator using current args as well as any kwargs + gen_args = copy.deepcopy(self.cfg.generation) + with open_dict(gen_args): + gen_args.beam = beam + for k, v in kwargs.items(): + setattr(gen_args, k, v) + generator = self.task.build_generator(self.models, gen_args) + + inference_step_args = inference_step_args or {} + results = [] + for batch in tqdm( + self._build_batches(tokenized_sentences, skip_invalid_size_inputs) + ): + batch = utils.apply_to_sample(lambda t: t.to(self.device), batch) + translations = self.task.inference_step( + generator, self.models, batch, **inference_step_args + ) + for id, hypos in zip(batch["id"].tolist(), translations): + # The output of the generator is supposed to be a tensor of size (bsz x max_len x n_channels) + # So we need to convert it to dictionary form + for i in range(len(hypos)): + hypos[i]["tokens"] = { + channel: hypos[i]["tokens"][..., j] + for j, channel in enumerate(self.channels) + } + results.append((id, hypos)) + + # sort output to match input order + outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])] + + if verbose: + + def getarg(name, default): + return getattr(gen_args, name, getattr(self.cfg, name, default)) + + for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs): + src_str_with_unk = { + channel: self.string(source_tokens[channel], channel) + for channel in source_tokens + } + logger.info("S\t{}".format(src_str_with_unk)) + for hypo in target_hypotheses: + hypo_str = self.decode(hypo["tokens"]) + logger.info("H\t{}\t{}".format(hypo["score"], hypo_str)) + # hypo["positional_scores"]: T x n_channels + pos_scores = {} + for c, channel in enumerate(source_tokens): + pos_scores[channel] = " ".join( + map( + lambda x: "{:.4f}".format(x), + hypo["positional_scores"][:, c].tolist(), + ) + ) + logger.info("P\t{}".format(pos_scores)) + + return outputs + + def encode(self, sentence: Dict[str, str]) -> Dict[str, torch.LongTensor]: + assert isinstance( + sentence, dict + ), "Input sentence is expected to be a dictionary over channels" + assert set(sentence.keys()) == set( + self.channels + ), "Mismatch between input sentence keys 
and model channels ({} vs {})".format( + set(sentence.keys()), set(self.channels) + ) + encoded_sentence = {} + for channel in sentence: + sentence_channel = sentence[channel] + sentence_channel = self.tokenize(sentence_channel) + sentence_channel = self.apply_bpe(sentence_channel) + sentence_channel = self.binarize(sentence_channel, channel) + encoded_sentence[channel] = sentence_channel + sentence_size = encoded_sentence[self.channels[0]].size() + assert all( + encoded_sentence[channel].size() == sentence_size + for channel in encoded_sentence + ), "Input tensors are expected to have the same size in all channels" + return encoded_sentence + + def decode(self, tokens: Dict[str, torch.LongTensor]) -> Dict[str, str]: + assert isinstance( + tokens, dict + ), "Input tokens are expected to be a dictionary over channels" + assert set(tokens.keys()) == set( + self.channels + ), "Mismatch between input tokens keys and model channels ({} vs {})".format( + set(tokens.keys()), set(self.channels) + ) + decoded_sentence = {} + for channel in tokens: + tokens_channel = tokens[channel] + sentence_channel = self.string(tokens_channel, channel) + sentence_channel = self.remove_bpe(sentence_channel) + sentence_channel = self.detokenize(sentence_channel) + decoded_sentence[channel] = sentence_channel + return decoded_sentence + + def binarize(self, sentence: str, channel: str) -> torch.LongTensor: + return ( + self.src_dicts[channel].encode_line(sentence, add_if_not_exist=False).long() + ) + + def string(self, tokens: torch.LongTensor, channel: str) -> str: + return self.tgt_dicts[channel].string(tokens) + + def _build_batches( + self, tokens: List[Dict[str, List[int]]], skip_invalid_size_inputs: bool + ) -> Iterator[Dict[str, Any]]: + lengths = torch.LongTensor([next(iter(d.values())).numel() for d in tokens]) + batch_iterator = self.task.get_batch_iterator( + dataset=self.task.build_dataset_for_inference(tokens, lengths), + max_tokens=self.cfg.dataset.max_tokens, + max_sentences=self.cfg.dataset.batch_size, + max_positions=self.max_positions, + ignore_invalid_inputs=skip_invalid_size_inputs, + disable_iterator_cache=True, + ).next_epoch_itr(shuffle=False) + return batch_iterator diff --git a/fairseq/models/speech_dlm/modules/__init__.py b/fairseq/models/speech_dlm/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py b/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py new file mode 100644 index 0000000000..a14a1d64a8 --- /dev/null +++ b/fairseq/models/speech_dlm/modules/speech_dlm_decoder.py @@ -0,0 +1,572 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import math +from typing import Any, Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.models import FairseqIncrementalDecoder +from fairseq.modules import ( + FairseqDropout, + LayerDropModuleList, + LayerNorm, + PositionalEmbedding, +) +from .speech_dlm_decoder_layer import ( + CrossChannelTransformerDecoderLayer, + StandardTransformerDecoderLayer, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ +from torch import Tensor + + +class CrossChannelTransformerDecoder(FairseqIncrementalDecoder): + """ + Cross-channel Transformer Decoder Block for parallel spoken dialogue units + as described in the paper: https://arxiv.org/pdf/2203.16502.pdf; + consisting of *args.decoder_layers* layers. Each layer is a + :class:`StandardTransformerDecoderLayer` or + :class:`CrossChannelTransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + channels (list): list of channel names (string) + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__(self, args, dictionary, embed_tokens, channels, no_encoder_attn=False): + self.args = args + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + self._future_mask = torch.empty(0) + + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.decoder_layerdrop = args.decoder_layerdrop + self.share_input_output_embed = args.share_decoder_input_output_embed + self.channels = channels + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = args.decoder_embed_dim + self.embed_dim = embed_dim + self.output_embed_dim = args.decoder_output_dim + + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = args.max_target_positions + + self.embed_tokens = embed_tokens + + self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) + + if args.quant_noise_pq > 0: + self.quant_noise = apply_quant_noise_( + nn.Linear(embed_dim, embed_dim, bias=False), + args.quant_noise_pq, + args.quant_noise_pq_block_size, + ) + else: + self.quant_noise = None + + self.project_in_dim = ( + nn.Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + self.embed_positions = ( + PositionalEmbedding( + self.max_target_positions, + embed_dim, + self.padding_idx, + learned=args.decoder_learned_pos, + ) + if not args.no_token_positional_embeddings + else None + ) + + if getattr(args, "layernorm_embedding", False): + self.layernorm_embedding = LayerNorm(embed_dim) + else: + self.layernorm_embedding = None + + self.cross_self_attention = getattr(args, "cross_self_attention", False) + + assert 0 <= args.decoder_cross_layers <= args.decoder_layers, ( + "The number of cross-channel attention decoder layers must be non-negative" + f"and not exceeds the number of decoder layers (found {args.decoder_cross_layers})" + ) + + if self.decoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.decoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + self.build_decoder_layer(args, no_encoder_attn) + if i < args.decoder_layers - args.decoder_cross_layers + else self.build_cross_decoder_layer(args, no_encoder_attn) + for i in range(args.decoder_layers) + ] + 
) + self.num_layers = len(self.layers) + self.non_cross_layers = args.decoder_layers - args.decoder_cross_layers + + if args.decoder_normalize_before and not getattr( + args, "no_decoder_final_norm", False + ): + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + self.project_out_dim = ( + nn.Linear(embed_dim, self.output_embed_dim, bias=False) + if embed_dim != self.output_embed_dim + else None + ) + + self.output_projection = None + self.is_cross_prediction = bool( + float(args.main_and_cross_weights.split(",")[1]) != 0 + ) + self.n_output_projections = ( + 1 if not self.is_cross_prediction else len(self.channels) + ) + + if self.share_input_output_embed: + # Output projection is a list of projections + # where the first proj is for the main-channel, + # then roll in a cicular way. + # For example: if the main channel has index i + # the second proj is for channel i+1 (mod N_channels), etc. + self.output_projection = nn.ModuleList( + [ + nn.Linear( + embed_tokens.weight.shape[1], # embed_dim + embed_tokens.weight.shape[0], # n_dictionaries + bias=False, + ) + for _ in range(self.n_output_projections) + ] + ) + # Only share the main-channel projection + self.output_projection[0].weight = embed_tokens.weight + for i in range(1, self.n_output_projections): + nn.init.normal_( + self.output_projection[i].weight, + mean=0, + std=embed_tokens.weight.shape[1] ** -0.5, + ) + else: + self.output_projection = nn.ModuleList( + [ + nn.Linear(self.output_embed_dim, len(dictionary), bias=False) + for _ in range(self.n_output_projections) + ] + ) + for i in range(self.n_output_projections): + nn.init.normal_( + self.output_projection[i].weight, + mean=0, + std=self.output_embed_dim**-0.5, + ) + self.output_duration_prediction = ( + None + if str(args.duration_prediction).lower() == "false" + else nn.ModuleList( + [ + nn.Linear(self.output_embed_dim, 1) + for _ in range(self.n_output_projections) + ] + ) + ) + + def build_decoder_layer(self, args, no_encoder_attn=False): + layer = StandardTransformerDecoderLayer(args, no_encoder_attn) + if getattr(args, "checkpoint_activations", False): + offload_to_cpu = getattr(args, "offload_activations", False) + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + return layer + + def build_cross_decoder_layer(self, args, no_encoder_attn=False): + layer = CrossChannelTransformerDecoderLayer(args, no_encoder_attn) + if getattr(args, "checkpoint_activations", False): + offload_to_cpu = getattr(args, "offload_activations", False) + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + return layer + + def forward( + self, + prev_output_tokens: Dict[str, Tensor], + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[ + List[Dict[str, Dict[str, Optional[Tensor]]]] + ] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + # return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (dict[str, LongTensor]): previous decoder outputs, + dictionary over all channels with the values being the tensors + of shape `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): list of dictionaries used for storing state + during :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: 
False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + tuple: + - the decoder's output, dict over channels of tensors + of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features( + self, + prev_output_tokens: Dict[str, Tensor], + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[ + List[Dict[str, Dict[str, Optional[Tensor]]]] + ] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + return self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + + """ + A scriptable subclass of this class has an extract_features method and calls + super().extract_features, but super() is not supported in torchscript. A copy of + this function is made to be used in the subclass instead. + """ + + def extract_features_scriptable( + self, + prev_output_tokens: Dict[str, Tensor], + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[ + List[Dict[str, Dict[str, Optional[Tensor]]]] + ] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + The core function of *forward* but only return features. + + The input (prev_output_tokens) is a dictionary over all channels, + expected to have the following form: + { + 'channel1' : Tensor((batch x tgt_len)), + 'channel2' : Tensor((batch x tgt_len)), + } + + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). 
+ + Returns: + tuple: + - the decoder's features, dict over channels of tensors + of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + x_list = [] + for i, channel in enumerate(self.channels): + # embed positions + positions = None + if self.embed_positions is not None: + positions = self.embed_positions( + prev_output_tokens[channel], + incremental_state=incremental_state[i] + if incremental_state is not None + else None, + ) + + if incremental_state is not None: + prev_output_tokens[channel] = prev_output_tokens[channel][:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + x = self.embed_tokens(prev_output_tokens[channel]) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + x = self.embed_scale * x + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + x_list.append(x) + + self_attn_padding_mask: Optional[Tensor] = None + if ( + self.cross_self_attention + or prev_output_tokens[self.channels[0]].eq(self.padding_idx).any() + ): + self_attn_padding_mask = prev_output_tokens[self.channels[0]].eq( + self.padding_idx + ) + + # decoder layers + attn: Optional[Dict[Tensor]] = None + inner_states: List[Optional[Dict[str, Tensor]]] = [ + {channel: x_list[i] for i, channel in enumerate(self.channels)} + ] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x_list[0]) + else: + self_attn_mask = None + + # need to change to tensor for the checkpoint activation to work + if isinstance(x_list, list): + x_list = torch.stack(x_list) + x_list, layer_attn_list, _ = layer( + x_list, + encoder_out["encoder_out"][0] + if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) + else None, + encoder_out["encoder_padding_mask"][0] + if ( + encoder_out is not None + and len(encoder_out["encoder_padding_mask"]) > 0 + ) + else None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + + inner_states.append( + {channel: x_list[i] for i, channel in enumerate(self.channels)} + ) + if idx == alignment_layer and all( + layer_attn is not None for layer_attn in layer_attn_list + ): + attn = { + channel: layer_attn_list[i].float().to(x_list[0]) + for i, channel in enumerate(self.channels) + } + # change back from tensor to list + if not isinstance(x_list, list): + x_list = list(torch.unbind(x_list)) + + if attn is not None: + for channel in attn: + if alignment_heads is not None: + attn[channel] = attn[channel][:alignment_heads] + + # average probabilities over heads + attn[channel] = attn[channel].mean(dim=0) + + for i, x in enumerate(x_list): + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + x_list[i] = x + + x = {channel: x_list[i] for i, channel in enumerate(self.channels)} + + return x, {"attn": [attn], "inner_states": inner_states} + + def output_layer(self, features): + """Project features to the vocabulary size. 
+ Return a dictionary of the form: + { + 'input-channel': { + 'predicted-channel': token prediction tensor of shape `(batch, tgt_len, vocab)`, + } + } + + if duration_prediction is enabled + { + 'input-channel': { + 'predicted-channel': { + 'pred_token': token prediction tensor of shape `(batch, tgt_len, vocab)`, + 'pred_duration': duration prediction tensor + } + } + } + """ + # project back to size of vocabulary + if self.output_duration_prediction is None: + if self.is_cross_prediction: + return { + channel: { + pred_channel: self.output_projection[j - i](features[channel]) + for j, pred_channel in enumerate(self.channels) + } + for i, channel in enumerate(self.channels) + } + else: + return { + channel: {channel: self.output_projection[0](features[channel])} + for i, channel in enumerate(self.channels) + } + else: + if self.is_cross_prediction: + return { + channel: { + pred_channel: { + "pred_token": self.output_projection[j - i]( + features[channel] + ), + "pred_duration": self.output_duration_prediction[j - i]( + features[channel] + ), + } + for j, pred_channel in enumerate(self.channels) + } + for i, channel in enumerate(self.channels) + } + else: + return { + channel: { + channel: { + "pred_token": self.output_projection[0](features[channel]), + "pred_duration": self.output_duration_prediction[0]( + features[channel] + ), + } + } + for i, channel in enumerate(self.channels) + } + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. + if ( + self._future_mask.size(0) == 0 + or (not self._future_mask.device == tensor.device) + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 + ) + self._future_mask = self._future_mask.to(tensor) + return self._future_mask[:dim, :dim] + + def get_normalized_probs_scriptable( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" + + logits_dict = net_output[0] + out_dict = {} + for channel in logits_dict: + out_dict[channel] = {} + for pred_channel in logits_dict[channel]: + if isinstance(logits_dict[channel][pred_channel], dict): + pred_token_logits = logits_dict[channel][pred_channel]["pred_token"] + else: + pred_token_logits = logits_dict[channel][pred_channel] + if log_probs: + out = utils.log_softmax( + pred_token_logits, dim=-1, onnx_trace=self.onnx_trace + ) + else: + out = utils.softmax( + pred_token_logits, dim=-1, onnx_trace=self.onnx_trace + ) + if isinstance(logits_dict[channel][pred_channel], dict): + out_dict[channel][pred_channel] = { + "pred_token": out, + "pred_duration": logits_dict[channel][pred_channel][ + "pred_duration" + ].float(), + } # move to float32 to avoid inf loss + else: + out_dict[channel][pred_channel] = out + return out_dict + + def reorder_incremental_state_scripting( + self, + incremental_state: List[Dict[str, Dict[str, Optional[Tensor]]]], + new_order: Tensor, + ): + """Main entry point for reordering the incremental state. 
+ + Due to limitations in TorchScript, we call this function in + :class:`fairseq.sequence_generator.SequenceGenerator` instead of + calling :func:`reorder_incremental_state` directly. + """ + for module in self.modules(): + if hasattr(module, "reorder_incremental_state"): + for i, incremental_state_channel in enumerate(incremental_state): + result = module.reorder_incremental_state( + incremental_state_channel, new_order + ) + if result is not None: + incremental_state[i] = result diff --git a/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py b/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py new file mode 100644 index 0000000000..fb65fdf810 --- /dev/null +++ b/fairseq/models/speech_dlm/modules/speech_dlm_decoder_layer.py @@ -0,0 +1,717 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Tuple, Optional + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.modules import LayerNorm, MultiheadAttention +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor + + +class CrossChannelTransformerDecoderLayer(nn.Module): + """Cross-Attention Transformer Decoder Layer block as described + in the paper: https://arxiv.org/pdf/2203.16502.pdf + + Composed of a Multi-head Self Attention block followed by a + Multi-head Cross-Attention block which attends to the self-attention + outputs of the other channels. The weights of the attention blocks + in all channels are shared. + + Args: + args (argparse.Namespace): parsed command-line arguments + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False + ): + super().__init__() + self.embed_dim = args.decoder_embed_dim + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.quant_noise = getattr(args, "quant_noise_pq", 0) + self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) + + # This cross_self_attention is used for encoder-decoder systems, + # It's not the cross-channel attention (defined below as cross_channel_attn) + self.cross_self_attention = getattr(args, "cross_self_attention", False) + + self.self_attn = self.build_self_attention( + self.embed_dim, + args, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + self.cross_channel_attn = self.build_cross_channel_attention( + self.embed_dim, + args, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + self.activation_fn = utils.get_activation_fn( + activation=str(args.activation_fn) + if getattr(args, "activation_fn", None) is not None + else "relu" + ) + activation_dropout_p = getattr(args, "activation_dropout", 0) or 0 + if activation_dropout_p == 0: + # for backwards compatibility with models that use args.relu_dropout + activation_dropout_p = getattr(args, "relu_dropout", 0) or 0 + self.activation_dropout_module = FairseqDropout( + float(activation_dropout_p), module_name=self.__class__.__name__ + ) + self.normalize_before = args.decoder_normalize_before + + # use layerNorm rather than FusedLayerNorm for exporting. + # char_inputs can be used to determint this. 
+ # TODO remove this once we update apex with the fix + export = getattr(args, "char_inputs", False) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + self.cross_channel_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + if no_encoder_attn: + self.encoder_attn = None + self.encoder_attn_layer_norm = None + else: + self.encoder_attn = self.build_encoder_attention(self.embed_dim, args) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + self.fc1 = self.build_fc1( + self.embed_dim, + args.decoder_ffn_embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + self.fc2 = self.build_fc2( + args.decoder_ffn_embed_dim, + self.embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + + self.final_layer_norm = LayerNorm(self.embed_dim, export=export) + self.need_attn = True + + self.onnx_trace = False + + def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_self_attention( + self, embed_dim, args, add_bias_kv=False, add_zero_attn=False + ): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + dropout=args.attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=not getattr(args, "cross_self_attention", False), + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def build_cross_channel_attention( + self, embed_dim, args, add_bias_kv=False, add_zero_attn=False + ): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + dropout=args.attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=False, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def build_encoder_attention(self, embed_dim, args): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + kdim=getattr(args, "encoder_embed_dim", None), + vdim=getattr(args, "encoder_embed_dim", None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def residual_connection(self, x, residual): + return residual + x + + def forward( + self, + x_list_tensor: List[torch.Tensor], + encoder_out: Optional[torch.Tensor] = None, + encoder_padding_mask: Optional[torch.Tensor] = None, + incremental_state: Optional[ + List[Dict[str, Dict[str, Optional[Tensor]]]] + ] = None, + prev_self_attn_state: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None, + prev_attn_state: Optional[List[torch.Tensor]] = None, + self_attn_mask: Optional[torch.Tensor] = None, + self_attn_padding_mask: Optional[torch.Tensor] = None, + need_attn: bool = False, + need_head_weights: bool = False, + ): + """ + Args: + x_list_tensor (List[Tensor]): list of input tensors in different channels, + each tensor is of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. 
+ incremental_state (optional): list of incremental_state dictionaries over + different channels (sequence generation mode) + prev_self_attn_state (List[Tuple[Tensor, Tensor]], optional): list of tuples + (self_attn_state, cross_channel_attn_state) over different channels + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). + + Returns: + list of encoded output of shape `(seq_len, batch, embed_dim)` + """ + n_channels = len(x_list_tensor) + if need_head_weights: + need_attn = True + + # incremental_state is a list of dictionaries over different channels + if incremental_state is not None: + assert isinstance(incremental_state, list) + assert len(incremental_state) == n_channels + + # prev_self_attn_state is a list of tuples (self_attn_state, cross_channel_attn_state) over different channels + if prev_self_attn_state is not None: + assert isinstance(prev_self_attn_state, list) + assert len(prev_self_attn_state) == n_channels + for prev_self_attn_state_channel in prev_self_attn_state: + assert isinstance(prev_self_attn_state_channel, tuple) + assert len(prev_self_attn_state_channel) == 2 + + # Backup for other channels & cross channel attention + self_attn_mask_orin = self_attn_mask + self_attn_padding_mask_orin = self_attn_padding_mask + + x_list = [] + attn_list = [] + for i, x in enumerate(x_list_tensor): + residual = x + + if self.normalize_before: + x = self.self_attn_layer_norm(x) + + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[i][0][:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state[i][0]) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][0][2] + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state[i], saved_state) + _self_attn_input_buffer = self.self_attn._get_input_buffer( + incremental_state[i] if incremental_state is not None else None + ) + if self.cross_self_attention and not ( + incremental_state is not None + and _self_attn_input_buffer is not None + and "prev_key" in _self_attn_input_buffer + ): + if self_attn_mask_orin is not None: + assert encoder_out is not None + self_attn_mask = torch.cat( + ( + x.new_zeros(x.size(0), encoder_out.size(0)), + self_attn_mask_orin, + ), + dim=1, + ) + if self_attn_padding_mask_orin is not None: + if encoder_padding_mask is None: + assert encoder_out is not None + encoder_padding_mask = self_attn_padding_mask_orin.new_zeros( + encoder_out.size(1), encoder_out.size(0) + ) + self_attn_padding_mask = torch.cat( + (encoder_padding_mask, self_attn_padding_mask_orin), dim=1 + ) + assert encoder_out is not None + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + + x, attn = self.self_attn( + query=x, + key=y, + value=y, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state[i] + if incremental_state is not None + else None, + need_weights=False, + attn_mask=self_attn_mask, + ) + + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + if self.encoder_attn is not None and encoder_out is not None: + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + 
"prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer( + incremental_state[i], saved_state + ) + + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state[i] + if incremental_state is not None + else None, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + x_list.append(x) + attn_list.append(attn) + + # Store attentions & new x(s) (bc the old x(s) are used in other channels) + x_list_new = [] + # Here comes the cross channel attention + for i, x in enumerate(x_list): + residual = x + if self.normalize_before: + x = self.cross_channel_attn_layer_norm(x) + + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[i][1][:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state[i][1]) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][1][2] + assert incremental_state is not None + self.cross_channel_attn._set_input_buffer( + incremental_state[i], saved_state + ) + + # The cross attention is computed with the concatenation of attentions from other channels + if len(x_list) > 1: + x_other = torch.cat( + [x_list[(i + j) % len(x_list)] for j in range(1, len(x_list))], + dim=0, + ) + else: + # Self-attention when having only one channel + x_other = x_list[i] + + x, attn = self.cross_channel_attn( + query=x, + key=x_other, + value=x_other, + key_padding_mask=self_attn_padding_mask_orin, + incremental_state=incremental_state[i] + if incremental_state is not None + else None, + need_weights=False, + attn_mask=self_attn_mask_orin, + ) + + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.cross_channel_attn_layer_norm(x) + + x_list_new.append(x) + x_list = x_list_new + + for i, x in enumerate(x_list): + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + + x_list[i] = x + # Trick for the checkpoint activation + x_list_tensor = torch.stack(x_list) + if self.onnx_trace and incremental_state is not None: + self_and_cross_attn_state_list = [] + for i in range(n_channels): + self_and_cross_attn_state = [] + for self_attn_module in [self.self_attn, self.cross_channel_attn]: + saved_state = self_attn_module._get_input_buffer( + incremental_state[i] + ) + assert saved_state is not None + if self_attn_padding_mask is not None: + self_attn_module_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + saved_state["prev_key_padding_mask"], + ] + else: + self_attn_module_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + ] + self_and_cross_attn_state.append(self_attn_module_state) + self_and_cross_attn_state_list.append(tuple(self_and_cross_attn_state)) + return x_list_tensor, attn_list, self_and_cross_attn_state_list + return x_list_tensor, attn_list, None + + def 
make_generation_fast_(self, need_attn: bool = False, **kwargs): + self.need_attn = need_attn + + +# Rewrite fairseq.modules.TransformerDecoderLayer +# to be compatible with checkpoint_activations +# (avoid forwarding model multiple times) +class StandardTransformerDecoderLayer(nn.Module): + """Rewrite fairseq.modules.TransformerDecoderLayer to avoid forwarding + model multiple times and be compatible with checkpoint_activations. + + The input is expected to be a list of tensors from different channels, + each is forwarded to the same model (shared attention weights). + + In the original paper each operation (multi-head attention, encoder + attention or FFN) is postprocessed with: `dropout -> add residual -> + layernorm`. In the tensor2tensor code they suggest that learning is more + robust when preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.decoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False + ): + super().__init__() + self.embed_dim = args.decoder_embed_dim + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.quant_noise = getattr(args, "quant_noise_pq", 0) + self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) + + self.cross_self_attention = getattr(args, "cross_self_attention", False) + + self.self_attn = self.build_self_attention( + self.embed_dim, + args, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + self.activation_fn = utils.get_activation_fn( + activation=str(args.activation_fn) + if getattr(args, "activation_fn", None) is not None + else "relu" + ) + activation_dropout_p = getattr(args, "activation_dropout", 0) or 0 + if activation_dropout_p == 0: + # for backwards compatibility with models that use args.relu_dropout + activation_dropout_p = getattr(args, "relu_dropout", 0) or 0 + self.activation_dropout_module = FairseqDropout( + float(activation_dropout_p), module_name=self.__class__.__name__ + ) + self.normalize_before = args.decoder_normalize_before + + # use layerNorm rather than FusedLayerNorm for exporting. + # char_inputs can be used to determint this. 
+ # TODO remove this once we update apex with the fix + export = getattr(args, "char_inputs", False) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + if no_encoder_attn: + self.encoder_attn = None + self.encoder_attn_layer_norm = None + else: + self.encoder_attn = self.build_encoder_attention(self.embed_dim, args) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + self.fc1 = self.build_fc1( + self.embed_dim, + args.decoder_ffn_embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + self.fc2 = self.build_fc2( + args.decoder_ffn_embed_dim, + self.embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + + self.final_layer_norm = LayerNorm(self.embed_dim, export=export) + self.need_attn = True + + self.onnx_trace = False + + def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_self_attention( + self, embed_dim, args, add_bias_kv=False, add_zero_attn=False + ): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + dropout=args.attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=not getattr(args, "cross_self_attention", False), + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def build_encoder_attention(self, embed_dim, args): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + kdim=getattr(args, "encoder_embed_dim", None), + vdim=getattr(args, "encoder_embed_dim", None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def residual_connection(self, x, residual): + return residual + x + + def forward( + self, + x_list_tensor: List[torch.Tensor], + encoder_out: Optional[torch.Tensor] = None, + encoder_padding_mask: Optional[torch.Tensor] = None, + incremental_state: Optional[ + List[Dict[str, Dict[str, Optional[Tensor]]]] + ] = None, + prev_self_attn_state: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None, + prev_attn_state: Optional[List[torch.Tensor]] = None, + self_attn_mask: Optional[torch.Tensor] = None, + self_attn_padding_mask: Optional[torch.Tensor] = None, + need_attn: bool = False, + need_head_weights: bool = False, + ): + """ + Args: + x_list_tensor (List[Tensor]): list of input tensors in different channels, + each tensor is of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. + incremental_state (optional): list of incremental_state dictionaries over + different channels (sequence generation mode) + prev_self_attn_state (List[Tuple[Tensor, Tensor]], optional): list of tuples + (self_attn_state, cross_channel_attn_state) over different channels + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). 
+ + Returns: + list of encoded output of shape `(seq_len, batch, embed_dim)` + """ + n_channels = len(x_list_tensor) + if need_head_weights: + need_attn = True + + # incremental_state is a list of dictionaries over different channels + if incremental_state is not None: + assert isinstance(incremental_state, list) + assert len(incremental_state) == n_channels + + # prev_self_attn_state is a list of self_attn_state over different channels + if prev_self_attn_state is not None: + assert isinstance(prev_self_attn_state, list) + assert len(prev_self_attn_state) == n_channels + + x_list = [] + attn_list = [] + for i, x in enumerate(x_list_tensor): + residual = x + + if self.normalize_before: + x = self.self_attn_layer_norm(x) + + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[i][:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state[i]) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state[i], saved_state) + _self_attn_input_buffer = self.self_attn._get_input_buffer( + incremental_state + ) + if self.cross_self_attention and not ( + incremental_state is not None + and _self_attn_input_buffer is not None + and "prev_key" in _self_attn_input_buffer + ): + if self_attn_mask is not None: + assert encoder_out is not None + self_attn_mask = torch.cat( + (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), + dim=1, + ) + if self_attn_padding_mask is not None: + if encoder_padding_mask is None: + assert encoder_out is not None + encoder_padding_mask = self_attn_padding_mask.new_zeros( + encoder_out.size(1), encoder_out.size(0) + ) + self_attn_padding_mask = torch.cat( + (encoder_padding_mask, self_attn_padding_mask), dim=1 + ) + assert encoder_out is not None + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + + x, attn = self.self_attn( + query=x, + key=y, + value=y, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state[i] + if incremental_state is not None + else None, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + if self.encoder_attn is not None and encoder_out is not None: + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state[i] + if incremental_state is not None + else None, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = 
self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + + x_list.append(x) + attn_list.append(attn) + + # Trick for the checkpoint activation + x_list_tensor = torch.stack(x_list) + if self.onnx_trace and incremental_state is not None: + self_attn_state_list = [] + for i in range(n_channels): + saved_state = self.self_attn._get_input_buffer(incremental_state[i]) + assert saved_state is not None + if self_attn_padding_mask is not None: + self_attn_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + saved_state["prev_key_padding_mask"], + ] + else: + self_attn_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + ] + self_attn_state_list.append(self_attn_state) + return x_list_tensor, attn_list, self_attn_state_list + return x_list_tensor, attn_list, None + + def make_generation_fast_(self, need_attn: bool = False, **kwargs): + self.need_attn = need_attn diff --git a/fairseq/models/speech_dlm/sequence_generator/__init__.py b/fairseq/models/speech_dlm/sequence_generator/__init__.py new file mode 100644 index 0000000000..a88e144577 --- /dev/null +++ b/fairseq/models/speech_dlm/sequence_generator/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .multichannel_sequence_generator import * # noqa diff --git a/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py b/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py new file mode 100644 index 0000000000..db4b77f345 --- /dev/null +++ b/fairseq/models/speech_dlm/sequence_generator/multichannel_search.py @@ -0,0 +1,430 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, Optional + +import torch +import torch.nn as nn +from torch import Tensor + + +class MultichannelSearch(nn.Module): + def __init__(self, tgt_dicts): + super().__init__() + tgt_dict = list(tgt_dicts.values())[0] + self.pad = tgt_dict.pad() + self.unk = tgt_dict.unk() + self.eos = tgt_dict.eos() + for tgt_dict in tgt_dicts.values(): + assert self.pad == tgt_dict.pad() + assert self.unk == tgt_dict.unk() + assert self.eos == tgt_dict.eos() + self.vocab_sizes = {channel: len(tgt_dicts[channel]) for channel in tgt_dicts} + self.src_lengths = torch.tensor(-1) + self.supports_constraints = False + self.stop_on_max_len = False + + def step( + self, step, lprobs, scores, prev_output_tokens=None, original_batch_idxs=None + ): + """Take a single search step. 
+ + Args: + step: the current search step, starting at 0 + lprobs: dictionary of channels {channel : (bsz x input_beam_size x vocab_size_channel)} + the model's log-probabilities over the vocabulary at the current step + scores: {channel : (bsz x input_beam_size x step)} + the historical model scores of each hypothesis up to this point + prev_output_tokens: {channel : (bsz x step)} + the previously generated oputput tokens + original_batch_idxs: (bsz) + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the orignal indices + + Return: A tuple of (scores, indices, beams) where: + scores: {channel : (bsz x output_beam_size)} + the scores of the chosen elements; output_beam_size can be + larger than input_beam_size, e.g., we may return + 2*input_beam_size to account for EOS + indices: {channel : (bsz x output_beam_size)} + the indices of the chosen elements + beams: (bsz x output_beam_size) + the hypothesis ids of the chosen elements, in the range [0, input_beam_size) + """ + raise NotImplementedError + + @torch.jit.export + def set_src_lengths(self, src_lengths): + self.src_lengths = src_lengths + + @torch.jit.export + def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int): + """Initialize constraint states for constrained decoding (if supported). + + Args: + batch_constraints: (torch.Tensor, optional) + the list of constraints, in packed form + beam_size: (int) + the beam size + Returns: + *encoder_out* rearranged according to *new_order* + """ + pass + + def prune_sentences(self, batch_idxs: Tensor): + """ + Removes constraint states for completed sentences (if supported). + This is called from sequence_generator._generate() when sentences are + deleted from the batch. + + Args: + batch_idxs: Indices of *sentences* whose constraint state should be *kept*. + """ + pass + + def update_constraints(self, active_hypos: Tensor): + """ + Updates the constraint states by selecting the beam items that are retained. + This is called at each time step of sequence_generator._generate() when + the set of 2 * {beam_size} candidate hypotheses are reduced to the beam size. + + Args: + active_hypos: (batch size, beam size) + list of integers denoting, for each sentence, which beam candidate items + should be kept. + """ + pass + + +def unravel_index(index, shape): + out = [] + for dim in reversed(shape): + out.append(index % dim) + index = index // dim + return torch.stack(tuple(reversed(out)), dim=-1) + + +def topk_sum(lprobs_list, k): + """ + lprobs_list = [lprobs_1,...,lprobs_n], where: + lprobs_1 : (batch_size x beam_size x vocab_1) + ... + lprobs_n : (batch_size x beam_size x vocab_n) + + Return: + - topk_values : (batch_size x k) + values of the topk sum of the form : + lprobs_1[bsz, beam_idx, vocab_1_idx] + ... 
+ lprobs_n[bsz, beam_idx, vocab_n_idx] + - topk_idxs : (batch_size x k x n+1) + each (n+1)-tensor being [beam_idx, vocab_1_idx, ..., vocab_n_idx] + """ + # Reduce all lprobs to k candidates first to reduce later complexity + # We may assume that k << vocab + lprobs_topk_list = [] + lprobs_topk_indices_list = [] + for lprobs in lprobs_list: + k_i = min(k, lprobs.size(-1)) + topk_values, topk_indices = torch.topk(lprobs, k=k_i) + # topk_values : (batch_size x beam_size x k_i) + # topk_indices : (batch_size x beam_size x k_i) + lprobs_topk_list.append(topk_values) + lprobs_topk_indices_list.append(topk_indices) + + # Compute all possible sums + sum_lprobs_topk = lprobs_topk_list[0] + for i in range(1, len(lprobs_topk_list)): + unsqueezed_lprobs = lprobs_topk_list[i] + for _ in range(i): + unsqueezed_lprobs = unsqueezed_lprobs.unsqueeze(-2) + sum_lprobs_topk = sum_lprobs_topk.unsqueeze(-1) + unsqueezed_lprobs + # sum_lprobs : (batch_size x beam_size x k_1 x ... x k_n) + + # Get the top k sums and the (transformed indices) + topk_sum_values, topk_sum_indices = torch.topk( + sum_lprobs_topk.view(sum_lprobs_topk.size(0), -1), k=k + ) + # topk_sum_values : (batch_size x k) + # topk_sum_indices : (batch_size x k) + topk_sum_indices = unravel_index(topk_sum_indices, tuple(sum_lprobs_topk.shape[1:])) + # topk_sum_indices : (batch_size x k x n+1) + + # Convert the transformed indices to the true indices + for i_batch in range(topk_sum_indices.size(0)): + for i_cand in range(topk_sum_indices.size(1)): + i_beam, *transformed_vocab_indices = topk_sum_indices[i_batch, i_cand] + true_vocab_indices = [i_beam] + for j, transformed_vocab_j_idx in enumerate(transformed_vocab_indices): + true_vocab_j_idx = lprobs_topk_indices_list[j][ + i_batch, i_beam, transformed_vocab_j_idx + ] + true_vocab_indices.append(true_vocab_j_idx) + topk_sum_indices[i_batch, i_cand] = torch.tensor(true_vocab_indices) + + topk_sum_beams = topk_sum_indices[:, :, 0] + topk_sum_indices = topk_sum_indices[:, :, 1:] + + return topk_sum_values, topk_sum_indices, topk_sum_beams + + +class MultichannelBeamSearch(MultichannelSearch): + def __init__(self, tgt_dicts): + super().__init__(tgt_dicts) + self.constraint_states = None + + @torch.jit.export + def step( + self, + step: int, + lprobs, + scores: Optional[Dict[str, Tensor]], + prev_output_tokens: Optional[Dict[str, Tensor]] = None, + original_batch_idxs: Optional[Tensor] = None, + ): + channels = list(lprobs.keys()) + bsz, beam_size, _ = lprobs[channels[0]].size() + + lprobs_list = [] + if step == 0: + # at the first step all hypotheses are equally likely, so use + # only the first beam + for channel in channels: + lprobs_list.append(lprobs[channel][:, ::beam_size, :].contiguous()) + else: + # make probs contain cumulative scores for each hypothesis + assert scores is not None + for channel in channels: + lprobs_list.append( + lprobs[channel] + scores[channel][:, :, step - 1].unsqueeze(-1) + ) + + topk_sum_values, topk_sum_indices, topk_sum_beams = topk_sum( + lprobs_list, k=beam_size * 2 + ) + + beams_buf = topk_sum_beams + scores_buf = {} + indices_buf = {} + for i, channel in enumerate(channels): + indices_buf[channel] = topk_sum_indices[:, :, i] + scores_buf[channel] = ( + torch.tensor( + [ + lprobs_list[i][i_batch, i_beam, i_index] + for i_batch in range(bsz) + for i_beam, i_index in zip( + beams_buf[i_batch], indices_buf[channel][i_batch] + ) + ] + ) + .view(bsz, -1) + .to(lprobs_list[i].device) + ) + + # At this point, beams_buf and indices_buf are single-dim and contain relative 
indices + return scores_buf, indices_buf, beams_buf + + +class ContiguousMultichannelBeamSearch(MultichannelSearch): + def __init__(self, tgt_dicts): + super().__init__(tgt_dicts) + self.constraint_states = None + + @torch.jit.export + def step( + self, + step: int, + lprobs, + scores: Optional[Tensor], + prev_output_tokens: Optional[Tensor] = None, + original_batch_idxs: Optional[Tensor] = None, + ): + n_channels = len(lprobs) + bsz, beam_size, _ = lprobs[0].size() + + lprobs_list = [] + if step == 0: + # at the first step all hypotheses are equally likely, so use + # only the first beam + for i in range(n_channels): + lprobs_list.append(lprobs[i][:, ::beam_size, :].contiguous()) + else: + # make probs contain cumulative scores for each hypothesis + assert scores is not None + for i in range(n_channels): + lprobs_list.append(lprobs[i] + scores[:, :, step - 1, i].unsqueeze(-1)) + + topk_sum_values, topk_sum_indices, topk_sum_beams = topk_sum( + lprobs_list, k=beam_size * 2 + ) + + beams_buf = topk_sum_beams + indices_buf = topk_sum_indices + scores_buf = ( + torch.tensor( + [ + lprobs_list[i][i_batch, i_beam, i_index] + for i in range(len(lprobs_list)) + for i_batch in range(bsz) + for i_beam, i_index in zip( + beams_buf[i_batch], indices_buf[i_batch, :, i] + ) + ] + ) + .view(len(lprobs_list), bsz, -1) + .permute(1, 2, 0) + .to(lprobs_list[0].device) + ) + + # At this point, beams_buf and indices_buf are single-dim and contain relative indices + return scores_buf, indices_buf, beams_buf + + +class ContiguousMultichannelSampling(MultichannelSearch): + sampling_topk: int + sampling_topp: float + + def __init__(self, tgt_dicts, sampling_topk=-1, sampling_topp=-1.0): + super().__init__(tgt_dicts) + self.sampling_topk = sampling_topk + self.sampling_topp = sampling_topp + + def _sample_topp(self, lprobs): + """Sample among the smallest set of elements whose cumulative probability mass exceeds p. + + See `"The Curious Case of Neural Text Degeneration" + (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_. + + Args: + lprobs: (bsz x input_beam_size x vocab_size) + the model's log-probabilities over the vocabulary at the current step + + Return: A tuple of (trimed_probs, truncated_indices) where: + trimed_probs: (bsz x input_beam_size x ?) + the model's probabilities over the elements selected to sample from. The + width of the third dimension is determined by top-P. + truncated_indices: (bsz x input_beam_size x ?) + the indices of the chosen elements. + """ + probs = lprobs.exp_() + + # sort the last dimension (vocab dimension) in descending order + sorted_probs, sorted_indices = probs.sort(descending=True) + + # compute a mask to indicate the words to be included in the top-P set. + cumsum_probs = sorted_probs.cumsum(dim=2) + mask = cumsum_probs.lt(self.sampling_topp) + + # note that mask was computed by 'lt'. One more word needs to be included + # so that the cumulative probability mass can exceed p. + cumsum_mask = mask.cumsum(dim=2) + last_included = cumsum_mask[:, :, -1:] + last_included.clamp_(0, mask.size()[2] - 1) + mask = mask.scatter_(2, last_included, 1) + + # truncate unnecessary dims. + max_dim = last_included.max() + truncated_mask = mask[:, :, : max_dim + 1] + truncated_probs = sorted_probs[:, :, : max_dim + 1] + truncated_indices = sorted_indices[:, :, : max_dim + 1] + + # trim the words that are not in top-P by setting their probabilities + # to 0, so that they would not be sampled later. 
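+        # Worked example: with sampling_topp=0.9 and sorted probs
+        # (0.5, 0.3, 0.15, 0.05), cumsum is (0.5, 0.8, 0.95, 1.0); lt(0.9)
+        # keeps the first two entries and last_included adds the third, so
+        # sampling is restricted to (0.5, 0.3, 0.15) and the remaining entry
+        # is zeroed out below.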
+ trim_mask = ~truncated_mask + trimed_probs = truncated_probs.masked_fill_(trim_mask, 0) + return trimed_probs, truncated_indices + + @torch.jit.export + def step( + self, + step: int, + lprobs, + scores, + prev_output_tokens: Optional[Tensor] = None, + original_batch_idxs: Optional[Tensor] = None, + ): + n_channels = len(lprobs) + bsz, beam_size, vocab_size = lprobs[0].size() + + if step == 0: + # at the first step all hypotheses are equally likely, so use + # only the first beam + for i in range(n_channels): + lprobs[i] = lprobs[i][:, ::beam_size, :].contiguous() + + probs = [] + top_indices = [] + for i in range(n_channels): + if self.sampling_topp > 0: + # only sample from the smallest set of words whose cumulative probability mass exceeds p + probs_i, top_indices_i = self._sample_topp(lprobs[i]) + elif self.sampling_topk > 0: + # only sample from top-k candidates + lprobs[i], top_indices_i = lprobs[i].topk( + min(self.sampling_topk, lprobs[i].size(-1)) + ) + probs_i = lprobs[i].exp_() + else: + probs_i = lprobs[i].exp_() + + # dummy data to be consistent with true branch for type check + top_indices_i = torch.empty(0).to(probs_i) + probs.append(probs_i) + top_indices.append(top_indices_i) + # sample + indices_buf = [] + for i in range(n_channels): + if step == 0: + indices_buf.append( + torch.multinomial( + probs[i].view(bsz, -1), + beam_size, + replacement=True, + ).view(bsz, beam_size) + ) + else: + indices_buf.append( + torch.multinomial( + probs[i].view(bsz * beam_size, -1), + 1, + replacement=True, + ).view(bsz, beam_size) + ) + + if step == 0: + for i in range(n_channels): + # expand to beam size + probs[i] = probs[i].expand(bsz, beam_size, -1) + + # gather scores + scores_buf = [] + for i in range(n_channels): + scores_buf.append( + torch.gather(probs[i], dim=2, index=indices_buf[i].unsqueeze(-1)) + ) + scores_buf[i] = scores_buf[i].log_().view(bsz, -1) + + # remap indices if using top-k or top-P sampling + if self.sampling_topk > 0 or self.sampling_topp > 0: + for i in range(n_channels): + indices_buf[i] = torch.gather( + top_indices[i].expand(bsz, beam_size, -1), + dim=2, + index=indices_buf[i].unsqueeze(-1), + ).squeeze(2) + + if step == 0: + beams_buf = indices_buf[0].new_zeros(bsz, beam_size) + else: + beams_buf = torch.arange(0, beam_size).to(indices_buf[0]).repeat(bsz, 1) + # make scores cumulative + for i in range(n_channels): + scores_buf[i].add_( + torch.gather(scores[:, :, step - 1, i], dim=1, index=beams_buf) + ) + scores_buf = torch.stack(scores_buf, dim=-1) + indices_buf = torch.stack(indices_buf, dim=-1) + + return scores_buf, indices_buf, beams_buf diff --git a/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py b/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py new file mode 100644 index 0000000000..24807b866d --- /dev/null +++ b/fairseq/models/speech_dlm/sequence_generator/multichannel_sequence_generator.py @@ -0,0 +1,1110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
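The search classes above pick joint candidates by summing per-channel log-probabilities and taking the top-k of the sum (topk_sum), after first restricting each channel to its own top-k so the joint space stays small. A brute-force sketch of the same idea for two channels and a single hypothesis (illustrative only, not part of the patch; all names are placeholders):

import torch

torch.manual_seed(0)
k = 3
# Per-channel log-probabilities for one hypothesis (one beam entry)
lprobs_a = torch.log_softmax(torch.randn(6), dim=-1)  # channel A vocabulary of size 6
lprobs_b = torch.log_softmax(torch.randn(4), dim=-1)  # channel B vocabulary of size 4

# Score every (token_a, token_b) pair by the sum of its per-channel log-probs
joint = lprobs_a.unsqueeze(-1) + lprobs_b.unsqueeze(0)  # (6 x 4)
values, flat_idx = torch.topk(joint.view(-1), k=k)
idx_a = flat_idx // lprobs_b.numel()  # token index in channel A
idx_b = flat_idx % lprobs_b.numel()   # token index in channel B

topk_sum() above returns the same kind of (beam, per-channel index) candidates, but pre-truncates each channel to its own top-k entries first, since the full joint vocabulary is the product of the channel vocabularies.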
+ +import math +from typing import Dict, List, Optional + +from omegaconf.listconfig import ListConfig +from omegaconf.dictconfig import DictConfig + +import torch +import torch.nn as nn +from fairseq.models import FairseqIncrementalDecoder +from torch import Tensor +from fairseq.ngram_repeat_block import NGramRepeatBlock +from .multichannel_search import ContiguousMultichannelBeamSearch +from fairseq.models.speech_dlm import SpeechDLM + + +class MultichannelSequenceGenerator(nn.Module): + def __init__( + self, + models, + tgt_dicts, + beam_size=1, + max_len_a=0, + max_len_b=200, + min_len=1, + normalize_scores=True, + len_penalty=1.0, + unk_penalty=0.0, + temperature=1.0, + match_source_len=False, + no_repeat_ngram_size=0, + search_strategy=None, + eos=None, + symbols_to_strip_from_output=None, + lm_model=None, + lm_weight=1.0, + duration_temperature=1.0, + ): + """Generate multi-channel parallel units with the SpeechDLM model + as described in the paper: https://arxiv.org/pdf/2203.16502.pdf; + + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models, + currently support fairseq.models.TransformerModel for scripting + beam_size (int, optional): beam width (default: 1) + max_len_a/b (int, optional): generate sequences of maximum length + ax + b, where x is the source length + min_len (int, optional): the minimum length of the generated output + (not including end-of-sentence) + normalize_scores (bool, optional): normalize scores by the length + of the output (default: True) + len_penalty (float, optional): length penalty, where <1.0 favors + shorter, >1.0 favors longer sentences (default: 1.0) + unk_penalty (float, optional): unknown word penalty, where <0 + produces more unks, >0 produces fewer (default: 0.0) + temperature (float, optional): temperature, where values + >1.0 produce more uniform samples and values <1.0 produce + sharper samples (default: 1.0) + match_source_len (bool, optional): outputs should match the source + length (default: False) + duration_temperature (float, optional): rate of the duration prediction, + higher rate induces a faster generated wav (default: 1.0) + """ + super().__init__() + if isinstance(models, MultichannelEnsembleModel): + self.model = models + else: + self.model = MultichannelEnsembleModel(models) + self.tgt_dicts = tgt_dicts + self.pad = list(tgt_dicts.values())[0].pad() + self.unk = list(tgt_dicts.values())[0].unk() + self.eos = list(tgt_dicts.values())[0].eos() if eos is None else eos + self.symbols_to_strip_from_output = ( + symbols_to_strip_from_output.union({self.eos}) + if symbols_to_strip_from_output is not None + else {self.eos} + ) + self.channels = list(tgt_dicts.keys()) + self.n_channels = len(self.channels) + self.vocab_sizes = [len(tgt_dicts[channel]) for channel in self.channels] + # the max beam size is the dictionary size - 1, since we never select pad + max_possible_beam_size = 1 + for i in self.vocab_sizes: + max_possible_beam_size *= i - 1 + self.beam_size = min(beam_size, max_possible_beam_size) + self.max_len_a = max_len_a + self.max_len_b = max_len_b + self.min_len = min_len + + self.normalize_scores = normalize_scores + self.len_penalty = len_penalty + self.unk_penalty = unk_penalty + if isinstance(temperature, (int, float)): + temperature = {channel: temperature for channel in self.channels} + elif isinstance(temperature, ListConfig) or isinstance(temperature, list): + temperature = { + channel: temperature[i] for i, channel in enumerate(self.channels) + } + assert isinstance(temperature, DictConfig) or 
isinstance( + temperature, dict + ), f"temperature: expected dict, but found {type(temperature)}" + self.temperature = temperature + self.match_source_len = match_source_len + + if no_repeat_ngram_size > 0: + self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size) + else: + self.repeat_ngram_blocker = None + + for channel in temperature: + assert temperature[channel] > 0, "--temperature must be greater than 0" + + if search_strategy is None: + self.search = ContiguousMultichannelBeamSearch(tgt_dicts) + else: + self.search = search_strategy + # We only need to set src_lengths in LengthConstrainedBeamSearch. + # As a module attribute, setting it would break in multithread + # settings when the model is shared. + self.should_set_src_lengths = ( + hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths + ) + + self.model.eval() + + self.lm_model = lm_model + self.lm_weight = lm_weight + if self.lm_model is not None: + self.lm_model.eval() + + self.duration_prediction = bool( + str(getattr(models[0].decoder.args, "duration_prediction", "false")).lower() + == "true" + ) + self.delayed_duration = bool( + str( + getattr(models[0].decoder.args, "delayed_duration_target", "false") + ).lower() + == "true" + ) + self.duration_temperature = duration_temperature + + def cuda(self): + self.model.cuda() + return self + + @torch.no_grad() + def forward( + self, + sample: Dict[str, Dict[str, Tensor]], # TODO: Modify this + prefix_tokens: Optional[Dict[str, Tensor]] = None, + bos_token: Optional[int] = None, + ): + """Generate a batch of translations. + + Args: + sample (dict): batch + prefix_tokens (dict of torch.LongTensor, optional): force decoder to begin + with these tokens + bos_token (int, optional): beginning of sentence token + (default: self.eos) + """ + return self._generate(sample, prefix_tokens, bos_token=bos_token) + + @torch.no_grad() + def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs): + """Generate translations. Match the api of other fairseq generators. + + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models + sample (dict): batch + prefix_tokens (dict of torch.LongTensor, optional): force decoder to begin + with these tokens + constraints (torch.LongTensor, optional): force decoder to include + the list of constraints + bos_token (int, optional): beginning of sentence token + (default: self.eos) + """ + return self._generate(sample, **kwargs) + + def _generate( + self, + sample: Dict[str, Dict[str, Tensor]], + prefix_tokens: Optional[Dict[str, Tensor]] = None, + constraints: Optional[Tensor] = None, + bos_token: Optional[int] = None, + ): + """ + Here sample is expected to have the following form + { + 'id': index, + 'net_input': { + 'src_tokens': { + 'channel1' : tensor((batch x src_length)), + 'channel2' : tensor((batch x src_length)), + }, + ... 
+ }, + } + and prefix_tokens + { + 'channel1' : tensor((batch x prefix_length)), + 'channel2' : tensor((batch x prefix_length)), + } + """ + if self.model.is_speech_dlm: + incremental_states = torch.jit.annotate( + List[Dict[str, Dict[str, Optional[Tensor]]]], + [ + torch.jit.annotate( + List[Dict[str, Dict[str, Optional[Tensor]]]], + [{} for _ in range(self.n_channels)], + ) + for i in range(self.model.models_size) + ], + ) + else: + incremental_states = torch.jit.annotate( + List[Dict[str, Dict[str, Optional[Tensor]]]], + [ + torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) + for i in range(self.model.models_size) + ], + ) + net_input = sample["net_input"] + # Convert from dict to tensor form + # shape of src_tokens : (bsz x src_len x n_channels) + src_tokens = torch.stack( + [net_input["src_tokens"][channel] for channel in self.channels], dim=-1 + ) + prefix_tokens = torch.stack( + [prefix_tokens[channel] for channel in self.channels], dim=-1 + ) + # length of the source text being the character length except EndOfSentence and pad + src_lengths = ( + (src_tokens[..., 0].ne(self.eos) & src_tokens[..., 0].ne(self.pad)) + .long() + .sum(dim=1) + ) + + # bsz: total number of sentences in beam + # Note that src_tokens may have more than 2 dimensions (i.e. audio features) + bsz, src_len = src_tokens.size()[:2] + beam_size = self.beam_size + + if constraints is not None and not self.search.supports_constraints: + raise NotImplementedError( + "Target-side constraints were provided, but search method doesn't support them" + ) + + # Initialize constraints, when active + self.search.init_constraints(constraints, beam_size) + + max_len: int = -1 + if self.match_source_len: + max_len = src_lengths.max().item() + else: + max_len = min( + int(self.max_len_a * src_len + self.max_len_b), + # exclude the EOS marker + self.model.max_decoder_positions() - 1, + ) + assert ( + self.min_len <= max_len + ), "min_len cannot be larger than max_len, please adjust these!" + # compute the encoder output for each beam + encoder_outs = self.model.forward_encoder(net_input) + + # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores + new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) + new_order = new_order.to(src_tokens.device).long() + encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) + # ensure encoder_outs is a List. + assert encoder_outs is not None + + # initialize buffers + # cumulative scores of hypotheses + scores = ( + torch.zeros(bsz * beam_size, max_len + 1, self.n_channels) + .to(src_tokens) + .float() + ) # +1 for eos; pad is never chosen for scoring + tokens = ( + torch.zeros(bsz * beam_size, max_len + 2, self.n_channels) + .to(src_tokens) + .long() + .fill_(self.pad) + ) # +2 for eos and pad + tokens[:, 0] = self.eos if bos_token is None else bos_token + attn: Optional[Tensor] = None + + # A list that indicates candidates that should be ignored. + # For example, suppose we're sampling and have already finalized 2/5 + # samples. Then cands_to_ignore would mark 2 positions as being ignored, + # so that we only finalize the remaining 3 samples. 
+ cands_to_ignore = ( + torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) + ) # forward and backward-compatible False mask + + # list of completed sentences + finalized = torch.jit.annotate( + List[List[Dict[str, Tensor]]], + [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], + ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step + + finished = [ + False for i in range(bsz) + ] # a boolean array indicating if the sentence at the index is finished or not + num_remaining_sent = bsz # number of sentences remaining + + # number of candidate hypos per step + cand_size = 2 * beam_size # 2 x beam size in case half are EOS + + # offset arrays for converting between different indexing schemes + bbsz_offsets = ( + (torch.arange(0, bsz) * beam_size) + .unsqueeze(1) + .type_as(tokens) + .to(src_tokens.device) + ) + cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device) + + reorder_state: Optional[Tensor] = None + batch_idxs: Optional[Tensor] = None + + original_batch_idxs: Optional[Tensor] = None + if "id" in sample and isinstance(sample["id"], Tensor): + original_batch_idxs = sample["id"] + else: + original_batch_idxs = torch.arange(0, bsz).type_as(tokens) + + if self.duration_prediction: + dur_counter = torch.ones(bsz * beam_size, self.n_channels).to(src_tokens) + # save the indice where the dur_counter just copied from dur_pred + dur_counter_jump_indices = None + + for step in range(max_len + 1): # one extra step for EOS marker + # reorder decoder internal states based on the prev choice of beams + if reorder_state is not None: + if batch_idxs is not None: + # update beam indices to take into account removed sentences + corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as( + batch_idxs + ) + reorder_state.view(-1, beam_size).add_( + corr.unsqueeze(-1) * beam_size + ) + original_batch_idxs = original_batch_idxs[batch_idxs] + self.model.reorder_incremental_state(incremental_states, reorder_state) + encoder_outs = self.model.reorder_encoder_out( + encoder_outs, reorder_state + ) + + input_tokens = { + channel: tokens[:, : step + 1, i] + for i, channel in enumerate(self.channels) + } + + lprobs_dict, avg_attn_scores = self.model.forward_decoder( + input_tokens, + encoder_outs, + incremental_states, + self.temperature, + ) + + # Because the sizes of vocab is different, we cannot concat the lprobs to form a single tensor + if not self.duration_prediction: + lprobs_list = list(lprobs_dict.values()) + else: + lprobs_list = [ + net_output["pred_token"] for net_output in lprobs_dict.values() + ] + + # non-positive predicted durations + dur_preds = ( + torch.stack( + [ + net_output["pred_duration"] + for net_output in lprobs_dict.values() + ] + ) + .squeeze(-1) + .T + ) + dur_preds = dur_preds / self.duration_temperature + dur_preds = dur_preds.round().long() + dur_preds[dur_preds < 1] = 1 + + # dur_preds & dur_counter needs to be modified when there isn't an edge + if step > 0: + non_edge_indices = tokens[:, step, :] == tokens[:, step - 1, :] + if self.delayed_duration: + dur_preds[non_edge_indices] = 1 + else: + if dur_counter_jump_indices is not None: + dur_counter[dur_counter_jump_indices & non_edge_indices] = 2 + + # update dur_counter + if step > 0: + if self.delayed_duration: + dur_counter -= ( + (dur_counter == 1) + | (tokens[:, step, :] == tokens[:, step - 1, :]) + ).int() + dur_counter[dur_counter < 0] = 0 + else: + dur_counter -= ( + tokens[:, step, :] == tokens[:, step - 1, :] + ).int() + 
dur_counter[dur_counter < 1] = 1 + + # whether to copy previous token (ie. if the counter is still on) + # and get get the new duration + if self.delayed_duration: + dur_counter_jump_indices = dur_counter == 0 + dur_counter[dur_counter_jump_indices] = dur_preds[ + dur_counter_jump_indices + ] + + # whether to copy previous token in this step + copy_prev_token = dur_counter != 1 + if self.delayed_duration is False: + dur_counter_jump_indices = dur_counter == 1 + dur_counter[dur_counter_jump_indices] = dur_preds[ + dur_counter_jump_indices + ] + # else: + # dur_counter[dur_counter==0] = dur_preds[dur_counter==0] - 1 + # copy_prev_token = (dur_counter > 0) + + if self.lm_model is not None: + assert False, "Currently not supported in multichannelLM case" + + for i in range(self.n_channels): + lprobs_list[i][lprobs_list[i] != lprobs_list[i]] = torch.tensor( + -math.inf + ).to(lprobs_list[i]) + + lprobs_list[i][:, self.pad] = -math.inf # never select pad + lprobs_list[i][:, self.unk] -= self.unk_penalty # apply unk penalty + + # handle max length constraint + if step >= max_len: + lprobs_list[i][:, : self.eos] = -math.inf + lprobs_list[i][:, self.eos + 1 :] = -math.inf + else: + lprobs_list[i][ + :, self.eos + ] = -math.inf # quick fix for short generation + + # handle prefix tokens (possibly with different lengths) + if ( + prefix_tokens is not None + and step < prefix_tokens.size(1) + and step < max_len + ): + ( + lprobs_list[i], + tokens[..., i], + scores[..., i], + ) = self._prefix_tokens( + step, + lprobs_list[i], + scores[..., i], + tokens[..., i], + prefix_tokens[..., i], + beam_size, + ) + if self.duration_prediction: + # Can copy previous token if the prefix token is padding or unk (1-channel conditionned case) + can_copy_mask = ( + prefix_tokens[:, step, i].eq(self.pad) + | prefix_tokens[:, step, i].eq(self.unk) + ).repeat_interleave(beam_size) + copy_prev_token[:, i] &= can_copy_mask + elif step < self.min_len: + # minimum length constraint (does not apply if using prefix_tokens) + lprobs_list[i][:, self.eos] = -math.inf + + if self.duration_prediction: + if step < max_len: + for j in range(copy_prev_token.size(0)): + if copy_prev_token[j, i]: + prev_token = tokens[j, step, i] + lprobs_list[i][j, :prev_token] = -math.inf + lprobs_list[i][j, prev_token + 1 :] = -math.inf + # lprobs_list[i][j, prev_token] = 0. + # dur_counter[j,i] -= 1 + # else: + # prev_token = tokens[j, step, i] + # if not (lprobs_list[i][j,:].ne(-math.inf).nonzero() == prev_token).all(): + # lprobs_list[i][j, prev_token] = -math.inf + # dur_counter[j,i] = 0. 
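+            # Rough intuition for the duration mechanism above (non-delayed case, numbers
+            # assumed for illustration): when a channel's counter reaches 1, a fresh unit is
+            # sampled and the counter is reloaded from the predicted duration, say 3;
+            # copy_prev_token then stays True for the next two steps (counter 3 -> 2 -> 1),
+            # so the same unit is repeated, giving 3 frames in total before the next unit
+            # can be sampled.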
+ + # Record attention scores, only support avg_attn_scores is a Tensor + if avg_attn_scores is not None: + if attn is None: + attn = torch.empty( + bsz * beam_size, avg_attn_scores.size(1), max_len + 2 + ).to(scores) + attn[:, :, step + 1].copy_(avg_attn_scores) + + scores = scores.type_as(lprobs_list[0]) + eos_bbsz_idx = torch.empty(0).to( + tokens + ) # indices of hypothesis ending with eos (finished sentences) + eos_scores = torch.empty(0).to( + scores + ) # scores of hypothesis ending with eos (finished sentences) + + if self.should_set_src_lengths: + self.search.set_src_lengths(src_lengths) + + if self.repeat_ngram_blocker is not None: + for i in range(self.n_channels): + lprobs_list[i] = self.repeat_ngram_blocker( + tokens, lprobs_list[i], bsz, beam_size, step + ) + + # Shape: (batch, cand_size) + cand_scores, cand_indices, cand_beams = self.search.step( + step, + [ + lprobs_list[i].view(bsz, -1, self.vocab_sizes[i]) + for i in range(self.n_channels) + ], + scores.view(bsz, beam_size, -1, self.n_channels)[:, :, :step, :], + tokens[:, : step + 1], + original_batch_idxs, + ) + + # cand_bbsz_idx contains beam indices for the top candidate + # hypotheses, with a range of values: [0, bsz*beam_size), + # and dimensions: [bsz, cand_size] + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + + # finalize hypotheses that end in eos + # Shape of eos_mask: (batch size, beam size) + eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) + eos_mask = torch.any(eos_mask, dim=-1, keepdim=False) + eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) + + # only consider eos when it's among the top beam_size indices + # Now we know what beam item(s) to finish + # Shape: 1d list of absolute-numbered + eos_bbsz_idx = torch.masked_select( + cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + + finalized_sents: List[int] = [] + if eos_bbsz_idx.numel() > 0: + eos_scores = torch.stack( + [ + torch.masked_select( + cand_scores[:, :beam_size, i], mask=eos_mask[:, :beam_size] + ) + for i in range(self.n_channels) + ], + dim=-1, + ) + finalized_sents = self.finalize_hypos( + step, + eos_bbsz_idx, + eos_scores, + tokens, + scores, + finalized, + finished, + beam_size, + attn, + src_lengths, + max_len, + ) + num_remaining_sent -= len(finalized_sents) + + assert num_remaining_sent >= 0 + if num_remaining_sent == 0: + break + if self.search.stop_on_max_len and step >= max_len: + break + assert step < max_len, f"{step} < {max_len}" + + # Remove finalized sentences (ones for which {beam_size} + # finished hypotheses have been generated) from the batch. 
+ if len(finalized_sents) > 0: + new_bsz = bsz - len(finalized_sents) + + # construct batch_idxs which holds indices of batches to keep for the next pass + batch_mask = torch.ones( + bsz, dtype=torch.bool, device=cand_indices.device + ) + batch_mask[finalized_sents] = False + # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it + batch_idxs = torch.arange( + bsz, device=cand_indices.device + ).masked_select(batch_mask) + + # Choose the subset of the hypothesized constraints that will continue + self.search.prune_sentences(batch_idxs) + + eos_mask = eos_mask[batch_idxs] + cand_beams = cand_beams[batch_idxs] + bbsz_offsets.resize_(new_bsz, 1) + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + cand_scores = cand_scores[batch_idxs] + cand_indices = cand_indices[batch_idxs] + + if prefix_tokens is not None: + prefix_tokens = prefix_tokens[batch_idxs] + src_lengths = src_lengths[batch_idxs] + cands_to_ignore = cands_to_ignore[batch_idxs] + + scores = scores.view(bsz, -1)[batch_idxs].view( + new_bsz * beam_size, -1, self.n_channels + ) + tokens = tokens.view(bsz, -1)[batch_idxs].view( + new_bsz * beam_size, -1, self.n_channels + ) + if self.duration_prediction: + dur_counter = dur_counter.view(bsz, -1)[batch_idxs].view( + new_bsz * beam_size, self.n_channels + ) + if attn is not None: + attn = attn.view(bsz, -1)[batch_idxs].view( + new_bsz * beam_size, attn.size(1), -1 + ) + bsz = new_bsz + else: + batch_idxs = None + + # Set active_mask so that values > cand_size indicate eos hypos + # and values < cand_size indicate candidate active hypos. + # After, the min values per row are the top candidate active hypos + + # Rewrite the operator since the element wise or is not supported in torchscript. + + eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) + active_mask = torch.add( + eos_mask.type_as(cand_offsets) * cand_size, + cand_offsets[: eos_mask.size(1)], + ) + + # get the top beam_size active hypotheses, which are just + # the hypos with the smallest values in active_mask. + # {active_hypos} indicates which {beam_size} hypotheses + # from the list of {2 * beam_size} candidates were + # selected. Shapes: (batch size, beam size) + new_cands_to_ignore, active_hypos = torch.topk( + active_mask, k=beam_size, dim=1, largest=False + ) + + # update cands_to_ignore to ignore any finalized hypos. + cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] + # Make sure there is at least one active item for each sentence in the batch. + assert (~cands_to_ignore).any(dim=1).all() + + # update cands_to_ignore to ignore any finalized hypos + # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam + # can be selected more than once). 
+ active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) + active_bbsz_idx = active_bbsz_idx.view(-1) + + # active_scores = torch.stack([ + # torch.gather(cand_scores[...,0], dim=1, index=active_hypos) + # for i in range(self.n_channels) + # ], dim = -1) + # active_scores = active_scores.view(-1) + + # copy tokens and scores for active hypotheses + + # Set the tokens for each beam (can select the same row more than once) + tokens[:, : step + 1] = torch.index_select( + tokens[:, : step + 1], dim=0, index=active_bbsz_idx + ) + # Select the next token for each of them + for i in range(self.n_channels): + tokens.view(bsz, beam_size, -1, self.n_channels)[ + :, :, step + 1, i + ] = torch.gather(cand_indices[..., i], dim=1, index=active_hypos) + if step > 0: + scores[:, :step] = torch.index_select( + scores[:, :step], dim=0, index=active_bbsz_idx + ) + for i in range(self.n_channels): + scores.view(bsz, beam_size, -1, self.n_channels)[ + :, :, step, i + ] = torch.gather(cand_scores[..., i], dim=1, index=active_hypos) + + if self.duration_prediction: + dur_counter = torch.index_select( + dur_counter, dim=0, index=active_bbsz_idx + ) + + # Update constraints based on which candidates were selected for the next beam + self.search.update_constraints(active_hypos) + + # copy attention for active hypotheses + if attn is not None: + attn[:, :, : step + 2] = torch.index_select( + attn[:, :, : step + 2], dim=0, index=active_bbsz_idx + ) + + # reorder incremental state in decoder + reorder_state = active_bbsz_idx + + # sort by score descending + for sent in range(len(finalized)): + scores = torch.tensor( + [float(elem["score"].item()) for elem in finalized[sent]] + ) + _, sorted_scores_indices = torch.sort(scores, descending=True) + finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] + finalized[sent] = torch.jit.annotate( + List[Dict[str, Tensor]], finalized[sent] + ) + return finalized + + def _prefix_tokens( + self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int + ): + """Handle prefix tokens""" + prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) + prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1)) + prefix_mask = prefix_toks.ne(self.pad) + # used for 1-channel generation, do not force the unk token (i.e. unk tokens are changed) + prefix_mask &= prefix_toks.ne(self.unk) + # zeroing the copying tokens + # if step > 0: + # copy_mask = (prefix_tokens[:, step] == prefix_tokens[:, step-1]).unsqueeze(-1).repeat(1, beam_size).view(-1) + # prefix_lprobs[copy_mask & prefix_mask] = 0. 
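+        # Force the prefix token: rows carrying a real (non-pad, non-unk) prefix first get
+        # every vocabulary entry masked to -inf, then the original log-probability of the
+        # prefix token is scattered back in, so only that token can be chosen at this step.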
+ lprobs[prefix_mask] = torch.tensor(-math.inf).to(lprobs) + lprobs[prefix_mask] = lprobs[prefix_mask].scatter( + -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask] + ) + # shouldn't stop at unk token + unk_mask = prefix_toks.eq(self.unk) + if len(lprobs[unk_mask]) > 0: + # otherwise it won't assign to lprobs, + # see: https://discuss.pytorch.org/t/how-to-mask-and-assign-a-value-to-tensor/18437 + copy_lprobs = lprobs[unk_mask][:, :] + copy_lprobs[:, self.eos] = -math.inf + lprobs[unk_mask] = copy_lprobs + # if prefix includes eos, then we should make sure tokens and + # scores are the same across all beams + eos_mask = prefix_toks.eq(self.eos) + if eos_mask.any(): + # validate that the first beam matches the prefix + first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[ + :, 0, 1 : step + 1 + ] + eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] + target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step] + assert (first_beam == target_prefix).all() + + # copy tokens, scores and lprobs from the first beam to all beams + tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size) + scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size) + lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size) + return lprobs, tokens, scores + + def replicate_first_beam(self, tensor, mask, beam_size: int): + tensor = tensor.view(-1, beam_size, tensor.size(-1)) + tensor[mask] = tensor[mask][:, :1, :] + return tensor.view(-1, tensor.size(-1)) + + def finalize_hypos( + self, + step: int, + bbsz_idx, + eos_scores, + tokens, + scores, + finalized: List[List[Dict[str, Tensor]]], + finished: List[bool], + beam_size: int, + attn: Optional[Tensor], + src_lengths, + max_len: int, + ): + """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly. + A sentence is finalized when {beam_size} finished items have been collected for it. + + Returns number of sentences (not beam items) being finalized. + These will be removed from the batch and not processed further. + Args: + bbsz_idx (Tensor): + """ + assert bbsz_idx.numel() == eos_scores.size(0) + + # clone relevant token and attention tensors. + # tokens is (batch * beam, max_len). So the index_select + # gets the newly EOS rows, then selects cols 1..{step + 2} + tokens_clone = tokens.index_select(0, bbsz_idx)[ + :, 1 : step + 2 + ] # skip the first index, which is EOS + + tokens_clone[:, step] = self.eos + attn_clone = ( + attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2] + if attn is not None + else None + ) + + # compute scores per token position + pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1] + pos_scores[:, step, :] = eos_scores + # convert from cumulative to per-position scores + pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] + + # normalize sentence-level scores + if self.normalize_scores: + eos_scores /= (step + 1) ** self.len_penalty + + # cum_unfin records which sentences in the batch are finished. + # It helps match indexing between (a) the original sentences + # in the batch and (b) the current, possibly-reduced set of + # sentences. 
+ cum_unfin: List[int] = [] + prev = 0 + for f in finished: + if f: + prev += 1 + else: + cum_unfin.append(prev) + + # The keys here are of the form "{sent}_{unfin_idx}", where + # "unfin_idx" is the index in the current (possibly reduced) + # list of sentences, and "sent" is the index in the original, + # unreduced batch + # set() is not supported in script export + sents_seen: Dict[str, Optional[Tensor]] = {} + + # For every finished beam item + for i in range(bbsz_idx.size()[0]): + idx = bbsz_idx[i] + score = eos_scores[i].sum() + # sentence index in the current (possibly reduced) batch + unfin_idx = idx // beam_size + # sentence index in the original (unreduced) batch + sent = unfin_idx + cum_unfin[unfin_idx] + # Cannot create dict for key type '(int, int)' in torchscript. + # The workaround is to cast int to string + seen = str(sent.item()) + "_" + str(unfin_idx.item()) + if seen not in sents_seen: + sents_seen[seen] = None + + if self.match_source_len and step > src_lengths[unfin_idx]: + score = torch.tensor(-math.inf).to(score) + + # An input sentence (among those in a batch) is finished when + # beam_size hypotheses have been collected for it + if len(finalized[sent]) < beam_size: + if attn_clone is not None: + # remove padding tokens from attn scores + hypo_attn = attn_clone[i] + else: + hypo_attn = torch.empty(0) + + finalized[sent].append( + { + "tokens": tokens_clone[i], + "score": score, + "attention": hypo_attn, # src_len x tgt_len + "alignment": torch.empty(0), + "positional_scores": pos_scores[i], + } + ) + + newly_finished: List[int] = [] + + for seen in sents_seen.keys(): + # check termination conditions for this sentence + sent: int = int(float(seen.split("_")[0])) + unfin_idx: int = int(float(seen.split("_")[1])) + + if not finished[sent] and self.is_finished( + step, unfin_idx, max_len, len(finalized[sent]), beam_size + ): + finished[sent] = True + newly_finished.append(unfin_idx) + + return newly_finished + + def is_finished( + self, + step: int, + unfin_idx: int, + max_len: int, + finalized_sent_len: int, + beam_size: int, + ): + """ + Check whether decoding for a sentence is finished, which + occurs when the list of finalized sentences has reached the + beam size, or when we reach the maximum length. 
+ """ + assert finalized_sent_len <= beam_size + if finalized_sent_len == beam_size or step == max_len: + return True + return False + + +class MultichannelEnsembleModel(nn.Module): + """A wrapper around an ensemble of SpeechDLM models.""" + + def __init__(self, models): + super().__init__() + self.models_size = len(models) + # method '__len__' is not supported in ModuleList for torch script + self.single_model = models[0] + self.models = nn.ModuleList(models) + + self.has_incremental: bool = False + if all( + hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder) + for m in models + ): + self.has_incremental = True + + if isinstance(models[0], SpeechDLM): + self.is_speech_dlm = True + # Otherwise it's a multi-channel language model (without cross-prediction outputs) + else: + self.is_speech_dlm = False + + if getattr(models[0].decoder.args, "duration_prediction", False): + self.is_duration_prediction = True + else: + self.is_duration_prediction = False + + def forward(self): + pass + + def has_encoder(self): + return hasattr(self.single_model, "encoder") + + def has_incremental_states(self): + return self.has_incremental + + def max_decoder_positions(self): + return min([m.max_decoder_positions() for m in self.models]) + + @torch.jit.export + def forward_encoder(self, net_input: Dict[str, Tensor]): + if not self.has_encoder(): + return None + return [model.encoder.forward_torchscript(net_input) for model in self.models] + + @torch.jit.export + def forward_decoder( + self, + tokens, + encoder_outs: List[Dict[str, List[Tensor]]], + incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], + temperature: Dict[str, float] = 1.0, + ): + if isinstance(temperature, (float, int)): + temperature = {channel: temperature for channel in tokens} + log_probs = {channel: [] for channel in tokens} + avg_attn: Optional[Tensor] = None + encoder_out: Optional[Dict[str, List[Tensor]]] = None + for i, model in enumerate(self.models): + if self.has_encoder(): + encoder_out = encoder_outs[i] + # decode each model + if self.has_incremental_states(): + decoder_out = model.decoder.forward( + tokens, + encoder_out=encoder_out, + incremental_state=incremental_states[i], + ) + else: + decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out) + + attn: Optional[Tensor] = None + decoder_len = len(decoder_out) + if decoder_len > 1 and decoder_out[1] is not None: + if isinstance(decoder_out[1], Tensor): + attn = decoder_out[1] + else: + attn_holder = decoder_out[1]["attn"] + if isinstance(attn_holder, Tensor): + attn = attn_holder + elif attn_holder is not None: + attn = attn_holder[0] + if attn is not None: + attn = attn[:, -1, :] + + if self.is_speech_dlm: + if self.is_duration_prediction: + decoder_out_divided_by_temperature = { + channel_src: { + channel_pred: { + "pred_token": decoder_out[0][channel_src][channel_pred][ + "pred_token" + ][:, -1:, :].div_(temperature[channel_pred]), + "pred_duration": decoder_out[0][channel_src][ + channel_pred + ]["pred_duration"][:, -1:, :], + } + for channel_pred in decoder_out[0][channel_src] + } + for channel_src in decoder_out[0] + } + else: + decoder_out_divided_by_temperature = { + channel_src: { + channel_pred: decoder_out[0][channel_src][channel_pred][ + :, -1:, : + ].div_(temperature[channel_pred]) + for channel_pred in decoder_out[0][channel_src] + } + for channel_src in decoder_out[0] + } + else: + decoder_out_divided_by_temperature = { + channel: decoder_out[0][channel][:, -1:, :].div_( + temperature[channel] + ) + for channel in 
decoder_out[0] + } + decoder_out_tuple = ( + decoder_out_divided_by_temperature, + None if decoder_len <= 1 else decoder_out[1], + ) + + probs = model.get_normalized_probs( + decoder_out_tuple, log_probs=True, sample=None + ) + + if self.is_speech_dlm: + if self.is_duration_prediction: + probs = { + channel: { + "pred_token": probs[channel][channel]["pred_token"][ + :, -1, : + ], + "pred_duration": probs[channel][channel]["pred_duration"][ + :, -1, : + ], + } + for channel in probs + } + else: + probs = { + channel: probs[channel][channel][:, -1, :] for channel in probs + } + else: + probs = {channel: probs[channel][:, -1, :] for channel in probs} + if self.models_size == 1: + return probs, attn + + for channel in probs: + log_probs[channel].append(probs[channel]) + if attn is not None: + if avg_attn is None: + avg_attn = attn + else: + avg_attn.add_(attn) + + avg_probs = {} + for channel in log_probs: + avg_probs[channel] = torch.logsumexp( + torch.stack(log_probs[channel], dim=0), dim=0 + ) - math.log(self.models_size) + + if avg_attn is not None: + avg_attn.div_(self.models_size) + return avg_probs, avg_attn + + @torch.jit.export + def reorder_encoder_out( + self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order + ): + """ + Reorder encoder output according to *new_order*. + + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + new_outs: List[Dict[str, List[Tensor]]] = [] + if not self.has_encoder(): + return new_outs + for i, model in enumerate(self.models): + assert encoder_outs is not None + new_outs.append( + model.encoder.reorder_encoder_out(encoder_outs[i], new_order) + ) + return new_outs + + @torch.jit.export + def reorder_incremental_state( + self, + incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], + new_order, + ): + if not self.has_incremental_states(): + return + for i, model in enumerate(self.models): + model.decoder.reorder_incremental_state_scripting( + incremental_states[i], new_order + ) diff --git a/fairseq/models/speech_dlm/speech_dlm.py b/fairseq/models/speech_dlm/speech_dlm.py new file mode 100644 index 0000000000..dc13f565f1 --- /dev/null +++ b/fairseq/models/speech_dlm/speech_dlm.py @@ -0,0 +1,280 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
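A quick, self-contained check of the ensemble averaging used in MultichannelEnsembleModel.forward_decoder above: taking logsumexp over the per-model log-probabilities and subtracting log(n_models) is exactly the log of the mean probability across models. The tensor values below are made up for illustration.

import math
import torch

log_probs = torch.log(torch.tensor([[0.2, 0.8],    # model 1
                                    [0.4, 0.6]]))  # model 2
avg = torch.logsumexp(log_probs, dim=0) - math.log(log_probs.size(0))
assert torch.allclose(avg.exp(), torch.tensor([0.3, 0.7]))  # mean of the two probability rows
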
+ +import logging +from dataclasses import dataclass, field +from typing import Optional + +from fairseq import utils +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.models import ( + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import Embedding +from .modules.speech_dlm_decoder import CrossChannelTransformerDecoder +from omegaconf import II + + +DEFAULT_MAX_TARGET_POSITIONS = 1024 + +logger = logging.getLogger(__name__) + + +@dataclass +class SpeechDLMConfig(FairseqDataclass): + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="relu", metadata={"help": "activation function to use"} + ) + dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) + attention_dropout: float = field( + default=0.0, metadata={"help": "dropout probability for attention weights"} + ) + activation_dropout: float = field( + default=0.0, metadata={"help": "dropout probability after activation in FFN."} + ) + relu_dropout: float = field( + default=0.0, metadata={"help": "dropout probability after activation in FFN."} + ) + decoder_embed_dim: int = field( + default=512, metadata={"help": "decoder embedding dimension"} + ) + decoder_output_dim: int = field( + default=512, metadata={"help": "decoder output dimension"} + ) + decoder_input_dim: int = field( + default=512, metadata={"help": "decoder input dimension"} + ) + decoder_ffn_embed_dim: int = field( + default=2048, metadata={"help": "decoder embedding dimension for FFN"} + ) + decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"}) + decoder_cross_layers: int = field( + default=-1, metadata={"help": "num self cross attention decoder layers"} + ) + decoder_attention_heads: int = field( + default=8, metadata={"help": "num decoder attention heads"} + ) + decoder_normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each decoder block"} + ) + no_decoder_final_norm: bool = field( + default=False, + metadata={"help": "don't add an extra layernorm after the last decoder block"}, + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if set, disables positional embeddings (outside self attention)" + }, + ) + share_decoder_input_output_embed: bool = field( + default=False, metadata={"help": "share decoder input and output embeddings"} + ) + decoder_learned_pos: bool = field( + default=False, + metadata={"help": "use learned positional embeddings in the decoder"}, + ) + decoder_layerdrop: float = field( + default=0.0, metadata={"help": "LayerDrop probability for decoder"} + ) + decoder_layers_to_keep: Optional[str] = field( + default=None, + metadata={ + "help": "which layers to *keep* when pruning as a comma-separated list" + }, + ) + layernorm_embedding: bool = field( + default=False, metadata={"help": "add layernorm to embedding"} + ) + no_scale_embedding: bool = field( + default=False, metadata={"help": "if True, dont scale embeddings"} + ) + checkpoint_activations: bool = field( + default=False, metadata={"help": "checkpoint activations at each layer"} + ) + offload_activations: bool = field( + default=False, + metadata={"help": "move checkpointed activations to CPU after they are used."}, + ) + quant_noise_pq: float = field( + default=0.0, + metadata={"help": "iterative PQ quantization noise at training time"}, + ) + quant_noise_pq_block_size: int = field( + default=8, + metadata={"help": "block size of quantization noise at 
training time"}, + ) + # TODO common var add to parent + quant_noise_scalar: float = field( + default=0.0, + metadata={ + "help": "scalar quantization noise and scalar quantization at training time" + }, + ) + add_bos_token: bool = II("task.add_bos_token") + tokens_per_sample: int = II("task.tokens_per_sample") + max_target_positions: Optional[int] = II("task.max_target_positions") + tpu: bool = II("common.tpu") + duration_prediction: str = II("task.duration_prediction") + delayed_duration_target: str = II("task.delayed_duration_target") + main_and_cross_weights: str = II("criterion.main_and_cross_weights") + + +@register_model("speech_dlm", dataclass=SpeechDLMConfig) +class SpeechDLM(FairseqLanguageModel): + """Spoken Unit-based Dialogue Language Model model (SpeechDLM) as described + in the paper: https://arxiv.org/pdf/2203.16502.pdf + """ + + def __init__(self, decoder): + super().__init__(decoder) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + # make sure all arguments are present in older models + base_lm_architecture(args) + + if args.decoder_layers_to_keep: + args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) + + if args.decoder_cross_layers < 0: + args.decoder_cross_layers = args.decoder_layers + + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = getattr( + args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS + ) + + # Assert all dictionary to be the same + assert all( + task.source_dictionaries[channel] == task.source_dictionary + for channel in task.channels + ), "Source dictionaries of all channels are expected to be the same!!!" + assert all( + task.target_dictionaries[channel] == task.target_dictionary + for channel in task.channels + ), "Target dictionaries of all channels are expected to be the same!!!" + # Build the unit embeddings + embed_tokens = cls.build_embedding( + args, task.source_dictionary, args.decoder_input_dim + ) + + decoder = CrossChannelTransformerDecoder( + args, + task.target_dictionary, + embed_tokens, + channels=task.channels, + no_encoder_attn=True, + ) + return cls(decoder) + + @classmethod + def build_embedding(cls, args, dictionary, embed_dim, path=None): + embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad()) + return embed_tokens + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + **kwargs, + ): + """ + Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model + file. Downloads and caches the pre-trained model file if needed. + + The base implementation returns a + :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to + generate translations or sample from language models. The underlying + :class:`~fairseq.models.FairseqModel` can be accessed via the + *generator.models* attribute. + + This function return a class:`MultichannelGeneratorHubInterface` object, + which allows generation in multiple channels with a multichannel model. + + Args: + model_name_or_path (str): either the name of a pre-trained model to + load or a path/URL to a pre-trained model state dict + checkpoint_file (str, optional): colon-separated list of checkpoint + files in the model archive to ensemble (default: 'model.pt') + data_name_or_path (str, optional): point args.data to the archive + at the given path/URL. Can start with '.' or './' to reuse the + model archive path. 
+ """ + from fairseq import hub_utils + from .hub_interface import MultichannelGeneratorHubInterface + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + **kwargs, + ) + logger.info(x["args"]) + return MultichannelGeneratorHubInterface(x["args"], x["task"], x["models"]) + + @property + def supported_targets(self): + return {"next", "edge", "duration"} + + +def base_lm_architecture(args): + # backward compatibility for older model checkpoints + if hasattr(args, "decoder_final_norm"): + args.no_decoder_final_norm = not args.decoder_final_norm + + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_cross_layers = getattr(args, "decoder_cross_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) + args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8) + args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) + + args.add_bos_token = getattr(args, "add_bos_token", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + # Model training is not stable without this + args.decoder_normalize_before = True + args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", False) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.checkpoint_activations = getattr(args, "checkpoint_activations", False) + args.offload_activations = getattr(args, "offload_activations", False) + if args.offload_activations: + args.checkpoint_activations = True + + +@register_model_architecture("speech_dlm", "speech_dlm_big") +def speech_dlm_big(args): + args.decoder_layers = getattr(args, "decoder_layers", 12) + args.decoder_cross_layers = getattr(args, "decoder_cross_layers", 12) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + base_lm_architecture(args) diff --git a/fairseq/models/speech_to_speech/__init__.py b/fairseq/models/speech_to_speech/__init__.py new file mode 100644 index 0000000000..f29215c2fe --- /dev/null +++ b/fairseq/models/speech_to_speech/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from .s2s_conformer import * # noqa +from .s2s_conformer_translatotron2 import * # noqa +from .s2s_conformer_unity import * # noqa +from .s2s_transformer import * # noqa diff --git a/fairseq/models/speech_to_speech/modules/__init__.py b/fairseq/models/speech_to_speech/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fairseq/models/speech_to_speech/modules/ctc_decoder.py b/fairseq/models/speech_to_speech/modules/ctc_decoder.py new file mode 100644 index 0000000000..721efbf61a --- /dev/null +++ b/fairseq/models/speech_to_speech/modules/ctc_decoder.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from torch import nn + +from fairseq.models import FairseqEncoder + + +class CTCDecoder(FairseqEncoder): + def __init__(self, dictionary, in_dim): + super().__init__(dictionary) + self.proj = nn.Linear(in_dim, len(dictionary)) + + def forward(self, src_tokens, src_lengths=None, **kwargs): + encoder_out = self.proj(src_tokens) + return {"encoder_out": encoder_out} diff --git a/fairseq/models/speech_to_speech/modules/stacked_embedding.py b/fairseq/models/speech_to_speech/modules/stacked_embedding.py new file mode 100644 index 0000000000..5955a08538 --- /dev/null +++ b/fairseq/models/speech_to_speech/modules/stacked_embedding.py @@ -0,0 +1,48 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from torch import nn + +from fairseq.models.transformer import Linear + + +class StackedEmbedding(nn.Embedding): + """Embedding module that supports stacked units -> single embedding""" + + def __init__(self, num_embeddings, embed_dim, padding_idx, num_stacked=1): + super().__init__(num_embeddings, embed_dim, padding_idx) + # follow transformer.Embedding + nn.init.normal_(self.weight, mean=0, std=embed_dim**-0.5) + nn.init.constant_(self.weight[padding_idx], 0) + + self.offset = ( + 4 # skip <bos>, <pad>, <eos>, <unk>, specific to fairseq dictionary + ) + self.vocab_size = num_embeddings - self.offset + self.num_stacked = num_stacked + + if self.num_stacked > 1: + self.project_in_dim = Linear(embed_dim * num_stacked, embed_dim, bias=False) + + def forward(self, input): + if self.num_stacked == 1: + return super().forward(input) + + # expand input indices + mask = input >= self.offset + stacked_input = [] + cum_input = input.new_zeros(input.shape) + for i in range(1, self.num_stacked + 1): + div = pow(self.vocab_size, i) + next_input = torch.remainder(input - self.offset - cum_input, div) + cum_input += next_input + next_input = torch.floor_divide(next_input, div // self.vocab_size) + stacked_input.append((next_input + self.offset) * mask + input * ~mask) + + stacked_input = torch.stack(stacked_input[::-1], dim=2) + embed = super().forward(stacked_input).view(input.size(0), input.size(1), -1) + embed = self.project_in_dim(embed) + return embed diff --git a/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py b/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py new file mode 100644 index 0000000000..68f42c2b36 --- /dev/null +++ b/fairseq/models/speech_to_speech/modules/transformer_decoder_aug.py @@ -0,0 +1,108 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, List, Optional + +from torch import Tensor + +from fairseq.models.transformer import Linear +from fairseq.models.transformer.transformer_decoder_aug import AugTransformerDecoder + + +class AugTransformerUnitDecoder(AugTransformerDecoder): + """Based on Transformer decoder, with support to decoding stacked units""" + + def __init__( + self, + args, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + super().__init__( + args, dictionary, embed_tokens, no_encoder_attn, output_projection + ) + self.n_frames_per_step = args.n_frames_per_step + + self.out_proj_n_frames = ( + Linear( + self.output_embed_dim, + self.output_embed_dim * self.n_frames_per_step, + bias=False, + ) + if self.n_frames_per_step > 1 + else None + ) + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention, should be of size T x B x C + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + encoder_out_aug=encoder_out_aug, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + + if not features_only: + bsz, seq_len, d = x.size() + if self.out_proj_n_frames: + x = self.out_proj_n_frames(x) + x = self.output_layer(x.view(bsz, seq_len, self.n_frames_per_step, d)) + x = x.view(bsz, seq_len * self.n_frames_per_step, -1) + if ( + incremental_state is None and self.n_frames_per_step > 1 + ): # teacher-forcing mode in training + x = x[ + :, : -(self.n_frames_per_step - 1), : + ] # remove extra frames after <eos> + + return x, extra + + def upgrade_state_dict_named(self, state_dict, name): + if self.n_frames_per_step > 1: + move_keys = [ + ( + f"{name}.project_in_dim.weight", + f"{name}.embed_tokens.project_in_dim.weight", + ) + ] + for from_k, to_k in move_keys: + if from_k in state_dict and to_k not in state_dict: + state_dict[to_k] = state_dict[from_k] + del state_dict[from_k] diff --git a/fairseq/models/speech_to_speech/modules/transformer_encoder.py b/fairseq/models/speech_to_speech/modules/transformer_encoder.py new file mode 100644 index 0000000000..fb1af433d8 --- /dev/null +++ b/fairseq/models/speech_to_speech/modules/transformer_encoder.py @@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn + +from fairseq.models import FairseqEncoder +from fairseq.modules import LayerNorm, TransformerEncoderLayer + + +class TransformerEncoderNoEmb(FairseqEncoder): + """Transformer encoder without token embeddings.""" + + def __init__(self, args): + super().__init__(None) + + self.layers = nn.ModuleList( + [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)] + ) + if args.encoder_normalize_before: + self.layer_norm = LayerNorm(args.encoder_embed_dim) + else: + self.layer_norm = None + + def forward(self, x, encoder_padding_mask, return_all_hiddens=False): + + encoder_states = [] + + for layer in self.layers: + x = layer(x, encoder_padding_mask) + if return_all_hiddens: + encoder_states.append(x) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask] + if encoder_padding_mask is not None and encoder_padding_mask.any() + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + def reorder_encoder_out(self, encoder_out, new_order): + new_encoder_out = ( + [] + if len(encoder_out["encoder_out"]) == 0 + else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]] + ) + + new_encoder_padding_mask = ( + [] + if len(encoder_out["encoder_padding_mask"]) == 0 + else [ + x.index_select(0, new_order) + for x in encoder_out["encoder_padding_mask"] + ] + ) + + new_encoder_embedding = ( + [] + if len(encoder_out["encoder_embedding"]) == 0 + else [ + x.index_select(0, new_order) for x in encoder_out["encoder_embedding"] + ] + ) + + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], # B x T + "src_lengths": [], # B x 1 + } diff --git a/fairseq/models/speech_to_speech/s2s_conformer.py b/fairseq/models/speech_to_speech/s2s_conformer.py new file mode 100644 index 0000000000..636396d536 --- /dev/null +++ b/fairseq/models/speech_to_speech/s2s_conformer.py @@ -0,0 +1,172 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
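The reorder_encoder_out method above follows the usual fairseq beam-search contract: encoder states are laid out T x B x C and index-selected along the batch dimension so that each sentence is replicated once per beam. A self-contained sketch with assumed sizes:

import torch

T, B, C, beam = 5, 2, 8, 3
x = torch.randn(T, B, C)
new_order = torch.arange(B).repeat_interleave(beam)    # [0, 0, 0, 1, 1, 1]
x_beam = x.index_select(1, new_order)                  # T x (B * beam) x C
assert x_beam.shape == (T, B * beam, C)
assert torch.equal(x_beam[:, 0], x_beam[:, 1])         # beams of sentence 0 share encoder states
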
+ +import logging +from pathlib import Path + +import torch + +from fairseq import checkpoint_utils +from fairseq.models import register_model, register_model_architecture +from fairseq.models.speech_to_speech.s2s_transformer import ( + S2SpecTTransformerModel, + S2UTTransformerModel, + s2spect_architecture_base, + s2ut_architecture_base, +) +from fairseq.models.speech_to_text import S2TConformerEncoder +from fairseq.models.transformer import Linear + +logger = logging.getLogger(__name__) + + +def build_s2s_conformer_encoder(args): + encoder = S2SConformerEncoder(args) + pretraining_path = getattr(args, "load_pretrained_encoder_from", None) + if pretraining_path is not None: + if not Path(pretraining_path).exists(): + logger.warning( + f"skipped pretraining because {pretraining_path} does not exist" + ) + else: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=pretraining_path + ) + logger.info(f"loaded pretrained encoder from: {pretraining_path}") + return encoder + + +class S2SConformerEncoder(S2TConformerEncoder): + """Based on S2T transformer encoder, with support + to incorporate target speaker embedding.""" + + def __init__(self, args): + super().__init__(args) + + self.spk_emb_proj = None + if args.target_speaker_embed: + self.spk_emb_proj = Linear( + args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim + ) + + def forward( + self, src_tokens, src_lengths, tgt_speaker=None, return_all_hiddens=False + ): + out = super().forward(src_tokens, src_lengths, return_all_hiddens) + + if self.spk_emb_proj: + x = out["encoder_out"][0] + seq_len, bsz, _ = x.size() + tgt_speaker_emb = tgt_speaker.view(1, bsz, -1).expand(seq_len, bsz, -1) + x = self.spk_emb_proj(torch.cat([x, tgt_speaker_emb], dim=2)) + out["encoder_out"][0] = x + + return out + + +@register_model("s2ut_conformer") +class S2UTConformerModel(S2UTTransformerModel): + """ + Direct speech-to-speech translation model with Conformer encoder + Transformer discrete unit decoder + """ + + @staticmethod + def add_args(parser): + S2UTTransformerModel.add_args(parser) + parser.add_argument( + "--depthwise-conv-kernel-size", + type=int, + metavar="N", + help="kernel size of depthwise convolution layers", + ) + parser.add_argument( + "--attn-type", + type=str, + metavar="STR", + help="If not specified uses fairseq MHA. Other valid option is espnet for using conformer", + ) + parser.add_argument( + "--pos-enc-type", + type=str, + metavar="STR", + help="Must be specified in addition to attn-type=espnet for rel_pos and rope", + ) + + @classmethod + def build_encoder(cls, args): + return build_s2s_conformer_encoder(args) + + +@register_model("s2spect_conformer") +class S2SpecTConformerModel(S2SpecTTransformerModel): + """ + Direct speech-to-speech translation model with Conformer encoder + TTS Transformer decoder + """ + + @staticmethod + def add_args(parser): + S2SpecTTransformerModel.add_args(parser) + parser.add_argument("--depthwise-conv-kernel-size", type=int, default=31) + parser.add_argument( + "--attn-type", + type=str, + default=None, + help="If not specified uses fairseq MHA. 
Other valid option is espnet for using conformer", + ) + parser.add_argument( + "--pos-enc-type", + type=str, + default="abs", + help="Must be specified in addition to attn-type=espnet for rel_pos and rope", + ) + + @classmethod + def build_encoder(cls, args): + return build_s2s_conformer_encoder(args) + + +@register_model_architecture("s2ut_conformer", "s2ut_conformer") +def s2ut_conformer_architecture_base(args): + args.attn_type = getattr(args, "attn_type", None) + args.pos_enc_type = getattr(args, "pos_enc_type", "abs") + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.input_channels = getattr(args, "input_channels", 1) + args.max_source_positions = getattr(args, "max_source_positions", 6000) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + args.encoder_layers = getattr(args, "encoder_layers", 16) + args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31) + s2ut_architecture_base(args) + + +@register_model_architecture("s2spect_conformer", "s2spect_conformer") +def s2spect_conformer_architecture_base(args): + args.attn_type = getattr(args, "attn_type", None) + args.pos_enc_type = getattr(args, "pos_enc_type", "abs") + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.input_channels = getattr(args, "input_channels", 1) + args.max_source_positions = getattr(args, "max_source_positions", 6000) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + args.encoder_layers = getattr(args, "encoder_layers", 16) + args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31) + s2spect_architecture_base(args) + + +@register_model_architecture("s2spect_conformer", "s2spect_conformer_fisher") +def s2spect_architecture_fisher(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + + # decoder + args.prenet_dim = getattr(args, "prenet_dim", 32) + + s2spect_conformer_architecture_base(args) diff --git a/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py b/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py new file mode 100644 index 0000000000..8016daee8d --- /dev/null +++ b/fairseq/models/speech_to_speech/s2s_conformer_translatotron2.py @@ -0,0 +1,262 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
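The architecture functions above rely on fairseq's getattr-defaulting idiom: anything already set on args (e.g. from the command line) is kept, and everything else falls back to the architecture's defaults before the base architecture is applied. A toy sketch, with a made-up namespace and values:

import argparse

def toy_conformer_architecture(args):
    # Defaults are only filled in when the attribute is absent.
    args.encoder_layers = getattr(args, "encoder_layers", 16)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)

args = argparse.Namespace(encoder_embed_dim=512)   # user override wins
toy_conformer_architecture(args)
assert (args.encoder_layers, args.encoder_embed_dim) == (16, 512)
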
+ +import copy +import logging + +from fairseq.models import ( + FairseqEncoderModel, + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder +from fairseq.models.speech_to_speech.modules.transformer_encoder import ( + TransformerEncoderNoEmb, +) +from fairseq.models.speech_to_speech.s2s_conformer import S2SpecTConformerModel +from fairseq.models.speech_to_speech.s2s_conformer_unity import ( + multitask_text_transformer_decoder_arch, +) +from fairseq.models.speech_to_speech.s2s_transformer import ( + base_multitask_text_transformer_decoder_arch, + s2spect_architecture_base, +) +from fairseq.models.text_to_speech import TTSTransformerDecoder +from fairseq.models.transformer import TransformerDecoder, TransformerModelBase + +logger = logging.getLogger(__name__) + + +@register_model("s2spect2_conformer") +class S2SpecT2ConformerModel(S2SpecTConformerModel): + """ + Direct speech-to-speech translation model with Conformer encoder + MT Transformer decoder + TTS Transformer decoder + """ + + @staticmethod + def add_args(parser): + S2SpecTConformerModel.add_args(parser) + parser.add_argument( + "--translation-decoder-layers", + type=int, + default=4, + metavar="N", + help="num decoder layers in the first-pass translation module", + ) + parser.add_argument( + "--synthesizer", + default="transformer", + choices=["transformer"], + help="", + ) + parser.add_argument( + "--synthesizer-encoder-layers", + type=int, + default=0, + metavar="N", + help="num encoder layers in the second-pass synthesizer module", + ) + + @classmethod + def build_multitask_decoder( + cls, + args, + tgt_dict, + in_dim, + is_mt_decoder, + decoder_layers, + decoder_embed_dim, + decoder_attention_heads, + ): + decoder_args = args.decoder_args + decoder_args.encoder_embed_dim = in_dim + if args.decoder_type == "transformer": + if is_mt_decoder: + multitask_text_transformer_decoder_arch( + decoder_args, + decoder_layers, + decoder_embed_dim, + decoder_attention_heads, + ) # 4L + else: + base_multitask_text_transformer_decoder_arch(decoder_args) # 2L + task_decoder = TransformerDecoder( + decoder_args, + tgt_dict, + embed_tokens=TransformerModelBase.build_embedding( + decoder_args, + tgt_dict, + decoder_args.decoder_embed_dim, + ), + ) + elif args.decoder_type == "ctc": + task_decoder = CTCDecoder( + dictionary=tgt_dict, + in_dim=in_dim, + ) + else: + raise NotImplementedError( + "currently only support multitask decoder_type 'transformer', 'ctc'" + ) + + return task_decoder + + @classmethod + def build_decoder(cls, args): + _args = copy.deepcopy(args) + _args.encoder_embed_dim = args.decoder_embed_dim + + if args.synthesizer == "transformer": + return TTSTransformerDecoder(_args, None, padding_idx=1) + else: + raise NotImplementedError(args.synthesizer) + + @classmethod + def build_model(cls, args, task): + encoder = cls.build_encoder(args) + decoder = cls.build_decoder(args) + base_model = cls(encoder, decoder) + + # set up multitask decoders + base_model.mt_task_name = None + base_model.multitask_decoders = {} + has_first_pass_decoder = False + for task_name, task_obj in task.multitask_tasks.items(): + if task_obj.is_first_pass_decoder: + has_first_pass_decoder = True + base_model.mt_task_name = task_name + + in_dim = ( + args.encoder_embed_dim + if task_obj.args.input_from == "encoder" + else args.decoder_embed_dim + ) + task_decoder = cls.build_multitask_decoder( + task_obj.args, + task_obj.target_dictionary, + in_dim, + 
task_obj.is_first_pass_decoder, + getattr(args, "translation_decoder_layers", 4), + getattr(args, "decoder_embed_dim", 256), + getattr(args, "decoder_attention_heads", 4), + ) + + setattr(base_model, f"{task_name}_decoder", task_decoder) + decoder_model_cls = ( + FairseqEncoderModel + if task_obj.args.decoder_type == "ctc" + else FairseqLanguageModel + ) + base_model.multitask_decoders[task_name] = decoder_model_cls( + getattr(base_model, f"{task_name}_decoder") + ) + + assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder" + + # set up encoder on top of the auxiliary MT decoder + if getattr(args, "synthesizer_encoder_layers", 0) > 0: + base_model.synthesizer_encoder = cls.build_text_encoder(args) + else: + base_model.synthesizer_encoder = None + + return base_model + + @classmethod + def build_text_encoder(cls, args): + _args = copy.deepcopy(args) + _args.encoder_layers = args.synthesizer_encoder_layers + _args.encoder_embed_dim = args.decoder_embed_dim + _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim + _args.encoder_attention_heads = args.decoder_attention_heads + _args.encoder_normalize_before = True + return TransformerEncoderNoEmb(_args) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + prev_output_tokens_mt, + tgt_speaker=None, + incremental_state=None, + target_lengths=None, + speaker=None, + return_all_hiddens=False, + ): + encoder_out = self.encoder( + src_tokens, + src_lengths=src_lengths, + tgt_speaker=tgt_speaker, + return_all_hiddens=return_all_hiddens, + ) + + # 1. MT decoder + mt_decoder = getattr(self, f"{self.mt_task_name}_decoder") + mt_decoder_out = mt_decoder( + prev_output_tokens_mt, + encoder_out=encoder_out, + ) + x = mt_decoder_out[1]["inner_states"][-1] + if mt_decoder.layer_norm is not None: + x = mt_decoder.layer_norm(x) + + mt_decoder_padding_mask = None + if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any(): + mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx) + + # 2. TTS encoder + if self.synthesizer_encoder is not None: + tts_encoder_out = self.synthesizer_encoder( + x, + mt_decoder_padding_mask, + return_all_hiddens=return_all_hiddens, + ) + else: + tts_encoder_out = { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [mt_decoder_padding_mask], # B x T + } + + # 3. 
TTS decoder + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=tts_encoder_out, + incremental_state=incremental_state, + target_lengths=target_lengths, + speaker=speaker, + ) + if return_all_hiddens: + decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"] + decoder_out[-1]["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ] + decoder_out[-1]["mt_decoder_out"] = mt_decoder_out + return decoder_out + + +@register_model_architecture( + model_name="s2spect2_conformer", arch_name="s2spect2_conformer" +) +def s2spect2_conformer_architecture_base(args): + args.conv_version = getattr(args, "conv_version", "convtransformer") + args.attn_type = getattr(args, "attn_type", None) + args.pos_enc_type = getattr(args, "pos_enc_type", "abs") + args.max_source_positions = getattr(args, "max_source_positions", 6000) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + args.encoder_layers = getattr(args, "encoder_layers", 16) + args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31) + s2spect_architecture_base(args) + + +# for old naming +@register_model_architecture( + model_name="s2spect2_conformer", arch_name="s2spect_conformer_translatotron2" +) +def s2spect2_conformer_architecture_base_legacy(args): + s2spect2_conformer_architecture_base(args) diff --git a/fairseq/models/speech_to_speech/s2s_conformer_unity.py b/fairseq/models/speech_to_speech/s2s_conformer_unity.py new file mode 100644 index 0000000000..64388d6d16 --- /dev/null +++ b/fairseq/models/speech_to_speech/s2s_conformer_unity.py @@ -0,0 +1,298 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
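# A plain-torch sketch of the two-pass data flow in S2SpecT2ConformerModel.forward
# above, with stand-in tensors instead of real fairseq modules; the shapes and the
# pass-through branch are the point, and nothing here is a fairseq API.
import torch

T_src, T_mt, B, C = 50, 12, 2, 256
speech_encoder_out = torch.randn(T_src, B, C)   # speech encoder states (T x B x C)

# 1st pass: the MT decoder attends over the speech encoder and emits text states;
# a random tensor stands in for mt_decoder_out[1]["inner_states"][-1].
mt_states = torch.randn(T_mt, B, C)

# 2nd pass input: either re-encode mt_states with the synthesizer encoder, or
# (when --synthesizer-encoder-layers is 0) pass them through unchanged.
tts_encoder_out = {
    "encoder_out": [mt_states],       # T x B x C
    "encoder_padding_mask": [None],   # B x T (None when nothing is padded)
}
print(speech_encoder_out.shape, tts_encoder_out["encoder_out"][0].shape)
# torch.Size([50, 2, 256]) torch.Size([12, 2, 256])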
+ +import copy +import logging + +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder +from fairseq.models.speech_to_speech.modules.stacked_embedding import StackedEmbedding +from fairseq.models.speech_to_speech.modules.transformer_decoder_aug import ( + AugTransformerUnitDecoder, +) +from fairseq.models.speech_to_speech.modules.transformer_encoder import ( + TransformerEncoderNoEmb, +) +from fairseq.models.speech_to_speech.s2s_conformer import S2UTConformerModel +from fairseq.models.speech_to_speech.s2s_transformer import ( + TransformerUnitDecoder, + base_multitask_text_transformer_decoder_arch, + s2ut_architecture_base, +) +from fairseq.models.transformer import TransformerDecoder, TransformerModelBase + +logger = logging.getLogger(__name__) + + +def multitask_text_transformer_decoder_arch( + args, decoder_layers, decoder_embed_dim=256, decoder_attention_heads=4 +): + args.decoder_layers = decoder_layers + args.decoder_embed_dim = decoder_embed_dim + args.decoder_attention_heads = decoder_attention_heads + base_multitask_text_transformer_decoder_arch(args) + + +@register_model("unity_conformer") +class UnityConformerModel(S2UTConformerModel): + """ + Direct speech-to-speech translation model with Conformer encoder + MT Transformer decoder + Transformer discrete unit decoder + """ + + @staticmethod + def add_args(parser): + S2UTConformerModel.add_args(parser) + parser.add_argument( + "--translation-decoder-layers", + type=int, + default=4, + metavar="N", + help="num decoder layers in the first-pass translation module", + ) + parser.add_argument( + "--synthesizer", + default="transformer", + choices=["transformer"], + help="", + ) + parser.add_argument( + "--synthesizer-encoder-layers", + type=int, + default=0, + metavar="N", + help="num encoder layers in the second-pass synthesizer module", + ) + parser.add_argument( + "--synthesizer-augmented-cross-attention", + action="store_true", + default=False, + help="augmented cross-attention over speech encoder output", + ) + + @classmethod + def build_multitask_decoder( + cls, + args, + tgt_dict, + in_dim, + is_first_pass_decoder, + decoder_layers, + decoder_embed_dim, + decoder_attention_heads, + ): + decoder_args = args.decoder_args + decoder_args.encoder_embed_dim = in_dim + if args.decoder_type == "transformer": + if is_first_pass_decoder: + multitask_text_transformer_decoder_arch( + decoder_args, + decoder_layers, + decoder_embed_dim, + decoder_attention_heads, + ) # 4L + else: + base_multitask_text_transformer_decoder_arch(decoder_args) # 2L + task_decoder = TransformerDecoder( + decoder_args, + tgt_dict, + embed_tokens=TransformerModelBase.build_embedding( + decoder_args, + tgt_dict, + decoder_args.decoder_embed_dim, + ), + ) + elif args.decoder_type == "ctc": + task_decoder = CTCDecoder( + dictionary=tgt_dict, + in_dim=in_dim, + ) + else: + raise NotImplementedError( + "currently only support multitask decoder_type 'transformer', 'ctc'" + ) + + return task_decoder + + @classmethod + def build_decoder(cls, args, tgt_dict, aug_attn=False): + num_embeddings = len(tgt_dict) + padding_idx = tgt_dict.pad() + embed_tokens = StackedEmbedding( + num_embeddings, + args.decoder_embed_dim, + padding_idx, + num_stacked=args.n_frames_per_step, + ) + + _args = copy.deepcopy(args) + _args.encoder_embed_dim = args.decoder_embed_dim + + decoder_cls = AugTransformerUnitDecoder if aug_attn 
else TransformerUnitDecoder + return decoder_cls( + _args, + tgt_dict, + embed_tokens, + ) + + @classmethod + def build_model(cls, args, task): + encoder = cls.build_encoder(args) + decoder = cls.build_decoder( + args, + task.target_dictionary, + aug_attn=getattr(args, "synthesizer_augmented_cross_attention", False), + ) + base_model = cls(encoder, decoder) + + base_model.t2u_augmented_cross_attn = getattr( + args, "synthesizer_augmented_cross_attention", False + ) + + # set up multitask decoders + base_model.mt_task_name = None + base_model.multitask_decoders = {} + has_first_pass_decoder = False + for task_name, task_obj in task.multitask_tasks.items(): + if task_obj.is_first_pass_decoder: + has_first_pass_decoder = True + base_model.mt_task_name = task_name + + in_dim = ( + args.encoder_embed_dim + if task_obj.args.input_from == "encoder" + else args.decoder_embed_dim + ) + task_decoder = cls.build_multitask_decoder( + task_obj.args, + task_obj.target_dictionary, + in_dim, + task_obj.is_first_pass_decoder, + getattr(args, "translation_decoder_layers", 4), + getattr(args, "decoder_embed_dim", 256), + getattr(args, "decoder_attention_heads", 4), + ) + + setattr(base_model, f"{task_name}_decoder", task_decoder) + decoder_model_cls = ( + FairseqEncoderModel + if task_obj.args.decoder_type == "ctc" + else FairseqLanguageModel + ) + base_model.multitask_decoders[task_name] = decoder_model_cls( + getattr(base_model, f"{task_name}_decoder") + ) + + assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder" + + # set up encoder on top of the auxiliary MT decoder + if getattr(args, "synthesizer_encoder_layers", 0) > 0: + base_model.synthesizer_encoder = cls.build_text_encoder(args) + else: + base_model.synthesizer_encoder = None + + return base_model + + @classmethod + def build_text_encoder(cls, args): + _args = copy.deepcopy(args) + _args.encoder_layers = args.synthesizer_encoder_layers + _args.encoder_embed_dim = args.decoder_embed_dim + _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim + _args.encoder_attention_heads = args.decoder_attention_heads + _args.encoder_normalize_before = True + return TransformerEncoderNoEmb(_args) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + prev_output_tokens_mt, + tgt_speaker=None, + return_all_hiddens=False, + ): + mt_decoder = getattr(self, f"{self.mt_task_name}_decoder") + + encoder_out = self.encoder( + src_tokens, + src_lengths=src_lengths, + tgt_speaker=tgt_speaker, + return_all_hiddens=return_all_hiddens, + ) + + # 1. MT decoder + mt_decoder_out = mt_decoder( + prev_output_tokens_mt, + encoder_out=encoder_out, + ) + x = mt_decoder_out[1]["inner_states"][-1] + if mt_decoder.layer_norm is not None: + x = mt_decoder.layer_norm(x) + + mt_decoder_padding_mask = None + if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any(): + mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx) + + # 2. T2U encoder + if self.synthesizer_encoder is not None: + t2u_encoder_out = self.synthesizer_encoder( + x, + mt_decoder_padding_mask, + return_all_hiddens=return_all_hiddens, + ) + else: + t2u_encoder_out = { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [mt_decoder_padding_mask], # B x T + } + + # 3. 
T2U decoder + if self.t2u_augmented_cross_attn: + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + encoder_out_aug=t2u_encoder_out, + ) + else: + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=t2u_encoder_out, + ) + if return_all_hiddens: + decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"] + decoder_out[-1]["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ] + decoder_out[-1]["mt_decoder_out"] = mt_decoder_out + return decoder_out + + +@register_model_architecture(model_name="unity_conformer", arch_name="unity_conformer") +def unity_conformer_architecture_base(args): + args.conv_version = getattr(args, "conv_version", "convtransformer") + args.attn_type = getattr(args, "attn_type", None) + args.pos_enc_type = getattr(args, "pos_enc_type", "abs") + args.max_source_positions = getattr(args, "max_source_positions", 6000) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + args.encoder_layers = getattr(args, "encoder_layers", 16) + args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31) + s2ut_architecture_base(args) + + +# for old naming +@register_model_architecture( + model_name="unity_conformer", arch_name="s2ut_conformer_translatotron2" +) +def unity_conformer_architecture_base_legacy(args): + unity_conformer_architecture_base(args) diff --git a/fairseq/models/speech_to_speech/s2s_transformer.py b/fairseq/models/speech_to_speech/s2s_transformer.py new file mode 100644 index 0000000000..07393d2598 --- /dev/null +++ b/fairseq/models/speech_to_speech/s2s_transformer.py @@ -0,0 +1,722 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
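# A minimal sketch of how the first-pass text decoder in unity_conformer above is
# sized: multitask_text_transformer_decoder_arch pins the layer count and width of
# the first-pass MT decoder, while auxiliary multitask decoders fall back to the
# 2-layer base configuration. Assumes a fairseq checkout with this patch applied.
from argparse import Namespace

from fairseq.models.speech_to_speech.s2s_conformer_unity import (
    multitask_text_transformer_decoder_arch,
)

dec_cfg = Namespace()
multitask_text_transformer_decoder_arch(dec_cfg, decoder_layers=4)
print(dec_cfg.decoder_layers, dec_cfg.decoder_embed_dim, dec_cfg.decoder_attention_heads)
# expected: 4 256 4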
+ +import logging +from pathlib import Path +from typing import Any, Dict, List, Optional + +import torch +from torch import Tensor + +from fairseq import checkpoint_utils, utils +from fairseq.models import ( + FairseqEncoderDecoderModel, + FairseqEncoderModel, + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder +from fairseq.models.speech_to_speech.modules.stacked_embedding import StackedEmbedding +from fairseq.models.speech_to_text import S2TTransformerEncoder +from fairseq.models.text_to_speech import TTSTransformerDecoder +from fairseq.models.transformer import Linear, TransformerDecoder, TransformerModelBase + +logger = logging.getLogger(__name__) + + +class S2STransformerEncoder(S2TTransformerEncoder): + """Based on S2T transformer encoder, with support + to incorporate target speaker embedding.""" + + def __init__(self, args): + super().__init__(args) + + self.spk_emb_proj = None + if args.target_speaker_embed: + self.spk_emb_proj = Linear( + args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim + ) + + def forward( + self, src_tokens, src_lengths, tgt_speaker=None, return_all_hiddens=False + ): + out = super().forward(src_tokens, src_lengths, return_all_hiddens) + + if self.spk_emb_proj: + x = out["encoder_out"][0] + seq_len, bsz, _ = x.size() + tgt_speaker_emb = tgt_speaker.view(1, bsz, -1).expand(seq_len, bsz, -1) + x = self.spk_emb_proj(torch.cat([x, tgt_speaker_emb], dim=2)) + out["encoder_out"][0] = x + + return out + + +class TransformerUnitDecoder(TransformerDecoder): + """Based on Transformer decoder, with support to decoding stacked units""" + + def __init__( + self, + args, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + super().__init__( + args, dictionary, embed_tokens, no_encoder_attn, output_projection + ) + self.n_frames_per_step = args.n_frames_per_step + + self.out_proj_n_frames = ( + Linear( + self.output_embed_dim, + self.output_embed_dim * self.n_frames_per_step, + bias=False, + ) + if self.n_frames_per_step > 1 + else None + ) + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention, should be of size T x B x C + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). 
+ + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + + if not features_only: + bsz, seq_len, d = x.size() + if self.out_proj_n_frames: + x = self.out_proj_n_frames(x) + x = self.output_layer(x.view(bsz, seq_len, self.n_frames_per_step, d)) + x = x.view(bsz, seq_len * self.n_frames_per_step, -1) + if ( + incremental_state is None and self.n_frames_per_step > 1 + ): # teacher-forcing mode in training + x = x[ + :, : -(self.n_frames_per_step - 1), : + ] # remove extra frames after <eos> + + return x, extra + + def upgrade_state_dict_named(self, state_dict, name): + if self.n_frames_per_step > 1: + move_keys = [ + ( + f"{name}.project_in_dim.weight", + f"{name}.embed_tokens.project_in_dim.weight", + ) + ] + for from_k, to_k in move_keys: + if from_k in state_dict and to_k not in state_dict: + state_dict[to_k] = state_dict[from_k] + del state_dict[from_k] + + +class S2STransformerMultitaskModelBase(FairseqEncoderDecoderModel): + @classmethod + def build_encoder(cls, args): + encoder = S2STransformerEncoder(args) + pretraining_path = getattr(args, "load_pretrained_encoder_from", None) + if pretraining_path is not None: + if not Path(pretraining_path).exists(): + logger.warning( + f"skipped pretraining because {pretraining_path} does not exist" + ) + else: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=pretraining_path + ) + logger.info(f"loaded pretrained encoder from: {pretraining_path}") + return encoder + + @classmethod + def build_multitask_decoder(cls, args, tgt_dict, in_dim): + decoder_args = args.decoder_args + decoder_args.encoder_embed_dim = in_dim + if args.decoder_type == "transformer": + base_multitask_text_transformer_decoder_arch(decoder_args) + task_decoder = TransformerDecoder( + decoder_args, + tgt_dict, + embed_tokens=TransformerModelBase.build_embedding( + decoder_args, + tgt_dict, + decoder_args.decoder_embed_dim, + ), + ) + elif args.decoder_type == "ctc": + task_decoder = CTCDecoder( + dictionary=tgt_dict, + in_dim=in_dim, + ) + else: + raise NotImplementedError( + "currently only support multitask decoder_type 'transformer', 'ctc'" + ) + + return task_decoder + + @classmethod + def build_model(cls, args, task): + encoder = cls.build_encoder(args) + decoder = ( + cls.build_decoder(args, task.target_dictionary) + if task.args.target_is_code + else cls.build_decoder(args) + ) + base_model = cls(encoder, decoder) + + # set up multitask decoders + base_model.multitask_decoders = {} + for task_name, task_obj in task.multitask_tasks.items(): + in_dim = ( + args.encoder_embed_dim + if task_obj.args.input_from == "encoder" + else args.decoder_embed_dim + ) + task_decoder = cls.build_multitask_decoder( + task_obj.args, task_obj.target_dictionary, in_dim + ) + + setattr(base_model, f"{task_name}_decoder", task_decoder) + decoder_model_cls = ( + FairseqEncoderModel + if task_obj.args.decoder_type == "ctc" + else FairseqLanguageModel + ) + base_model.multitask_decoders[task_name] = decoder_model_cls( + getattr(base_model, f"{task_name}_decoder") + ) + + return base_model + + def forward_encoder(self, src_tokens, src_lengths, speaker=None, **kwargs): + return self.encoder( + src_tokens, 
src_lengths=src_lengths, tgt_speaker=speaker, **kwargs + ) + + +@register_model("s2ut_transformer") +class S2UTTransformerModel(S2STransformerMultitaskModelBase): + """ + Direct speech-to-speech translation model with Transformer encoder + Transformer discrete unit decoder + https://arxiv.org/abs/2107.05604 + """ + + @staticmethod + def add_args(parser): + # input + parser.add_argument( + "--conv-kernel-sizes", + type=str, + metavar="STR", + help="kernel sizes of Conv1d (s2t_transformer) subsampling layers", + ) + parser.add_argument( + "--conv-channels", + type=int, + metavar="N", + help="# of channels in Conv1d (s2t_transformer) subsampling layers", + ) + parser.add_argument( + "--conv-out-channels", + type=int, + metavar="N", + help="# of channels in Conv2d (convtransformer) subsampling layers", + ) + parser.add_argument( + "--conv-version", + type=str, + default="s2t_transformer", + choices=["s2t_transformer", "convtransformer"], + help="version of frontend convolutional layers", + ) + # Transformer + parser.add_argument( + "--activation-fn", + type=str, + default="relu", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="num encoder layers" + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--decoder-normalize-before", + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--share-decoder-input-output-embed", + action="store_true", + help="share decoder input and output embeddings", + ) + parser.add_argument( + "--layernorm-embedding", + action="store_true", + help="add layernorm to embedding", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + parser.add_argument( + "--load-pretrained-encoder-from", + type=str, + metavar="STR", + help="model to take encoder weights from (for initialization)", + ) + parser.add_argument( + "--encoder-freezing-updates", + type=int, + metavar="N", + help="freeze encoder for first N updates", + ) + # speaker + parser.add_argument( + "--speaker-embed-dim", + type=int, + metavar="N", + help="speaker embedding dimension", 
+ ) + + @classmethod + def build_decoder(cls, args, tgt_dict): + num_embeddings = len(tgt_dict) + padding_idx = tgt_dict.pad() + embed_tokens = StackedEmbedding( + num_embeddings, + args.decoder_embed_dim, + padding_idx, + num_stacked=args.n_frames_per_step, + ) + + return TransformerUnitDecoder( + args, + tgt_dict, + embed_tokens, + ) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + tgt_speaker=None, + return_all_hiddens=False, + ): + encoder_out = self.encoder( + src_tokens, + src_lengths=src_lengths, + tgt_speaker=tgt_speaker, + return_all_hiddens=return_all_hiddens, + ) + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + ) + if return_all_hiddens: + decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"] + decoder_out[-1]["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ] + return decoder_out + + +@register_model("s2spect_transformer") +class S2SpecTTransformerModel(S2STransformerMultitaskModelBase): + """ + Speech-to-spectrogram model with S2T Transformer encoder + TTS Transformer decoder + """ + + @staticmethod + def add_args(parser): + # input + parser.add_argument( + "--conv-kernel-sizes", + type=str, + metavar="STR", + help="kernel sizes of Conv1d (s2t_transformer) subsampling layers", + ) + parser.add_argument( + "--conv-channels", + type=int, + metavar="N", + help="# of channels in Conv1d (s2t_transformer) subsampling layers", + ) + parser.add_argument( + "--conv-version", + type=str, + default="s2t_transformer", + choices=["s2t_transformer", "convtransformer"], + help="version of frontend convolutional layers", + ) + # Transformer + parser.add_argument( + "--activation-fn", + type=str, + default="relu", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="num encoder layers" + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + parser.add_argument( + "--load-pretrained-encoder-from", + type=str, + metavar="STR", + help="model to take encoder weights from (for initialization)", + ) + parser.add_argument( + "--encoder-freezing-updates", + type=int, + metavar="N", + help="freeze encoder for first N updates", + ) + # speaker + parser.add_argument( + "--speaker-embed-dim", + type=int, + metavar="N", + help="speaker embedding dimension", + ) + # decoder + parser.add_argument("--output-frame-dim", type=int) + # decoder prenet + parser.add_argument("--prenet-dropout", type=float) + parser.add_argument("--prenet-layers", type=int) + 
parser.add_argument("--prenet-dim", type=int) + # decoder postnet + parser.add_argument("--postnet-dropout", type=float) + parser.add_argument("--postnet-layers", type=int) + parser.add_argument("--postnet-conv-dim", type=int) + parser.add_argument("--postnet-conv-kernel-size", type=int) + # decoder transformer layers + parser.add_argument("--decoder-transformer-layers", type=int) + parser.add_argument("--decoder-embed-dim", type=int) + parser.add_argument("--decoder-ffn-embed-dim", type=int) + parser.add_argument("--decoder-normalize-before", action="store_true") + parser.add_argument("--decoder-attention-heads", type=int) + + @classmethod + def build_decoder(cls, args): + return TTSTransformerDecoder(args, None, padding_idx=1) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + tgt_speaker=None, + incremental_state=None, + target_lengths=None, + speaker=None, + return_all_hiddens=False, + ): + encoder_out = self.encoder( + src_tokens, + src_lengths=src_lengths, + tgt_speaker=tgt_speaker, + return_all_hiddens=return_all_hiddens, + ) + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + target_lengths=target_lengths, + speaker=speaker, + ) + if return_all_hiddens: + decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"] + decoder_out[-1]["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ] + return decoder_out + + +def base_multitask_text_transformer_decoder_arch(args): + args.dropout = getattr(args, "dropout", 0.3) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", True + ) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.max_target_positions = getattr(args, "max_target_positions", 1024) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + + args.adaptive_input = getattr(args, "adaptive_input", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + + args.decoder_layers = getattr(args, "decoder_layers", 2) + + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + + # decoder layer + args.activation_dropout = getattr(args, "activation_dropout", args.dropout) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) + + args.attention_dropout = getattr(args, "attention_dropout", args.dropout) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + + +def base_s2st_transformer_encoder_architecture(args): + args.encoder_freezing_updates = getattr(args, "encoder_freezing_updates", 0) + + # Convolutional subsampler + args.input_channels = getattr(args, "input_channels", 1) + args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5") # for Conv1d + args.conv_channels = getattr(args, "conv_channels", 1024) # for Conv1d + args.conv_out_channels = getattr(args, "conv_out_channels", 256) # for Conv2d + args.conv_version = getattr(args, "conv_version", 
"s2t_transformer") + # Transformer + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 12) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", args.dropout) + args.activation_dropout = getattr(args, "activation_dropout", args.dropout) + args.activation_fn = getattr(args, "activation_fn", "relu") + + args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 256) + + +@register_model_architecture( + model_name="s2ut_transformer", arch_name="s2ut_transformer" +) +def s2ut_architecture_base(args): + base_s2st_transformer_encoder_architecture(args) + + # decoder + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + + +@register_model_architecture("s2ut_transformer", "s2ut_transformer_fisher") +def s2ut_architecture_fisher(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + + s2ut_architecture_base(args) + + +@register_model_architecture( + model_name="s2spect_transformer", arch_name="s2spect_transformer" +) +def s2spect_architecture_base(args): + base_s2st_transformer_encoder_architecture(args) + + # decoder + args.output_frame_dim = getattr(args, "output_frame_dim", 80) + # decoder prenet + args.prenet_dropout = getattr(args, "prenet_dropout", 0.5) + args.prenet_layers = getattr(args, "prenet_layers", 2) + args.prenet_dim = getattr(args, "prenet_dim", 256) + # decoder postnet + args.postnet_dropout = getattr(args, "postnet_dropout", 0.5) + args.postnet_layers = getattr(args, "postnet_layers", 5) + args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512) + args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5) + # decoder transformer layers + args.decoder_transformer_layers = getattr(args, "decoder_transformer_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr( + args, 
"decoder_ffn_embed_dim", 4 * args.decoder_embed_dim + ) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + + +@register_model_architecture("s2spect_transformer", "s2spect_transformer_fisher") +def s2spect_architecture_fisher(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + + # decoder + args.prenet_dim = getattr(args, "prenet_dim", 32) + + s2spect_architecture_base(args) diff --git a/fairseq/models/speech_to_text/__init__.py b/fairseq/models/speech_to_text/__init__.py index 5d7f59b3a6..62ef663efb 100644 --- a/fairseq/models/speech_to_text/__init__.py +++ b/fairseq/models/speech_to_text/__init__.py @@ -4,4 +4,10 @@ # LICENSE file in the root directory of this source tree. from .berard import * # noqa +from .convtransformer import * # noqa +from .multi_modality_model import * # noqa +from .s2t_conformer import * # noqa from .s2t_transformer import * # noqa +from .s2t_wav_transformer import * # noqa +from .xm_transformer import * # noqa +from .xm_transformer_unity import * # noqa diff --git a/fairseq/models/speech_to_text/berard.py b/fairseq/models/speech_to_text/berard.py index c505e3acaa..107ac983c6 100644 --- a/fairseq/models/speech_to_text/berard.py +++ b/fairseq/models/speech_to_text/berard.py @@ -6,6 +6,7 @@ import torch import torch.nn as nn import torch.nn.functional as F + from fairseq import checkpoint_utils, utils from fairseq.data.data_utils import lengths_to_padding_mask from fairseq.models import ( @@ -131,7 +132,7 @@ def build_encoder(cls, args, task): lstm_size=args.lstm_size, dropout=args.dropout, ) - if getattr(args, "load_pretrained_encoder_from", None): + if getattr(args, "load_pretrained_encoder_from", None) is not None: encoder = checkpoint_utils.load_pretrained_component_from_model( component=encoder, checkpoint=args.load_pretrained_encoder_from ) @@ -149,7 +150,7 @@ def build_decoder(cls, args, task): attention_dim=args.attention_dim, output_layer_dim=args.output_layer_dim, ) - if getattr(args, "load_pretrained_decoder_from", None): + if getattr(args, "load_pretrained_decoder_from", None) is not None: decoder = checkpoint_utils.load_pretrained_component_from_model( component=decoder, checkpoint=args.load_pretrained_decoder_from ) diff --git a/fairseq/models/speech_to_text/convtransformer.py b/fairseq/models/speech_to_text/convtransformer.py new file mode 100644 index 0000000000..4d0fc02aee --- /dev/null +++ b/fairseq/models/speech_to_text/convtransformer.py @@ -0,0 +1,443 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import math +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from fairseq import checkpoint_utils, utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderDecoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_text.modules.convolution import infer_conv_output_dim +from fairseq.models.transformer import Embedding, TransformerDecoder +from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerEncoderLayer + +logger = logging.getLogger(__name__) + + +@register_model("convtransformer") +class ConvTransformerModel(FairseqEncoderDecoderModel): + """ + Transformer-based Speech translation model from ESPNet-ST + https://arxiv.org/abs/2004.10234 + """ + + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--input-feat-per-channel", + type=int, + metavar="N", + help="encoder input dimension per input channel", + ) + parser.add_argument( + "--activation-fn", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="num encoder layers" + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--decoder-normalize-before", + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--decoder-output-dim", + type=int, + metavar="N", + help="decoder output dimension (extra linear layer if different from decoder embed dim)", + ) + parser.add_argument( + "--share-decoder-input-output-embed", + action="store_true", + help="share decoder input and output embeddings", + ) + parser.add_argument( + "--layernorm-embedding", + action="store_true", + help="add layernorm to embedding", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + parser.add_argument( + "--load-pretrained-encoder-from", + type=str, + 
metavar="STR", + help="model to take encoder weights from (for initialization)", + ) + parser.add_argument( + "--load-pretrained-decoder-from", + type=str, + metavar="STR", + help="model to take decoder weights from (for initialization)", + ) + parser.add_argument( + "--conv-out-channels", + type=int, + metavar="INT", + help="the number of output channels of conv layer", + ) + + @classmethod + def build_encoder(cls, args): + encoder = ConvTransformerEncoder(args) + if getattr(args, "load_pretrained_encoder_from", None) is not None: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=args.load_pretrained_encoder_from + ) + return encoder + + @classmethod + def build_decoder(cls, args, task, embed_tokens): + decoder = TransformerDecoderNoExtra(args, task.target_dictionary, embed_tokens) + if getattr(args, "load_pretrained_decoder_from", None) is not None: + decoder = checkpoint_utils.load_pretrained_component_from_model( + component=decoder, checkpoint=args.load_pretrained_decoder_from + ) + return decoder + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + def build_embedding(dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + return Embedding(num_embeddings, embed_dim, padding_idx) + + decoder_embed_tokens = build_embedding( + task.target_dictionary, args.decoder_embed_dim + ) + encoder = cls.build_encoder(args) + decoder = cls.build_decoder(args, task, decoder_embed_tokens) + return cls(encoder, decoder) + + @staticmethod + @torch.jit.unused + def set_batch_first(lprobs): + lprobs.batch_first = True + + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + # net_output['encoder_out'] is a (B, T, D) tensor + lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample) + if self.training: + self.set_batch_first(lprobs) + return lprobs + + def output_layout(self): + return "BTD" + + """ + The forward method inherited from the base class has a **kwargs argument in + its input, which is not supported in torchscript. This method overrites the forward + method definition without **kwargs. 
+ """ + + def forward(self, src_tokens, src_lengths, prev_output_tokens): + encoder_out = self.encoder(src_tokens=src_tokens, src_lengths=src_lengths) + decoder_out = self.decoder( + prev_output_tokens=prev_output_tokens, encoder_out=encoder_out + ) + return decoder_out + + +class ConvTransformerEncoder(FairseqEncoder): + """Conv + Transformer encoder""" + + def __init__(self, args): + """Construct an Encoder object.""" + super().__init__(None) + + self.dropout = args.dropout + self.embed_scale = ( + 1.0 if args.no_scale_embedding else math.sqrt(args.encoder_embed_dim) + ) + self.padding_idx = 1 + self.in_channels = 1 + self.input_dim = args.input_feat_per_channel + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, args.conv_out_channels, 3, stride=2, padding=3 // 2), + torch.nn.ReLU(), + torch.nn.Conv2d( + args.conv_out_channels, + args.conv_out_channels, + 3, + stride=2, + padding=3 // 2, + ), + torch.nn.ReLU(), + ) + transformer_input_dim = infer_conv_output_dim( + self.in_channels, self.input_dim, args.conv_out_channels + ) + self.out = torch.nn.Linear(transformer_input_dim, args.encoder_embed_dim) + self.embed_positions = PositionalEmbedding( + args.max_source_positions, + args.encoder_embed_dim, + self.padding_idx, + learned=False, + ) + + self.transformer_layers = nn.ModuleList([]) + self.transformer_layers.extend( + [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] + ) + if args.encoder_normalize_before: + self.layer_norm = LayerNorm(args.encoder_embed_dim) + else: + self.layer_norm = None + + def pooling_ratio(self): + return 4 + + def forward(self, src_tokens, src_lengths): + """Encode input sequence. + :param torch.Tensor xs: input tensor + :param torch.Tensor masks: input mask + :return: position embedded tensor and mask + :rtype Tuple[torch.Tensor, torch.Tensor]: + """ + bsz, max_seq_len, _ = src_tokens.size() + x = ( + src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) + .transpose(1, 2) + .contiguous() + ) + x = self.conv(x) + bsz, _, output_seq_len, _ = x.size() + x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1) + x = self.out(x) + x = self.embed_scale * x + + subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5) + input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long() + input_len_1 = x.size(0) * torch.ones([src_lengths.size(0)]).long().to( + input_len_0.device + ) + input_lengths = torch.min(input_len_0, input_len_1) + + encoder_padding_mask = lengths_to_padding_mask(input_lengths) + + positions = self.embed_positions(encoder_padding_mask).transpose(0, 1) + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + for layer in self.transformer_layers: + x = layer(x, encoder_padding_mask) + + if not encoder_padding_mask.any(): + maybe_encoder_padding_mask = None + else: + maybe_encoder_padding_mask = encoder_padding_mask + + return { + "encoder_out": [x], + "encoder_padding_mask": [maybe_encoder_padding_mask] + if maybe_encoder_padding_mask is not None + else [], + "encoder_embedding": [], + "encoder_states": [], + "src_tokens": [], + "src_lengths": [], + } + + @torch.jit.export + def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): + """ + Reorder encoder output according to *new_order*. 
+ + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)] + if len(encoder_out["encoder_padding_mask"]) == 0: + new_encoder_padding_mask = [] + else: + new_encoder_padding_mask = [ + (encoder_out["encoder_padding_mask"][0]).index_select(0, new_order) + ] + if len(encoder_out["encoder_embedding"]) == 0: + new_encoder_embedding = [] + else: + new_encoder_embedding = [ + (encoder_out["encoder_embedding"][0]).index_select(0, new_order) + ] + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, + "encoder_padding_mask": new_encoder_padding_mask, + "encoder_embedding": new_encoder_embedding, + "encoder_states": encoder_states, + "src_tokens": [], + "src_lengths": [], + } + + +class TransformerDecoderNoExtra(TransformerDecoder): + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + # call scriptable method from parent class + x, _ = self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + return x, None + + +@register_model_architecture(model_name="convtransformer", arch_name="convtransformer") +def base_architecture(args): + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = 
getattr(args, "decoder_input_dim", args.decoder_embed_dim) + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + args.max_source_positions = getattr(args, "max_source_positions", 3000) + args.max_target_positions = getattr(args, "max_target_positions", 1024) + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.conv_out_channels = getattr(args, "conv_out_channels", args.encoder_embed_dim) + + +@register_model_architecture("convtransformer", "convtransformer_espnet") +def convtransformer_espnet(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_layers = getattr(args, "encoder_layers", 12) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) diff --git a/fairseq/models/speech_to_text/hub_interface.py b/fairseq/models/speech_to_text/hub_interface.py new file mode 100644 index 0000000000..d78427f687 --- /dev/null +++ b/fairseq/models/speech_to_text/hub_interface.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from argparse import Namespace +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import fairseq.data.audio.feature_transforms.utterance_cmvn as utt_cmvn +from fairseq.data import encoders +from fairseq.data.audio.audio_utils import convert_waveform as convert_wav +from fairseq.data.audio.audio_utils import get_fbank +from fairseq.data.audio.audio_utils import get_waveform as get_wav +from fairseq.data.audio.speech_to_text_dataset import SpeechToTextDataset + +logger = logging.getLogger(__name__) + + +class S2THubInterface(nn.Module): + def __init__(self, cfg, task, model): + super().__init__() + self.cfg = cfg + self.task = task + self.model = model + self.model.eval() + self.generator = self.task.build_generator([self.model], self.cfg.generation) + + @classmethod + def get_model_input(cls, task, audio: Union[str, torch.Tensor]): + input_type = task.data_cfg.hub.get("input_type", "fbank80") + if input_type == "fbank80_w_utt_cmvn": + if isinstance(audio, str): + feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio)) + feat = feat.unsqueeze(0) # T x D -> 1 x T x D + else: + import torchaudio.compliance.kaldi as kaldi + + feat = kaldi.fbank(audio, num_mel_bins=80).numpy() # 1 x T x D + elif input_type in {"waveform", "standardized_waveform"}: + if isinstance(audio, str): + feat, sr = get_wav(audio) # C x T + feat, _ = convert_wav( + feat, sr, to_sample_rate=16_000, to_mono=True + ) # C x T -> 1 x T + else: + feat = audio.numpy() + else: + raise ValueError(f"Unknown value: input_type = {input_type}") + + src_lengths = torch.Tensor([feat.shape[1]]).long() + src_tokens = torch.from_numpy(feat) # 1 x T (x D) + if input_type == "standardized_waveform": + with torch.no_grad(): + src_tokens = F.layer_norm(src_tokens, src_tokens.shape) + + return { + "net_input": { + "src_tokens": src_tokens, + "src_lengths": src_lengths, + "prev_output_tokens": None, + }, + "target_lengths": None, + "speaker": None, + } + + @classmethod + def detokenize(cls, task, tokens): + text = task.tgt_dict.string(tokens) + tkn_cfg = task.data_cfg.bpe_tokenizer + tokenizer = encoders.build_bpe(Namespace(**tkn_cfg)) + return text if tokenizer is None else 
tokenizer.decode(text) + + @classmethod + def get_prefix_token(cls, task, lang): + prefix_size = int(task.data_cfg.prepend_tgt_lang_tag) + prefix_tokens = None + if prefix_size > 0: + assert lang is not None + lang_tag = SpeechToTextDataset.get_lang_tag_idx(lang, task.tgt_dict) + prefix_tokens = torch.Tensor([lang_tag]).long().unsqueeze(0) + return prefix_tokens + + @classmethod + def get_prediction( + cls, task, model, generator, sample, tgt_lang=None, synthesize_speech=False + ) -> Union[str, Tuple[str, Tuple[torch.Tensor, int]]]: + _tgt_lang = tgt_lang or task.data_cfg.hub.get("tgt_lang", None) + prefix = cls.get_prefix_token(task, _tgt_lang) + pred_tokens = generator.generate([model], sample, prefix_tokens=prefix) + pred = cls.detokenize(task, pred_tokens[0][0]["tokens"]) + eos_token = task.data_cfg.config.get("eos_token", None) + if eos_token: + pred = " ".join(pred.split(" ")[:-1]) + + if synthesize_speech: + pfx = f"{_tgt_lang}_" if task.data_cfg.prepend_tgt_lang_tag else "" + tts_model_id = task.data_cfg.hub.get(f"{pfx}tts_model_id", None) + speaker = task.data_cfg.hub.get(f"{pfx}speaker", None) + if tts_model_id is None: + logger.warning("TTS model configuration not found") + else: + _repo, _id = tts_model_id.split(":") + tts_model = torch.hub.load(_repo, _id, verbose=False) + pred = (pred, tts_model.predict(pred, speaker=speaker)) + return pred + + def predict( + self, + audio: Union[str, torch.Tensor], + tgt_lang: Optional[str] = None, + synthesize_speech: bool = False, + ) -> Union[str, Tuple[str, Tuple[torch.Tensor, int]]]: + # `audio` is either a file path or a 1xT Tensor + # return either text or (text, synthetic speech) + sample = self.get_model_input(self.task, audio) + return self.get_prediction( + self.task, + self.model, + self.generator, + sample, + tgt_lang=tgt_lang, + synthesize_speech=synthesize_speech, + ) diff --git a/fairseq/models/speech_to_text/modules/__init__.py b/fairseq/models/speech_to_text/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fairseq/models/speech_to_text/modules/augmented_memory_attention.py b/fairseq/models/speech_to_text/modules/augmented_memory_attention.py new file mode 100644 index 0000000000..2d330f96f6 --- /dev/null +++ b/fairseq/models/speech_to_text/modules/augmented_memory_attention.py @@ -0,0 +1,487 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
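# A hedged sketch of driving S2THubInterface above from a local checkpoint rather
# than torch.hub. The paths are placeholders, and the data directory is assumed to
# contain a config.yaml whose "hub" section defines input_type / tgt_lang as
# expected by get_model_input and get_prediction.
from fairseq import checkpoint_utils
from fairseq.models.speech_to_text.hub_interface import S2THubInterface

models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    ["/path/to/s2t_checkpoint.pt"],                 # placeholder checkpoint
    arg_overrides={"data": "/path/to/data_root"},   # placeholder dir with config.yaml
)
hub = S2THubInterface(cfg, task, models[0])
print(hub.predict("/path/to/utterance.wav"))        # decoded text for the audio file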
+ +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from fairseq.models import FairseqEncoder +from fairseq.models.speech_to_text import ConvTransformerEncoder +from fairseq.models.speech_to_text.utils import ( + attention_suppression, + lengths_to_encoder_padding_mask, + segments_to_sequence, + sequence_to_segments, +) +from fairseq.modules import MultiheadAttention, TransformerEncoderLayer + +# ------------------------------------------------------------------------------ +# AugmentedMemoryConvTransformerEncoder +# ------------------------------------------------------------------------------ + + +class AugmentedMemoryConvTransformerEncoder(ConvTransformerEncoder): + def __init__(self, args): + super().__init__(args) + + args.encoder_stride = self.stride() + + self.left_context = args.left_context // args.encoder_stride + + self.right_context = args.right_context // args.encoder_stride + + self.left_context_after_stride = args.left_context // args.encoder_stride + self.right_context_after_stride = args.right_context // args.encoder_stride + + self.transformer_layers = nn.ModuleList([]) + self.transformer_layers.extend( + [ + AugmentedMemoryTransformerEncoderLayer(args) + for i in range(args.encoder_layers) + ] + ) + + def stride(self): + # Hard coded here. Should infer from convs in future + stride = 4 + return stride + + def forward(self, src_tokens, src_lengths, states=None): + """Encode input sequence. + :param torch.Tensor xs: input tensor + :param torch.Tensor masks: input mask + :return: position embedded tensor and mask + :rtype Tuple[torch.Tensor, torch.Tensor]: + """ + bsz, max_seq_len, _ = src_tokens.size() + x = ( + src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) + .transpose(1, 2) + .contiguous() + ) + x = self.conv(x) + bsz, _, output_seq_len, _ = x.size() + x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1) + x = self.out(x) + x = self.embed_scale * x + + subsampling_factor = 1.0 * max_seq_len / output_seq_len + input_lengths = torch.max( + (src_lengths.float() / subsampling_factor).ceil().long(), + x.size(0) * src_lengths.new_ones([src_lengths.size(0)]).long(), + ) + + encoder_padding_mask, _ = lengths_to_encoder_padding_mask( + input_lengths, batch_first=True + ) + + # TODO: fix positional embedding + positions = self.embed_positions(encoder_padding_mask).transpose(0, 1) + + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # State to store memory banks etc. 
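+        # Note: `states` is a per-layer list of dicts, one entry per transformer layer:
+        #   {"memory_banks": list of memory-bank tensors appended by that layer's
+        #    attention, "encoder_states": that layer's output for the current
+        #    segment with the left/right context stripped}.
+        # It is created lazily below on the first segment and carried across
+        # segments, e.g. by SequenceEncoder.incremental_encode().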
+ if states is None: + states = [ + {"memory_banks": None, "encoder_states": None} + for i in range(len(self.transformer_layers)) + ] + + for i, layer in enumerate(self.transformer_layers): + # x size: + # (self.left_size + self.segment_size + self.right_size) + # / self.stride, num_heads, dim + # TODO: Consider mask here + x = layer(x, states[i]) + states[i]["encoder_states"] = x[ + self.left_context_after_stride : -self.right_context_after_stride + ] + + lengths = ( + ( + ~encoder_padding_mask[ + :, self.left_context_after_stride : -self.right_context_after_stride + ] + ) + .sum(dim=1, keepdim=True) + .long() + ) + + return states[-1]["encoder_states"], lengths, states + + +# ------------------------------------------------------------------------------ +# AugmentedMemoryTransformerEncoderLayer +# ------------------------------------------------------------------------------ +class AugmentedMemoryTransformerEncoderLayer(TransformerEncoderLayer): + def __init__(self, args): + super().__init__(args) + + self.left_context = args.left_context // args.encoder_stride + self.right_context = args.right_context // args.encoder_stride + + def forward(self, x, state): + + length, batch_size, x_dim = x.size() + + residual = x + + if self.normalize_before: + x = self.self_attn_layer_norm(x) + + # init_state + if state.get("memory_banks", None) is None: + state["memory_banks"] = [] + + # TODO reseach new sum_query method + seg_start = self.left_context + seg_end = length - self.right_context + if seg_start < seg_end: + summarization_query = torch.mean(x[seg_start:seg_end], keepdim=True, dim=0) + else: + summarization_query = x.new_zeros(1, batch_size, x_dim) + + x = torch.cat([x, summarization_query], dim=0) + + x = self.self_attn(input_and_summary=x, state=state) + + x = self.dropout_module(x) + x = residual + x + + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = residual + x + if not self.normalize_before: + x = self.final_layer_norm(x) + + return x + + def build_self_attention(self, embed_dim, args): + return AugmentedMemoryMultiheadAttention( + embed_dim=embed_dim, + num_heads=args.encoder_attention_heads, + dropout=args.attention_dropout, + self_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + tanh_on_mem=True, + max_memory_size=args.max_memory_size, + ) + + +# ------------------------------------------------------------------------------ +# AugmentedMemoryMultiheadAttention +# ------------------------------------------------------------------------------ +class AugmentedMemoryMultiheadAttention(MultiheadAttention): + """ + Augmented Memory Attention from + Streaming Transformer-based Acoustic Models + Using Self-attention with Augmented Memory + https://arxiv.org/abs/2005.08042 + """ + + def __init__( + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + q_noise=0.0, + qn_block_size=8, + tanh_on_mem=False, + memory_dim=None, + std_scale=0.5, # 0.5 based on https://arxiv.org/abs/2005.09137 + max_memory_size=-1, + disable_mem_on_mem_attn=True, + ): + super().__init__( + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + self_attention, + 
encoder_decoder_attention, + q_noise, + qn_block_size, + ) + + self.memory_dim = memory_dim if memory_dim is not None else embed_dim + self.std_scale = std_scale + self.disable_mem_on_mem_attn = disable_mem_on_mem_attn + + # This Operator was used for factorization in PySpeech + self.v2e = lambda x: x + + if tanh_on_mem: + self.squash_mem = torch.tanh + self.nonlinear_squash_mem = True + else: + self.squash_mem = lambda x: x + self.nonlinear_squash_mem = False + + self.max_memory_size = max_memory_size + + def forward(self, input_and_summary, state): + """ + input: Encoder states of current segment with left or right context, + plus one summarization query + + """ + + length, batch_size, _ = input_and_summary.shape + length = length - 1 # not include sum_query, last index + + memory = state["memory_banks"] + # TODO: positional embedding on memory + + if self.max_memory_size > -1 and len(memory) > self.max_memory_size: + # TODO: need to fix here + if self.max_memory_size == 0: + memory = memory.new_zeros(1, memory.size(1), self.memory_dim) + else: + memory = memory[-self.max_memory_size :] + + memory_and_input = torch.cat(memory + [input_and_summary[:-1]], dim=0) + input_and_sum_query = input_and_summary + + q = self.q_proj(self.v2e(input_and_sum_query)) + k = self.k_proj(self.v2e(memory_and_input)) + v = self.v_proj(self.v2e(memory_and_input)) + + q = ( + q.contiguous() + .view(-1, batch_size * self.num_heads, self.head_dim) + .transpose(0, 1) + * self.scaling + ) + k = ( + k.contiguous() + .view(-1, batch_size * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + v = ( + v.contiguous() + .view(-1, batch_size * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + attention_weights = torch.bmm(q, k.transpose(1, 2)) + + if self.disable_mem_on_mem_attn: + attention_weights = self.suppress_mem_on_mem_attention( + batch_size, self.num_heads, len(memory), attention_weights + ) + + if self.std_scale is not None: + attention_weights = attention_suppression(attention_weights, self.std_scale) + + assert list(attention_weights.shape) == [ + batch_size * self.num_heads, + length + 1, + length + len(memory), + ] + + attention_weights = torch.nn.functional.softmax( + attention_weights.float(), dim=-1 + ).type_as(attention_weights) + + attention_probs = self.dropout_module(attention_weights) + + # [T, T, B, n_head] + [T, B, n_head, d_head] -> [T, B, n_head, d_head] + attention = torch.bmm(attention_probs, v) + + assert list(attention.shape) == [ + batch_size * self.num_heads, + length + 1, + self.head_dim, + ] + + attention = ( + attention.transpose(0, 1) + .contiguous() + .view(length + 1, batch_size, self.embed_dim) + ) + + output_and_memory = self.out_proj(attention) + + next_m = output_and_memory[-1:] + next_m = self.squash_mem(next_m) + output = output_and_memory[:-1] + + state["memory_banks"].append(next_m) + + return output + + def suppress_mem_on_mem_attention( + self, B: int, num_heads: int, mem_size: int, attention_weight: Tensor + ): + """ + Arguments: + - B: batch size + - num_heads: number of attention heads + - mem_size: size of memory bank + - attention_weight: a [B*num_heads, T + 1, T + mem_size] vector + + Return: + modified attention_weight with [B*num_heads, -1, :mem_size] = -inf + """ + attention_weight[:, -1, :mem_size] = float("-inf") + return attention_weight + + +# ------------------------------------------------------------------------------ +# SequenceEncoder +# ------------------------------------------------------------------------------ +class 
SequenceEncoder(FairseqEncoder): + """ + SequenceEncoder encodes sequences. + + More specifically, `src_tokens` and `src_lengths` in `forward()` should + describe a batch of "complete" sequences rather than segments. + + Segment-by-segment inference can be triggered by `segment_size`: + 1) `segment_size` is None: + SequenceEncoder treats the input sequence as one single segment. + 2) `segment_size` is not None (some int instead): + SequenceEncoder does the following: + 1. breaks the input sequence into several segments + 2. inference on each segment and collect the outputs + 3. concatanete segment outputs into the output sequence. + Note that `segment_size` here shouldn't include additional left/right + contexts needed, for example if we wish to infer with LC-BLSTM where the + middle chunk size is 100 and right context is 20, `segment_size` should be + 100. + """ + + def __init__(self, args, module): + super().__init__(None) + + self.module = module + self.input_time_axis = 1 + self.output_time_axis = 0 + self.segment_size = args.segment_size + self.left_context = args.left_context + self.right_context = args.right_context + + def forward( + self, + src_tokens: Tensor, + src_lengths: Tensor, + states=None, + ): + + seg_src_tokens_lengths = sequence_to_segments( + sequence=src_tokens, + time_axis=self.input_time_axis, + lengths=src_lengths, + segment_size=self.segment_size, + extra_left_context=self.left_context, + extra_right_context=self.right_context, + ) + + seg_encoder_states_lengths: List[Tuple[Tensor, Tensor]] = [] + + for seg_src_tokens, seg_src_lengths in seg_src_tokens_lengths: + (seg_encoder_states, seg_enc_lengths, states) = self.module( + seg_src_tokens, + seg_src_lengths, + states=states, + ) + + seg_encoder_states_lengths.append((seg_encoder_states, seg_enc_lengths)) + + encoder_out, enc_lengths = segments_to_sequence( + segments=seg_encoder_states_lengths, time_axis=self.output_time_axis + ) + + encoder_padding_mask, _ = lengths_to_encoder_padding_mask( + enc_lengths, batch_first=True + ) + + if not encoder_padding_mask.any(): + encoder_padding_mask = None + + return { + "encoder_out": [encoder_out], + "encoder_padding_mask": [encoder_padding_mask], + "encoder_embedding": [], + "encoder_states": [states], + "src_tokens": [], + "src_lengths": [], + } + + def incremental_encode( + self, + seg_src_tokens: Tensor, + seg_src_lengths: Tensor, + states=None, + ): + """ + Different from forward function, this function takes segmented speech + as input, and append encoder states to previous states + """ + (seg_encoder_states, seg_enc_lengths, states) = self.module( + seg_src_tokens, + seg_src_lengths, + states=states, + ) + return seg_encoder_states, seg_enc_lengths, states + + +# ------------------------------------------------------------------------------ +# Augmented memory model decorator +# ------------------------------------------------------------------------------ +def augmented_memory(klass): + class StreamSeq2SeqModel(klass): + @staticmethod + def add_args(parser): + super(StreamSeq2SeqModel, StreamSeq2SeqModel).add_args(parser) + parser.add_argument( + "--segment-size", type=int, required=True, help="Length of the segment." 
+            )
+            parser.add_argument(
+                "--left-context",
+                type=int,
+                default=0,
+                help="Left context for the segment.",
+            )
+            parser.add_argument(
+                "--right-context",
+                type=int,
+                default=0,
+                help="Right context for the segment.",
+            )
+            parser.add_argument(
+                "--max-memory-size",
+                type=int,
+                default=-1,
+                help="Maximum number of memory banks to keep (-1 means unlimited).",
+            )
+
+    StreamSeq2SeqModel.__name__ = klass.__name__
+    return StreamSeq2SeqModel
diff --git a/fairseq/models/speech_to_text/modules/convolution.py b/fairseq/models/speech_to_text/modules/convolution.py
new file mode 100644
index 0000000000..526d7540c5
--- /dev/null
+++ b/fairseq/models/speech_to_text/modules/convolution.py
@@ -0,0 +1,126 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import List
+
+import torch
+import torch.nn as nn
+
+
+class Conv1dSubsampler(nn.Module):
+    """Convolutional subsampler: a stack of 1D convolutions (along the temporal
+    dimension) followed by non-linear activation via gated linear units
+    (https://arxiv.org/abs/1911.08460)
+
+    Args:
+        in_channels (int): the number of input channels
+        mid_channels (int): the number of intermediate channels
+        out_channels (int): the number of output channels
+        kernel_sizes (List[int]): the kernel size for each convolutional layer
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        mid_channels: int,
+        out_channels: int,
+        kernel_sizes: List[int] = (3, 3),
+    ):
+        super(Conv1dSubsampler, self).__init__()
+        self.n_layers = len(kernel_sizes)
+        self.conv_layers = nn.ModuleList(
+            nn.Conv1d(
+                in_channels if i == 0 else mid_channels // 2,
+                mid_channels if i < self.n_layers - 1 else out_channels * 2,
+                k,
+                stride=2,
+                padding=k // 2,
+            )
+            for i, k in enumerate(kernel_sizes)
+        )
+
+    def get_out_seq_lens_tensor(self, in_seq_lens_tensor):
+        out = in_seq_lens_tensor.clone()
+        for _ in range(self.n_layers):
+            out = ((out.float() - 1) / 2 + 1).floor().long()
+        return out
+
+    def forward(self, src_tokens, src_lengths):
+        bsz, in_seq_len, _ = src_tokens.size()  # B x T x (C x D)
+        x = src_tokens.transpose(1, 2).contiguous()  # -> B x (C x D) x T
+        for conv in self.conv_layers:
+            x = conv(x)
+            x = nn.functional.glu(x, dim=1)
+        _, _, out_seq_len = x.size()
+        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # -> T x B x (C x D)
+        return x, self.get_out_seq_lens_tensor(src_lengths)
+
+
+def infer_conv_output_dim(in_channels, input_dim, out_channels):
+    sample_seq_len = 200
+    sample_bsz = 10
+    x = torch.randn(sample_bsz, in_channels, sample_seq_len, input_dim)
+    x = torch.nn.Conv2d(in_channels, out_channels, 3, stride=2, padding=3 // 2)(x)
+    x = torch.nn.Conv2d(out_channels, out_channels, 3, stride=2, padding=3 // 2)(x)
+    x = x.transpose(1, 2)
+    mb, seq = x.size()[:2]
+    return x.contiguous().view(mb, seq, -1).size(-1)
+
+
+class Conv2dSubsampler(nn.Module):
+    """Convolutional subsampler: a stack of 2D convolutions based on the ESPnet
+    implementation (https://github.com/espnet/espnet)
+
+    Args:
+        input_channels (int): the number of input channels
+        input_feat_per_channel (int): encoder input dimension per input channel
+        conv_out_channels (int): the number of output channels of the conv layers
+        encoder_embed_dim (int): encoder embedding dimension
+    """
+
+    def __init__(
+        self,
+        input_channels: int,
+        input_feat_per_channel: int,
+        conv_out_channels: int,
+        encoder_embed_dim: int,
+    ):
+        super().__init__()
+        assert input_channels == 1, input_channels
+        self.conv = 
torch.nn.Sequential( + torch.nn.Conv2d( + input_channels, conv_out_channels, 3, stride=2, padding=3 // 2 + ), + torch.nn.ReLU(), + torch.nn.Conv2d( + conv_out_channels, + conv_out_channels, + 3, + stride=2, + padding=3 // 2, + ), + torch.nn.ReLU(), + ) + transformer_input_dim = infer_conv_output_dim( + input_channels, input_feat_per_channel, conv_out_channels + ) + self.out = torch.nn.Linear(transformer_input_dim, encoder_embed_dim) + + def forward(self, src_tokens, src_lengths): + B, T_i, C = src_tokens.size() + x = src_tokens.view(B, T_i, 1, C).transpose(1, 2).contiguous() + x = self.conv(x) + B, _, T_o, _ = x.size() + x = x.transpose(1, 2).transpose(0, 1).contiguous().view(T_o, B, -1) + x = self.out(x) + + subsampling_factor = int(T_i * 1.0 / T_o + 0.5) + input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long() + input_len_1 = x.size(0) * torch.ones([src_lengths.size(0)]).long().to( + input_len_0.device + ) + input_lengths = torch.min(input_len_0, input_len_1) + return x, input_lengths diff --git a/fairseq/models/speech_to_text/modules/emformer.py b/fairseq/models/speech_to_text/modules/emformer.py new file mode 100644 index 0000000000..935d593078 --- /dev/null +++ b/fairseq/models/speech_to_text/modules/emformer.py @@ -0,0 +1,1844 @@ +#!/usr/bin/env python3 +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + + +import math +import re +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from torch import Tensor +from torch import device as Device + +from fairseq.models import FairseqEncoder +from fairseq.models.speech_to_text.utils import ( + NoOp, + attention_suppression, + layer_norm_backward_hook, + lengths_to_padding_mask, + segments_to_sequence, +) + +try: + import torch.ao.quantization as quantization + from torch.ao.quantization.qconfig import ( + default_dynamic_qconfig, + per_channel_dynamic_qconfig, + ) +except ImportError: + import torch.quantization as quantization + from torch.quantization.qconfig import ( + default_dynamic_qconfig, + per_channel_dynamic_qconfig, + ) + + +class RelativePositionEmbedding(nn.Module): + """ + Implementation according to https://arxiv.org/abs/1803.02155 + """ + + def __init__(self, head_dim, max_position, norm_init=True): + super().__init__() + self.head_dim = head_dim + self.max_position = max_position + self.embeddings = nn.Parameter(torch.Tensor(max_position * 2 + 1, head_dim)) + if norm_init: + nn.init.xavier_normal_(self.embeddings) + else: + nn.init.xavier_uniform_(self.embeddings) + + def forward(self, input: Tensor): + output = nn.functional.embedding(input.long(), self.embeddings) + return output + + +class Fp32LayerNorm(nn.Module): + def __init__( + self, + input_dim, + clamp_grad=True, + max_grad_value=256, + eps=1e-5, + elementwise_affine=True, + ): + super().__init__() + self.torch_module = torch.nn.LayerNorm( + input_dim, eps=eps, elementwise_affine=elementwise_affine + ) + if clamp_grad: + hook = partial(layer_norm_backward_hook, clamp_value=max_grad_value) + self.torch_module.register_backward_hook(hook) + + def forward(self, input): + output = torch.nn.functional.layer_norm( + input.float(), + self.torch_module.normalized_shape, + self.torch_module.weight.float() + if self.torch_module.weight is not None + else None, + 
self.torch_module.bias.float() + if self.torch_module.bias is not None + else None, + self.torch_module.eps, + ).type_as(input) + return output + + +# ------------------------------------------------------------------------------ +# PositionwiseFF +# ------------------------------------------------------------------------------ + + +class PositionwiseFF(nn.Module): + """ + FFN layer in transformer. + + Args: + input_dim: input embedding dimension + ffn_dim: FFN layer inner dimension + dropout_on_fc1: dropout for first linear layer + dropout_on_fc2: dropout fr second linear layer + activation_fn: activation function used after first linear layer. \ + Only relu or gelu is supported. + + """ + + def __init__( + self, input_dim, ffn_dim, dropout_on_fc1, dropout_on_fc2, activation_fn + ): + super(PositionwiseFF, self).__init__() + + self.input_dim = input_dim + self.ffn_dim = ffn_dim + if activation_fn == "relu": + ac = nn.ReLU() + elif activation_fn == "gelu": + ac = nn.GELU() + else: + raise ValueError("Unsupported activation_fn = ({})".format(activation_fn)) + + # fc1 -> ac -> dropout -> fc2 -> dropout + self.module = nn.Sequential( + nn.Linear(input_dim, ffn_dim), + ac, + nn.Dropout(dropout_on_fc1), + nn.Linear(ffn_dim, input_dim), + nn.Dropout(dropout_on_fc2), + ) + + self.layer_norm = Fp32LayerNorm(input_dim) + + def forward(self, input): + module_out = self.module(self.layer_norm(input)) + output = module_out + input + + return output + + def quantize_(self, params=None): + if params and "per_channel" in params and params["per_channel"]: + qconfig = per_channel_dynamic_qconfig + else: + qconfig = default_dynamic_qconfig + quantization.quantize_dynamic( + self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True + ) + return self + + +# ------------------------------------------------------------------------------ +# SummarizationLayer +# ------------------------------------------------------------------------------ + + +class SummarizationLayer(nn.Module): + def __init__(self, method, segment_size, embedding_dim): + super(SummarizationLayer, self).__init__() + self.segment_size = segment_size + self.embedding_dim = embedding_dim + nonlin_match = re.match(r"nonlinear\((?P<act>[a-z]+),(?P<dim>[0-9]+)\)", method) + self.method = method + if method == "mean": + self.module = nn.AvgPool1d( + kernel_size=segment_size, + stride=segment_size, + ceil_mode=True, + ) + elif method == "max": + self.module = nn.MaxPool1d( + kernel_size=segment_size, + stride=segment_size, + ceil_mode=True, + ) + elif method == "linear": + self.module = nn.Linear(segment_size, 1) + elif nonlin_match: + nonlin_args = nonlin_match.groupdict() + act_type = nonlin_args["act"] + hid_dim = int(nonlin_args["dim"]) + if act_type == "relu": + act = nn.ReLU() + elif act_type == "gelu": + act = nn.GELU() + else: + raise ValueError("Unsupported activation_fn = ({})".format(act_type)) + self.module = nn.Sequential( + nn.Linear(segment_size, hid_dim), + act, + nn.Linear(hid_dim, 1), + ) + else: + raise ValueError("Unsupported summarization method = ({})".format(method)) + + def forward(self, input): + # T, B, D -> B, D, T + input = input.permute(1, 2, 0) + + if self.method == "mean" or self.method == "max": + output = self.module(input) + output = output.permute(2, 0, 1) + return output + + full_seg_length = input.size(2) // self.segment_size * self.segment_size + if full_seg_length > 0: + # at least one seg is full + B = input.size(0) + D = input.size(1) + input_todo = ( + input[:, :, :full_seg_length] + .contiguous() + 
.view(B, -1, self.segment_size) + ) + output = self.module(input_todo) + output = output.view(B, D, -1) + else: + output = input.new_zeros(input.size(0), input.size(1), 0) + left = input.size(2) - full_seg_length + if left > 0: + # when last seg is not full, use zeros as last memory placeholder + zeros = input.new_zeros(input.size(0), input.size(1), 1) + output = torch.cat([output, zeros], dim=2) + output = output.permute(2, 0, 1) + return output + + +# ------------------------------------------------------------------------------ +# NoSegAugmentedMemoryMultiheadAttentionBmm +# ------------------------------------------------------------------------------ + + +class NoSegAugmentedMemoryMultiheadAttentionBmm(nn.Module): + """ + Whole utterance augmented memory multihead attention using BMM. + + Different with previous augmented memory multihead attention where + the utterance is chunked into segments. Here we use attention mask + achieve so. The input embedding [right_context, utterance, summary] + is a concatenation of right context, utterance and summary. + + Right context block is the concatenation of all the right context for + each segments. [right_context_0, right_context_1, ..., right_context_n] + For example, if we have utterance = [v0, v1, v2, ...., v20]. segment + size 8, right_context size 4. Then the right context blocks = + [v8, v9, v10, v11, v16, v17, v18, v19, 0, 0, 0, 0], where v8, v9, v10, + and v11 are the right context for first segment. v16, v17, v18 and v19 + are the right context for second segment. 0, 0, 0 and 0 are right context + for the last segment. + + utterance is corresponding to input embedding sequence + + summary is concatenation of average of each segments. [summary_0, + summary_1, ..., ]. + + In augmented memory multihead attention, the query is [right_context, + utterance, summary], key is [memory, right_context, utterance]. Different + with AugmentedMemoryMultiheadAttentionBmm, memory here is passed from + previous attention layer. For the first attention layer, memory is average + of each segment. + + Memory is a concatenation of memory from each segments in previous attention + layer. For example, current layer is i, then memory is [m_0, m_1, ..., m_n]. + Each m_k is the output from seg_k in layer i-1. + + args: + input_dim: input embedding dimension + num_heads: number of heads in multihead self-attention + dropout: attention dropout + std_scale: if std_scale is not None. The weak attention suppression is + turned on. For std_scale = 0.5, all the attention smaller than + mean + 0.5 * std will be suppressed. + scaled_init: whether to use scaled init for linear weight + tanh_on_mem: whether to use tanh on memory output + use_mem: whether to use memory or not. When max_memory_size is 0, then + we don't have memory anymore. + layer_index: current self-attention layer index that is used in depth + initialization + max_relative_position: max relative position used in relative position + embedding + rpe_old_option: To be compatible with previous model. The previous model + was trained with attention += attention + rpe. 
The correct equation + should be attention = attention + rpe + + """ + + def __init__( + self, + input_dim, + num_heads, + dropout=0.0, + std_scale=None, + scaled_init=False, + tanh_on_mem=False, + use_mem=True, + mini_batches=False, + negative_inf="-inf", + layer_index=-1, + max_relative_position=0, + rpe_old_option=True, + ): + if input_dim % num_heads: + raise ValueError( + "input_dim ({}) must be divisible by num_heads ({})".format( + input_dim, num_heads + ) + ) + + super().__init__() + + embed_dim = input_dim + self.e2h_kv = torch.nn.Linear(input_dim, 2 * input_dim, bias=True) + self.e2h_q = torch.nn.Linear(input_dim, input_dim, bias=True) + self.rpe_old_option = rpe_old_option + if max_relative_position > 0: + self.use_rpe = True + self.rpe_k = RelativePositionEmbedding( + head_dim=input_dim // num_heads, + max_position=max_relative_position, + ) + self.rpe_v = RelativePositionEmbedding( + head_dim=input_dim // num_heads, + max_position=max_relative_position, + ) + else: + self.use_rpe = False + self.rpe_k = None + self.rpe_v = None + if scaled_init: + if layer_index == -1: + gain = 1.0 / math.sqrt(2) + else: + # https://arxiv.org/abs/2005.09684 depthwise initialization + # stablize the training greatly. Use depthwise initialization to + # replace incremental loss. + gain = 1.0 / math.sqrt(layer_index + 1) + torch.nn.init.xavier_uniform_(self.e2h_kv.weight, gain=gain) + torch.nn.init.xavier_uniform_(self.e2h_q.weight, gain=gain) + + self.out_proj = torch.nn.Linear(embed_dim, embed_dim, bias=True) + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + + self.head_dim = embed_dim // num_heads + self.scaling = self.head_dim**-0.5 + + self.std_scale = std_scale + self.use_mem = use_mem + self.mini_batches = mini_batches + self.negative_inf = negative_inf + + if tanh_on_mem: + self.squash_mem = torch.tanh + self.nonlinear_squash_mem = True + else: + self.squash_mem = NoOp() + self.nonlinear_squash_mem = False + + def prepare_qkv( + self, + input: Tensor, + mems: Tensor, + lengths: Tensor, + summary_length: int, + lc_length: int, + ): + # T: right_context length + utterance_length + summary_length + T, B, D = input.shape + mem_length = mems.size(0) + utterance_length = torch.max(lengths) + + right_context_blocks_length = T - utterance_length - summary_length + rc_block = input[:right_context_blocks_length, :, :] + utterance_block = input[right_context_blocks_length : T - summary_length, :, :] + + if B == 1: + padding_mask = None + else: + klengths = lengths + mem_length + right_context_blocks_length + lc_length + padding_mask = lengths_to_padding_mask(lengths=klengths) + + mem_rc_input = torch.cat([mems, rc_block, utterance_block], dim=0) + + # In training lc_length = 0 + key_length = mem_rc_input.size(0) + lc_length + rc_input_sum = input + q = self.e2h_q(rc_input_sum) + kv = self.e2h_kv(mem_rc_input) + k, v = kv.chunk(chunks=2, dim=2) + result_qkv = (q, k, v) + input_shape = (T, B, D) + result_lengths_info = ( + mem_length, + utterance_length, + right_context_blocks_length, + key_length, + ) + if padding_mask is not None: + assert padding_mask.size(0) == B + assert padding_mask.size(1) == key_length + + return result_qkv, input_shape, result_lengths_info, padding_mask + + def prepare_attention_weights( + self, + q: Tensor, + new_k: Tensor, + new_v: Tensor, + input_shape: Tuple[int, int, int], + rpe: Optional[Tensor], + ) -> Tuple[Tensor, Tensor, Tensor]: + T, B, D = input_shape + q = ( + q.contiguous().view(-1, B * self.num_heads, 
self.head_dim).transpose(0, 1) + * self.scaling + ) + + k = ( + new_k.contiguous() + .view(-1, B * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + v = ( + new_v.contiguous() + .view(-1, B * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + attention_weights = torch.bmm(q, k.transpose(1, 2)) + if self.use_rpe and rpe is not None and self.rpe_v is not None: + r_k = self.rpe_k(rpe) + # [q, B*h, d] * [q, k, d] -> [B*h, q, k] + attention_weights_rpe = torch.matmul( + q.transpose(0, 1), r_k.transpose(1, 2) + ).transpose(0, 1) + attention_weights = attention_weights + attention_weights_rpe + attention_weights_float = attention_weights.float() + + return attention_weights, attention_weights_float, v + + def prepare_attention_output( + self, + attention_weights: Tensor, + attention_weights_float: Tensor, + v: Tensor, + input_shape: Tuple[int, int, int], + key_length: int, + padding_mask: Optional[Tensor], + rpe: Optional[Tensor], + ) -> Tensor: + T, B, D = input_shape + if padding_mask is not None: + attention_weights_float = attention_weights_float.view( + B, self.num_heads, T, key_length + ) + attention_weights_float = attention_weights_float.masked_fill( + padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf") + ) + attention_weights_float = attention_weights_float.view( + B * self.num_heads, T, key_length + ) + + if self.std_scale is not None: + attention_weights_float = attention_suppression( + attention_weights_float, self.std_scale + ) + + attention_weights_float = torch.nn.functional.softmax( + attention_weights_float, dim=-1 + ) + attention_weights = attention_weights_float.type_as(attention_weights) + + attention_probs = torch.nn.functional.dropout( + attention_weights, p=self.dropout, training=self.training + ) + + # [T, key_length, B, n_head]+ [key_length, B, n_head, d_head] + # -> [T, B, n_head, d_head] + attention = torch.bmm(attention_probs, v) + if self.use_rpe and rpe is not None and self.rpe_v is not None: + r_v = self.rpe_v(rpe) + attention_rpe = torch.matmul( + attention_probs.transpose(0, 1), r_v + ).transpose(0, 1) + + if self.rpe_old_option: + attention += attention + attention_rpe + else: + attention = attention + attention_rpe + + assert list(attention.shape) == [B * self.num_heads, T, self.head_dim] + + attention = attention.transpose(0, 1).contiguous().view(T, B, self.embed_dim) + + rc_output_memory = self.out_proj(attention) + return rc_output_memory + + @torch.jit.unused + def forward( + self, + input: Tensor, + lengths: Tensor, + mems: Tensor, + attention_mask: Tensor, + pre_mems: Optional[Tensor] = None, + left_context_key: Optional[Tensor] = None, + left_context_val: Optional[Tensor] = None, + rpe: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """ + forward function for NoSegAugmentedMemoryMultiheadAttentionBmm in training. + + args: + input: formed in the following way + [right_context_0, right_contex_1, ..., seg_0, seg_1, + ..., summary_0, summary_1,..] + lengths: the length of query which is [seg_0, seg_1, ....] + mems: [mem_0, mem_1, ...]. + attention_mask: attention mask for query = [right_context, query, summary] + key = [mem, right_context, query]. This is only used for traing. 
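+
+        Illustrative layout (example sizes only): with two segments seg_0/seg_1,
+        per-segment right context rc_0/rc_1 and summary vectors sum_0/sum_1, the
+        query is ordered [rc_0, rc_1, seg_0, seg_1, sum_0, sum_1] along time and
+        the key is [mems, rc_0, rc_1, seg_0, seg_1], so attention_mask has one
+        row per query position and one column per key position.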
+ + """ + if self.use_mem: + mem_length = mems.size(0) + summary_length = mem_length + 1 + if pre_mems is not None: + mems = torch.cat([pre_mems, mems], dim=0) + else: + mem_length = 0 + summary_length = 0 + + # In training, lc_length = 0 + if left_context_key is not None: + lc_length = left_context_key.size(0) + else: + lc_length = 0 + results = self.prepare_qkv( + input=input, + mems=mems, + lengths=lengths, + summary_length=summary_length, + lc_length=lc_length, + ) + result_qkv, input_shape, result_lengths_info, padding_mask = results + q, k, v = result_qkv + ( + mem_length, + utterance_length, + right_context_blocks_length, + key_length, + ) = result_lengths_info + + if left_context_key is not None: + # add the cache key and value + new_k = torch.cat( + [ + k[: mem_length + right_context_blocks_length, :, :], + left_context_key, + k[-utterance_length:, :, :], + ], + dim=0, + ) + new_v = torch.cat( + [ + v[: mem_length + right_context_blocks_length, :, :], + left_context_val, + v[-utterance_length:, :, :], + ], + dim=0, + ) + next_k = new_k[mem_length + right_context_blocks_length :, :, :] + next_v = new_v[mem_length + right_context_blocks_length :, :, :] + else: + new_k = k + new_v = v + next_k = None + next_v = None + + attention_weights, attention_weights_float, v = self.prepare_attention_weights( + q=q, + new_k=new_k, + new_v=new_v, + input_shape=input_shape, + rpe=rpe, + ) + + # mask attention + attention_mask = attention_mask.unsqueeze(0) + attention_weights_float = attention_weights_float.masked_fill( + attention_mask, float(self.negative_inf) + ) + + rc_output_memory = self.prepare_attention_output( + attention_weights=attention_weights, + attention_weights_float=attention_weights_float, + v=v, + input_shape=input_shape, + key_length=key_length, + padding_mask=padding_mask, + rpe=rpe, + ) + + if self.use_mem: + # next_m length equals to summary length - 1 + # last memory is ignored + if self.mini_batches: + next_m = rc_output_memory[-summary_length:] + else: + next_m = rc_output_memory[-summary_length:-1] + + next_m = self.squash_mem(next_m) + # rc and output + rc_output = rc_output_memory[:-summary_length] + if not self.nonlinear_squash_mem: + next_m = torch.clamp(next_m, min=-10, max=10) + else: + next_m = mems + rc_output = rc_output_memory + + return rc_output, next_m, next_k, next_v + + @torch.jit.export + def forward_jit( + self, + input: Tensor, + lengths: Tensor, + mems: Tensor, + left_context_key: Tensor, + left_context_val: Tensor, + rpe: Optional[Tensor], + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """ + forward function for NoSegAugmentedMemoryMultiheadAttentionBmm in decoding. + + args: + input: formed in the following way + [right_context_0, right_contex_1, ..., seg_0, seg_1, + ..., summary_0, summary_1,..] + lengths: the length of query which is [seg_0, seg_1, ....] + mems: [mem_0, mem_1, ...]. + left_context_key: left_context for key part. This is only used for online + decoding. In training, this is empty tensor + left_context_val: left_context for value part. This is only used for online + decoding. 
In training, this is empty tensor + + """ + lc_length = left_context_key.size(0) + + # In decoding, summary_length = 1 or 0 + if self.use_mem: + summary_length = 1 + else: + summary_length = 0 + + results = self.prepare_qkv( + input=input, + mems=mems, + lengths=lengths, + summary_length=summary_length, + lc_length=lc_length, + ) + result_qkv, input_shape, result_lengths_info, padding_mask = results + q, k, v = result_qkv + ( + mem_length, + utterance_length, + right_context_blocks_length, + key_length, + ) = result_lengths_info + + # add the cache key and value + new_k = torch.cat( + [ + k[: mem_length + right_context_blocks_length, :, :], + left_context_key, + k[-utterance_length:, :, :], + ], + dim=0, + ) + new_v = torch.cat( + [ + v[: mem_length + right_context_blocks_length, :, :], + left_context_val, + v[-utterance_length:, :, :], + ], + dim=0, + ) + next_k = new_k[mem_length + right_context_blocks_length :, :, :] + next_v = new_v[mem_length + right_context_blocks_length :, :, :] + + attention_weights, attention_weights_float, v = self.prepare_attention_weights( + q=q, + new_k=new_k, + new_v=new_v, + input_shape=input_shape, + rpe=rpe, + ) + # In online decoding, we don't have attention mask. But we still need + # to disable the attention from summary query to memory + attention_weights_float[:, -1, :mem_length] = float(self.negative_inf) + rc_output_memory = self.prepare_attention_output( + attention_weights=attention_weights, + attention_weights_float=attention_weights_float, + v=v, + input_shape=input_shape, + key_length=key_length, + padding_mask=padding_mask, + rpe=rpe, + ) + + # In decoding, summary length is 1 + if self.use_mem: + next_m = rc_output_memory[-1:] + next_m = self.squash_mem(next_m) + # rc and output + rc_output = rc_output_memory[:-1] + if not self.nonlinear_squash_mem: + next_m = torch.clamp(next_m, min=-10, max=10) + else: + rc_output = rc_output_memory + # empty tensor as input mems + next_m = mems + + return rc_output, next_m, next_k, next_v + + def quantize_(self, params=None): + if params and "per_channel" in params and params["per_channel"]: + qconfig = per_channel_dynamic_qconfig + else: + qconfig = default_dynamic_qconfig + quantization.quantize_dynamic( + self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True + ) + return self + + +class NoSegAugmentedMemoryTransformer(nn.Module): + """ + Whole utterance augmented memory transformer. + + This is not pyspeech nn layer. It is used as a module in a master layer where + multiple transformers is used. 
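+    (In this file, that master layer is NoSegAugmentedMemoryTransformerEncoderLayer
+    below, which stacks `num_layers` instances of this module and shares a single
+    attention mask across them.)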
+ """ + + def __init__( + self, + input_dim, + num_heads, + ffn_dim, + dropout_in_attn=0.0, + dropout_on_attn=None, + dropout_on_fc1=None, + dropout_on_fc2=None, + activation_fn="relu", + tanh_on_mem=False, + std_scale=None, + scaled_init=False, + segment_size=128, + use_mem=True, + mini_batches=False, + negative_inf="-inf", + layer_index=-1, + summarization_method="mean", + max_relative_position=0, + rpe_old_option=True, + ): + super(NoSegAugmentedMemoryTransformer, self).__init__() + + self.attention = NoSegAugmentedMemoryMultiheadAttentionBmm( + input_dim=input_dim, + num_heads=num_heads, + dropout=dropout_in_attn, + scaled_init=scaled_init, + tanh_on_mem=tanh_on_mem, + std_scale=std_scale, + use_mem=use_mem, + mini_batches=mini_batches, + negative_inf=negative_inf, + layer_index=layer_index, + max_relative_position=max_relative_position, + ) + self.dropout = nn.Dropout(dropout_on_attn) + self.pos_ff = PositionwiseFF( + input_dim=input_dim, + ffn_dim=ffn_dim, + dropout_on_fc1=dropout_on_fc1, + dropout_on_fc2=dropout_on_fc2, + activation_fn=activation_fn, + ) + self.layer_norm_pre = Fp32LayerNorm(input_dim) + self.layer_norm = Fp32LayerNorm(input_dim) + self.segment_size = segment_size + self.use_mem = use_mem + + self.memory_op = SummarizationLayer( + summarization_method, segment_size, input_dim + ) + + def set_mini_batches(self, mini_batches): + self.attention.mini_batches = mini_batches + + def gen_summary_queries(self, input): + sum_input = self.memory_op(input) + return sum_input + + def pre_attention_ops(self, input, right_context_blocks): + rc_length = right_context_blocks.size(0) + input_length = input.size(0) + + rc_and_input = torch.cat([right_context_blocks, input], dim=0) + residual_input = rc_and_input + rc_and_input = self.layer_norm_pre(rc_and_input) + + query_input = rc_and_input[-input_length:, :, :] + return rc_length, input_length, residual_input, query_input, rc_and_input + + def after_attention_ops(self, attention_output, residual_input): + output = self.dropout(attention_output) + output = output + residual_input + output = self.pos_ff(output) + output = self.layer_norm(output) + return output + + @torch.jit.export + def forward_jit( + self, + input: Tensor, + lengths: Tensor, + mems: Tensor, + left_context_key: Tensor, + left_context_val: Tensor, + right_context_blocks: Tensor, + rpe: Optional[Tensor], + ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + + results = self.pre_attention_ops(input, right_context_blocks) + rc_length, input_length, residual_input, query_input, rc_and_input = results + + # In online decoding, the summary query size is always 1 or 0 + if self.use_mem: + summary_query = self.gen_summary_queries(query_input) + summary_query = summary_query[0:1, :, :] + rc_qu_su = torch.cat([rc_and_input, summary_query], dim=0) + else: + rc_qu_su = rc_and_input + + rc_output, next_m, next_k, next_v = self.attention.forward_jit( + input=rc_qu_su, + lengths=lengths, + mems=mems, + left_context_key=left_context_key, + left_context_val=left_context_val, + rpe=rpe, + ) + rc_output = self.after_attention_ops(rc_output, residual_input) + results = ( + rc_output[-input_length:, :, :], + next_m, + rc_output[0:rc_length, :, :], + next_k, + next_v, + ) + return results + + @torch.jit.unused + def forward( + self, + input, + lengths, + mems, + right_context_blocks, + attention_mask, + pre_mems, + left_context_key, + left_context_val, + rpe, + ): + + results = self.pre_attention_ops(input, right_context_blocks) + rc_length, input_length, residual_input, query_input, 
rc_and_input = results
+        if self.use_mem:
+            summary_query = self.gen_summary_queries(query_input)
+            rc_qu_su = torch.cat([rc_and_input, summary_query], dim=0)
+        else:
+            rc_qu_su = rc_and_input
+
+        rc_output, next_m, next_k, next_v = self.attention(
+            input=rc_qu_su,
+            lengths=lengths,
+            mems=mems,
+            attention_mask=attention_mask,
+            pre_mems=pre_mems,
+            left_context_key=left_context_key,
+            left_context_val=left_context_val,
+            rpe=rpe,
+        )
+
+        # [TODO] Note that memory does not go through pos_ff. What happens if we
+        # pass memory through the pos_ff as well?
+        rc_output = self.after_attention_ops(rc_output, residual_input)
+        results = (
+            rc_output[-input_length:, :, :],
+            next_m,
+            rc_output[0:rc_length, :, :],
+            next_k,
+            next_v,
+        )
+
+        return results
+
+
+class NoSegAugmentedMemoryTransformerEncoderLayer(FairseqEncoder):
+    """
+    Whole-utterance augmented memory transformer encoder layer. This is a master
+    layer in which multiple augmented memory transformers can be defined. There
+    are two reasons to set up the master layer.
+    1. The attention mask only needs to be defined once; all the layers in the
+       master layer share the same mask.
+    2. pyspeech nn layers have a special input and output format. Defining one
+       master layer makes it easier to pass memory between the different layers
+       inside it.
+
+    args:
+        input_dim: input embedding dimension
+        num_heads: number of heads in multihead self-attention
+        ffn_dim: ffn dimension in FFN layer
+        num_layers: number of augmented memory transformer layers
+        dropout_in_attn: dropout used in multi-head self-attention
+        dropout_on_attn: dropout used for the output of the multihead self-attention
+        dropout_on_fc1: dropout used in the FFN layer for the first linear layer
+        dropout_on_fc2: dropout used in the FFN layer for the second linear layer
+        segment_size: segment size for each segment
+        context_config: (left_context_size, right_context_size) defines the
+            surrounding context size for each segment
+        max_memory_size: maximum memory size used for each segment
+        scaled_init: whether to use scaled init for weight initialization in the
+            attention layer
+        std_scale: if std_scale is not None, weak attention suppression is
+            turned on. For std_scale = 0.5, all attention weights smaller than
+            mean + 0.5 * std will be suppressed.
+        activation_fn: activation function used in the FFN layer. [ReLU, GELU] supported
+        tanh_on_mem: whether to use tanh on memory
+        mini_batches: use mini-batch training
+        negative_inf: the negative infinity value used in attention masking. Default is "-inf".
+            For some situations, e.g. LM, it is better to use "-1e8" to avoid NaN issues.
+        summarization_method: method used to generate the segment summarization embedding
+        max_relative_position: max relative position for relative position embedding
+        rpe_old_option: to be compatible with the previous model, which was
+            trained with attention += attention + rpe. The correct equation
+            should be attention = attention + rpe
+        [TODO]: remove the rpe_old_option by the end of 2021 Q1.
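+
+    Example (illustrative numbers only): with segment_size=128,
+    context_config=(32, 32) and max_memory_size=4, each 128-frame segment
+    attends to at most 32 cached left-context frames, its own frames, a
+    32-frame right-context block, and up to 4 memory-bank vectors that
+    summarize older segments.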
+ + """ + + def __init__( + self, + input_dim, + num_heads, + ffn_dim, + num_layers=1, + dropout_in_attn=0.0, + dropout_on_attn=0.0, + dropout_on_fc1=0.0, + dropout_on_fc2=0.0, + segment_size=128, + context_config=(0, 0), + max_memory_size=0, + scaled_init=True, + std_scale=None, + activation_fn="relu", + tanh_on_mem=False, + mini_batches=False, + negative_inf="-inf", + deep_init=True, + summarization_method="mean", + max_relative_position=0, + rpe_old_option=True, + ): + super().__init__(None) + if input_dim % num_heads: + raise ValueError( + "input_dim ({}) must be divisible by num_heads ({})".format( + input_dim, num_heads + ) + ) + + # we used to support growing memory size. However, it will cause + # cross stream batching failure. Now we need to have exact max memory size + if max_memory_size < 0: + raise ValueError("max_memory_size must be >= 0") + + # Only assign right_context. In decoding, left context will be cached. + # No need to let the online decoder to re-assign the left context + self.left_context, self.right_context = context_config + self.segment_size = segment_size + self.memory_dim = input_dim + self.max_memory_size = max_memory_size + self.mini_batches = mini_batches + if self.max_memory_size != 0: + self.use_mem = True + else: + self.use_mem = False + + self.memory_op = SummarizationLayer( + summarization_method, segment_size, input_dim + ) + + self.layers = torch.nn.ModuleList() + self.num_layers = num_layers + self.max_relative_position = max_relative_position + if self.max_relative_position > 0: + self.use_rpe = True + else: + self.use_rpe = False + for i in range(self.num_layers): + if deep_init: + layer_index = i + else: + layer_index = -1 + + self.layers.append( + NoSegAugmentedMemoryTransformer( + num_heads=num_heads, + input_dim=input_dim, + ffn_dim=ffn_dim, + dropout_in_attn=dropout_in_attn, + dropout_on_attn=dropout_on_attn, + dropout_on_fc1=dropout_on_fc1, + dropout_on_fc2=dropout_on_fc2, + segment_size=segment_size, + std_scale=std_scale, + activation_fn=activation_fn, + tanh_on_mem=tanh_on_mem, + scaled_init=scaled_init, + use_mem=self.use_mem, + mini_batches=mini_batches, + negative_inf=negative_inf, + layer_index=layer_index, + summarization_method=summarization_method, + max_relative_position=max_relative_position, + rpe_old_option=rpe_old_option, + ) + ) + + def set_mini_batches(self, mini_batches): + # handy function only used for unit test + self.mini_batches = mini_batches + for layer in self.layers: + layer.set_mini_batches(mini_batches) + + def _get_relative_position( + self, + input: Tensor, + max_relative_position: int, + left_context_length: int, + past_length: int, + is_decoding: bool, + ): + # For training, we copy the right context to the start of the utterance + # First dimension in distance is corresponding to query. + # [right context, utterance, summary vector] + # Second dimension in distance is corresponding to key. + # [Memory bank, right context, utterance] + # For summary vector in query part, the distance with + # all other position is 2*max_position. For memory bank in key, + # the distance with all other positions is 0. + + T, B, D = input.shape + num_segs = math.ceil((T - self.right_context) / self.segment_size) + + # utterance + u_st = past_length * self.segment_size + u_ed = u_st + T + utterance_ranges = torch.arange(u_st, u_ed - self.right_context) + + # left context. 
Only in minibatch or decoding + left_context_ranges = torch.arange(u_st - left_context_length, u_st) + + # Right context block + # right context + utterance + right_context_blocks = [] + for i in range(0, num_segs - 1): + st = (i + 1) * self.segment_size + u_st + ed = st + self.right_context + assert ed < u_ed + temp = torch.arange(st, ed) + right_context_blocks.append(temp) + right_context_blocks.append(torch.arange(u_ed - self.right_context, u_ed)) + right_context_ranges = torch.cat(right_context_blocks) + + if self.use_mem: + # Memory bank + # The position for memory -n, .., -1 + if is_decoding: + memory_size = min(past_length, self.max_memory_size) + else: + memory_size = num_segs + past_length - 1 + memory_bank_ranges = torch.arange( + -max_relative_position - 1, -max_relative_position - 1 - memory_size, -1 + ) + + # summary vector + # The position for summary vector as the T+max_relative_position+1. + # After the clamping, the relative position is max_relative_position + summary_pos_st = u_ed + max_relative_position + 1 + summary_vector_ranges = torch.arange( + summary_pos_st, summary_pos_st + num_segs + ) + + key_ranges = torch.cat( + [ + memory_bank_ranges, + right_context_ranges, + left_context_ranges, + utterance_ranges, + ] + ) + + query_ranges = torch.cat( + [right_context_ranges, utterance_ranges, summary_vector_ranges] + ) + else: + key_ranges = torch.cat( + [right_context_ranges, left_context_ranges, utterance_ranges] + ) + + query_ranges = torch.cat([right_context_ranges, utterance_ranges]) + + distance = key_ranges[None, :] - query_ranges[:, None] + distance_clamp = ( + torch.clamp(distance, -max_relative_position, max_relative_position) + + max_relative_position + ) + distance_clamp = distance_clamp.to(input.device).long().detach() + return distance_clamp + + def _get_attention_mask(self, input, past_length=0, left_context_cache=0): + # attention mask for each query contains three parts: + # 1. memory part + # 2. left_context + segment + # 3. right_context_block + # so for each segment and its correspoinding right context block, + # the attention matrix is formed by 9 parts: + # [0, m, 0, 0, right_context, 0, 0, seg, 0] + # [before memory, memory, after memory, before right context, right_context, + # after right context, before seg, seg, after seg] + # + # Query is formed in the way as [right_context_blocks, utterance, summary] + # + # Note: put m and right_context before segment is convenient + # for padding_mask operation. + # Key lengths = m_length + right_context_block_length + lengths + utterance_length, batch_size, _ = input.shape + summary_length = math.ceil(utterance_length / self.segment_size) + num_segs = summary_length + rc_length = self.right_context * num_segs + rc = self.right_context + lc = self.left_context + + # using mini-batches, there is left context cache available for current + # sequence. 
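+        # Mask convention: entries are built with 1 where a query position may
+        # attend to a key position and 0 elsewhere; the concatenated matrix is
+        # inverted (1 - mask) and cast to bool at the end, so True marks blocked
+        # positions for masked_fill.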
+ lcc = left_context_cache + + # max_memory_size is 0 then we don't have memory and summary + # past_length is the memory carry from previous sequence + if self.use_mem: + mem_length = num_segs - 1 + past_length + else: + mem_length = 0 + rc_mask = [] + query_mask = [] + summary_mask = [] + for j in range(0, num_segs): + ssize = min(self.segment_size, utterance_length - j * self.segment_size) + + rc_size = rc + rc_mat = [] + q_mat = [] + s_mat = [] + m_start = max(j + past_length - self.max_memory_size, 0) + + # max_memory_size is 0, then we don't use memory + if self.use_mem: + # part 0: before memory + rc_mat.append(input.new_zeros(rc_size, m_start)) + q_mat.append(input.new_zeros(ssize, m_start)) + s_mat.append(input.new_zeros(1, m_start)) + + # part 1: memory + col_1 = j + past_length - m_start + rc_mat.append(torch.ones(rc_size, col_1, device=input.device)) + q_mat.append(torch.ones(ssize, col_1, device=input.device)) + # based on D22875746, disable summary query attention + # on memeory is better for long form utterance + s_mat.append(input.new_zeros(1, col_1)) + + # part 2: after memory + col_2 = mem_length - (j + past_length) + rc_mat.append(input.new_zeros(rc_size, col_2)) + q_mat.append(input.new_zeros(ssize, col_2)) + s_mat.append(input.new_zeros(1, col_2)) + + # part 3: before right context + rc_start = j * rc + rc_mat.append(input.new_zeros(rc_size, rc_start)) + q_mat.append(input.new_zeros(ssize, rc_start)) + s_mat.append(input.new_zeros(1, rc_start)) + + # part 4: right context + rc_end = rc_start + rc + col_4 = rc + rc_mat.append(torch.ones(rc_size, col_4, device=input.device)) + q_mat.append(torch.ones(ssize, col_4, device=input.device)) + s_mat.append(torch.ones(1, col_4, device=input.device)) + + # part 5: after right context + col_5 = rc_length - rc_end + rc_mat.append(input.new_zeros(rc_size, col_5)) + q_mat.append(input.new_zeros(ssize, col_5)) + s_mat.append(input.new_zeros(1, col_5)) + + # part 6: before query segment + seg_start = max(j * self.segment_size + lcc - lc, 0) + rc_mat.append(input.new_zeros(rc_size, seg_start)) + q_mat.append(input.new_zeros(ssize, seg_start)) + s_mat.append(input.new_zeros(1, seg_start)) + + # part 7: query segment + # note: right context is put in right context block + # here we only need to consider about left context + seg_end = min((j + 1) * self.segment_size + lcc, utterance_length + lcc) + col_7 = seg_end - seg_start + rc_mat.append(torch.ones(rc_size, col_7, device=input.device)) + q_mat.append(torch.ones(ssize, col_7, device=input.device)) + s_mat.append(torch.ones(1, col_7, device=input.device)) + + # part 8: after query segment + col_8 = utterance_length + lcc - seg_end + rc_mat.append(input.new_zeros(rc_size, col_8)) + q_mat.append(input.new_zeros(ssize, col_8)) + s_mat.append(input.new_zeros(1, col_8)) + + rc_mask.append(torch.cat(rc_mat, dim=1)) + query_mask.append(torch.cat(q_mat, dim=1)) + summary_mask.append(torch.cat(s_mat, dim=1)) + + # no memory, then we don't need summary either + if self.use_mem: + attention_mask = ( + 1 + - torch.cat( + [ + torch.cat(rc_mask, dim=0), + torch.cat(query_mask, dim=0), + torch.cat(summary_mask, dim=0), + ], + dim=0, + ) + ).to(torch.bool) + else: + attention_mask = ( + 1 + - torch.cat( + [torch.cat(rc_mask, dim=0), torch.cat(query_mask, dim=0)], dim=0 + ) + ).to(torch.bool) + + return attention_mask + + @torch.jit.export + def init_state( + self, batch_size: int, device: Optional[Device] = None + ) -> List[Tensor]: + empty_memory = torch.zeros( + self.num_layers, + 
self.max_memory_size, + batch_size, + self.memory_dim, + device=device, + ) + left_context_key = torch.zeros( + self.num_layers, + self.left_context, + batch_size, + self.memory_dim, + device=device, + ) + left_context_val = torch.zeros( + self.num_layers, + self.left_context, + batch_size, + self.memory_dim, + device=device, + ) + past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device) + + return [empty_memory, left_context_key, left_context_val, past_length] + + @torch.jit.export + def batch_state(self, states: List[List[Tensor]]) -> List[Tensor]: + if len(states) == 0: + return [] + batched_m = [] + batched_lc_key = [] + batched_lc_val = [] + batched_past_length = [] + for state in states: + if len(state) == 0: + continue + m, lc_key, lc_val, past_length = state + batched_m.append(m) + batched_lc_key.append(lc_key) + batched_lc_val.append(lc_val) + batched_past_length.append(past_length) + + if ( + (len(batched_m) == 0) + or (len(batched_lc_key) == 0) + or (len(batched_lc_val) == 0) + or (len(batched_past_length) == 0) + ): + return [ + torch.tensor([]), + torch.tensor([]), + torch.tensor([]), + torch.tensor([]), + ] + + batched_m = torch.cat(batched_m, dim=2) + batched_lc_key = torch.cat(batched_lc_key, dim=2) + batched_lc_val = torch.cat(batched_lc_val, dim=2) + batched_past_length = torch.cat(batched_past_length, dim=1) + return [batched_m, batched_lc_key, batched_lc_val, batched_past_length] + + @torch.jit.export + def reorder_state(self, state: List[Tensor], indices: Tensor) -> List[Tensor]: + if len(state) == 0: + return [] + m, lc_key, lc_val, past_length = state + indices = indices.to(device=m.device) + reord_m = torch.index_select(m, 2, indices) + reord_lc_key = torch.index_select(lc_key, 2, indices) + reord_lc_val = torch.index_select(lc_val, 2, indices) + reord_past_length = torch.index_select(past_length, 1, indices) + return [reord_m, reord_lc_key, reord_lc_val, reord_past_length] + + @torch.jit.export + def reset_state(self, state: List[Tensor], indices: Tensor) -> List[Tensor]: + m, lc_key, lc_val, past_length = state + m = m.index_fill(dim=2, index=indices, value=0.0) + lc_key = lc_key.index_fill(dim=2, index=indices, value=0.0) + lc_val = lc_val.index_fill(dim=2, index=indices, value=0.0) + past_length = past_length.index_fill(dim=1, index=indices, value=0) + + return [m, lc_key, lc_val, past_length] + + @torch.jit.export + def state_size(self) -> int: + return 4 + + @torch.jit.export + def batch_size_in_state( + self, state: Optional[List[Tensor]], sloppy: bool = True + ) -> Optional[int]: + if state is None: + return None + return state[0].size(2) + + def gen_summary_queries(self, input): + sum_input = self.memory_op(input) + return sum_input + + def _gen_right_context_padded_input(self, input): + # This function deals with input that is already + # padded with right context (e.g. 
minibatch training) + right_context_blocks = [] + T, B, D = input.shape + num_segs = math.ceil((T - self.right_context) / self.segment_size) + for i in range(0, num_segs - 1): + st = (i + 1) * self.segment_size + ed = st + self.right_context + assert ed < T + temp = input[st:ed, :, :] + right_context_blocks.append(temp) + + # last segment right context is already available + right_context_blocks.append(input[T - self.right_context :, :, :]) + return torch.cat(right_context_blocks, dim=0) + + def _gen_segs_right_context(self, input, lengths): + segments = [] + T, B, D = input.size() + nT = T - self.right_context + + # assume input is right context padded + num_segs = math.ceil(nT / self.segment_size) + # pad zeros to the utterance to make sure each + # segment has the same right context. For the + for i in range(0, num_segs - 1): + st = i * self.segment_size + ed = min(T, st + self.segment_size + self.right_context) + temp = input[st:ed, :, :] + rest_lengths = torch.clamp( + lengths - self.segment_size, min=0, max=nT - (i + 1) * self.segment_size + ) + segments.append((temp, lengths - rest_lengths + self.right_context)) + lengths = rest_lengths + + last_seg = input[st + self.segment_size :, :, :] + segments.append((last_seg, rest_lengths + self.right_context)) + + return segments + + @torch.jit.unused + def forward( + self, input: Tensor, padding_masks: Tensor, state: Optional[List[Tensor]] = None + ) -> Tuple[Tensor, Tensor, List[Tensor], List[Tensor]]: + # Xutai: originally the second argument is lengths. + lengths = (~padding_masks).sum(dim=1).long() + # mini batch training. + if self.mini_batches: + return self.forward_mini_batches(input, lengths, state) + + # regular full sequence training. Note, assume the right context in provided + # in the input. + T, B, D = input.size() + right_context_blocks = self._gen_right_context_padded_input(input) + + # generate the relative positional embedding + if self.use_rpe: + rpe = self._get_relative_position( + input=input, + max_relative_position=self.max_relative_position, + left_context_length=0, + past_length=0, + is_decoding=False, + ) + else: + rpe = None + input = input[: T - self.right_context, :, :] + + attention_mask = self._get_attention_mask(input) + + # firt layer use each segment mean as memory + # ignore the last one seg average + if self.use_mem: + mems = self.gen_summary_queries(input)[:-1, :, :] + else: + mems = torch.zeros(0, input.size(1), input.size(2), device=input.device) + mems = mems.type_as(input) + + output = input + all_outputs = [] + + for layer in self.layers: + output, mems, right_context_blocks, _, _ = layer( + input=output, + lengths=lengths, + attention_mask=attention_mask, + mems=mems, + right_context_blocks=right_context_blocks, + pre_mems=None, + left_context_key=None, + left_context_val=None, + rpe=rpe, + ) + all_outputs.append(output) + return output, padding_masks, [], all_outputs + + def forward_jit_mini_batch_init( + self, + seg: Tensor, + state: Optional[List[Tensor]] = None, + is_decoding: bool = False, + ): + # Prepare state. In whole sequence training, state is ignored. + # For minibatch training, we need to prepare state + if state is None: + state = self.init_state(batch_size=seg.size(1), device=seg.device) + if seg.dtype == torch.half: + state = [state[0].half(), state[1].half(), state[2].half(), state[3]] + + if self.use_mem: + # note input average only on seg, not on right context + # first layer use each segmetn mean as memory. 
the last + # one segment average is used in state + full_mems = self.gen_summary_queries(seg) + if is_decoding: + mems = full_mems[0:1, :, :] + state_mems = torch.cat([state[0][0], mems], dim=0) + else: + mems = full_mems[:-1, :, :] + state_mems = torch.cat([state[0][0], full_mems], dim=0) + else: + mems = state[0][0] + state_mems = mems + + # track processed segment number or memory number + # the same batch as the same bumber of past length + past_length = state[3][0][0].item() + past_left_context = min(past_length * self.segment_size, self.left_context) + past_length = min(self.max_memory_size, past_length) + + return state, mems, state_mems, past_length, past_left_context + + def state_update_before( + self, layer: int, state: List[Tensor], past_length: int, past_left_context: int + ): + pre_mems = state[0][layer][self.max_memory_size - past_length :, :, :] + lc_key = state[1][layer][self.left_context - past_left_context :, :, :] + lc_val = state[2][layer][self.left_context - past_left_context :, :, :] + return pre_mems, lc_key, lc_val + + def state_update_after( + self, + layer: int, + state: List[Tensor], + mems: Tensor, + next_key: Tensor, + next_val: Tensor, + mems_list: List[Tensor], + lc_key_list: List[Tensor], + lc_val_list: List[Tensor], + ): + # mems is used for next layer + if layer < self.num_layers - 1: + state_mems = torch.cat([state[0][layer + 1], mems], dim=0) + mems_list.append(state_mems[-self.max_memory_size :, :, :]) + + # when mems pass to next sequence, we need the last memory. when mems + # use for the next layer, we can ignore the last memory + mems = mems[:-1, :, :] + + # note state[1][i] and state[2][i] original length equals to self.left_context + new_k = torch.cat([state[1][layer], next_key], dim=0) + new_v = torch.cat([state[2][layer], next_val], dim=0) + lc_key_list.append(new_k[-self.left_context :, :, :]) + lc_val_list.append(new_v[-self.left_context :, :, :]) + return mems_list, lc_key_list, lc_val_list, mems + + def state_update_after_loop( + self, + state: List[Tensor], + mems_list: List[Tensor], + lc_key_list: List[Tensor], + lc_val_list: List[Tensor], + update_length: int, + ): + state[0] = torch.stack(mems_list, dim=0) + state[1] = torch.stack(lc_key_list, dim=0) + state[2] = torch.stack(lc_val_list, dim=0) + state[3] = state[3] + update_length + return state + + @torch.jit.unused + def forward_mini_batches( + self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None + ) -> Tuple[Tensor, Tensor, List[Tensor], List[Tensor]]: + T, B, D = input.size() + + # input without right context + seg = input[: T - self.right_context, :, :] + + # get right context blocks + right_context_blocks = self._gen_right_context_padded_input(input) + + mems_list = [] + lc_key_list = [] + lc_val_list = [] + results = self.forward_jit_mini_batch_init(seg, state, False) + state, mems, state_mems, past_length, past_left_context = results + + # relative position embedding + if self.use_rpe: + rpe = self._get_relative_position( + input=input, + max_relative_position=self.max_relative_position, + left_context_length=past_left_context, + past_length=past_length, + is_decoding=False, + ) + else: + rpe = None + + # get attention mask based on seg (not include right context) and available + # left context + attention_mask = self._get_attention_mask(seg, past_length, past_left_context) + mems_list.append(state_mems[-self.max_memory_size :, :, :]) + output = seg + i = 0 + all_outputs = [] + for layer in self.layers: + # In order to make cross stream batching work, mem, 
left context key + # and left context value in the state should always be the same shape. + # We use the past length to track the processed segment number. In this + # way, we take out the essential memory, left context key and left + # context val from the state. After finish the forward for current segment + # we add the new memory, left context key and left context value into the + # staate and trim out the oldest part to keep the shape consistent. + pre_mems, lc_key, lc_val = self.state_update_before( + i, state, past_length, past_left_context + ) + + output, mems, right_context_blocks, next_key, next_val = layer.forward( + input=output, + lengths=lengths, + attention_mask=attention_mask, + mems=mems, + right_context_blocks=right_context_blocks, + pre_mems=pre_mems, + left_context_key=lc_key, + left_context_val=lc_val, + rpe=rpe, + ) + all_outputs.append(output) + mems_list, lc_key_list, lc_val_list, mems = self.state_update_after( + layer=i, + state=state, + mems=mems, + next_key=next_key, + next_val=next_val, + mems_list=mems_list, + lc_key_list=lc_key_list, + lc_val_list=lc_val_list, + ) + + i += 1 + + # update state + update_length = math.ceil((T - self.right_context) / self.segment_size) + state = self.state_update_after_loop( + state=state, + mems_list=mems_list, + lc_key_list=lc_key_list, + lc_val_list=lc_val_list, + update_length=update_length, + ) + + return output, lengths, state, all_outputs + + def forward_jit_test( + self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None + ) -> Tuple[Tensor, Tensor, List[Tensor]]: + """ + This one simulate sequence encoder forward jit. This is for unit test purpose. + It is not used in training or decoding. Note, extra_right_context is set in + the model. In unit test, input = [utterance, right_context], lengths = + [utterance_length]. + args: + input: input utterance + lengths: utterance input length + state: None here. input is whole utterance + """ + # [TODO] sequence_to_segment has bug in lengths. + seg_src_tokens_lengths = self._gen_segs_right_context(input, lengths) + + seg_enc_tokens_lengths: List[Tuple[Tensor, Tensor]] = [] + state: Optional[List[Tensor]] = None + for seg_src_tokens, seg_src_lengths in seg_src_tokens_lengths: + seg_enc_tokens, seg_enc_lengths, state = self.forward_jit( + input=seg_src_tokens, lengths=seg_src_lengths, state=state + ) + seg_enc_tokens_lengths.append((seg_enc_tokens, seg_enc_lengths)) + + enc_tokens, enc_lengths = segments_to_sequence( + segments=seg_enc_tokens_lengths, time_axis=0 + ) + + state = [] # returns trivial state + + return enc_tokens, enc_lengths, state + + @torch.jit.export + def forward_jit( + self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None + ) -> Tuple[Tensor, Tensor, List[Tensor]]: + """ + Forward helper for online decoding. + + args: + input: [seg, right_context]. We assume in online we + always padding the right context to the preset right context size. + For the last segment, we may have short segment size, but right + context size is the same as other segments + lengths: utterance input length is the utterance segment length and + right context size + state: [memory, left_context_key, left_context_val]. 
To improve throughput, + in addition to memory, we also cache key and value for left_context in + multihead self-attention + """ + # In online decoding, input = [segment, right_context] + # Lengths = [segment_length, right_context_length] + # so we need strip right context in output + T, B, D = input.size() + rc_str = T - self.right_context + rc_end = T + right_context_blocks = input[rc_str:rc_end, :, :] + seg = input[:rc_str, :, :] + lengths = torch.clamp(lengths - self.right_context, min=0) + mems_list = [] + lc_key_list = [] + lc_val_list = [] + + results = self.forward_jit_mini_batch_init(seg, state, True) + state, mems, state_mems, past_length, past_left_context = results + + # relative position embedding + if self.use_rpe: + rpe = self._get_relative_position( + input=input, + max_relative_position=self.max_relative_position, + left_context_length=past_left_context, + past_length=past_length, + is_decoding=True, + ) + else: + rpe = None + + # memory for first layer. + mems_list.append(state_mems[-self.max_memory_size :, :, :]) + output = seg + i = 0 + for layer in self.layers: + # In order to make cross stream batching work, mem, left context key + # and left context value in the state should always be the same shape. + # We use the past length to track the processed segment number. In this + # way, we take out the essential memory, left context key and left + # context val from the state. After finish the forward for current segment + # we add the new memory, left context key and left context value into the + # staate and trim out the oldest part to keep the shape consistent. + true_mems, lc_key, lc_val = self.state_update_before( + layer=i, + state=state, + past_length=past_length, + past_left_context=past_left_context, + ) + + output, mems, right_context_blocks, next_key, next_val = layer.forward_jit( + input=output, + lengths=lengths, + mems=true_mems, + right_context_blocks=right_context_blocks, + left_context_key=lc_key, + left_context_val=lc_val, + rpe=rpe, + ) + # mems is used for next layer + mems_list, lc_key_list, lc_val_list, _ = self.state_update_after( + layer=i, + state=state, + mems_list=mems_list, + mems=mems, + next_key=next_key, + next_val=next_val, + lc_key_list=lc_key_list, + lc_val_list=lc_val_list, + ) + i += 1 + + # update state + state = self.state_update_after_loop( + state=state, + mems_list=mems_list, + lc_key_list=lc_key_list, + lc_val_list=lc_val_list, + update_length=1, + ) + + return output, lengths, state + + def quantize_(self, params=None): + if params and "per_channel" in params and params["per_channel"]: + qconfig = per_channel_dynamic_qconfig + else: + qconfig = default_dynamic_qconfig + quantization.quantize_dynamic( + self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True + ) + return self + + +# ------------------------------------------------------------------------------ +# Emformer encoder for seq2seq model +# This is a wrapper over the original emformer +# ------------------------------------------------------------------------------ +def emformer_encoder(klass): + class SpeechEncoder(klass): + def __init__(self, args): + super().__init__(args) + stride = SpeechEncoder.conv_layer_stride(args) + trf_left_context = args.segment_left_context // stride + trf_right_context = args.segment_right_context // stride + context_config = [trf_left_context, trf_right_context] + self.transformer_layers = nn.ModuleList( + [ + NoSegAugmentedMemoryTransformerEncoderLayer( + input_dim=args.encoder_embed_dim, + 
num_heads=args.encoder_attention_heads, + ffn_dim=args.encoder_ffn_embed_dim, + num_layers=args.encoder_layers, + dropout_in_attn=args.dropout, + dropout_on_attn=args.dropout, + dropout_on_fc1=args.dropout, + dropout_on_fc2=args.dropout, + activation_fn=args.activation_fn, + context_config=context_config, + segment_size=args.segment_length, + max_memory_size=args.max_memory_size, + scaled_init=True, # TODO: use constant for now. + tanh_on_mem=args.amtrf_tanh_on_mem, + ) + ] + ) + + def forward(self, src_tokens, src_lengths): + encoder_out = super().forward(src_tokens, src_lengths) + output = encoder_out["encoder_out"][0] + encoder_padding_masks = encoder_out["encoder_padding_mask"][0] + + # This is because that in the original implementation + # the output didn't consider the last segment as right context. + encoder_padding_masks = encoder_padding_masks[:, : output.size(0)] + + return { + "encoder_out": [output], + "encoder_padding_mask": [encoder_padding_masks], + "encoder_embedding": [], + "encoder_states": [], + "src_tokens": [], + "src_lengths": [], + } + + @staticmethod + def conv_layer_stride(args): + # TODO: make it configurable from the args + return 4 + + SpeechEncoder.__name__ = klass.__name__ + return SpeechEncoder diff --git a/fairseq/models/speech_to_text/multi_modality_model.py b/fairseq/models/speech_to_text/multi_modality_model.py new file mode 100644 index 0000000000..046421620a --- /dev/null +++ b/fairseq/models/speech_to_text/multi_modality_model.py @@ -0,0 +1,49 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.models import FairseqDecoder, FairseqEncoder + + +# a container for different encoders with training samples from different modality +# each time, only one encoder is selected +class MultiModalityEncoder(FairseqEncoder): + def __init__(self, dictionary): + super().__init__(dictionary) + + def select_encoder(self, mode, **kwargs): + raise NotImplementedError("Model must implement the select_encoder method") + return None, kwargs + + # def post_encoder(self, encoder_out, src_tokens, src_lengths, mode, **kwargs): + # # Default do nothing + # return encoder_out + + # get sample data from JointSpeechTextDataset + def forward(self, src_tokens, src_lengths=None, mode="", **kwargs): + encoder, kwargs = self.select_encoder(mode, **kwargs) + # return self.post_encoder(encoder(src_tokens, src_lengths, **kwargs), src_tokens, src_lengths, mode, **kwargs) + return encoder(src_tokens, src_lengths, **kwargs) + + +# a container for different decoders with training samples from different modality +# each time, only one decoder is selected +class MultiInputDecoder(FairseqDecoder): + def __init__(self, dictionary): + super().__init__(dictionary) + + def select_decoder(self, mode, **kwargs): + raise NotImplementedError("Model must implement the select_decoder method") + return None, kwargs + + def forward( + self, prev_output_tokens, encoder_out, incremental_state=None, mode="", **kwargs + ): + decoder, kwargs = self.select_decoder(mode, **kwargs) + return decoder( + prev_output_tokens, + encoder_out, + incremental_state=incremental_state, + **kwargs + ) diff --git a/fairseq/models/speech_to_text/s2t_conformer.py b/fairseq/models/speech_to_text/s2t_conformer.py new file mode 100644 index 0000000000..79dbbec1b9 --- /dev/null +++ b/fairseq/models/speech_to_text/s2t_conformer.py @@ -0,0 +1,234 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import math +from pathlib import Path + +import torch + +from fairseq import checkpoint_utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import FairseqEncoder, register_model, register_model_architecture +from fairseq.models.speech_to_text.modules.convolution import ( + Conv1dSubsampler, + Conv2dSubsampler, +) +from fairseq.models.speech_to_text.s2t_transformer import ( + S2TTransformerEncoder, + S2TTransformerModel, +) +from fairseq.models.speech_to_text.s2t_transformer import ( + base_architecture as transformer_base_architecture, +) +from fairseq.modules import PositionalEmbedding, RelPositionalEncoding +from fairseq.modules.conformer_layer import ConformerEncoderLayer + +logger = logging.getLogger(__name__) + + +class S2TConformerEncoder(FairseqEncoder): + """Conformer Encoder for speech translation based on https://arxiv.org/abs/2005.08100""" + + def __init__(self, args): + super().__init__(None) + + self.encoder_freezing_updates = args.encoder_freezing_updates + self.num_updates = 0 + + self.embed_scale = math.sqrt(args.encoder_embed_dim) + if args.no_scale_embedding: + self.embed_scale = 1.0 + self.padding_idx = 1 + self.conv_version = args.conv_version + if self.conv_version == "s2t_transformer": + self.subsample = Conv1dSubsampler( + args.input_feat_per_channel * args.input_channels, + args.conv_channels, + args.encoder_embed_dim, + [int(k) for k in args.conv_kernel_sizes.split(",")], + ) + elif self.conv_version == "convtransformer": + self.subsample = Conv2dSubsampler( + args.input_channels, + args.input_feat_per_channel, + args.conv_out_channels, + args.encoder_embed_dim, + ) + self.pos_enc_type = args.pos_enc_type + if self.pos_enc_type == "rel_pos": + self.embed_positions = RelPositionalEncoding( + args.max_source_positions, args.encoder_embed_dim + ) + elif self.pos_enc_type == "rope": + self.embed_positions = None + else: # Use absolute positional embedding + self.pos_enc_type = "abs" + self.embed_positions = PositionalEmbedding( + args.max_source_positions, args.encoder_embed_dim, self.padding_idx + ) + + self.linear = torch.nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim) + self.dropout = torch.nn.Dropout(args.dropout) + self.conformer_layers = torch.nn.ModuleList( + [ + ConformerEncoderLayer( + embed_dim=args.encoder_embed_dim, + ffn_embed_dim=args.encoder_ffn_embed_dim, + attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, + attn_type=args.attn_type, + pos_enc_type=self.pos_enc_type, + use_fp16=args.fp16, + ) + for _ in range(args.encoder_layers) + ] + ) + + def _forward(self, src_tokens, src_lengths, return_all_hiddens=False): + """ + Args: + src_tokens: Input source tokens Tensor of shape B X T X C + src_lengths: Lengths Tensor corresponding to input source tokens + return_all_hiddens: If true will append the self attention states to the encoder states + Returns: + encoder_out: Tensor of shape B X T X C + encoder_padding_mask: Optional Tensor with mask + encoder_embedding: Optional Tensor. Always empty here + encoder_states: List of Optional Tensors wih self attention states + src_tokens: Optional Tensor. Always empty here + src_lengths: Optional Tensor. 
Always empty here + """ + x, input_lengths = self.subsample(src_tokens, src_lengths) # returns T X B X C + encoder_padding_mask = lengths_to_padding_mask(input_lengths) + x = self.embed_scale * x + if self.pos_enc_type == "rel_pos": + positions = self.embed_positions(x) + + elif self.pos_enc_type == "rope": + positions = None + + else: + positions = self.embed_positions(encoder_padding_mask).transpose(0, 1) + x += positions + positions = None + + x = self.linear(x) + x = self.dropout(x) + encoder_states = [] + + # x is T X B X C + for layer in self.conformer_layers: + x, _ = layer(x, encoder_padding_mask, positions) + if return_all_hiddens: + encoder_states.append(x) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask] + if encoder_padding_mask.any() + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + def forward(self, src_tokens, src_lengths, return_all_hiddens=False): + if self.num_updates < self.encoder_freezing_updates: + with torch.no_grad(): + x = self._forward( + src_tokens, + src_lengths, + return_all_hiddens=return_all_hiddens, + ) + else: + x = self._forward( + src_tokens, + src_lengths, + return_all_hiddens=return_all_hiddens, + ) + return x + + def reorder_encoder_out(self, encoder_out, new_order): + """Required method for a FairseqEncoder. Calls the method from the parent class""" + return S2TTransformerEncoder.reorder_encoder_out(self, encoder_out, new_order) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self.num_updates = num_updates + + +@register_model("s2t_conformer") +class S2TConformerModel(S2TTransformerModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + S2TTransformerModel.add_args(parser) + parser.add_argument( + "--input-feat-per-channel", + type=int, + metavar="N", + help="dimension of input features per channel", + ) + parser.add_argument( + "--input-channels", + type=int, + metavar="N", + help="number of chennels of input features", + ) + parser.add_argument( + "--depthwise-conv-kernel-size", + type=int, + metavar="N", + help="kernel size of depthwise convolution layers", + ) + parser.add_argument( + "--attn-type", + type=str, + metavar="STR", + help="If not specified uses fairseq MHA. 
Other valid option is espnet", + ) + parser.add_argument( + "--pos-enc-type", + type=str, + metavar="STR", + help="Must be specified in addition to attn-type=espnet for rel_pos and rope", + ) + + @classmethod + def build_encoder(cls, args): + encoder = S2TConformerEncoder(args) + pretraining_path = getattr(args, "load_pretrained_encoder_from", None) + if pretraining_path is not None: + if not Path(pretraining_path).exists(): + logger.warning( + f"skipped pretraining because {pretraining_path} does not exist" + ) + else: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=pretraining_path + ) + logger.info(f"loaded pretrained encoder from: {pretraining_path}") + return encoder + + +@register_model_architecture("s2t_conformer", "s2t_conformer") +def conformer_base_architecture(args): + args.attn_type = getattr(args, "attn_type", None) + args.pos_enc_type = getattr(args, "pos_enc_type", "abs") + args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80) + args.input_channels = getattr(args, "input_channels", 1) + args.max_source_positions = getattr(args, "max_source_positions", 6000) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.dropout = getattr(args, "dropout", 0.1) + args.encoder_layers = getattr(args, "encoder_layers", 16) + args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31) + transformer_base_architecture(args) diff --git a/fairseq/models/speech_to_text/s2t_transformer.py b/fairseq/models/speech_to_text/s2t_transformer.py index fc2f14fea6..50fae2ffa2 100644 --- a/fairseq/models/speech_to_text/s2t_transformer.py +++ b/fairseq/models/speech_to_text/s2t_transformer.py @@ -1,11 +1,17 @@ -#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
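# --- Editor's note (not part of the diff): a minimal, self-contained sketch of the
# pattern used by conformer_base_architecture() above. Every hyperparameter is filled
# in with getattr(args, name, default), so values already present on the args
# namespace (set on the command line or by a more specific named architecture) win,
# and only missing fields fall back to the registered defaults. The helper and the
# small default dict below are illustrative only, not fairseq API.
from argparse import Namespace


def apply_arch_defaults(args: Namespace, defaults: dict) -> Namespace:
    for name, value in defaults.items():
        # keep any value the user (or a child architecture) already set
        setattr(args, name, getattr(args, name, value))
    return args


if __name__ == "__main__":
    args = Namespace(encoder_layers=12)  # pretend this came from the CLI
    apply_arch_defaults(
        args,
        {
            "encoder_layers": 16,              # ignored: already set to 12
            "encoder_embed_dim": 256,
            "depthwise_conv_kernel_size": 31,
            "pos_enc_type": "abs",
        },
    )
    print(args.encoder_layers, args.encoder_embed_dim, args.pos_enc_type)  # 12 256 abs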
import logging import math +from pathlib import Path from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn +from torch import Tensor + from fairseq import checkpoint_utils, utils from fairseq.data.data_utils import lengths_to_padding_mask from fairseq.models import ( @@ -14,7 +20,11 @@ register_model, register_model_architecture, ) -from fairseq.models.fairseq_encoder import EncoderOut +from fairseq.models.speech_to_text.hub_interface import S2THubInterface +from fairseq.models.speech_to_text.modules.convolution import ( + Conv1dSubsampler, + Conv2dSubsampler, +) from fairseq.models.transformer import Embedding, TransformerDecoder from fairseq.modules import ( FairseqDropout, @@ -22,61 +32,10 @@ PositionalEmbedding, TransformerEncoderLayer, ) -from torch import Tensor - logger = logging.getLogger(__name__) -class Conv1dSubsampler(nn.Module): - """Convolutional subsampler: a stack of 1D convolution (along temporal - dimension) followed by non-linear activation via gated linear units - (https://arxiv.org/abs/1911.08460) - - Args: - in_channels (int): the number of input channels - mid_channels (int): the number of intermediate channels - out_channels (int): the number of output channels - kernel_sizes (List[int]): the kernel size for each convolutional layer - """ - - def __init__( - self, - in_channels: int, - mid_channels: int, - out_channels: int, - kernel_sizes: List[int] = (3, 3), - ): - super(Conv1dSubsampler, self).__init__() - self.n_layers = len(kernel_sizes) - self.conv_layers = nn.ModuleList( - nn.Conv1d( - in_channels if i == 0 else mid_channels // 2, - mid_channels if i < self.n_layers - 1 else out_channels * 2, - k, - stride=2, - padding=k // 2, - ) - for i, k in enumerate(kernel_sizes) - ) - - def get_out_seq_lens_tensor(self, in_seq_lens_tensor): - out = in_seq_lens_tensor.clone() - for _ in range(self.n_layers): - out = ((out.float() - 1) / 2 + 1).floor().long() - return out - - def forward(self, src_tokens, src_lengths): - bsz, in_seq_len, _ = src_tokens.size() # B x T x (C x D) - x = src_tokens.transpose(1, 2).contiguous() # -> B x (C x D) x T - for conv in self.conv_layers: - x = conv(x) - x = nn.functional.glu(x, dim=1) - _, _, out_seq_len = x.size() - x = x.transpose(1, 2).transpose(0, 1).contiguous() # -> T x B x (C x D) - return x, self.get_out_seq_lens_tensor(src_lengths) - - @register_model("s2t_transformer") class S2TTransformerModel(FairseqEncoderDecoderModel): """Adapted Transformer model (https://arxiv.org/abs/1706.03762) for @@ -85,6 +44,37 @@ class S2TTransformerModel(FairseqEncoderDecoderModel): project inputs into the encoder dimension as well as downsample input sequence for computational efficiency.""" + @classmethod + def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t" + model_ids = [ + "s2t_transformer_s-en-asr-librispeech", + "s2t_transformer_m-en-asr-librispeech", + "s2t_transformer_l-en-asr-librispeech", + ] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + config_yaml="config.yaml", + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + config_yaml=config_yaml, + **kwargs, + ) + return S2THubInterface(x["args"], x["task"], x["models"][0]) + def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @@ -95,14 +85,27 @@ def 
add_args(parser): parser.add_argument( "--conv-kernel-sizes", type=str, - metavar="N", - help="kernel sizes of Conv1d subsampling layers", + metavar="STR", + help="kernel sizes of Conv1d (s2t_transformer) subsampling layers", ) parser.add_argument( "--conv-channels", type=int, metavar="N", - help="# of channels in Conv1d subsampling layers", + help="# of channels in Conv1d (s2t_transformer) subsampling layers", + ) + parser.add_argument( + "--conv-out-channels", + type=int, + metavar="N", + help="# of channels in Conv2d (convtransformer) subsampling layers", + ) + parser.add_argument( + "--conv-version", + type=str, + default="s2t_transformer", + choices=["s2t_transformer", "convtransformer"], + help="version of frontend convolutional layers", ) # Transformer parser.add_argument( @@ -201,18 +204,27 @@ def add_args(parser): metavar="STR", help="model to take encoder weights from (for initialization)", ) + parser.add_argument( + "--encoder-freezing-updates", + type=int, + metavar="N", + help="freeze encoder for first N updates", + ) @classmethod def build_encoder(cls, args): encoder = S2TTransformerEncoder(args) - if getattr(args, "load_pretrained_encoder_from", None): - encoder = checkpoint_utils.load_pretrained_component_from_model( - component=encoder, checkpoint=args.load_pretrained_encoder_from - ) - logger.info( - f"loaded pretrained encoder from: " - f"{args.load_pretrained_encoder_from}" - ) + pretraining_path = getattr(args, "load_pretrained_encoder_from", None) + if pretraining_path is not None: + if not Path(pretraining_path).exists(): + logger.warning( + f"skipped pretraining because {pretraining_path} does not exist" + ) + else: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=pretraining_path + ) + logger.info(f"loaded pretrained encoder from: {pretraining_path}") return encoder @classmethod @@ -234,6 +246,7 @@ def build_embedding(dictionary, embed_dim): decoder_embed_tokens = build_embedding( task.target_dictionary, args.decoder_embed_dim ) + args.tgt_dict_size = len(task.target_dictionary) encoder = cls.build_encoder(args) decoder = cls.build_decoder(args, task, decoder_embed_tokens) return cls(encoder, decoder) @@ -249,11 +262,28 @@ def get_normalized_probs( lprobs.batch_first = True return lprobs + def get_ctc_target(self, sample: Optional[Dict[str, Tensor]]): + return sample["target"], sample["target_lengths"] + + def get_ctc_output( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + sample: Optional[Dict[str, Tensor]], + ): + encoder_out = net_output[1]["encoder_out"]["encoder_out"][0] + logits = self.encoder.ctc_proj(encoder_out) # T x B x C + out = utils.log_softmax(logits.float(), dim=-1) + padding_mask = net_output[1]["encoder_out"]["encoder_padding_mask"] + lens = out.new_full((out.shape[1],), out.shape[0]).long() + if len(padding_mask) > 0: + lens -= padding_mask[0].sum(dim=-1) + return out, lens + def forward(self, src_tokens, src_lengths, prev_output_tokens): """ The forward method inherited from the base class has a **kwargs argument in its input, which is not supported in torchscript. This - method overrites the forward method definition without **kwargs. + method overwrites the forward method definition without **kwargs. 
""" encoder_out = self.encoder(src_tokens=src_tokens, src_lengths=src_lengths) decoder_out = self.decoder( @@ -269,6 +299,9 @@ class S2TTransformerEncoder(FairseqEncoder): def __init__(self, args): super().__init__(None) + self.encoder_freezing_updates = args.encoder_freezing_updates + self.num_updates = 0 + self.dropout_module = FairseqDropout( p=args.dropout, module_name=self.__class__.__name__ ) @@ -277,12 +310,21 @@ def __init__(self, args): self.embed_scale = 1.0 self.padding_idx = 1 - self.subsample = Conv1dSubsampler( - args.input_feat_per_channel * args.input_channels, - args.conv_channels, - args.encoder_embed_dim, - [int(k) for k in args.conv_kernel_sizes.split(",")], - ) + self.conv_version = args.conv_version + if self.conv_version == "s2t_transformer": + self.subsample = Conv1dSubsampler( + args.input_feat_per_channel * args.input_channels, + args.conv_channels, + args.encoder_embed_dim, + [int(k) for k in args.conv_kernel_sizes.split(",")], + ) + elif self.conv_version == "convtransformer": + self.subsample = Conv2dSubsampler( + args.input_channels, + args.input_feat_per_channel, + args.conv_out_channels, + args.encoder_embed_dim, + ) self.embed_positions = PositionalEmbedding( args.max_source_positions, args.encoder_embed_dim, self.padding_idx @@ -296,7 +338,11 @@ def __init__(self, args): else: self.layer_norm = None - def forward(self, src_tokens, src_lengths): + self.ctc_proj = None + if getattr(args, "ctc_weight", 0.0) > 0.0: + self.ctc_proj = nn.Linear(args.encoder_embed_dim, args.tgt_dict_size) + + def _forward(self, src_tokens, src_lengths, return_all_hiddens=False): x, input_lengths = self.subsample(src_tokens, src_lengths) x = self.embed_scale * x @@ -305,73 +351,87 @@ def forward(self, src_tokens, src_lengths): x += positions x = self.dropout_module(x) + encoder_states = [] + for layer in self.transformer_layers: x = layer(x, encoder_padding_mask) - - if not encoder_padding_mask.any(): - encoder_padding_mask = None + if return_all_hiddens: + encoder_states.append(x) if self.layer_norm is not None: x = self.layer_norm(x) - return EncoderOut( - encoder_out=x, - encoder_padding_mask=encoder_padding_mask, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - @torch.jit.export - def reorder_encoder_out(self, encoder_out: EncoderOut, new_order): - """ - Since encoder_padding_mask and encoder_embedding are both of type - Optional[Tensor] in EncoderOut, they need to be copied as local - variables for Torchscript Optional refinement - """ - - encoder_padding_mask: Optional[Tensor] = encoder_out.encoder_padding_mask - encoder_embedding: Optional[Tensor] = encoder_out.encoder_embedding + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask] + if encoder_padding_mask.any() + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + def forward(self, src_tokens, src_lengths, return_all_hiddens=False): + if self.num_updates < self.encoder_freezing_updates: + with torch.no_grad(): + x = self._forward( + src_tokens, src_lengths, return_all_hiddens=return_all_hiddens + ) + else: + x = self._forward( + src_tokens, src_lengths, return_all_hiddens=return_all_hiddens + ) + return x + def reorder_encoder_out(self, encoder_out, new_order): new_encoder_out = ( - encoder_out.encoder_out - if encoder_out.encoder_out is None - else encoder_out.encoder_out.index_select(1, new_order) + [] + if 
len(encoder_out["encoder_out"]) == 0 + else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]] ) new_encoder_padding_mask = ( - encoder_padding_mask - if encoder_padding_mask is None - else encoder_padding_mask.index_select(0, new_order) + [] + if len(encoder_out["encoder_padding_mask"]) == 0 + else [ + x.index_select(0, new_order) + for x in encoder_out["encoder_padding_mask"] + ] ) new_encoder_embedding = ( - encoder_embedding - if encoder_embedding is None - else encoder_embedding.index_select(0, new_order) + [] + if len(encoder_out["encoder_embedding"]) == 0 + else [ + x.index_select(0, new_order) for x in encoder_out["encoder_embedding"] + ] ) - encoder_states = encoder_out.encoder_states - if encoder_states is not None: + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: for idx, state in enumerate(encoder_states): encoder_states[idx] = state.index_select(1, new_order) - return EncoderOut( - encoder_out=new_encoder_out, # T x B x C - encoder_padding_mask=new_encoder_padding_mask, # B x T - encoder_embedding=new_encoder_embedding, # B x T x C - encoder_states=encoder_states, # List[T x B x C] - src_tokens=None, - src_lengths=None, - ) + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], # B x T + "src_lengths": [], # B x 1 + } + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self.num_updates = num_updates class TransformerDecoderScriptable(TransformerDecoder): def extract_features( self, prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, full_context_alignment: bool = False, alignment_layer: Optional[int] = None, @@ -386,14 +446,19 @@ def extract_features( alignment_layer, alignment_heads, ) - return x, None + extra = {"encoder_out": encoder_out} if incremental_state is None else None + return x, extra @register_model_architecture(model_name="s2t_transformer", arch_name="s2t_transformer") def base_architecture(args): + args.encoder_freezing_updates = getattr(args, "encoder_freezing_updates", 0) # Convolutional subsampler - args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5") - args.conv_channels = getattr(args, "conv_channels", 1024) + args.input_channels = getattr(args, "input_channels", 1) + args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5") # for Conv1d + args.conv_channels = getattr(args, "conv_channels", 1024) # for Conv1d + args.conv_out_channels = getattr(args, "conv_out_channels", 256) # for Conv2d + args.conv_version = getattr(args, "conv_version", "s2t_transformer") # Transformer args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) @@ -415,7 +480,7 @@ def base_architecture(args): args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", True + args, "share_decoder_input_output_embed", False ) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False @@ -440,6 +505,15 @@ def s2t_transformer_s(args): base_architecture(args) 
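# --- Editor's note (not part of the diff): the hunks above replace the old EncoderOut
# NamedTuple with a plain dict of lists ("encoder_out", "encoder_padding_mask", ...),
# and reorder_encoder_out() shuffles the batch dimension with index_select so beam
# search can reorder or duplicate hypotheses. A runnable toy version of that
# convention is sketched below; tensor shapes follow the T x B x C / B x T comments
# in the code, and the function name is ours, not fairseq's.
import torch


def reorder_toy_encoder_out(encoder_out: dict, new_order: torch.Tensor) -> dict:
    return {
        # list of T x B x C tensors -> batch is dim 1
        "encoder_out": [
            x.index_select(1, new_order) for x in encoder_out["encoder_out"]
        ],
        # list of B x T masks -> batch is dim 0
        "encoder_padding_mask": [
            x.index_select(0, new_order) for x in encoder_out["encoder_padding_mask"]
        ],
        "encoder_embedding": [],
        "encoder_states": [
            x.index_select(1, new_order) for x in encoder_out["encoder_states"]
        ],
        "src_tokens": [],
        "src_lengths": [],
    }


if __name__ == "__main__":
    T, B, C = 7, 3, 8
    out = {
        "encoder_out": [torch.randn(T, B, C)],
        "encoder_padding_mask": [torch.zeros(B, T, dtype=torch.bool)],
        "encoder_embedding": [],
        "encoder_states": [],
        "src_tokens": [],
        "src_lengths": [],
    }
    # e.g. beam search keeps sentence 0 twice and drops sentence 2
    new_order = torch.tensor([0, 0, 1])
    print(reorder_toy_encoder_out(out, new_order)["encoder_out"][0].shape)
    # -> torch.Size([7, 3, 8])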
+@register_model_architecture("s2t_transformer", "s2t_transformer_xs") +def s2t_transformer_xs(args): + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.decoder_layers = getattr(args, "decoder_layers", 3) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 4) + args.dropout = getattr(args, "dropout", 0.3) + s2t_transformer_s(args) + + @register_model_architecture("s2t_transformer", "s2t_transformer_sp") def s2t_transformer_sp(args): args.encoder_layers = getattr(args, "encoder_layers", 16) diff --git a/fairseq/models/speech_to_text/s2t_wav_transformer.py b/fairseq/models/speech_to_text/s2t_wav_transformer.py new file mode 100644 index 0000000000..ad21aeeb1a --- /dev/null +++ b/fairseq/models/speech_to_text/s2t_wav_transformer.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 + +import math + +import torch +import torch.nn as nn + +from fairseq.data.data_utils import compute_mask_indices +from fairseq.models import FairseqEncoder +from fairseq.models.wav2vec import ConvFeatureExtractionModel +from fairseq.modules import GradMultiply, LayerNorm, SamePad, TransformerEncoderLayer + + +# Transformer encoder with wave input, it is adopted from wav2vec 2.0 Encoder. +# use wav input +# use trained position embedding so it is easier to match with text input +class SpeechWavTransformerEncoder(FairseqEncoder): + + # extra parameters for speech encoder besides those defined in transformermodel + @staticmethod + def add_args(parser): + parser.add_argument( + "--dropout-input", + type=float, + metavar="D", + help="dropout to apply to the input (after feat extr)", + ) + parser.add_argument( + "--dropout-features", + type=float, + metavar="D", + help="dropout to apply to the unmasked features (after feat extr)", + ) + parser.add_argument( + "--speech-extractor-mode", + type=str, + default="layer_norm", + choices=["default", "layer_norm"], + help="feature extractor norm", + ) + + parser.add_argument( + "--speech-conv-bias", + action="store_true", + help="include bias in speech conv encoder", + ) + + parser.add_argument( + "--conv-feature-layers", + default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", + help="string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]", + ) + + parser.add_argument( + "--speech-mask-length", + type=int, + help="repeat the mask indices multiple times", + ) + + parser.add_argument( + "--speech-mask-prob", + type=float, + help="probability of replacing a token with mask", + ) + + parser.add_argument( + "--speech-mask-selection", + type=str, + choices=["static", "uniform", "normal", "poisson"], + help="how to choose masks", + ) + + parser.add_argument( + "--speech-mask-other", + type=float, + help="stdev of the mask length in case of 'normal' selection strategy", + ) + + parser.add_argument( + "--speech-no-mask-overlap", + action="store_true", + help="whether to allow masks to overlap", + ) + + parser.add_argument( + "--speech-mask-min-space", + type=int, + help="min space between spans (if no overlap is enabled)", + ) + + parser.add_argument( + "--speech-mask-channel-length", + type=int, + help="repeat the mask indices multiple times", + ) + + parser.add_argument( + "--speech-mask-channel-prob", + type=float, + help="probability of replacing a token with mask", + ) + + parser.add_argument( + "--speech-mask-channel-selection", + type=str, + choices=["static", "uniform", "normal", "poisson"], + help="how to choose masks", + ) + + parser.add_argument( + 
"--speech-mask-channel-other", + type=float, + help="stdev of the mask length in case of 'normal' selection strategy", + ) + + parser.add_argument( + "--speech-no-mask-channel-overlap", + action="store_true", + help="whether to allow masks to overlap", + ) + + parser.add_argument( + "--no-scale-feature", + action="store_true", + help="no scale for the calculated features", + ) + + parser.add_argument( + "--speech-mask-channel-min-space", + type=int, + help="min space between spans (if no overlap is enabled)", + ) + + parser.add_argument( + "--feature-grad-mult", + type=float, + help="reset feature grad mult in wav2vec 2.0 to this", + ) + + # positional embeddings + parser.add_argument( + "--conv-pos", + type=int, + default=128, + help="number of filters for convolutional positional embeddings", + ) + + parser.add_argument( + "--conv-pos-groups", + type=int, + default=16, + help="number of groups for convolutional positional embedding", + ) + # model configures + parser.add_argument( + "--speech-encoder-layers", + type=int, + help="number of speech encoder layers", + ) + parser.add_argument( + "--text-encoder-layers", + type=int, + help="number of text encoder layers", + ) + + def __init__(self, args, alway_mask=False): + super().__init__(args) + self.args = args + self.dropout = args.dropout + self.embedding_dim = args.encoder_embed_dim + self.feat_scale = math.sqrt(args.encoder_embed_dim) + if args.no_scale_feature: + self.feat_scale = 1.0 + + subsample = ConvFeatureExtractionModel( + conv_layers=eval(args.conv_feature_layers), + dropout=0.0, + mode=args.speech_extractor_mode, # default, layer_norm + conv_bias=args.speech_conv_bias, + ) + self.feature_enc_layers = eval(args.conv_feature_layers) + self.subsample = subsample + self.feat_proj = ( + nn.Linear(self.feature_enc_layers[-1][0], self.embedding_dim) + if self.feature_enc_layers[-1][0] != self.embedding_dim + else None + ) + + self.feat_layer_norm = LayerNorm(self.feature_enc_layers[-1][0]) + + self.embed_positions = nn.Conv1d( + self.embedding_dim, + self.embedding_dim, + kernel_size=args.conv_pos, + padding=args.conv_pos // 2, + groups=args.conv_pos_groups, + ) + std = math.sqrt(4 / (args.conv_pos * self.embedding_dim)) + nn.init.normal_(self.embed_positions.weight, mean=0, std=std) + nn.init.constant_(self.embed_positions.bias, 0) + + self.embed_positions = nn.utils.weight_norm( + self.embed_positions, name="weight", dim=2 + ) + self.embed_positions = nn.Sequential( + self.embed_positions, SamePad(args.conv_pos), nn.GELU() + ) + + self.mask_prob = args.speech_mask_prob + self.mask_selection = args.speech_mask_selection + self.mask_other = args.speech_mask_other + self.mask_length = args.speech_mask_length + self.no_mask_overlap = args.speech_no_mask_overlap + self.mask_min_space = args.speech_mask_min_space + + self.mask_channel_prob = args.speech_mask_channel_prob + self.mask_channel_selection = args.speech_mask_channel_selection + self.mask_channel_other = args.speech_mask_channel_other + self.mask_channel_length = args.speech_mask_channel_length + self.no_mask_channel_overlap = args.speech_no_mask_channel_overlap + self.mask_channel_min_space = args.speech_mask_channel_min_space + + self.dropout_input = nn.Dropout(args.dropout_input) + self.dropout_features = nn.Dropout(args.dropout_features) + + self.feature_grad_mult = args.feature_grad_mult + + self.mask_emb = nn.Parameter( + torch.FloatTensor(args.encoder_embed_dim).uniform_() + ) + + self.layers = nn.ModuleList( + [TransformerEncoderLayer(args) for _ in 
range(args.encoder_layers)] + ) + self.layer_norm = LayerNorm(args.encoder_embed_dim) + self.normalize_before = args.encoder_normalize_before + self.alway_mask = alway_mask + + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + for i in range(len(self.feature_enc_layers)): + input_lengths = _conv_out_length( + input_lengths, + self.feature_enc_layers[i][1], + self.feature_enc_layers[i][2], + ) + + return input_lengths.to(torch.long) + + def apply_mask(self, x, padding_mask): + B, T, C = x.shape + if self.mask_prob > 0: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x[mask_indices] = self.mask_emb + else: + mask_indices = None + + if self.mask_channel_prob > 0: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + return x, mask_indices + + def forward( + self, + src_tokens, + src_lengths, + return_all_hiddens=False, + padding_mask=None, + features_only=True, + ): + mask = self.training or self.alway_mask + if self.feature_grad_mult > 0 and self.training: + features = self.subsample(src_tokens) + if self.feature_grad_mult != 1.0: + features = GradMultiply.apply(features, self.feature_grad_mult) + else: + with torch.no_grad(): + features = self.subsample(src_tokens) + features = features.transpose(1, 2) + features = self.feat_layer_norm(features) + if self.feat_proj is not None: + features = self.feat_proj(features) + + if padding_mask is not None: + input_lengths = (1 - padding_mask.long()).sum(-1) + else: + input_lengths = src_lengths + # apply conv formula to get real output_lengths + output_lengths = self._get_feat_extract_output_lengths(input_lengths) + + padding_mask = torch.zeros( + features.shape[:2], dtype=features.dtype, device=features.device + ) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() + + features = self.feat_scale * features if self.feat_scale != 1.0 else features + unmasked_features = features.clone() + + features = self.dropout_input(features) + unmasked_features = self.dropout_features(unmasked_features) + if mask: + x, mask_indices = self.apply_mask(features, padding_mask) + else: + x = features + mask_indices = None + + def cal_transformer_layers(x, encoder_padding_mask, return_all_hiddens=False): + # x: B x T x C + positions = self.embed_positions(x.transpose(1, 2)).transpose(1, 2) + x = x + positions + if not self.normalize_before: + x = self.layer_norm(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + encoder_states = [] + for layer in self.layers: + x = layer(x, 
encoder_padding_mask) + if return_all_hiddens: + encoder_states.append(x) + if self.normalize_before: + x = self.layer_norm(x) + return x, encoder_states + + x, encoder_states = cal_transformer_layers(x, padding_mask, return_all_hiddens) + if features_only: + return { + "encoder_out": [x], # [T x B x C] + "encoder_padding_mask": [padding_mask] + if padding_mask is not None + else [], # B x T + "encoder_embedding": [], # + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + "mask_indices": [mask_indices], + } + + x_unmasked = x + if self.mask_prob > 0 or self.mask_channel_prob > 0: + x_unmasked, _ = cal_transformer_layers(unmasked_features, padding_mask) + return { + "encoder_out": [x], # [T x B x C] + "encoder_unmasked_out": [x_unmasked], # [T x B x C] + "encoder_padding_mask": [padding_mask] + if padding_mask is not None + else [], # B x T + "encoder_embedding": [], # + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + "mask_indices": [mask_indices] if mask_indices is not None else [], # B X T + } + + def reorder_encoder_out(self, encoder_out, new_order): + new_encoder_out = ( + [] + if len(encoder_out["encoder_out"]) == 0 + else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]] + ) + + new_encoder_padding_mask = ( + [] + if len(encoder_out["encoder_padding_mask"]) == 0 + else [ + x.index_select(0, new_order) + for x in encoder_out["encoder_padding_mask"] + ] + ) + + new_encoder_embedding = ( + [] + if len(encoder_out["encoder_embedding"]) == 0 + else [ + x.index_select(0, new_order) for x in encoder_out["encoder_embedding"] + ] + ) + + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], # B x T + "src_lengths": [], # B x 1 + } + + +class StackedSpeechWavTransformerEncoder(FairseqEncoder): + def __init__(self, speech_enc, text_enc_layers, text_layer_norm): + super().__init__(None) + self.speech_encoder = speech_enc + self.text_encoder_layers = text_enc_layers + self.final_layer_norm = text_layer_norm + + def forward( + self, + src_tokens, + src_lengths=None, + return_all_hiddens=False, + padding_mask=None, + features_only=True, + ): + + out = self.speech_encoder.forward( + src_tokens, + src_lengths, + return_all_hiddens, + padding_mask=padding_mask, + features_only=features_only, + ) + x = out["encoder_out"][0] + encoder_padding_mask = None + if len(out["encoder_padding_mask"]) > 0: + encoder_padding_mask = out["encoder_padding_mask"][0] + + def cal_text_layers(x, padding_mask, return_all_hiddens=False): + encoder_states = [] + for layer in self.text_encoder_layers: + x = layer(x, padding_mask) + if return_all_hiddens: + encoder_states.append(x) + if self.final_layer_norm is not None: + x = self.final_layer_norm(x) + return x, encoder_states + + x, encoder_states = cal_text_layers(x, encoder_padding_mask, return_all_hiddens) + if features_only: + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask] + if encoder_padding_mask is not None + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + 
"src_lengths": [], + } + + x_u = out["encoder_unmasked_out"][0] + x_u, _ = cal_text_layers(x_u, encoder_padding_mask) + + return { + "encoder_out": [x], # [T x B x C] + "encoder_unmasked_out": [x_u], # [T x B x C] + "encoder_padding_mask": [encoder_padding_mask] + if encoder_padding_mask is not None + else [], # B x T + "encoder_embedding": [], # + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + "mask_indices": out["mask_indices"], # B X T + } + + def reorder_encoder_out(self, encoder_out, new_order): + return self.speech_encoder.reorder_encoder_out(encoder_out, new_order) diff --git a/fairseq/models/speech_to_text/utils.py b/fairseq/models/speech_to_text/utils.py new file mode 100644 index 0000000000..33117446a5 --- /dev/null +++ b/fairseq/models/speech_to_text/utils.py @@ -0,0 +1,562 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + + +import logging +from collections.abc import Iterable +from itertools import repeat +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +# ------------------------------------------------------------------------------ +# assert_equal() +# ------------------------------------------------------------------------------ + + +def assert_equal(value1, value2, name1=None, name2=None): + """Asserts two values are equal otherwise raise an error.""" + + str_name1 = "" if name1 is None else "{} ".format(name1) + str_name2 = "" if name2 is None else "{} ".format(name2) + if value1 != value2: + str_value1 = "{}" if name1 is None else "({})" + str_value1 = str_value1.format(value1) + str_value2 = "{}" if name2 is None else "({})" + str_value2 = str_value2.format(value2) + raise ValueError( + "Expected {}{} == {}{}".format(str_name1, str_value1, str_name2, str_value2) + ) + + +def fill_config(config, key, value): + if value is not None: + if key not in config or config[key] is None: + config[key] = value + assert_equal(value, config[key], "value", f'config["{key}"]') + + +# ------------------------------------------------------------------------------ +# check_and_return_expected() +# ------------------------------------------------------------------------------ + + +def check_and_return_expected(value, undefined_value, expected_value, name=None): + """ + Return the expected value while checking if the given value is undefined or + equal to the expected value. + """ + if (undefined_value is None and value is None) or (undefined_value == value): + return expected_value + if value != expected_value: + str_name = "" if name is None else "{} ".format(name) + str_value = "{}" if name is None else "({})" + str_value = str_value.format(value) + raise ValueError( + "Expected {}{} == {}".format(str_name, str_value, expected_value) + ) + return expected_value + + +# ------------------------------------------------------------------------------ +# get_time_axis() +# ------------------------------------------------------------------------------ + + +def get_time_axis(layout): + """ + Extract the time axis from the layout, for example for breaking sequence into + segments. 
+ """ + if layout in ["TB", "TBD"]: + return 0 + if layout in ["BT", "BTD"]: + return 1 + if layout in ["BCTD"]: + return 2 + raise ValueError("Unsupported layout = {}".format(layout)) + + +# ------------------------------------------------------------------------------ +# get_batch_axis() +# ------------------------------------------------------------------------------ + + +def get_batch_axis(layout): + """ + Extract the batch axis from the layout + """ + if layout in ["TB", "TBD"]: + return 1 + if layout in ["BT", "BTD", "BCTD"]: + return 0 + raise ValueError("Unsupported layout = {}".format(layout)) + + +# ------------------------------------------------------------------------------ +# monotonically_increasing_and_bounded() +# ------------------------------------------------------------------------------ + + +def monotonically_increasing_and_bounded(iterable, min=None, max=None): + """ + Check if the elements in the given iterable are monotonically increasing and + bounded by upper/lower bounds. + """ + if not isinstance(iterable, Iterable): + raise TypeError( + "Expected iterable to be of type Iterable, got ({})".format( + iterable.__class__.__name__ + ) + ) + for i in range(len(iterable)): + if min is not None and iterable[i] < min: + return False + if max is not None and iterable[i] > max: + return False + if i > 0 and iterable[i] <= iterable[i - 1]: + return False + return True + + +# ------------------------------------------------------------------------------ +# to_pair() +# ------------------------------------------------------------------------------ + + +def to_pair(value, name): + """Make a pair (of type tuple) of given value.""" + if isinstance(value, Iterable): + if len(value) != 2: + raise ValueError( + "Expected `{}` to have exactly 2 elements, got: ({})".format( + name, value + ) + ) + return value + return tuple(repeat(value, 2)) + + +# ------------------------------------------------------------------------------ +# infer_conv_output_attrs() +# ------------------------------------------------------------------------------ + + +# TODO(cfyeh): figure out if we can get `output_dim` without calling the module. +def infer_conv_output_attrs( + module, input_channels, input_dim, batch_size=1, max_length=8 +): + """Get output attributes of a module with input.""" + input = torch.randn(batch_size, input_channels, max_length, input_dim) + output = module(input) + output_channels = output.shape[1] + output_dim = output.shape[-1] + return output_channels, output_dim + + +# ------------------------------------------------------------------------------ +# NoOp +# ------------------------------------------------------------------------------ + + +class NoOp(torch.nn.Module): + """ + NoOp simply passes the input as the output. + """ + + def __init__(self): + super().__init__() + + def forward(self, input: Tensor) -> Tensor: + return input + + +# ------------------------------------------------------------------------------ +# Permute: a torch.nn.Module applies permutation on the input tensor. 
+# ------------------------------------------------------------------------------ + + +class Permute(torch.nn.Module): + def __init__(self, dims): + super().__init__() + self.dims = dims + + def forward(self, input: Tensor) -> Tensor: + return input.permute(self.dims).contiguous() + + +# ------------------------------------------------------------------------------ +# lengths_to_padding_mask() +# ------------------------------------------------------------------------------ + + +def lengths_to_padding_mask(lengths: Tensor) -> Tensor: + """Convert lengths of shape (B, ) to padding mask.""" + batch_size = lengths.shape[0] + max_length = int(torch.max(lengths).item()) + padding_mask = torch.arange( # [0, ..., T-1] + max_length, device=lengths.device, dtype=lengths.dtype + ).expand(batch_size, max_length) >= lengths.unsqueeze(1) + + return padding_mask + + +# ------------------------------------------------------------------------------ +# lengths_to_attention_mask() +# ------------------------------------------------------------------------------ + + +def lengths_to_attention_mask( + lengths: Tensor, + left_context: Optional[int] = None, + right_context: Optional[int] = None, +) -> Optional[Tensor]: + """ + Generate attention mask based on (lengths, left_context, right_context). + left_context is None means unlimited left context. + right_context is None means unlimited right context. + """ + + if left_context is None and right_context is None: + return None + + max_length = int(torch.max(lengths).item()) + + # For example, with `max_length` == 5, + # indices = tensor([ + # [ 0, 1, 2, 3, 4, 5], + # [-1, 0, 1, 2, 3, 4], + # [-2, -1, 0, 1, 2, 3], + # [-3, -2, -1, 0, 1, 2], + # [-4, -3, -2, -1, 0, 1], + # [-5, -4, -3, -2, -1, 0], + # ]) + + # In some cases the second torch.arange is created on cpu which causes a + # failure. Adding the device option to guard against it. 
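+ # The subtraction below gives indices[i][j] == j - i, i.e. the signed offset
+ # of key position j relative to query position i; the left/right context
+ # limits are applied to these offsets further down.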
+ indices = torch.arange( + max_length, device=lengths.device, dtype=lengths.dtype + ).expand(max_length, max_length) - torch.arange( + max_length, device=lengths.device + ).view( + max_length, -1 + ) + + # For example, with `max_length` == 5, + # bool_mask = tensor([ + # [True, True, True, True, True], + # [True, True, True, True, True], + # [True, True, True, True, True], + # [True, True, True, True, True], + # [True, True, True, True, True], + # ]) + bool_mask = ( + torch.tensor([True]).to(device=lengths.device).expand(max_length, max_length) + ) + + # For example, with `max_length` == 5, left_context == 2 + # left_mask = tensor([ + # [ True, True, True, True, True], + # [ True, True, True, True, True], + # [ True, True, True, True, True], + # [False, True, True, True, True], + # [False, False, True, True, True], + # ]) + if left_context is not None: + left_mask = indices >= -left_context + bool_mask = bool_mask & left_mask + + # For example, with `max_length` == 5, right_context == 1 + # right_mask = tensor([ + # [True, True, False, False, False], + # [True, True, True, False, False], + # [True, True, True, True, False], + # [True, True, True, True, True], + # [True, True, True, True, True], + # ]) + if right_context is not None: + right_mask = indices <= right_context + bool_mask = bool_mask & right_mask + + bool_mask = (~bool_mask).to(device=lengths.device) + return bool_mask + + +# ------------------------------------------------------------------------------ +# infer_output_norm() +# ------------------------------------------------------------------------------ + + +def infer_output_norm(module, output_norm=None): + """ + Infer the output norm (string and module) needed on the module gvien desired + output normalization. + """ + if output_norm == module.output_norm(): + # output_norm already matches module.output_norm(). 
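+ # so nothing needs to change: return no norm name and an identity NoOp().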
+ return (None, NoOp()) + + if output_norm is None and module.output_norm() is not None: + logger = logging.getLogger("infer_output_norm()") + logger.warning( + "trying to set output_norm ({}) ".format(output_norm) + + "but got module.output_norm() ({}), ".format(module.output_norm()) + + "the combined output_norm() will be ({})".format(module.output_norm()) + ) + return (None, NoOp()) + + if output_norm == "log_softmax": + if module.output_norm() is not None: + raise ValueError( + "incompatible output_norm ({}) ".format(output_norm) + + "and module.output_norm() ({})".format(module.output_norm()) + ) + else: + return ("log_softmax", torch.nn.LogSoftmax(dim=-1)) + + if output_norm == "softmax": + if module.output_norm() is not None: + raise ValueError( + "incompatible output_norm ({}) ".format(output_norm) + + "and module.output_norm() ({})".format(module.output_norm()) + ) + else: + return ("softmax", torch.nn.Softmax(dim=-1)) + + raise ValueError( + "output_norm ({}) not in ".format(output_norm) + + "supported list = [None, softmax, log_softmax]" + ) + + +# ------------------------------------------------------------------------------ +# infer_channels_from_layout() +# ------------------------------------------------------------------------------ + + +def infer_channels_from_layout(layout, channels): + """Extract the number of channels from the layout.""" + if layout in ("TBD", "BTD"): + if channels is not None and channels != 1: + raise ValueError( + "Expected channels ({}) to be 1 for layout = {}".format( + channels, layout + ) + ) + if channels is None: + return 1 + return channels + + +# ------------------------------------------------------------------------------ +# pad_sequence() +# ------------------------------------------------------------------------------ + + +@torch.jit.export +def pad_sequence( + sequence: Tensor, + time_axis: int, + extra_left_context: int = 0, + extra_right_context: int = 0, +) -> Tensor: + """Pad extra left/right contexts to the sequence.""" + + if extra_left_context == 0 and extra_right_context == 0: + return sequence + + tensors_to_concat = [] + + if extra_left_context: + size = (extra_left_context,) + fill_value = 0 + indices = torch.full( + size=size, + fill_value=fill_value, + dtype=torch.long, + device=sequence.device, + ) + left_padding = torch.index_select(sequence, time_axis, indices) + tensors_to_concat.append(left_padding) + + tensors_to_concat.append(sequence) + + # NOTE(cfyeh): for efficiency reason we pad 0 instead of the last frame for + # extra right contexts. 
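+ # (The left padding above repeats frame 0 via index_select with an all-zero
+ # index tensor; the right padding below is plain zeros, as noted.)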
+ if extra_right_context: + size = list(sequence.shape) + size[time_axis] = extra_right_context + right_padding = torch.zeros(size, dtype=sequence.dtype, device=sequence.device) + tensors_to_concat.append(right_padding) + + padded_sequence = torch.cat(tensors_to_concat, dim=time_axis) + return padded_sequence + + +# ------------------------------------------------------------------------------ +# sequence_to_segments() +# ------------------------------------------------------------------------------ + + +@torch.jit.export +def sequence_to_segments( + sequence: Tensor, + time_axis: int, + lengths: Tensor, + segment_size: Optional[int] = None, + extra_left_context: int = 0, + extra_right_context: int = 0, +) -> List[Tuple[Tensor, Tensor]]: + """Breaks sequence into segments.""" + + sequence = pad_sequence( + sequence=sequence, + time_axis=time_axis, + extra_left_context=extra_left_context, + extra_right_context=extra_right_context, + ) + + lengths = lengths + extra_left_context + extra_right_context + + segments: List[Tuple[Tensor, Tensor]] = [] + + if segment_size is None: + segments.append((sequence, lengths)) + return segments + + offset = 0 + end = sequence.shape[time_axis] + step = segment_size + size = extra_left_context + segment_size + extra_right_context + + while offset + extra_left_context + extra_right_context < end: + clamped_size = min(size, end - offset) + segment_lengths = torch.clamp(lengths - offset, min=0, max=clamped_size) + indices = torch.arange( + start=offset, + end=(offset + clamped_size), + step=1, + dtype=torch.long, + device=sequence.device, + ) + segment_tensor = torch.index_select(sequence, time_axis, indices) + segments.append((segment_tensor, segment_lengths)) + offset = offset + step + + return segments + + +# ------------------------------------------------------------------------------ +# segments_to_sequence() +# ------------------------------------------------------------------------------ + + +@torch.jit.export +def segments_to_sequence( + segments: List[Tuple[Tensor, Tensor]], time_axis: int +) -> Tuple[Tensor, Tensor]: + """Concatenate segments into a full sequence.""" + if len(segments) == 1: + return segments[0] + + tensors_to_concat: List[Tensor] = [] + lengths_to_stack: List[Tensor] = [] + + for tensor, lengths in segments: + tensors_to_concat.append(tensor) + lengths_to_stack.append(lengths) + + sequence = torch.cat(tensors_to_concat, dim=time_axis) + lengths = torch.stack(lengths_to_stack, dim=0) + lengths = torch.sum(lengths, dim=0) + + return sequence, lengths + + +def lengths_to_encoder_padding_mask(lengths, batch_first: bool = False): + """ + convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor + + Args: + lengths: a (B, )-shaped tensor + batch_first: whether to return a (B, T) tensor + + Return: + max_length: maximum length of B sequences + encoder_padding_mask: a (max_length, B) binary mask, where + [t, b] = False for t < lengths[b] and True otherwise + + TODO: + kernelize this function if benchmarking shows this function is slow + """ + max_lengths = torch.max(lengths).item() + bsz = lengths.size(0) + encoder_padding_mask = torch.arange( + max_lengths + ).to( # a (T, ) tensor with [0, ..., T-1] + lengths.device + ).view( # move to the right device + 1, max_lengths + ).expand( # reshape to (1, T)-shaped tensor + bsz, -1 + ) > lengths.view( # expand to (B, T)-shaped tensor + bsz, 1 + ).expand( + -1, max_lengths + ) + if not batch_first: + return encoder_padding_mask.t(), max_lengths + else: + return encoder_padding_mask, max_lengths 
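
A minimal illustrative sketch (not part of the patch) of how the padding and
segmentation helpers above compose. It only assumes the new
fairseq.models.speech_to_text.utils module added in this diff; with no extra
left/right context, cutting a T x B x C sequence into fixed-size segments and
concatenating them back is lossless.

import torch

from fairseq.models.speech_to_text.utils import (
    lengths_to_padding_mask,
    segments_to_sequence,
    sequence_to_segments,
)

T, B, C = 12, 2, 4
sequence = torch.randn(T, B, C)          # "TBD" layout, so get_time_axis("TBD") == 0
lengths = torch.tensor([12, 9])          # valid frames per utterance

mask = lengths_to_padding_mask(lengths)  # (B, 12) bool mask, True at padded steps
assert not bool(mask[1, :9].any()) and bool(mask[1, 9:].all())

segments = sequence_to_segments(
    sequence=sequence,
    time_axis=0,
    lengths=lengths,
    segment_size=4,                      # three segments of four frames each
)
restored, restored_lengths = segments_to_sequence(segments, time_axis=0)
assert torch.equal(restored, sequence)
assert torch.equal(restored_lengths, lengths)
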
+ + +# ------------------------------------------------------------------------------ +# attention suppression +# ------------------------------------------------------------------------------ + + +def attention_suppression(attention_weights: Tensor, scale: float): + # B, H, qlen, klen -> B, H, qlen, 1 + attention_prob = torch.nn.functional.softmax(attention_weights.float(), dim=-1) + attention_nozeros = attention_prob.to(torch.bool) + nozeros_sum = torch.sum(attention_nozeros.to(torch.float), dim=-1, keepdim=True) + + # For very sparse situation, we need get round about 0s + key_sum = torch.sum(attention_prob, dim=-1, keepdim=True) + + # nozeros_sum should > 1 + key_mean = key_sum / (nozeros_sum + 1e-8) + + # std calculation + dis = (attention_prob - key_mean) * (attention_prob - key_mean) + + # if attention_prob[i] < threshold, then dis_masked[i] = 0; for all i + dis_masked = torch.where( + attention_nozeros, dis, attention_prob.new_zeros(attention_prob.size()) + ) + + key_var = torch.sum(dis_masked, dim=-1, keepdim=True) + key_var = key_var / (nozeros_sum - 1.0 + 1e-8) + key_std = torch.sqrt(key_var) + key_thread = key_mean - scale * key_std + + # if attention_prob[i] >= key_thread, then attention_prob[i] + # , otherwise "-inf" + inf_tensor = attention_prob.new_zeros(attention_prob.size()).detach() + inf_tensor[:] = float("-inf") + attention_weights_float = torch.where( + attention_prob < key_thread, + inf_tensor, + attention_weights.float(), + ) + + return attention_weights_float.type_as(attention_weights) + + +def layer_norm_backward_hook(module, grad_input, grad_output, clamp_value): + return tuple(torch.clamp(v, min=-clamp_value, max=clamp_value) for v in grad_input) diff --git a/fairseq/models/speech_to_text/xm_transformer.py b/fairseq/models/speech_to_text/xm_transformer.py new file mode 100644 index 0000000000..7b4b234641 --- /dev/null +++ b/fairseq/models/speech_to_text/xm_transformer.py @@ -0,0 +1,855 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
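
A rough illustrative sketch (not part of the patch) of attention_suppression()
from the utils module above: for each query it softmaxes the raw scores,
estimates the mean and standard deviation of the non-zero probabilities, and
replaces the score of any key whose probability falls more than scale standard
deviations below that mean with -inf, so a later softmax ignores weakly
attended keys. The numbers below are hand-picked for illustration only.

import torch

from fairseq.models.speech_to_text.utils import attention_suppression

# One batch, one head, one query over four keys: two strong and two weak scores.
scores = torch.tensor([[[[2.0, 1.9, -4.0, -4.0]]]])   # B x H x qlen x klen
suppressed = attention_suppression(scores, scale=0.5)
# The two weak keys land below mean - 0.5 * std of the softmax probabilities,
# so their scores become -inf while the strong scores pass through unchanged:
# tensor([[[[2.0, 1.9, -inf, -inf]]]])
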
+ +import copy +import logging +from typing import Dict, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import checkpoint_utils, utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqEncoderModel, + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder +from fairseq.models.speech_to_text.hub_interface import S2THubInterface +from fairseq.models.transformer import ( + Embedding, + TransformerDecoder, + TransformerModelBase, +) +from fairseq.models.wav2vec import Wav2VecEncoder +from fairseq.modules.layer_norm import LayerNorm + +logger = logging.getLogger(__name__) + + +def build_embedding(dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + return Embedding(num_embeddings, embed_dim, padding_idx) + + +class Conv1dAdaptor(nn.Module): + def __init__( + self, + in_dim, + out_dim, + n_layers=3, + kernel_size=3, + stride=2, + layerdrop=0.0, + layernorm=False, + proj=False, + ): + super().__init__() + self.proj, self.proj_ln = None, None + self.post_proj, self.post_proj_ln = None, None + if proj: + self.proj = nn.Sequential( + nn.Linear(in_dim, in_dim * 4), nn.ReLU(), nn.Linear(in_dim * 4, in_dim) + ) + self.proj_ln = LayerNorm(in_dim) + self.post_proj = nn.Sequential( + nn.Linear(out_dim, out_dim * 4), + nn.ReLU(), + nn.Linear(out_dim * 4, out_dim), + ) + self.post_proj_ln = LayerNorm(out_dim) + + self.layers = nn.ModuleList( + nn.Conv1d( + in_dim if i == 0 else out_dim, + out_dim * 2, + kernel_size, + stride=stride, + padding=kernel_size // 2, + ) + for i in range(n_layers) + ) + self.stride = stride + self.layerdrop = layerdrop + self.layernorm = LayerNorm(in_dim) if layernorm else None + + @classmethod + def add_args(cls, parser): + parser.add_argument("--adaptor-n-layers", type=int) + parser.add_argument("--adaptor-kernel-size", type=int) + parser.add_argument("--adaptor-stride", type=int) + parser.add_argument("--adaptor-layerdrop", type=float) + parser.add_argument("--adaptor-layernorm", action="store_true") + parser.add_argument("--adaptor-proj", action="store_true") + + def forward(self, x, padding_mask: Optional[torch.Tensor]): + if self.layernorm is not None: + x = self.layernorm(x) + + if self.proj is not None: + x = x + 0.5 * self.proj(x) + x = self.proj_ln(x) + + if padding_mask is not None: + x = utils.index_put(x, padding_mask.T, 0) + + # T x B x C -> B x C x T + x = x.transpose(0, 1).transpose(1, 2) + out_lens = None + if padding_mask is not None: + out_lens = (~padding_mask).sum(1).float() + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + x = nn.functional.glu(layer(x), dim=1) + if padding_mask is not None: + out_lens = ((out_lens - 1) / self.stride + 1).floor() + # B x C x T -> T x B x C + x = x.transpose(1, 2).transpose(0, 1) + + if self.post_proj is not None: + x = x + 0.5 * self.post_proj(x) + x = self.post_proj_ln(x) + + out_padding_mask = None + if padding_mask is not None: + out_padding_mask = lengths_to_padding_mask(out_lens.long()) + x = utils.index_put(x, out_padding_mask.T, 0) + return x, out_padding_mask + + +def add_wav2vec_asr_args(parser): + parser.add_argument("--w2v-path", help="path to wav2vec 2.0 model") + parser.add_argument( + "--no-pretrained-weights", + 
action="store_true", + help="if true, does not load pretrained weights", + ) + parser.add_argument( + "--dropout-input", + type=float, + metavar="D", + help="dropout to apply to the input (after feat extr)", + ) + parser.add_argument( + "--final-dropout", + type=float, + metavar="D", + help="dropout after transformer and before final projection", + ) + parser.add_argument( + "--apply-mask", action="store_true", help="apply masking during fine-tuning" + ) + parser.add_argument( + "--dropout", + type=float, + metavar="D", + help="dropout probability inside wav2vec 2.0 model", + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights inside wav2vec 2.0 model", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN inside wav2vec 2.0 model", + ) + parser.add_argument( + "--mask-length", type=int, help="repeat the mask indices multiple times" + ) + parser.add_argument( + "--mask-prob", type=float, help="probability of replacing a token with mask" + ) + parser.add_argument( + "--mask-selection", + type=str, + choices=["static", "uniform", "normal", "poisson"], + help="how to choose masks", + ) + parser.add_argument( + "--mask-other", + type=float, + help="stdev of the mask length in case of 'normal' selection strategy", + ) + parser.add_argument( + "--no-mask-overlap", + action="store_true", + help="whether to allow masks to overlap", + ) + parser.add_argument( + "--mask-channel-length", type=int, help="repeat the mask indices multiple times" + ) + parser.add_argument( + "--mask-channel-prob", + type=float, + help="probability of replacing a token with mask", + ) + parser.add_argument( + "--mask-channel-selection", + type=str, + choices=["static", "uniform", "normal", "poisson"], + help="how to choose masks", + ) + parser.add_argument( + "--mask-channel-other", + type=float, + help="stdev of the mask length in case of 'normal' selection strategy", + ) + parser.add_argument( + "--no-mask-channel-overlap", + action="store_true", + help="whether to allow masks to overlap", + ) + parser.add_argument( + "--freeze-finetune-updates", + type=int, + metavar="N", + help="dont finetune wav2vec for this many updates", + ) + parser.add_argument( + "--feature-grad-mult", + type=float, + metavar="D", + help="reset feature grad mult in wav2vec 2.0 to this", + ) + parser.add_argument( + "--layerdrop", + type=float, + metavar="D", + help="probability of dropping a layer in wav2vec 2.0", + ) + parser.add_argument( + "--max-positions", + type=int, + metavar="N", + help="Max input positions to be used in the conformer encoder in wav2vec 2.0", + ) + parser.add_argument("--encoder-proj", action="store_true") + parser.add_argument("--w2v-args", default=None) + parser.add_argument( + "--remove-weight-norm", + action="store_true", + help="if set, then the weight-norm (in one pos_conv layer) is removed from the model", + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension to be used when w2v_path is None and no encoder_proj is set", + ) + + +def need_finetuning(ft_params, param_name): + if ft_params == "all": + return True + ft_params_list = ft_params.split(",") + for ft_param in ft_params_list: + if ft_param in param_name: + return True + return False + + +class Wav2VecEncoderWithAdaptor(FairseqEncoder): + def build_adaptor(self, args): + adaptor = None + if args.adaptor_n_layers > 0: + 
adaptor = Conv1dAdaptor( + args.decoder_embed_dim, + args.decoder_embed_dim, + n_layers=args.adaptor_n_layers, + kernel_size=args.adaptor_kernel_size, + stride=args.adaptor_stride, + layerdrop=args.adaptor_layerdrop, + layernorm=args.adaptor_layernorm, + proj=args.adaptor_proj, + ) + return adaptor + + def __init__(self, args): + super().__init__(None) + self.w2v_encoder = Wav2VecEncoder(args) + self.is_v0_arch = not args.adaptor_proj + self.w2v_proj_ln = None + if not self.is_v0_arch and self.w2v_encoder.proj is not None: + self.w2v_proj_ln = LayerNorm(args.decoder_embed_dim) + self.adaptor = self.build_adaptor(args) + + self.num_updates = 0 + self.freezing_updates = args.w2v_freezing_updates + self.finetuning_params = args.finetune_w2v_params + for k, p in self.w2v_encoder.w2v_model.named_parameters(): + p.requires_grad = need_finetuning(self.finetuning_params, k) + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + add_wav2vec_asr_args(parser) + parser.add_argument( + "--normalize", + action="store_true", + help="if set, normalizes input to have 0 mean and unit variance", + ) + parser.add_argument( + "--finetune-w2v-params", + type=str, + metavar="STR", + help="comma-separated param strings to finetune.", + ) + parser.add_argument("--w2v-freezing-updates", type=int) + parser.add_argument("--load-pretrained-encoder-from", type=str, metavar="STR") + Conv1dAdaptor.add_args(parser) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def forward(self, src_tokens, src_lengths=None, **kwargs): + if ( + self.freezing_updates is not None + and self.num_updates > self.freezing_updates + ): + for p in self.w2v_encoder.w2v_model.parameters(): + p.requires_grad = True + + padding_mask = lengths_to_padding_mask(src_lengths) + out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True) + x, padding_mask = out["encoder_out"], out["padding_mask"] + if self.w2v_proj_ln is not None: + x = self.w2v_proj_ln(x) + + if self.adaptor is not None: + x, padding_mask = self.adaptor(x, padding_mask) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [] + if padding_mask is None + else [padding_mask], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": [], # List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + def reorder_encoder_out(self, encoder_out, new_order): + new_encoder_out = ( + [] + if len(encoder_out["encoder_out"]) == 0 + else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]] + ) + + new_encoder_padding_mask = ( + [] + if len(encoder_out["encoder_padding_mask"]) == 0 + else [ + x.index_select(0, new_order) + for x in encoder_out["encoder_padding_mask"] + ] + ) + + new_encoder_embedding = ( + [] + if len(encoder_out["encoder_embedding"]) == 0 + else [ + x.index_select(0, new_order) for x in encoder_out["encoder_embedding"] + ] + ) + + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], # B x T + "src_lengths": [], # B x 1 + } + + +def add_decoder_args(parser): + parser.add_argument( + "--activation-fn", + type=str, + default="relu", + 
choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--decoder-dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--decoder-attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--decoder-activation-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN.", + ) + parser.add_argument( + "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension" + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--decoder-normalize-before", + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--layernorm-embedding", action="store_true", help="add layernorm to embedding" + ) + parser.add_argument( + "--decoder-layerdrop", + type=float, + metavar="D", + help="layerdrop probability for decoder", + ) + parser.add_argument( + "--decoder-learned-pos", + action="store_true", + help="learn positional embedding in decoder", + ) + parser.add_argument( + "--share-decoder-input-output-embed", + action="store_true", + help="share decoder input and output embeddings", + ) + parser.add_argument( + "--no-scale-embedding", + action="store_true", + help="if True, dont scale embeddings", + ) + parser.add_argument( + "--load-pretrained-decoder-from", + type=str, + metavar="STR", + help="model to take decoder weights from (for initialization)", + ) + parser.add_argument( + "--finetune-decoder-params", + type=str, + metavar="STR", + help="comma-separated param strings to finetune.", + ) + + +def remove_weight_norm_from_model(model): + from functools import reduce + + layers_with_wn = [] + for param_name, _ in model.named_parameters(): + if param_name.endswith("_g"): + # retrieve the module with this param_name + module_names = param_name.split(".")[ + :-1 + ] # exclude the actual parameter name + wn_module = reduce(getattr, module_names, model) + layers_with_wn.append(wn_module) + for wn_module in layers_with_wn: + torch.nn.utils.remove_weight_norm(wn_module) + logger.warning(f"Weight norm removed from module with {wn_module}\n") + + +@register_model("xm_transformer") +class XMTransformerModel(FairseqEncoderDecoderModel): + @classmethod + def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t" + model_ids = [ + "xm_transformer_600m-es_en-multi_domain", + "xm_transformer_600m-ru_en-multi_domain", + "xm_transformer_600m-fr_en-multi_domain", + "xm_transformer_600m-en_es-multi_domain", + "xm_transformer_600m-en_ru-multi_domain", + "xm_transformer_600m-en_fr-multi_domain", + "xm_transformer_600m-en_zh-multi_domain", + "xm_transformer_600m-en_ar-multi_domain", + "xm_transformer_600m-en_tr-multi_domain", + "xm_transformer_600m-en_vi-multi_domain", + "xm_transformer-21_en-xls_r_300m", + "xm_transformer-en_15-xls_r_300m", + "xm_transformer-21_en-xls_r_1b", + "xm_transformer-en_15-xls_r_1b", + "xm_transformer-21_en-xls_r_2b", + "xm_transformer-en_15-xls_r_2b", + "xm_transformer-22_16-xls_r_2b", + "xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022", + "xm_transformer_s2ut_800m-en-es-st_plus_asr", + 
"xm_transformer_s2ut_800m-hk-en-h1_2022", + "xm_transformer_s2ut_800m-en-hk-h1_2022", + ] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + config_yaml="config.yaml", + task="speech_to_text", + generation_args=None, + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + config_yaml=config_yaml, + task=task, + generation_args=generation_args, + **kwargs, + ) + return S2THubInterface(x["args"], x["task"], x["models"][0]) + + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + Wav2VecEncoderWithAdaptor.add_args(parser) + add_decoder_args(parser) + parser.add_argument("--checkpoint-activations", action="store_true") + parser.add_argument("--offload-activations", action="store_true") + parser.add_argument("--min-params-to-wrap", type=int, metavar="N") + + @classmethod + def maybe_load_pretrained(cls, component, checkpoint: Optional[str] = None): + if checkpoint is None: + return component + + _load = checkpoint_utils.load_pretrained_component_from_model + try: + return _load(component, checkpoint) + except RuntimeError as e: + logger.warning(e) + return _load(component, checkpoint, strict=False) + + @classmethod + def build_encoder(cls, args): + _args = copy.deepcopy(args) + if not args.adaptor_proj and not args.encoder_proj: # V0 arch + if args.w2v_path: + state = checkpoint_utils.load_checkpoint_to_cpu(args.w2v_path) + if state.get("cfg") is not None: + encoder_embed_dim = state["cfg"]._content["model"][ + "encoder_embed_dim" + ] + elif state.get("args") is not None: + encoder_embed_dim = state["args"].encoder_embed_dim + else: + raise ValueError(f"Invalid config in {args.w2v_path}") + _args.decoder_embed_dim = encoder_embed_dim + del state + else: + _args.decoder_embed_dim = args.encoder_embed_dim + + encoder = Wav2VecEncoderWithAdaptor(_args) + encoder = cls.maybe_load_pretrained( + encoder, getattr(args, "load_pretrained_encoder_from", None) + ) + if args.remove_weight_norm: + # remove the wn for EMA usage + logger.warning("Removing weight norm from wav2vec encoder") + remove_weight_norm_from_model(encoder) + + return encoder + + @classmethod + def get_decoder_args_from_checkpoint(cls, ckpt_args): + assert "model" in ckpt_args, "Model args not found in checkpoint cfg!" 
+ decoder_args = {} + for k, v in ckpt_args["model"].__dict__.items(): + if "decoder" in k: + decoder_args[k] = v + + return decoder_args + + @classmethod + def override_decoder_args(cls, cli_args, decoder_args_dict): + for k, v in decoder_args_dict.items(): + if v != getattr(cli_args, k, None): + logger.warning( + f"Overriding decoder arg {k}: from {getattr(cli_args, k, None)} to {v}" + ) + setattr(cli_args, k, v) + + return cli_args + + @classmethod + def build_decoder(cls, args, task, embed_tokens): + _args = copy.deepcopy(args) + if args.adaptor_proj or args.encoder_proj: # not V0 arch + _args.encoder_embed_dim = _args.decoder_embed_dim + _args.dropout = args.decoder_dropout + _args.attention_dropout = args.decoder_attention_dropout + _args.activation_dropout = args.decoder_activation_dropout + _args.layerdrop = _args.decoder_layerdrop + + decoder = TransformerDecoder(_args, task.target_dictionary, embed_tokens) + decoder = cls.maybe_load_pretrained( + decoder, getattr(args, "load_pretrained_decoder_from", None) + ) + + for k, p in decoder.named_parameters(): + p.requires_grad = need_finetuning(args.finetune_decoder_params, k) + return decoder + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + if getattr(args, "load_pretrained_decoder_from", None) is not None: + ckpt = torch.load(getattr(args, "load_pretrained_decoder_from", None)) + decoder_args_dict = cls.get_decoder_args_from_checkpoint(ckpt["cfg"]) + args = cls.override_decoder_args(args, decoder_args_dict) + + decoder_embed_tokens = build_embedding( + task.target_dictionary, args.decoder_embed_dim + ) + + encoder = cls.build_encoder(args) + decoder = cls.build_decoder(args, task, decoder_embed_tokens) + base_model = cls(encoder, decoder) + + # set up multitask decoders + base_model.multitask_decoders = {} + for i, (task_name, task_obj) in enumerate(task.multitask_tasks.items()): + # dummy auxiliary decoder + if task_obj.args.get_loss_weight(0) == 0: + continue + + task_decoder = cls.build_multitask_decoder( + args, task_obj.args, task_obj.target_dictionary, args.decoder_embed_dim + ) + + setattr(base_model, f"{task_name}_decoder", task_decoder) + decoder_model_cls = ( + FairseqEncoderModel + if task_obj.args.decoder_type == "ctc" + else FairseqLanguageModel + ) + base_model.multitask_decoders[task_name] = decoder_model_cls( + getattr(base_model, f"{task_name}_decoder") + ) + return base_model + + @classmethod + def build_multitask_decoder( + cls, + args, + mtl_args, + tgt_dict, + in_dim, + is_first_pass_decoder=False, + ): + decoder_args = mtl_args.decoder_args + decoder_args.encoder_embed_dim = in_dim + if mtl_args.decoder_type == "transformer": + if is_first_pass_decoder: + task_decoder = cls.build_text_decoder(args, tgt_dict) + else: + from fairseq.models.speech_to_speech import ( + base_multitask_text_transformer_decoder_arch, + ) + + base_multitask_text_transformer_decoder_arch(decoder_args) # 2L + task_decoder = TransformerDecoder( + decoder_args, + tgt_dict, + embed_tokens=TransformerModelBase.build_embedding( + decoder_args, + tgt_dict, + decoder_args.decoder_embed_dim, + ), + ) + elif mtl_args.decoder_type == "ctc": + task_decoder = CTCDecoder( + dictionary=tgt_dict, + in_dim=in_dim, + ) + else: + raise NotImplementedError( + "currently only support multitask decoder_type 'transformer', 'ctc'" + ) + + return task_decoder + + def get_normalized_probs( + self, + net_output: Tuple[Tensor, 
Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + return self.get_normalized_probs_scriptable(net_output, log_probs, sample) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + return_all_hiddens=False, + **kwargs, + ): + """ + The forward method inherited from the base class has a **kwargs + argument in its input, which is not supported in torchscript. This + method overwrites the forward method definition without **kwargs. + """ + encoder_out = self.encoder( + src_tokens=src_tokens, src_lengths=src_lengths, **kwargs + ) + decoder_out = self.decoder( + prev_output_tokens=prev_output_tokens, encoder_out=encoder_out + ) + if return_all_hiddens: + decoder_out[-1]["encoder_states"] = encoder_out["encoder_out"] + # NOTE: from the top layer + decoder_out[-1]["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ] + return decoder_out + + def upgrade_state_dict(self, state_dict): + for k, _ in state_dict.items(): + if "adaptor.layers" in state_dict: + new = k.replace("adaptor.layers", "adaptor_layers") + state_dict[new] = state_dict[k] + del state_dict[k] + + +def set_default_w2v_encoder_args(args): + args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False) + args.dropout_input = getattr(args, "dropout_input", 0) + args.final_dropout = getattr(args, "final_dropout", 0) + args.apply_mask = getattr(args, "apply_mask", False) + args.dropout = getattr(args, "dropout", 0) + args.attention_dropout = getattr(args, "attention_dropout", 0) + args.activation_dropout = getattr(args, "activation_dropout", 0) + args.encoder_proj = getattr(args, "encoder_proj", False) + args.remove_weight_norm = getattr(args, "remove_weight_norm", False) + + args.mask_length = getattr(args, "mask_length", 10) + args.mask_prob = getattr(args, "mask_prob", 0.5) + args.mask_selection = getattr(args, "mask_selection", "static") + args.mask_other = getattr(args, "mask_other", 0) + args.no_mask_overlap = getattr(args, "no_mask_overlap", False) + args.mask_channel_length = getattr(args, "mask_channel_length", 10) + args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5) + args.mask_channel_before = getattr(args, "mask_channel_before", False) + args.mask_channel_selection = getattr(args, "mask_channel_selection", "static") + args.mask_channel_other = getattr(args, "mask_channel_other", 0) + args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False) + + args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0) + args.feature_grad_mult = 0.1 + args.layerdrop = getattr(args, "layerdrop", 0.0) + + args.normalize = getattr(args, "normalize", False) + args.finetune_w2v_params = getattr(args, "finetune_w2v_params", "all") + args.w2v_freezing_updates = getattr(args, "w2v_freezing_updates", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + + +def set_default_adaptor_args(args): + args.adaptor_n_layers = getattr(args, "adaptor_n_layers", 3) + args.adaptor_kernel_size = getattr(args, "adaptor_kernel_size", 3) + args.adaptor_stride = getattr(args, "adaptor_stride", 2) + args.adaptor_layerdrop = getattr(args, "adaptor_layerdrop", 0.0) + args.adaptor_layernorm = getattr(args, "adaptor_layernorm", False) + args.adaptor_proj = getattr(args, "adaptor_proj", False) + + +def set_default_transformer_decoder_args(args): + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + 
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4 * 1024) + args.decoder_layers = getattr(args, "decoder_layers", 12) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.decoder_attention_dropout = getattr(args, "decoder_attention_dropout", 0.0) + args.decoder_activation_dropout = getattr(args, "decoder_activation_dropout", 0.0) + args.decoder_dropout = getattr(args, "decoder_dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) + + args.finetune_decoder_params = getattr(args, "finetune_decoder_params", "all") + + +def set_default_general_args(args): + args.checkpoint_activations = getattr(args, "checkpoint_activations", False) + args.offload_activations = getattr(args, "offload_activations", False) + args.min_params_to_wrap = getattr(args, "min_params_to_wrap", int(1e8)) + args.max_positions = getattr(args, "max_positions", 3000) + + +@register_model_architecture(model_name="xm_transformer", arch_name="xm_transformer") +def base_architecture(args): + set_default_general_args(args) + set_default_w2v_encoder_args(args) + set_default_adaptor_args(args) + set_default_transformer_decoder_args(args) diff --git a/fairseq/models/speech_to_text/xm_transformer_unity.py b/fairseq/models/speech_to_text/xm_transformer_unity.py new file mode 100644 index 0000000000..f77ef4e570 --- /dev/null +++ b/fairseq/models/speech_to_text/xm_transformer_unity.py @@ -0,0 +1,315 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
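
An illustrative sketch (not part of the patch) of the defaulting convention
used by base_architecture() above: it, and the set_default_*_args() helpers it
calls, fill in attributes only when they are missing from the args namespace
(feature_grad_mult being the one unconditional assignment), so values supplied
on the command line survive untouched.

from argparse import Namespace

from fairseq.models.speech_to_text.xm_transformer import base_architecture

args = Namespace(decoder_layers=6)      # pretend the user set a single flag
base_architecture(args)                 # every other attribute gets its default

assert args.decoder_layers == 6         # user value preserved by getattr(...)
assert args.decoder_embed_dim == 1024   # from set_default_transformer_decoder_args()
assert args.adaptor_n_layers == 3       # from set_default_adaptor_args()
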
+ +import copy +import logging + +from fairseq.models import ( + FairseqEncoderModel, + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder +from fairseq.models.speech_to_speech.modules.transformer_encoder import ( + TransformerEncoderNoEmb, +) +from fairseq.models.speech_to_text.xm_transformer import XMTransformerModel +from fairseq.models.speech_to_text.xm_transformer import ( + base_architecture as xm_t_base_architecture, +) +from fairseq.models.speech_to_text.xm_transformer import ( + build_embedding, + need_finetuning, + set_default_adaptor_args, + set_default_general_args, + set_default_transformer_decoder_args, + set_default_w2v_encoder_args, +) +from fairseq.models.transformer import Linear, TransformerDecoder, TransformerModelBase +from fairseq.models.transformer.transformer_decoder_aug import AugTransformerDecoder + +logger = logging.getLogger(__name__) + + +def unit_transformer_decoder_arch_base( + args, decoder_layers=6, decoder_embed_dim=768, decoder_attention_heads=12 +): + args.encoder_layers = decoder_layers + args.decoder_layers = decoder_layers + args.decoder_embed_dim = decoder_embed_dim + args.decoder_ffn_embed_dim = decoder_embed_dim * 4 + args.decoder_attention_heads = decoder_attention_heads + args.encoder_embed_dim = args.decoder_embed_dim + args.decoder_output_dim = decoder_embed_dim + args.decoder_input_dim = decoder_embed_dim + + +def unit_transformer_decoder_arch_large( + args, decoder_layers=12, decoder_embed_dim=1024, decoder_attention_heads=16 +): + args.encoder_layers = decoder_layers + args.decoder_layers = decoder_layers + args.decoder_embed_dim = decoder_embed_dim + args.decoder_ffn_embed_dim = decoder_embed_dim * 4 + args.decoder_attention_heads = decoder_attention_heads + args.encoder_embed_dim = args.decoder_embed_dim + args.decoder_output_dim = decoder_embed_dim + args.decoder_input_dim = decoder_embed_dim + + +@register_model("unity_xm_transformer") +class XMTransformerModelUnitY(XMTransformerModel): + @classmethod + def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t" + model_ids = [] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + XMTransformerModel.add_args(parser) + parser.add_argument( + "--translation-decoder-layers", + type=int, + default=4, + metavar="N", + help="num decoder layers in the first-pass translation module", + ) + parser.add_argument( + "--synthesizer-encoder-layers", + type=int, + default=0, + metavar="N", + help="num encoder layers in the second-pass synthesizer module", + ) + parser.add_argument( + "--synthesizer-augmented-cross-attention", + action="store_true", + default=False, + help="augmented cross-attention over speech encoder output", + ) + parser.add_argument( + "--load-pretrained-aux-decoder-from", + type=str, + metavar="STR", + help="model to take decoder weights from (for initialization)", + ) + + @classmethod + def build_text_decoder(cls, args, tgt_dict): + _args = copy.deepcopy(args) + + if args.adaptor_proj or args.encoder_proj: # not V0 arch + _args.encoder_embed_dim = _args.decoder_embed_dim + _args.dropout = args.decoder_dropout + _args.attention_dropout = args.decoder_attention_dropout + _args.activation_dropout = args.decoder_activation_dropout + _args.layerdrop = _args.decoder_layerdrop + 
_args.decoder_layers = _args.translation_decoder_layers + + embed_tokens = build_embedding(tgt_dict, _args.decoder_embed_dim) + decoder = TransformerDecoder(_args, tgt_dict, embed_tokens) + + if getattr(args, "load_pretrained_aux_decoder_from", None) is not None: + decoder = cls.maybe_load_pretrained( + decoder, getattr(args, "load_pretrained_aux_decoder_from", None) + ) + + for k, p in decoder.named_parameters(): + p.requires_grad = need_finetuning(args.finetune_decoder_params, k) + return decoder + + @classmethod + def build_decoder(cls, args, task, aug_attn=False): + _args = copy.deepcopy(args) + _args.layerdrop = 0.0 # turn off layerdrop for shallow layers + + _args.encoder_embed_dim = args.decoder_embed_dim + + proj = None + if args.decoder_embed_dim != _args.decoder_embed_dim: + proj = Linear(args.decoder_embed_dim, _args.decoder_embed_dim) + + embed_tokens = build_embedding(task.target_dictionary, _args.decoder_embed_dim) + decoder_cls = AugTransformerDecoder if aug_attn else TransformerDecoder + decoder = decoder_cls(_args, task.target_dictionary, embed_tokens) + + if getattr(args, "load_pretrained_decoder_from", None) is not None: + # load all layers first and then discard the bottom layers + embed_tokens = build_embedding( + task.target_dictionary, _args.decoder_embed_dim + ) + decoder_tmp = decoder_cls(_args, task.target_dictionary, embed_tokens) + decoder_tmp = cls.maybe_load_pretrained( + decoder_tmp, getattr(_args, "load_pretrained_decoder_from", None) + ) + state_dict = decoder_tmp.state_dict() + for k, p in decoder.named_parameters(): + p.data = state_dict[k].data + p.requires_grad = need_finetuning(_args.finetune_decoder_params, k) + decoder.layers = decoder.layers[-_args.decoder_layers :] + + return decoder, proj, _args + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + xm_t_base_architecture(args) + + encoder = cls.build_encoder(args) + decoder, proj, unit_args = cls.build_decoder( + args, + task, + aug_attn=getattr(args, "synthesizer_augmented_cross_attention", False), + ) + base_model = cls(encoder, decoder) + setattr(base_model, "proj", proj) + + base_model.t2u_augmented_cross_attn = getattr( + args, "synthesizer_augmented_cross_attention", False + ) + + # set up multitask decoders + base_model.mt_task_name = None + base_model.multitask_decoders = {} + has_first_pass_decoder = False + for task_name, task_obj in task.multitask_tasks.items(): + if task_obj.is_first_pass_decoder: + has_first_pass_decoder = True + base_model.mt_task_name = task_name + + task_decoder = cls.build_multitask_decoder( + args, + task_obj.args, + task_obj.target_dictionary, + args.decoder_embed_dim, + task_obj.is_first_pass_decoder, + ) + + setattr(base_model, f"{task_name}_decoder", task_decoder) + decoder_model_cls = ( + FairseqEncoderModel + if task_obj.args.decoder_type == "ctc" + else FairseqLanguageModel + ) + base_model.multitask_decoders[task_name] = decoder_model_cls( + getattr(base_model, f"{task_name}_decoder") + ) + + assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder" + + # set up encoder on top of the auxiliary MT decoder + if getattr(args, "synthesizer_encoder_layers", 0) > 0: + base_model.synthesizer_encoder = cls.build_t2u_encoder(unit_args) + else: + base_model.synthesizer_encoder = None + + return base_model + + @classmethod + def build_t2u_encoder(cls, args): + _args = copy.deepcopy(args) + _args.encoder_layers = _args.synthesizer_encoder_layers + 
_args.encoder_embed_dim = args.decoder_embed_dim + _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim + _args.encoder_attention_heads = args.decoder_attention_heads + _args.encoder_normalize_before = True + return TransformerEncoderNoEmb(_args) + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + prev_output_tokens_mt, + return_all_hiddens=False, + tgt_speaker=None, + **kwargs, + ): + """ + The forward method inherited from the base class has a **kwargs + argument in its input, which is not supported in torchscript. This + method overwrites the forward method definition without **kwargs. + """ + encoder_out = self.encoder( + src_tokens=src_tokens, src_lengths=src_lengths, **kwargs + ) + + # 1. MT decoder + mt_decoder = getattr(self, f"{self.mt_task_name}_decoder") + mt_decoder_out = mt_decoder( + prev_output_tokens_mt, + encoder_out=encoder_out, + ) + x = mt_decoder_out[1]["inner_states"][-1] + if mt_decoder.layer_norm is not None: + x = mt_decoder.layer_norm(x) + if self.proj is not None: + x = self.proj(x) + + mt_decoder_padding_mask = None + if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any(): + mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx) + + # 2. T2U encoder + if self.synthesizer_encoder is not None: + t2u_encoder_out = self.synthesizer_encoder( + x, + mt_decoder_padding_mask, + ) + else: + t2u_encoder_out = { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [mt_decoder_padding_mask], # B x T + } + + # 3. T2U decoder + if self.t2u_augmented_cross_attn: + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + encoder_out_aug=t2u_encoder_out, + ) + else: + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=t2u_encoder_out, + ) + if return_all_hiddens: + decoder_out[-1]["encoder_states"] = encoder_out["encoder_out"] + # NOTE: from the top layer + decoder_out[-1]["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ] + decoder_out[-1]["mt_decoder_out"] = mt_decoder_out + return decoder_out + + +@register_model_architecture( + model_name="unity_xm_transformer", arch_name="unity_xm_transformer" +) +def base_architecture_unity(args): + set_default_general_args(args) + set_default_w2v_encoder_args(args) + set_default_adaptor_args(args) + set_default_transformer_decoder_args(args) + + args.layernorm_embedding = False + args.decoder_learned_pos = False + + +# for old models +@register_model_architecture( + model_name="unity_xm_transformer", arch_name="xm_transformer_t2" +) +def base_architecture_unity_legacy(args): + base_architecture_unity(args) diff --git a/fairseq/models/text_to_speech/__init__.py b/fairseq/models/text_to_speech/__init__.py new file mode 100644 index 0000000000..c0dcd69b07 --- /dev/null +++ b/fairseq/models/text_to_speech/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
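+# The star imports below are kept (and marked noqa) so that model/architecture
+# registration in these submodules runs when the package is imported.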
+ +from .tacotron2 import * # noqa +from .tts_transformer import * # noqa +from .fastspeech2 import * # noqa +from .vocoder import * # noqa diff --git a/fairseq/models/text_to_speech/codehifigan.py b/fairseq/models/text_to_speech/codehifigan.py new file mode 100644 index 0000000000..d1574dd63f --- /dev/null +++ b/fairseq/models/text_to_speech/codehifigan.py @@ -0,0 +1,95 @@ +from argparse import Namespace +import torch +import torch.nn as nn + +from fairseq.models.text_to_speech.fastspeech2 import VariancePredictor +from fairseq.models.text_to_speech.hifigan import Generator + + +class CodeGenerator(Generator): + def __init__(self, cfg): + super().__init__(cfg) + self.dict = nn.Embedding(cfg["num_embeddings"], cfg["embedding_dim"]) + self.multispkr = cfg.get("multispkr", None) + self.embedder = cfg.get("embedder_params", None) + + if self.multispkr and not self.embedder: + self.spkr = nn.Embedding(cfg.get("num_speakers", 200), cfg["embedding_dim"]) + elif self.embedder: + self.spkr = nn.Linear(cfg.get("embedder_dim", 256), cfg["embedding_dim"]) + + self.dur_predictor = None + if cfg.get("dur_predictor_params", None): + self.dur_predictor = VariancePredictor( + Namespace(**cfg["dur_predictor_params"]) + ) + + self.f0 = cfg.get("f0", None) + n_f0_bin = cfg.get("f0_quant_num_bin", 0) + self.f0_quant_embed = ( + None if n_f0_bin <= 0 else nn.Embedding(n_f0_bin, cfg["embedding_dim"]) + ) + + @staticmethod + def _upsample(signal, max_frames): + if signal.dim() == 3: + bsz, channels, cond_length = signal.size() + elif signal.dim() == 2: + signal = signal.unsqueeze(2) + bsz, channels, cond_length = signal.size() + else: + signal = signal.view(-1, 1, 1) + bsz, channels, cond_length = signal.size() + + signal = signal.unsqueeze(3).repeat(1, 1, 1, max_frames // cond_length) + + # pad zeros as needed (if signal's shape does not divide completely with max_frames) + reminder = (max_frames - signal.shape[2] * signal.shape[3]) // signal.shape[3] + if reminder > 0: + raise NotImplementedError( + "Padding condition signal - misalignment between condition features." 
+ ) + + signal = signal.view(bsz, channels, max_frames) + return signal + + def forward(self, **kwargs): + x = self.dict(kwargs["code"]).transpose(1, 2) + + if self.dur_predictor and kwargs.get("dur_prediction", False): + assert x.size(0) == 1, "only support single sample" + log_dur_pred = self.dur_predictor(x.transpose(1, 2)) + dur_out = torch.clamp( + torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1 + ) + # B x C x T + x = torch.repeat_interleave(x, dur_out.view(-1), dim=2) + + if self.f0: + if self.f0_quant_embed: + kwargs["f0"] = self.f0_quant_embed(kwargs["f0"].long()).transpose(1, 2) + else: + kwargs["f0"] = kwargs["f0"].unsqueeze(1) + + if x.shape[-1] < kwargs["f0"].shape[-1]: + x = self._upsample(x, kwargs["f0"].shape[-1]) + elif x.shape[-1] > kwargs["f0"].shape[-1]: + kwargs["f0"] = self._upsample(kwargs["f0"], x.shape[-1]) + x = torch.cat([x, kwargs["f0"]], dim=1) + + if self.multispkr: + assert ( + "spkr" in kwargs + ), 'require "spkr" input for multispeaker CodeHiFiGAN vocoder' + spkr = self.spkr(kwargs["spkr"]).transpose(1, 2) + spkr = self._upsample(spkr, x.shape[-1]) + x = torch.cat([x, spkr], dim=1) + + for k, feat in kwargs.items(): + if k in ["spkr", "code", "f0", "dur_prediction"]: + continue + + feat = self._upsample(feat, x.shape[-1]) + x = torch.cat([x, feat], dim=1) + + return super().forward(x) diff --git a/fairseq/models/text_to_speech/fastspeech2.py b/fairseq/models/text_to_speech/fastspeech2.py new file mode 100644 index 0000000000..fb2d0df37d --- /dev/null +++ b/fairseq/models/text_to_speech/fastspeech2.py @@ -0,0 +1,448 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +from torch import nn + +from fairseq import utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + register_model, + register_model_architecture, +) +from fairseq.models.text_to_speech.hub_interface import TTSHubInterface +from fairseq.models.text_to_speech.tacotron2 import Postnet +from fairseq.modules import ( + FairseqDropout, + LayerNorm, + MultiheadAttention, + PositionalEmbedding, +) + +logger = logging.getLogger(__name__) + + +def model_init(m): + if isinstance(m, nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("relu")) + + +def Embedding(num_embeddings, embedding_dim, padding_idx=None): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + return m + + +class PositionwiseFeedForward(nn.Module): + def __init__(self, in_dim, hidden_dim, kernel_size, dropout): + super().__init__() + self.ffn = nn.Sequential( + nn.Conv1d( + in_dim, + hidden_dim, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + ), + nn.ReLU(), + nn.Conv1d( + hidden_dim, + in_dim, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + ), + ) + self.layer_norm = LayerNorm(in_dim) + self.dropout = self.dropout_module = FairseqDropout( + p=dropout, module_name=self.__class__.__name__ + ) + + def forward(self, x): + # B x T x C + residual = x + x = self.ffn(x.transpose(1, 2)).transpose(1, 2) + x = self.dropout(x) + return self.layer_norm(x + residual) + + +class FFTLayer(torch.nn.Module): + def __init__( + self, embed_dim, n_heads, hidden_dim, kernel_size, dropout, attention_dropout + ): + super().__init__() + self.self_attn = 
MultiheadAttention( + embed_dim, n_heads, dropout=attention_dropout, self_attention=True + ) + self.layer_norm = LayerNorm(embed_dim) + self.ffn = PositionwiseFeedForward( + embed_dim, hidden_dim, kernel_size, dropout=dropout + ) + + def forward(self, x, padding_mask=None): + # B x T x C + residual = x + x = x.transpose(0, 1) + x, _ = self.self_attn( + query=x, key=x, value=x, key_padding_mask=padding_mask, need_weights=False + ) + x = x.transpose(0, 1) + x = self.layer_norm(x + residual) + return self.ffn(x) + + +class LengthRegulator(nn.Module): + def forward(self, x, durations): + # x: B x T x C + out_lens = durations.sum(dim=1) + max_len = out_lens.max() + bsz, seq_len, dim = x.size() + out = x.new_zeros((bsz, max_len, dim)) + + for b in range(bsz): + indices = [] + for t in range(seq_len): + indices.extend([t] * utils.item(durations[b, t])) + indices = torch.tensor(indices, dtype=torch.long).to(x.device) + out_len = utils.item(out_lens[b]) + out[b, :out_len] = x[b].index_select(0, indices) + + return out, out_lens + + +class VariancePredictor(nn.Module): + def __init__(self, args): + super().__init__() + self.conv1 = nn.Sequential( + nn.Conv1d( + args.encoder_embed_dim, + args.var_pred_hidden_dim, + kernel_size=args.var_pred_kernel_size, + padding=(args.var_pred_kernel_size - 1) // 2, + ), + nn.ReLU(), + ) + self.ln1 = nn.LayerNorm(args.var_pred_hidden_dim) + self.dropout_module = FairseqDropout( + p=args.var_pred_dropout, module_name=self.__class__.__name__ + ) + self.conv2 = nn.Sequential( + nn.Conv1d( + args.var_pred_hidden_dim, + args.var_pred_hidden_dim, + kernel_size=args.var_pred_kernel_size, + padding=1, + ), + nn.ReLU(), + ) + self.ln2 = nn.LayerNorm(args.var_pred_hidden_dim) + self.proj = nn.Linear(args.var_pred_hidden_dim, 1) + + def forward(self, x): + # Input: B x T x C; Output: B x T + x = self.conv1(x.transpose(1, 2)).transpose(1, 2) + x = self.dropout_module(self.ln1(x)) + x = self.conv2(x.transpose(1, 2)).transpose(1, 2) + x = self.dropout_module(self.ln2(x)) + return self.proj(x).squeeze(dim=2) + + +class VarianceAdaptor(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.length_regulator = LengthRegulator() + self.duration_predictor = VariancePredictor(args) + self.pitch_predictor = VariancePredictor(args) + self.energy_predictor = VariancePredictor(args) + + n_bins, steps = self.args.var_pred_n_bins, self.args.var_pred_n_bins - 1 + self.pitch_bins = torch.linspace(args.pitch_min, args.pitch_max, steps) + self.embed_pitch = Embedding(n_bins, args.encoder_embed_dim) + self.energy_bins = torch.linspace(args.energy_min, args.energy_max, steps) + self.embed_energy = Embedding(n_bins, args.encoder_embed_dim) + + def get_pitch_emb(self, x, tgt=None, factor=1.0): + out = self.pitch_predictor(x) + bins = self.pitch_bins.to(x.device) + if tgt is None: + out = out * factor + emb = self.embed_pitch(torch.bucketize(out, bins)) + else: + emb = self.embed_pitch(torch.bucketize(tgt, bins)) + return out, emb + + def get_energy_emb(self, x, tgt=None, factor=1.0): + out = self.energy_predictor(x) + bins = self.energy_bins.to(x.device) + if tgt is None: + out = out * factor + emb = self.embed_energy(torch.bucketize(out, bins)) + else: + emb = self.embed_energy(torch.bucketize(tgt, bins)) + return out, emb + + def forward( + self, + x, + padding_mask, + durations=None, + pitches=None, + energies=None, + d_factor=1.0, + p_factor=1.0, + e_factor=1.0, + ): + # x: B x T x C + log_dur_out = self.duration_predictor(x) + dur_out = torch.clamp( + 
torch.round((torch.exp(log_dur_out) - 1) * d_factor).long(), min=0 + ) + dur_out.masked_fill_(padding_mask, 0) + + pitch_out, pitch_emb = self.get_pitch_emb(x, pitches, p_factor) + x = x + pitch_emb + energy_out, energy_emb = self.get_energy_emb(x, energies, e_factor) + x = x + energy_emb + + x, out_lens = self.length_regulator( + x, dur_out if durations is None else durations + ) + + return x, out_lens, log_dur_out, pitch_out, energy_out + + +class FastSpeech2Encoder(FairseqEncoder): + def __init__(self, args, src_dict, embed_speaker): + super().__init__(src_dict) + self.args = args + self.padding_idx = src_dict.pad() + self.n_frames_per_step = args.n_frames_per_step + self.out_dim = args.output_frame_dim * args.n_frames_per_step + + self.embed_speaker = embed_speaker + self.spk_emb_proj = None + if embed_speaker is not None: + self.spk_emb_proj = nn.Linear( + args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim + ) + + self.dropout_module = FairseqDropout( + p=args.dropout, module_name=self.__class__.__name__ + ) + self.embed_tokens = Embedding( + len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx + ) + + self.embed_positions = PositionalEmbedding( + args.max_source_positions, args.encoder_embed_dim, self.padding_idx + ) + self.pos_emb_alpha = nn.Parameter(torch.ones(1)) + self.dec_pos_emb_alpha = nn.Parameter(torch.ones(1)) + + self.encoder_fft_layers = nn.ModuleList( + FFTLayer( + args.encoder_embed_dim, + args.encoder_attention_heads, + args.fft_hidden_dim, + args.fft_kernel_size, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + ) + for _ in range(args.encoder_layers) + ) + + self.var_adaptor = VarianceAdaptor(args) + + self.decoder_fft_layers = nn.ModuleList( + FFTLayer( + args.decoder_embed_dim, + args.decoder_attention_heads, + args.fft_hidden_dim, + args.fft_kernel_size, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + ) + for _ in range(args.decoder_layers) + ) + + self.out_proj = nn.Linear(args.decoder_embed_dim, self.out_dim) + + self.postnet = None + if args.add_postnet: + self.postnet = Postnet( + self.out_dim, + args.postnet_conv_dim, + args.postnet_conv_kernel_size, + args.postnet_layers, + args.postnet_dropout, + ) + + self.apply(model_init) + + def forward( + self, + src_tokens, + src_lengths=None, + speaker=None, + durations=None, + pitches=None, + energies=None, + **kwargs, + ): + x = self.embed_tokens(src_tokens) + + enc_padding_mask = src_tokens.eq(self.padding_idx) + x += self.pos_emb_alpha * self.embed_positions(enc_padding_mask) + x = self.dropout_module(x) + + for layer in self.encoder_fft_layers: + x = layer(x, enc_padding_mask) + + if self.embed_speaker is not None: + bsz, seq_len, _ = x.size() + emb = self.embed_speaker(speaker).expand(bsz, seq_len, -1) + x = self.spk_emb_proj(torch.cat([x, emb], dim=2)) + + x, out_lens, log_dur_out, pitch_out, energy_out = self.var_adaptor( + x, enc_padding_mask, durations, pitches, energies + ) + + dec_padding_mask = lengths_to_padding_mask(out_lens) + x += self.dec_pos_emb_alpha * self.embed_positions(dec_padding_mask) + for layer in self.decoder_fft_layers: + x = layer(x, dec_padding_mask) + + x = self.out_proj(x) + x_post = None + if self.postnet is not None: + x_post = x + self.postnet(x) + return x, x_post, out_lens, log_dur_out, pitch_out, energy_out + + +@register_model("fastspeech2") +class FastSpeech2Model(FairseqEncoderModel): + """ + Implementation for https://arxiv.org/abs/2006.04558 + """ + + NON_AUTOREGRESSIVE = True + + @classmethod + 
def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/s2" + model_ids = [ + "fastspeech2-en-ljspeech", + "fastspeech2-en-200_speaker-cv4", + ] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + config_yaml="config.yaml", + vocoder: str = "griffin_lim", + fp16: bool = False, + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + config_yaml=config_yaml, + vocoder=vocoder, + fp16=fp16, + **kwargs, + ) + return TTSHubInterface(x["args"], x["task"], x["models"][0]) + + @staticmethod + def add_args(parser): + parser.add_argument("--dropout", type=float) + parser.add_argument("--output-frame-dim", type=int) + parser.add_argument("--speaker-embed-dim", type=int) + # FFT blocks + parser.add_argument("--fft-hidden-dim", type=int) + parser.add_argument("--fft-kernel-size", type=int) + parser.add_argument("--attention-dropout", type=float) + parser.add_argument("--encoder-layers", type=int) + parser.add_argument("--encoder-embed-dim", type=int) + parser.add_argument("--encoder-attention-heads", type=int) + parser.add_argument("--decoder-layers", type=int) + parser.add_argument("--decoder-embed-dim", type=int) + parser.add_argument("--decoder-attention-heads", type=int) + # variance predictor + parser.add_argument("--var-pred-n-bins", type=int) + parser.add_argument("--var-pred-hidden-dim", type=int) + parser.add_argument("--var-pred-kernel-size", type=int) + parser.add_argument("--var-pred-dropout", type=float) + # postnet + parser.add_argument("--add-postnet", action="store_true") + parser.add_argument("--postnet-dropout", type=float) + parser.add_argument("--postnet-layers", type=int) + parser.add_argument("--postnet-conv-dim", type=int) + parser.add_argument("--postnet-conv-kernel-size", type=int) + + def __init__(self, encoder, args, src_dict): + super().__init__(encoder) + self._num_updates = 0 + + out_dim = args.output_frame_dim * args.n_frames_per_step + self.ctc_proj = None + if getattr(args, "ctc_weight", 0.0) > 0.0: + self.ctc_proj = nn.Linear(out_dim, len(src_dict)) + + @classmethod + def build_model(cls, args, task): + embed_speaker = task.get_speaker_embeddings(args) + encoder = FastSpeech2Encoder(args, task.src_dict, embed_speaker) + return cls(encoder, args, task.src_dict) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self._num_updates = num_updates + + def get_normalized_probs(self, net_output, log_probs, sample=None): + logits = self.ctc_proj(net_output[0]) + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + +@register_model_architecture("fastspeech2", "fastspeech2") +def base_architecture(args): + args.dropout = getattr(args, "dropout", 0.2) + args.output_frame_dim = getattr(args, "output_frame_dim", 80) + args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 64) + # FFT blocks + args.fft_hidden_dim = getattr(args, "fft_hidden_dim", 1024) + args.fft_kernel_size = getattr(args, "fft_kernel_size", 9) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.encoder_layers = getattr(args, "encoder_layers", 4) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 2) + args.decoder_layers = 
getattr(args, "decoder_layers", 4) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 2) + # variance predictor + args.var_pred_n_bins = getattr(args, "var_pred_n_bins", 256) + args.var_pred_hidden_dim = getattr(args, "var_pred_hidden_dim", 256) + args.var_pred_kernel_size = getattr(args, "var_pred_kernel_size", 3) + args.var_pred_dropout = getattr(args, "var_pred_dropout", 0.5) + # postnet + args.add_postnet = getattr(args, "add_postnet", False) + args.postnet_dropout = getattr(args, "postnet_dropout", 0.5) + args.postnet_layers = getattr(args, "postnet_layers", 5) + args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512) + args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5) diff --git a/fairseq/models/text_to_speech/hifigan.py b/fairseq/models/text_to_speech/hifigan.py new file mode 100644 index 0000000000..a852beefec --- /dev/null +++ b/fairseq/models/text_to_speech/hifigan.py @@ -0,0 +1,179 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Conv1d, ConvTranspose1d +from torch.nn.utils import remove_weight_norm, weight_norm + +LRELU_SLOPE = 0.1 + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + +class ResBlock(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for layer in self.convs1: + remove_weight_norm(layer) + for layer in self.convs2: + remove_weight_norm(layer) + + +class Generator(torch.nn.Module): + def __init__(self, cfg): + super(Generator, self).__init__() + self.num_kernels = len(cfg["resblock_kernel_sizes"]) + self.num_upsamples = len(cfg["upsample_rates"]) + self.conv_pre = weight_norm( + Conv1d( + cfg.get("model_in_dim", 80), + cfg["upsample_initial_channel"], + 7, + 1, + padding=3, + ) + ) + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate( + zip(cfg["upsample_rates"], cfg["upsample_kernel_sizes"]) + ): + self.ups.append( + weight_norm( + ConvTranspose1d( + cfg["upsample_initial_channel"] // (2**i), + 
cfg["upsample_initial_channel"] // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = cfg["upsample_initial_channel"] // (2 ** (i + 1)) + for k, d in zip( + cfg["resblock_kernel_sizes"], cfg["resblock_dilation_sizes"] + ): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + x = self.conv_pre(x) + for i in range(self.num_upsamples): + x = F.leaky_relu(x, LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print("Removing weight norm...") + for layer in self.ups: + remove_weight_norm(layer) + for layer in self.resblocks: + layer.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) diff --git a/fairseq/models/text_to_speech/hub_interface.py b/fairseq/models/text_to_speech/hub_interface.py new file mode 100644 index 0000000000..e251c65c1d --- /dev/null +++ b/fairseq/models/text_to_speech/hub_interface.py @@ -0,0 +1,188 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import random +from pathlib import Path +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn + +logger = logging.getLogger(__name__) + + +class TTSHubInterface(nn.Module): + def __init__(self, cfg, task, model): + super().__init__() + self.cfg = cfg + self.task = task + self.model = model + self.model.eval() + + self.update_cfg_with_data_cfg(self.cfg, self.task.data_cfg) + self.generator = self.task.build_generator([self.model], self.cfg) + + @classmethod + def phonemize( + cls, + text: str, + lang: Optional[str], + phonemizer: Optional[str] = None, + preserve_punct: bool = False, + to_simplified_zh: bool = False, + ): + if to_simplified_zh: + import hanziconv + + text = hanziconv.HanziConv.toSimplified(text) + + if phonemizer == "g2p": + import g2p_en + + g2p = g2p_en.G2p() + if preserve_punct: + return " ".join("|" if p == " " else p for p in g2p(text)) + else: + res = [{",": "sp", ";": "sp"}.get(p, p) for p in g2p(text)] + return " ".join(p for p in res if p.isalnum()) + if phonemizer == "g2pc": + import g2pc + + g2p = g2pc.G2pC() + return " ".join([w[3] for w in g2p(text)]) + elif phonemizer == "ipa": + assert lang is not None + import phonemizer + from phonemizer.separator import Separator + + lang_map = {"en": "en-us", "fr": "fr-fr"} + return phonemizer.phonemize( + text, + backend="espeak", + language=lang_map.get(lang, lang), + separator=Separator(word="| ", phone=" "), + ) + else: + return text + + @classmethod + def tokenize(cls, text: str, tkn_cfg: Dict[str, str]): + sentencepiece_model = tkn_cfg.get("sentencepiece_model", None) + if sentencepiece_model is not None: + assert Path(sentencepiece_model).exists() + import sentencepiece as sp + + spm = sp.SentencePieceProcessor() + spm.Load(sentencepiece_model) + return " ".join(spm.Encode(text, out_type=str)) + else: + return text + + @classmethod + def update_cfg_with_data_cfg(cls, cfg, data_cfg): + cfg["task"].vocoder = 
data_cfg.vocoder.get("type", "griffin_lim") + + @classmethod + def get_model_input( + cls, task, text: str, speaker: Optional[int] = None, verbose: bool = False + ): + phonemized = cls.phonemize( + text, + task.data_cfg.hub.get("lang", None), + task.data_cfg.hub.get("phonemizer", None), + task.data_cfg.hub.get("preserve_punct", False), + task.data_cfg.hub.get("to_simplified_zh", False), + ) + tkn_cfg = task.data_cfg.bpe_tokenizer + tokenized = cls.tokenize(phonemized, tkn_cfg) + if verbose: + logger.info(f"text: {text}") + logger.info(f"phonemized: {phonemized}") + logger.info(f"tokenized: {tokenized}") + + spk = task.data_cfg.hub.get("speaker", speaker) + n_speakers = len(task.speaker_to_id or {}) + if spk is None and n_speakers > 0: + spk = random.randint(0, n_speakers - 1) + if spk is not None: + spk = max(0, min(spk, n_speakers - 1)) + if verbose: + logger.info(f"speaker: {spk}") + spk = None if spk is None else torch.Tensor([[spk]]).long() + + src_tokens = task.src_dict.encode_line(tokenized, add_if_not_exist=False).view( + 1, -1 + ) + src_lengths = torch.Tensor([len(tokenized.split())]).long() + return { + "net_input": { + "src_tokens": src_tokens, + "src_lengths": src_lengths, + "prev_output_tokens": None, + }, + "target_lengths": None, + "speaker": spk, + } + + @classmethod + def get_prediction(cls, task, model, generator, sample) -> Tuple[torch.Tensor, int]: + prediction = generator.generate(model, sample) + return prediction[0]["waveform"], task.sr + + def predict( + self, text: str, speaker: Optional[int] = None, verbose: bool = False + ) -> Tuple[torch.Tensor, int]: + sample = self.get_model_input(self.task, text, speaker, verbose=verbose) + return self.get_prediction(self.task, self.model, self.generator, sample) + + +class VocoderHubInterface(nn.Module): + """Vocoder interface to run vocoder models through hub. Currently we only support unit vocoder""" + + def __init__(self, cfg, model): + super().__init__() + self.vocoder = model + self.vocoder.eval() + self.sr = 16000 + self.multispkr = self.vocoder.model.multispkr + if self.multispkr: + logger.info("multi-speaker vocoder") + self.num_speakers = cfg.get( + "num_speakers", + 200, + ) # following the default in codehifigan to set to 200 + + def get_model_input( + self, + text: str, + speaker: Optional[int] = -1, + ): + units = list(map(int, text.strip().split())) + x = { + "code": torch.LongTensor(units).view(1, -1), + } + if not speaker: + speaker = -1 + if self.multispkr: + assert ( + speaker < self.num_speakers + ), f"invalid --speaker-id ({speaker}) with total #speakers = {self.num_speakers}" + spk = random.randint(0, self.num_speakers - 1) if speaker == -1 else speaker + x["spkr"] = torch.LongTensor([spk]).view(1, 1) + return x + + def get_prediction(self, sample, dur_prediction: Optional[bool] = True): + wav = self.vocoder(sample, dur_prediction) + return wav, self.sr + + def predict( + self, + text: str, + speaker: Optional[int] = None, + dur_prediction: Optional[bool] = True, + ): + sample = self.get_model_input(text, speaker) + return self.get_prediction(sample, dur_prediction) diff --git a/fairseq/models/text_to_speech/tacotron2.py b/fairseq/models/text_to_speech/tacotron2.py new file mode 100644 index 0000000000..4df4075617 --- /dev/null +++ b/fairseq/models/text_to_speech/tacotron2.py @@ -0,0 +1,380 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
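For illustration only (not part of this commit): a minimal usage sketch of the TTSHubInterface defined in hub_interface.py above. The checkpoint id comes from FastSpeech2Model.hub_models(); network access and an installed g2p_en phonemizer are assumed.

    from fairseq.models.text_to_speech.fastspeech2 import FastSpeech2Model

    # from_pretrained() resolves the id via hub_models(), downloads the tarball,
    # and returns a TTSHubInterface wrapping the model, task and generator.
    tts = FastSpeech2Model.from_pretrained(
        "fastspeech2-en-ljspeech", vocoder="griffin_lim"
    )
    wav, sample_rate = tts.predict("Hello, this is a test run.", verbose=True)
    # wav is a 1-D waveform tensor; sample_rate comes from the task (task.sr).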
+ +import logging + +import torch +from torch import nn +from torch.nn import functional as F + +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, + register_model, + register_model_architecture, +) +from fairseq.modules import LSTMCellWithZoneOut, LocationAttention + + +logger = logging.getLogger(__name__) + + +def encoder_init(m): + if isinstance(m, nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("relu")) + + +class Tacotron2Encoder(FairseqEncoder): + def __init__(self, args, src_dict, embed_speaker): + super().__init__(src_dict) + self.padding_idx = src_dict.pad() + self.embed_speaker = embed_speaker + self.spk_emb_proj = None + if embed_speaker is not None: + self.spk_emb_proj = nn.Linear( + args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim + ) + + self.embed_tokens = nn.Embedding( + len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx + ) + + assert args.encoder_conv_kernel_size % 2 == 1 + self.convolutions = nn.ModuleList( + nn.Sequential( + nn.Conv1d( + args.encoder_embed_dim, + args.encoder_embed_dim, + kernel_size=args.encoder_conv_kernel_size, + padding=((args.encoder_conv_kernel_size - 1) // 2), + ), + nn.BatchNorm1d(args.encoder_embed_dim), + nn.ReLU(), + nn.Dropout(args.encoder_dropout), + ) + for _ in range(args.encoder_conv_layers) + ) + + self.lstm = nn.LSTM( + args.encoder_embed_dim, + args.encoder_embed_dim // 2, + num_layers=args.encoder_lstm_layers, + batch_first=True, + bidirectional=True, + ) + + self.apply(encoder_init) + + def forward(self, src_tokens, src_lengths=None, speaker=None, **kwargs): + x = self.embed_tokens(src_tokens) + x = x.transpose(1, 2).contiguous() # B x T x C -> B x C x T + for conv in self.convolutions: + x = conv(x) + x = x.transpose(1, 2).contiguous() # B x C x T -> B x T x C + + src_lengths = src_lengths.cpu().long() + x = nn.utils.rnn.pack_padded_sequence(x, src_lengths, batch_first=True) + x = self.lstm(x)[0] + x = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)[0] + + encoder_padding_mask = src_tokens.eq(self.padding_idx) + + if self.embed_speaker is not None: + seq_len, bsz, _ = x.size() + emb = self.embed_speaker(speaker).expand(seq_len, bsz, -1) + x = self.spk_emb_proj(torch.cat([x, emb], dim=2)) + + return { + "encoder_out": [x], # B x T x C + "encoder_padding_mask": encoder_padding_mask, # B x T + } + + +class Prenet(nn.Module): + def __init__(self, in_dim, n_layers, n_units, dropout): + super().__init__() + self.layers = nn.ModuleList( + nn.Sequential(nn.Linear(in_dim if i == 0 else n_units, n_units), nn.ReLU()) + for i in range(n_layers) + ) + self.dropout = dropout + + def forward(self, x): + for layer in self.layers: + x = F.dropout(layer(x), p=self.dropout) # always applies dropout + return x + + +class Postnet(nn.Module): + def __init__(self, in_dim, n_channels, kernel_size, n_layers, dropout): + super(Postnet, self).__init__() + self.convolutions = nn.ModuleList() + assert kernel_size % 2 == 1 + for i in range(n_layers): + cur_layers = ( + [ + nn.Conv1d( + in_dim if i == 0 else n_channels, + n_channels if i < n_layers - 1 else in_dim, + kernel_size=kernel_size, + padding=((kernel_size - 1) // 2), + ), + nn.BatchNorm1d(n_channels if i < n_layers - 1 else in_dim), + ] + + ([nn.Tanh()] if i < n_layers - 1 else []) + + [nn.Dropout(dropout)] + ) + nn.init.xavier_uniform_( + cur_layers[0].weight, + torch.nn.init.calculate_gain("tanh" if i < n_layers - 1 else "linear"), + ) + 
self.convolutions.append(nn.Sequential(*cur_layers)) + + def forward(self, x): + x = x.transpose(1, 2) # B x T x C -> B x C x T + for conv in self.convolutions: + x = conv(x) + return x.transpose(1, 2) + + +def decoder_init(m): + if isinstance(m, torch.nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("tanh")) + + +class Tacotron2Decoder(FairseqIncrementalDecoder): + def __init__(self, args, src_dict): + super().__init__(None) + self.args = args + self.n_frames_per_step = args.n_frames_per_step + self.out_dim = args.output_frame_dim * args.n_frames_per_step + + self.prenet = Prenet( + self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout + ) + + # take prev_context, prev_frame, (speaker embedding) as input + self.attention_lstm = LSTMCellWithZoneOut( + args.zoneout, + args.prenet_dim + args.encoder_embed_dim, + args.decoder_lstm_dim, + ) + + # take attention_lstm output, attention_state, encoder_out as input + self.attention = LocationAttention( + args.attention_dim, + args.encoder_embed_dim, + args.decoder_lstm_dim, + (1 + int(args.attention_use_cumprob)), + args.attention_conv_dim, + args.attention_conv_kernel_size, + ) + + # take attention_lstm output, context, (gated_latent) as input + self.lstm = nn.ModuleList( + LSTMCellWithZoneOut( + args.zoneout, + args.encoder_embed_dim + args.decoder_lstm_dim, + args.decoder_lstm_dim, + ) + for i in range(args.decoder_lstm_layers) + ) + + proj_in_dim = args.encoder_embed_dim + args.decoder_lstm_dim + self.feat_proj = nn.Linear(proj_in_dim, self.out_dim) + self.eos_proj = nn.Linear(proj_in_dim, 1) + + self.postnet = Postnet( + self.out_dim, + args.postnet_conv_dim, + args.postnet_conv_kernel_size, + args.postnet_layers, + args.postnet_dropout, + ) + + self.ctc_proj = None + if getattr(args, "ctc_weight", 0.0) > 0.0: + self.ctc_proj = nn.Linear(self.out_dim, len(src_dict)) + + self.apply(decoder_init) + + def _get_states(self, incremental_state, enc_out): + bsz, in_len, _ = enc_out.size() + alstm_h = self.get_incremental_state(incremental_state, "alstm_h") + if alstm_h is None: + alstm_h = enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) + alstm_c = self.get_incremental_state(incremental_state, "alstm_c") + if alstm_c is None: + alstm_c = enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) + + lstm_h = self.get_incremental_state(incremental_state, "lstm_h") + if lstm_h is None: + lstm_h = [ + enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) + for _ in range(self.args.decoder_lstm_layers) + ] + lstm_c = self.get_incremental_state(incremental_state, "lstm_c") + if lstm_c is None: + lstm_c = [ + enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) + for _ in range(self.args.decoder_lstm_layers) + ] + + attn_w = self.get_incremental_state(incremental_state, "attn_w") + if attn_w is None: + attn_w = enc_out.new_zeros(bsz, in_len) + attn_w_cum = self.get_incremental_state(incremental_state, "attn_w_cum") + if attn_w_cum is None: + attn_w_cum = enc_out.new_zeros(bsz, in_len) + return alstm_h, alstm_c, lstm_h, lstm_c, attn_w, attn_w_cum + + def _get_init_attn_c(self, enc_out, enc_mask): + bsz = enc_out.size(0) + if self.args.init_attn_c == "zero": + return enc_out.new_zeros(bsz, self.args.encoder_embed_dim) + elif self.args.init_attn_c == "avg": + enc_w = (~enc_mask).type(enc_out.type()) + enc_w = enc_w / enc_w.sum(dim=1, keepdim=True) + return torch.sum(enc_out * enc_w.unsqueeze(2), dim=1) + else: + raise ValueError(f"{self.args.init_attn_c} not supported") + + def forward( + self, + prev_output_tokens, + 
encoder_out=None, + incremental_state=None, + target_lengths=None, + **kwargs, + ): + enc_mask = encoder_out["encoder_padding_mask"] + enc_out = encoder_out["encoder_out"][0] + in_len = enc_out.size(1) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:, :] + bsz, out_len, _ = prev_output_tokens.size() + + prenet_out = self.prenet(prev_output_tokens) + (alstm_h, alstm_c, lstm_h, lstm_c, attn_w, attn_w_cum) = self._get_states( + incremental_state, enc_out + ) + attn_ctx = self._get_init_attn_c(enc_out, enc_mask) + + attn_out = enc_out.new_zeros(bsz, in_len, out_len) + feat_out = enc_out.new_zeros(bsz, out_len, self.out_dim) + eos_out = enc_out.new_zeros(bsz, out_len) + for t in range(out_len): + alstm_in = torch.cat((attn_ctx, prenet_out[:, t, :]), dim=1) + alstm_h, alstm_c = self.attention_lstm(alstm_in, (alstm_h, alstm_c)) + + attn_state = attn_w.unsqueeze(1) + if self.args.attention_use_cumprob: + attn_state = torch.stack((attn_w, attn_w_cum), dim=1) + attn_ctx, attn_w = self.attention(enc_out, enc_mask, alstm_h, attn_state) + attn_w_cum = attn_w_cum + attn_w + attn_out[:, :, t] = attn_w + + for i, cur_lstm in enumerate(self.lstm): + if i == 0: + lstm_in = torch.cat((attn_ctx, alstm_h), dim=1) + else: + lstm_in = torch.cat((attn_ctx, lstm_h[i - 1]), dim=1) + lstm_h[i], lstm_c[i] = cur_lstm(lstm_in, (lstm_h[i], lstm_c[i])) + + proj_in = torch.cat((attn_ctx, lstm_h[-1]), dim=1) + feat_out[:, t, :] = self.feat_proj(proj_in) + eos_out[:, t] = self.eos_proj(proj_in).squeeze(1) + self.attention.clear_cache() + + self.set_incremental_state(incremental_state, "alstm_h", alstm_h) + self.set_incremental_state(incremental_state, "alstm_c", alstm_c) + self.set_incremental_state(incremental_state, "lstm_h", lstm_h) + self.set_incremental_state(incremental_state, "lstm_c", lstm_c) + self.set_incremental_state(incremental_state, "attn_w", attn_w) + self.set_incremental_state(incremental_state, "attn_w_cum", attn_w_cum) + + post_feat_out = feat_out + self.postnet(feat_out) + eos_out = eos_out.view(bsz, out_len, 1) + return post_feat_out, eos_out, {"attn": attn_out, "feature_out": feat_out} + + +@register_model("tacotron_2") +class Tacotron2Model(FairseqEncoderDecoderModel): + """ + Implementation for https://arxiv.org/pdf/1712.05884.pdf + """ + + @staticmethod + def add_args(parser): + # encoder + parser.add_argument("--encoder-dropout", type=float) + parser.add_argument("--encoder-embed-dim", type=int) + parser.add_argument("--encoder-conv-layers", type=int) + parser.add_argument("--encoder-conv-kernel-size", type=int) + parser.add_argument("--encoder-lstm-layers", type=int) + # decoder + parser.add_argument("--attention-dim", type=int) + parser.add_argument("--attention-conv-dim", type=int) + parser.add_argument("--attention-conv-kernel-size", type=int) + parser.add_argument("--prenet-dropout", type=float) + parser.add_argument("--prenet-layers", type=int) + parser.add_argument("--prenet-dim", type=int) + parser.add_argument("--postnet-dropout", type=float) + parser.add_argument("--postnet-layers", type=int) + parser.add_argument("--postnet-conv-dim", type=int) + parser.add_argument("--postnet-conv-kernel-size", type=int) + parser.add_argument("--init-attn-c", type=str) + parser.add_argument("--attention-use-cumprob", action="store_true") + parser.add_argument("--zoneout", type=float) + parser.add_argument("--decoder-lstm-layers", type=int) + parser.add_argument("--decoder-lstm-dim", type=int) + parser.add_argument("--output-frame-dim", type=int) + + def 
__init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._num_updates = 0 + + @classmethod + def build_model(cls, args, task): + embed_speaker = task.get_speaker_embeddings(args) + encoder = Tacotron2Encoder(args, task.src_dict, embed_speaker) + decoder = Tacotron2Decoder(args, task.src_dict) + return cls(encoder, decoder) + + def forward_encoder(self, src_tokens, src_lengths, **kwargs): + return self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self._num_updates = num_updates + + +@register_model_architecture("tacotron_2", "tacotron_2") +def base_architecture(args): + # encoder + args.encoder_dropout = getattr(args, "encoder_dropout", 0.5) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_conv_layers = getattr(args, "encoder_conv_layers", 3) + args.encoder_conv_kernel_size = getattr(args, "encoder_conv_kernel_size", 5) + args.encoder_lstm_layers = getattr(args, "encoder_lstm_layers", 1) + # decoder + args.attention_dim = getattr(args, "attention_dim", 128) + args.attention_conv_dim = getattr(args, "attention_conv_dim", 32) + args.attention_conv_kernel_size = getattr(args, "attention_conv_kernel_size", 15) + args.prenet_dropout = getattr(args, "prenet_dropout", 0.5) + args.prenet_layers = getattr(args, "prenet_layers", 2) + args.prenet_dim = getattr(args, "prenet_dim", 256) + args.postnet_dropout = getattr(args, "postnet_dropout", 0.5) + args.postnet_layers = getattr(args, "postnet_layers", 5) + args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512) + args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5) + args.init_attn_c = getattr(args, "init_attn_c", "zero") + args.attention_use_cumprob = getattr(args, "attention_use_cumprob", True) + args.zoneout = getattr(args, "zoneout", 0.1) + args.decoder_lstm_layers = getattr(args, "decoder_lstm_layers", 2) + args.decoder_lstm_dim = getattr(args, "decoder_lstm_dim", 1024) + args.output_frame_dim = getattr(args, "output_frame_dim", 80) diff --git a/fairseq/models/text_to_speech/tts_transformer.py b/fairseq/models/text_to_speech/tts_transformer.py new file mode 100644 index 0000000000..19afc2b717 --- /dev/null +++ b/fairseq/models/text_to_speech/tts_transformer.py @@ -0,0 +1,454 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
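For illustration only (not part of this commit): tacotron_2 above and tts_transformer below share the same fairseq convention, in which add_args() declares flags without defaults and the function registered via register_model_architecture back-fills whatever the user left unset. Because fairseq adds model-specific flags with argparse.SUPPRESS, unset flags never appear on args, so the getattr default applies. A tiny standalone sketch of that pattern, with made-up attribute values:

    from types import SimpleNamespace

    def base_architecture(args):
        # back-fill only the attributes the user did not set on the command line
        args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
        args.prenet_dropout = getattr(args, "prenet_dropout", 0.5)

    args = SimpleNamespace(encoder_embed_dim=256)  # user overrode one flag
    base_architecture(args)
    print(args.encoder_embed_dim, args.prenet_dropout)  # -> 256 0.5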
+ +import logging +from typing import List, Optional + +import torch +from torch import nn + +from fairseq import utils +from fairseq.data.data_utils import lengths_to_padding_mask +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, + register_model, + register_model_architecture, +) +from fairseq.models.text_to_speech.hub_interface import TTSHubInterface +from fairseq.models.text_to_speech.tacotron2 import Postnet, Prenet +from fairseq.modules import ( + FairseqDropout, + LayerNorm, + PositionalEmbedding, + TransformerDecoderLayer, + TransformerEncoderLayer, +) + +logger = logging.getLogger(__name__) + + +def encoder_init(m): + if isinstance(m, nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("relu")) + + +def Embedding(num_embeddings, embedding_dim): + m = nn.Embedding(num_embeddings, embedding_dim) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + return m + + +class TTSTransformerEncoder(FairseqEncoder): + def __init__(self, args, src_dict, embed_speaker): + super().__init__(src_dict) + self.padding_idx = src_dict.pad() + self.embed_speaker = embed_speaker + self.spk_emb_proj = None + if embed_speaker is not None: + self.spk_emb_proj = nn.Linear( + args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim + ) + + self.dropout_module = FairseqDropout( + p=args.dropout, module_name=self.__class__.__name__ + ) + self.embed_tokens = nn.Embedding( + len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx + ) + assert args.encoder_conv_kernel_size % 2 == 1 + self.prenet = nn.ModuleList( + nn.Sequential( + nn.Conv1d( + args.encoder_embed_dim, + args.encoder_embed_dim, + kernel_size=args.encoder_conv_kernel_size, + padding=((args.encoder_conv_kernel_size - 1) // 2), + ), + nn.BatchNorm1d(args.encoder_embed_dim), + nn.ReLU(), + nn.Dropout(args.encoder_dropout), + ) + for _ in range(args.encoder_conv_layers) + ) + self.prenet_proj = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim) + self.embed_positions = PositionalEmbedding( + args.max_source_positions, args.encoder_embed_dim, self.padding_idx + ) + self.pos_emb_alpha = nn.Parameter(torch.ones(1)) + + self.transformer_layers = nn.ModuleList( + TransformerEncoderLayer(args) + for _ in range(args.encoder_transformer_layers) + ) + if args.encoder_normalize_before: + self.layer_norm = LayerNorm(args.encoder_embed_dim) + else: + self.layer_norm = None + + self.apply(encoder_init) + + def forward(self, src_tokens, src_lengths=None, speaker=None, **kwargs): + x = self.embed_tokens(src_tokens) + x = x.transpose(1, 2).contiguous() # B x T x C -> B x C x T + for conv in self.prenet: + x = conv(x) + x = x.transpose(1, 2).contiguous() # B x C x T -> B x T x C + x = self.prenet_proj(x) + + padding_mask = src_tokens.eq(self.padding_idx) + positions = self.embed_positions(padding_mask) + x += self.pos_emb_alpha * positions + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + for layer in self.transformer_layers: + x = layer(x, padding_mask) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + if self.embed_speaker is not None: + seq_len, bsz, _ = x.size() + emb = self.embed_speaker(speaker).transpose(0, 1) + emb = emb.expand(seq_len, bsz, -1) + x = self.spk_emb_proj(torch.cat([x, emb], dim=2)) + + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [padding_mask] + if padding_mask.any() + else [], # B x T + "encoder_embedding": [], # B x T x C + "encoder_states": [], # 
List[T x B x C] + "src_tokens": [], + "src_lengths": [], + } + + +def decoder_init(m): + if isinstance(m, torch.nn.Conv1d): + nn.init.xavier_uniform_(m.weight, torch.nn.init.calculate_gain("tanh")) + + +class TTSTransformerDecoder(FairseqIncrementalDecoder): + def __init__(self, args, src_dict, padding_idx=1): + super().__init__(None) + self._future_mask = torch.empty(0) + + self.args = args + self.padding_idx = src_dict.pad() if src_dict else padding_idx + self.n_frames_per_step = args.n_frames_per_step + self.out_dim = args.output_frame_dim * args.n_frames_per_step + + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.embed_positions = PositionalEmbedding( + args.max_target_positions, args.decoder_embed_dim, self.padding_idx + ) + self.pos_emb_alpha = nn.Parameter(torch.ones(1)) + self.prenet = nn.Sequential( + Prenet( + self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout + ), + nn.Linear(args.prenet_dim, args.decoder_embed_dim), + ) + + self.n_transformer_layers = args.decoder_transformer_layers + self.transformer_layers = nn.ModuleList( + TransformerDecoderLayer(args) for _ in range(self.n_transformer_layers) + ) + if args.decoder_normalize_before: + self.layer_norm = LayerNorm(args.decoder_embed_dim) + else: + self.layer_norm = None + + self.feat_proj = nn.Linear(args.decoder_embed_dim, self.out_dim) + self.eos_proj = nn.Linear(args.decoder_embed_dim, 1) + + self.postnet = Postnet( + self.out_dim, + args.postnet_conv_dim, + args.postnet_conv_kernel_size, + args.postnet_layers, + args.postnet_dropout, + ) + + self.ctc_proj = None + if getattr(args, "ctc_weight", 0.0) > 0.0: + self.ctc_proj = nn.Linear(self.out_dim, len(src_dict)) + + self.apply(decoder_init) + + def extract_features( + self, + prev_outputs, + encoder_out=None, + incremental_state=None, + target_lengths=None, + speaker=None, + **kwargs, + ): + alignment_layer = self.n_transformer_layers - 1 + self_attn_padding_mask = lengths_to_padding_mask(target_lengths) + positions = self.embed_positions( + self_attn_padding_mask, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_outputs = prev_outputs[:, -1:, :] + self_attn_padding_mask = self_attn_padding_mask[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + x = self.prenet(prev_outputs) + x += self.pos_emb_alpha * positions + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + if not self_attn_padding_mask.any(): + self_attn_padding_mask = None + + attn: Optional[torch.Tensor] = None + inner_states: List[Optional[torch.Tensor]] = [x] + for idx, transformer_layer in enumerate(self.transformer_layers): + if incremental_state is None: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, _ = transformer_layer( + x, + encoder_out["encoder_out"][0] + if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) + else None, + encoder_out["encoder_padding_mask"][0] + if ( + encoder_out is not None + and len(encoder_out["encoder_padding_mask"]) > 0 + ) + else None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + + if attn is not None: + # average probabilities over heads, transpose to + # (B, src_len, tgt_len) + 
attn = attn.mean(dim=0).transpose(2, 1) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, {"attn": attn, "inner_states": inner_states} + + def forward( + self, + prev_output_tokens, + encoder_out=None, + incremental_state=None, + target_lengths=None, + speaker=None, + **kwargs, + ): + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + target_lengths=target_lengths, + speaker=speaker, + **kwargs, + ) + attn = extra["attn"] + feat_out = self.feat_proj(x) + bsz, seq_len, _ = x.size() + eos_out = self.eos_proj(x) + post_feat_out = feat_out + self.postnet(feat_out) + return ( + post_feat_out, + eos_out, + { + "attn": attn, + "feature_out": feat_out, + "inner_states": extra["inner_states"], + }, + ) + + def get_normalized_probs(self, net_output, log_probs, sample): + logits = self.ctc_proj(net_output[2]["feature_out"]) + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. + if ( + self._future_mask.size(0) == 0 + or (not self._future_mask.device == tensor.device) + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 + ) + self._future_mask = self._future_mask.to(tensor) + return self._future_mask[:dim, :dim] + + +@register_model("tts_transformer") +class TTSTransformerModel(FairseqEncoderDecoderModel): + """ + Implementation for https://arxiv.org/pdf/1809.08895.pdf + """ + + @classmethod + def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/s2" + model_ids = [ + "tts_transformer-en-ljspeech", + "tts_transformer-en-200_speaker-cv4", + "tts_transformer-es-css10", + "tts_transformer-fr-cv7_css10", + "tts_transformer-ru-cv7_css10", + "tts_transformer-zh-cv7_css10", + "tts_transformer-ar-cv7_css10", + "tts_transformer-tr-cv7_css10", + "tts_transformer-vi-cv7", + ] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + config_yaml="config.yaml", + vocoder: str = "griffin_lim", + fp16: bool = False, + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + config_yaml=config_yaml, + vocoder=vocoder, + fp16=fp16, + **kwargs, + ) + return TTSHubInterface(x["args"], x["task"], x["models"][0]) + + @staticmethod + def add_args(parser): + parser.add_argument("--dropout", type=float) + parser.add_argument("--output-frame-dim", type=int) + parser.add_argument("--speaker-embed-dim", type=int) + # encoder prenet + parser.add_argument("--encoder-dropout", type=float) + parser.add_argument("--encoder-conv-layers", type=int) + parser.add_argument("--encoder-conv-kernel-size", type=int) + # encoder transformer layers + parser.add_argument("--encoder-transformer-layers", type=int) + parser.add_argument("--encoder-embed-dim", type=int) + parser.add_argument("--encoder-ffn-embed-dim", type=int) + parser.add_argument("--encoder-normalize-before", action="store_true") + parser.add_argument("--encoder-attention-heads", type=int) + parser.add_argument("--attention-dropout", type=float) + 
parser.add_argument("--activation-dropout", "--relu-dropout", type=float) + parser.add_argument("--activation-fn", type=str, default="relu") + # decoder prenet + parser.add_argument("--prenet-dropout", type=float) + parser.add_argument("--prenet-layers", type=int) + parser.add_argument("--prenet-dim", type=int) + # decoder postnet + parser.add_argument("--postnet-dropout", type=float) + parser.add_argument("--postnet-layers", type=int) + parser.add_argument("--postnet-conv-dim", type=int) + parser.add_argument("--postnet-conv-kernel-size", type=int) + # decoder transformer layers + parser.add_argument("--decoder-transformer-layers", type=int) + parser.add_argument("--decoder-embed-dim", type=int) + parser.add_argument("--decoder-ffn-embed-dim", type=int) + parser.add_argument("--decoder-normalize-before", action="store_true") + parser.add_argument("--decoder-attention-heads", type=int) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._num_updates = 0 + + @classmethod + def build_model(cls, args, task): + embed_speaker = task.get_speaker_embeddings(args) + encoder = TTSTransformerEncoder(args, task.src_dict, embed_speaker) + decoder = TTSTransformerDecoder(args, task.src_dict) + return cls(encoder, decoder) + + def forward_encoder(self, src_tokens, src_lengths, speaker=None, **kwargs): + return self.encoder( + src_tokens, src_lengths=src_lengths, speaker=speaker, **kwargs + ) + + def set_num_updates(self, num_updates): + super().set_num_updates(num_updates) + self._num_updates = num_updates + + +@register_model_architecture("tts_transformer", "tts_transformer") +def base_architecture(args): + args.dropout = getattr(args, "dropout", 0.1) + args.output_frame_dim = getattr(args, "output_frame_dim", 80) + args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 64) + # encoder prenet + args.encoder_dropout = getattr(args, "encoder_dropout", 0.5) + args.encoder_conv_layers = getattr(args, "encoder_conv_layers", 3) + args.encoder_conv_kernel_size = getattr(args, "encoder_conv_kernel_size", 5) + # encoder transformer layers + args.encoder_transformer_layers = getattr(args, "encoder_transformer_layers", 6) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr( + args, "encoder_ffn_embed_dim", 4 * args.encoder_embed_dim + ) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + # decoder prenet + args.prenet_dropout = getattr(args, "prenet_dropout", 0.5) + args.prenet_layers = getattr(args, "prenet_layers", 2) + args.prenet_dim = getattr(args, "prenet_dim", 256) + # decoder postnet + args.postnet_dropout = getattr(args, "postnet_dropout", 0.5) + args.postnet_layers = getattr(args, "postnet_layers", 5) + args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512) + args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5) + # decoder transformer layers + args.decoder_transformer_layers = getattr(args, "decoder_transformer_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", 4 * args.decoder_embed_dim + ) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + 
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) diff --git a/fairseq/models/text_to_speech/vocoder.py b/fairseq/models/text_to_speech/vocoder.py new file mode 100644 index 0000000000..dbc02da368 --- /dev/null +++ b/fairseq/models/text_to_speech/vocoder.py @@ -0,0 +1,305 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import logging +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from fairseq.data.audio.audio_utils import ( + TTSSpectrogram, + get_fourier_basis, + get_mel_filters, + get_window, +) +from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.text_to_speech.codehifigan import CodeGenerator as CodeHiFiGANModel +from fairseq.models.text_to_speech.hifigan import Generator as HiFiGANModel +from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface + +logger = logging.getLogger(__name__) + + +class PseudoInverseMelScale(torch.nn.Module): + def __init__(self, n_stft, n_mels, sample_rate, f_min, f_max) -> None: + super(PseudoInverseMelScale, self).__init__() + self.n_mels = n_mels + basis = get_mel_filters(sample_rate, (n_stft - 1) * 2, n_mels, f_min, f_max) + basis = torch.pinverse(basis) # F x F_mel + self.register_buffer("basis", basis) + + def forward(self, melspec: torch.Tensor) -> torch.Tensor: + # pack batch + shape = melspec.shape # B_1 x ... x B_K x F_mel x T + n_mels, time = shape[-2], shape[-1] + melspec = melspec.view(-1, n_mels, time) + + freq, _ = self.basis.size() # F x F_mel + assert self.n_mels == n_mels, (self.n_mels, n_mels) + specgram = self.basis.matmul(melspec).clamp(min=0) + + # unpack batch + specgram = specgram.view(shape[:-2] + (freq, time)) + return specgram + + +class GriffinLim(torch.nn.Module): + def __init__( + self, + n_fft: int, + win_length: int, + hop_length: int, + n_iter: int, + window_fn=torch.hann_window, + ): + super(GriffinLim, self).__init__() + self.transform = TTSSpectrogram( + n_fft, win_length, hop_length, return_phase=True + ) + + basis = get_fourier_basis(n_fft) + basis = torch.pinverse(n_fft / hop_length * basis).T[:, None, :] + basis *= get_window(window_fn, n_fft, win_length) + self.register_buffer("basis", basis) + + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.n_iter = n_iter + + self.tiny = 1.1754944e-38 + + @classmethod + def get_window_sum_square( + cls, n_frames, hop_length, win_length, n_fft, window_fn=torch.hann_window + ) -> torch.Tensor: + w_sq = get_window(window_fn, n_fft, win_length) ** 2 + n = n_fft + hop_length * (n_frames - 1) + x = torch.zeros(n, dtype=torch.float32) + for i in range(n_frames): + ofst = i * hop_length + x[ofst : min(n, ofst + n_fft)] += w_sq[: max(0, min(n_fft, n - ofst))] + return x + + def inverse(self, magnitude: torch.Tensor, phase) -> torch.Tensor: + x = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + x = F.conv_transpose1d(x, self.basis, stride=self.hop_length) + win_sum_sq = self.get_window_sum_square( + magnitude.shape[-1], + hop_length=self.hop_length, + win_length=self.win_length, + n_fft=self.n_fft, + ).to(magnitude.device) + # remove modulation effects + approx_nonzero_indices = win_sum_sq > self.tiny + x[:, :, approx_nonzero_indices] /= 
win_sum_sq[approx_nonzero_indices] + x *= self.n_fft / self.hop_length + x = x[:, :, self.n_fft // 2 :] + x = x[:, :, : -self.n_fft // 2 :] + return x + + def forward(self, specgram: torch.Tensor) -> torch.Tensor: + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*specgram.shape))) + angles = torch.from_numpy(angles).to(specgram) + _specgram = specgram.view(-1, specgram.shape[-2], specgram.shape[-1]) + waveform = self.inverse(_specgram, angles).squeeze(1) + for _ in range(self.n_iter): + _, angles = self.transform(waveform) + waveform = self.inverse(_specgram, angles).squeeze(1) + return waveform.squeeze(0) + + +class GriffinLimVocoder(nn.Module): + def __init__( + self, + sample_rate, + win_size, + hop_size, + n_fft, + n_mels, + f_min, + f_max, + window_fn, + spec_bwd_max_iter=32, + fp16=False, + ): + super().__init__() + self.inv_mel_transform = PseudoInverseMelScale( + n_stft=n_fft // 2 + 1, + n_mels=n_mels, + sample_rate=sample_rate, + f_min=f_min, + f_max=f_max, + ) + self.gl_transform = GriffinLim( + n_fft=n_fft, + win_length=win_size, + hop_length=hop_size, + window_fn=window_fn, + n_iter=spec_bwd_max_iter, + ) + if fp16: + self.half() + self.inv_mel_transform.half() + self.gl_transform.half() + else: + self.float() + self.inv_mel_transform.float() + self.gl_transform.float() + + def forward(self, x): + # x: (B x) T x D -> (B x) 1 x T + # NOTE: batched forward produces noisier waveform. recommend running + # one utterance at a time + self.eval() + x = x.exp().transpose(-1, -2) + x = self.inv_mel_transform(x) + x = self.gl_transform(x) + return x + + @classmethod + def from_data_cfg(cls, args, data_cfg: S2TDataConfig): + feat_cfg = data_cfg.config["features"] + window_fn = getattr(torch, feat_cfg["window_fn"] + "_window") + return cls( + sample_rate=feat_cfg["sample_rate"], + win_size=int(feat_cfg["win_len_t"] * feat_cfg["sample_rate"]), + hop_size=int(feat_cfg["hop_len_t"] * feat_cfg["sample_rate"]), + n_fft=feat_cfg["n_fft"], + n_mels=feat_cfg["n_mels"], + f_min=feat_cfg["f_min"], + f_max=feat_cfg["f_max"], + window_fn=window_fn, + spec_bwd_max_iter=args.spec_bwd_max_iter, + fp16=args.fp16, + ) + + +class HiFiGANVocoder(nn.Module): + def __init__( + self, checkpoint_path: str, model_cfg: Dict[str, str], fp16: bool = False + ) -> None: + super().__init__() + self.model = HiFiGANModel(model_cfg) + state_dict = torch.load(checkpoint_path) + self.model.load_state_dict(state_dict["generator"]) + if fp16: + self.model.half() + logger.info(f"loaded HiFiGAN checkpoint from {checkpoint_path}") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # (B x) T x D -> (B x) 1 x T + model = self.model.eval() + if len(x.shape) == 2: + return model(x.unsqueeze(0).transpose(1, 2)).detach().squeeze(0) + else: + return model(x.transpose(-1, -2)).detach() + + @classmethod + def from_data_cfg(cls, args, data_cfg: S2TDataConfig): + vocoder_cfg = data_cfg.vocoder + assert vocoder_cfg.get("type", "griffin_lim") == "hifigan" + with open(vocoder_cfg["config"]) as f: + model_cfg = json.load(f) + return cls(vocoder_cfg["checkpoint"], model_cfg, fp16=args.fp16) + + +@register_model("CodeHiFiGANVocoder") +class CodeHiFiGANVocoder(BaseFairseqModel): + def __init__( + self, checkpoint_path: str, model_cfg: Dict[str, str], fp16: bool = False + ) -> None: + super().__init__() + self.model = CodeHiFiGANModel(model_cfg) + if torch.cuda.is_available(): + state_dict = torch.load(checkpoint_path) + else: + state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu")) + 
self.model.load_state_dict(state_dict["generator"]) + self.model.eval() + if fp16: + self.model.half() + self.model.remove_weight_norm() + logger.info(f"loaded CodeHiFiGAN checkpoint from {checkpoint_path}") + + def forward(self, x: Dict[str, torch.Tensor], dur_prediction=False) -> torch.Tensor: + assert "code" in x + x["dur_prediction"] = dur_prediction + + # remove invalid code + mask = x["code"] >= 0 + x["code"] = x["code"][mask].unsqueeze(dim=0) + if "f0" in x: + f0_up_ratio = x["f0"].size(1) // x["code"].size(1) + mask = mask.unsqueeze(2).repeat(1, 1, f0_up_ratio).view(-1, x["f0"].size(1)) + x["f0"] = x["f0"][mask].unsqueeze(dim=0) + + return self.model(**x).detach().squeeze() + + @classmethod + def from_data_cfg(cls, args, data_cfg): + vocoder_cfg = data_cfg.vocoder + assert vocoder_cfg is not None, "vocoder not specified in the data config" + with open(vocoder_cfg["config"]) as f: + model_cfg = json.load(f) + return cls(vocoder_cfg["checkpoint"], model_cfg, fp16=args.fp16) + + @classmethod + def hub_models(cls): + base_url = "http://dl.fbaipublicfiles.com/fairseq/vocoder" + model_ids = [ + "unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur", + "unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10_dur", + "unit_hifigan_HK_layer12.km2500_frame_TAT-TTS", + ] + return {i: f"{base_url}/{i}.tar.gz" for i in model_ids} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + config="config.json", + fp16: bool = False, + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + config_yaml=config, + fp16=fp16, + is_vocoder=True, + **kwargs, + ) + + with open(f"{x['args']['data']}/{config}") as f: + vocoder_cfg = json.load(f) + assert len(x["args"]["model_path"]) == 1, "Too many vocoder models in the input" + + vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg) + return VocoderHubInterface(vocoder_cfg, vocoder) + + +def get_vocoder(args, data_cfg: S2TDataConfig): + if args.vocoder == "griffin_lim": + return GriffinLimVocoder.from_data_cfg(args, data_cfg) + elif args.vocoder == "hifigan": + return HiFiGANVocoder.from_data_cfg(args, data_cfg) + elif args.vocoder == "code_hifigan": + return CodeHiFiGANVocoder.from_data_cfg(args, data_cfg) + else: + raise ValueError("Unknown vocoder") diff --git a/fairseq/models/transformer.py b/fairseq/models/transformer.py deleted file mode 100644 index f87fa50d29..0000000000 --- a/fairseq/models/transformer.py +++ /dev/null @@ -1,1025 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
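For illustration only (not part of this commit): a usage sketch for the CodeHiFiGAN vocoder path added in vocoder.py above. CodeHiFiGANVocoder.from_pretrained() returns the VocoderHubInterface from hub_interface.py; the checkpoint id below is taken from hub_models(), the unit sequence is invented, and network access is assumed.

    from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder

    hub = CodeHiFiGANVocoder.from_pretrained(
        "unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur"
    )
    units = "20 31 31 7 99 42 42 11"              # space-separated discrete units
    sample = hub.get_model_input(units)           # {"code": LongTensor of shape (1, T)}
    wav, sr = hub.get_prediction(sample, dur_prediction=True)  # sr == 16000

hub.predict(units) wraps the same two calls in one step.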
- -import math -from typing import Any, Dict, List, Optional, Tuple - -import torch -import torch.nn as nn -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) -from fairseq.models.fairseq_encoder import EncoderOut -from fairseq.modules import ( - AdaptiveSoftmax, - FairseqDropout, - LayerDropModuleList, - LayerNorm, - PositionalEmbedding, - SinusoidalPositionalEmbedding, - TransformerDecoderLayer, - TransformerEncoderLayer, -) -from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ -from torch import Tensor - - -DEFAULT_MAX_SOURCE_POSITIONS = 1024 -DEFAULT_MAX_TARGET_POSITIONS = 1024 - - -@register_model("transformer") -class TransformerModel(FairseqEncoderDecoderModel): - """ - Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) - <https://arxiv.org/abs/1706.03762>`_. - - Args: - encoder (TransformerEncoder): the encoder - decoder (TransformerDecoder): the decoder - - The Transformer model provides the following named architectures and - command-line arguments: - - .. argparse:: - :ref: fairseq.models.transformer_parser - :prog: - """ - - @classmethod - def hub_models(cls): - # fmt: off - - def moses_subword(path): - return { - 'path': path, - 'tokenizer': 'moses', - 'bpe': 'subword_nmt', - } - - def moses_fastbpe(path): - return { - 'path': path, - 'tokenizer': 'moses', - 'bpe': 'fastbpe', - } - - return { - 'transformer.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2'), - 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', - 'transformer.wmt18.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz'), - 'transformer.wmt19.en-de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz'), - 'transformer.wmt19.en-ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz'), - 'transformer.wmt19.de-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz'), - 'transformer.wmt19.ru-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz'), - 'transformer.wmt19.en-de.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz'), - 'transformer.wmt19.en-ru.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz'), - 'transformer.wmt19.de-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz'), - 'transformer.wmt19.ru-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz'), - } - # fmt: on - - def __init__(self, args, encoder, decoder): - super().__init__(encoder, decoder) - self.args = args - self.supports_align_args = True - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--activation-fn', - choices=utils.get_available_activation_fns(), - help='activation function to use') - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--attention-dropout', type=float, metavar='D', - help='dropout probability for 
attention weights') - parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', - help='dropout probability after activation in FFN.') - parser.add_argument('--encoder-embed-path', type=str, metavar='STR', - help='path to pre-trained encoder embedding') - parser.add_argument('--encoder-embed-dim', type=int, metavar='N', - help='encoder embedding dimension') - parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', - help='encoder embedding dimension for FFN') - parser.add_argument('--encoder-layers', type=int, metavar='N', - help='num encoder layers') - parser.add_argument('--encoder-attention-heads', type=int, metavar='N', - help='num encoder attention heads') - parser.add_argument('--encoder-normalize-before', action='store_true', - help='apply layernorm before each encoder block') - parser.add_argument('--encoder-learned-pos', action='store_true', - help='use learned positional embeddings in the encoder') - parser.add_argument('--decoder-embed-path', type=str, metavar='STR', - help='path to pre-trained decoder embedding') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', - help='decoder embedding dimension for FFN') - parser.add_argument('--decoder-layers', type=int, metavar='N', - help='num decoder layers') - parser.add_argument('--decoder-attention-heads', type=int, metavar='N', - help='num decoder attention heads') - parser.add_argument('--decoder-learned-pos', action='store_true', - help='use learned positional embeddings in the decoder') - parser.add_argument('--decoder-normalize-before', action='store_true', - help='apply layernorm before each decoder block') - parser.add_argument('--decoder-output-dim', type=int, metavar='N', - help='decoder output dimension (extra linear layer ' - 'if different from decoder embed dim') - parser.add_argument('--share-decoder-input-output-embed', action='store_true', - help='share decoder input and output embeddings') - parser.add_argument('--share-all-embeddings', action='store_true', - help='share encoder, decoder and output embeddings' - ' (requires shared dictionary and embed dim)') - parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', - help='if set, disables positional embeddings (outside self attention)') - parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', - help='comma separated list of adaptive softmax cutoff points. 
' - 'Must be used with adaptive_loss criterion'), - parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', - help='sets adaptive softmax dropout for the tail projections') - parser.add_argument('--layernorm-embedding', action='store_true', - help='add layernorm to embedding') - parser.add_argument('--no-scale-embedding', action='store_true', - help='if True, dont scale embeddings') - # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) - parser.add_argument('--no-cross-attention', default=False, action='store_true', - help='do not perform cross-attention') - parser.add_argument('--cross-self-attention', default=False, action='store_true', - help='perform cross+self-attention') - # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) - parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, - help='LayerDrop probability for encoder') - parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, - help='LayerDrop probability for decoder') - parser.add_argument('--encoder-layers-to-keep', default=None, - help='which layers to *keep* when pruning as a comma-separated list') - parser.add_argument('--decoder-layers-to-keep', default=None, - help='which layers to *keep* when pruning as a comma-separated list') - # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) - parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0, - help='iterative PQ quantization noise at training time') - parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8, - help='block size of quantization noise at training time') - parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0, - help='scalar quantization noise and scalar quantization at training time') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - if args.encoder_layers_to_keep: - args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) - if args.decoder_layers_to_keep: - args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) - - if getattr(args, "max_source_positions", None) is None: - args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS - if getattr(args, "max_target_positions", None) is None: - args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS - - src_dict, tgt_dict = task.source_dictionary, task.target_dictionary - - if args.share_all_embeddings: - if src_dict != tgt_dict: - raise ValueError("--share-all-embeddings requires a joined dictionary") - if args.encoder_embed_dim != args.decoder_embed_dim: - raise ValueError( - "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" - ) - if args.decoder_embed_path and ( - args.decoder_embed_path != args.encoder_embed_path - ): - raise ValueError( - "--share-all-embeddings not compatible with --decoder-embed-path" - ) - encoder_embed_tokens = cls.build_embedding( - args, src_dict, args.encoder_embed_dim, args.encoder_embed_path - ) - decoder_embed_tokens = encoder_embed_tokens - args.share_decoder_input_output_embed = True - else: - encoder_embed_tokens = cls.build_embedding( - args, src_dict, args.encoder_embed_dim, args.encoder_embed_path - ) - decoder_embed_tokens = cls.build_embedding( - args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path - ) - - encoder = 
cls.build_encoder(args, src_dict, encoder_embed_tokens) - decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) - return cls(args, encoder, decoder) - - @classmethod - def build_embedding(cls, args, dictionary, embed_dim, path=None): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - - emb = Embedding(num_embeddings, embed_dim, padding_idx) - # if provided, load from preloaded dictionaries - if path: - embed_dict = utils.parse_embedding(path) - utils.load_embedding(embed_dict, dictionary, emb) - return emb - - @classmethod - def build_encoder(cls, args, src_dict, embed_tokens): - return TransformerEncoder(args, src_dict, embed_tokens) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoder( - args, - tgt_dict, - embed_tokens, - no_encoder_attn=getattr(args, "no_cross_attention", False), - ) - - # TorchScript doesn't support optional arguments with variable length (**kwargs). - # Current workaround is to add union of all arguments in child classes. - def forward( - self, - src_tokens, - src_lengths, - prev_output_tokens, - return_all_hiddens: bool = True, - features_only: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - """ - Run the forward pass for an encoder-decoder model. - - Copied from the base class, but without ``**kwargs``, - which are not supported by TorchScript. - """ - encoder_out = self.encoder( - src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens - ) - decoder_out = self.decoder( - prev_output_tokens, - encoder_out=encoder_out, - features_only=features_only, - alignment_layer=alignment_layer, - alignment_heads=alignment_heads, - src_lengths=src_lengths, - return_all_hiddens=return_all_hiddens, - ) - return decoder_out - - # Since get_normalized_probs is in the Fairseq Model which is not scriptable, - # I rewrite the get_normalized_probs from Base Class to call the - # helper function in the Base Class. - @torch.jit.export - def get_normalized_probs( - self, - net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], - log_probs: bool, - sample: Optional[Dict[str, Tensor]] = None, - ): - """Get normalized probabilities (or log probs) from a net's output.""" - return self.get_normalized_probs_scriptable(net_output, log_probs, sample) - - -class TransformerEncoder(FairseqEncoder): - """ - Transformer encoder consisting of *args.encoder_layers* layers. Each layer - is a :class:`TransformerEncoderLayer`. 
- - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): encoding dictionary - embed_tokens (torch.nn.Embedding): input embedding - """ - - def __init__(self, args, dictionary, embed_tokens): - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([3])) - - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.encoder_layerdrop = args.encoder_layerdrop - - embed_dim = embed_tokens.embedding_dim - self.padding_idx = embed_tokens.padding_idx - self.max_source_positions = args.max_source_positions - - self.embed_tokens = embed_tokens - - self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) - - self.embed_positions = ( - PositionalEmbedding( - args.max_source_positions, - embed_dim, - self.padding_idx, - learned=args.encoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - if getattr(args, "layernorm_embedding", False): - self.layernorm_embedding = LayerNorm(embed_dim) - else: - self.layernorm_embedding = None - - if not args.adaptive_input and args.quant_noise_pq > 0: - self.quant_noise = apply_quant_noise_( - nn.Linear(embed_dim, embed_dim, bias=False), - args.quant_noise_pq, - args.quant_noise_pq_block_size, - ) - else: - self.quant_noise = None - - if self.encoder_layerdrop > 0.0: - self.layers = LayerDropModuleList(p=self.encoder_layerdrop) - else: - self.layers = nn.ModuleList([]) - self.layers.extend( - [self.build_encoder_layer(args) for i in range(args.encoder_layers)] - ) - self.num_layers = len(self.layers) - - if args.encoder_normalize_before: - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - def build_encoder_layer(self, args): - return TransformerEncoderLayer(args) - - def forward_embedding( - self, src_tokens, token_embedding: Optional[torch.Tensor] = None - ): - # embed tokens and positions - if token_embedding is None: - token_embedding = self.embed_tokens(src_tokens) - x = embed = self.embed_scale * token_embedding - if self.embed_positions is not None: - x = embed + self.embed_positions(src_tokens) - if self.layernorm_embedding is not None: - x = self.layernorm_embedding(x) - x = self.dropout_module(x) - if self.quant_noise is not None: - x = self.quant_noise(x) - return x, embed - - def forward( - self, - src_tokens, - src_lengths, - return_all_hiddens: bool = False, - token_embeddings: Optional[torch.Tensor] = None, - ): - """ - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (torch.LongTensor): lengths of each source sentence of - shape `(batch)` - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - token_embeddings (torch.Tensor, optional): precomputed embeddings - default `None` will recompute embeddings - - Returns: - namedtuple: - - **encoder_out** (Tensor): the last encoder layer's output of - shape `(src_len, batch, embed_dim)` - - **encoder_padding_mask** (ByteTensor): the positions of - padding elements of shape `(batch, src_len)` - - **encoder_embedding** (Tensor): the (scaled) embedding lookup - of shape `(batch, src_len, embed_dim)` - - **encoder_states** (List[Tensor]): all intermediate - hidden states of shape `(src_len, batch, embed_dim)`. - Only populated if *return_all_hiddens* is True. 
- """ - x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # compute padding mask - encoder_padding_mask = src_tokens.eq(self.padding_idx) - - encoder_states = [] if return_all_hiddens else None - - # encoder layers - for layer in self.layers: - x = layer(x, encoder_padding_mask) - if return_all_hiddens: - assert encoder_states is not None - encoder_states.append(x) - - if self.layer_norm is not None: - x = self.layer_norm(x) - - return EncoderOut( - encoder_out=x, # T x B x C - encoder_padding_mask=encoder_padding_mask, # B x T - encoder_embedding=encoder_embedding, # B x T x C - encoder_states=encoder_states, # List[T x B x C] - src_tokens=None, - src_lengths=None, - ) - - @torch.jit.export - def reorder_encoder_out(self, encoder_out: EncoderOut, new_order): - """ - Reorder encoder output according to *new_order*. - - Args: - encoder_out: output from the ``forward()`` method - new_order (LongTensor): desired order - - Returns: - *encoder_out* rearranged according to *new_order* - """ - """ - Since encoder_padding_mask and encoder_embedding are both of type - Optional[Tensor] in EncoderOut, they need to be copied as local - variables for Torchscript Optional refinement - """ - encoder_padding_mask: Optional[Tensor] = encoder_out.encoder_padding_mask - encoder_embedding: Optional[Tensor] = encoder_out.encoder_embedding - - new_encoder_out = ( - encoder_out.encoder_out - if encoder_out.encoder_out is None - else encoder_out.encoder_out.index_select(1, new_order) - ) - new_encoder_padding_mask = ( - encoder_padding_mask - if encoder_padding_mask is None - else encoder_padding_mask.index_select(0, new_order) - ) - new_encoder_embedding = ( - encoder_embedding - if encoder_embedding is None - else encoder_embedding.index_select(0, new_order) - ) - src_tokens = encoder_out.src_tokens - if src_tokens is not None: - src_tokens = src_tokens.index_select(0, new_order) - - src_lengths = encoder_out.src_lengths - if src_lengths is not None: - src_lengths = src_lengths.index_select(0, new_order) - - encoder_states = encoder_out.encoder_states - if encoder_states is not None: - for idx, state in enumerate(encoder_states): - encoder_states[idx] = state.index_select(1, new_order) - - return EncoderOut( - encoder_out=new_encoder_out, # T x B x C - encoder_padding_mask=new_encoder_padding_mask, # B x T - encoder_embedding=new_encoder_embedding, # B x T x C - encoder_states=encoder_states, # List[T x B x C] - src_tokens=src_tokens, # B x T - src_lengths=src_lengths, # B x 1 - ) - - def max_positions(self): - """Maximum input length supported by the encoder.""" - if self.embed_positions is None: - return self.max_source_positions - return min(self.max_source_positions, self.embed_positions.max_positions) - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): - weights_key = "{}.embed_positions.weights".format(name) - if weights_key in state_dict: - print("deleting {0}".format(weights_key)) - del state_dict[weights_key] - state_dict[ - "{}.embed_positions._float_tensor".format(name) - ] = torch.FloatTensor(1) - for i in range(self.num_layers): - # update layer norms - self.layers[i].upgrade_state_dict_named( - state_dict, "{}.layers.{}".format(name, i) - ) - - version_key = "{}.version".format(name) - if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: - # earlier 
checkpoints did not normalize after the stack of layers - self.layer_norm = None - self.normalize = False - state_dict[version_key] = torch.Tensor([1]) - return state_dict - - -class TransformerDecoder(FairseqIncrementalDecoder): - """ - Transformer decoder consisting of *args.decoder_layers* layers. Each layer - is a :class:`TransformerDecoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): decoding dictionary - embed_tokens (torch.nn.Embedding): output embedding - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). - """ - - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - self.args = args - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([3])) - self._future_mask = torch.empty(0) - - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.decoder_layerdrop = args.decoder_layerdrop - self.share_input_output_embed = args.share_decoder_input_output_embed - - input_embed_dim = embed_tokens.embedding_dim - embed_dim = args.decoder_embed_dim - self.embed_dim = embed_dim - self.output_embed_dim = args.decoder_output_dim - - self.padding_idx = embed_tokens.padding_idx - self.max_target_positions = args.max_target_positions - - self.embed_tokens = embed_tokens - - self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) - - if not args.adaptive_input and args.quant_noise_pq > 0: - self.quant_noise = apply_quant_noise_( - nn.Linear(embed_dim, embed_dim, bias=False), - args.quant_noise_pq, - args.quant_noise_pq_block_size, - ) - else: - self.quant_noise = None - - self.project_in_dim = ( - Linear(input_embed_dim, embed_dim, bias=False) - if embed_dim != input_embed_dim - else None - ) - self.embed_positions = ( - PositionalEmbedding( - self.max_target_positions, - embed_dim, - self.padding_idx, - learned=args.decoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - if getattr(args, "layernorm_embedding", False): - self.layernorm_embedding = LayerNorm(embed_dim) - else: - self.layernorm_embedding = None - - self.cross_self_attention = getattr(args, "cross_self_attention", False) - - if self.decoder_layerdrop > 0.0: - self.layers = LayerDropModuleList(p=self.decoder_layerdrop) - else: - self.layers = nn.ModuleList([]) - self.layers.extend( - [ - self.build_decoder_layer(args, no_encoder_attn) - for _ in range(args.decoder_layers) - ] - ) - self.num_layers = len(self.layers) - - if args.decoder_normalize_before and not getattr( - args, "no_decoder_final_norm", False - ): - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - self.project_out_dim = ( - Linear(embed_dim, self.output_embed_dim, bias=False) - if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights - else None - ) - - self.adaptive_softmax = None - self.output_projection = None - if args.adaptive_softmax_cutoff is not None: - self.adaptive_softmax = AdaptiveSoftmax( - len(dictionary), - self.output_embed_dim, - utils.eval_str_list(args.adaptive_softmax_cutoff, type=int), - dropout=args.adaptive_softmax_dropout, - adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, - factor=args.adaptive_softmax_factor, - tie_proj=args.tie_adaptive_proj, - ) - elif self.share_input_output_embed: - self.output_projection = nn.Linear( - self.embed_tokens.weight.shape[1], - self.embed_tokens.weight.shape[0], - bias=False, - ) - 
self.output_projection.weight = self.embed_tokens.weight - else: - self.output_projection = nn.Linear( - self.output_embed_dim, len(dictionary), bias=False - ) - nn.init.normal_( - self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5 - ) - - def build_decoder_layer(self, args, no_encoder_attn=False): - return TransformerDecoderLayer(args, no_encoder_attn) - - def forward( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - features_only: bool = False, - full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - src_lengths: Optional[Any] = None, - return_all_hiddens: bool = False, - ): - """ - Args: - prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (optional): output from the encoder, used for - encoder-side attention - incremental_state (dict): dictionary used for storing state during - :ref:`Incremental decoding` - features_only (bool, optional): only return features without - applying output layer (default: False). - full_context_alignment (bool, optional): don't apply - auto-regressive mask to self-attention (default: False). - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - x, extra = self.extract_features( - prev_output_tokens, - encoder_out=encoder_out, - incremental_state=incremental_state, - full_context_alignment=full_context_alignment, - alignment_layer=alignment_layer, - alignment_heads=alignment_heads, - ) - if not features_only: - x = self.output_layer(x) - return x, extra - - def extract_features( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - return self.extract_features_scriptable( - prev_output_tokens, - encoder_out, - incremental_state, - full_context_alignment, - alignment_layer, - alignment_heads, - ) - - """ - A scriptable subclass of this class has an extract_features method and calls - super().extract_features, but super() is not supported in torchscript. Aa copy of - this function is made to be used in the subclass instead. - """ - - def extract_features_scriptable( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - """ - Similar to *forward* but only return features. - - Includes several features from "Jointly Learning to Align and - Translate with Transformer Models" (Garg et al., EMNLP 2019). - - Args: - full_context_alignment (bool, optional): don't apply - auto-regressive mask to self-attention (default: False). - alignment_layer (int, optional): return mean alignment over - heads at this layer (default: last layer). - alignment_heads (int, optional): only average alignment over - this many heads (default: all heads). 
- - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - if alignment_layer is None: - alignment_layer = self.num_layers - 1 - - # embed positions - positions = ( - self.embed_positions( - prev_output_tokens, incremental_state=incremental_state - ) - if self.embed_positions is not None - else None - ) - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - if positions is not None: - positions = positions[:, -1:] - - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - - if self.quant_noise is not None: - x = self.quant_noise(x) - - if self.project_in_dim is not None: - x = self.project_in_dim(x) - - if positions is not None: - x += positions - - if self.layernorm_embedding is not None: - x = self.layernorm_embedding(x) - - x = self.dropout_module(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - self_attn_padding_mask: Optional[Tensor] = None - if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): - self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) - - # decoder layers - attn: Optional[Tensor] = None - inner_states: List[Optional[Tensor]] = [x] - for idx, layer in enumerate(self.layers): - if incremental_state is None and not full_context_alignment: - self_attn_mask = self.buffered_future_mask(x) - else: - self_attn_mask = None - - x, layer_attn, _ = layer( - x, - encoder_out.encoder_out if encoder_out is not None else None, - encoder_out.encoder_padding_mask if encoder_out is not None else None, - incremental_state, - self_attn_mask=self_attn_mask, - self_attn_padding_mask=self_attn_padding_mask, - need_attn=bool((idx == alignment_layer)), - need_head_weights=bool((idx == alignment_layer)), - ) - inner_states.append(x) - if layer_attn is not None and idx == alignment_layer: - attn = layer_attn.float().to(x) - - if attn is not None: - if alignment_heads is not None: - attn = attn[:alignment_heads] - - # average probabilities over heads - attn = attn.mean(dim=0) - - if self.layer_norm is not None: - x = self.layer_norm(x) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - if self.project_out_dim is not None: - x = self.project_out_dim(x) - - return x, {"attn": [attn], "inner_states": inner_states} - - def output_layer(self, features): - """Project features to the vocabulary size.""" - if self.adaptive_softmax is None: - # project back to size of vocabulary - return self.output_projection(features) - else: - return features - - def max_positions(self): - """Maximum output length supported by the decoder.""" - if self.embed_positions is None: - return self.max_target_positions - return min(self.max_target_positions, self.embed_positions.max_positions) - - def buffered_future_mask(self, tensor): - dim = tensor.size(0) - # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. 
- if ( - self._future_mask.size(0) == 0 - or (not self._future_mask.device == tensor.device) - or self._future_mask.size(0) < dim - ): - self._future_mask = torch.triu( - utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 - ) - self._future_mask = self._future_mask.to(tensor) - return self._future_mask[:dim, :dim] - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): - weights_key = "{}.embed_positions.weights".format(name) - if weights_key in state_dict: - del state_dict[weights_key] - state_dict[ - "{}.embed_positions._float_tensor".format(name) - ] = torch.FloatTensor(1) - - if f"{name}.output_projection.weight" not in state_dict: - if self.share_input_output_embed: - embed_out_key = f"{name}.embed_tokens.weight" - else: - embed_out_key = f"{name}.embed_out" - if embed_out_key in state_dict: - state_dict[f"{name}.output_projection.weight"] = state_dict[ - embed_out_key - ] - if not self.share_input_output_embed: - del state_dict[embed_out_key] - - for i in range(self.num_layers): - # update layer norms - layer_norm_map = { - "0": "self_attn_layer_norm", - "1": "encoder_attn_layer_norm", - "2": "final_layer_norm", - } - for old, new in layer_norm_map.items(): - for m in ("weight", "bias"): - k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) - if k in state_dict: - state_dict[ - "{}.layers.{}.{}.{}".format(name, i, new, m) - ] = state_dict[k] - del state_dict[k] - - version_key = "{}.version".format(name) - if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: - # earlier checkpoints did not normalize after the stack of layers - self.layer_norm = None - self.normalize = False - state_dict[version_key] = torch.Tensor([1]) - - return state_dict - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m - - -@register_model_architecture("transformer", "transformer") -def base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = 
getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.no_cross_attention = getattr(args, "no_cross_attention", False) - args.cross_self_attention = getattr(args, "cross_self_attention", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - args.layernorm_embedding = getattr(args, "layernorm_embedding", False) - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) - - args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) - args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0) - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) - args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) - args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8) - args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) - - -@register_model_architecture("transformer", "transformer_iwslt_de_en") -def transformer_iwslt_de_en(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) - args.decoder_layers = getattr(args, "decoder_layers", 6) - base_architecture(args) - - -@register_model_architecture("transformer", "transformer_wmt_en_de") -def transformer_wmt_en_de(args): - base_architecture(args) - - -# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) -@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big") -def transformer_vaswani_wmt_en_de_big(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.3) - base_architecture(args) - - -@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big") -def transformer_vaswani_wmt_en_fr_big(args): - args.dropout = getattr(args, "dropout", 0.1) - 
transformer_vaswani_wmt_en_de_big(args) - - -@register_model_architecture("transformer", "transformer_wmt_en_de_big") -def transformer_wmt_en_de_big(args): - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - transformer_vaswani_wmt_en_de_big(args) - - -# default parameters used in tensor2tensor implementation -@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t") -def transformer_wmt_en_de_big_t2t(args): - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.1) - transformer_vaswani_wmt_en_de_big(args) diff --git a/fairseq/models/transformer/__init__.py b/fairseq/models/transformer/__init__.py new file mode 100644 index 0000000000..681fca3d45 --- /dev/null +++ b/fairseq/models/transformer/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""isort:skip_file""" + +from .transformer_config import ( + TransformerConfig, + DEFAULT_MAX_SOURCE_POSITIONS, + DEFAULT_MAX_TARGET_POSITIONS, + DEFAULT_MIN_PARAMS_TO_WRAP, +) +from .transformer_decoder import TransformerDecoder, TransformerDecoderBase, Linear +from .transformer_encoder import TransformerEncoder, TransformerEncoderBase +from .transformer_legacy import ( + TransformerModel, + base_architecture, + tiny_architecture, + transformer_iwslt_de_en, + transformer_wmt_en_de, + transformer_vaswani_wmt_en_de_big, + transformer_vaswani_wmt_en_fr_big, + transformer_wmt_en_de_big, + transformer_wmt_en_de_big_t2t, +) +from .transformer_base import TransformerModelBase, Embedding + + +__all__ = [ + "TransformerModelBase", + "TransformerConfig", + "TransformerDecoder", + "TransformerDecoderBase", + "TransformerEncoder", + "TransformerEncoderBase", + "TransformerModel", + "Embedding", + "Linear", + "base_architecture", + "tiny_architecture", + "transformer_iwslt_de_en", + "transformer_wmt_en_de", + "transformer_vaswani_wmt_en_de_big", + "transformer_vaswani_wmt_en_fr_big", + "transformer_wmt_en_de_big", + "transformer_wmt_en_de_big_t2t", + "DEFAULT_MAX_SOURCE_POSITIONS", + "DEFAULT_MAX_TARGET_POSITIONS", + "DEFAULT_MIN_PARAMS_TO_WRAP", +] diff --git a/fairseq/models/transformer/transformer_base.py b/fairseq/models/transformer/transformer_base.py new file mode 100644 index 0000000000..f9f097f04b --- /dev/null +++ b/fairseq/models/transformer/transformer_base.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +import logging + +from fairseq import utils +from fairseq.dataclass.utils import gen_parser_from_dataclass +from fairseq.distributed import fsdp_wrap +from fairseq.models import FairseqEncoderDecoderModel +from fairseq.models.transformer import ( + TransformerConfig, + TransformerDecoderBase, + TransformerEncoderBase, +) + + +logger = logging.getLogger(__name__) + + +class TransformerModelBase(FairseqEncoderDecoderModel): + """ + Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) + <https://arxiv.org/abs/1706.03762>`_. 
+ + Args: + encoder (TransformerEncoder): the encoder + decoder (TransformerDecoder): the decoder + + The Transformer model provides the following named architectures and + command-line arguments: + + .. argparse:: + :ref: fairseq.models.transformer_parser + :prog: + """ + + def __init__(self, cfg, encoder, decoder): + super().__init__(encoder, decoder) + self.cfg = cfg + self.supports_align_args = True + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + # we want to build the args recursively in this case. + gen_parser_from_dataclass( + parser, TransformerConfig(), delete_default=False, with_prefix="" + ) + + @classmethod + def build_model(cls, cfg, task): + """Build a new model instance.""" + + # -- TODO T96535332 + # bug caused by interaction between OmegaConf II and argparsing + cfg.decoder.input_dim = int(cfg.decoder.input_dim) + cfg.decoder.output_dim = int(cfg.decoder.output_dim) + # -- + + if cfg.encoder.layers_to_keep: + cfg.encoder.layers = len(cfg.encoder.layers_to_keep.split(",")) + if cfg.decoder.layers_to_keep: + cfg.decoder.layers = len(cfg.decoder.layers_to_keep.split(",")) + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + if cfg.share_all_embeddings: + if src_dict != tgt_dict: + raise ValueError("--share-all-embeddings requires a joined dictionary") + if cfg.encoder.embed_dim != cfg.decoder.embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if cfg.decoder.embed_path and ( + cfg.decoder.embed_path != cfg.encoder.embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + encoder_embed_tokens = cls.build_embedding( + cfg, src_dict, cfg.encoder.embed_dim, cfg.encoder.embed_path + ) + decoder_embed_tokens = encoder_embed_tokens + cfg.share_decoder_input_output_embed = True + elif cfg.merge_src_tgt_embed: + logger.info(f"source dict size: {len(src_dict)}") + logger.info(f"target dict size: {len(tgt_dict)}") + src_dict.update(tgt_dict) + task.src_dict = src_dict + task.tgt_dict = src_dict + logger.info(f"merged dict size: {len(src_dict)}") + encoder_embed_tokens = cls.build_embedding( + cfg, src_dict, cfg.encoder.embed_dim + ) + decoder_embed_tokens = encoder_embed_tokens + cfg.share_decoder_input_output_embed = True + else: + encoder_embed_tokens = cls.build_embedding( + cfg, src_dict, cfg.encoder.embed_dim, cfg.encoder.embed_path + ) + decoder_embed_tokens = cls.build_embedding( + cfg, tgt_dict, cfg.decoder.embed_dim, cfg.decoder.embed_path + ) + if cfg.offload_activations: + cfg.checkpoint_activations = True # offloading implies checkpointing + encoder = cls.build_encoder(cfg, src_dict, encoder_embed_tokens) + decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens) + return cls(cfg, encoder, decoder) + + @classmethod + def build_embedding(cls, cfg, dictionary, embed_dim, path=None): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + + emb = Embedding(num_embeddings, embed_dim, padding_idx) + # if provided, load from preloaded dictionaries + if path: + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + return emb + + @classmethod + def build_encoder(cls, cfg, src_dict, embed_tokens): + return TransformerEncoderBase(cfg, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, cfg, tgt_dict, embed_tokens): + return TransformerDecoderBase( + cfg, + tgt_dict, + embed_tokens, + 
no_encoder_attn=cfg.no_cross_attention, + ) + + # TorchScript doesn't support optional arguments with variable length (**kwargs). + # Current workaround is to add union of all arguments in child classes. + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + return_all_hiddens: bool = True, + features_only: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + Run the forward pass for an encoder-decoder model. + + Copied from the base class, but without ``**kwargs``, + which are not supported by TorchScript. + """ + encoder_out = self.encoder( + src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens + ) + decoder_out = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + features_only=features_only, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + src_lengths=src_lengths, + return_all_hiddens=return_all_hiddens, + ) + return decoder_out + + # Since get_normalized_probs is in the Fairseq Model which is not scriptable, + # I rewrite the get_normalized_probs from Base Class to call the + # helper function in the Base Class. + @torch.jit.export + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" + return self.get_normalized_probs_scriptable(net_output, log_probs, sample) + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m diff --git a/fairseq/models/transformer/transformer_config.py b/fairseq/models/transformer/transformer_config.py new file mode 100644 index 0000000000..4650de2e17 --- /dev/null +++ b/fairseq/models/transformer/transformer_config.py @@ -0,0 +1,341 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
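A minimal sketch (assuming an installed fairseq and a toy dictionary; nothing here is part of the diff) of the Embedding helper and TransformerModelBase.build_embedding defined in transformer_base.py above: the table is initialized with std embed_dim ** -0.5 and the padding row is zeroed.

import torch

from fairseq.data import Dictionary
from fairseq.models.transformer import TransformerModelBase

# Toy dictionary purely for illustration.
tgt_dict = Dictionary()
for sym in ["hello", "world"]:
    tgt_dict.add_symbol(sym)

# cfg is not consulted by build_embedding, so None is enough for this illustration.
emb = TransformerModelBase.build_embedding(None, tgt_dict, embed_dim=8)
assert emb.padding_idx == tgt_dict.pad()
assert torch.all(emb.weight[tgt_dict.pad()] == 0)  # padding row is zero-initialized

build_model reuses this helper for the encoder and decoder tables, collapsing them into one shared table when share_all_embeddings or merge_src_tgt_embed is set.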
+ + +import re +from dataclasses import dataclass, field, fields +from typing import List, Optional + +from omegaconf import II + +from fairseq import utils +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.utils import safe_getattr, safe_hasattr + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 + +DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8) + +_NAME_PARSER = r"(decoder|encoder|quant_noise)_(.*)" + + +@dataclass +class EncDecBaseConfig(FairseqDataclass): + embed_path: Optional[str] = field( + default=None, metadata={"help": "path to pre-trained embedding"} + ) + embed_dim: Optional[int] = field( + default=512, metadata={"help": "embedding dimension"} + ) + ffn_embed_dim: int = field( + default=2048, metadata={"help": "embedding dimension for FFN"} + ) + layers: int = field(default=6, metadata={"help": "number of layers"}) + attention_heads: int = field( + default=8, metadata={"help": "number of attention heads"} + ) + normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each block"} + ) + learned_pos: bool = field( + default=False, metadata={"help": "use learned positional embeddings"} + ) + # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) + layerdrop: float = field(default=0, metadata={"help": "LayerDrop probability"}) + layers_to_keep: Optional[List[int]] = field( + default=None, metadata={"help": "which layers to *keep* when pruning"} + ) + + xformers_att_config: Optional[str] = field( + default=None, + metadata={ + "help": "config for xFormers attention, defined in xformers.components.attention.AttentionConfig" + }, + ) + + +@dataclass +class DecoderConfig(EncDecBaseConfig): + input_dim: int = II("model.decoder.embed_dim") + output_dim: int = field( + default=II("model.decoder.embed_dim"), + metadata={ + "help": "decoder output dimension (extra linear layer if different from decoder embed dim)" + }, + ) + + def __post_init__(self): + # II doesn't work if we are just creating the object outside of hydra so fix that + if self.input_dim == II("model.decoder.embed_dim"): + self.input_dim = self.embed_dim + if self.output_dim == II("model.decoder.embed_dim"): + self.output_dim = self.embed_dim + + +@dataclass +class QuantNoiseConfig(FairseqDataclass): + pq: float = field( + default=0.0, + metadata={"help": "iterative PQ quantization noise at training time"}, + ) + pq_block_size: int = field( + default=8, + metadata={"help": "block size of quantization noise at training time"}, + ) + scalar: float = field( + default=0.0, + metadata={ + "help": "scalar quantization noise and scalar quantization at training time" + }, + ) + + +@dataclass +class TransformerConfig(FairseqDataclass): + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="relu", + metadata={"help": "activation function to use"}, + ) + dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) + attention_dropout: float = field( + default=0.0, metadata={"help": "dropout probability for attention weights"} + ) + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN.", + "alias": "--relu-dropout", + }, + ) + adaptive_input: bool = False + encoder: EncDecBaseConfig = EncDecBaseConfig() + # TODO should really be in the encoder config + max_source_positions: int = field( + default=DEFAULT_MAX_SOURCE_POSITIONS, + metadata={"help": "Maximum input length supported by the encoder"}, + ) + decoder: 
DecoderConfig = DecoderConfig() + # TODO should really be in the decoder config + max_target_positions: int = field( + default=DEFAULT_MAX_TARGET_POSITIONS, + metadata={"help": "Maximum output length supported by the decoder"}, + ) + share_decoder_input_output_embed: bool = field( + default=False, metadata={"help": "share decoder input and output embeddings"} + ) + share_all_embeddings: bool = field( + default=False, + metadata={ + "help": "share encoder, decoder and output embeddings (requires shared dictionary and embed dim)" + }, + ) + merge_src_tgt_embed: bool = field( + default=False, + metadata={ + "help": "if true then the source and target embedding table is " + "merged into one table. This is going to make the model smaller but " + "it might hurt performance." + }, + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if True, disables positional embeddings (outside self attention)" + }, + ) + adaptive_softmax_cutoff: Optional[List[int]] = field( + default=None, + metadata={ + "help": "list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion" + }, + ) + adaptive_softmax_dropout: float = field( + default=0.0, + metadata={"help": "sets adaptive softmax dropout for the tail projections"}, + ) + adaptive_softmax_factor: float = field( + default=4, metadata={"help": "adaptive input factor"} + ) + layernorm_embedding: bool = field( + default=False, metadata={"help": "add layernorm to embedding"} + ) + tie_adaptive_weights: bool = field( + default=False, + metadata={ + "help": "if set, ties the weights of adaptive softmax and adaptive input" + }, + ) + tie_adaptive_proj: bool = field( + default=False, + metadata={ + "help": "if set, ties the projection weights of adaptive softmax and adaptive input" + }, + ) + no_scale_embedding: bool = field( + default=False, metadata={"help": "if True, dont scale embeddings"} + ) + checkpoint_activations: bool = field( + default=False, + metadata={ + "help": "checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute" + }, + ) + offload_activations: bool = field( + default=False, + metadata={ + "help": "checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations." + }, + ) + # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) + no_cross_attention: bool = field( + default=False, metadata={"help": "do not perform cross-attention"} + ) + cross_self_attention: bool = field( + default=False, metadata={"help": "perform cross+self-attention"} + ) + # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) + quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig()) + min_params_to_wrap: int = field( + default=DEFAULT_MIN_PARAMS_TO_WRAP, + metadata={ + "help": "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed." 
+ }, + ) + # DEPRECATED field, but some old checkpoints might have it + char_inputs: bool = field( + default=False, metadata={"help": "if set, model takes character ids as input"} + ) + relu_dropout: float = 0.0 + # config for "BASE Layers: Simplifying Training of Large, Sparse Models" + base_layers: Optional[int] = field( + default=0, metadata={"help": "number of BASE layers in total"} + ) + base_sublayers: Optional[int] = field( + default=1, metadata={"help": "number of sublayers in each BASE layer"} + ) + base_shuffle: Optional[int] = field( + default=1, + metadata={"help": "shuffle tokens between workers before computing assignment"}, + ) + + export: bool = field( + default=False, + metadata={"help": "make the layernorm exportable with torchscript."}, + ) + + # copied from transformer_lm but expected in transformer_decoder: + no_decoder_final_norm: bool = field( + default=False, + metadata={"help": "don't add an extra layernorm after the last decoder block"}, + ) + + # We need to make this hierarchical dataclass like the flat namespace + # __getattr__ and __setattr__ here allow backward compatibility + # for subclasses of Transformer(Legacy) that depend on read/write on + # the flat namespace. + + def __getattr__(self, name): + match = re.match(_NAME_PARSER, name) + if match: + sub = safe_getattr(self, match[1]) + return safe_getattr(sub, match[2]) + raise AttributeError(f"invalid argument {name}.") + + def __setattr__(self, name, value): + match = re.match(_NAME_PARSER, name) + if match: + sub = safe_getattr(self, match[1]) + setattr(sub, match[2], value) + else: + super().__setattr__(name, value) + + @staticmethod + def _copy_keys(args, cls, prefix, seen): + """ + copy the prefixed keys (decoder_embed_dim) to the DC fields: decoder.embed_dim + """ + cfg = cls() + for fld in fields(cls): + # for all the fields in the DC, find the fields (e.g. embed_dim) + # in the namespace with the prefix (e.g. decoder) + # and set it on the dc. + args_key = f"{prefix}_{fld.name}" + if safe_hasattr(args, args_key): + seen.add(args_key) + setattr(cfg, fld.name, safe_getattr(args, args_key)) + if safe_hasattr(args, fld.name): + seen.add(fld.name) + setattr(cfg, fld.name, safe_getattr(args, fld.name)) + return cfg + + @classmethod + def from_namespace(cls, args): + if args is None: + return None + if not isinstance(args, cls): + seen = set() + config = cls() + # currently, we can go generically from DC fields to args hierarchically + # but we can't easily deconstruct a flat namespace to a hierarchical + # DC. Mostly because we could have a sub-dc called `decoder-foo` that should not + # go to the sub struct called `decoder`. There are ways to go around this, but let's keep it simple + # for now. 
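+ # For example: a flat argparse namespace carrying decoder_embed_dim=512 and dropout=0.3
+ # is routed by the loop below so that config.decoder.embed_dim becomes 512 (via
+ # _copy_keys over the "decoder" prefix), while plain fields such as dropout are
+ # copied onto the top-level config directly.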
+ for fld in fields(cls): + # concretelly, the transformer_config know what sub-dc it has, so we go through all the dc fields + # and if it's one that has a sub-dc, we build that sub-dc with `copy_keys()` + if fld.name == "decoder": + if safe_hasattr(args, "decoder"): + # in some cases, the args we receive is already structured (as DictConfigs), so let's just build the correct DC + seen.add("decoder") + config.decoder = DecoderConfig(**args.decoder) + else: + config.decoder = cls._copy_keys( + args, DecoderConfig, "decoder", seen + ) + elif fld.name == "encoder": + # same but for encoder + if safe_hasattr(args, "encoder"): + seen.add("encoder") + config.encoder = EncDecBaseConfig(**args.encoder) + else: + config.encoder = cls._copy_keys( + args, EncDecBaseConfig, "encoder", seen + ) + elif fld.name == "quant_noise": + # same but for quant_noise + if safe_hasattr(args, "quant_noise"): + seen.add("quant_noise") + config.quant_noise = QuantNoiseConfig(**args.quant_noise) + else: + config.quant_noise = cls._copy_keys( + args, QuantNoiseConfig, "quant_noise", seen + ) + elif safe_hasattr(args, fld.name): + # if it's not a structure field, it's just a normal field, copy it over + seen.add(fld.name) + setattr(config, fld.name, safe_getattr(args, fld.name)) + # we got all the fields defined in the dataclass, but + # the argparse namespace might have extra args for two reasons: + # - we are in a legacy class so all the args are not declared in the dataclass. Ideally once everyone has defined a dataclass for their model, we won't need this + # - some places expect args to be there but never define them + args_dict = ( + args._asdict() + if safe_hasattr(args, "_asdict") + else vars(args) + if safe_hasattr(args, "__dict__") + else {} + ) # namedtupled doesn't have __dict__ :-/ + for key, value in args_dict.items(): + if key not in seen: + setattr(config, key, value) + return config + else: + return args diff --git a/fairseq/models/transformer/transformer_decoder.py b/fairseq/models/transformer/transformer_decoder.py new file mode 100644 index 0000000000..744c73f4f8 --- /dev/null +++ b/fairseq/models/transformer/transformer_decoder.py @@ -0,0 +1,474 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import utils +from fairseq.distributed import fsdp_wrap +from fairseq.models import FairseqIncrementalDecoder +from fairseq.models.transformer import TransformerConfig +from fairseq.modules import ( + AdaptiveSoftmax, + BaseLayer, + FairseqDropout, + LayerDropModuleList, + LayerNorm, + PositionalEmbedding, + SinusoidalPositionalEmbedding, + transformer_layer, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ + + +# rewrite name for backward compatibility in `make_generation_fast_` +def module_name_fordropout(module_name: str) -> str: + if module_name == "TransformerDecoderBase": + return "TransformerDecoder" + else: + return module_name + + +class TransformerDecoderBase(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *cfg.decoder.layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. 
+ + Args: + cfg (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, + cfg, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + self.cfg = cfg + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + self._future_mask = torch.empty(0) + + self.dropout_module = FairseqDropout( + cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__) + ) + self.decoder_layerdrop = cfg.decoder.layerdrop + self.share_input_output_embed = cfg.share_decoder_input_output_embed + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = cfg.decoder.embed_dim + self.embed_dim = embed_dim + self.output_embed_dim = cfg.decoder.output_dim + + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = cfg.max_target_positions + + self.embed_tokens = embed_tokens + + self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim) + + if not cfg.adaptive_input and cfg.quant_noise.pq > 0: + self.quant_noise = apply_quant_noise_( + nn.Linear(embed_dim, embed_dim, bias=False), + cfg.quant_noise.pq, + cfg.quant_noise.pq_block_size, + ) + else: + self.quant_noise = None + + self.project_in_dim = ( + Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + self.embed_positions = ( + PositionalEmbedding( + self.max_target_positions, + embed_dim, + self.padding_idx, + learned=cfg.decoder.learned_pos, + ) + if not cfg.no_token_positional_embeddings + else None + ) + if cfg.layernorm_embedding: + self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) + else: + self.layernorm_embedding = None + + self.cross_self_attention = cfg.cross_self_attention + + if self.decoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.decoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + self.build_decoder_layer(cfg, no_encoder_attn) + for _ in range(cfg.decoder.layers) + ] + ) + self.num_layers = len(self.layers) + + if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm: + self.layer_norm = LayerNorm(embed_dim, export=cfg.export) + else: + self.layer_norm = None + + self.project_out_dim = ( + Linear(embed_dim, self.output_embed_dim, bias=False) + if embed_dim != self.output_embed_dim and not cfg.tie_adaptive_weights + else None + ) + + self.adaptive_softmax = None + self.output_projection = output_projection + if self.output_projection is None: + self.build_output_projection(cfg, dictionary, embed_tokens) + + def build_output_projection(self, cfg, dictionary, embed_tokens): + if cfg.adaptive_softmax_cutoff is not None: + self.adaptive_softmax = AdaptiveSoftmax( + len(dictionary), + self.output_embed_dim, + utils.eval_str_list(cfg.adaptive_softmax_cutoff, type=int), + dropout=cfg.adaptive_softmax_dropout, + adaptive_inputs=embed_tokens if cfg.tie_adaptive_weights else None, + factor=cfg.adaptive_softmax_factor, + tie_proj=cfg.tie_adaptive_proj, + ) + elif self.share_input_output_embed: + self.output_projection = nn.Linear( + self.embed_tokens.weight.shape[1], + self.embed_tokens.weight.shape[0], + bias=False, + ) + self.output_projection.weight = self.embed_tokens.weight + else: + self.output_projection = nn.Linear( + self.output_embed_dim, len(dictionary), bias=False + ) + nn.init.normal_( + 
self.output_projection.weight, mean=0, std=self.output_embed_dim**-0.5 + ) + num_base_layers = cfg.base_layers + for i in range(num_base_layers): + self.layers.insert( + ((i + 1) * cfg.decoder.layers) // (num_base_layers + 1), + BaseLayer(cfg), + ) + + def build_decoder_layer(self, cfg, no_encoder_attn=False): + layer = transformer_layer.TransformerDecoderLayerBase(cfg, no_encoder_attn) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention, should be of size T x B x C + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + return self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + + """ + A scriptable subclass of this class has an extract_features method and calls + super().extract_features, but super() is not supported in torchscript. A copy of + this function is made to be used in the subclass instead. + """ + + def extract_features_scriptable( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + Similar to *forward* but only return features. + + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). 
+ + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + bs, slen = prev_output_tokens.size() + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + enc: Optional[Tensor] = None + padding_mask: Optional[Tensor] = None + if encoder_out is not None and len(encoder_out["encoder_out"]) > 0: + enc = encoder_out["encoder_out"][0] + if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0: + padding_mask = encoder_out["encoder_padding_mask"][0] + + # embed positions + positions = None + if self.embed_positions is not None: + positions = self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # Prevent torchscript exporting issue for dynamic quant embedding + prev_output_tokens = prev_output_tokens.contiguous() + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + self_attn_padding_mask: Optional[Tensor] = None + if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + + # decoder layers + attn: Optional[Tensor] = None + inner_states: List[Optional[Tensor]] = [x] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, _ = layer( + x, + enc, + padding_mask, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": [attn], "inner_states": inner_states} + + def output_layer(self, features): + """Project features to the vocabulary size.""" + if self.adaptive_softmax is None: + # project back to size of vocabulary + return self.output_projection(features) + else: + return features + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions) + + def buffered_future_mask(self, 
tensor): + dim = tensor.size(0) + # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. + if ( + self._future_mask.size(0) == 0 + or (not self._future_mask.device == tensor.device) + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 + ) + self._future_mask = self._future_mask.to(tensor) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + if f"{name}.output_projection.weight" not in state_dict: + if self.share_input_output_embed: + embed_out_key = f"{name}.embed_tokens.weight" + else: + embed_out_key = f"{name}.embed_out" + if embed_out_key in state_dict: + state_dict[f"{name}.output_projection.weight"] = state_dict[ + embed_out_key + ] + if not self.share_input_output_embed: + del state_dict[embed_out_key] + + for i in range(self.num_layers): + # update layer norms + layer_norm_map = { + "0": "self_attn_layer_norm", + "1": "encoder_attn_layer_norm", + "2": "final_layer_norm", + } + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) + if k in state_dict: + state_dict[ + "{}.layers.{}.{}.{}".format(name, i, new, m) + ] = state_dict[k] + del state_dict[k] + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + + return state_dict + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m + + +class TransformerDecoder(TransformerDecoderBase): + def __init__( + self, + args, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=None, + ): + self.args = args + super().__init__( + TransformerConfig.from_namespace(args), + dictionary, + embed_tokens, + no_encoder_attn=no_encoder_attn, + output_projection=output_projection, + ) + + def build_output_projection(self, args, dictionary, embed_tokens): + super().build_output_projection( + TransformerConfig.from_namespace(args), dictionary, embed_tokens + ) + + def build_decoder_layer(self, args, no_encoder_attn=False): + return super().build_decoder_layer( + TransformerConfig.from_namespace(args), no_encoder_attn=no_encoder_attn + ) diff --git a/fairseq/models/transformer/transformer_decoder_aug.py b/fairseq/models/transformer/transformer_decoder_aug.py new file mode 100644 index 0000000000..b73c06e02a --- /dev/null +++ b/fairseq/models/transformer/transformer_decoder_aug.py @@ -0,0 +1,384 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
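The legacy `TransformerDecoder` wrapper above hands its argparse-style namespace to `TransformerConfig.from_namespace`, which regroups prefixed attributes such as `decoder_embed_dim` under the nested `decoder`/`encoder`/`quant_noise` sub-configs and copies any leftover attributes onto the config verbatim. A minimal sketch of that conversion, using only the structured config class from this diff; the attribute values and the `my_legacy_flag` field are invented for illustration:

from argparse import Namespace

from fairseq.models.transformer import TransformerConfig

args = Namespace(
    encoder_embed_dim=256,
    decoder_embed_dim=256,
    decoder_layers=4,
    dropout=0.2,
    my_legacy_flag=True,  # not declared in the dataclass; copied over as-is
)
cfg = TransformerConfig.from_namespace(args)

assert cfg.encoder.embed_dim == 256  # encoder_* keys land in cfg.encoder
assert cfg.decoder.embed_dim == 256  # decoder_* keys land in cfg.decoder
assert cfg.decoder.layers == 4
assert cfg.dropout == 0.2            # flat dataclass fields are copied directly
assert cfg.my_legacy_flag is True    # unknown args survive for legacy code paths

If the object passed in is already a `TransformerConfig`, `from_namespace` returns it unchanged, which is why the legacy wrappers can call it unconditionally.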
+ +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import utils +from fairseq.distributed import fsdp_wrap +from fairseq.models.transformer import TransformerConfig +from fairseq.models.transformer.transformer_decoder import TransformerDecoderBase +from fairseq.modules import ( + LayerDropModuleList, + SinusoidalPositionalEmbedding, + transformer_layer_aug, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper + + +class AugTransformerDecoderBase(TransformerDecoderBase): + """ + Transformer decoder augmented with an additional cross-attention. Each layer + is a :class:`AugTransformerDecoderLayerBase`. + + Args: + cfg (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + encoder_attn_merge_type (str, optional): the way to combine outputs from + two cross-attention modules. If "sequential" is set, two cross-attention + modules are stacked sequentially. If "parallel" is set, they are processed + in parallel and combined before feeding it to FFN (default: sequential). + dropnet_ratio (float, optional): a probability to drop each cross-attention + module during training (default: 0.0). + """ + + def __init__( + self, + cfg, + dictionary, + embed_tokens, + output_projection=None, + encoder_attn_merge_type="sequential", + dropnet_ratio=0.0, + ): + super().__init__( + cfg, + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=output_projection, + ) + # assert cfg.cross_self_attention + self.cross_self_attention = cfg.cross_self_attention + + if self.decoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.decoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + self.build_decoder_layer(cfg, encoder_attn_merge_type, dropnet_ratio) + for _ in range(cfg.decoder.layers) + ] + ) + + def build_decoder_layer( + self, + cfg, + encoder_attn_merge_type="sequential", + dropnet_ratio=0, + ): + layer = transformer_layer_aug.AugTransformerDecoderLayerBase( + cfg, + no_encoder_attn=False, + encoder_attn_merge_type=encoder_attn_merge_type, + dropnet_ratio=dropnet_ratio, + ) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention, should be of size T x B x C + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return 
features without + applying output layer (default: False). + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + encoder_out_aug=encoder_out_aug, + incremental_state=incremental_state, + full_context_alignment=full_context_alignment, + alignment_layer=alignment_layer, + alignment_heads=alignment_heads, + ) + + if not features_only: + x = self.output_layer(x) + return x, extra + + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + encoder_out_aug: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + return self.extract_features_scriptable( + prev_output_tokens, + encoder_out, + encoder_out_aug, + incremental_state, + full_context_alignment, + alignment_layer, + alignment_heads, + ) + + """ + A scriptable subclass of this class has an extract_features method and calls + super().extract_features, but super() is not supported in torchscript. A copy of + this function is made to be used in the subclass instead. + """ + + def extract_features_scriptable( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + encoder_out_aug: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + """ + Similar to *forward* but only return features. + + Includes several features from "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + full_context_alignment (bool, optional): don't apply + auto-regressive mask to self-attention (default: False). + alignment_layer (int, optional): return mean alignment over + heads at this layer (default: last layer). + alignment_heads (int, optional): only average alignment over + this many heads (default: all heads). 
+ + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + bs, slen = prev_output_tokens.size() + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + enc: Optional[Tensor] = None + padding_mask: Optional[Tensor] = None + if encoder_out is not None and len(encoder_out["encoder_out"]) > 0: + enc = encoder_out["encoder_out"][0] + if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0: + padding_mask = encoder_out["encoder_padding_mask"][0] + + enc_aug: Optional[Tensor] = None + padding_mask_aug: Optional[Tensor] = None + if encoder_out_aug is not None and len(encoder_out_aug["encoder_out"]) > 0: + enc_aug = encoder_out_aug["encoder_out"][0] + if ( + encoder_out_aug is not None + and len(encoder_out_aug["encoder_padding_mask"]) > 0 + ): + padding_mask_aug = encoder_out_aug["encoder_padding_mask"][0] + + # embed positions + positions = None + if self.embed_positions is not None: + positions = self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # Prevent torchscript exporting issue for dynamic quant embedding + prev_output_tokens = prev_output_tokens.contiguous() + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + self_attn_padding_mask: Optional[Tensor] = None + if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + + # decoder layers + attn: Optional[Tensor] = None + attn_aug: Optional[Tensor] = None + inner_states: List[Optional[Tensor]] = [x] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, layer_attn_aug, _ = layer( + x, + enc, + padding_mask, + enc_aug, + padding_mask_aug, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + if layer_attn_aug is not None and idx == alignment_layer: + attn_aug = layer_attn_aug.float().to(x) + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if attn_aug is not None: + if alignment_heads is not None: + attn_aug = attn_aug[:alignment_heads] + + # average probabilities over heads + attn_aug = attn_aug.mean(dim=0) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x, {"attn": [attn], "attn_aug": [attn_aug], "inner_states": inner_states} + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly 
old) state dict for new versions of fairseq.""" + if f"{name}.output_projection.weight" not in state_dict: + if self.share_input_output_embed: + embed_out_key = f"{name}.embed_tokens.weight" + else: + embed_out_key = f"{name}.embed_out" + if embed_out_key in state_dict: + state_dict[f"{name}.output_projection.weight"] = state_dict[ + embed_out_key + ] + if not self.share_input_output_embed: + del state_dict[embed_out_key] + + for i in range(self.num_layers): + # update layer norms + layer_norm_map = { + "0": "self_attn_layer_norm", + "1": "encoder_attn_layer_norm", + "2": "encoder_attn_layer_norm2", + "3": "final_layer_norm", + } + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) + if k in state_dict: + state_dict[ + "{}.layers.{}.{}.{}".format(name, i, new, m) + ] = state_dict[k] + del state_dict[k] + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + + return state_dict + + +class AugTransformerDecoder(AugTransformerDecoderBase): + def __init__( + self, + args, + dictionary, + embed_tokens, + output_projection=None, + ): + self.args = args + super().__init__( + TransformerConfig.from_namespace(args), + dictionary, + embed_tokens, + no_encoder_attn=False, + output_projection=output_projection, + encoder_attn_merge_type=getattr( + args, "synthesizer_augmented_cross_attention_merge_type", "sequential" + ), + dropnet_ratio=getattr(args, "dropnet_ratio", 0), + ) + + def build_output_projection(self, args, dictionary, embed_tokens): + super().build_output_projection( + TransformerConfig.from_namespace(args), dictionary, embed_tokens + ) + + def build_decoder_layer( + self, + args, + encoder_attn_merge_type="sequential", + dropnet_ratio=0, + ): + return super().build_decoder_layer( + TransformerConfig.from_namespace(args), + no_encoder_attn=False, + encoder_attn_merge_type=encoder_attn_merge_type, + dropnet_ratio=dropnet_ratio, + ) diff --git a/fairseq/models/transformer/transformer_encoder.py b/fairseq/models/transformer/transformer_encoder.py new file mode 100644 index 0000000000..a684fcb448 --- /dev/null +++ b/fairseq/models/transformer/transformer_encoder.py @@ -0,0 +1,362 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
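The `upgrade_state_dict_named` override above migrates old checkpoints that stored the decoder layer norms under fused `layer_norms.{index}` keys to the explicit per-module names, including the extra `encoder_attn_layer_norm2` slot for the second (augmented) cross-attention. A small self-contained sketch of that renaming on a toy state dict; the key prefix, layer index, and tensor sizes are invented for illustration:

import torch

# Old-style checkpoint fragment for decoder layer 0 (toy values).
state_dict = {
    f"decoder.layers.0.layer_norms.{idx}.{m}": torch.ones(4)
    for idx in ("0", "1", "2", "3")
    for m in ("weight", "bias")
}

# Same mapping used by AugTransformerDecoderBase.upgrade_state_dict_named.
layer_norm_map = {
    "0": "self_attn_layer_norm",
    "1": "encoder_attn_layer_norm",
    "2": "encoder_attn_layer_norm2",
    "3": "final_layer_norm",
}
name, i = "decoder", 0
for old, new in layer_norm_map.items():
    for m in ("weight", "bias"):
        k = f"{name}.layers.{i}.layer_norms.{old}.{m}"
        if k in state_dict:
            state_dict[f"{name}.layers.{i}.{new}.{m}"] = state_dict.pop(k)

assert f"{name}.layers.0.encoder_attn_layer_norm2.weight" in state_dict
assert not any(".layer_norms." in k for k in state_dict)

The non-augmented decoder earlier in this diff performs the same migration with a three-entry map, since it has only one cross-attention block.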
+ +import math +from typing import Dict, List, Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from fairseq import utils +from fairseq.distributed import fsdp_wrap +from fairseq.models import FairseqEncoder +from fairseq.models.transformer import TransformerConfig +from fairseq.modules import ( + FairseqDropout, + LayerDropModuleList, + LayerNorm, + PositionalEmbedding, + SinusoidalPositionalEmbedding, + transformer_layer, +) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ + + +# rewrite name for backward compatibility in `make_generation_fast_` +def module_name_fordropout(module_name: str) -> str: + if module_name == "TransformerEncoderBase": + return "TransformerEncoder" + else: + return module_name + + +class TransformerEncoderBase(FairseqEncoder): + """ + Transformer encoder consisting of *cfg.encoder.layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): encoding dictionary + embed_tokens (torch.nn.Embedding): input embedding + """ + + def __init__(self, cfg, dictionary, embed_tokens, return_fc=False): + self.cfg = cfg + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + + self.dropout_module = FairseqDropout( + cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__) + ) + self.encoder_layerdrop = cfg.encoder.layerdrop + self.return_fc = return_fc + + embed_dim = embed_tokens.embedding_dim + self.padding_idx = embed_tokens.padding_idx + self.max_source_positions = cfg.max_source_positions + + self.embed_tokens = embed_tokens + + self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim) + + self.embed_positions = ( + PositionalEmbedding( + cfg.max_source_positions, + embed_dim, + self.padding_idx, + learned=cfg.encoder.learned_pos, + ) + if not cfg.no_token_positional_embeddings + else None + ) + if cfg.layernorm_embedding: + self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) + else: + self.layernorm_embedding = None + + if not cfg.adaptive_input and cfg.quant_noise.pq > 0: + self.quant_noise = apply_quant_noise_( + nn.Linear(embed_dim, embed_dim, bias=False), + cfg.quant_noise.pq, + cfg.quant_noise.pq_block_size, + ) + else: + self.quant_noise = None + + if self.encoder_layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.encoder_layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)] + ) + self.num_layers = len(self.layers) + + if cfg.encoder.normalize_before: + self.layer_norm = LayerNorm(embed_dim, export=cfg.export) + else: + self.layer_norm = None + + def build_encoder_layer(self, cfg): + layer = transformer_layer.TransformerEncoderLayerBase( + cfg, return_fc=self.return_fc + ) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward_embedding( + self, src_tokens, token_embedding: Optional[torch.Tensor] = None + ): + # embed tokens and positions + if token_embedding is None: + 
token_embedding = self.embed_tokens(src_tokens) + x = embed = self.embed_scale * token_embedding + if self.embed_positions is not None: + x = embed + self.embed_positions(src_tokens) + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + x = self.dropout_module(x) + if self.quant_noise is not None: + x = self.quant_noise(x) + return x, embed + + def forward( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + return self.forward_scriptable( + src_tokens, src_lengths, return_all_hiddens, token_embeddings + ) + + # TorchScript doesn't support super() method so that the scriptable Subclass + # can't access the base class model in Torchscript. + # Current workaround is to add a helper function with different name and + # call the helper function from scriptable Subclass. + def forward_scriptable( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + has_pads = ( + torch.tensor(src_tokens.device.type == "xla") or encoder_padding_mask.any() + ) + # Torchscript doesn't handle bool Tensor correctly, so we need to work around. 
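# In eager mode `has_pads` is a zero-dimensional bool tensor; the scripted branch
# below swaps in an explicit 0/1 tensor so that `has_pads.type_as(x)` in the
# padding-mask multiplication further down, and the `if has_pads` check in the
# encoder layer loop, still behave as intended when the module is scripted.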
+ if torch.jit.is_scripting(): + has_pads = torch.tensor(1) if has_pads else torch.tensor(0) + + x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) + + # account for padding while computing the representation + x = x * ( + 1 - encoder_padding_mask.unsqueeze(-1).type_as(x) * has_pads.type_as(x) + ) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + encoder_states = [] + fc_results = [] + + if return_all_hiddens: + encoder_states.append(x) + + # encoder layers + for layer in self.layers: + lr = layer( + x, encoder_padding_mask=encoder_padding_mask if has_pads else None + ) + + if isinstance(lr, tuple) and len(lr) == 2: + x, fc_result = lr + else: + x = lr + fc_result = None + + if return_all_hiddens and not torch.jit.is_scripting(): + assert encoder_states is not None + encoder_states.append(x) + fc_results.append(fc_result) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `forward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. + src_lengths = ( + src_tokens.ne(self.padding_idx) + .sum(dim=1, dtype=torch.int32) + .reshape(-1, 1) + .contiguous() + ) + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask], # B x T + "encoder_embedding": [encoder_embedding], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "fc_results": fc_results, # List[T x B x C] + "src_tokens": [], + "src_lengths": [src_lengths], + } + + @torch.jit.export + def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): + """ + Reorder encoder output according to *new_order*. + + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + if len(encoder_out["encoder_out"]) == 0: + new_encoder_out = [] + else: + new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)] + if len(encoder_out["encoder_padding_mask"]) == 0: + new_encoder_padding_mask = [] + else: + new_encoder_padding_mask = [ + encoder_out["encoder_padding_mask"][0].index_select(0, new_order) + ] + if len(encoder_out["encoder_embedding"]) == 0: + new_encoder_embedding = [] + else: + new_encoder_embedding = [ + encoder_out["encoder_embedding"][0].index_select(0, new_order) + ] + + if len(encoder_out["src_tokens"]) == 0: + src_tokens = [] + else: + src_tokens = [(encoder_out["src_tokens"][0]).index_select(0, new_order)] + + if len(encoder_out["src_lengths"]) == 0: + src_lengths = [] + else: + src_lengths = [(encoder_out["src_lengths"][0]).index_select(0, new_order)] + + encoder_states = encoder_out["encoder_states"] + if len(encoder_states) > 0: + for idx, state in enumerate(encoder_states): + encoder_states[idx] = state.index_select(1, new_order) + + return { + "encoder_out": new_encoder_out, # T x B x C + "encoder_padding_mask": new_encoder_padding_mask, # B x T + "encoder_embedding": new_encoder_embedding, # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": src_tokens, # B x T + "src_lengths": src_lengths, # B x 1 + } + + @torch.jit.export + def _reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): + """Dummy re-order function for beamable enc-dec attention""" + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + if 
self.embed_positions is None: + return self.max_source_positions + return min(self.max_source_positions, self.embed_positions.max_positions) + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + for i in range(self.num_layers): + # update layer norms + self.layers[i].upgrade_state_dict_named( + state_dict, "{}.layers.{}".format(name, i) + ) + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + return state_dict + + +class TransformerEncoder(TransformerEncoderBase): + def __init__(self, args, dictionary, embed_tokens, return_fc=False): + self.args = args + super().__init__( + TransformerConfig.from_namespace(args), + dictionary, + embed_tokens, + return_fc=return_fc, + ) + + def build_encoder_layer(self, args): + return super().build_encoder_layer( + TransformerConfig.from_namespace(args), + ) diff --git a/fairseq/models/transformer/transformer_legacy.py b/fairseq/models/transformer/transformer_legacy.py new file mode 100644 index 0000000000..00d14a7dde --- /dev/null +++ b/fairseq/models/transformer/transformer_legacy.py @@ -0,0 +1,277 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.dataclass.utils import gen_parser_from_dataclass +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.transformer.transformer_config import ( + TransformerConfig, + DEFAULT_MAX_SOURCE_POSITIONS, + DEFAULT_MAX_TARGET_POSITIONS, + DEFAULT_MIN_PARAMS_TO_WRAP, +) +from fairseq.models.transformer.transformer_base import ( + TransformerModelBase, +) + + +@register_model("transformer") +class TransformerModel(TransformerModelBase): + """ + This is the legacy implementation of the transformer model that + uses argparse for configuration. 
+ """ + + @classmethod + def hub_models(cls): + # fmt: off + + def moses_subword(path): + return { + 'path': path, + 'tokenizer': 'moses', + 'bpe': 'subword_nmt', + } + + def moses_fastbpe(path): + return { + 'path': path, + 'tokenizer': 'moses', + 'bpe': 'fastbpe', + } + + def spm(path): + return { + 'path': path, + 'bpe': 'sentencepiece', + 'tokenizer': 'space', + } + + return { + 'transformer.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2'), + 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', + 'transformer.wmt18.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz'), + 'transformer.wmt19.en-de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz'), + 'transformer.wmt19.en-ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz'), + 'transformer.wmt19.de-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz'), + 'transformer.wmt19.ru-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz'), + 'transformer.wmt19.en-de.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz'), + 'transformer.wmt19.en-ru.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz'), + 'transformer.wmt19.de-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz'), + 'transformer.wmt19.ru-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz'), + 'transformer.wmt20.en-ta': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-ta.single.tar.gz'), + 'transformer.wmt20.en-iu.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.news.single.tar.gz'), + 'transformer.wmt20.en-iu.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.nh.single.tar.gz'), + 'transformer.wmt20.ta-en': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta-en.single.tar.gz'), + 'transformer.wmt20.iu-en.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.news.single.tar.gz'), + 'transformer.wmt20.iu-en.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.nh.single.tar.gz'), + 'transformer.flores101.mm100.615M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz'), + 'transformer.flores101.mm100.175M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz'), + } + # fmt: on + + def __init__(self, args, encoder, decoder): + cfg = TransformerConfig.from_namespace(args) + super().__init__(cfg, encoder, decoder) + self.args = args + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + # we want to build the args recursively in this case. 
+ # do not set defaults so that settings defaults from various architectures still works + gen_parser_from_dataclass( + parser, TransformerConfig(), delete_default=True, with_prefix="" + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + if args.decoder_layers_to_keep: + args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) + + if getattr(args, "max_source_positions", None) is None: + args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + if args.share_all_embeddings: + if src_dict != tgt_dict: + raise ValueError("--share-all-embeddings requires a joined dictionary") + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + args.share_decoder_input_output_embed = True + + if getattr(args, "offload_activations", False): + args.checkpoint_activations = True # offloading implies checkpointing + + if not args.share_all_embeddings: + args.min_params_to_wrap = getattr( + args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP + ) + cfg = TransformerConfig.from_namespace(args) + return super().build_model(cfg, task) + + @classmethod + def build_embedding(cls, args, dictionary, embed_dim, path=None): + return super().build_embedding( + TransformerConfig.from_namespace(args), dictionary, embed_dim, path + ) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return super().build_encoder( + TransformerConfig.from_namespace(args), src_dict, embed_tokens + ) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return super().build_decoder( + TransformerConfig.from_namespace(args), tgt_dict, embed_tokens + ) + + +# architectures + + +@register_model_architecture("transformer", "transformer_tiny") +def tiny_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 64) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 64) + args.encoder_layers = getattr(args, "encoder_layers", 2) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 2) + args.decoder_layers = getattr(args, "decoder_layers", 2) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 2) + return base_architecture(args) + + +@register_model_architecture("transformer", "transformer") +def base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + 
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", False) + args.merge_src_tgt_embed = getattr(args, "merge_src_tgt_embed", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.no_cross_attention = getattr(args, "no_cross_attention", False) + args.cross_self_attention = getattr(args, "cross_self_attention", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.checkpoint_activations = getattr(args, "checkpoint_activations", False) + args.offload_activations = getattr(args, "offload_activations", False) + if args.offload_activations: + args.checkpoint_activations = True + args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) + args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) + args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) + args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8) + args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) + + +@register_model_architecture("transformer", "transformer_iwslt_de_en") +def transformer_iwslt_de_en(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.decoder_layers = getattr(args, "decoder_layers", 6) + base_architecture(args) + + +@register_model_architecture("transformer", "transformer_wmt_en_de") +def transformer_wmt_en_de(args): + base_architecture(args) + + +# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) +@register_model_architecture("transformer", 
"transformer_vaswani_wmt_en_de_big") +def transformer_vaswani_wmt_en_de_big(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.dropout = getattr(args, "dropout", 0.3) + base_architecture(args) + + +@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big") +def transformer_vaswani_wmt_en_fr_big(args): + args.dropout = getattr(args, "dropout", 0.1) + transformer_vaswani_wmt_en_de_big(args) + + +@register_model_architecture("transformer", "transformer_wmt_en_de_big") +def transformer_wmt_en_de_big(args): + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + transformer_vaswani_wmt_en_de_big(args) + + +# default parameters used in tensor2tensor implementation +@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t") +def transformer_wmt_en_de_big_t2t(args): + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.activation_dropout = getattr(args, "activation_dropout", 0.1) + transformer_vaswani_wmt_en_de_big(args) diff --git a/fairseq/models/transformer_lm.py b/fairseq/models/transformer_lm.py index df809bdb19..1e3aa72d38 100644 --- a/fairseq/models/transformer_lm.py +++ b/fairseq/models/transformer_lm.py @@ -7,6 +7,8 @@ from dataclasses import dataclass, field from typing import Optional +from omegaconf import II + from fairseq import options, utils from fairseq.dataclass import ChoiceEnum, FairseqDataclass from fairseq.models import ( @@ -14,10 +16,13 @@ register_model, register_model_architecture, ) -from fairseq.models.transformer import Embedding, TransformerDecoder +from fairseq.models.transformer import ( + DEFAULT_MIN_PARAMS_TO_WRAP, + Embedding, + TransformerDecoder, +) from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder -from omegaconf import II - +from fairseq.utils import safe_getattr, safe_hasattr DEFAULT_MAX_TARGET_POSITIONS = 1024 @@ -126,6 +131,20 @@ class TransformerLanguageModelConfig(FairseqDataclass): default=False, metadata={"help": "use learned positional embeddings in the decoder"}, ) + layernorm_embedding: bool = field( + default=False, metadata={"help": "add layernorm to embedding"} + ) + no_scale_embedding: bool = field( + default=False, metadata={"help": "if True, dont scale embeddings"} + ) + checkpoint_activations: bool = field( + default=False, metadata={"help": "checkpoint activations at each layer"} + ) + offload_activations: bool = field( + default=False, + metadata={"help": "move checkpointed activations to CPU after they are used."}, + ) + # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) decoder_layerdrop: float = field( default=0.0, metadata={"help": "LayerDrop probability for decoder"} ) @@ -135,12 +154,7 @@ class TransformerLanguageModelConfig(FairseqDataclass): "help": "which layers to *keep* when pruning as a comma-separated list" }, ) - layernorm_embedding: bool = field( - 
default=False, metadata={"help": "add layernorm to embedding"} - ) - no_scale_embedding: bool = field( - default=False, metadata={"help": "if True, dont scale embeddings"} - ) + # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) quant_noise_pq: float = field( default=0.0, metadata={"help": "iterative PQ quantization noise at training time"}, @@ -149,13 +163,63 @@ class TransformerLanguageModelConfig(FairseqDataclass): default=8, metadata={"help": "block size of quantization noise at training time"}, ) - # TODO common var add to parent quant_noise_scalar: float = field( default=0.0, metadata={ "help": "scalar quantization noise and scalar quantization at training time" }, ) + # config for Fully Sharded Data Parallel (FSDP) training + min_params_to_wrap: int = field( + default=DEFAULT_MIN_PARAMS_TO_WRAP, + metadata={ + "help": ( + "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed." + ) + }, + ) + # config for "BASE Layers: Simplifying Training of Large, Sparse Models" + base_layers: Optional[int] = field( + default=0, metadata={"help": "number of BASE layers in total"} + ) + base_sublayers: Optional[int] = field( + default=1, metadata={"help": "number of sublayers in each BASE layer"} + ) + base_shuffle: Optional[int] = field( + default=1, + metadata={"help": "shuffle tokens between workers before computing assignment"}, + ) + # NormFormer + scale_fc: Optional[bool] = field( + default=False, + metadata={"help": "Insert LayerNorm between fully connected layers"}, + ) + scale_attn: Optional[bool] = field( + default=False, metadata={"help": "Insert LayerNorm after attention"} + ) + scale_heads: Optional[bool] = field( + default=False, + metadata={"help": "Learn a scale coefficient for each attention head"}, + ) + scale_resids: Optional[bool] = field( + default=False, + metadata={"help": "Learn a scale coefficient for each residual connection"}, + ) + + # xFormers arguments + decoder_xformers_att_config: Optional[str] = field( + default=None, + metadata={ + "help": "config for xFormers library attention, defined in xformers.components.attention.AttentionConfig", + }, + ) + + # options from other parts of the config add_bos_token: bool = II("task.add_bos_token") tokens_per_sample: int = II("task.tokens_per_sample") max_target_positions: Optional[int] = II("task.max_target_positions") @@ -169,6 +233,9 @@ def hub_models(cls): def moses_fastbpe(path): return {"path": path, "tokenizer": "moses", "bpe": "fastbpe"} + def spm(path): + return {"path": path, "tokenizer": "space", "bpe": "sentencepiece"} + return { "transformer_lm.gbw.adaptive_huge": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2", "transformer_lm.wiki103.adaptive": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2", @@ -181,6 +248,18 @@ def moses_fastbpe(path): "transformer_lm.wmt19.ru": moses_fastbpe( "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2" ), + "transformer_lm.wmt20.en": spm( + "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.en.tar.gz" + ), + "transformer_lm.wmt20.ta": spm( + "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.ta.tar.gz" + ), + 
"transformer_lm.wmt20.iu.news": spm( + "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.iu.news.tar.gz" + ), + "transformer_lm.wmt20.iu.nh": spm( + "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt20.iu.nh.tar.gz" + ), } def __init__(self, decoder): @@ -190,14 +269,11 @@ def __init__(self, decoder): def build_model(cls, args, task): """Build a new model instance.""" - # make sure all arguments are present in older models - base_lm_architecture(args) - if args.decoder_layers_to_keep: args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) - if getattr(args, "max_target_positions", None) is None: - args.max_target_positions = getattr( + if safe_getattr(args, "max_target_positions", None) is None: + args.max_target_positions = safe_getattr( args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS ) @@ -246,148 +322,286 @@ def build_embedding(cls, args, dictionary, embed_dim, path=None): return embed_tokens -@register_model_architecture("transformer_lm", "transformer_lm") def base_lm_architecture(args): # backward compatibility for older model checkpoints - if hasattr(args, "no_tie_adaptive_proj"): + if safe_hasattr(args, "no_tie_adaptive_proj"): # previous models defined --no-tie-adaptive-proj, so use the existence of # that option to determine if this is an "old" model checkpoint args.no_decoder_final_norm = True # old models always set this to True if args.no_tie_adaptive_proj is False: args.tie_adaptive_proj = True - if hasattr(args, "decoder_final_norm"): + if safe_hasattr(args, "decoder_final_norm"): args.no_decoder_final_norm = not args.decoder_final_norm - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.activation_fn = getattr(args, "activation_fn", "relu") - - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) - args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) - args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) - args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8) - args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) - - args.add_bos_token = getattr(args, "add_bos_token", False) - args.no_token_positional_embeddings = getattr( + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0) + + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 2048) + args.decoder_layers = safe_getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8) + args.adaptive_softmax_cutoff = safe_getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0) + args.adaptive_softmax_factor = safe_getattr(args, "adaptive_softmax_factor", 4) + args.decoder_learned_pos = 
safe_getattr(args, "decoder_learned_pos", False) + args.activation_fn = safe_getattr(args, "activation_fn", "relu") + + args.decoder_layerdrop = safe_getattr(args, "decoder_layerdrop", 0) + args.decoder_layers_to_keep = safe_getattr(args, "decoder_layers_to_keep", None) + args.quant_noise_pq = safe_getattr(args, "quant_noise_pq", 0) + args.quant_noise_pq_block_size = safe_getattr(args, "quant_noise_pq_block_size", 8) + args.quant_noise_scalar = safe_getattr(args, "quant_noise_scalar", 0) + + args.base_layers = safe_getattr(args, "base_layers", 0) + args.base_sublayers = safe_getattr(args, "base_sublayers", 1) + args.base_shuffle = safe_getattr(args, "base_shuffle", False) + + args.add_bos_token = safe_getattr(args, "add_bos_token", False) + args.no_token_positional_embeddings = safe_getattr( args, "no_token_positional_embeddings", False ) - args.share_decoder_input_output_embed = getattr( + args.share_decoder_input_output_embed = safe_getattr( args, "share_decoder_input_output_embed", False ) - args.character_embeddings = getattr(args, "character_embeddings", False) + args.character_embeddings = safe_getattr(args, "character_embeddings", False) - args.decoder_output_dim = getattr( + args.decoder_output_dim = safe_getattr( args, "decoder_output_dim", args.decoder_embed_dim ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + args.decoder_input_dim = safe_getattr( + args, "decoder_input_dim", args.decoder_embed_dim + ) # Model training is not stable without this args.decoder_normalize_before = True - args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", False) + args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", False) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4) - args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None) + args.adaptive_input = safe_getattr(args, "adaptive_input", False) + args.adaptive_input_factor = safe_getattr(args, "adaptive_input_factor", 4) + args.adaptive_input_cutoff = safe_getattr(args, "adaptive_input_cutoff", None) - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) - args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False) + args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", False) + args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", False) - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + args.no_scale_embedding = safe_getattr(args, "no_scale_embedding", False) + args.layernorm_embedding = safe_getattr(args, "layernorm_embedding", False) + args.checkpoint_activations = safe_getattr(args, "checkpoint_activations", False) + args.offload_activations = safe_getattr(args, "offload_activations", False) + args.scale_fc = safe_getattr(args, "scale_fc", False) + args.scale_attn = safe_getattr(args, "scale_attn", False) + args.scale_heads = safe_getattr(args, "scale_heads", False) + args.scale_resids = safe_getattr(args, "scale_resids", False) + if args.offload_activations: + args.checkpoint_activations = True @register_model_architecture("transformer_lm", "transformer_lm_big") def transformer_lm_big(args): - args.decoder_layers = getattr(args, "decoder_layers", 12) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - 
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.decoder_layers = safe_getattr(args, "decoder_layers", 12) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16) base_lm_architecture(args) @register_model_architecture("transformer_lm", "transformer_lm_wiki103") @register_model_architecture("transformer_lm", "transformer_lm_baevski_wiki103") def transformer_lm_baevski_wiki103(args): - args.decoder_layers = getattr(args, "decoder_layers", 16) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.dropout = getattr(args, "dropout", 0.3) - args.adaptive_input = getattr(args, "adaptive_input", True) - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", True) - args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", "20000,60000") - args.adaptive_softmax_cutoff = getattr( + args.decoder_layers = safe_getattr(args, "decoder_layers", 16) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 8) + args.dropout = safe_getattr(args, "dropout", 0.3) + args.adaptive_input = safe_getattr(args, "adaptive_input", True) + args.tie_adaptive_weights = safe_getattr(args, "tie_adaptive_weights", True) + args.adaptive_input_cutoff = safe_getattr( + args, "adaptive_input_cutoff", "20000,60000" + ) + args.adaptive_softmax_cutoff = safe_getattr( args, "adaptive_softmax_cutoff", "20000,60000" ) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0.2) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.1) - args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", True) - args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", True) + args.adaptive_softmax_dropout = safe_getattr(args, "adaptive_softmax_dropout", 0.2) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_dropout = safe_getattr(args, "activation_dropout", 0.1) + args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True) + args.tie_adaptive_proj = safe_getattr(args, "tie_adaptive_proj", True) transformer_lm_big(args) @register_model_architecture("transformer_lm", "transformer_lm_gbw") @register_model_architecture("transformer_lm", "transformer_lm_baevski_gbw") def transformer_lm_baevski_gbw(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", True) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 512) + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.no_decoder_final_norm = safe_getattr(args, "no_decoder_final_norm", True) transformer_lm_big(args) @register_model_architecture("transformer_lm", "transformer_lm_gpt") def transformer_lm_gpt(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 768) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 3072) - args.decoder_layers = getattr(args, "decoder_layers", 12) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = 
getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 3072) + args.decoder_layers = safe_getattr(args, "decoder_layers", 12) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 12) + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") base_lm_architecture(args) @register_model_architecture("transformer_lm", "transformer_lm_gpt2_small") def transformer_lm_gpt2_small(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_layers = getattr(args, "decoder_layers", 24) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_layers = safe_getattr(args, "decoder_layers", 24) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16) + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") + base_lm_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt2_tiny") +def transformer_lm_gpt2_tiny(args): + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 64) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 64) + args.decoder_layers = safe_getattr(args, "decoder_layers", 2) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 1) + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") base_lm_architecture(args) @register_model_architecture("transformer_lm", "transformer_lm_gpt2_medium") def transformer_lm_gpt2_medium(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 5120) - args.decoder_layers = getattr(args, "decoder_layers", 36) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 20) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1280) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 5120) + args.decoder_layers = safe_getattr(args, "decoder_layers", 36) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 20) + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") + base_lm_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big") +def transformer_lm_gpt2_big(args): + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1600) + args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 6400) + args.decoder_layers 
= safe_getattr(args, "decoder_layers", 48) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 25) + args.dropout = safe_getattr(args, "dropout", 0.1) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") + base_lm_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big_wide") +def transformer_lm_gpt2_big_wide(args): + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 2048) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 8192) + args.decoder_layers = getattr(args, "decoder_layers", 24) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 32) args.dropout = getattr(args, "dropout", 0.1) args.attention_dropout = getattr(args, "attention_dropout", 0.1) args.activation_fn = getattr(args, "activation_fn", "gelu") base_lm_architecture(args) -@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big") -def transformer_lm_gpt2_big(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1600) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6400) +@register_model_architecture("transformer_lm", "transformer_lm_gpt2_bigger") +def transformer_lm_gpt2_bigger(args): + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 2048) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 8192) args.decoder_layers = getattr(args, "decoder_layers", 48) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 25) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 32) args.dropout = getattr(args, "dropout", 0.1) args.attention_dropout = getattr(args, "attention_dropout", 0.1) args.activation_fn = getattr(args, "activation_fn", "gelu") base_lm_architecture(args) + + +def base_gpt3_architecture(args): + args.decoder_input_dim = args.decoder_embed_dim + args.decoder_output_dim = args.decoder_embed_dim + args.decoder_ffn_embed_dim = safe_getattr( + args, "decoder_ffn_embed_dim", args.decoder_embed_dim * 4 + ) + # GPT-3 used learned positional embeddings, rather than sinusoidal + args.decoder_learned_pos = safe_getattr(args, "decoder_learned_pos", True) + args.dropout = safe_getattr(args, "dropout", 0.0) + args.attention_dropout = safe_getattr(args, "attention_dropout", 0.0) + args.activation_fn = safe_getattr(args, "activation_fn", "gelu") + args.share_decoder_input_output_embed = True + base_lm_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_small") +def transformer_lm_gpt3_small(args): + # 125M params + args.decoder_layers = safe_getattr(args, "decoder_layers", 12) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 768) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 12) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_medium") +def transformer_lm_gpt3_medium(args): + # 350M params + args.decoder_layers = safe_getattr(args, "decoder_layers", 24) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 1024) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_large") +def transformer_lm_gpt3_large(args): + # 760M params + args.decoder_layers = safe_getattr(args, "decoder_layers", 24) + args.decoder_embed_dim = 
safe_getattr(args, "decoder_embed_dim", 1536) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 16) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_xl") +def transformer_lm_gpt3_xl(args): + # 1.3B params + args.decoder_layers = safe_getattr(args, "decoder_layers", 24) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_2_7") +def transformer_lm_gpt3_2_7(args): + # 2.7B params + args.decoder_layers = safe_getattr(args, "decoder_layers", 32) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2560) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_6_7") +def transformer_lm_gpt3_6_7(args): + # 6.7B params + args.decoder_layers = safe_getattr(args, "decoder_layers", 32) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 4096) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_13") +def transformer_lm_gpt3_13(args): + # 13B params + args.decoder_layers = safe_getattr(args, "decoder_layers", 40) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 5120) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 40) + base_gpt3_architecture(args) + + +@register_model_architecture("transformer_lm", "transformer_lm_gpt3_175") +def transformer_lm_gpt3_175(args): + # 175B params + args.decoder_layers = safe_getattr(args, "decoder_layers", 96) + args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 12288) + args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 96) + base_gpt3_architecture(args) diff --git a/fairseq/models/transformer_ulm.py b/fairseq/models/transformer_ulm.py new file mode 100644 index 0000000000..0fc9ae4348 --- /dev/null +++ b/fairseq/models/transformer_ulm.py @@ -0,0 +1,408 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +from dataclasses import dataclass, field +from fairseq.models.fairseq_decoder import FairseqDecoder +import numpy as np +from typing import Optional, Dict, Any, List +import torch +from torch import nn +from fairseq.data.data_utils import compute_mask_indices +from fairseq.dataclass import ChoiceEnum +from fairseq.models import ( + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.tasks.speech_ulm_task import SpeechUnitLanguageModelingTask +from fairseq.models.transformer import Embedding, TransformerDecoder, Linear +from fairseq.models.transformer_lm import TransformerLanguageModelConfig +from torch import Tensor + + +DEFAULT_MAX_TARGET_POSITIONS = 1024 +MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(["static", "uniform", "normal", "poisson"]) + + +@dataclass +class SpeechUnitLanguageModelConfig(TransformerLanguageModelConfig): + mask_unit_seg_prob: float = field( + default=0.0, metadata={"help": "probability to mask a segment of unit sequence"} + ) + mask_unit_seg_leng: int = field( + default=5, metadata={"help": "length of unit segment mask"} + ) + mask_unit_seg_type: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose unit mask length"} + ) + + mask_dur_prob: float = field( + default=0.0, metadata={"help": "probability to mask entire duration sequence"} + ) + mask_dur_seg_prob: float = field( + default=0.0, + metadata={"help": "probability to mask a segment of duration sequence"}, + ) + mask_dur_seg_leng: int = field( + default=5, metadata={"help": "length of duration segment mask"} + ) + mask_dur_seg_type: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose duration mask length"} + ) + + mask_f0_prob: float = field( + default=0.0, metadata={"help": "probability to mask entire duration sequence"} + ) + mask_f0_seg_prob: float = field( + default=0.0, metadata={"help": "probability to mask a segment of f0 sequence"} + ) + mask_f0_seg_leng: int = field( + default=5, metadata={"help": "length of f0 segment mask"} + ) + mask_f0_seg_type: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose f0 mask length"} + ) + + +@register_model("transformer_ulm", dataclass=SpeechUnitLanguageModelConfig) +class TransformerUnitLanguageModel(FairseqLanguageModel): + def __init__( + self, + cfg: SpeechUnitLanguageModelConfig, + task: SpeechUnitLanguageModelingTask, + decoder: FairseqDecoder, + ): + super().__init__(decoder) + self.cfg = cfg + + self.channel_names = task.channel_names + self.channel_sizes = task.channel_sizes + + self.unit_mask_val = task.source_dictionary.unk() + self.dur_mask_val = ( + task.source_duration_dictionary.unk() if task.cfg.discrete_duration else 0 + ) + self.f0_mask_val = ( + task.source_f0_dictionary.unk() if task.cfg.discrete_f0 else 0 + ) + + self.ignore_duration_input = task.cfg.ignore_duration_input + self.ignore_f0_input = task.cfg.ignore_f0_input + + @classmethod + def build_model(cls, args, task): + base_ulm_architecture(args) + + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = getattr( + args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS + ) + + embed_tokens = Embedding( + len(task.source_dictionary), + args.decoder_input_dim, + padding_idx=task.source_dictionary.pad(), + ) + embed_duration = None + if task.cfg.discrete_duration: + embed_duration = Embedding( + len(task.source_duration_dictionary), + args.decoder_input_dim, + padding_idx=0, # duration uses 0 for padding + ) + 
embed_f0 = None + if task.cfg.discrete_f0: + embed_f0 = Embedding( + len(task.source_f0_dictionary), + args.decoder_input_dim, + padding_idx=task.source_f0_dictionary.pad(), + ) + + decoder = MultiStreamTransformerDecoder( + args, + task.target_dictionary, + embed_tokens, + [embed_duration, embed_f0], + no_encoder_attn=True, + channel_sizes=task.channel_sizes, + ) + + return cls(args, task, decoder) + + def apply_seg_dropout(self, inp, mask_prob, mask_leng, mask_type, mask_val): + B, T = inp.size() + if mask_prob > 0: + mask_indices = compute_mask_indices( + (B, T), None, mask_prob, mask_leng, mask_type # may mask padding + ) + mask_indices = torch.from_numpy(mask_indices).to(inp.device) + inp[mask_indices] = mask_val + else: + mask_indices = torch.zeros_like(inp).bool() + return inp, mask_indices + + def apply_seq_dropout(self, inp, mask_prob, mask_val): + B, T = inp.size() + if mask_prob > 0: + mask_indices = np.random.uniform(0, 1, (B,)) < mask_prob + mask_indices = ( + torch.from_numpy(mask_indices).to(inp.device).unsqueeze(1).expand(-1, T) + ) + inp[mask_indices] = mask_val + else: + mask_indices = torch.zeros_like(inp).bool() + return inp, mask_indices + + def apply_dropout(self, src_tokens, dur_src, f0_src): + src_tokens, unit_mask = self.apply_seg_dropout( + src_tokens, + self.cfg.mask_unit_seg_prob, + self.cfg.mask_unit_seg_leng, + self.cfg.mask_unit_seg_type, + self.unit_mask_val, + ) + + dur_src, dur_mask = self.apply_seq_dropout( + dur_src, self.cfg.mask_dur_prob, self.dur_mask_val + ) + dur_src, _dur_mask = self.apply_seg_dropout( + dur_src, + self.cfg.mask_dur_seg_prob, + self.cfg.mask_dur_seg_leng, + self.cfg.mask_dur_seg_type, + self.dur_mask_val, + ) + dur_mask = dur_mask.logical_or(_dur_mask) + + f0_src, f0_mask = self.apply_seq_dropout( + f0_src, self.cfg.mask_f0_prob, self.f0_mask_val + ) + f0_src, _f0_mask = self.apply_seg_dropout( + f0_src, + self.cfg.mask_f0_seg_prob, + self.cfg.mask_f0_seg_leng, + self.cfg.mask_f0_seg_type, + self.f0_mask_val, + ) + f0_mask = f0_mask.logical_or(_f0_mask) + + return src_tokens, unit_mask, dur_src, dur_mask, f0_src, f0_mask + + def forward( + self, + src_tokens: torch.Tensor, + dur_src: torch.Tensor, + f0_src: torch.Tensor, + src_lengths: Optional[Any] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + ): + if self.ignore_duration_input: + dur_src = torch.zeros_like(dur_src) + + if self.ignore_f0_input: + f0_src = torch.zeros_like(f0_src) + + if self.training: + ( + src_tokens, + unit_mask, + dur_src, + dur_mask, + f0_src, + f0_mask, + ) = self.apply_dropout(src_tokens, dur_src, f0_src) + else: + unit_masks = dur_mask = f0_mask = None + + prediction, _ = self.decoder( + prev_output_tokens=(src_tokens, dur_src, f0_src), + incremental_state=incremental_state, + src_lengths=src_lengths, + features_only=True, + ) + + result = dict(zip(self.channel_names, prediction)) + + return result + + +def base_ulm_architecture(args): + from .transformer_lm import base_lm_architecture + + base_lm_architecture(args) + + +@register_model_architecture("transformer_ulm", "transformer_ulm_big") +def transformer_ulm_big(args): + from .transformer_lm import transformer_lm_big + + transformer_lm_big(args) + base_ulm_architecture(args) + + +@register_model_architecture("transformer_ulm", "transformer_ulm_tiny") +def transformer_ulm_tiny(args): + from .transformer_lm import transformer_lm_gpt2_tiny + + transformer_lm_gpt2_tiny(args) + base_ulm_architecture(args) + + +class MultiStreamTransformerDecoder(TransformerDecoder): + 
def __init__( + self, + args, + dictionary, + embed_tokens, + embed_other_list, + no_encoder_attn, + channel_sizes, + ): + super().__init__( + args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn + ) + + # embed each channel and project if dimensions do not match + self.embed_other_list = torch.nn.ModuleList(embed_other_list) + self.proj_other_list = torch.nn.ModuleList() + dim = embed_tokens.embedding_dim + for embed_other in embed_other_list: + other_dim = 1 if embed_other is None else embed_other.embedding_dim + self.proj_other_list.append( + nn.Linear(other_dim, dim) if other_dim != dim else None + ) + + # tranformer output to prediction + self.channel_sizes = channel_sizes + self.project_out_dim = Linear( + embed_tokens.embedding_dim, sum(channel_sizes), bias=False + ) + + def extract_features_scriptable( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, + alignment_layer: Optional[int] = None, + alignment_heads: Optional[int] = None, + ): + if alignment_layer is None: + alignment_layer = self.num_layers - 1 + + # XXX: first multi-channel change start + prev_output_tokens, *other_channels = prev_output_tokens + # XXX: first multi-channel change end + + # embed positions + positions = None + if self.embed_positions is not None: + positions = self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + other_channels = [o[:, -1:] for o in other_channels] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + # XXX: second multi-channel change start + other_channels = [ + o.unsqueeze(-1).to(dtype=x.dtype) if emb is None else emb(o) + for o, emb in zip(other_channels, self.embed_other_list) + ] + other_channels = [ + o if proj_other is None else proj_other(o) + for o, proj_other in zip(other_channels, self.proj_other_list) + ] + for o in other_channels: + x = x + o + # XXX: second multi-channel change end + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + if self.layernorm_embedding is not None: + x = self.layernorm_embedding(x) + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + self_attn_padding_mask: Optional[Tensor] = None + if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) + + # decoder layers + attn: Optional[Tensor] = None + inner_states: List[Optional[Tensor]] = [x] + for idx, layer in enumerate(self.layers): + if incremental_state is None and not full_context_alignment: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + x, layer_attn, _ = layer( + x, + encoder_out["encoder_out"][0] + if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) + else None, + encoder_out["encoder_padding_mask"][0] + if ( + encoder_out is not None + and len(encoder_out["encoder_padding_mask"]) > 0 + ) + else None, + incremental_state, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_attn=bool((idx == alignment_layer)), + need_head_weights=bool((idx == alignment_layer)), + ) + 
inner_states.append(x) + if layer_attn is not None and idx == alignment_layer: + attn = layer_attn.float().to(x) + + if attn is not None: + if alignment_heads is not None: + attn = attn[:alignment_heads] + + # average probabilities over heads + attn = attn.mean(dim=0) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + else: + assert False + + # XXX: the last change start + result = [] + start = 0 + for channel_size in self.channel_sizes: + end = start + channel_size + result.append(x[:, :, start:end]) + start = end + assert end == x.size(-1) + # XXX: the last change end + + return result, {"attn": [attn], "inner_states": inner_states} diff --git a/fairseq/models/wav2vec/__init__.py b/fairseq/models/wav2vec/__init__.py index 06cec18183..b756e4580b 100644 --- a/fairseq/models/wav2vec/__init__.py +++ b/fairseq/models/wav2vec/__init__.py @@ -6,3 +6,5 @@ from .wav2vec import * # noqa from .wav2vec2 import * # noqa from .wav2vec2_asr import * # noqa +from .wav2vec2_laser import * # noqa +from .wav2vec2_classification import * # noqa diff --git a/fairseq/models/wav2vec/utils.py b/fairseq/models/wav2vec/utils.py new file mode 100644 index 0000000000..dd52d86242 --- /dev/null +++ b/fairseq/models/wav2vec/utils.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import torch.nn.functional as F + + +def pad_to_multiple(x, multiple, dim=-1, value=0): + # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41 + if x is None: + return None, 0 + tsz = x.size(dim) + m = tsz / multiple + remainder = math.ceil(m) * multiple - tsz + if m.is_integer(): + return x, 0 + pad_offset = (0,) * (-1 - dim) * 2 + + return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder diff --git a/fairseq/models/wav2vec/wav2vec.py b/fairseq/models/wav2vec/wav2vec.py index 772995b526..af6604da10 100644 --- a/fairseq/models/wav2vec/wav2vec.py +++ b/fairseq/models/wav2vec/wav2vec.py @@ -3,14 +3,18 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass, field import logging import math +from typing import Optional, Tuple +from omegaconf import II import sys import torch import torch.nn as nn import torch.nn.functional as F -from fairseq.models import BaseFairseqModel, register_model, register_model_architecture +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.models import BaseFairseqModel, register_model from fairseq.modules import ( Fp32GroupNorm, Fp32LayerNorm, @@ -18,264 +22,208 @@ KmeansVectorQuantizer, TransposeLast, ) +from fairseq.tasks import FairseqTask from fairseq.utils import buffered_arange logger = logging.getLogger(__name__) -@register_model("wav2vec") -class Wav2VecModel(BaseFairseqModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - parser.add_argument( - "--prediction-steps", - type=int, - metavar="N", - help="number of steps ahead to predict", - ) - parser.add_argument( - "--sample-distance", - type=int, - metavar="N", - help="sample distance from target. 
does not work properly with cross-sampling", - ) - parser.add_argument( - "--cross-sample-negatives", - type=int, - metavar="N", - help="num of cross sampled negatives", - ) - parser.add_argument( - "--num-negatives", type=int, metavar="N", help="number of negative examples" - ) - parser.add_argument( - "--conv-feature-layers", - type=str, - metavar="EXPR", - help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", - ) - parser.add_argument( - "--conv-aggregator-layers", - type=str, - metavar="EXPR", - help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", - ) - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="dropout to apply within the model", - ) - parser.add_argument( - "--dropout-features", - type=float, - metavar="D", - help="dropout to apply to the features", - ) - parser.add_argument( - "--dropout-agg", - type=float, - metavar="D", - help="dropout to apply after aggregation step", - ) - parser.add_argument( - "--encoder", type=str, choices=["cnn"], help="type of encoder to use" - ) - parser.add_argument( - "--aggregator", - type=str, - choices=["cnn", "gru"], - help="type of aggregator to use", - ) - parser.add_argument( - "--gru-dim", type=int, metavar="N", help="GRU dimensionality" - ) - - parser.add_argument( - "--no-conv-bias", - action="store_true", - help="if set, does not learn bias for conv layers", - ) - parser.add_argument( - "--agg-zero-pad", - action="store_true", - help="if set, zero pads in aggregator instead of repl pad", - ) +AGGREGATOR_CHOICES = ChoiceEnum(["cnn", "gru"]) +PROJECT_FEATURES_CHOICES = ChoiceEnum(["none", "same", "new"]) +ACTIVATION_CHOICES = ChoiceEnum(["relu", "gelu"]) +VQ_TYPE_CHOICES = ChoiceEnum(["none", "gumbel", "kmeans"]) - parser.add_argument( - "--skip-connections-feat", - action="store_true", - help="if set, adds skip connections to the feature extractor", - ) - parser.add_argument( - "--skip-connections-agg", - action="store_true", - help="if set, adds skip connections to the aggregator", - ) - parser.add_argument( - "--residual-scale", - type=float, - metavar="D", - help="scales residual by sqrt(value)", - ) - - parser.add_argument( - "--log-compression", - action="store_true", - help="if set, adds a log compression to feature extractor", - ) - - parser.add_argument( - "--balanced-classes", - action="store_true", - help="if set, loss is scaled to balance for number of negatives", - ) - parser.add_argument( - "--project-features", - choices=["none", "same", "new"], - help="if not none, features are projected using the (same or new) aggregator", - ) - - parser.add_argument( - "--non-affine-group-norm", - action="store_true", - help="if set, group norm is not affine", - ) - - parser.add_argument( - "--offset", - help="if set, introduces an offset from target to predictions. " - 'if set to "auto", it is computed automatically from the receptive field', - ) - - parser.add_argument( - "--activation", - type=str, - choices=["relu", "gelu"], - help="which activation function to use", - ) +@dataclass +class Wav2VecConfig(FairseqDataclass): + prediction_steps: int = field( + default=12, metadata={"help": "number of steps ahead to predict"} + ) + sample_distance: Optional[int] = field( + default=None, + metadata={ + "help": "sample distance from target. 
does not work properly with cross-sampling"
+        },
+    )
+    cross_sample_negatives: int = field(
+        default=0, metadata={"help": "num of cross sampled negatives"}
+    )
+    num_negatives: int = field(
+        default=10, metadata={"help": "num of sampled negatives"}
+    )
+    conv_feature_layers: str = field(
+        default="[(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)]",
+        metadata={
+            "help": "convolutional feature extraction layers [(dim, kernel_size, stride), ...]"
+        },
+    )
+    conv_aggregator_layers: str = field(
+        default="[(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]",
+        metadata={
+            "help": "convolutional aggregator layers [(dim, kernel_size, stride), ...]"
+        },
+    )
+    dropout: float = field(
+        default=0.0, metadata={"help": "dropout to apply within the model"}
+    )
+    dropout_features: float = field(
+        default=0.0, metadata={"help": "dropout to apply to the features"}
+    )
+    dropout_agg: float = field(
+        default=0.0, metadata={"help": "dropout to apply after aggregation step"}
+    )
+    aggregator: AGGREGATOR_CHOICES = field(
+        default="cnn", metadata={"help": "type of aggregator to use"}
+    )
+    gru_dim: int = field(default=512, metadata={"help": "GRU dimensionality"})
+    no_conv_bias: bool = field(
+        default=False, metadata={"help": "if set, does not learn bias for conv layers"}
+    )
+    agg_zero_pad: bool = field(
+        default=False,
+        metadata={"help": "if set, zero pads in aggregator instead of repl pad"},
+    )
+    skip_connections_feat: bool = field(
+        default=False,
+        metadata={"help": "if set, adds skip connections to the feature extractor"},
+    )
+    skip_connections_agg: bool = field(
+        default=True,
+        metadata={"help": "if set, adds skip connections to the aggregator"},
+    )
+    residual_scale: float = field(
+        default=0.5, metadata={"help": "scales residual by sqrt(value)"}
+    )
+    log_compression: bool = field(
+        default=True,
+        metadata={"help": "if set, adds a log compression to feature extractor"},
+    )
+    balanced_classes: bool = field(
+        default=False,
+        metadata={"help": "if set, loss is scaled to balance for number of negatives"},
+    )
+    project_features: PROJECT_FEATURES_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, features are projected using the (same or new) aggregator"
+        },
+    )
+    non_affine_group_norm: bool = field(
+        default=False, metadata={"help": "if set, group norm is not affine"}
+    )
+    offset: str = field(
+        default="auto",
+        metadata={
+            "help": "if set to 'auto', it is computed automatically from the receptive field, else set to int value"
+        },
+    )
+    activation: ACTIVATION_CHOICES = field(
+        default="relu",
+        metadata={"help": "which activation function to use"},
+    )
+    vq_type: VQ_TYPE_CHOICES = field(
+        default="none", metadata={"help": "which type of quantizer to use"}
+    )
+    vq_vars: int = field(
+        default=320,
+        metadata={"help": "project to this many vector quantized variables per group"},
+    )
+    vq_groups: int = field(
+        default=2, metadata={"help": "number of groups of latent variables"}
+    )
+    vq_dim: int = field(
+        default=0,
+        metadata={
+            "help": "uses this dimensionality for quantized vectors. 
0 to use model dim // groups" + }, + ) + vq_depth: int = field( + default=1, metadata={"help": "number of layers for vq weight projection"} + ) + combine_groups: bool = field( + default=False, metadata={"help": "if set, variables are shared among groups"} + ) + vq_temp: Tuple[float, float, float] = field( + default=(2.0, 0.5, 0.999995), + metadata={ + "help": "temperature for latent variable sampling with gumbel softmax. should be a tuple of 3 values (start, end, decay)" + }, + ) + vq_gamma: float = field( + default=0.25, + metadata={"help": "gamma parameter for kmeans style vector quantization"}, + ) + infonce: bool = II("criterion.infonce") - parser.add_argument( - "--vq-type", - type=str, - choices=["none", "gumbel", "kmeans"], - help="which type of quantizer to use", - ) - parser.add_argument( - "--vq-vars", - type=int, - metavar="N", - help="if set, project to this many vector quantized variables per group", - ) - parser.add_argument( - "--vq-groups", - type=int, - metavar="N", - help="number of groups of latent variables", - ) - parser.add_argument( - "--vq-dim", - type=int, - metavar="N", - help="uses this dimensionality for quantized vectors", - ) - parser.add_argument( - "--vq-depth", - type=int, - metavar="N", - help="number of layers for vq weight projection", - ) - parser.add_argument( - "--combine-groups", - action="store_true", - help="if set, variables are shared among groups", - ) - parser.add_argument( - "--vq-temp", - type=str, - metavar="TEMP", - help="temperature for latent variable sampling with gumbel softmax. should be a tuple of 3 values (start, end, decay)", - ) - parser.add_argument( - "--vq-gamma", - type=float, - metavar="D", - help="gamma parameter for kmeans style vector quantization", - ) +@register_model("wav2vec", dataclass=Wav2VecConfig) +class Wav2VecModel(BaseFairseqModel): @classmethod - def build_model(cls, args, task): + def build_model(cls, cfg: Wav2VecConfig, task: FairseqTask): """Build a new model instance.""" - # make sure all arguments are present in older models - base_wav2vec_architecture(args) - - model = Wav2VecModel(args) + model = Wav2VecModel(cfg) logger.info(model) return model - def __init__(self, args): + def __init__(self, cfg: Wav2VecConfig): super().__init__() - self.prediction_steps = args.prediction_steps - offset = args.offset + self.prediction_steps = cfg.prediction_steps + offset = cfg.offset - if args.activation == "relu": + if cfg.activation == "relu": activation = nn.ReLU() - elif args.activation == "gelu": + elif cfg.activation == "gelu": activation = nn.GELU() else: - raise Exception("unknown activation " + args.activation) - - if args.encoder == "cnn": - feature_enc_layers = eval(args.conv_feature_layers) - self.feature_extractor = ConvFeatureExtractionModel( - conv_layers=feature_enc_layers, - dropout=0.0, - log_compression=args.log_compression, - skip_connections=args.skip_connections_feat, - residual_scale=args.residual_scale, - non_affine_group_norm=args.non_affine_group_norm, - activation=activation, - ) - embed = feature_enc_layers[-1][0] - else: - raise Exception("unknown encoder type " + args.encoder) + raise Exception("unknown activation " + cfg.activation) + + feature_enc_layers = eval(cfg.conv_feature_layers) + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + log_compression=cfg.log_compression, + skip_connections=cfg.skip_connections_feat, + residual_scale=cfg.residual_scale, + non_affine_group_norm=cfg.non_affine_group_norm, + activation=activation, + ) + 
embed = feature_enc_layers[-1][0] self.vector_quantizer = None - if args.vq_type == "gumbel": + if cfg.vq_type == "gumbel": self.vector_quantizer = GumbelVectorQuantizer( dim=embed, - num_vars=args.vq_vars, - temp=eval(args.vq_temp), - groups=args.vq_groups, - combine_groups=args.combine_groups, - vq_dim=args.vq_dim if args.vq_dim > 0 else embed, + num_vars=cfg.vq_vars, + temp=cfg.vq_temp, + groups=cfg.vq_groups, + combine_groups=cfg.combine_groups, + vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed, time_first=False, activation=activation, - weight_proj_depth=args.vq_depth, + weight_proj_depth=cfg.vq_depth, weight_proj_factor=2, ) - elif args.vq_type == "kmeans": + elif cfg.vq_type == "kmeans": self.vector_quantizer = KmeansVectorQuantizer( dim=embed, - num_vars=args.vq_vars, - groups=args.vq_groups, - combine_groups=args.combine_groups, - vq_dim=args.vq_dim if args.vq_dim > 0 else embed, + num_vars=cfg.vq_vars, + groups=cfg.vq_groups, + combine_groups=cfg.combine_groups, + vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed, time_first=False, - gamma=args.vq_gamma, + gamma=cfg.vq_gamma, ) else: assert ( - args.vq_type == "none" or args.vq_type is None + cfg.vq_type == "none" or cfg.vq_type is None ), "Unknown quantizer type" - if args.offset == "auto": - assert args.encoder == "cnn" + if cfg.offset == "auto": jin = 0 rin = 0 for _, k, stride in feature_enc_layers: @@ -291,34 +239,34 @@ def __init__(self, args): offset = int(offset) def make_aggregator(): - if args.aggregator == "cnn": - agg_layers = eval(args.conv_aggregator_layers) + if cfg.aggregator == "cnn": + agg_layers = eval(cfg.conv_aggregator_layers) agg_dim = agg_layers[-1][0] feature_aggregator = ConvAggegator( conv_layers=agg_layers, embed=embed, - dropout=args.dropout, - skip_connections=args.skip_connections_agg, - residual_scale=args.residual_scale, - non_affine_group_norm=args.non_affine_group_norm, - conv_bias=not args.no_conv_bias, - zero_pad=args.agg_zero_pad, + dropout=cfg.dropout, + skip_connections=cfg.skip_connections_agg, + residual_scale=cfg.residual_scale, + non_affine_group_norm=cfg.non_affine_group_norm, + conv_bias=not cfg.no_conv_bias, + zero_pad=cfg.agg_zero_pad, activation=activation, ) - elif args.aggregator == "gru": - agg_dim = args.gru_dim + elif cfg.aggregator == "gru": + agg_dim = cfg.gru_dim feature_aggregator = nn.Sequential( TransposeLast(), nn.GRU( input_size=embed, hidden_size=agg_dim, num_layers=1, - dropout=args.dropout, + dropout=cfg.dropout, ), TransposeLast(deconstruct_idx=0), ) else: - raise Exception("unknown aggregator type " + args.aggregator) + raise Exception("unknown aggregator type " + cfg.aggregator) return feature_aggregator, agg_dim @@ -327,24 +275,24 @@ def make_aggregator(): self.wav2vec_predictions = Wav2VecPredictionsModel( in_dim=agg_dim, out_dim=embed, - prediction_steps=args.prediction_steps, - n_negatives=args.num_negatives, - cross_sample_negatives=args.cross_sample_negatives, - sample_distance=args.sample_distance, - dropout=args.dropout, + prediction_steps=cfg.prediction_steps, + n_negatives=cfg.num_negatives, + cross_sample_negatives=cfg.cross_sample_negatives, + sample_distance=cfg.sample_distance, + dropout=cfg.dropout, offset=offset, - balanced_classes=args.balanced_classes, - infonce=args.infonce, + balanced_classes=cfg.balanced_classes, + infonce=cfg.infonce, ) - self.dropout_feats = nn.Dropout(p=args.dropout_features) - self.dropout_agg = nn.Dropout(p=args.dropout_agg) + self.dropout_feats = nn.Dropout(p=cfg.dropout_features) + self.dropout_agg = 
nn.Dropout(p=cfg.dropout_agg) - if args.project_features == "none": + if cfg.project_features == "none": self.project_features = None - elif args.project_features == "same": + elif cfg.project_features == "same": self.project_features = self.feature_aggregator - elif args.project_features == "new": + elif cfg.project_features == "new": self.project_features, _ = make_aggregator() def forward(self, source): @@ -680,56 +628,3 @@ def forward(self, x, y): labels = (labels, weights) return predictions, labels - - -@register_model_architecture("wav2vec", "wav2vec") -def base_wav2vec_architecture(args): - conv_feature_layers = "[(512, 10, 5)]" - conv_feature_layers += " + [(512, 8, 4)]" - conv_feature_layers += " + [(512, 4, 2)] * 3" - args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers) - - args.conv_aggregator_layers = getattr( - args, "conv_aggregator_layers", "[(512, 3, 1)] * 9" - ) - - args.prediction_steps = getattr(args, "prediction_steps", 12) - args.num_negatives = getattr(args, "num_negatives", 1) - args.sample_distance = getattr(args, "sample_distance", None) - args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0) - - args.dropout = getattr(args, "dropout", 0.0) - args.dropout_features = getattr(args, "dropout_features", 0.0) - args.dropout_agg = getattr(args, "dropout_agg", 0.0) - args.encoder = getattr(args, "encoder", "cnn") - args.aggregator = getattr(args, "aggregator", "cnn") - - args.skip_connections_feat = getattr(args, "skip_connections_feat", False) - args.skip_connections_agg = getattr(args, "skip_connections_agg", False) - args.residual_scale = getattr(args, "residual_scale", 0.5) - - args.gru_dim = getattr(args, "gru_dim", 512) - - args.no_conv_bias = getattr(args, "no_conv_bias", False) - args.agg_zero_pad = getattr(args, "agg_zero_pad", False) - - args.log_compression = getattr(args, "log_compression", False) - - args.balanced_classes = getattr(args, "balanced_classes", False) - args.infonce = getattr(args, "infonce", False) - args.project_features = getattr(args, "project_features", "none") - - args.non_affine_group_norm = getattr(args, "non_affine_group_norm", False) - - args.offset = getattr(args, "offset", "auto") - - args.activation = getattr(args, "activation", "relu") - - args.vq_type = getattr(args, "vq_type", "none") - args.vq_vars = getattr(args, "vq_vars", 320) - args.vq_groups = getattr(args, "vq_groups", 2) - args.vq_dim = getattr(args, "vq_dim", 0) - args.vq_depth = getattr(args, "vq_depth", 1) - args.combine_groups = getattr(args, "combine_groups", False) - args.vq_temp = getattr(args, "vq_temp", "(2.0, 0.5, 0.999995)") - args.vq_gamma = getattr(args, "vq_gamma", 0.25) diff --git a/fairseq/models/wav2vec/wav2vec2.py b/fairseq/models/wav2vec/wav2vec2.py index 6a0f787601..0faba77f8b 100644 --- a/fairseq/models/wav2vec/wav2vec2.py +++ b/fairseq/models/wav2vec/wav2vec2.py @@ -3,17 +3,21 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
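+
+# wav2vec 2.0. Model options are defined in the Wav2Vec2Config dataclass below and
+# registered via `register_model("wav2vec2", dataclass=Wav2Vec2Config)`, replacing
+# the previous argparse-based `add_args`/architecture registration.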
-import logging import math +from dataclasses import dataclass, field from typing import List, Tuple import numpy as np import torch import torch.nn as nn import torch.nn.functional as F + from fairseq import utils from fairseq.data.data_utils import compute_mask_indices -from fairseq.models import BaseFairseqModel, register_model, register_model_architecture +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.distributed import fsdp_wrap +from fairseq.models import BaseFairseqModel, register_model +from fairseq.distributed.fully_sharded_data_parallel import FullyShardedDataParallel from fairseq.modules import ( Fp32GroupNorm, Fp32LayerNorm, @@ -21,381 +25,395 @@ GumbelVectorQuantizer, LayerNorm, MultiheadAttention, + RelPositionalEncoding, SamePad, TransposeLast, ) +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from fairseq.modules.conformer_layer import ConformerWav2Vec2EncoderLayer from fairseq.modules.transformer_sentence_encoder import init_bert_params -from fairseq.utils import buffered_arange - - -@register_model("wav2vec2") +from fairseq.utils import buffered_arange, index_put, is_xla_tensor + +from .utils import pad_to_multiple + +EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"]) +MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(["static", "uniform", "normal", "poisson"]) +LAYER_TYPE_CHOICES = ChoiceEnum(["transformer", "conformer", "trf_adp"]) + + +@dataclass +class Wav2Vec2Config(FairseqDataclass): + extractor_mode: EXTRACTOR_MODE_CHOICES = field( + default="default", + metadata={ + "help": "mode for feature extractor. default has a single group norm with d " + "groups in the first conv block, whereas layer_norm has layer norms in " + "every block (meant to use with normalize=True)" + }, + ) + encoder_layers: int = field( + default=12, metadata={"help": "num encoder layers in the transformer"} + ) + encoder_embed_dim: int = field( + default=768, metadata={"help": "encoder embedding dimension"} + ) + encoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "encoder embedding dimension for FFN"} + ) + encoder_attention_heads: int = field( + default=12, metadata={"help": "num encoder attention heads"} + ) + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="gelu", metadata={"help": "activation function to use"} + ) + layer_type: LAYER_TYPE_CHOICES = field( + default="transformer", metadata={"help": "layer type in encoder"} + ) + # dropouts + dropout: float = field( + default=0.1, metadata={"help": "dropout probability for the transformer"} + ) + attention_dropout: float = field( + default=0.1, metadata={"help": "dropout probability for attention weights"} + ) + activation_dropout: float = field( + default=0.0, metadata={"help": "dropout probability after activation in FFN"} + ) + encoder_layerdrop: float = field( + default=0.0, metadata={"help": "probability of dropping a tarnsformer layer"} + ) + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, + ) + dropout_features: float = field( + default=0.0, + metadata={"help": "dropout to apply to the features (after feat extr)"}, + ) + + final_dim: int = field( + default=0, + metadata={ + "help": "project final representations and targets to this many dimensions." 
+            "set to encoder_embed_dim if <= 0"
+        },
+    )
+    layer_norm_first: bool = field(
+        default=False, metadata={"help": "apply layernorm first in the transformer"}
+    )
+    conv_feature_layers: str = field(
+        default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+        metadata={
+            "help": "string describing convolutional feature extraction layers in form of a python list that contains "
+            "[(dim, kernel_size, stride), ...]"
+        },
+    )
+    conv_bias: bool = field(
+        default=False, metadata={"help": "include bias in conv encoder"}
+    )
+    logit_temp: float = field(
+        default=0.1, metadata={"help": "temperature to divide logits by"}
+    )
+    quantize_targets: bool = field(
+        default=False, metadata={"help": "use quantized targets"}
+    )
+    quantize_input: bool = field(
+        default=False, metadata={"help": "use quantized inputs"}
+    )
+    same_quantizer: bool = field(
+        default=False, metadata={"help": "use same quantizer for inputs and targets"}
+    )
+    target_glu: bool = field(
+        default=False, metadata={"help": "adds projection + glu to targets"}
+    )
+    feature_grad_mult: float = field(
+        default=1.0, metadata={"help": "multiply feature extractor var grads by this"}
+    )
+    quantizer_depth: int = field(
+        default=1,
+        metadata={"help": "number of quantizer layers"},
+    )
+    quantizer_factor: int = field(
+        default=3,
+        metadata={
+            "help": "dimensionality increase for inner quantizer layers (if depth > 1)"
+        },
+    )
+    latent_vars: int = field(
+        default=320,
+        metadata={"help": "number of latent variables V in each group of the codebook"},
+    )
+    latent_groups: int = field(
+        default=2,
+        metadata={"help": "number of groups G of latent variables in the codebook"},
+    )
+    latent_dim: int = field(
+        default=0,
+        metadata={
+            "help": "if > 0, uses this dimensionality for latent variables. "
+            "otherwise uses final_dim / latent_groups"
+        },
+    )
+
+    # masking
+    mask_length: int = field(default=10, metadata={"help": "mask length"})
+    mask_prob: float = field(
+        default=0.65, metadata={"help": "probability of replacing a token with mask"}
+    )
+    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static", metadata={"help": "how to choose mask length"}
+    )
+    mask_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_overlap: bool = field(
+        default=False, metadata={"help": "whether to allow masks to overlap"}
+    )
+    mask_min_space: int = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+    require_same_masks: bool = field(
+        default=True,
+        metadata={
+            "help": "whether the number of masked timesteps must be the same across all "
+            "examples in a batch"
+        },
+    )
+    mask_dropout: float = field(
+        default=0.0,
+        metadata={"help": "percent of masks to unmask for each sample"},
+    )
+
+    # channel masking
+    mask_channel_length: int = field(
+        default=10, metadata={"help": "length of the mask for features (channels)"}
+    )
+    mask_channel_prob: float = field(
+        default=0.0, metadata={"help": "probability of replacing a feature with 0"}
+    )
+    mask_channel_before: bool = False
+    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+        default="static",
+        metadata={"help": "how to choose mask length for channel masking"},
+    )
+    mask_channel_other: float = field(
+        default=0,
+        metadata={
+            "help": "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indices"
+        },
+    )
+    no_mask_channel_overlap: bool = field(
+        default=False, metadata={"help": "whether to allow channel masks to overlap"}
+    )
+    mask_channel_min_space: int = field(
+        default=1,
+        metadata={"help": "min space between spans (if no overlap is enabled)"},
+    )
+
+    # negative selection
+    num_negatives: int = field(
+        default=100,
+        metadata={"help": "number of negative examples from the same sample"},
+    )
+    negatives_from_everywhere: bool = field(
+        default=False,
+        metadata={"help": "sample negatives from everywhere, not just masked states"},
+    )
+    cross_sample_negatives: int = field(
+        default=0, metadata={"help": "number of negative examples from any sample"}
+    )
+    codebook_negatives: int = field(
+        default=0, metadata={"help": "number of negative examples from the codebook"}
+    )
+
+    # positional embeddings
+    conv_pos: int = field(
+        default=128,
+        metadata={"help": "number of filters for convolutional positional embeddings"},
+    )
+    conv_pos_groups: int = field(
+        default=16,
+        metadata={"help": "number of groups for convolutional positional embedding"},
+    )
+    pos_conv_depth: int = field(
+        default=1,
+        metadata={"help": "depth of positional encoder network"},
+    )
+
+    latent_temp: Tuple[float, float, float] = field(
+        default=(2, 0.5, 0.999995),
+        metadata={
+            "help": "temperature for latent variable sampling. 
" + "can be tuple of 3 values (start, end, decay)" + }, + ) + max_positions: int = field(default=100000, metadata={"help": "Max positions"}) + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, + ) + + # FP16 optimization + required_seq_len_multiple: int = field( + default=2, + metadata={ + "help": "pad the input to encoder such that the sequence length is divisible by multiple" + }, + ) + crop_seq_to_multiple: int = field( + default=1, + metadata={ + "help": "crop convolutional feature extractor output such that the sequence length is divisible by multiple" + }, + ) + + # Conformer + depthwise_conv_kernel_size: int = field( + default=31, + metadata={ + "help": "depthwise-conv-kernel-size for convolution in conformer layer" + }, + ) + attn_type: str = field( + default="", + metadata={"help": "if espnet use ESPNET MHA"}, + ) + pos_enc_type: str = field( + default="abs", + metadata={"help": "Positional encoding type to use in conformer"}, + ) + fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) + + # Adapter num + adp_num: int = field( + default=-1 + ) + adp_dim: int = field( + default=64 + ) + adp_act_fn: str = field( + default="relu" + ) + adp_trf_idx: str = field( + default="all", + ) + + +@register_model("wav2vec2", dataclass=Wav2Vec2Config) class Wav2Vec2Model(BaseFairseqModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - - parser.add_argument( - "--extractor-mode", - choices=["default", "layer_norm"], - help="mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with --normalize)", - ) - - parser.add_argument( - "--encoder-layers", - type=int, - metavar="L", - help="num encoder layers in the transformer", - ) - parser.add_argument( - "--encoder-embed-dim", - type=int, - metavar="H", - help="encoder embedding dimension", - ) - parser.add_argument( - "--encoder-ffn-embed-dim", - type=int, - metavar="F", - help="encoder embedding dimension for FFN", - ) - parser.add_argument( - "--encoder-attention-heads", - type=int, - metavar="A", - help="num encoder attention heads", - ) - parser.add_argument( - "--activation-fn", - choices=utils.get_available_activation_fns(), - help="activation function to use", - ) - - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="dropout probability for the transformer", - ) - - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights", - ) - - parser.add_argument( - "--activation-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN", - ) - - parser.add_argument( - "--final-dim", - type=int, - metavar="D", - help="project final representations and targets to this many dimensions", - ) - - parser.add_argument( - "--layer-norm-first", - action="store_true", - help="apply layernorm first in the transformer", - ) - - parser.add_argument( - "--encoder-layerdrop", - type=float, - help="probability of dropping a tarnsformer layer", - ) - - parser.add_argument( - "--conv-feature-layers", - type=str, - metavar="EXPR", - help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", - ) - - parser.add_argument( - "--logit-temp", type=float, help="temperature to divide logits by" - ) - - parser.add_argument( - "--quantize-targets", action="store_true", help="use 
quantized targets" - ) - - parser.add_argument( - "--quantize-input", action="store_true", help="use quantized inputs" - ) - - parser.add_argument( - "--same-quantizer", - action="store_true", - help="use same quantizer for inputs and targets", - ) - - parser.add_argument( - "--feature-grad-mult", - type=float, - help="multiply feature extractor var grads by this", - ) - - parser.add_argument( - "--latent-vars", - type=int, - metavar="N", - help="number of latent variables V in each group of the codebook", - ) - - parser.add_argument( - "--latent-groups", - type=int, - metavar="N", - help="number of groups G of latent variables in the codebook", - ) - - parser.add_argument( - "--latent-dim", - type=int, - metavar="N", - help="if set, uses this dimensionality for latent variables. otherwise uses final_dim / latent_groups", - ) - - parser.add_argument("--mask-length", type=int, help="mask length") - - parser.add_argument( - "--mask-prob", type=float, help="probability of replacing a token with mask" - ) - - parser.add_argument( - "--mask-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", - ) - - parser.add_argument( - "--mask-other", - type=float, - help="secondary mask argument (used for more complex distributions), see help in compute_mask_indices", - ) - - parser.add_argument( - "--no-mask-overlap", - action="store_true", - help="whether to allow masks to overlap", - ) - - parser.add_argument( - "--mask-min-space", - type=int, - help="min space between spans (if no overlap is enabled)", - ) - - parser.add_argument( - "--mask-channel-length", - type=int, - help="repeat the mask indices multiple times", - ) - - parser.add_argument( - "--mask-channel-prob", - type=float, - help="probability of replacing a token with mask", - ) - - parser.add_argument( - "--mask-channel-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", - ) - - parser.add_argument( - "--mask-channel-other", - type=float, - help="secondary mask argument (used for more complex distributions), see help in compute_mask_indices", - ) - - parser.add_argument( - "--no-mask-channel-overlap", - action="store_true", - help="whether to allow masks to overlap", - ) - - parser.add_argument( - "--mask-channel-min-space", - type=int, - help="min space between spans (if no overlap is enabled)", - ) - - parser.add_argument( - "--dropout-input", - type=float, - metavar="D", - help="dropout to apply to the input (after feat extr)", - ) - - parser.add_argument( - "--dropout-features", - type=float, - metavar="D", - help="dropout to apply to the features (after feat extr)", - ) - - parser.add_argument( - "--num-negatives", type=int, metavar="N", help="number of negative examples" - ) - - parser.add_argument( - "--negatives-from-everywhere", - action="store_true", - help="sample negatives from everywhere, not just masked states", - ) - - parser.add_argument( - "--cross-sample-negatives", - type=int, - metavar="N", - help="num of cross sampled negatives", - ) - - parser.add_argument( - "--codebook-negatives", - type=int, - metavar="N", - help="num of codebook sampled negatives", - ) - - parser.add_argument( - "--conv-pos", - type=int, - metavar="N", - help="number of filters for convolutional positional embeddings", - ) - - parser.add_argument( - "--conv-pos-groups", - type=int, - metavar="N", - help="number of groups for convolutional positional embedding", - ) - - parser.add_argument( - "--latent-temp", - type=str, - metavar="D", - 
help="temperature for latent variable sampling. can be tuple of 3 values (start, end, decay)", - ) - - parser.add_argument( - "--target-glu", action="store_true", help="adds projection + glu to targets" - ) - - parser.add_argument( - "--conv-bias", action="store_true", help="include bias in conv encoder" - ) - - def __init__(self, args): + def __init__(self, cfg: Wav2Vec2Config): super().__init__() - self.args = args + self.cfg = cfg - feature_enc_layers = eval(args.conv_feature_layers) + feature_enc_layers = eval(cfg.conv_feature_layers) self.embed = feature_enc_layers[-1][0] self.feature_extractor = ConvFeatureExtractionModel( conv_layers=feature_enc_layers, dropout=0.0, - mode=args.extractor_mode, - conv_bias=args.conv_bias, + mode=cfg.extractor_mode, + conv_bias=cfg.conv_bias, ) self.post_extract_proj = ( - nn.Linear(self.embed, args.encoder_embed_dim) - if self.embed != args.encoder_embed_dim and not args.quantize_input + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim and not cfg.quantize_input else None ) - self.mask_prob = args.mask_prob - self.mask_selection = args.mask_selection - self.mask_other = args.mask_other - self.mask_length = args.mask_length - self.no_mask_overlap = args.no_mask_overlap - self.mask_min_space = args.mask_min_space + self.crop_seq_to_multiple = cfg.crop_seq_to_multiple - self.mask_channel_prob = args.mask_channel_prob - self.mask_channel_selection = args.mask_channel_selection - self.mask_channel_other = args.mask_channel_other - self.mask_channel_length = args.mask_channel_length - self.no_mask_channel_overlap = args.no_mask_channel_overlap - self.mask_channel_min_space = args.mask_channel_min_space + self.mask_prob = cfg.mask_prob + self.mask_selection = cfg.mask_selection + self.mask_other = cfg.mask_other + self.mask_length = cfg.mask_length + self.no_mask_overlap = cfg.no_mask_overlap + self.mask_min_space = cfg.mask_min_space - self.dropout_input = nn.Dropout(args.dropout_input) - self.dropout_features = nn.Dropout(args.dropout_features) + self.mask_channel_prob = cfg.mask_channel_prob + self.mask_channel_before = cfg.mask_channel_before + self.mask_channel_selection = cfg.mask_channel_selection + self.mask_channel_other = cfg.mask_channel_other + self.mask_channel_length = cfg.mask_channel_length + self.no_mask_channel_overlap = cfg.no_mask_channel_overlap + self.mask_channel_min_space = cfg.mask_channel_min_space - self.feature_grad_mult = args.feature_grad_mult + self.dropout_input = nn.Dropout(cfg.dropout_input) + self.dropout_features = nn.Dropout(cfg.dropout_features) + + self.feature_grad_mult = cfg.feature_grad_mult self.quantizer = None self.input_quantizer = None - self.n_negatives = args.num_negatives - self.cross_sample_negatives = args.cross_sample_negatives - self.codebook_negatives = args.codebook_negatives - self.negatives_from_everywhere = args.negatives_from_everywhere + self.n_negatives = cfg.num_negatives + self.cross_sample_negatives = cfg.cross_sample_negatives + self.codebook_negatives = cfg.codebook_negatives + self.negatives_from_everywhere = cfg.negatives_from_everywhere - self.logit_temp = args.logit_temp + self.logit_temp = cfg.logit_temp - final_dim = args.final_dim if args.final_dim > 0 else args.encoder_embed_dim + final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim - if args.quantize_targets: - vq_dim = args.latent_dim if args.latent_dim > 0 else final_dim + if cfg.quantize_targets: + vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else final_dim self.quantizer 
= GumbelVectorQuantizer( dim=self.embed, - num_vars=args.latent_vars, - temp=eval(args.latent_temp), - groups=args.latent_groups, + num_vars=cfg.latent_vars, + temp=cfg.latent_temp, + groups=cfg.latent_groups, combine_groups=False, vq_dim=vq_dim, time_first=True, + weight_proj_depth=cfg.quantizer_depth, + weight_proj_factor=cfg.quantizer_factor, ) self.project_q = nn.Linear(vq_dim, final_dim) else: self.project_q = nn.Linear(self.embed, final_dim) - if args.quantize_input: - if args.same_quantizer and self.quantizer is not None: + if cfg.quantize_input: + if cfg.same_quantizer and self.quantizer is not None: vq_dim = final_dim self.input_quantizer = self.quantizer else: - vq_dim = ( - args.latent_dim if args.latent_dim > 0 else args.encoder_embed_dim - ) + vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else cfg.encoder_embed_dim self.input_quantizer = GumbelVectorQuantizer( dim=self.embed, - num_vars=args.latent_vars, - temp=eval(args.latent_temp), - groups=args.latent_groups, + num_vars=cfg.latent_vars, + temp=cfg.latent_temp, + groups=cfg.latent_groups, combine_groups=False, vq_dim=vq_dim, time_first=True, + weight_proj_depth=cfg.quantizer_depth, + weight_proj_factor=cfg.quantizer_factor, ) - self.project_inp = nn.Linear(vq_dim, args.encoder_embed_dim) + self.project_inp = nn.Linear(vq_dim, cfg.encoder_embed_dim) self.mask_emb = nn.Parameter( - torch.FloatTensor(args.encoder_embed_dim).uniform_() + torch.FloatTensor(cfg.encoder_embed_dim).uniform_() ) + encoder_cls = TransformerEncoder + if cfg.layer_type == "conformer" and cfg.pos_enc_type in ["rel_pos", "rope"]: + encoder_cls = ConformerEncoder - self.encoder = TransformerEncoder(args) + self.encoder = encoder_cls(cfg) self.layer_norm = LayerNorm(self.embed) self.target_glu = None - if args.target_glu: + if cfg.target_glu: self.target_glu = nn.Sequential( nn.Linear(final_dim, final_dim * 2), nn.GLU() ) - self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim) + self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim) def upgrade_state_dict_named(self, state_dict, name): super().upgrade_state_dict_named(state_dict, name) @@ -403,34 +421,21 @@ def upgrade_state_dict_named(self, state_dict, name): return state_dict @classmethod - def build_model(cls, args, task=None): + def build_model(cls, cfg: Wav2Vec2Config, task=None): """Build a new model instance.""" - # make sure all arguments are present - base_architecture(args) - - return cls(args) + return cls(cfg) - def apply_mask(self, x, padding_mask): + def apply_mask( + self, + x, + padding_mask, + mask_indices=None, + mask_channel_indices=None, + ): B, T, C = x.shape - if self.mask_prob > 0: - mask_indices = compute_mask_indices( - (B, T), - padding_mask, - self.mask_prob, - self.mask_length, - self.mask_selection, - self.mask_other, - min_masks=2, - no_overlap=self.no_mask_overlap, - min_space=self.mask_min_space, - ) - mask_indices = torch.from_numpy(mask_indices).to(x.device) - x[mask_indices] = self.mask_emb - else: - mask_indices = None - if self.mask_channel_prob > 0: + if self.mask_channel_prob > 0 and self.mask_channel_before: mask_channel_indices = compute_mask_indices( (B, C), None, @@ -449,9 +454,49 @@ def apply_mask(self, x, padding_mask): ) x[mask_channel_indices] = 0 + if self.mask_prob > 0: + if mask_indices is None: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + 
require_same_masks=self.cfg.require_same_masks, + mask_dropout=self.cfg.mask_dropout, + ) + mask_indices = torch.from_numpy(mask_indices).to(x.device) + x = index_put(x, mask_indices, self.mask_emb) + else: + mask_indices = None + + if self.mask_channel_prob > 0 and not self.mask_channel_before: + if mask_channel_indices is None: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + torch.from_numpy(mask_channel_indices) + .to(x.device) + .unsqueeze(1) + .expand(-1, T, -1) + ) + x = index_put(x, mask_channel_indices, 0) + return x, mask_indices - def sample_negatives(self, y, num): + def sample_negatives(self, y, num, padding_count=None): if self.n_negatives == 0 and self.cross_sample_negatives == 0: return y.new(0) @@ -459,8 +504,9 @@ def sample_negatives(self, y, num): bsz, tsz, fsz = y.shape y = y.view(-1, fsz) # BTC => (BxT)C + # FIXME: what happens if padding_count is specified? cross_high = tsz * bsz - high = tsz + high = tsz - (padding_count or 0) with torch.no_grad(): assert high > 1, f"{bsz,tsz,fsz}" @@ -493,8 +539,7 @@ def sample_negatives(self, y, num): cross_neg_idxs[cross_neg_idxs >= tszs] += 1 if self.n_negatives > 0: - for i in range(1, bsz): - neg_idxs[i] += i * high + neg_idxs = neg_idxs + (torch.arange(bsz).unsqueeze(1) * high) else: neg_idxs = cross_neg_idxs @@ -515,16 +560,51 @@ def compute_preds(self, x, y, negatives): y = y.unsqueeze(0) targets = torch.cat([y, negatives], dim=0) - logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1) + logits = logits / self.logit_temp + logits = logits.type_as(x) + + if is_xla_tensor(logits) or neg_is_pos.any(): + if not hasattr(self, "_inftensor"): + fillval = -float(2**30) + self._inftensor = ( + torch.tensor(fillval).to(x.device) + if is_xla_tensor(logits) + else float("-inf") + ) + logits[1:] = index_put(logits[1:], neg_is_pos, self._inftensor) - logits /= self.logit_temp + return logits - if neg_is_pos.any(): - logits[1:][neg_is_pos] = float("-inf") + def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): + """ + Computes the output length of the convolutional layers + """ - return logits + def _conv_out_length(input_length, kernel_size, stride): + return torch.floor((input_length - kernel_size) / stride + 1) + + conv_cfg_list = eval(self.cfg.conv_feature_layers) + + for i in range(len(conv_cfg_list)): + input_lengths = _conv_out_length( + input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2] + ) - def forward(self, source, padding_mask=None, mask=True, features_only=False): + return input_lengths.to(torch.long) + + def forward( + self, + source, + padding_mask=None, + mask=True, + features_only=False, + layer=None, + mask_indices=None, + mask_channel_indices=None, + padding_count=None, + corpus_key=None, + ): if self.feature_grad_mult > 0: features = self.feature_extractor(source) @@ -540,12 +620,33 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False): features = self.layer_norm(features) unmasked_features = features.clone() - if padding_mask is not None: - extra = padding_mask.size(1) % features.size(1) - if extra > 0: - padding_mask = padding_mask[:, :-extra] - padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) - 
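A minimal standalone sketch of the length arithmetic used by the `_get_feat_extract_output_lengths` helper above, which chains the standard Conv1d formula floor((length - kernel_size) / stride + 1) over the layers parsed from `conv_feature_layers` (the conv spec below is only an example, not the config default):

    import math

    # example conv spec in the same (dim, kernel_size, stride) format as conv_feature_layers
    conv_layers = [(512, 10, 5), (512, 3, 2), (512, 3, 2), (512, 2, 2)]

    def feat_extract_output_length(num_samples):
        # mirrors the per-layer floor((L - kernel_size) / stride + 1) recurrence
        length = num_samples
        for _, kernel_size, stride in conv_layers:
            length = math.floor((length - kernel_size) / stride + 1)
        return length

    print(feat_extract_output_length(16000))  # frames produced for 16000 input samples with this example spec
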
padding_mask = padding_mask.all(-1) + if padding_mask is not None and padding_mask.any(): + input_lengths = (1 - padding_mask.long()).sum(-1) + # apply conv formula to get real output_lengths + output_lengths = self._get_feat_extract_output_lengths(input_lengths) + + padding_mask = torch.zeros( + features.shape[:2], dtype=features.dtype, device=features.device + ) + + # these two operations makes sure that all values + # before the output lengths indices are attended to + padding_mask[ + ( + torch.arange(padding_mask.shape[0], device=padding_mask.device), + output_lengths - 1, + ) + ] = 1 + padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() + else: + padding_mask = None + + time_steps_to_drop = features.size(1) % self.crop_seq_to_multiple + if time_steps_to_drop != 0: + features = features[:, :-time_steps_to_drop] + unmasked_features = unmasked_features[:, :-time_steps_to_drop] + if padding_mask is not None: + padding_mask = padding_mask[:, :-time_steps_to_drop] if self.post_extract_proj is not None: features = self.post_extract_proj(features) @@ -568,8 +669,15 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False): features = self.project_inp(features) if mask: - x, mask_indices = self.apply_mask(features, padding_mask) - if mask_indices is not None: + x, mask_indices = self.apply_mask( + features, + padding_mask, + mask_indices=mask_indices, + mask_channel_indices=mask_channel_indices, + ) + if not is_xla_tensor(x) and mask_indices is not None: + # tpu-comment: reducing the size in a dynamic way causes + # too many recompilations on xla. y = unmasked_features[mask_indices].view( unmasked_features.size(0), -1, unmasked_features.size(-1) ) @@ -580,28 +688,50 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False): y = unmasked_features mask_indices = None - x = self.encoder(x, padding_mask=padding_mask) + x, layer_results = self.encoder( + x, padding_mask=padding_mask, layer=layer, corpus_key=corpus_key + ) if features_only: - return {"x": x, "padding_mask": padding_mask} + return { + "x": x, + "padding_mask": padding_mask, + "features": unmasked_features, + "layer_results": layer_results, + } if self.quantizer: - q = self.quantizer(y, produce_targets=False) - y = q["x"] - num_vars = q["num_vars"] - code_ppl = q["code_perplexity"] - prob_ppl = q["prob_perplexity"] - curr_temp = q["temp"] - - y = self.project_q(y) - if self.negatives_from_everywhere: - neg_cands, *_ = self.quantizer(unmasked_features, produce_targets=False) - negs, _ = self.sample_negatives(neg_cands, y.size(1)) - negs = self.project_q(negs) + q = self.quantizer(unmasked_features, produce_targets=False) + y = q["x"] + num_vars = q["num_vars"] + code_ppl = q["code_perplexity"] + prob_ppl = q["prob_perplexity"] + curr_temp = q["temp"] + y = self.project_q(y) + + negs, _ = self.sample_negatives( + y, + mask_indices[0].sum(), + padding_count=padding_count, + ) + y = y[mask_indices].view(y.size(0), -1, y.size(-1)) else: - negs, _ = self.sample_negatives(y, y.size(1)) + q = self.quantizer(y, produce_targets=False) + y = q["x"] + num_vars = q["num_vars"] + code_ppl = q["code_perplexity"] + prob_ppl = q["prob_perplexity"] + curr_temp = q["temp"] + + y = self.project_q(y) + + negs, _ = self.sample_negatives( + y, + y.size(1), + padding_count=padding_count, + ) if self.codebook_negatives > 0: cb_negs = self.quantizer.sample_from_codebook( @@ -616,12 +746,23 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False): y = self.project_q(y) if 
self.negatives_from_everywhere: - negs, _ = self.sample_negatives(unmasked_features, y.size(1)) + negs, _ = self.sample_negatives( + unmasked_features, + y.size(1), + padding_count=padding_count, + ) negs = self.project_q(negs) else: - negs, _ = self.sample_negatives(y, y.size(1)) + negs, _ = self.sample_negatives( + y, + y.size(1), + padding_count=padding_count, + ) - x = x[mask_indices].view(x.size(0), -1, x.size(-1)) + if not is_xla_tensor(x): + # tpu-comment: reducing the size in a dynamic way causes + # too many recompilations on xla. + x = x[mask_indices].view(x.size(0), -1, x.size(-1)) if self.target_glu: y = self.target_glu(y) @@ -630,7 +771,11 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False): x = self.final_proj(x) x = self.compute_preds(x, y, negs) - result = {"x": x, "padding_mask": padding_mask, "features_pen": features_pen} + result = { + "x": x, + "padding_mask": padding_mask, + "features_pen": features_pen, + } if prob_ppl is not None: result["prob_perplexity"] = prob_ppl @@ -647,9 +792,18 @@ def quantize(self, x): x = self.layer_norm(x) return self.quantizer.forward_idx(x) - def extract_features(self, source, padding_mask, mask=False): - res = self.forward(source, padding_mask, mask=mask, features_only=True) - return res["x"], res["padding_mask"] + def extract_features( + self, source, padding_mask, mask=False, layer=None, corpus_key=None + ): + res = self.forward( + source, + padding_mask, + mask=mask, + features_only=True, + layer=layer, + corpus_key=corpus_key, + ) + return res def get_logits(self, net_output): logits = net_output["x"] @@ -675,12 +829,17 @@ def get_extra_losses(self, net_output): return pen - def remove_pretraining_modules(self): + def remove_pretraining_modules(self, last_layer=None): self.quantizer = None self.project_q = None self.target_glu = None self.final_proj = None + if last_layer is not None: + self.encoder.layers = nn.ModuleList( + l for i, l in enumerate(self.encoder.layers) if i <= last_layer + ) + class ConvFeatureExtractionModel(nn.Module): def __init__( @@ -763,31 +922,64 @@ def forward(self, x): return x -class TransformerEncoder(nn.Module): - def __init__(self, args): - super().__init__() +def make_conv_pos(e, k, g, is_batch_norm=False): + pos_conv = nn.Conv1d( + e, + e, + kernel_size=k, + padding=k // 2, + groups=g, + ) + dropout = 0 + std = math.sqrt((4 * (1.0 - dropout)) / (k * e)) + nn.init.normal_(pos_conv.weight, mean=0, std=std) + nn.init.constant_(pos_conv.bias, 0) - self.dropout = args.dropout - self.embedding_dim = args.encoder_embed_dim + if not is_batch_norm: + pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2) + pos_conv = nn.Sequential(pos_conv, SamePad(k), nn.GELU()) + else: + batch_norm = nn.BatchNorm1d(e) + pos_conv = nn.Sequential(batch_norm, pos_conv, SamePad(k), nn.GELU()) - self.pos_conv = nn.Conv1d( - self.embedding_dim, - self.embedding_dim, - kernel_size=args.conv_pos, - padding=args.conv_pos // 2, - groups=args.conv_pos_groups, - ) - dropout = 0 - std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) - nn.init.normal_(self.pos_conv.weight, mean=0, std=std) - nn.init.constant_(self.pos_conv.bias, 0) + return pos_conv - self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) - self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) - self.layers = nn.ModuleList( - [ - TransformerSentenceEncoderLayer( +class TransformerEncoder(nn.Module): + def build_encoder_layer(self, args: Wav2Vec2Config, **kwargs): + if 
args.layer_type == "transformer": + layer = TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + ) + elif args.layer_type == "conformer": + layer = ConformerWav2Vec2EncoderLayer( + embed_dim=self.embedding_dim, + ffn_embed_dim=args.encoder_ffn_embed_dim, + attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, + activation_fn="swish", + attn_type=args.attn_type, + use_fp16=args.fp16, + pos_enc_type="abs", + ) + elif args.layer_type == "trf_adp": + use_adp = False + if args.adp_trf_idx == "all": + use_adp = True + else: + adp_trf_idx = list(range(*[int(g) for g in args.adp_trf_idx.split(":")])) + if kwargs.get("layer_idx", None) in adp_trf_idx: + use_adp = True + if use_adp: + layer = TransformerSentenceEncoderWithAdapterLayer( embedding_dim=self.embedding_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim, num_attention_heads=args.encoder_attention_heads, @@ -796,53 +988,186 @@ def __init__(self, args): activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, layer_norm_first=args.layer_norm_first, + adapter_num=args.adp_num, + adapter_dim=args.adp_dim, + adapter_act_fn=args.adp_act_fn, ) - for _ in range(args.encoder_layers) - ] - ) + else: + layer = TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + ) + + layer = fsdp_wrap(layer) + if args.checkpoint_activations: + layer = checkpoint_wrapper(layer) + return layer + + def __init__(self, args: Wav2Vec2Config, skip_pos_conv: bool = False, override_encoder_layer: int = None): + super().__init__() + + self.dropout = args.dropout + self.embedding_dim = args.encoder_embed_dim + self.required_seq_len_multiple = args.required_seq_len_multiple + pos_conv_depth = getattr(args, "pos_conv_depth", 1) + if pos_conv_depth > 1: + num_layers = args.pos_conv_depth + k = max(3, args.conv_pos // num_layers) + + def make_conv_block(e, k, g, l): + return nn.Sequential( + *[ + nn.Sequential( + nn.Conv1d( + e, + e, + kernel_size=k, + padding=k // 2, + groups=g, + ), + SamePad(k), + TransposeLast(), + LayerNorm(e, elementwise_affine=False), + TransposeLast(), + nn.GELU(), + ) + for _ in range(l) + ] + ) + + self.pos_conv = make_conv_block( + self.embedding_dim, k, args.conv_pos_groups, num_layers + ) + elif skip_pos_conv: + self.pos_conv = None + else: + self.pos_conv = make_conv_pos( + self.embedding_dim, + args.conv_pos, + args.conv_pos_groups, + is_batch_norm=args.conv_pos_batch_norm + if hasattr(args, "conv_pos_batch_norm") + else False, + ) + + if override_encoder_layer is None: + encoder_layers = args.encoder_layers + else: + encoder_layers = override_encoder_layer + + self.layers = nn.ModuleList( + [self.build_encoder_layer(args, layer_idx=ii) for ii in range(encoder_layers)] + ) self.layer_norm_first = args.layer_norm_first self.layer_norm = LayerNorm(self.embedding_dim) self.layerdrop = args.encoder_layerdrop self.apply(init_bert_params) - def 
forward(self, x, padding_mask=None): - x = self.extract_features(x, padding_mask) + def forward(self, x, padding_mask=None, layer=None, corpus_key=None): + x, layer_results = self.extract_features( + x, padding_mask, layer, corpus_key=corpus_key + ) - if self.layer_norm_first: + if self.layer_norm_first and layer is None: x = self.layer_norm(x) - return x + return x, layer_results - def extract_features(self, x, padding_mask=None): + def extract_features( + self, + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, + corpus_key=None, + ): if padding_mask is not None: - x[padding_mask] = 0 + x = index_put(x, padding_mask, 0) - x_conv = self.pos_conv(x.transpose(1, 2)) - x_conv = x_conv.transpose(1, 2) - x += x_conv + if self.pos_conv is not None: + x_conv = self.pos_conv(x.transpose(1, 2)) + x_conv = x_conv.transpose(1, 2) + x = x + x_conv if not self.layer_norm_first: x = self.layer_norm(x) + # pad to the sequence length dimension + x, pad_length = pad_to_multiple( + x, self.required_seq_len_multiple, dim=-2, value=0 + ) + if pad_length > 0 and padding_mask is None: + padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) + padding_mask[:, -pad_length:] = True + else: + padding_mask, _ = pad_to_multiple( + padding_mask, self.required_seq_len_multiple, dim=-1, value=True + ) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) layer_results = [] + r = None + for i, layer in enumerate(self.layers): - dropout_probability = np.random.random() + dropout_probability = np.random.random() if self.layerdrop > 0 else 1 if not self.training or (dropout_probability > self.layerdrop): - x, z = layer(x, self_attn_padding_mask=padding_mask, need_weights=False) - layer_results.append(x) + layer_check = layer + if isinstance(layer, FullyShardedDataParallel): + layer_check = layer.unwrapped_module + if (corpus_key is None) or ( + not isinstance(layer_check, ( + TransformerSentenceEncoderWithAdapterLayer, + ) + ) + ): + x, (z, lr) = layer( + x, self_attn_padding_mask=padding_mask, need_weights=False + ) + else: + x, (z, lr) = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + corpus_key=corpus_key, + ) + if i >= min_layer: + layer_results.append((x, z, lr)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r # T x B x C -> B x T x C x = x.transpose(0, 1) - return x + # undo paddding + if pad_length > 0: + x = x[:, :-pad_length] + + def undo_pad(a, b, c): + return ( + a[:-pad_length], + b[:-pad_length] if b is not None else b, + c[:-pad_length], + ) + + layer_results = [undo_pad(*u) for u in layer_results] + + return x, layer_results def max_positions(self): """Maximum output length supported by the encoder.""" @@ -853,6 +1178,93 @@ def upgrade_state_dict_named(self, state_dict, name): return state_dict +class ConformerEncoder(TransformerEncoder): + def build_encoder_layer(self, args): + layer = ConformerWav2Vec2EncoderLayer( + embed_dim=self.embedding_dim, + ffn_embed_dim=args.encoder_ffn_embed_dim, + attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + depthwise_conv_kernel_size=args.depthwise_conv_kernel_size, + activation_fn="swish", + attn_type=args.attn_type, + pos_enc_type=args.pos_enc_type, + use_fp16=args.fp16, # only used for rope + ) + layer = fsdp_wrap(layer) + if args.checkpoint_activations: + layer = checkpoint_wrapper(layer) + return layer + + def __init__(self, args): + super().__init__(args) + self.args = args + self.dropout = args.dropout + self.embedding_dim = 
args.encoder_embed_dim + self.pos_enc_type = args.pos_enc_type + max_source_positions = self.max_positions() + + if self.pos_enc_type == "rel_pos": + self.embed_positions = RelPositionalEncoding( + max_source_positions, self.embedding_dim + ) + elif self.pos_enc_type == "rope": + self.embed_positions = None + else: + raise Exception("Unsupported positional encoding type") + + self.layers = nn.ModuleList( + [self.build_encoder_layer(args) for _ in range(args.encoder_layers)] + ) + self.layer_norm_first = args.layer_norm_first + self.layer_norm = LayerNorm(self.embedding_dim) + self.layerdrop = args.encoder_layerdrop + + self.apply(init_bert_params) + + def extract_features(self, x, padding_mask=None, tgt_layer=None): + if padding_mask is not None: + x = index_put(x, padding_mask, 0) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + # B X T X C here + position_emb = None + if self.pos_enc_type == "rel_pos": + position_emb = self.embed_positions(x) + + if not self.layer_norm_first: + x = self.layer_norm(x) + + x = F.dropout(x, p=self.dropout, training=self.training) + + layer_results = [] + r = None + for i, layer in enumerate(self.layers): + dropout_probability = np.random.random() + if not self.training or (dropout_probability > self.layerdrop): + x, z = layer( + x, + self_attn_padding_mask=padding_mask, + need_weights=False, + position_emb=position_emb, + ) + if tgt_layer is not None: + layer_results.append((x, z)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, layer_results + + class TransformerSentenceEncoderLayer(nn.Module): """ Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained @@ -863,7 +1275,7 @@ def __init__( self, embedding_dim: float = 768, ffn_embedding_dim: float = 3072, - num_attention_heads: float = 8, + num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, @@ -921,8 +1333,8 @@ def forward( key=x, value=x, key_padding_mask=self_attn_padding_mask, - need_weights=False, attn_mask=self_attn_mask, + need_weights=False, ) x = self.dropout1(x) x = residual + x @@ -932,6 +1344,9 @@ def forward( x = self.activation_fn(self.fc1(x)) x = self.dropout2(x) x = self.fc2(x) + + layer_result = x + x = self.dropout3(x) x = residual + x else: @@ -940,7 +1355,7 @@ def forward( key=x, value=x, key_padding_mask=self_attn_padding_mask, - need_weights=need_weights, + need_weights=False, ) x = self.dropout1(x) @@ -952,78 +1367,133 @@ def forward( x = self.activation_fn(self.fc1(x)) x = self.dropout2(x) x = self.fc2(x) + + layer_result = x + x = self.dropout3(x) x = residual + x x = self.final_layer_norm(x) - return x, attn - + return x, (attn, layer_result) -@register_model_architecture("wav2vec2", "wav2vec2") -def base_architecture(args): - args.extractor_mode = getattr(args, "extractor_mode", "default") - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - - args.activation_fn = getattr(args, "activation_fn", "gelu") +class AdapterFast(nn.Module): + def __init__(self, adapter_num, input_dim, hidden_dim, act_fn): + """ + Implements adapter modules directly with 3D tensor weight as parameters + and without using ModuleList orto speed up training throughput. 
+ """ + super().__init__() - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) + self.adapter_num = adapter_num + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.W_a = nn.Parameter(torch.empty(adapter_num, hidden_dim, input_dim)) + self.W_b = nn.Parameter(torch.empty(adapter_num, input_dim, hidden_dim)) + self.b_a = nn.Parameter(torch.empty(adapter_num, hidden_dim)) + self.b_b = nn.Parameter(torch.empty(adapter_num, input_dim)) + + self.ln_W = nn.Parameter(torch.empty(adapter_num, input_dim)) + self.ln_b = nn.Parameter(torch.empty(adapter_num, input_dim)) + self.act_fn = nn.Identity() + if act_fn == "relu": + self.act_fn = nn.ReLU() + elif act_fn == "gelu": + self.act_fn = nn.GELU() + elif act_fn == "selu": + self.act_fn = nn.SELU() + else: + raise ValueError(f"unsupported {act_fn}") - args.final_dim = getattr(args, "final_dim", 0) - args.layer_norm_first = getattr(args, "layer_norm_first", False) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) + self.input_dim = input_dim + self.reset_parameters() - conv_feature_layers = "[(512, 10, 5)]" - conv_feature_layers += " + [(512, 8, 4)]" - conv_feature_layers += " + [(512, 4, 2)] * 3" - conv_feature_layers += " + [(512, 1, 1)]" - args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers) + def reset_parameters(self): + for ii in range(self.adapter_num): + nn.init.kaiming_uniform_(self.W_a[ii], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.W_b[ii], a=math.sqrt(5)) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_a[ii]) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(self.b_a[ii], -bound, bound) + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_b[ii]) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(self.b_b[ii], -bound, bound) - args.logit_temp = getattr(args, "logit_temp", 0.1) + nn.init.ones_(self.ln_W) + nn.init.zeros_(self.ln_b) - args.quantize_targets = getattr(args, "quantize_targets", False) - args.quantize_input = getattr(args, "quantize_input", False) - args.same_quantizer = getattr(args, "same_quantizer", False) + def forward(self, x, adapter_id): + ii = adapter_id + h = x + h = F.layer_norm(h, (self.input_dim, ), self.ln_W[ii], self.ln_b[ii]) + h = F.linear(h, self.W_a[ii], self.b_a[ii]) + h = self.act_fn(h) + h = F.linear(h, self.W_b[ii], self.b_b[ii]) + outputs = h + return outputs - args.feature_grad_mult = getattr(args, "feature_grad_mult", 1.0) + def extra_repr(self): + return ('adapter={}, input_dim={}, hidden_dim={}'.format(self.adapter_num, self.input_dim, self.hidden_dim)) - args.latent_vars = getattr(args, "latent_vars", 320) - args.latent_groups = getattr(args, "latent_groups", 2) - args.latent_dim = getattr(args, "latent_dim", 0) - args.mask_length = getattr(args, "mask_length", 10) - args.mask_prob = getattr(args, "mask_prob", 0.65) - args.mask_selection = getattr(args, "mask_selection", "static") - args.mask_other = getattr(args, "mask_other", 0) - args.no_mask_overlap = getattr(args, "no_mask_overlap", False) - args.mask_min_space = getattr(args, "mask_min_space", 1) - args.mask_channel_length = getattr(args, "mask_channel_length", 10) - args.mask_channel_prob = getattr(args, "mask_channel_prob", 0) - args.mask_channel_selection = getattr(args, "mask_channel_selection", "static") - args.mask_channel_other = getattr(args, "mask_channel_other", 0) - 
args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False) - args.mask_channel_min_space = getattr(args, "mask_channel_min_space", 1) +class TransformerSentenceEncoderWithAdapterLayer(TransformerSentenceEncoderLayer): + """ + Implements a Transformer Encoder Layer with adapters used in BERT/XLM style pre-trained + models. An adapter module is added along with vanilla Transformer module. + """ - args.dropout_input = getattr(args, "dropout_input", 0) - args.dropout_features = getattr(args, "dropout_features", 0) + def __init__( + self, + embedding_dim: float = 768, + ffn_embedding_dim: float = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + layer_norm_first: bool = False, + adapter_num=201, + adapter_dim=64, + adapter_act_fn="relu", + ) -> None: - args.num_negatives = getattr(args, "num_negatives", 100) - args.negatives_from_everywhere = getattr(args, "negatives_from_everywhere", False) - args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0) - args.codebook_negatives = getattr(args, "codebook_negatives", 0) + super().__init__( + embedding_dim=embedding_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=dropout, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + layer_norm_first=layer_norm_first, - args.conv_pos = getattr(args, "conv_pos", 128) - args.conv_pos_groups = getattr(args, "conv_pos_groups", 16) + ) - args.latent_temp = getattr(args, "latent_temp", "(2,0.5,0.999995)") + self.adapter_num = adapter_num + self.adapter_dim = adapter_dim + self.adapter_layer = AdapterFast(adapter_num, self.embedding_dim, self.adapter_dim, adapter_act_fn) - args.target_glu = getattr(args, "target_glu", False) + def forward( + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + att_args=None, + corpus_key=None, + ): - args.conv_bias = getattr(args, "conv_bias", False) + x, (attn, layer_result) = super().forward( + x=x, + self_attn_mask=self_attn_mask, + self_attn_padding_mask=self_attn_padding_mask, + need_weights=need_weights, + att_args=att_args, + ) + assert corpus_key is not None + assert len(set(corpus_key)) == 1, f"corpus_key items are not same {corpus_key}" + y = self.adapter_layer(x, corpus_key[0]) + x = x + y + return x, (attn, layer_result) diff --git a/fairseq/models/wav2vec/wav2vec2_asr.py b/fairseq/models/wav2vec/wav2vec2_asr.py index 52ca9a8007..0403efebb9 100644 --- a/fairseq/models/wav2vec/wav2vec2_asr.py +++ b/fairseq/models/wav2vec/wav2vec2_asr.py @@ -5,171 +5,259 @@ import contextlib import copy +import logging import math +import re +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any, Optional import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from omegaconf import II, MISSING, open_dict + from fairseq import checkpoint_utils, tasks, utils +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.models import ( BaseFairseqModel, FairseqEncoder, FairseqEncoderDecoderModel, FairseqIncrementalDecoder, register_model, - register_model_architecture, ) +from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES, LAYER_TYPE_CHOICES, AdapterFast from fairseq.modules import 
LayerNorm, PositionalEmbedding, TransformerDecoderLayer +from fairseq.tasks import FairseqTask + +logger = logging.getLogger(__name__) -def add_common_args(parser): - parser.add_argument("--w2v-path", help="path to wav2vec 2.0 model") - parser.add_argument( - "--no-pretrained-weights", - action="store_true", - help="if true, does not load pretrained weights", +@dataclass +class Wav2Vec2AsrConfig(FairseqDataclass): + w2v_path: str = field( + default=MISSING, metadata={"help": "path to wav2vec 2.0 model"} ) - parser.add_argument( - "--dropout-input", - type=float, - metavar="D", - help="dropout to apply to the input (after feat extr)", + no_pretrained_weights: bool = field( + default=False, metadata={"help": "if true, does not load pretrained weights"} ) - parser.add_argument( - "--final-dropout", - type=float, - metavar="D", - help="dropout after transformer and before final projection", + dropout_input: float = field( + default=0.0, + metadata={"help": "dropout to apply to the input (after feat extr)"}, ) - parser.add_argument( - "--apply-mask", action="store_true", help="apply masking during fine-tuning" + + final_dropout: float = field( + default=0.0, + metadata={"help": "dropout after transformer and before final projection"}, ) - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="dropout probability inside wav2vec 2.0 model", + dropout: float = field( + default=0.0, metadata={"help": "dropout probability inside wav2vec 2.0 model"} ) - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights inside wav2vec 2.0 model", + attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights inside wav2vec 2.0 model" + }, ) - parser.add_argument( - "--activation-dropout", - "--relu-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN inside wav2vec 2.0 model", + activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN inside wav2vec 2.0 model" + }, ) - parser.add_argument( - "--mask-length", type=int, help="repeat the mask indices multiple times" + # masking + apply_mask: bool = field( + default=False, metadata={"help": "apply masking during fine-tuning"} ) - - parser.add_argument( - "--mask-prob", type=float, help="probability of replacing a token with mask" + mask_length: int = field( + default=10, metadata={"help": "repeat the mask indices multiple times"} ) - - parser.add_argument( - "--mask-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", + mask_prob: float = field( + default=0.5, + metadata={ + "help": "probability of replacing a token with mask (normalized by length)" + }, ) - - parser.add_argument( - "--mask-other", - type=float, - help="stdev of the mask length in case of 'normal' selection strategy", + mask_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", metadata={"help": "how to choose masks"} ) - - parser.add_argument( - "--no-mask-overlap", - action="store_true", - help="whether to allow masks to overlap", + mask_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indices" + }, ) - - parser.add_argument( - "--mask-channel-length", type=int, help="repeat the mask indices multiple times" + no_mask_overlap: bool = field( + default=False, metadata={"help": "whether to allow 
masks to overlap"} ) - - parser.add_argument( - "--mask-channel-prob", - type=float, - help="probability of replacing a token with mask", + mask_min_space: Optional[int] = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, ) - - parser.add_argument( - "--mask-channel-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", + require_same_masks: bool = field( + default=True, + metadata={ + "help": "whether to number of masked timesteps must be the same across all " + "examples in a batch" + }, + ) + mask_dropout: float = field( + default=0.0, + metadata={"help": "percent of masks to unmask for each sample"}, ) - parser.add_argument( - "--mask-channel-other", - type=float, - help="stdev of the mask length in case of 'normal' selection strategy", + # channel masking + mask_channel_length: int = field( + default=10, metadata={"help": "length of the mask for features (channels)"} + ) + mask_channel_prob: float = field( + default=0.0, metadata={"help": "probability of replacing a feature with 0"} + ) + mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( + default="static", + metadata={"help": "how to choose mask length for channel masking"}, + ) + mask_channel_other: float = field( + default=0, + metadata={ + "help": "secondary mask argument (used for more complex distributions), " + "see help in compute_mask_indicesh" + }, + ) + no_mask_channel_overlap: bool = field( + default=False, metadata={"help": "whether to allow channel masks to overlap"} + ) + freeze_finetune_updates: int = field( + default=0, metadata={"help": "dont finetune wav2vec for this many updates"} + ) + feature_grad_mult: float = field( + default=0.0, metadata={"help": "reset feature grad mult in wav2vec 2.0 to this"} + ) + layerdrop: float = field( + default=0.0, metadata={"help": "probability of dropping a layer in wav2vec 2.0"} + ) + drop_path: float = 0 + mask_channel_min_space: Optional[int] = field( + default=1, + metadata={"help": "min space between spans (if no overlap is enabled)"}, + ) + mask_channel_before: bool = False + normalize: bool = II("task.normalize") + update_alibi: bool = True + data: str = II("task.data") + # this holds the loaded wav2vec args + w2v_args: Any = None + offload_activations: bool = field( + default=False, metadata={"help": "offload_activations"} + ) + min_params_to_wrap: int = field( + default=int(1e8), + metadata={ + "help": "minimum number of params for a layer to be wrapped with FSDP() when " + "training with --ddp-backend=fully_sharded. Smaller values will " + "improve memory efficiency, but may make torch.distributed " + "communication less efficient due to smaller input sizes. This option " + "is set to 0 (i.e., always wrap) when --checkpoint-activations or " + "--offload-activations are passed." 
+ }, ) - parser.add_argument( - "--no-mask-channel-overlap", - action="store_true", - help="whether to allow masks to overlap", + checkpoint_activations: bool = field( + default=False, + metadata={"help": "recompute activations and save memory for extra compute"}, ) + ddp_backend: str = II("distributed_training.ddp_backend") - parser.add_argument( - "--freeze-finetune-updates", - default=0, - type=int, - help="dont finetune wav2vec for this many updates", + zero_mask: bool = False + load_ema: bool = False + + layer_decay: float = 1 + + + layer_type: LAYER_TYPE_CHOICES = field( + default="transformer", metadata={"help": "layer type in encoder"} + ) + # Adapter num + adp_num: int = field( + default=-1 + ) + adp_dim: int = field( + default=64 + ) + adp_act_fn: str = field( + default="relu" + ) + adp_trf_idx: str = field( + default="all", ) - parser.add_argument( - "--feature-grad-mult", + freeze_regex: Optional[str] = field( default=None, - type=float, - help="reset feature grad mult in wav2vec 2.0 to this", ) - parser.add_argument( - "--layerdrop", - default=0.0, - type=float, - help="probability of dropping a layer in wav2vec 2.0", - ) +@dataclass +class Wav2Vec2CtcConfig(Wav2Vec2AsrConfig): + blank_weight: float = 0 + blank_mode: str = "add" -@register_model("wav2vec_ctc") +@register_model("wav2vec_ctc", dataclass=Wav2Vec2CtcConfig) class Wav2VecCtc(BaseFairseqModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - add_common_args(parser) - - def __init__(self, w2v_encoder, args): + def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel): super().__init__() + self.cfg = cfg self.w2v_encoder = w2v_encoder - self.args = args + self.blank_weight = cfg.blank_weight + self.blank_mode = cfg.blank_mode def upgrade_state_dict_named(self, state_dict, name): super().upgrade_state_dict_named(state_dict, name) return state_dict @classmethod - def build_model(cls, args, task): + def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask): """Build a new model instance.""" - base_architecture(args) - w2v_encoder = Wav2VecEncoder(args, task.target_dictionary) - return cls(w2v_encoder, args) + w2v_encoder = Wav2VecEncoder(cfg, len(task.target_dictionary)) + return cls(cfg, w2v_encoder) + + def get_logits(self, net_output, normalize=False): + logits = net_output["encoder_out"] + if self.blank_weight != 0: + if self.blank_mode == "add": + logits[..., 0] += self.blank_weight + elif self.blank_mode == "set": + logits[..., 0] = self.blank_weight + else: + raise Exception(f"invalid blank mode {self.blank_mode}") + + if net_output["padding_mask"] is not None and net_output["padding_mask"].any(): + number_of_classes = logits.size(-1) + masking_tensor = torch.ones( + number_of_classes, device=logits.device + ) * float("-inf") + masking_tensor[0] = 0 + + if logits.size(0) > net_output["padding_mask"].size(1): + net_output["padding_mask"] = F.pad( + net_output["padding_mask"], (1, 0), value=False + ) + + logits[net_output["padding_mask"].T] = masking_tensor.type_as(logits) + + if normalize: + logits = utils.log_softmax(logits.float(), dim=-1) + + return logits def get_normalized_probs(self, net_output, log_probs): """Get normalized probabilities (or log probs) from a net's output.""" - logits = net_output["encoder_out"] + logits = self.get_logits(net_output) + if log_probs: return utils.log_softmax(logits.float(), dim=-1) else: @@ -179,95 +267,71 @@ def forward(self, **kwargs): x = self.w2v_encoder(**kwargs) return x - # def max_positions(self): - # 
return None - - -@register_model("wav2vec_seq2seq") -class TransformerModel(FairseqEncoderDecoderModel): - def __init__(self, args, encoder, decoder): - super().__init__(encoder, decoder) - - @staticmethod - def add_args(parser): - add_common_args(parser) - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-ffn-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension for FFN", - ) - parser.add_argument( - "--decoder-layers", type=int, metavar="N", help="num decoder layers" - ) - parser.add_argument( - "--decoder-layerdrop", - type=float, - metavar="D", - help="decoder layerdrop chance", - ) - parser.add_argument( - "--decoder-attention-heads", - type=int, - metavar="N", - help="num decoder attention heads", - ) - parser.add_argument( - "--decoder-learned-pos", - action="store_true", - help="use learned positional embeddings in the decoder", - ) - parser.add_argument( - "--decoder-normalize-before", - action="store_true", - help="apply layernorm before each decoder block", - ) - parser.add_argument( - "--no-token-positional-embeddings", - default=False, - action="store_true", - help="if set, disables positional embeddings (outside self attention)", - ) +@dataclass +class Wav2Vec2Seq2SeqConfig(Wav2Vec2AsrConfig): + decoder_embed_dim: int = field( + default=768, metadata={"help": "decoder embedding dimension"} + ) + decoder_ffn_embed_dim: int = field( + default=3072, metadata={"help": "decoder embedding dimension for FFN"} + ) + decoder_layers: int = field(default=6, metadata={"help": "num of decoder layers"}) + decoder_layerdrop: float = field( + default=0.0, metadata={"help": "decoder layerdrop chance"} + ) + decoder_attention_heads: int = field( + default=4, metadata={"help": "num decoder attention heads"} + ) + decoder_learned_pos: bool = field( + default=False, + metadata={"help": "use learned positional embeddings in the decoder"}, + ) + decoder_normalize_before: bool = field( + default=False, metadata={"help": "apply layernorm before each decoder block"} + ) + no_token_positional_embeddings: bool = field( + default=False, + metadata={ + "help": "if set, disables positional embeddings (outside self attention)" + }, + ) + decoder_dropout: float = field( + default=0.0, metadata={"help": "dropout probability in the decoder"} + ) + decoder_attention_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability for attention weights inside the decoder" + }, + ) + decoder_activation_dropout: float = field( + default=0.0, + metadata={ + "help": "dropout probability after activation in FFN inside the decoder" + }, + ) + max_target_positions: int = field( + default=2048, metadata={"help": "max target positions"} + ) + share_decoder_input_output_embed: bool = field( + default=False, metadata={"help": "share decoder input and output embeddings"} + ) + autoregressive: bool = II("task.autoregressive") - parser.add_argument( - "--decoder-dropout", - type=float, - metavar="D", - help="dropout probability in the decoder", - ) - parser.add_argument( - "--decoder-attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights inside the decoder", - ) - parser.add_argument( - "--decoder-activation-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN inside the decoder", - ) - # fmt: on +@register_model("wav2vec_seq2seq", dataclass=Wav2Vec2Seq2SeqConfig) +class 
Wav2Vec2Seq2SeqModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) @classmethod - def build_model(cls, args, task): + def build_model(cls, cfg: Wav2Vec2Seq2SeqConfig, task: FairseqTask): """Build a new model instance.""" - # make sure all arguments are present in older models - base_architecture(args) - - if not hasattr(args, "max_source_positions"): - args.max_source_positions = 2048 - if not hasattr(args, "max_target_positions"): - args.max_target_positions = 2048 + assert ( + cfg.autoregressive + ), "Please set task.autoregressive=true for seq2seq asr models" src_dict, tgt_dict = task.source_dictionary, task.target_dictionary @@ -277,22 +341,23 @@ def build_embedding(dictionary, embed_dim): emb = Embedding(num_embeddings, embed_dim, padding_idx) return emb - decoder_embed_tokens = build_embedding(tgt_dict, args.decoder_embed_dim) + decoder_embed_tokens = build_embedding(tgt_dict, cfg.decoder_embed_dim) + + encoder = cls.build_encoder(cfg) + decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens) - encoder = cls.build_encoder(args) - decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) - return TransformerModel(args, encoder, decoder) + return Wav2Vec2Seq2SeqModel(encoder, decoder) @classmethod - def build_encoder(cls, args): - return Wav2VecEncoder(args) + def build_encoder(cls, cfg: Wav2Vec2AsrConfig): + return Wav2VecEncoder(cfg) @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoder(args, tgt_dict, embed_tokens) + def build_decoder(cls, cfg: Wav2Vec2Seq2SeqConfig, tgt_dict, embed_tokens): + return TransformerDecoder(cfg, tgt_dict, embed_tokens) def forward(self, **kwargs): - encoder_out = self.encoder(tbc=False, **kwargs) + encoder_out = self.encoder(**kwargs) decoder_out = self.decoder(encoder_out=encoder_out, **kwargs) return decoder_out @@ -302,88 +367,247 @@ def upgrade_state_dict_named(self, state_dict, name): class Wav2VecEncoder(FairseqEncoder): - def __init__(self, args, tgt_dict=None): - self.apply_mask = args.apply_mask + def __init__(self, cfg: Wav2Vec2AsrConfig, output_size=None): + self.apply_mask = cfg.apply_mask arg_overrides = { - "dropout": args.dropout, - "activation_dropout": args.activation_dropout, - "dropout_input": args.dropout_input, - "attention_dropout": args.attention_dropout, - "mask_length": args.mask_length, - "mask_prob": args.mask_prob, - "mask_selection": args.mask_selection, - "mask_other": args.mask_other, - "no_mask_overlap": args.no_mask_overlap, - "mask_channel_length": args.mask_channel_length, - "mask_channel_prob": args.mask_channel_prob, - "mask_channel_selection": args.mask_channel_selection, - "mask_channel_other": args.mask_channel_other, - "no_mask_channel_overlap": args.no_mask_channel_overlap, - "encoder_layerdrop": args.layerdrop, - "feature_grad_mult": args.feature_grad_mult, + "dropout": cfg.dropout, + "activation_dropout": cfg.activation_dropout, + "dropout_input": cfg.dropout_input, + "attention_dropout": cfg.attention_dropout, + "mask_length": cfg.mask_length, + "mask_prob": cfg.mask_prob, + "require_same_masks": getattr(cfg, "require_same_masks", True), + "pct_holes": getattr(cfg, "mask_dropout", 0), + "mask_selection": cfg.mask_selection, + "mask_other": cfg.mask_other, + "no_mask_overlap": cfg.no_mask_overlap, + "mask_channel_length": cfg.mask_channel_length, + "mask_channel_prob": cfg.mask_channel_prob, + "mask_channel_before": cfg.mask_channel_before, + "mask_channel_selection": 
cfg.mask_channel_selection, + "mask_channel_other": cfg.mask_channel_other, + "no_mask_channel_overlap": cfg.no_mask_channel_overlap, + "encoder_layerdrop": cfg.layerdrop, + "feature_grad_mult": cfg.feature_grad_mult, + "checkpoint_activations": cfg.checkpoint_activations, + "offload_activations": cfg.offload_activations, + "min_params_to_wrap": cfg.min_params_to_wrap, + # d2v multi args + "encoder_dropout": cfg.dropout, + "drop_path": getattr(cfg, "drop_path", 0), + "mask_dropout": getattr(cfg, "mask_dropout", 0), + "zero_mask": getattr(cfg, "zero_mask", False), + "local_grad_mult": cfg.feature_grad_mult, + "layerdrop": cfg.layerdrop, + "prenet_layerdrop": cfg.layerdrop, + "prenet_dropout": cfg.dropout, + "post_mlp_drop": cfg.dropout, + "encoder_zero_mask": getattr(cfg, "zero_mask", False), + "inverse_mask": False, + "learned_alibi_scale": getattr(cfg, "update_alibi", True), } - if getattr(args, "w2v_args", None) is None: - state = checkpoint_utils.load_checkpoint_to_cpu( - args.w2v_path, arg_overrides - ) - w2v_args = state["args"] + if cfg.w2v_args is None: + state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides) + w2v_args = state.get("cfg", None) + if w2v_args is None: + w2v_args = convert_namespace_to_omegaconf(state["args"]) + w2v_args.criterion = None + w2v_args.lr_scheduler = None + + cfg.w2v_args = w2v_args + + logger.info(w2v_args) + else: state = None - w2v_args = args.w2v_args + w2v_args = cfg.w2v_args + if isinstance(w2v_args, Namespace): + cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args) - assert ( - args.normalize == w2v_args.normalize - ), "Fine-tuning works best when data normalization is the same" + self.is_d2v_multi = "data2vec_multi" in w2v_args.model.get("_name", None) + + if not self.is_d2v_multi: + model_normalized = w2v_args.task.get( + "normalize", w2v_args.model.get("normalize", False) + ) + assert cfg.normalize == model_normalized, ( + "Fine-tuning works best when data normalization is the same. 
" + "Please check that --normalize is set or unset for both pre-training and here" + ) - w2v_args.data = args.data - task = tasks.setup_task(w2v_args) - model = task.build_model(w2v_args) + with open_dict(w2v_args): + args_replacement = ["checkpoint_activations", "layer_type", + "adp_num", "adp_dim", + "adp_act_fn", "adp_trf_idx"] + for _args in args_replacement: + if hasattr(cfg, _args) and getattr(cfg, _args, None) is not None: + w2v_args.model[_args] = getattr(cfg, _args, None) + + if hasattr(cfg, "checkpoint_activations") and cfg.checkpoint_activations: + with open_dict(w2v_args): + w2v_args.model.checkpoint_activations = cfg.checkpoint_activations + + w2v_args.task.data = cfg.data + task = tasks.setup_task(w2v_args.task, from_checkpoint=True) + model = task.build_model(w2v_args.model, from_checkpoint=True) + model.remove_pretraining_modules() + d = w2v_args.model.encoder_embed_dim + else: + assert cfg.normalize - if state is not None and not args.no_pretrained_weights: - model.load_state_dict(state["model"], strict=True) + if hasattr(w2v_args.task, "audio"): + w2v_args.task.audio.data = cfg.data + else: + w2v_args.task.data = cfg.data + task = tasks.setup_task(w2v_args.task, from_checkpoint=True) - model.remove_pretraining_modules() + model = task.build_model(w2v_args.model, from_checkpoint=True) - super().__init__(task.source_dictionary) + model.remove_pretraining_modules(modality="audio") + d = w2v_args.model.embed_dim - d = w2v_args.encoder_embed_dim + if state is not None and not cfg.no_pretrained_weights: + if cfg.load_ema: + assert "_ema" in state["model"] + for k in state["model"]["_ema"]: + mk = "encoder." + k + assert mk in state["model"], mk + state["model"][mk] = state["model"]["_ema"][k] + self.load_model_weights(state, model, cfg) + + super().__init__(task.source_dictionary) self.w2v_model = model - self.final_dropout = nn.Dropout(args.final_dropout) - self.freeze_finetune_updates = args.freeze_finetune_updates + self.final_dropout = nn.Dropout(cfg.final_dropout) + self.freeze_finetune_updates = cfg.freeze_finetune_updates self.num_updates = 0 - if tgt_dict is not None: - self.proj = Linear(d, len(tgt_dict)) - elif getattr(args, "decoder_embed_dim", d) != d: - self.proj = Linear(d, args.decoder_embed_dim) + targ_d = None + self.proj = None + + if output_size is not None: + targ_d = output_size + elif getattr(cfg, "decoder_embed_dim", d) != d: + targ_d = cfg.decoder_embed_dim + + if targ_d is not None: + self.proj = Linear(d, targ_d) + + if cfg.freeze_regex is not None: + self.freeze_regex(cfg.freeze_regex) + + layer_decay = getattr(cfg, "layer_decay", 1) + if layer_decay < 1: + mod_encs = list(model.modality_encoders.values()) + assert len(mod_encs) == 1, len(mod_encs) + blocks = list(mod_encs[0].context_encoder.blocks) + list(model.blocks) + num_layers = len(blocks) + 1 + layer_scales = list( + layer_decay ** (num_layers - i) for i in range(num_layers + 1) + ) + + for i, b in enumerate(blocks): + lid = i + 1 + if layer_scales[lid] == 1.0: + continue + + for n, p in b.named_parameters(): + optim_override = getattr(p, "optim_overrides", {}) + if "optimizer" not in optim_override: + optim_override["optimizer"] = {} + + optim_override["optimizer"]["lr_scale"] = layer_scales[lid] + p.optim_overrides = optim_override + + def freeze_regex(self, pattern): + unfrozen_names = [] + for name, param in self.named_parameters(): + if re.fullmatch(pattern, name) is not None: + param.requires_grad_(False) + else: + unfrozen_names.append(name) + + def load_model_weights(self, state, 
model, cfg): + if cfg.ddp_backend == "fully_sharded": + from fairseq.distributed import FullyShardedDataParallel + + for name, module in model.named_modules(): + if "encoder.layers" in name and len(name.split(".")) == 3: + # Only for layers, we do a special handling and load the weights one by one + # We dont load all weights together as that wont be memory efficient and may + # cause oom + new_dict = { + k.replace(name + ".", ""): v + for (k, v) in state["model"].items() + if name + "." in k + } + assert isinstance(module, FullyShardedDataParallel) + with module.summon_full_params(): + module.load_state_dict(new_dict, strict=True) + module._reset_lazy_init() + + # Once layers are loaded, filter them out and load everything else. + r = re.compile("encoder.layers.\d.") + filtered_list = list(filter(r.match, state["model"].keys())) + + new_big_dict = { + k: v for (k, v) in state["model"].items() if k not in filtered_list + } + + model.load_state_dict(new_big_dict, strict=False) else: - self.proj = None + to_delete = {"_ema", "target_proj", "decoder"} + for k in to_delete: + if k in state["model"]: + del state["model"][k] + + if hasattr(model, "modality_encoders"): + if "modality_encoders.AUDIO.encoder_mask" not in state["model"]: + model.modality_encoders["AUDIO"].encoder_mask = None + elif not cfg.zero_mask: + model.modality_encoders["AUDIO"].encoder_mask = None + del state["model"]["modality_encoders.AUDIO.encoder_mask"] + + for k in list(state["model"].keys()): + if k.startswith("modality_encoders.") and not k.startswith( + "modality_encoders.AUDIO" + ): + del state["model"][k] + + print(model) + model.load_state_dict(state["model"], strict=True) def set_num_updates(self, num_updates): """Set the number of parameters updates.""" super().set_num_updates(num_updates) self.num_updates = num_updates - def forward(self, source, padding_mask, tbc=True, **kwargs): + def forward(self, source, padding_mask, **kwargs): w2v_args = { "source": source, "padding_mask": padding_mask, "mask": self.apply_mask and self.training, } + if "corpus_key" in kwargs: + w2v_args["corpus_key"] = kwargs["corpus_key"] + + if self.is_d2v_multi: + w2v_args["mode"] = "AUDIO" ft = self.freeze_finetune_updates <= self.num_updates with torch.no_grad() if not ft else contextlib.ExitStack(): - x, padding_mask = self.w2v_model.extract_features(**w2v_args) + res = self.w2v_model.extract_features(**w2v_args) + + x = res["x"] + padding_mask = res["padding_mask"] - if tbc: - # B x T x C -> T x B x C - x = x.transpose(0, 1) + # B x T x C -> T x B x C + x = x.transpose(0, 1) x = self.final_dropout(x) @@ -392,19 +616,25 @@ def forward(self, source, padding_mask, tbc=True, **kwargs): return { "encoder_out": x, # T x B x C - "encoder_padding_mask": padding_mask, # B x T - "padding_mask": padding_mask, + "padding_mask": padding_mask, # B x T, + "layer_results": res["layer_results"], } + def forward_torchscript(self, net_input): + if torch.jit.is_scripting(): + return self.forward(net_input["source"], net_input["padding_mask"]) + else: + return self.forward_non_torchscript(net_input) + def reorder_encoder_out(self, encoder_out, new_order): if encoder_out["encoder_out"] is not None: encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( 1, new_order ) - if encoder_out["encoder_padding_mask"] is not None: - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(0, new_order) + if encoder_out["padding_mask"] is not None: + encoder_out["padding_mask"] = 
encoder_out["padding_mask"].index_select( + 0, new_order + ) return encoder_out def max_positions(self): @@ -428,21 +658,26 @@ class TransformerDecoder(FairseqIncrementalDecoder): (default: False). """ - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + def __init__( + self, + cfg: Wav2Vec2Seq2SeqConfig, + dictionary, + embed_tokens, + no_encoder_attn=False, + ): super().__init__(dictionary) - self.dropout = args.decoder_dropout - self.share_input_output_embed = args.share_decoder_input_output_embed + self.dropout = cfg.decoder_dropout + self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim - embed_dim = args.decoder_embed_dim - self.output_embed_dim = args.decoder_embed_dim - args.encoder_embed_dim = embed_dim + embed_dim = cfg.decoder_embed_dim + self.output_embed_dim = cfg.decoder_embed_dim - self.layerdrop = args.decoder_layerdrop + self.layerdrop = cfg.decoder_layerdrop - padding_idx = embed_tokens.padding_idx - self.max_target_positions = args.max_target_positions + self.padding_idx = embed_tokens.padding_idx + self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim @@ -455,25 +690,31 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.embed_positions = ( PositionalEmbedding( - args.max_target_positions, + cfg.max_target_positions, embed_dim, - padding_idx, - learned=args.decoder_learned_pos, + self.padding_idx, + learned=cfg.decoder_learned_pos, ) - if not args.no_token_positional_embeddings + if not cfg.no_token_positional_embeddings else None ) - args = copy.deepcopy(args) - args.dropout = args.decoder_dropout - args.attention_dropout = args.decoder_attention_dropout - args.activation_dropout = args.decoder_activation_dropout + # TODO: update this when transformer gets converted to dataclass configs + transformer_cfg = copy.deepcopy(cfg) + with open_dict(transformer_cfg): + transformer_cfg.dropout = transformer_cfg.decoder_dropout + transformer_cfg.attention_dropout = ( + transformer_cfg.decoder_attention_dropout + ) + transformer_cfg.activation_dropout = ( + transformer_cfg.decoder_activation_dropout + ) self.layers = nn.ModuleList([]) self.layers.extend( [ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(args.decoder_layers) + TransformerDecoderLayer(transformer_cfg, no_encoder_attn) + for _ in range(transformer_cfg.decoder_layers) ] ) @@ -481,11 +722,9 @@ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) - nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) + nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) - if args.decoder_normalize_before and not getattr( - args, "no_decoder_final_norm", False - ): + if transformer_cfg.decoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None @@ -507,6 +746,16 @@ def forward( - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ + + if type(prev_output_tokens) == list: + max_len = max((len(x) for x in prev_output_tokens)) + tmp = torch.zeros( + [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device + ) + for (i, p) in enumerate(prev_output_tokens): + tmp[i, : len(p)] = p + prev_output_tokens = tmp + prev_output_tokens = prev_output_tokens.long() 
x, extra = self.extract_features( prev_output_tokens, encoder_out, incremental_state @@ -557,19 +806,21 @@ def extract_features( inner_states = [x] # decoder layers + self_attn_padding_mask = None + if prev_output_tokens.eq(self.padding_idx).any(): + self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) for layer in self.layers: dropout_probability = np.random.random() if not self.training or (dropout_probability > self.layerdrop): x, attn, _ = layer( x, encoder_out["encoder_out"] if encoder_out is not None else None, - encoder_out["encoder_padding_mask"] - if encoder_out is not None - else None, + encoder_out["padding_mask"] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, + self_attn_padding_mask=self_attn_padding_mask, ) inner_states.append(x) @@ -614,7 +865,7 @@ def upgrade_state_dict_named(self, state_dict, name): def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) nn.init.constant_(m.weight[padding_idx], 0) return m @@ -625,51 +876,3 @@ def Linear(in_features, out_features, bias=True): if bias: nn.init.constant_(m.bias, 0.0) return m - - -@register_model_architecture("wav2vec_ctc", "wav2vec_ctc") -def base_architecture(args): - args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False) - args.dropout_input = getattr(args, "dropout_input", 0) - args.final_dropout = getattr(args, "final_dropout", 0) - args.apply_mask = getattr(args, "apply_mask", False) - args.dropout = getattr(args, "dropout", 0) - args.attention_dropout = getattr(args, "attention_dropout", 0) - args.activation_dropout = getattr(args, "activation_dropout", 0) - - args.mask_length = getattr(args, "mask_length", 10) - args.mask_prob = getattr(args, "mask_prob", 0.5) - args.mask_selection = getattr(args, "mask_selection", "static") - args.mask_other = getattr(args, "mask_other", 0) - args.no_mask_overlap = getattr(args, "no_mask_overlap", False) - args.mask_channel_length = getattr(args, "mask_channel_length", 10) - args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5) - args.mask_channel_selection = getattr(args, "mask_channel_selection", "static") - args.mask_channel_other = getattr(args, "mask_channel_other", 0) - args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False) - - args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0) - args.feature_grad_mult = getattr(args, "feature_grad_mult", 0) - args.layerdrop = getattr(args, "layerdrop", 0.0) - - -@register_model_architecture("wav2vec_seq2seq", "wav2vec_seq2seq") -def seq2seq_architecture(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_layers = getattr(args, "decoder_layers", 10) - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.decoder_dropout = getattr(args, "decoder_dropout", 0) - args.decoder_attention_dropout = getattr(args, 
"decoder_attention_dropout", 0) - args.decoder_activation_dropout = getattr(args, "decoder_activation_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - - base_architecture(args) diff --git a/fairseq/models/wav2vec/wav2vec2_classification.py b/fairseq/models/wav2vec/wav2vec2_classification.py new file mode 100644 index 0000000000..c9bbaab28e --- /dev/null +++ b/fairseq/models/wav2vec/wav2vec2_classification.py @@ -0,0 +1,348 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import logging +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Any, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from omegaconf import II, MISSING, open_dict + +from fairseq import checkpoint_utils, tasks, utils +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.models import BaseFairseqModel, FairseqEncoder, register_model +from fairseq.models.wav2vec.wav2vec2 import MASKING_DISTRIBUTION_CHOICES, Wav2Vec2Config +from fairseq.models.wav2vec.wav2vec2_asr import Embedding, Linear, Wav2VecEncoder, Wav2Vec2AsrConfig +from fairseq.tasks import FairseqTask + +logging.basicConfig(level=logging.DEBUG) + + +@dataclass +class Wav2Vec2ClassificationConfig(Wav2Vec2AsrConfig): + latent_embed_dim: Optional[int] = field( + default=None, metadata={"help": "latent dim (encoder w2v -> latent -> class"} + ) + pooling: str = field( + default="first_token", + metadata={"help": "pooling layer choices"}, + ) + activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( + default="gelu", metadata={"help": "activation function to use"} + ) + + +@register_model("wav2vec_classification", dataclass=Wav2Vec2ClassificationConfig) +class Wav2VecClassification(BaseFairseqModel): + # TODO: Can be shared/merged with ASR model class as w2v_encoder params are common. 
+ def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + w2v_encoder: BaseFairseqModel, + pooling_layer, + ): + super().__init__() + self.cfg = cfg + self.w2v_encoder = w2v_encoder + self.pooling_layer = pooling_layer + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, cfg: Wav2Vec2ClassificationConfig, task: FairseqTask): + """Build a new model instance.""" + w2v_encoder = Wav2VecEncoder(cfg, None) + pooling_layer = get_pooling_layer( + cfg, + w2v_encoder.w2v_model.encoder.layers[-1].embedding_dim, + len(task.target_dictionary), + len(w2v_encoder.w2v_model.encoder.layers), + ) + return cls(cfg, w2v_encoder, pooling_layer) + + def get_normalized_probs(self, net_output, log_probs): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output + + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def get_logits(self, net_output): + return net_output + + def forward(self, **kwargs): + encoder_out_dict = self.w2v_encoder(**kwargs) + w2v_encoder_out = encoder_out_dict["encoder_out"] # TxBxC + w2v_encoder_padding_mask = encoder_out_dict["padding_mask"] # BxT + # w2v_encoder_layer_results = encoder_out_dict["layer_results"] + return self.pooling_layer( + last_layer_feats=w2v_encoder_out, + padding_mask=w2v_encoder_padding_mask, + # all_layer_feats=w2v_encoder_layer_results, + ) + + # def forward_latent(self, **kwargs): + # encoder_out_dict = self.w2v_encoder(**kwargs) + # w2v_encoder_out = encoder_out_dict["encoder_out"] + # w2v_encoder_padding_mask = encoder_out_dict["encoder_padding_mask"] + # w2v_encoder_layer_results = encoder_out_dict["layer_results"] + # return self.pooling_layer.forward_latent( + # last_layer_feats=w2v_encoder_out, + # padding_mask=w2v_encoder_padding_mask, + # all_layer_feats=w2v_encoder_layer_results, + # ) + + +def get_pooling_layer( + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + encoder_layers: int, +): + assert cfg.pooling == 'mean' + if cfg.pooling == "first_token": + return FirstToken(cfg, encoder_embed_dim, num_targets) + # elif cfg.pooling == "mean": + # return MeanPooling(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "mean": + return MeanPoolingFast(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "mean_amsoftmax": + return MeanPoolingFastAMSoftmax(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "max": + return MaxPoolingFast(cfg, encoder_embed_dim, num_targets) + elif cfg.pooling == "elmo": + return LayerWeightedMeanPooling( + cfg, encoder_embed_dim, num_targets, encoder_layers + ) + else: + raise NotImplementedError(f"{cfg.pooling} has not been implemented yet.") + + +class Pooling(nn.Module): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + ): + super().__init__() + self.projection = Linear(encoder_embed_dim, num_targets) + + def forward(self, last_layer_feats, **kwargs): + raise NotImplementedError() + + +class FirstToken(Pooling): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, last_layer_feats, **kwargs): + return self.projection(last_layer_feats[:, 0]) + + +# class MeanPooling(Pooling): +# def __init__( +# self, +# cfg: Wav2VecClassificationConfig, +# encoder_embed_dim: int, +# num_targets: int, +# **kwargs, +# ): +# super().__init__(cfg, 
encoder_embed_dim, num_targets) +# self.activation_fn = utils.get_activation_fn(cfg.activation_fn) +# self.linear = Linear(encoder_embed_dim, encoder_embed_dim) + +# def forward(self, last_layer_feats, padding_mask, **kwargs): +# # last_layer_feats: [BxTxD] +# # padding_mask: [BxT] +# last_layer_feats = self.linear(self.activation_fn(last_layer_feats)) +# input_lengths = (1 - padding_mask.long()).sum(-1) +# pooled_feature_list = [] +# for i in range(len(last_layer_feats)): +# length = input_lengths[i] +# pooled_feature = torch.mean(last_layer_feats[i][:length], dim=0) +# pooled_feature_list.append(pooled_feature) +# return self.projection(torch.stack(pooled_feature_list)) + + +def fn_mean(x, mask): + """ + Args: + x: TxBxD + mask: BxT + Return: + y: BxD + """ + if mask is not None: + mask = mask.t()[:, :, None] + return (x * mask).sum(0) / mask.sum(0) + else: + return x.sum(0) / x.shape[0] + + +class MeanPoolingFast(nn.Module): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + **kwargs, + ): + super().__init__() + self.activation_fn = utils.get_activation_fn(cfg.activation_fn) + self.latent_embed_dim = ( + cfg.latent_embed_dim + if cfg.latent_embed_dim is not None + else encoder_embed_dim + ) + logging.debug(f"| {self.latent_embed_dim=}") + self.linear = Linear(encoder_embed_dim, self.latent_embed_dim) + self.projection = Linear(self.latent_embed_dim, num_targets) + + def forward(self, last_layer_feats, padding_mask, **kwargs): + """ + Arguments + features - [TxBxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + if padding_mask is not None: + feat_mask = (~padding_mask).to(last_layer_feats.dtype) + else: + feat_mask = None + feat = self.linear(last_layer_feats) + feat = fn_mean(feat, feat_mask) + feat = self.activation_fn(feat) + return self.projection(feat) + + def forward_latent(self, last_layer_feats, padding_mask, **kwargs): + """ + Arguments + features - [TxBxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + if padding_mask is not None: + feat_mask = (~padding_mask).to(last_layer_feats.dtype) + else: + feat_mask = None + feat = self.linear(last_layer_feats) + feat = fn_mean(feat, feat_mask) + return feat + + +class MeanPoolingFastAMSoftmax(MeanPoolingFast): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + **kwargs, + ): + super().__init__(cfg, encoder_embed_dim, num_targets, **kwargs) + self.projection = Linear(self.latent_embed_dim, num_targets, bias=False) + nn.init.xavier_normal_(self.projection.weight, gain=1) + + def forward(self, last_layer_feats, padding_mask, **kwargs): + + """ + Arguments + features - [BxTxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + feat_mask = (~padding_mask).to(last_layer_feats.dtype) # T,B -> B,T + feat = self.linear(last_layer_feats) # B,T,D + feat = fn_mean(feat, feat_mask) # B,D + feat = self.activation_fn(feat) + # normalize feat + feat_norm = F.normalize(feat, p=2, dim=-1) # B,D + weight_norm = F.normalize(self.projection.weight.t(), p=2, dim=-1) # D,K + cos_fw = feat_norm @ weight_norm + return cos_fw + + +def fn_max(x, mask): + """ + Args: + x: TxBxD + mask: BxT + Return: + y: BxD + """ + mask = mask.t()[:, :, None].to(torch.bool) + return x.masked_fill(~mask, -1e-8).max(0)[0] + + +class MaxPoolingFast(Pooling): + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + **kwargs, + ): + 
super().__init__(cfg, encoder_embed_dim, num_targets) + self.activation_fn = utils.get_activation_fn(cfg.activation_fn) + self.linear = Linear(encoder_embed_dim, encoder_embed_dim) + + def forward(self, last_layer_feats, padding_mask, **kwargs): + + """ + Arguments + features - [TxBxD] Acoustic feature with shape + padding_mask - [BxT] Padding Mask + """ + feat_mask = (~padding_mask).to(last_layer_feats.dtype) + feat = self.linear(last_layer_feats) + feat = fn_max(feat, feat_mask) + feat = self.activation_fn(feat) + return self.projection(feat) + + +class LayerWeightedMeanPooling(MeanPoolingFast): + """Elmo-style weighted average representation.""" + + def __init__( + self, + cfg: Wav2Vec2ClassificationConfig, + encoder_embed_dim: int, + num_targets: int, + encoder_layers: int, + ): + super().__init__(cfg, encoder_embed_dim, num_targets) + self.num_layers = encoder_layers + self.weights = nn.Parameter(torch.ones(encoder_layers)) + + def forward(self, last_layer_feats, padding_mask, all_layer_feats): + # last_layer_feats: [BxTxD] + # padding_mask: [BxT] + if not self.training: + msg = ( + f"Number of layers in input features = {len(all_layer_feats)}." + f" Expected {self.num_layers} layers." + ) + assert len(all_layer_feats) == self.num_layers, msg + + # Stack up all layers and reshape to (num_layers, features) + all_layer_feats_stacked = torch.stack(all_layer_feats, dim=0) + num_layers, *original_feat_shape = all_layer_feats_stacked.shape + all_layer_feats_stacked_flat = all_layer_feats_stacked.view(num_layers, -1) + + # Weighted average + normalized_weights = F.softmax(self.weights, dim=-1) + weighted_avg_features = ( + normalized_weights.unsqueeze(-1) * all_layer_feats_stacked_flat + ).sum(dim=0) + weighted_avg_features = weighted_avg_features.view(*original_feat_shape) + + # Mean Pooling on weighted average features. + return super().forward(weighted_avg_features, padding_mask) \ No newline at end of file diff --git a/fairseq/models/wav2vec/wav2vec2_laser.py b/fairseq/models/wav2vec/wav2vec2_laser.py new file mode 100755 index 0000000000..ff89759d38 --- /dev/null +++ b/fairseq/models/wav2vec/wav2vec2_laser.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
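+ # Wav2VecLaser (defined below) builds LASER-style fixed-size utterance
+ # embeddings on top of the wav2vec 2.0 CTC model: encoder outputs are scaled
+ # by 0.01, padded time steps are masked to -inf, and the result is max-pooled
+ # over the time dimension, giving one vector per utterance (build_model
+ # requests an output size of 1024 from Wav2VecEncoder).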
+ +from fairseq.models import BaseFairseqModel, register_model +from fairseq.models.wav2vec.wav2vec2_asr import ( + Wav2Vec2CtcConfig, + Wav2VecCtc, + Wav2VecEncoder, +) +from fairseq.tasks import FairseqTask + + +@register_model("wav2vec2_laser", dataclass=Wav2Vec2CtcConfig) +class Wav2VecLaser(Wav2VecCtc): + def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel): + super().__init__(cfg, w2v_encoder) + self.num_updates = 0 + self.freeze_finetune_updates = cfg.freeze_finetune_updates + + @classmethod + def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask): + """Build a new model instance.""" + w2v_encoder = Wav2VecEncoder(cfg, 1024) + return cls(cfg, w2v_encoder) + + def forward(self, **kwargs): + output = super().forward(**kwargs) + x_out = output["encoder_out"] * 0.01 + out_pad_mask = output["padding_mask"] + # Set padded outputs to -inf so they are not selected by max-pooling + if out_pad_mask is not None and out_pad_mask.any(): + x_out = ( + x_out.float() + .masked_fill_(out_pad_mask.T.unsqueeze(-1), float("-inf")) + .type_as(x_out) + ) + return x_out.max(dim=0)[0] diff --git a/fairseq/models/xmod/__init__.py b/fairseq/models/xmod/__init__.py new file mode 100644 index 0000000000..bbf7694920 --- /dev/null +++ b/fairseq/models/xmod/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .model import * # noqa +from .transformer_layer_xmod import * # noqa diff --git a/fairseq/models/xmod/hub_interface.py b/fairseq/models/xmod/hub_interface.py new file mode 100644 index 0000000000..909bb423ca --- /dev/null +++ b/fairseq/models/xmod/hub_interface.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from fairseq.models.roberta.hub_interface import RobertaHubInterface +import torch +import torch.nn.functional as F + + +class XMODHubInterface(RobertaHubInterface): + def extract_features( + self, + tokens: torch.LongTensor, + return_all_hiddens: bool = False, + lang_id=None, + ) -> torch.Tensor: + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + if tokens.size(-1) > self.model.max_positions(): + raise ValueError( + "tokens exceeds maximum length: {} > {}".format( + tokens.size(-1), self.model.max_positions() + ) + ) + features, extra = self.model( + tokens.to(device=self.device), + features_only=True, + return_all_hiddens=return_all_hiddens, + lang_id=lang_id, + ) + if return_all_hiddens: + # convert from T x B x C -> B x T x C + inner_states = extra["inner_states"] + return [inner_state.transpose(0, 1) for inner_state in inner_states] + else: + return features # just the last layer's features + + def predict( + self, + head: str, + tokens: torch.LongTensor, + return_logits: bool = False, + lang_id=None, + ): + features = self.extract_features(tokens.to(device=self.device), lang_id=lang_id) + logits = self.model.classification_heads[head](features) + if return_logits: + return logits + return F.log_softmax(logits, dim=-1) diff --git a/fairseq/models/xmod/model.py b/fairseq/models/xmod/model.py new file mode 100644 index 0000000000..fb6c7a8dea --- /dev/null +++ b/fairseq/models/xmod/model.py @@ -0,0 +1,742 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from ..roberta.model_xlmr import XLMRModel +from fairseq.models.xmod.transformer_layer_xmod import XMODTransformerEncoderLayerBase +from ..roberta.model import base_architecture, RobertaEncoder +from fairseq.models.transformer import TransformerEncoder +from fairseq.modules.transformer_sentence_encoder import init_bert_params +from typing import Optional +from fairseq.models.xmod.hub_interface import XMODHubInterface +import torch +from fairseq.distributed import fsdp_wrap +from fairseq.models import ( + register_model, + register_model_architecture, +) + +from fairseq.modules.checkpoint_activations import checkpoint_wrapper + +DEFAULT_MIN_PARAMS_TO_WRAP = int(1e8) + + +@register_model("xmod") +class XMODModel(XLMRModel): + @classmethod + def hub_models(cls): + return { + "xmod.base": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.81.1M.tar.gz", + "xmod.large.prenorm": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.large.prenorm.81.500k.tar.gz", + "xmod.base.13.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.13.125k.tar.gz", + "xmod.base.30.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.125k.tar.gz", + "xmod.base.30.195k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.195k.tar.gz", + "xmod.base.60.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.125k.tar.gz", + "xmod.base.60.265k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.265k.tar.gz", + "xmod.base.75.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.125k.tar.gz", + "xmod.base.75.269k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.269k.tar.gz", + } + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="sentencepiece", + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + return XMODHubInterface(x["args"], x["task"], x["models"][0]) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + from omegaconf import OmegaConf + + if OmegaConf.is_config(args): + OmegaConf.set_struct(args, False) + + # make sure all arguments are present + base_architecture(args) + + if not hasattr(args, "max_positions"): + if not hasattr(args, "tokens_per_sample"): + args.tokens_per_sample = task.max_positions() + args.max_positions = args.tokens_per_sample + + encoder = XMODEncoder(args, task.source_dictionary) + + if OmegaConf.is_config(args): + OmegaConf.set_struct(args, True) + + return cls(args, encoder) + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + lang_id=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + x, extra = self.encoder( + src_tokens, features_only, return_all_hiddens, lang_id=lang_id, **kwargs + ) + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + +class XMODEncoder(RobertaEncoder): + """XMOD encoder.""" + + def build_encoder(self, args, dictionary, embed_tokens): + encoder = XMODTransformerEncoder(args, dictionary, embed_tokens) + 
encoder.apply(init_bert_params) + return encoder + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + lang_id=None, + **unused, + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape + `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' + is a list of hidden states. Note that the hidden + states have shape `(src_len, batch, vocab)`. + """ + x, extra = self.extract_features( + src_tokens, return_all_hiddens=return_all_hiddens, lang_id=lang_id + ) + if not features_only: + x = self.output_layer(x, masked_tokens=masked_tokens) + return x, extra + + def extract_features( + self, src_tokens, return_all_hiddens=False, lang_id=None, **kwargs + ): + encoder_out = self.sentence_encoder( + src_tokens, + return_all_hiddens=return_all_hiddens, + lang_id=lang_id, + token_embeddings=kwargs.get("token_embeddings", None), + ) + # T x B x C -> B x T x C + features = encoder_out["encoder_out"][0].transpose(0, 1) + inner_states = encoder_out["encoder_states"] if return_all_hiddens else None + return features, {"inner_states": inner_states} + + +class XMODTransformerEncoder(TransformerEncoder): + def build_encoder_layer(self, cfg): + layer = XMODTransformerEncoderLayerBase(cfg) + checkpoint = cfg.checkpoint_activations + if checkpoint: + offload_to_cpu = cfg.offload_activations + layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu) + # if we are checkpointing, enforce that FSDP always wraps the + # checkpointed layer, regardless of layer size + min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 + layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) + return layer + + def forward( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + lang_id=None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + return self.forward_scriptable( + src_tokens, + src_lengths, + return_all_hiddens, + token_embeddings, + lang_id=lang_id, + ) + # TorchScript doesn't support super() method so that the scriptable Subclass + # can't access the base class model in Torchscript. + # Current workaround is to add a helper function with different name and + # call the helper function from scriptable Subclass. 
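+ # In practice forward() above only delegates: forward_scriptable() below holds
+ # the actual implementation (mirroring the base TransformerEncoder logic) with
+ # the extra lang_id argument threaded through to every encoder layer.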
+ + def forward_scriptable( + self, + src_tokens, + src_lengths: Optional[torch.Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[torch.Tensor] = None, + lang_id=None, + ): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + dict: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + """ + # compute padding mask + encoder_padding_mask = src_tokens.eq(self.padding_idx) + has_pads = src_tokens.device.type == "xla" or encoder_padding_mask.any() + + x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) + + # account for padding while computing the representation + if has_pads: + x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x)) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + encoder_states = [] + + if return_all_hiddens: + encoder_states.append(x) + + # encoder layers + for layer in self.layers: + x = layer( + x, + encoder_padding_mask=encoder_padding_mask if has_pads else None, + lang_id=lang_id, + ) + if return_all_hiddens: + assert encoder_states is not None + encoder_states.append(x) + + if self.layer_norm is not None: + x = self.layer_norm(x) + + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `forward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. 
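+ # src_lengths is recomputed from the non-padding positions of src_tokens and
+ # returned as a (batch, 1) int32 tensor, wrapped in a list per the convention
+ # described above.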
+ src_lengths = ( + src_tokens.ne(self.padding_idx) + .sum(dim=1, dtype=torch.int32) + .reshape(-1, 1) + .contiguous() + ) + return { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [encoder_padding_mask], # B x T + "encoder_embedding": [encoder_embedding], # B x T x C + "encoder_states": encoder_states, # List[T x B x C] + "src_tokens": [], + "src_lengths": [src_lengths], + } + + +@register_model_architecture("xmod", "xmod_base_13") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "ar_AR", + "en_XX", + "fi_FI", + "fr_XX", + "hi_IN", + "id_ID", + "ka_GE", + "ko_KR", + "ru_RU", + "sw_KE", + "ta_IN", + "th_TH", + "vi_VN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base_30") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "ar_AR", + "cs_CZ", + "en_XX", + "eu_ES", + "fi_FI", + "fr_XX", + "hi_IN", + "hr_HR", + "hu_HU", + "hy_AM", + "id_ID", + "it_IT", + "ka_GE", + "ko_KR", + "lt_LT", + "ml_IN", + "mn_MN", + "ms_MY", + "pl_PL", + "ro_RO", + "ru_RU", + "si_LK", + "sk_SK", + "sq_AL", + "sv_SE", + "sw_KE", + "ta_IN", + "th_TH", + "tl_XX", + "vi_VN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base_60") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "af_ZA", + "am_ET", + "ar_AR", + "be_BY", + "bn_IN", + "ca_ES", + "cs_CZ", + "cy_GB", + "da_DK", + "en_XX", + "eo_EO", + "et_EE", + "eu_ES", + "fa_IR", + "fi_FI", + "fr_XX", + "ga_IE", + "gl_ES", + "gu_IN", + "ha_NG", + "hi_IN", + "hr_HR", + "hu_HU", + "hy_AM", + "id_ID", + "is_IS", + "it_IT", + "ka_GE", + "ko_KR", + "ku_TR", + "la_VA", + "lt_LT", + "lv_LV", + "mk_MK", + "ml_IN", + "mn_MN", + "ms_MY", + "ne_NP", + "nl_XX", + "no_XX", + "pl_PL", + "ps_AF", + "pt_XX", + "ro_RO", + "ru_RU", + "sa_IN", + "sd_PK", + "si_LK", + "sk_SK", + "sl_SI", + "so_SO", + "sq_AL", + "sr_RS", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "vi_VN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base_75") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + 
args.languages = getattr( + args, + "languages", + [ + "af_ZA", + "am_ET", + "ar_AR", + "as_IN", + "be_BY", + "bn_IN", + "br_FR", + "bs_BA", + "ca_ES", + "cs_CZ", + "cy_GB", + "da_DK", + "en_XX", + "eo_EO", + "et_EE", + "eu_ES", + "fa_IR", + "fi_FI", + "fr_XX", + "fy_NL", + "ga_IE", + "gd_GB", + "gl_ES", + "gu_IN", + "ha_NG", + "hi_IN", + "hr_HR", + "hu_HU", + "hy_AM", + "id_ID", + "is_IS", + "it_IT", + "jv_ID", + "ka_GE", + "kn_IN", + "ko_KR", + "ku_TR", + "la_VA", + "lt_LT", + "lv_LV", + "mg_MG", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "ms_MY", + "ne_NP", + "nl_XX", + "no_XX", + "om_KE", + "or_IN", + "pa_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "ro_RO", + "ru_RU", + "sa_IN", + "sd_PK", + "si_LK", + "sk_SK", + "sl_SI", + "so_SO", + "sq_AL", + "sr_RS", + "su_ID", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "vi_VN", + "xh_ZA", + "yi_DE", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_base") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) + args.ln_before_adapter = getattr(args, "ln_before_adapter", True) + args.languages = getattr( + args, + "languages", + [ + "en_XX", + "id_ID", + "vi_VN", + "ru_RU", + "fa_IR", + "sv_SE", + "ja_XX", + "fr_XX", + "de_DE", + "ro_RO", + "ko_KR", + "hu_HU", + "es_XX", + "fi_FI", + "uk_UA", + "da_DK", + "pt_XX", + "no_XX", + "th_TH", + "pl_PL", + "bg_BG", + "nl_XX", + "zh_CN", + "he_IL", + "el_GR", + "it_IT", + "sk_SK", + "hr_HR", + "tr_TR", + "ar_AR", + "cs_CZ", + "lt_LT", + "hi_IN", + "zh_TW", + "ca_ES", + "ms_MY", + "sl_SI", + "lv_LV", + "ta_IN", + "bn_IN", + "et_EE", + "az_AZ", + "sq_AL", + "sr_RS", + "kk_KZ", + "ka_GE", + "tl_XX", + "ur_PK", + "is_IS", + "hy_AM", + "ml_IN", + "mk_MK", + "be_BY", + "la_VA", + "te_IN", + "eu_ES", + "gl_ES", + "mn_MN", + "kn_IN", + "ne_NP", + "sw_KE", + "si_LK", + "mr_IN", + "af_ZA", + "gu_IN", + "cy_GB", + "eo_EO", + "km_KH", + "ky_KG", + "uz_UZ", + "ps_AF", + "pa_IN", + "ga_IE", + "ha_NG", + "am_ET", + "lo_LA", + "ku_TR", + "so_SO", + "my_MM", + "or_IN", + "sa_IN", + ], + ) + base_architecture(args) + + +@register_model_architecture("xmod", "xmod_large_prenorm") +def roberta_base_architecture(args): + args.ffn_modules = getattr(args, "ffn_modules", False) + args.adapter_modules = getattr(args, "adapter_modules", True) + args.adapter_layer_norm = getattr(args, "adapter_layer_norm", True) + args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", False) + args.ln_before_adapter = getattr(args, "ln_before_adapter", False) + # args.bottleneck = getattr(args, "bottleneck", 8) + args.bottleneck = getattr(args, "bottleneck", 4) + args.languages = getattr( + args, + "languages", + [ + "en_XX", + "id_ID", + "vi_VN", + "ru_RU", + "fa_IR", + "sv_SE", + "ja_XX", + "fr_XX", + "de_DE", + "ro_RO", + "ko_KR", + "hu_HU", + "es_XX", + "fi_FI", + "uk_UA", + "da_DK", + "pt_XX", + "no_XX", + "th_TH", + "pl_PL", + "bg_BG", + "nl_XX", + "zh_CN", + "he_IL", + "el_GR", + "it_IT", + "sk_SK", + "hr_HR", + "tr_TR", + "ar_AR", + "cs_CZ", + "lt_LT", + "hi_IN", + "zh_TW", + "ca_ES", + "ms_MY", + "sl_SI", + "lv_LV", + "ta_IN", + "bn_IN", + "et_EE", + "az_AZ", + "sq_AL", + "sr_RS", + "kk_KZ", + "ka_GE", + "tl_XX", + "ur_PK", + "is_IS", + "hy_AM", + "ml_IN", + "mk_MK", + "be_BY", + "la_VA", + "te_IN", + "eu_ES", + 
"gl_ES", + "mn_MN", + "kn_IN", + "ne_NP", + "sw_KE", + "si_LK", + "mr_IN", + "af_ZA", + "gu_IN", + "cy_GB", + "eo_EO", + "km_KH", + "ky_KG", + "uz_UZ", + "ps_AF", + "pa_IN", + "ga_IE", + "ha_NG", + "am_ET", + "lo_LA", + "ku_TR", + "so_SO", + "my_MM", + "or_IN", + "sa_IN", + ], + ) + + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + args.encoder_layers = getattr(args, "encoder_layers", 24) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + base_architecture(args) diff --git a/fairseq/models/xmod/transformer_layer_xmod.py b/fairseq/models/xmod/transformer_layer_xmod.py new file mode 100644 index 0000000000..47a91cdc23 --- /dev/null +++ b/fairseq/models/xmod/transformer_layer_xmod.py @@ -0,0 +1,179 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.modules.transformer_layer import TransformerEncoderLayer +from typing import Optional +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.modules import LayerNorm +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor + + +class Adapter(nn.Module): + def __init__(self, cfg, red_fac=2): + super(Adapter, self).__init__() + self.cfg = cfg + self.embed_dim = cfg.encoder_embed_dim + self.quant_noise = getattr(cfg, "quant_noise_pq", 0) + self.quant_noise_block_size = getattr(cfg, "quant_noise_pq_block_size", 8) or 8 + self.activation_fn = utils.get_activation_fn( + activation=getattr(cfg, "activation_fn", "relu") or "relu" + ) + self.fc1 = quant_noise( + nn.Linear(self.embed_dim, self.embed_dim // red_fac), + p=self.quant_noise, + block_size=self.quant_noise_block_size, + ) + self.fc2 = quant_noise( + nn.Linear(self.embed_dim // red_fac, self.embed_dim), + p=self.quant_noise, + block_size=self.quant_noise_block_size, + ) + activation_dropout_p = getattr(cfg, "activation_dropout", 0) or 0 + if activation_dropout_p == 0: + # for backwards compatibility with models that use cfg.relu_dropout + activation_dropout_p = getattr(cfg, "relu_dropout", 0) or 0 + self.activation_dropout_module = FairseqDropout( + float(activation_dropout_p), module_name=self.__class__.__name__ + ) + + def forward(self, x): + x = self.activation_fn(self.fc1(x)) + if not hasattr(self.cfg, "adapter_dropout") or self.cfg.adapter_dropout: + x = self.activation_dropout_module(x) + x = self.fc2(x) + return x + + +class XMODTransformerEncoderLayerBase(TransformerEncoderLayer): + """Encoder layer block. + + In the original paper each operation (multi-head attention or FFN) is + postprocessed with: `dropout -> add residual -> layernorm`. In the + tensor2tensor code they suggest that learning is more robust when + preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *cfg.encoder.normalize_before* to ``True``. 
+ + Args: + args (argparse.Namespace): parsed command-line arguments + """ + + def __init__(self, cfg): + super().__init__(cfg) + if hasattr(cfg, "adapter_modules") and cfg.adapter_modules: + export = getattr(cfg, "export", False) + if cfg.adapter_layer_norm: + self.adapter_layer_norm = LayerNorm(self.embed_dim, export=export) + self.adapter_modules = nn.ModuleDict(dict()) + if hasattr(self.cfg, "bottleneck"): + bottleneck = self.cfg.bottleneck + else: + bottleneck = 2 + for language in cfg.languages: + self.adapter_modules[str(language)] = Adapter(cfg, red_fac=bottleneck) + + def lang_adapter(self, lang_id, x): + # If language adapters exist pass throught them + if hasattr(self.cfg, "adapter_modules") and self.cfg.adapter_modules: + if lang_id is None: + lang_id = ["en_XX"] * x.shape[1] + d_langs = [lang_id[0]] + lang_lengths = [1] + for lang in lang_id[1:]: + if lang == d_langs[-1]: + lang_lengths[-1] += 1 + else: + d_langs.append(lang) + lang_lengths.append(1) + + if ( + not hasattr(self.cfg, "ln_before_adapter") + or not self.cfg.ln_before_adapter + ): + residual = x + if self.cfg.adapter_layer_norm: + x = self.adapter_layer_norm(x) + elif self.cfg.adapter_reuse_layer_norm: + x = self.final_layer_norm(x) + if hasattr(self.cfg, "ln_before_adapter") and self.cfg.ln_before_adapter: + residual = x + + split_x = torch.split(x, lang_lengths, 1) + x_ = [] + for i, (lang, s_x) in enumerate(zip(d_langs, split_x)): + lang = lang.replace("_rom", "").replace("_zaw", "") + x_.append(self.adapter_modules[str(lang)](s_x)) + x = torch.cat(x_, 1) + + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + + return x + + def forward( + self, + x, + encoder_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor] = None, + lang_id: Optional[list] = None, + ): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, seq_len)` where padding elements are indicated by ``1``. + attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`, + where `tgt_len` is the length of output and `src_len` is the + length of input, though here both are equal to `seq_len`. + `attn_mask[tgt_i, src_j] = 1` means that when calculating the + embedding for `tgt_i`, we exclude (mask out) `src_j`. This is + useful for strided self-attention. 
+ + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + # anything in original attn_mask = 1, becomes -1e8 + # anything in original attn_mask = 0, becomes 0 + # Note that we cannot use -inf here, because at some edge cases, + # the attention weight (before softmax) for some padded element in query + # will become -inf, which results in NaN in model parameters + if attn_mask is not None: + attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8) + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + x, _ = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask, + need_weights=False, + attn_mask=attn_mask, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + + x = self.lang_adapter(lang_id, x) + + if not self.normalize_before: + x = self.final_layer_norm(x) + return x diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py index e2326ac6e3..dcfda9b82a 100644 --- a/fairseq/modules/__init__.py +++ b/fairseq/modules/__init__.py @@ -6,15 +6,19 @@ from .adaptive_input import AdaptiveInput from .adaptive_softmax import AdaptiveSoftmax +from .base_layer import BaseLayer from .beamable_mm import BeamableMM from .character_token_embedder import CharacterTokenEmbedder from .conv_tbc import ConvTBC from .cross_entropy import cross_entropy from .downsampled_multihead_attention import DownsampledMultiHeadAttention -from .dynamic_convolution import DynamicConv, DynamicConv1dTBC +from .dynamic_convolution import DynamicConv, DynamicConv1dTBC, DynamicConv_scripatable from .dynamic_crf_layer import DynamicCRF +from .ema_module import EMAModuleConfig, EMAModule from .fairseq_dropout import FairseqDropout +from .fp32_batch_norm import Fp32BatchNorm from .fp32_group_norm import Fp32GroupNorm +from .fp32_instance_norm import Fp32InstanceNorm from .gelu import gelu, gelu_accurate from .grad_multiply import GradMultiply from .gumbel_vector_quantizer import GumbelVectorQuantizer @@ -24,9 +28,11 @@ from .learned_positional_embedding import LearnedPositionalEmbedding from .lightweight_convolution import LightweightConv, LightweightConv1dTBC from .linearized_convolution import LinearizedConvolution +from .location_attention import LocationAttention +from .lstm_cell_with_zoneout import LSTMCellWithZoneOut from .multihead_attention import MultiheadAttention from .positional_embedding import PositionalEmbedding -from .same_pad import SamePad +from .same_pad import SamePad, SamePad2d from .scalar_bias import ScalarBias from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer @@ -35,10 +41,20 @@ from .unfold import unfold1d from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer from .vggblock import VGGBlock +from .espnet_multihead_attention import ( + ESPNETMultiHeadedAttention, + RelPositionMultiHeadedAttention, + RotaryPositionMultiHeadedAttention, +) +from .rotary_positional_embedding import RotaryPositionalEmbedding +from .positional_encoding import ( + RelPositionalEncoding, +) __all__ = [ "AdaptiveInput", "AdaptiveSoftmax", + "BaseLayer", "BeamableMM", 
"CharacterTokenEmbedder", "ConvTBC", @@ -46,10 +62,15 @@ "DownsampledMultiHeadAttention", "DynamicConv1dTBC", "DynamicConv", + "DynamicConv_scripatable", "DynamicCRF", + "EMAModule", + "EMAModuleConfig", "FairseqDropout", + "Fp32BatchNorm", "Fp32GroupNorm", "Fp32LayerNorm", + "Fp32InstanceNorm", "gelu", "gelu_accurate", "GradMultiply", @@ -61,9 +82,12 @@ "LightweightConv1dTBC", "LightweightConv", "LinearizedConvolution", + "LocationAttention", + "LSTMCellWithZoneOut", "MultiheadAttention", "PositionalEmbedding", "SamePad", + "SamePad2d", "ScalarBias", "SinusoidalPositionalEmbedding", "TransformerSentenceEncoderLayer", @@ -73,4 +97,10 @@ "TransposeLast", "VGGBlock", "unfold1d", + "ESPNETMultiheadedAttention", + "PositionalEmbedding", + "RelPositionMultiHeadedAttention", + "RelPositionalEncoding", + "RotaryPositionalEmbedding", + "RotaryPositionMultiHeadedAttention", ] diff --git a/fairseq/modules/adaptive_input.py b/fairseq/modules/adaptive_input.py index 446534a9f8..01ac4accac 100644 --- a/fairseq/modules/adaptive_input.py +++ b/fairseq/modules/adaptive_input.py @@ -7,9 +7,10 @@ from typing import List import torch -from fairseq.modules.quant_noise import quant_noise from torch import nn +from fairseq.modules.quant_noise import quant_noise + class AdaptiveInput(nn.Module): def __init__( @@ -40,7 +41,7 @@ def __init__( for i in range(len(self.cutoff)): prev = self.cutoff[i - 1] if i > 0 else 0 size = self.cutoff[i] - prev - dim = int(initial_dim // (factor ** i)) + dim = int(initial_dim // (factor**i)) seq = nn.Sequential( nn.Embedding(size, dim, self.padding_idx), quant_noise( diff --git a/fairseq/modules/base_layer.py b/fairseq/modules/base_layer.py new file mode 100644 index 0000000000..e823f7bae2 --- /dev/null +++ b/fairseq/modules/base_layer.py @@ -0,0 +1,170 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch.nn as nn +import torch +import sys +from fairseq import utils +from fairseq.distributed import utils as distributed_utils +from fairseq.modules.layer_norm import LayerNorm + + +class BaseLayer(nn.Module): + def __init__(self, args): + super().__init__() + self.num_workers = distributed_utils.get_data_parallel_world_size() + expert_centroids = torch.empty(self.num_workers, args.decoder_embed_dim) + torch.nn.init.orthogonal_(expert_centroids, gain=0.1) + self.register_parameter( + "expert_centroids", torch.nn.Parameter(expert_centroids) + ) + self.expert_network = nn.Sequential( + *([BaseSublayer(args) for _ in range(args.base_sublayers)]) + ) + self.expert_id = distributed_utils.get_data_parallel_rank() + self.shuffle = args.base_shuffle + self.cpp = self.load_assignment() + + # Add a special attribute to the expert parameters, so we know not to sync their gradients + for param in self.expert_network.parameters(): + param.expert = True + + def forward(self, input_features, *args, **kwargs): + features = input_features.reshape(-1, input_features.size(-1)) + is_training = input_features.requires_grad + + if self.shuffle and is_training: + # Send each token to a random worker, to break correlations within the batch + shuffle_sort = torch.randperm(features.size(0), device=features.device) + features = All2All.apply(features[shuffle_sort]) + + with torch.no_grad(): + # Compute similarity of each token to each expert, for routing + token_expert_affinities = features.matmul( + self.expert_centroids.transpose(0, 1) + ) + + # Compute which token goes to which expert + sort_by_expert, input_splits, output_splits = ( + self.balanced_assignment(token_expert_affinities) + if is_training + else self.greedy_assignment(token_expert_affinities) + ) + # Swap these tokens for the right ones for our expert + routed_features = All2All.apply( + features[sort_by_expert], output_splits, input_splits + ) + + if routed_features.size(0) > 0: + # Mix in the expert network based on how appropriate it is for these tokens + alpha = torch.sigmoid( + routed_features.mv(self.expert_centroids[self.expert_id]) + ).unsqueeze(1) + routed_features = ( + alpha * self.expert_network(routed_features) + + (1 - alpha) * routed_features + ) + # Return to original worker and ordering + result = All2All.apply(routed_features, input_splits, output_splits)[ + self.inverse_sort(sort_by_expert) + ] + + if self.shuffle and is_training: + # Undo shuffling + result = All2All.apply(result)[self.inverse_sort(shuffle_sort)] + + # Return additional Nones for compatibility with TransformerDecoderLayer + return result.view(input_features.size()), None, None + + def inverse_sort(self, order): + # Creates an index that undoes a sort: xs==xs[order][inverse_sort(order)] + return torch.empty_like(order).scatter_( + 0, order, torch.arange(0, order.size(0), device=order.device) + ) + + def balanced_assignment(self, scores): + ok = scores.isfinite() + if not ok.all(): + # NaNs here can break the assignment algorithm + scores[~ok] = scores[ok].min() + return self.cpp.balanced_assignment(scores), None, None + + # Assigns each token to the top k experts + def greedy_assignment(self, scores, k=1): + token_to_workers = torch.topk(scores, dim=1, k=k, largest=True).indices.view(-1) + token_to_workers, sort_ordering = torch.sort(token_to_workers) + worker2token = sort_ordering // k + + # Find how many tokens we're sending to each other worker (being careful for sending 0 tokens to some workers) + output_splits = torch.zeros( + (self.num_workers,), 
dtype=torch.long, device=scores.device + ) + workers, counts = torch.unique_consecutive(token_to_workers, return_counts=True) + output_splits[workers] = counts + # Tell other workers how many tokens to expect from us + input_splits = All2All.apply(output_splits) + return worker2token, input_splits.tolist(), output_splits.tolist() + + def load_assignment(self): + try: + from fairseq import libbase + + return libbase + + except ImportError as e: + sys.stderr.write( + "ERROR: missing libbase. run `python setup.py build_ext --inplace`\n" + ) + raise e + + +class BaseSublayer(nn.Module): + def __init__(self, args): + super().__init__() + self.activation_fn = utils.get_activation_fn( + activation=getattr(args, "activation_fn", "relu") or "relu" + ) + self.norm = LayerNorm(args.decoder_embed_dim, export=False) + self.ff1 = torch.nn.Linear(args.decoder_embed_dim, args.decoder_ffn_embed_dim) + self.ff2 = torch.nn.Linear(args.decoder_ffn_embed_dim, args.decoder_embed_dim) + self.ff2.weight.data.zero_() + + def forward(self, xs): + return xs + self.ff2(self.activation_fn(self.ff1(self.norm(xs)))) + + +# Wraps torch.distributed.all_to_all_single as a function that supports autograd +class All2All(torch.autograd.Function): + @staticmethod + def forward(ctx, xs, input_splits=None, output_splits=None): + ctx.input_splits = input_splits + ctx.output_splits = output_splits + + ys = ( + torch.empty_like(xs) + if output_splits is None + else xs.new_empty(size=[sum(output_splits)] + list(xs.size()[1:])) + ) + torch.distributed.all_to_all_single( + ys, xs, output_split_sizes=output_splits, input_split_sizes=input_splits + ) + return ys + + @staticmethod + def backward(ctx, grad_output): + result = ( + torch.empty_like(grad_output) + if ctx.input_splits is None + else grad_output.new_empty( + size=[sum(ctx.input_splits)] + list(grad_output.size()[1:]) + ) + ) + torch.distributed.all_to_all_single( + result, + grad_output, + output_split_sizes=ctx.input_splits, + input_split_sizes=ctx.output_splits, + ) + return result, None, None diff --git a/fairseq/modules/checkpoint_activations.py b/fairseq/modules/checkpoint_activations.py new file mode 100644 index 0000000000..aa0b5929a3 --- /dev/null +++ b/fairseq/modules/checkpoint_activations.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import functools +from typing import Any, Dict, List, Tuple, Union + +import torch +import torch.utils.checkpoint as checkpoint +from fairseq import utils + + +def checkpoint_wrapper(m, offload_to_cpu=False): + """ + A friendlier wrapper for performing activation checkpointing. + + Compared to the PyTorch version, this version: + - wraps an nn.Module, so that all subsequent calls will use checkpointing + - handles keyword arguments in the forward + - handles non-Tensor outputs from the forward + + Usage:: + + checkpointed_module = checkpoint_wrapper(my_module, offload_to_cpu=True) + a, b = checkpointed_module(x, y=3, z=torch.Tensor([1])) + """ + # should I check whether original_forward has already been set? + assert not hasattr( + m, "precheckpoint_forward" + ), "checkpoint function has already been applied?" 
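+ # Keep a handle to the original forward and rebind m.forward so that every
+ # subsequent call goes through CheckpointFunction: activations are recomputed
+ # during the backward pass and, if offload_to_cpu is set, the saved inputs are
+ # moved to CPU in between.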
+ m.precheckpoint_forward = m.forward + m.forward = functools.partial( + _checkpointed_forward, + m.precheckpoint_forward, # original_forward + offload_to_cpu, + ) + return m + + +def unwrap_checkpoint(m: torch.nn.Module): + """ + unwrap a module and its children from checkpoint_wrapper + """ + for module in m.modules(): + if hasattr(module, "precheckpoint_forward"): + module.forward = module.precheckpoint_forward + del module.precheckpoint_forward + if hasattr(module, "old_deepcopy_method"): + module.__deepcopy__ = module.old_deepcopy_method + del module.old_deepcopy_method + return m + + +def _checkpointed_forward(original_forward, offload_to_cpu, *args, **kwargs): + # Autograd Functions in PyTorch work best with positional args, since + # the backward must return gradients (or None) for every input argument. + # We can flatten keyword arguments to make this easier. + kwarg_keys, flat_args = pack_kwargs(*args, **kwargs) + parent_ctx_dict = {"offload": offload_to_cpu} + output = CheckpointFunction.apply( + original_forward, parent_ctx_dict, kwarg_keys, *flat_args + ) + if isinstance(output, torch.Tensor): + return output + else: + packed_non_tensor_outputs = parent_ctx_dict["packed_non_tensor_outputs"] + if packed_non_tensor_outputs: + output = unpack_non_tensors(output, packed_non_tensor_outputs) + return output + + +def pack_kwargs(*args, **kwargs) -> Tuple[List[str], List[Any]]: + """ + Usage:: + + kwarg_keys, flat_args = pack_kwargs(1, 2, a=3, b=4) + args, kwargs = unpack_kwargs(kwarg_keys, flat_args) + assert args == [1, 2] + assert kwargs == {"a": 3, "b": 4} + """ + kwarg_keys = [] + flat_args = list(args) + for k, v in kwargs.items(): + kwarg_keys.append(k) + flat_args.append(v) + return kwarg_keys, flat_args + + +def unpack_kwargs( + kwarg_keys: List[str], flat_args: List[Any] +) -> Tuple[List[Any], Dict[str, Any]]: + if len(kwarg_keys) == 0: + return flat_args, {} + args = flat_args[: -len(kwarg_keys)] + kwargs = {k: v for k, v in zip(kwarg_keys, flat_args[-len(kwarg_keys) :])} + return args, kwargs + + +def split_non_tensors( + mixed: Union[torch.Tensor, Tuple[Any]] +) -> Tuple[Tuple[torch.Tensor], Dict[str, List[Any]]]: + """ + Usage:: + + x = torch.Tensor([1]) + y = torch.Tensor([2]) + tensors, packed_non_tensors = split_non_tensors((x, y, None, 3)) + recon = unpack_non_tensors(tensors, packed_non_tensors) + assert recon == (x, y, None, 3) + """ + if isinstance(mixed, torch.Tensor): + return (mixed,), None + tensors = [] + packed_non_tensors = {"is_tensor": [], "objects": []} + for o in mixed: + if isinstance(o, torch.Tensor): + packed_non_tensors["is_tensor"].append(True) + tensors.append(o) + else: + packed_non_tensors["is_tensor"].append(False) + packed_non_tensors["objects"].append(o) + return tuple(tensors), packed_non_tensors + + +def unpack_non_tensors( + tensors: Tuple[torch.Tensor], + packed_non_tensors: Dict[str, List[Any]], +) -> Tuple[Any]: + if packed_non_tensors is None: + return tensors + assert isinstance(packed_non_tensors, dict) + mixed = [] + is_tensor_list = packed_non_tensors["is_tensor"] + objects = packed_non_tensors["objects"] + assert len(tensors) + len(objects) == len(is_tensor_list) + obj_i = tnsr_i = 0 + for is_tensor in is_tensor_list: + if is_tensor: + mixed.append(tensors[tnsr_i]) + tnsr_i += 1 + else: + mixed.append(objects[obj_i]) + obj_i += 1 + return tuple(mixed) + + +class CheckpointFunction(torch.autograd.Function): + """Similar to the torch version, but support non-Tensor outputs. 
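+    Inputs are saved (and optionally offloaded to CPU) instead of activations;
+    the forward is re-run under the saved RNG state during backward, trading
+    extra compute for memory.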
+ + The caller is expected to provide a dict (*parent_ctx_dict*) that will hold + the non-Tensor outputs. These should be combined with the Tensor *outputs* + by calling ``unpack_non_tensors``. + """ + + @staticmethod + def forward(ctx, run_function, parent_ctx_dict, kwarg_keys, *args): + if torch.is_grad_enabled(): # grad may be disabled, e.g., during validation + checkpoint.check_backward_validity(args) + + ctx.run_function = run_function + ctx.kwarg_keys = kwarg_keys + ctx.fwd_rng_state = utils.get_rng_state() + + tensor_inputs, packed_non_tensor_inputs = split_non_tensors(args) + if parent_ctx_dict["offload"]: + ctx.fwd_device = tuple(x.device for x in tensor_inputs) + ctx.grad_requirements = tuple(x.requires_grad for x in tensor_inputs) + tensor_inputs = tuple( + x.to(torch.device("cpu"), non_blocking=True) for x in tensor_inputs + ) + + else: + ctx.fwd_device, ctx.grad_requirements = None, None + + ctx.save_for_backward(*tensor_inputs) + ctx.packed_non_tensor_inputs = packed_non_tensor_inputs + + with torch.no_grad(): + unpacked_args, unpacked_kwargs = unpack_kwargs(kwarg_keys, args) + outputs = run_function(*unpacked_args, **unpacked_kwargs) + + if isinstance(outputs, torch.Tensor): + return outputs + else: + # Autograd Functions don't like non-Tensor outputs. We can split the + # non-Tensor and Tensor outputs, returning the former by reference + # through *parent_ctx_dict* and returning the latter directly. + outputs, packed_non_tensor_outputs = split_non_tensors(outputs) + parent_ctx_dict["packed_non_tensor_outputs"] = packed_non_tensor_outputs + return outputs + + @staticmethod + def backward(ctx, *args): + if not torch.autograd._is_checkpoint_valid(): + raise RuntimeError( + "Checkpointing is not compatible with .grad(), please use .backward() if possible" + ) + + tensor_inputs: Tuple = ctx.saved_tensors + tensor_inputs = checkpoint.detach_variable(tensor_inputs) + if ctx.fwd_device is not None: + tensor_inputs = [ + t.to(ctx.fwd_device[i], non_blocking=True) + for i, t in enumerate(tensor_inputs) + ] + for i, need_grad in enumerate(ctx.grad_requirements): + tensor_inputs[i].requires_grad = need_grad + inputs = unpack_non_tensors(tensor_inputs, ctx.packed_non_tensor_inputs) + + # Store the current states. + bwd_rng_state = utils.get_rng_state() + + # Set the states to what it used to be before the forward pass. + utils.set_rng_state(ctx.fwd_rng_state) + + with torch.enable_grad(): + unpacked_args, unpacked_kwargs = unpack_kwargs(ctx.kwarg_keys, inputs) + outputs = ctx.run_function(*unpacked_args, **unpacked_kwargs) + tensor_outputs, _ = split_non_tensors(outputs) + # Set the states back to what it was at the start of this function. 
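+            # (bwd_rng_state was captured above, before the RNG was reset for
+            # the recomputation, so later ops in this backward see it unchanged)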
+ utils.set_rng_state(bwd_rng_state) + + # Run backward() with only Tensors that require grad + outputs_with_grad = [] + args_with_grad = [] + for i in range(len(tensor_outputs)): + if tensor_outputs[i].requires_grad: + outputs_with_grad.append(tensor_outputs[i]) + args_with_grad.append(args[i]) + if len(outputs_with_grad) == 0: + raise RuntimeError( + "None of the outputs have requires_grad=True, " + "this checkpoint() is not necessary" + ) + + torch.autograd.backward(outputs_with_grad, args_with_grad) + + grads = tuple( + inp.grad if isinstance(inp, torch.Tensor) else None for inp in inputs + ) + return (None, None, None) + grads diff --git a/fairseq/modules/conformer_layer.py b/fairseq/modules/conformer_layer.py new file mode 100644 index 0000000000..964af243ec --- /dev/null +++ b/fairseq/modules/conformer_layer.py @@ -0,0 +1,301 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Optional + +import torch + +from fairseq.modules import ( + ESPNETMultiHeadedAttention, + LayerNorm, + MultiheadAttention, + RelPositionMultiHeadedAttention, + RotaryPositionMultiHeadedAttention, +) +from fairseq.utils import get_activation_fn + + +class ConvolutionModule(torch.nn.Module): + """Convolution block used in the conformer block""" + + def __init__( + self, + embed_dim, + channels, + depthwise_kernel_size, + dropout, + activation_fn="swish", + bias=False, + export=False, + ): + """ + Args: + embed_dim: Embedding dimension + channels: Number of channels in depthwise conv layers + depthwise_kernel_size: Depthwise conv layer kernel size + dropout: dropout value + activation_fn: Activation function to use after depthwise convolution kernel + bias: If bias should be added to conv layers + export: If layernorm should be exported to jit + """ + super(ConvolutionModule, self).__init__() + assert ( + depthwise_kernel_size - 1 + ) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding" + self.layer_norm = LayerNorm(embed_dim, export=export) + self.pointwise_conv1 = torch.nn.Conv1d( + embed_dim, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.glu = torch.nn.GLU(dim=1) + self.depthwise_conv = torch.nn.Conv1d( + channels, + channels, + depthwise_kernel_size, + stride=1, + padding=(depthwise_kernel_size - 1) // 2, + groups=channels, + bias=bias, + ) + self.batch_norm = torch.nn.BatchNorm1d(channels) + self.activation = get_activation_fn(activation_fn)(channels) + self.pointwise_conv2 = torch.nn.Conv1d( + channels, + embed_dim, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.dropout = torch.nn.Dropout(dropout) + + def forward(self, x): + """ + Args: + x: Input of shape B X T X C + Returns: + Tensor of shape B X T X C + """ + x = self.layer_norm(x) + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = self.glu(x) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.batch_norm(x) + x = self.activation(x) + + x = self.pointwise_conv2(x) + x = self.dropout(x) + return x.transpose(1, 2) + + +class FeedForwardModule(torch.nn.Module): + """Positionwise feed forward layer used in conformer""" + + def __init__( + self, + input_feat, + hidden_units, + dropout1, + dropout2, + activation_fn="swish", + bias=True, + ): + """ + Args: + input_feat: Input feature 
dimension + hidden_units: Hidden unit dimension + dropout1: dropout value for layer1 + dropout2: dropout value for layer2 + activation_fn: Name of activation function + bias: If linear layers should have bias + """ + + super(FeedForwardModule, self).__init__() + self.layer_norm = LayerNorm(input_feat) + self.w_1 = torch.nn.Linear(input_feat, hidden_units, bias=bias) + self.w_2 = torch.nn.Linear(hidden_units, input_feat, bias=bias) + self.dropout1 = torch.nn.Dropout(dropout1) + self.dropout2 = torch.nn.Dropout(dropout2) + self.activation = get_activation_fn(activation_fn)(hidden_units) + + def forward(self, x): + """ + Args: + x: Input Tensor of shape T X B X C + Returns: + Tensor of shape T X B X C + """ + x = self.layer_norm(x) + x = self.w_1(x) + x = self.activation(x) + x = self.dropout1(x) + x = self.w_2(x) + return self.dropout2(x) + + +class ConformerEncoderLayer(torch.nn.Module): + """Conformer block based on https://arxiv.org/abs/2005.08100. We currently don't support relative positional encoding in MHA""" + + def __init__( + self, + embed_dim, + ffn_embed_dim, + attention_heads, + dropout, + use_fp16, + depthwise_conv_kernel_size=31, + activation_fn="swish", + attn_type=None, + pos_enc_type="abs", + ): + """ + Args: + embed_dim: Input embedding dimension + ffn_embed_dim: FFN layer dimension + attention_heads: Number of attention heads in MHA + dropout: dropout value + depthwise_conv_kernel_size: Size of kernel in depthwise conv layer in convolution module + activation_fn: Activation function name to use in convulation block and feed forward block + attn_type: MHA implementation from ESPNET vs fairseq + pos_enc_type: Positional encoding type - abs, rope, rel_pos + """ + self.pos_enc_type = pos_enc_type + super(ConformerEncoderLayer, self).__init__() + + self.ffn1 = FeedForwardModule( + embed_dim, + ffn_embed_dim, + dropout, + dropout, + ) + + self.self_attn_layer_norm = LayerNorm(embed_dim, export=False) + self.self_attn_dropout = torch.nn.Dropout(dropout) + if attn_type == "espnet": + if self.pos_enc_type == "rel_pos": + self.self_attn = RelPositionMultiHeadedAttention( + embed_dim, + attention_heads, + dropout=dropout, + ) + elif self.pos_enc_type == "rope": + self.self_attn = RotaryPositionMultiHeadedAttention( + embed_dim, attention_heads, dropout=dropout, precision=use_fp16 + ) + elif self.pos_enc_type == "abs": + self.self_attn = ESPNETMultiHeadedAttention( + embed_dim, + attention_heads, + dropout=dropout, + ) + else: + raise Exception(f"Unsupported attention type {self.pos_enc_type}") + else: + # Default to fairseq MHA + self.self_attn = MultiheadAttention( + embed_dim, + attention_heads, + dropout=dropout, + ) + + self.conv_module = ConvolutionModule( + embed_dim=embed_dim, + channels=embed_dim, + depthwise_kernel_size=depthwise_conv_kernel_size, + dropout=dropout, + activation_fn=activation_fn, + ) + + self.ffn2 = FeedForwardModule( + embed_dim, + ffn_embed_dim, + dropout, + dropout, + activation_fn=activation_fn, + ) + self.final_layer_norm = LayerNorm(embed_dim, export=False) + + def forward( + self, + x, + encoder_padding_mask: Optional[torch.Tensor], + position_emb: Optional[torch.Tensor] = None, + ): + """ + Args: + x: Tensor of shape T X B X C + encoder_padding_mask: Optional mask tensor + positions: + Returns: + Tensor of shape T X B X C + """ + residual = x + x = self.ffn1(x) + x = x * 0.5 + residual + residual = x + x = self.self_attn_layer_norm(x) + if self.pos_enc_type == "rel_pos": + x, attn = self.self_attn( + query=x, + key=x, + value=x, + 
key_padding_mask=encoder_padding_mask, + pos_emb=position_emb, + need_weights=False, + ) + else: + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask, + need_weights=False, + ) + x = self.self_attn_dropout(x) + x = x + residual + + residual = x + # TBC to BTC + x = x.transpose(0, 1) + x = self.conv_module(x) + # BTC to TBC + x = x.transpose(0, 1) + x = residual + x + + residual = x + x = self.ffn2(x) + + layer_result = x + + x = x * 0.5 + residual + + x = self.final_layer_norm(x) + return x, (attn, layer_result) + + +class ConformerWav2Vec2EncoderLayer(ConformerEncoderLayer): + """Encoder layer for Wav2vec2 encoder""" + + def forward( + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + att_args=None, + position_emb=None, + ): + return super().forward(x, self_attn_padding_mask, position_emb) diff --git a/fairseq/modules/conv_tbc.py b/fairseq/modules/conv_tbc.py index 2dc46c4b9b..65e17ec94f 100644 --- a/fairseq/modules/conv_tbc.py +++ b/fairseq/modules/conv_tbc.py @@ -4,7 +4,9 @@ # LICENSE file in the root directory of this source tree. import torch +from torch import nn from torch.nn.modules.utils import _single +from torch import Tensor class ConvTBC(torch.nn.Module): @@ -26,11 +28,20 @@ def __init__(self, in_channels, out_channels, kernel_size, padding=0): ) self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - def forward(self, input): + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_normal_(self.weight) + nn.init.zeros_(self.bias) + + def conv_tbc(self, input: Tensor): return torch.conv_tbc( input.contiguous(), self.weight, self.bias, self.padding[0] ) + def forward(self, input: Tensor): + return self.conv_tbc(input) + def __repr__(self): s = ( "{name}({in_channels}, {out_channels}, kernel_size={kernel_size}" diff --git a/fairseq/modules/cross_entropy.py b/fairseq/modules/cross_entropy.py index 0d2beb44bb..286c00eecc 100644 --- a/fairseq/modules/cross_entropy.py +++ b/fairseq/modules/cross_entropy.py @@ -8,7 +8,6 @@ import torch import torch.nn.functional as F - logger = logging.getLogger(__name__) @@ -26,12 +25,14 @@ def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"): import xentropy_cuda from apex.contrib import xentropy - logger.info("using fused cross entropy") - def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): if logits.device == torch.device("cpu"): return _cross_entropy_pytorch(logits, target, ignore_index, reduction) else: + if not getattr(cross_entropy, "_has_logged_once", False): + logger.info("using fused cross entropy") + cross_entropy._has_logged_once = True + half_to_float = logits.dtype == torch.half losses = xentropy.SoftmaxCrossEntropyLoss.apply( logits, @@ -52,7 +53,6 @@ def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): else: raise NotImplementedError - except ImportError: def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): diff --git a/fairseq/modules/cuda_utils.cu b/fairseq/modules/cuda_utils.cu index 516f1d9244..924f852758 100644 --- a/fairseq/modules/cuda_utils.cu +++ b/fairseq/modules/cuda_utils.cu @@ -1,20 +1,17 @@ /** * Copyright (c) Facebook, Inc. and its affiliates. - * + * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ - -template <typename U, typename V> -constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { - return (a + b - 1) / b; +template <typename U, typename V> +constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { + return (a + b - 1) / b; } - -template<int FS, int SB, int padding_l, typename scalar_t> -__inline__ __device__ -void zeroSharedMem(scalar_t* data) { +template <int FS, int SB, int padding_l, typename scalar_t> +__inline__ __device__ void zeroSharedMem(scalar_t* data) { /* Given an array of length FS + SB, zero out the first padding_l and last (FS - padding_l) values in the array @@ -23,13 +20,11 @@ void zeroSharedMem(scalar_t* data) { int tid = threadIdx.x; if (FS < SB) { - // zero all if we have enough threads in a block to do all of them if (tid < padding_l || tid > SB - FS + padding_l - 1) { data[tid] = scalar_t(0.0); } } else { - // otherwise zero out one block at a time const int numIterations = divUp<int, int>(FS, SB); for (int i = 0; i < numIterations; i++) { @@ -43,9 +38,8 @@ void zeroSharedMem(scalar_t* data) { } } -template<typename scalar_t> -__inline__ __device__ -scalar_t warpReduce(scalar_t data) { +template <typename scalar_t> +__inline__ __device__ scalar_t warpReduce(scalar_t data) { /* Reduce an array within each warp. After processing all values in warp will caontain the sum of all original values in that warp. @@ -60,9 +54,8 @@ scalar_t warpReduce(scalar_t data) { return data; } -template<typename scalar_t> -__inline__ __device__ -scalar_t blockReduce(scalar_t data) { +template <typename scalar_t> +__inline__ __device__ scalar_t blockReduce(scalar_t data) { /* Reduce an entire array on the block level. After processing, the first value in the array will contain the reduced sum. @@ -82,7 +75,7 @@ scalar_t blockReduce(scalar_t data) { if (lane == 0) { warpSum[wid] = sum; } - + __syncthreads(); scalar_t v; @@ -102,21 +95,23 @@ scalar_t blockReduce(scalar_t data) { } void checkCudaStatus(cudaError_t status, int lineNumber = -1) { - if (status != cudaSuccess) { - std::cout << cudaGetErrorString(status) - << " at line " << lineNumber << std::endl; + std::cout << cudaGetErrorString(status) << " at line " << lineNumber + << std::endl; std::cout << "Exiting" << std::endl; exit(1); } } -template<int FS, int SB, int padding_l, typename scalar_t> -__device__ -void load_input_to_shared(const scalar_t* input, // global memory - int inputOffset, int sequenceLength, - int iteration, int numIterations, - bool no_prev, scalar_t* output /* shared memory */) { +template <int FS, int SB, int padding_l, typename scalar_t> +__device__ void load_input_to_shared( + const scalar_t* input, // global memory + int inputOffset, + int sequenceLength, + int iteration, + int numIterations, + bool no_prev, + scalar_t* output /* shared memory */) { /* Load a block size of input into shared memory with right and left overhang of total size FS. If previously @@ -138,19 +133,20 @@ void load_input_to_shared(const scalar_t* input, // global memory // Load the left "overhang" of input if (iteration > 0) { if (padding_l < SB) { - // load all at once if (tid < padding_l) { - output[tid] = (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB]; + output[tid] = + (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB]; } } else { - // load in chunks of size SB int numIterations = divUp<int, int>(padding_l, SB); for (int i = 0; i < numIterations; i++) { int offset = i * SB; if ((tid + offset) < padding_l) { - output[tid + offset] = (no_prev) ? 
input[inputOffset - padding_l + tid + offset] : output[tid + offset + SB]; + output[tid + offset] = (no_prev) + ? input[inputOffset - padding_l + tid + offset] + : output[tid + offset + SB]; } } } @@ -158,22 +154,25 @@ void load_input_to_shared(const scalar_t* input, // global memory // Load the right "overhang" of input if (iteration < (numIterations - 1)) { - const int elementsLeft = sequenceLength - (iteration+1) * SB; + const int elementsLeft = sequenceLength - (iteration + 1) * SB; if ((FS - padding_l) < SB) { - // load all at once if (tid < (FS - padding_l)) { - output[padding_l + SB + tid] = (tid < elementsLeft) ? input[inputOffset + SB + tid] : scalar_t(0.0); + output[padding_l + SB + tid] = (tid < elementsLeft) + ? input[inputOffset + SB + tid] + : scalar_t(0.0); } } else { - // load in chunks of size SB int numIterations = divUp<int, int>(FS - padding_l, SB); for (int i = 0; i < numIterations; i++) { int offset = i * SB; if ((tid + offset) < (FS - padding_l)) { - output[padding_l + SB + tid + offset] = ((tid + offset) < elementsLeft) ? input[inputOffset + SB + tid + offset] : scalar_t(0.0); + output[padding_l + SB + tid + offset] = + ((tid + offset) < elementsLeft) + ? input[inputOffset + SB + tid + offset] + : scalar_t(0.0); } } } @@ -182,13 +181,11 @@ void load_input_to_shared(const scalar_t* input, // global memory // We should also clear out the right "overhang" if (iteration == (numIterations - 1)) { if ((FS - padding_l) < SB) { - // clear out all at once if (tid < (FS - padding_l)) { - output[padding_l + SB + tid] = scalar_t(0.0); + output[padding_l + SB + tid] = scalar_t(0.0); } } else { - // clear in chunks of size SB int numIterations = divUp<int, int>(FS - padding_l, SB); for (int i = 0; i < numIterations; i++) { @@ -199,5 +196,7 @@ void load_input_to_shared(const scalar_t* input, // global memory } } } - output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) ? input[inputOffset + tid] : scalar_t(0.0); + output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) + ? input[inputOffset + tid] + : scalar_t(0.0); } diff --git a/fairseq/modules/downsampled_multihead_attention.py b/fairseq/modules/downsampled_multihead_attention.py index 2cdece3f7f..5e42942a9f 100644 --- a/fairseq/modules/downsampled_multihead_attention.py +++ b/fairseq/modules/downsampled_multihead_attention.py @@ -9,6 +9,7 @@ import torch import torch.nn as nn import torch.nn.functional as F + from fairseq.modules.fairseq_dropout import FairseqDropout from fairseq.modules.scalar_bias import scalar_bias @@ -69,7 +70,7 @@ def __init__( else: self.out_proj = Linear(out_proj_size, out_channels, bias=bias) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 def forward( self, diff --git a/fairseq/modules/dynamic_convolution.py b/fairseq/modules/dynamic_convolution.py index 5999a04539..0ff02cd62a 100644 --- a/fairseq/modules/dynamic_convolution.py +++ b/fairseq/modules/dynamic_convolution.py @@ -3,12 +3,18 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Dict, Optional + import torch import torch.nn as nn import torch.nn.functional as F from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state +from fairseq.incremental_decoding_utils import ( + FairseqIncrementalState, + with_incremental_state, +) from fairseq.modules.fairseq_dropout import FairseqDropout +from torch import Tensor from .unfold import unfold1d @@ -37,7 +43,10 @@ def DynamicConv( num_heads=num_heads, weight_dropout=weight_dropout, weight_softmax=weight_softmax, + renorm_padding=renorm_padding, bias=bias, + conv_bias=conv_bias, + query_size=query_size, ) except ImportError as e: print(e) @@ -48,7 +57,10 @@ def DynamicConv( num_heads=num_heads, weight_dropout=weight_dropout, weight_softmax=weight_softmax, + renorm_padding=renorm_padding, bias=bias, + conv_bias=conv_bias, + query_size=query_size, ) @@ -257,7 +269,7 @@ def _forward_expanded(self, x, incremental_stat, query): weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False) else: P = self.padding_l - # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length + # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length if K > T and P == K - 1: weight = weight.narrow(2, K - T, T) K, P = T, T - 1 @@ -302,3 +314,213 @@ def extra_repr(self): if self.weight_dropout_module.p > 0.0: s += ", weight_dropout={}".format(self.weight_dropout_module.p) return s + + +class DynamicConv_scripatable(nn.Module, FairseqIncrementalState): + """Dynamic lightweight convolution taking T x B x C inputs + Args: + input_size: # of channels of the input + kernel_size: convolution channels + padding_l: padding to the left when using "same" padding + num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size) + weight_dropout: the drop rate of the DropConnect to drop the weight + weight_softmax: normalize the weight with softmax before the convolution + renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1) + bias: use bias + conv_bias: bias of the convolution + query_size: specified when feeding a different input as the query + in_proj: project the input and generate the filter together + + Shape: + Input: TxBxC, i.e. (timesteps, batch_size, input_size) + Output: TxBxC, i.e. 
(timesteps, batch_size, input_size) + + Attributes: + weight: the learnable weights of the module of shape + `(num_heads, 1, kernel_size)` + bias: the learnable bias of the module of shape `(input_size)` + """ + + def __init__( + self, + input_size, + kernel_size=1, + padding_l=None, + num_heads=1, + weight_dropout=0.0, + weight_softmax=False, + renorm_padding=False, + bias=False, + conv_bias=False, + query_size=None, + in_proj=False, + ): + super().__init__() + self.input_size = input_size + self.query_size = input_size if query_size is None else query_size + self.kernel_size = kernel_size + self.padding_l = padding_l + self.num_heads = num_heads + self.weight_dropout_module = FairseqDropout( + weight_dropout, module_name=self.__class__.__name__ + ) + self.weight_softmax = weight_softmax + self.renorm_padding = renorm_padding + + if in_proj: + self.weight_linear = Linear( + self.input_size, self.input_size + num_heads * kernel_size * 1 + ) + else: + self.weight_linear = Linear( + self.query_size, num_heads * kernel_size * 1, bias=bias + ) + self.in_proj = ( + self.weight_linear.out_features + == self.input_size + self.num_heads * self.kernel_size + ) + self.has_conv_bias = conv_bias + self.conv_bias = nn.Parameter(torch.Tensor(input_size).view(1, 1, -1)) + self.init_incremental_state() + + self.reset_parameters() + + def reset_parameters(self): + self.weight_linear.reset_parameters() + if self.has_conv_bias: + nn.init.constant_(self.conv_bias, 0.0) + + def forward( + self, + x, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + query: Optional[Tensor] = None, + ): + """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C + args: + x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size) + incremental_state: A dict to keep the state + unfold: unfold the input or not. If not, we use the matrix trick instead + query: use the specified query to predict the conv filters + """ + assert query is None or not self.in_proj + + if query is None: + query = x + + output = self._forward_unfolded(x, incremental_state, query) + + if self.has_conv_bias: + output = output + self.conv_bias + return output + + def _forward_unfolded( + self, + x, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + query, + ): + """The conventional implementation of convolutions. 
+ Unfolding the input by having a window shifting to the right.""" + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + + TxBxH = T * B * H + + if self.in_proj: + proj = self.weight_linear(x) + x = proj.narrow(2, 0, self.input_size).contiguous() + weight = proj.narrow(2, self.input_size, H * K).contiguous().view(TxBxH, -1) + else: + weight = self.weight_linear(query).view(TxBxH, -1) + + # renorm_padding is only implemented in _forward_expanded + assert not self.renorm_padding or incremental_state is not None + + if incremental_state is not None: + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) + else: + x_unfold = x.unsqueeze(3).clone() + if self.kernel_size > 1: + self._set_input_buffer( + incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] + ) + x_unfold = x_unfold.view(TxBxH, R, -1) + else: + padding_l = self.padding_l + if K > T and padding_l == K - 1: + weight = weight.narrow(1, K - T, T) + K, padding_l = T, T - 1 + # unfold the input: T x B x C --> T' x B x C x K + x_unfold = unfold1d(x, K, padding_l, 0.0) + x_unfold = x_unfold.view(TxBxH, R, K) + + if self.weight_softmax and not self.renorm_padding: + weight = F.softmax(weight, dim=1) + weight = weight.narrow(1, 0, K) + + if incremental_state is not None: + weight = weight[:, -(x_unfold.size(2)) :] + K = weight.size(1) + + if self.weight_softmax and self.renorm_padding: + weight = F.softmax(weight, dim=1) + + weight = self.weight_dropout_module(weight, inplace=False) + + output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T x B x H x R x 1 + output = output.view(T, B, C) + return output + + def reorder_incremental_state( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + new_order: Tensor, + ): + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + input_buffer = input_buffer.index_select(1, new_order) + self._set_input_buffer(incremental_state, input_buffer) + + def _get_input_buffer( + self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + ): + result = self.get_incremental_state(incremental_state, "input_buffer") + if result is not None and "input_buffer" in result: + return result["input_buffer"] + else: + return None + + def _set_input_buffer( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + new_buffer: Optional[Tensor], + ): + result = self.set_incremental_state( + incremental_state, "input_buffer", {"input_buffer": new_buffer} + ) + if result is not None: + incremental_state = result + return incremental_state + + def extra_repr(self): + s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format( # noqa + self.input_size, + self.kernel_size, + self.padding_l, + self.num_heads, + self.weight_softmax, + self.conv_bias is not None, + self.renorm_padding, + self.in_proj, + ) + + if self.query_size != self.input_size: + s += ", query_size={}".format(self.query_size) + if self.weight_dropout_module.p > 0.0: + s += ", weight_dropout={}".format(self.weight_dropout_module.p) + return s diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp index ebd4df0e96..744c363e55 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp @@ -8,10 +8,8 @@ 
#include <torch/extension.h> #include <vector> -std::vector<at::Tensor> dynamicconv_cuda_forward( - at::Tensor input, - at::Tensor filters, - int padding_l); +std::vector<at::Tensor> +dynamicconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l); std::vector<at::Tensor> dynamicconv_cuda_backward( at::Tensor gradOutput, @@ -19,21 +17,20 @@ std::vector<at::Tensor> dynamicconv_cuda_backward( at::Tensor input, at::Tensor filters); +#define CHECK_CUDA(x) \ + AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) -#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -std::vector<at::Tensor> dynamicconv_forward( - at::Tensor input, - at::Tensor filters, - int padding_l) { +std::vector<at::Tensor> +dynamicconv_forward(at::Tensor input, at::Tensor filters, int padding_l) { + CHECK_INPUT(input); + CHECK_INPUT(filters); - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return dynamicconv_cuda_forward(input, filters, - padding_l); + return dynamicconv_cuda_forward(input, filters, padding_l); } std::vector<at::Tensor> dynamicconv_backward( @@ -41,16 +38,14 @@ std::vector<at::Tensor> dynamicconv_backward( int padding_l, at::Tensor input, at::Tensor filters) { + CHECK_INPUT(gradOutput); + CHECK_INPUT(input); + CHECK_INPUT(filters); - CHECK_INPUT(gradOutput); - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return dynamicconv_cuda_backward(gradOutput, padding_l, - input, filters); + return dynamicconv_cuda_backward(gradOutput, padding_l, input, filters); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)"); - m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)"); + m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)"); + m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)"); } diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh index 2196259433..44baf21bdd 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh @@ -1,6 +1,6 @@ /** * Copyright (c) Facebook, Inc. and its affiliates. - * + * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ @@ -19,26 +19,25 @@ #include <utility> #include <vector> -#include <stdlib.h> #include <assert.h> #include <math.h> +#include <stdlib.h> #define SHFL_MASK 0xffffffff -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void dynamicconv_forward_kernel(const scalar_t* input, - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* output); +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void dynamicconv_forward_kernel( + const scalar_t* input, + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* output); -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void dynamicconv_backward_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void dynamicconv_backward_kernel( const scalar_t* gradOutput, // B * C * T const scalar_t* input, // B * C * T const scalar_t* weight, diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu index 300d35b647..4630f1e982 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu @@ -1,26 +1,26 @@ /** * Copyright (c) Facebook, Inc. and its affiliates. - * + * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +#include "../cuda_utils.cu" #include "dynamicconv_cuda.cuh" -#include "dynamicconv_cuda_forward.cu" #include "dynamicconv_cuda_backward.cu" -#include "../cuda_utils.cu" +#include "dynamicconv_cuda_forward.cu" // FS is filter size and kernels are specialized for filter sizes -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void dynamicconv_forward_kernel(const scalar_t* input, - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* output) { +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void dynamicconv_forward_kernel( + const scalar_t* input, + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* output) { assert(blockDim.x == SB); const int tid = threadIdx.x; @@ -28,8 +28,8 @@ void dynamicconv_forward_kernel(const scalar_t* input, const int featureIdx = blockIdx.y; const int head = featureIdx / numFiltersInBlock; - const int IOOffset = batchIdx * numFeatures * sequenceLength - + featureIdx * sequenceLength; + const int IOOffset = + batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; const scalar_t* inputFeature = &input[IOOffset]; scalar_t* outputFeature = &output[IOOffset]; @@ -43,36 +43,36 @@ void dynamicconv_forward_kernel(const scalar_t* input, for (int i = 0; i < numIterations; ++i) { __syncthreads(); const int inputOffset = i * SB; - load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, - sequenceLength, i, - numIterations, false, tempInput); + load_input_to_shared<FS, SB, padding_l>( + inputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + false, + tempInput); __syncthreads(); if (inputOffset + tid < sequenceLength) { - - #pragma unroll +#pragma unroll for (int k = 0; k < FS; ++k) { - const int filterOffset = batchIdx * numHeads * FS * sequenceLength - + head * FS * sequenceLength - + k * sequenceLength 
- + i * SB + tid; + const int filterOffset = batchIdx * numHeads * FS * sequenceLength + + head * FS * sequenceLength + k * sequenceLength + i * SB + tid; filter[k] = weight[filterOffset]; } scalar_t out = scalar_t(0.0); - #pragma unroll +#pragma unroll for (int k = 0; k < FS; ++k) { out += filter[k] * tempInput[tid + k]; } outputFeature[inputOffset + tid] = out; - } } } -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void dynamicconv_backward_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void dynamicconv_backward_kernel( const scalar_t* gradOutput, // B * C * T const scalar_t* input, // B * C * T const scalar_t* weight, @@ -111,52 +111,60 @@ void dynamicconv_backward_kernel( int idxOffset = inputOffset + tid + k - padding; if (idxOffset >= 0 && idxOffset < sequenceLength) { - int bfilterOffset = batchIdx * numHeads * FS * sequenceLength - + headIdx * FS * sequenceLength - + (FS - k - 1) * sequenceLength - + idxOffset; + int bfilterOffset = batchIdx * numHeads * FS * sequenceLength + + headIdx * FS * sequenceLength + (FS - k - 1) * sequenceLength + + idxOffset; bfilter[k] = weight[bfilterOffset]; } else { bfilter[k] = scalar_t(0.0); } } - // iterate over filter block for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) { __syncthreads(); // load input and output gradient for this channel and chunk - const int IOOffset = batchIdx * numFeatures * sequenceLength - + (headIdx * numFiltersInBlock + featureIdx) * sequenceLength; + const int IOOffset = batchIdx * numFeatures * sequenceLength + + (headIdx * numFiltersInBlock + featureIdx) * sequenceLength; const scalar_t* inputFeature = &input[IOOffset]; const scalar_t* gradOutputFeature = &gradOutput[IOOffset]; scalar_t* gradInputFeature = &gradInput[IOOffset]; - load_input_to_shared<FS, SB, padding>(gradOutputFeature, inputOffset, - sequenceLength, chunkIdx, - numChunks, true, tempGradOutput); - load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, - sequenceLength, chunkIdx, - numChunks, true, tempInput); + load_input_to_shared<FS, SB, padding>( + gradOutputFeature, + inputOffset, + sequenceLength, + chunkIdx, + numChunks, + true, + tempGradOutput); + load_input_to_shared<FS, SB, padding_l>( + inputFeature, + inputOffset, + sequenceLength, + chunkIdx, + numChunks, + true, + tempInput); __syncthreads(); - + // sum input and weight gradients scalar_t out = scalar_t(0.0); - #pragma unroll +#pragma unroll for (int k = 0; k < FS; ++k) { tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding]; out += bfilter[k] * tempGradOutput[tid + k]; } - + if (inputOffset + tid < sequenceLength) { gradInputFeature[inputOffset + tid] = out; } } - const int gradOffset = batchIdx * numHeads * FS * sequenceLength - + headIdx * FS * sequenceLength; - scalar_t *gradWeightFeature = &gradWeight[gradOffset]; + const int gradOffset = + batchIdx * numHeads * FS * sequenceLength + headIdx * FS * sequenceLength; + scalar_t* gradWeightFeature = &gradWeight[gradOffset]; // write weight gradient if (inputOffset + tid < sequenceLength) { diff --git a/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py index 4a683d2690..711ed03483 100644 --- a/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py +++ b/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py @@ -212,7 +212,7 @@ def _forward_expanded(self, x, incremental_stat, query): weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False) else: P = 
self.padding_l - # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length + # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length if K > T and P == K - 1: weight = weight.narrow(2, K - T, T) K, P = T, T - 1 diff --git a/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp index 8a6af4285d..d7e57c8590 100644 --- a/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp +++ b/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp @@ -1,10 +1,8 @@ #include <torch/torch.h> #include <vector> -std::vector<float*> dynamicconv_cpu_forward( - float* input, - float* filters, - int padding_l); +std::vector<float*> +dynamicconv_cpu_forward(float* input, float* filters, int padding_l); std::vector<float*> dynamicconv_cpu_backward( float* gradOutput, @@ -12,12 +10,9 @@ std::vector<float*> dynamicconv_cpu_backward( float* input, float* filters); -std::vector<float*> dynamicconv_forward( - float* input, - float* filters, - int padding_l) { - - return dynamicconv_cpu_forward(input, filters, padding_l); +std::vector<float*> +dynamicconv_forward(float* input, float* filters, int padding_l) { + return dynamicconv_cpu_forward(input, filters, padding_l); } std::vector<float*> dynamicconv_backward( @@ -25,11 +20,10 @@ std::vector<float*> dynamicconv_backward( int padding_l, float* input, float* filters) { - - return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); + return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)"); - m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)"); + m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)"); + m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)"); } diff --git a/fairseq/modules/ema_module.py b/fairseq/modules/ema_module.py new file mode 100644 index 0000000000..f0ece842d4 --- /dev/null +++ b/fairseq/modules/ema_module.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 + +""" +Used for EMA tracking a given pytorch module. The user is responsible for calling step() +and setting the appropriate decay +""" + +import copy +from dataclasses import dataclass, field +import logging + +import torch + +from omegaconf import II +from fairseq.dataclass import FairseqDataclass + +try: + from amp_C import multi_tensor_l2norm + + multi_tensor_l2norm_available = True +except ImportError: + multi_tensor_l2norm_available = False + +logger = logging.getLogger(__name__) + + +@dataclass +class EMAModuleConfig(FairseqDataclass): + ema_decay: float = field( + default=0.9999, metadata={"help": "decay for exponential moving average model"} + ) + ema_fp32: bool = field( + default=False, + metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, + ) + add_missing_params: bool = True + log_norms: bool = False + + +class EMAModule: + """Exponential Moving Average of Fairseq Models""" + + def __init__( + self, + model, + config: EMAModuleConfig, + copy_model=True, + device=None, + skip_keys=None, + ): + """ + @param model model to initialize the EMA with + @param config EMAConfig object with configuration like + ema_decay, ema_update_freq, ema_fp32 + @param device If provided, copy EMA to this device (e.g. gpu). + Otherwise EMA is in the same device as the model. 
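+        @param copy_model If True (default), deep-copy the model so the EMA
+            weights are tracked separately from the live model.
+        @param skip_keys Optional collection of parameter names that are copied
+            verbatim from the model instead of being exponentially averaged.
+
+        Usage (a minimal sketch; ``model`` is any existing nn.Module and the
+        decay value is only illustrative)::
+
+            ema = EMAModule(model, EMAModuleConfig(ema_decay=0.999))
+            ema.step(model)              # call after each optimizer update
+            model = ema.reverse(model)   # copy averaged weights back, e.g. for eval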
+ """ + + self.config = config + + if copy_model: + self.model = copy.deepcopy(model) + self.model.requires_grad_(False) + else: + self.model = model + + self.config = config + self.decay = config.ema_decay + self.skip_keys = skip_keys or set() + self.add_missing_params = config.add_missing_params + self.fp32_params = {} + + if device is not None: + logging.info(f"Copying EMA model to device {device}") + self.model = self.model.to(device=device) + + if self.config.ema_fp32: + self.build_fp32_params() + + self.log_norms = config.log_norms and multi_tensor_l2norm_available + self.logs = {} + + def build_fp32_params(self, state_dict=None): + """ + Store a copy of the EMA params in fp32. + If state dict is passed, the EMA params is copied from + the provided state dict. Otherwise, it is copied from the + current EMA model parameters. + """ + if not self.config.ema_fp32: + raise RuntimeError( + "build_fp32_params should not be called if ema_fp32=False. " + "Use ema_fp32=True if this is really intended." + ) + + if state_dict is None: + state_dict = self.model.state_dict() + + def _to_float(t): + return t.float() if torch.is_floating_point(t) else t + + for param_key in state_dict: + if param_key in self.fp32_params: + if param_key == "__sq_mom": + self.fp32_params[param_key] = state_dict[param_key] + else: + self.fp32_params[param_key].copy_(state_dict[param_key]) + else: + self.fp32_params[param_key] = _to_float(state_dict[param_key]) + if "__sq_mom" in self.fp32_params: + self.fp32_params["__sq_mom"][param_key] = torch.zeros_like( + self.fp32_params[param_key] + ) + + def restore(self, state_dict, build_fp32_params=False): + """Load data from a model spec into EMA model""" + self.model.load_state_dict(state_dict, strict=False) + if build_fp32_params: + self.build_fp32_params(state_dict) + + def set_decay(self, decay, weight_decay=None): + self.decay = decay + if weight_decay is not None: + self.weight_decay = weight_decay + + def get_decay(self): + return self.decay + + def _step_internal(self, new_model): + """One update of the EMA model based on new model weights""" + decay = self.decay + + ema_state_dict = {} + ema_params = ( + self.fp32_params if self.config.ema_fp32 else self.model.state_dict() + ) + + new_p = [] + ema_p = [] + + for key, param in new_model.named_parameters(): + if isinstance(param, dict): + continue + + if not self.add_missing_params and key not in ema_params: + continue + + try: + ema_param = ema_params[key] + except KeyError: + ema_param = ( + param.float().clone() if param.ndim == 1 else copy.deepcopy(param) + ) + ema_params[key] = ema_param + + if param.shape != ema_param.shape: + raise ValueError( + "incompatible tensor shapes between model param and ema param" + + "{} vs. 
{}".format(param.shape, ema_param.shape) + ) + + if "version" in key: + # Do not decay a model.version pytorch param + continue + + lr = 1 - decay + + if key in self.skip_keys or not param.requires_grad: + ema_params[key].copy_(param.to(dtype=ema_param.dtype).data) + ema_param = ema_params[key] + else: + if self.log_norms: + new_p.append(param) + ema_p.append(ema_param) + + ema_param.mul_(1 - lr) + ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=lr) + + ema_state_dict[key] = ema_param + + for key, param in new_model.named_buffers(): + ema_state_dict[key] = param + + if self.log_norms: + if "model_norm" in self.logs: + self.prev_model_norm = self.logs["model_norm"] + + chunk_size = 2048 * 32 + has_inf = torch.zeros( + (1, 1), dtype=torch.int, device=next(new_model.parameters()).device + ) + + new_norm = multi_tensor_l2norm(chunk_size, has_inf, [new_p], False) + old_norm = multi_tensor_l2norm(chunk_size, has_inf, [ema_p], False) + + self.logs["model_norm"] = new_norm[0] + self.logs["ema_norm"] = old_norm[0] + + self.restore(ema_state_dict, build_fp32_params=False) + + @torch.no_grad() + def step(self, new_model): + self._step_internal(new_model) + + def reverse(self, model): + """ + Load the model parameters from EMA model. + Useful for inference or fine-tuning from the EMA model. + """ + d = self.model.state_dict() + if "_ema" in d: + del d["_ema"] + + model.load_state_dict(d, strict=False) + return model diff --git a/fairseq/modules/espnet_multihead_attention.py b/fairseq/modules/espnet_multihead_attention.py new file mode 100644 index 0000000000..82bc0d7b45 --- /dev/null +++ b/fairseq/modules/espnet_multihead_attention.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Shigeki Karita +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Multi-Head Attention layer definition.""" + +import math + +import torch +from torch import nn + +from fairseq.modules.rotary_positional_embedding import ( + RotaryPositionalEmbedding, + apply_rotary_pos_emb, +) + + +class ESPNETMultiHeadedAttention(nn.Module): + """Multi-Head Attention layer. + Args: + n_head: The number of heads. + n_feat: The number of features. + dropout: Dropout rate. + """ + + def __init__(self, n_feat, n_head, dropout): + """Construct an MultiHeadedAttention object.""" + super(ESPNETMultiHeadedAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout) + + def forward_qkv(self, query, key, value, **kwargs): + """Transform query, key and value. 
+ Args: + query: Query tensor B X T1 X C + key: Key tensor B X T2 X C + value: Value tensor B X T2 X C + Returns: + torch.Tensor: Transformed query tensor B X n_head X T1 X d_k + torch.Tensor: Transformed key tensor B X n_head X T2 X d_k + torch.Tensor: Transformed value tensor B X n_head X T2 X d_k + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + return q, k, v + + def forward_attention(self, value, scores, mask): + """Compute attention context vector. + Args: + value: Transformed value B X n_head X T2 X d_k. + scores: Attention score B X n_head X T1 X T2 + mask: Mask T2 X B + Returns: + torch.Tensor: Transformed value B X T1 X d_model + weighted by the attention score B X T1 X T2 + """ + n_batch = value.size(0) + if mask is not None: + scores = scores.masked_fill( + mask.unsqueeze(1).unsqueeze(2).to(bool), + float("-inf"), # (batch, head, time1, time2) + ) + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, query, key, value, key_padding_mask=None, **kwargs): + """Compute scaled dot product attention. + Args: + query (torch.Tensor): Query tensor T X B X C + key (torch.Tensor): Key tensor T X B X C + value (torch.Tensor): Value tensor T X B X C + mask (torch.Tensor): Mask tensor T X B + Returns: + torch.Tensor: Output tensor T X B X D. + """ + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores = self.forward_attention(v, scores, key_padding_mask) + scores = scores.transpose(0, 1) + return scores, None + + +class RelPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head: The number of heads. + n_feat: The number of features. + dropout: Dropout rate. + zero_triu: Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_feat, n_head, dropout, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_feat, n_head, dropout) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.zeros(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.zeros(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """Compute relative positional encoding. + Args: + x: Input tensor B X n_head X T X 2T-1 + Returns: + torch.Tensor: Output tensor. 
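+        Note:
+            This is the relative-shift trick from Transformer-XL: a zero column
+            is prepended and the tensor is reshaped so that, for every query
+            position, the scores line up with relative offsets 0..time2.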
+ """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + + if self.zero_triu: + ones = torch.ones((x.size(2), x.size(3)), device=x.device) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, key_padding_mask=None, **kwargs): + """Compute scaled dot product attention. + Args: + query: Query tensor T X B X C + key: Key tensor T X B X C + value: Value tensor T X B X C + pos_emb: Positional embedding tensor B X 2T-1 X C + key_padding_mask: Mask tensor T X B + Returns: + torch.Tensor: Output tensor T X B X C. + """ + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + pos_emb = pos_emb.transpose(0, 1) + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k + ) # (batch, head, time1, time2) + + scores = self.forward_attention(v, scores, key_padding_mask) + scores = scores.transpose(0, 1) + return scores, None + + +class RotaryPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): + def __init__( + self, + n_feat, + n_head, + dropout, + precision, + rotary_emd_base=10000, + ): + """Construct an RotaryPositionMultiHeadedAttention object.""" + super().__init__(n_feat, n_head, dropout) + precision = torch.float + self.rotary_ndims = self.d_k # also try self.d_k//2 + if precision == "fp16": + precision = torch.half + + self.rotary_emb = RotaryPositionalEmbedding( + self.rotary_ndims, base=rotary_emd_base, precision=precision + ) + + def forward(self, query, key, value, key_padding_mask=None, **kwargs): + """Compute rotary position attention. + Args: + query: Query tensor T X B X C + key: Key tensor T X B X C + value: Value tensor T X B X C + key_padding_mask: Mask tensor T X B + Returns: + torch.Tensor: Output tensor T X B X D. 
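+        Rotary position embeddings are applied to the per-head query and key
+        tensors before the usual QKV projections; the value path is unchanged.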
+ Notes: + Assumes self attn + """ + + T, B, C = value.size() + query = query.view(T, B, self.h, self.d_k) + key = key.view(T, B, self.h, self.d_k) + value = value.view(T, B, self.h, self.d_k) + cos, sin = self.rotary_emb(value, seq_len=T) + query, key = apply_rotary_pos_emb( + query, key, cos, sin, offset=0 + ) # offset is based on layer_past + + query = query.view(T, B, self.h * self.d_k) + key = key.view(T, B, self.h * self.d_k) + value = value.view(T, B, self.h * self.d_k) + + # TBD to BTD + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores = self.forward_attention(v, scores, key_padding_mask) + scores = scores.transpose(0, 1) + return scores, None diff --git a/fairseq/modules/fairseq_dropout.py b/fairseq/modules/fairseq_dropout.py index f070a804e6..3cddca7718 100644 --- a/fairseq/modules/fairseq_dropout.py +++ b/fairseq/modules/fairseq_dropout.py @@ -21,7 +21,7 @@ def __init__(self, p, module_name=None): self.apply_during_inference = False def forward(self, x, inplace: bool = False): - if self.training or self.apply_during_inference: + if self.p > 0 and (self.training or self.apply_during_inference): return F.dropout(x, p=self.p, training=True, inplace=inplace) else: return x diff --git a/fairseq/modules/fp32_batch_norm.py b/fairseq/modules/fp32_batch_norm.py new file mode 100644 index 0000000000..c560f338fd --- /dev/null +++ b/fairseq/modules/fp32_batch_norm.py @@ -0,0 +1,44 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +batch norm done in fp32 (for fp16 training) +""" +import torch +import torch.nn as nn + + +class Fp32BatchNorm(nn.Module): + def __init__(self, sync=False, *args, **kwargs): + super().__init__() + + if sync: + from fairseq.distributed import utils + + if utils.get_global_world_size() == 1: + sync = False + + if sync: + self.bn = nn.SyncBatchNorm(*args, **kwargs) + else: + self.bn = nn.BatchNorm1d(*args, **kwargs) + + self.sync = sync + + def forward(self, input): + if self.bn.running_mean.dtype != torch.float: + if self.sync: + self.bn.running_mean = self.bn.running_mean.float() + self.bn.running_var = self.bn.running_var.float() + if self.bn.affine: + try: + self.bn.weight = self.bn.weight.float() + self.bn.bias = self.bn.bias.float() + except: + self.bn.float() + else: + self.bn.float() + + output = self.bn(input.float()) + return output.type_as(input) diff --git a/fairseq/modules/fp32_instance_norm.py b/fairseq/modules/fp32_instance_norm.py new file mode 100644 index 0000000000..30a54496de --- /dev/null +++ b/fairseq/modules/fp32_instance_norm.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+""" +Layer norm done in fp32 (for fp16 training) +""" + +import torch.nn as nn +import torch.nn.functional as F + + +class Fp32InstanceNorm(nn.InstanceNorm1d): + def __init__(self, *args, **kwargs): + self.transpose_last = "transpose_last" in kwargs and kwargs["transpose_last"] + if "transpose_last" in kwargs: + del kwargs["transpose_last"] + super().__init__(*args, **kwargs) + + def forward(self, input): + if self.transpose_last: + input = input.transpose(1, 2) + output = F.instance_norm( + input.float(), + running_mean=self.running_mean, + running_var=self.running_var, + weight=self.weight.float() if self.weight is not None else None, + bias=self.bias.float() if self.bias is not None else None, + use_input_stats=self.training or not self.track_running_stats, + momentum=self.momentum, + eps=self.eps, + ) + if self.transpose_last: + output = output.transpose(1, 2) + return output.type_as(input) diff --git a/fairseq/modules/gumbel_vector_quantizer.py b/fairseq/modules/gumbel_vector_quantizer.py index 47657bb0ab..867b019f67 100644 --- a/fairseq/modules/gumbel_vector_quantizer.py +++ b/fairseq/modules/gumbel_vector_quantizer.py @@ -21,6 +21,8 @@ def __init__( activation=nn.GELU(), weight_proj_depth=1, weight_proj_factor=1, + hard=True, + std=0, ): """Vector quantization using gumbel softmax @@ -44,6 +46,7 @@ def __init__( self.input_dim = dim self.num_vars = num_vars self.time_first = time_first + self.hard = hard assert ( vq_dim % groups == 0 @@ -53,7 +56,10 @@ def __init__( num_groups = groups if not combine_groups else 1 self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim)) - nn.init.uniform_(self.vars) + if std == 0: + nn.init.uniform_(self.vars) + else: + nn.init.normal_(self.vars, mean=0, std=std) if weight_proj_depth > 1: @@ -73,7 +79,11 @@ def block(input_dim, output_dim): nn.init.normal_(self.weight_proj.weight, mean=0, std=1) nn.init.zeros_(self.weight_proj.bias) - assert len(temp) == 3, temp + if isinstance(temp, str): + import ast + + temp = ast.literal_eval(temp) + assert len(temp) == 3, f"{temp}, {len(temp)}" self.max_temp, self.min_temp, self.temp_decay = temp self.curr_temp = self.max_temp @@ -81,7 +91,7 @@ def block(input_dim, output_dim): def set_num_updates(self, num_updates): self.curr_temp = max( - self.max_temp * self.temp_decay ** num_updates, self.min_temp + self.max_temp * self.temp_decay**num_updates, self.min_temp ) def get_codebook_indices(self): @@ -96,7 +106,7 @@ def get_codebook_indices(self): if not self.combine_groups: self.codebook_indices = self.codebook_indices.view( - self.num_vars ** self.groups, -1 + self.num_vars**self.groups, -1 ) for b in range(1, self.groups): self.codebook_indices[:, b] += self.num_vars * b @@ -108,7 +118,7 @@ def codebook(self): return ( self.vars.squeeze(0) .index_select(0, indices) - .view(self.num_vars ** self.groups, -1) + .view(self.num_vars**self.groups, -1) ) def sample_from_codebook(self, b, n): @@ -128,7 +138,7 @@ def to_codebook_index(self, indices): res = indices.new_full(indices.shape[:-1], 0) for i in range(self.groups): exponent = self.groups - i - 1 - res += indices[..., i] * (self.num_vars ** exponent) + res += indices[..., i] * (self.num_vars**exponent) return res def forward_idx(self, x): @@ -147,16 +157,17 @@ def forward(self, x, produce_targets=False): x = self.weight_proj(x) x = x.view(bsz * tsz * self.groups, -1) - _, k = x.max(-1) - hard_x = ( - x.new_zeros(*x.shape) - .scatter_(-1, k.view(-1, 1), 1.0) - .view(bsz * tsz, self.groups, -1) - ) - hard_probs = torch.mean(hard_x.float(), 
dim=0) - result["code_perplexity"] = torch.exp( - -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) - ).sum() + with torch.no_grad(): + _, k = x.max(-1) + hard_x = ( + x.new_zeros(*x.shape) + .scatter_(-1, k.view(-1, 1), 1.0) + .view(bsz * tsz, self.groups, -1) + ) + hard_probs = torch.mean(hard_x.float(), dim=0) + result["code_perplexity"] = torch.exp( + -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) + ).sum() avg_probs = torch.softmax( x.view(bsz * tsz, self.groups, -1).float(), dim=-1 @@ -168,7 +179,9 @@ def forward(self, x, produce_targets=False): result["temp"] = self.curr_temp if self.training: - x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=True).type_as(x) + x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as( + x + ) else: x = hard_x diff --git a/fairseq/modules/kmeans_attention.py b/fairseq/modules/kmeans_attention.py new file mode 100644 index 0000000000..0088d1ebdc --- /dev/null +++ b/fairseq/modules/kmeans_attention.py @@ -0,0 +1,744 @@ +import math +from functools import reduce, wraps +from inspect import isfunction +from operator import mul + +import torch +import torch.nn as nn +import torch.nn.functional as F +from aml.multimodal_video.utils.einops.lib import rearrange, repeat +from aml.multimodal_video.utils.einops.lib.layers.torch import Rearrange + +from fairseq.modules.local_attention import LocalAttention + +# constants + +TOKEN_SELF_ATTN_VALUE = -5e4 +KMEAN_INIT_ITERS = 10 + +# helper functions + + +def exists(val): + return val is not None + + +def identity(x, *args, **kwargs): + return x + + +def default(x, d): + if not exists(x): + return d if not isfunction(d) else d() + return x + + +def cast_tuple(x): + return x if isinstance(x, tuple) else (x,) + + +def cache_fn(f): + cache = None + + @wraps(f) + def cached_fn(*args, **kwargs): + nonlocal cache + if exists(cache): + return cache + cache = f(*args, **kwargs) + return cache + + return cached_fn + + +def to(t): + return {"device": t.device, "dtype": t.dtype} + + +def find_modules(nn_module, type): + return [module for module in nn_module.modules() if isinstance(module, type)] + + +def is_empty(t): + return t.nelement() == 0 + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +def batched_index_select(values, indices): + last_dim = values.shape[-1] + return values.gather(2, expand_dim(indices, -1, last_dim)) + + +def merge_dims(ind_from, ind_to, tensor): + shape = list(tensor.shape) + arr_slice = slice(ind_from, ind_to + 1) + shape[arr_slice] = [reduce(mul, shape[arr_slice])] + return tensor.reshape(*shape) + + +def expand_dim(t, dim, k): + t = t.unsqueeze(dim) + expand_shape = [-1] * len(t.shape) + expand_shape[dim] = k + return t.expand(*expand_shape) + + +def scatter_mean(src, t, index, dim, eps=1e-5): + numer = src.scatter_add(dim, index, t) + denom = src.scatter_add(dim, index, torch.ones_like(t)) + return numer / (denom + eps) + + +def split_at_index(dim, index, t): + pre_slices = (slice(None),) * dim + l = (*pre_slices, slice(None, index)) + r = (*pre_slices, slice(index, None)) + return t[l], t[r] + + +def reshape_dim(t, dim, split_dims): + shape = list(t.shape) + num_dims = len(shape) + dim = (dim + num_dims) % num_dims + shape[dim : dim + 1] = split_dims + return t.reshape(shape) + + +def ema(old, new, decay): + if not exists(old): + return new + return old * decay + new * (1 - decay) + + +def ema_inplace(moving_avg, new, decay): + if is_empty(moving_avg): + moving_avg.data.copy_(new) + return + 
moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay)) + + +# helper classes + + +def map_first_tuple_or_el(x, fn): + if isinstance(x, tuple): + return (fn(x[0]),) + x[1:] + return fn(x) + + +class Chunk(nn.Module): + def __init__(self, chunks, fn, along_dim=-1): + super().__init__() + self.dim = along_dim + self.chunks = chunks + self.fn = fn + + def forward(self, x, **kwargs): + if self.chunks <= 1: + return self.fn(x, **kwargs) + chunks = x.chunk(self.chunks, dim=self.dim) + return torch.cat([self.fn(c, **kwargs) for c in chunks], dim=self.dim) + + +class PreNorm(nn.ModuleList): + def __init__(self, norm_class, dim, fn): + super().__init__() + self.norm = norm_class(dim) + self.fn = fn + + def forward(self, x, **kwargs): + x = self.norm(x) + return self.fn(x, **kwargs) + + +class ReZero(nn.Module): + def __init__(self, fn): + super().__init__() + self.residual_weight = nn.Parameter(torch.zeros(1)) + self.fn = fn + + def forward(self, x, **kwargs): + x = self.fn(x, **kwargs) + return map_first_tuple_or_el(x, lambda t: t * self.residual_weight) + + +class ScaleNorm(nn.Module): + def __init__(self, dim, eps=1e-5): + super().__init__() + self.g = nn.Parameter(torch.ones(1)) + self.eps = eps + + def forward(self, x): + def norm(t): + n = torch.norm(t, dim=-1, keepdim=True).clamp(min=self.eps) + return t / n * self.g + + return map_first_tuple_or_el(x, norm) + + +class ProjectInOut(nn.Module): + def __init__(self, fn, dim_in, dim_out, project_out=True): + super().__init__() + self.fn = fn + self.project_in = nn.Linear(dim_in, dim_out) + self.project_out = nn.Linear(dim_out, dim_in) if project_out else identity + + def forward(self, x, **kwargs): + x = self.project_in(x) + x, loss = self.fn(x, **kwargs) + x = self.project_out(x) + return x, loss + + +class MatrixMultiply(nn.Module): + def __init__(self, tensor, transpose=False): + super().__init__() + self.tensor = tensor + self.transpose = transpose + + def forward(self, x): + tensor = self.tensor + if self.transpose: + tensor = tensor.t() + return x @ tensor + + +# positional embeddings + + +class DepthWiseConv1d(nn.Module): + def __init__(self, dim_in, dim_out, kernel_size, stride=1, bias=True, causal=False): + super().__init__() + self.padding = ( + ((kernel_size - 1), 0) if causal else (kernel_size // 2, kernel_size // 2) + ) + + self.net = nn.Sequential( + nn.Conv1d( + dim_in, + dim_in, + kernel_size=kernel_size, + groups=dim_in, + stride=stride, + bias=bias, + ), + nn.Conv1d(dim_in, dim_out, 1, bias=bias), + ) + + def forward(self, x): + x = F.pad(x, self.padding, value=0.0) + return self.net(x) + + +class FixedPositionalEmbedding(nn.Module): + def __init__(self, dim, max_seq_len): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + position = torch.arange(0, max_seq_len, dtype=torch.float) + sinusoid_inp = torch.einsum("i,j->ij", position, inv_freq) + emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) + self.register_buffer("emb", emb) + + def forward(self, x): + return self.emb[None, : x.shape[1], :].to(x) + + +def rotate_every_two(x): + x = rearrange(x, "... (d j) -> ... d j", j=2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return rearrange(x, "... d j -> ... 
(d j)") + + +def apply_rotary_pos_emb(q, k, sinu_pos): + sinu_pos = rearrange(sinu_pos, "() n (j d) -> n j d", j=2) + sin, cos = sinu_pos.unbind(dim=-2) + sin, cos = map(lambda t: repeat(t, "b n -> b (n j)", j=2), (sin, cos)) + q, k = map(lambda t: (t * cos) + (rotate_every_two(t) * sin), (q, k)) + return q, k + + +# kmeans related function and class + + +def update_kmeans_on_backwards(module): + module.kmean_modules = find_modules(module, Kmeans) + + def hook(_, grad_in, grad_out): + for m in module.kmean_modules: + m.update() + + return module.register_backward_hook(hook) + + +def similarity(x, means): + return torch.einsum("bhld,hcd->bhlc", x, means) + + +def dists_and_buckets(x, means): + dists = similarity(x, means) + _, buckets = torch.max(dists, dim=-1) + return dists, buckets + + +def batched_bincount(index, num_classes, dim=-1): + shape = list(index.shape) + shape[dim] = num_classes + out = index.new_zeros(shape) + out.scatter_add_(dim, index, torch.ones_like(index, dtype=index.dtype)) + return out + + +def kmeans_iter(x, means, buckets=None): + b, h, _, d, dtype, num_clusters = *x.shape, x.dtype, means.shape[1] + + if not exists(buckets): + _, buckets = dists_and_buckets(x, means) + + bins = batched_bincount(buckets, num_clusters).sum(0, keepdim=True) + zero_mask = bins.long() == 0 + + means_ = buckets.new_zeros(b, h, num_clusters, d, dtype=dtype) + means_.scatter_add_(-2, expand_dim(buckets, -1, d), x) + means_ = F.normalize(means_.sum(0, keepdim=True), dim=-1).type(dtype) + + means = torch.where(zero_mask.unsqueeze(-1), means, means_) + means = means.squeeze(0) + return means + + +def distribution(dists, window_size): + _, topk_indices = dists.topk(k=window_size, dim=-2) + indices = topk_indices.transpose(-2, -1) + return indices.reshape(*indices.size()[:2], -1) + + +class Kmeans(nn.Module): + def __init__( + self, num_heads, head_dim, num_clusters, ema_decay=0.999, commitment=1e-4 + ): + super().__init__() + self.commitment = commitment + self.ema_decay = ema_decay + + self.register_buffer("means", torch.randn(num_heads, num_clusters, head_dim)) + self.register_buffer("initted", torch.tensor(False)) + self.num_new_means = 0 + self.new_means = None + + @torch.no_grad() + def init(self, x): + if self.initted: + return + _, h, _, d, device, _ = *x.shape, x.device, x.dtype + + num_clusters = self.means.shape[1] + + means = x.transpose(0, 1).contiguous().view(h, -1, d) + num_samples = means.shape[1] + + if num_samples >= num_clusters: + indices = torch.randperm(num_samples, device=device)[:num_clusters] + else: + indices = torch.randint(0, num_samples, (num_clusters,), device=device) + + means = means[:, indices] + + for _ in range(KMEAN_INIT_ITERS): + means = kmeans_iter(x, means) + + self.num_new_means = 0 + self.means.data.copy_(means) + self.initted.data.copy_(torch.tensor(True)) + + @torch.no_grad() + def update(self, new_means=None): + new_means = default(new_means, self.new_means) + assert exists(new_means), "new kmeans has not been supplied" + ema_inplace(self.means, new_means, self.ema_decay) + + del self.new_means + self.new_means = None + self.num_new_means = 0 + + def forward(self, x, update_means=False): + self.init(x) + + b, dtype = x.shape[0], x.dtype + means = self.means.type(dtype) + x = F.normalize(x, 2, dim=-1).type(dtype) + + with torch.no_grad(): + dists, buckets = dists_and_buckets(x, means) + + routed_means = batched_index_select(expand_dim(means, 0, b), buckets) + loss = F.mse_loss(x, routed_means) * self.commitment + + if update_means: + with 
torch.no_grad(): + means = kmeans_iter(x, means, buckets) + self.new_means = ema( + self.new_means, means, self.num_new_means / (self.num_new_means + 1) + ) + self.num_new_means += 1 + + return dists, loss + + +# kmeans attention class + + +class KmeansAttention(nn.Module): + def __init__( + self, + num_clusters, + window_size, + num_heads, + head_dim, + causal=False, + dropout=0.0, + ema_decay=0.999, + commitment=1e-4, + context_window_size=None, + receives_context=False, + num_mem_kv=0, + shared_qk=False, + ): + super().__init__() + self.num_heads = num_heads + self.num_clusters = num_clusters + self.head_dim = head_dim + + self.window_size = window_size + self.context_window_size = default(context_window_size, window_size) + self.causal = causal + + self.shared_qk = shared_qk + self.receives_context = receives_context + self.kmeans = Kmeans(num_heads, head_dim, num_clusters, ema_decay, commitment) + self.dropout = nn.Dropout(dropout) + + self.num_mem_kv = max(num_mem_kv, 1 if causal and not shared_qk else 0) + self.mem_key = nn.Parameter( + torch.randn(num_heads, num_clusters, self.num_mem_kv, head_dim) + ) + self.mem_value = nn.Parameter( + torch.randn(num_heads, num_clusters, self.num_mem_kv, head_dim) + ) + + def forward(self, q, k, v, query_mask=None, key_mask=None, **kwargs): + b, h, t, d, kv_t, wsz, c_wsz, nc, device, dtype = ( + *q.shape, + k.shape[2], + self.window_size, + self.context_window_size, + self.num_clusters, + q.device, + q.dtype, + ) + is_reverse = kwargs.pop("_reverse", False) + + out = torch.zeros_like(q, dtype=dtype) + + update_kmeans = self.training and not is_reverse + + key_mask = ( + default(key_mask, query_mask) if not self.receives_context else key_mask + ) + kv_wsz = wsz if not self.receives_context else c_wsz + + wsz = min(wsz, t) + kv_wsz = min(kv_wsz, kv_t) + + if not self.shared_qk or self.receives_context: + dists, aux_loss = self.kmeans(torch.cat((q, k), dim=2), update_kmeans) + q_dists, k_dists = split_at_index(2, t, dists) + indices = distribution(q_dists, wsz) + kv_indices = distribution(k_dists, kv_wsz) + else: + dists, aux_loss = self.kmeans(q, update_kmeans) + k = F.normalize(k, dim=-1).to(q) + indices = distribution(dists, wsz) + kv_indices = indices + + q = batched_index_select(q, indices) + k = batched_index_select(k, kv_indices) + v = batched_index_select(v, kv_indices) + + reshape_with_window = lambda x: x.reshape(b, h, nc, -1, d) + q, k, v = map(reshape_with_window, (q, k, v)) + + m_k, m_v = map( + lambda x: expand_dim(x, 0, b).to(q), (self.mem_key, self.mem_value) + ) + k, v = map(lambda x: torch.cat(x, dim=3), ((m_k, k), (m_v, v))) + + dots = torch.einsum("bhnid,bhnjd->bhnij", q, k) * (d**-0.5) + + mask_value = max_neg_value(dots) + + if exists(query_mask) or exists(key_mask): + query_mask = default( + query_mask, lambda: torch.ones((b, t), device=device).bool() + ) + key_mask = default( + key_mask, lambda: torch.ones((b, kv_t), device=device).bool() + ) + + q_mask = expand_dim(query_mask, 1, h).gather(2, indices) + kv_mask = expand_dim(key_mask, 1, h).gather(2, kv_indices) + q_mask, kv_mask = map(lambda t: t.reshape(b, h, nc, -1), (q_mask, kv_mask)) + mask = q_mask[:, :, :, :, None] * kv_mask[:, :, :, None, :] + mask = F.pad(mask, (self.num_mem_kv, 0), value=1) + dots.masked_fill_(~mask, mask_value) + del mask + + if self.causal: + q_mask, kv_mask = map( + lambda t: t.reshape(b, h, nc, -1), (indices, kv_indices) + ) + mask = q_mask[:, :, :, :, None] >= kv_mask[:, :, :, None, :] + mask = F.pad(mask, (self.num_mem_kv, 0), value=1) + 
dots.masked_fill_(~mask, mask_value) + del mask + + if self.shared_qk: + q_mask, kv_mask = map( + lambda t: t.reshape(b, h, nc, -1), (indices, kv_indices) + ) + mask = q_mask[:, :, :, :, None] == kv_mask[:, :, :, None, :] + mask = F.pad(mask, (self.num_mem_kv, 0), value=0) + dots.masked_fill_(mask, TOKEN_SELF_ATTN_VALUE) + del mask + + dots = dots.softmax(dim=-1) + dots = self.dropout(dots) + + bo = torch.einsum("bhcij,bhcjd->bhcid", dots, v) + so = torch.reshape(bo, (b, h, -1, bo.shape[-1])).type(dtype) + out = scatter_mean(out, so, indices.unsqueeze(-1).expand_as(so), -2) + return out, aux_loss + + +# feedforward + + +class GELU_(nn.Module): + def forward(self, x): + return ( + 0.5 + * x + * ( + 1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))) + ) + ) + + +GELU = nn.GELU if hasattr(nn, "GELU") else GELU_ + + +class FeedForward(nn.Module): + def __init__(self, dim, mult=4, dropout=0.0, activation=None, glu=False): + super().__init__() + activation = default(activation, GELU) + + self.glu = glu + self.w1 = nn.Linear(dim, dim * mult * (2 if glu else 1)) + self.act = activation() + self.dropout = nn.Dropout(dropout) + self.w2 = nn.Linear(dim * mult, dim) + + def forward(self, x, **kwargs): + if not self.glu: + x = self.w1(x) + x = self.act(x) + else: + x, v = self.w1(x).chunk(2, dim=-1) + x = self.act(x) * v + + x = self.dropout(x) + x = self.w2(x) + return x + + +# self attention + + +class SelfAttention(nn.Module): + def __init__( + self, + dim, + max_seq_len, + heads, + local_attn_heads, + window_size, + dim_head=None, + local_attn_window_size=None, + local_attn_radius_blocks=1, + causal=False, + attn_dropout=0.0, + dropout=0.0, + kmeans_ema_decay=0.999, + commitment_factor=1e-4, + receives_context=False, + context_window_size=None, + rel_pos_emb=True, + num_mem_kv=0, + shared_qk=False, + conv_query_kernel=9, + ): + super().__init__() + assert ( + dim_head or (dim % heads) == 0 + ), "hidden dimension must be divisible by number of heads" + assert ( + max_seq_len % window_size + ) == 0, "maximum sequence length must be divisible by the target window size" + assert ( + local_attn_heads <= heads + ), "number of local attention heads must be less than total heads" + assert not ( + receives_context and local_attn_heads > 0 + ), "local attention cannot be used for self attention with context" + assert not ( + receives_context and causal + ), "contextual attention layer cannot be causal" + + local_attn_window_size = default(local_attn_window_size, window_size) + context_window_size = default(context_window_size, window_size) + + self.shared_qk = shared_qk + self.receives_context = receives_context + self.heads = heads + self.local_attn_heads = local_attn_heads + self.global_attn_heads = heads - local_attn_heads + + self.causal = causal + self.window_size = window_size + + dim_head = default(dim_head, dim // heads) + dim_heads = dim_head * heads + self.dim_head = dim_head + + num_clusters = max_seq_len // window_size + + # local + + local_dim_heads = dim_head * self.local_attn_heads + + if self.local_attn_heads > 0: + rel_pos_emb_config = (dim_head, local_attn_heads) if rel_pos_emb else None + self.local_attn = LocalAttention( + dim=dim_head, + window_size=local_attn_window_size, + causal=causal, + dropout=attn_dropout, + rel_pos_emb_config=rel_pos_emb_config, + look_backward=local_attn_radius_blocks, + look_forward=0 if causal else local_attn_radius_blocks, + ) + self.local_to_qkv = nn.Linear(dim, 3 * local_dim_heads) + + # global + + global_dim_heads = dim_head * 
self.global_attn_heads + + if self.global_attn_heads > 0: + self.global_attn = KmeansAttention( + num_clusters, + window_size, + self.global_attn_heads, + dim_head, + causal=causal, + dropout=attn_dropout, + ema_decay=kmeans_ema_decay, + commitment=commitment_factor, + receives_context=receives_context, + num_mem_kv=num_mem_kv, + shared_qk=shared_qk, + ) + + self.to_q = nn.Sequential( + Rearrange("b n c -> b c n"), + DepthWiseConv1d(dim, global_dim_heads, conv_query_kernel, causal=causal), + Rearrange("b c n -> b n c"), + ) + + self.to_v = nn.Linear(dim, global_dim_heads, bias=False) + + if not self.shared_qk: + self.to_k = nn.Linear(dim, global_dim_heads, bias=False) + + # out + + self.to_out = nn.Linear(dim_heads, dim, bias=False) + self.dropout = nn.Dropout(dropout) + + def forward( + self, + query, + key, + value, + context=None, + key_padding_mask=None, + context_mask=None, + pos_emb=None, + **kwargs + ): + assert not ( + self.receives_context and not exists(context) + ), "context must be passed if self attention is set to receive context" + input_mask = key_padding_mask + x = query.transpose(0, 1) + b, t, _, h, dh = *x.shape, self.heads, self.dim_head + has_local, has_global = map( + lambda x: x > 0, (self.local_attn_heads, self.global_attn_heads) + ) + + split_heads = ( + lambda v: reshape_dim(v, -1, (-1, dh)).transpose(1, 2).contiguous() + ) + + if has_local: + local_qkv = self.local_to_qkv(x).chunk(3, dim=-1) + lq, lk, lv = map(split_heads, local_qkv) + + if has_global: + kv_input = x if not self.receives_context else context + + q, v = self.to_q(x), self.to_v(kv_input) + + if not self.shared_qk: + k = self.to_k(kv_input) + else: + k = self.to_q(kv_input) if self.receives_context else q + + q, k, v = map(split_heads, (q, k, v)) + + out = [] + total_loss = torch.tensor(0.0, requires_grad=True, **to(x)) + + if has_local: + local_out = self.local_attn(lq, lk, lv, input_mask=input_mask) + out.append(local_out) + + if has_global: + if not self.receives_context and exists(pos_emb): + q, k = apply_rotary_pos_emb(q, k, pos_emb) + + global_out, loss = self.global_attn( + q, k, v, query_mask=input_mask, key_mask=context_mask + ) + total_loss = total_loss + loss + + out.append(global_out) + + out = torch.cat(out, dim=1) + out = out.reshape(b, h, t, -1).transpose(1, 2).reshape(b, t, -1) + out = self.dropout(out.transpose(0, 1)) + # out = self.to_out(out) + return out, total_loss diff --git a/fairseq/modules/kmeans_vector_quantizer.py b/fairseq/modules/kmeans_vector_quantizer.py index 040db1e83e..1015c38999 100644 --- a/fairseq/modules/kmeans_vector_quantizer.py +++ b/fairseq/modules/kmeans_vector_quantizer.py @@ -100,15 +100,16 @@ def forward(self, x, produce_targets=False): assert ze.shape == zq.shape, (ze.shape, zq.shape) x = self._pass_grad(ze, zq) - hard_x = ( - idx.new_zeros(bsz * tsz * self.groups, self.num_vars) - .scatter_(-1, idx.view(-1, 1), 1.0) - .view(bsz * tsz, self.groups, -1) - ) - hard_probs = torch.mean(hard_x.float(), dim=0) - result["code_perplexity"] = torch.exp( - -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) - ).sum() + with torch.no_grad(): + hard_x = ( + idx.new_zeros(bsz * tsz * self.groups, self.num_vars) + .scatter_(-1, idx.view(-1, 1), 1.0) + .view(bsz * tsz, self.groups, -1) + ) + hard_probs = torch.mean(hard_x.float(), dim=0) + result["code_perplexity"] = torch.exp( + -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) + ).sum() if produce_targets: result["targets"] = idx diff --git a/fairseq/modules/layer_norm.py 
b/fairseq/modules/layer_norm.py index 234609d9e2..0b276ce02f 100644 --- a/fairseq/modules/layer_norm.py +++ b/fairseq/modules/layer_norm.py @@ -7,7 +7,6 @@ import torch.nn as nn import torch.nn.functional as F - try: from apex.normalization import FusedLayerNorm as _FusedLayerNorm @@ -22,13 +21,12 @@ def forward(self, x): with torch.cuda.device(x.device): return super().forward(x) - except ImportError: has_fused_layernorm = False def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): - if torch.jit.is_scripting(): + if torch.jit.is_scripting() or torch.jit.is_tracing(): export = True if not export and torch.cuda.is_available() and has_fused_layernorm: return FusedLayerNorm(normalized_shape, eps, elementwise_affine) diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp index 4bf6b5ad36..ece47a8d90 100644 --- a/fairseq/modules/lightconv_layer/lightconv_cuda.cpp +++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cpp @@ -8,10 +8,8 @@ #include <torch/extension.h> #include <vector> -std::vector<at::Tensor> lightconv_cuda_forward( - at::Tensor input, - at::Tensor filters, - int padding_l); +std::vector<at::Tensor> +lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l); std::vector<at::Tensor> lightconv_cuda_backward( at::Tensor gradOutput, @@ -19,20 +17,20 @@ std::vector<at::Tensor> lightconv_cuda_backward( at::Tensor input, at::Tensor filters); +#define CHECK_CUDA(x) \ + AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) -#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -std::vector<at::Tensor> lightconv_forward( - at::Tensor input, - at::Tensor filters, - int padding_l) { +std::vector<at::Tensor> +lightconv_forward(at::Tensor input, at::Tensor filters, int padding_l) { + CHECK_INPUT(input); + CHECK_INPUT(filters); - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return lightconv_cuda_forward(input, filters, padding_l); + return lightconv_cuda_forward(input, filters, padding_l); } std::vector<at::Tensor> lightconv_backward( @@ -40,15 +38,14 @@ std::vector<at::Tensor> lightconv_backward( int padding_l, at::Tensor input, at::Tensor filters) { + CHECK_INPUT(gradOutput); + CHECK_INPUT(input); + CHECK_INPUT(filters); - CHECK_INPUT(gradOutput); - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return lightconv_cuda_backward(gradOutput, padding_l, input, filters); + return lightconv_cuda_backward(gradOutput, padding_l, input, filters); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &lightconv_forward, "lighconv forward (CUDA)"); - m.def("backward", &lightconv_backward, "lighconv backward (CUDA)"); + m.def("forward", &lightconv_forward, "lighconv forward (CUDA)"); + m.def("backward", &lightconv_backward, "lighconv backward (CUDA)"); } diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh index 3cae57b68f..610ab399e9 100644 --- a/fairseq/modules/lightconv_layer/lightconv_cuda.cuh +++ b/fairseq/modules/lightconv_layer/lightconv_cuda.cuh @@ -1,6 +1,6 @@ /** * Copyright (c) Facebook, Inc. and its affiliates. 
- * + * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ @@ -18,23 +18,24 @@ #include <utility> #include <vector> -#include <stdlib.h> #include <assert.h> +#include <stdlib.h> #define SHFL_MASK 0xffffffff -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_forward_kernel(const scalar_t* input, - const scalar_t* filters, - int minibatch, int sequenceLength, - int numFeatures, int numFiltersInBlock, - scalar_t* output); +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_forward_kernel( + const scalar_t* input, + const scalar_t* filters, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + scalar_t* output); -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_grad_wrt_input_kernel( - const scalar_t* input, +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_grad_wrt_input_kernel( + const scalar_t* input, const scalar_t* filters, int minibatch, int sequenceLength, @@ -42,9 +43,8 @@ void lightconv_grad_wrt_input_kernel( int numFiltersInBlock, scalar_t* output); -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_firstpass_short_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel( const scalar_t* input, const scalar_t* gradInput, int minibatch, @@ -54,17 +54,15 @@ void lightconv_grad_wrt_weights_firstpass_short_kernel( int numHeads, float* output); -template<int FS, int SB, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_secondpass_short_kernel( +template <int FS, int SB, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_secondpass_short_kernel( const float* input, - const int minibatch, + const int minibatch, const int numFiltersInBlock, scalar_t* output); -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_firstpass_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_firstpass_kernel( const scalar_t* input, const scalar_t* gradInput, int minibatch, @@ -73,11 +71,9 @@ void lightconv_grad_wrt_weights_firstpass_kernel( int numFiltersInBlock, float* output); -template<int FS, int SB, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_secondpass_kernel( +template <int FS, int SB, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_secondpass_kernel( const float* input, - const int minibatch, + const int minibatch, const int numFiltersInBlock, scalar_t* output); - diff --git a/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu index 8ee83a56c8..cdf31d5d2d 100644 --- a/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu +++ b/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu @@ -1,29 +1,31 @@ /** * Copyright (c) Facebook, Inc. and its affiliates. - * + * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ +#include "../cuda_utils.cu" #include "lightconv_cuda.cuh" -#include "lightconv_cuda_forward.cu" #include "lightconv_cuda_backward.cu" -#include "../cuda_utils.cu" - -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_forward_kernel(const scalar_t* input, - const scalar_t* filters, - int minibatch, int sequenceLength, - int numFeatures, int numFiltersInBlock, - scalar_t* output) { +#include "lightconv_cuda_forward.cu" +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_forward_kernel( + const scalar_t* input, + const scalar_t* filters, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + scalar_t* output) { const int tid = threadIdx.x; const int batchIdx = blockIdx.x; const int featureIdx = blockIdx.y; const int filterIdx = featureIdx / numFiltersInBlock; - const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; + const int IOOffset = + numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; const scalar_t* inputFeature = &input[IOOffset]; scalar_t* outputFeature = &output[IOOffset]; const scalar_t* inputFilter = &filters[filterIdx * FS]; @@ -31,7 +33,7 @@ void lightconv_forward_kernel(const scalar_t* input, assert(blockDim.x == SB); scalar_t filter[FS]; - #pragma unroll +#pragma unroll for (int i = 0; i < FS; ++i) { filter[i] = inputFilter[i]; } @@ -45,13 +47,19 @@ void lightconv_forward_kernel(const scalar_t* input, // Read input into shared memory const int inputOffset = i * SB; - load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, sequenceLength, - i, numIterations, (numIterations == 1), temp); + load_input_to_shared<FS, SB, padding_l>( + inputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + (numIterations == 1), + temp); __syncthreads(); scalar_t out = 0; - #pragma unroll +#pragma unroll for (int j = 0; j < FS; ++j) { out += filter[j] * temp[tid + j]; } @@ -66,9 +74,8 @@ void lightconv_forward_kernel(const scalar_t* input, } } -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_grad_wrt_input_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_grad_wrt_input_kernel( const scalar_t* input, const scalar_t* filters, int minibatch, @@ -76,14 +83,14 @@ void lightconv_grad_wrt_input_kernel( int numFeatures, int numFiltersInBlock, scalar_t* output) { - // input grad kernel is similar to forward kernel const int tid = threadIdx.x; const int batchIdx = blockIdx.x; const int featureIdx = blockIdx.y; const int filterIdx = featureIdx / numFiltersInBlock; - const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; + const int IOOffset = + numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; const scalar_t* inputFeature = &input[IOOffset]; scalar_t* outputFeature = &output[IOOffset]; const scalar_t* inputFilter = &filters[filterIdx * FS]; @@ -92,8 +99,8 @@ void lightconv_grad_wrt_input_kernel( scalar_t filter[FS]; - // The only change is loading the filter in reverse - #pragma unroll +// The only change is loading the filter in reverse +#pragma unroll for (int i = 0; i < FS; ++i) { filter[i] = inputFilter[FS - i - 1]; } @@ -110,13 +117,19 @@ void lightconv_grad_wrt_input_kernel( // Read input into shared memory const int inputOffset = i * SB; - load_input_to_shared<FS, SB, padding>(inputFeature, inputOffset, sequenceLength, - i, numIterations, false, temp); + 
load_input_to_shared<FS, SB, padding>( + inputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + false, + temp); __syncthreads(); scalar_t out = 0; - #pragma unroll +#pragma unroll for (int j = 0; j < FS; ++j) { out += filter[j] * temp[tid + j]; } @@ -133,9 +146,8 @@ void lightconv_grad_wrt_input_kernel( // This is by far the most expensive kernel in terms of time taken. // Can be 16x slower than the forward or grad_wrt_input when filter size is 31 -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_firstpass_short_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel( const scalar_t* input, const scalar_t* gradInput, int minibatch, @@ -144,7 +156,6 @@ void lightconv_grad_wrt_weights_firstpass_short_kernel( int numFiltersInBlock, int numHeads, float* output) { - const int tid = threadIdx.x; const int batchIdx = blockIdx.x; const int filterIdx = blockIdx.y; @@ -166,52 +177,60 @@ void lightconv_grad_wrt_weights_firstpass_short_kernel( accumWeights[i] = float(0.0); } - // loop over each sequence within filterblock - for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; ++idxInFilterBlock) { - - const int featureOffset = batchIdx * numFeatures * sequenceLength + (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength; + for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; + ++idxInFilterBlock) { + const int featureOffset = batchIdx * numFeatures * sequenceLength + + (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength; const scalar_t* inputFeature = &input[featureOffset]; const scalar_t* gradInputFeature = &gradInput[featureOffset]; zeroSharedMem<FS, SB, padding_l>(tempInput); - zeroSharedMem<FS, SB, (FS/2)>(tempGradInput); + zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput); __syncthreads(); for (int i = 0; i < numIterations; ++i) { - const int inputOffset = i * SB; - load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempInput); - load_input_to_shared<FS, SB, (FS/2)>(gradInputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempGradInput); + load_input_to_shared<FS, SB, padding_l>( + inputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + false, + tempInput); + load_input_to_shared<FS, SB, (FS / 2)>( + gradInputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + false, + tempGradInput); __syncthreads(); - const int gradIndex = (FS/2) + tid; + const int gradIndex = (FS / 2) + tid; scalar_t tempGrad = tempGradInput[gradIndex]; - #pragma unroll +#pragma unroll for (int j = 0; j < FS; j++) { const int inputIndex = tid + j; accumWeights[j] += tempInput[inputIndex] * tempGrad; } __syncthreads(); - } - } // Row-major sum for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { - float temp; if (tid < sequenceLength) { - temp = accumWeights[filterWeightIdx]; + temp = accumWeights[filterWeightIdx]; } else { - temp = float(0.0); + temp = float(0.0); } const int outputOffset = filterWeightIdx * minibatch + batchIdx; @@ -224,14 +243,12 @@ void lightconv_grad_wrt_weights_firstpass_short_kernel( } } -template<int FS, int SB, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_secondpass_short_kernel( +template <int FS, int SB, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_secondpass_short_kernel( const float* input, const int minibatch, const int 
numFiltersInBlock, scalar_t* output) { - assert(blockDim.x == SB); const int tid = threadIdx.x; @@ -239,8 +256,8 @@ void lightconv_grad_wrt_weights_secondpass_short_kernel( const int filterIdx = blockIdx.x; const int filterWeightIdx = blockIdx.y; - const int inputOffset = filterIdx * FS * minibatch + - filterWeightIdx * minibatch; + const int inputOffset = + filterIdx * FS * minibatch + filterWeightIdx * minibatch; const float* tempInput = &input[inputOffset]; // read into shared memory for reduction @@ -261,9 +278,8 @@ void lightconv_grad_wrt_weights_secondpass_short_kernel( // This is by far the most expensive kernel in terms of time taken. // Can be 16x slower than the forward or grad_wrt_input when filter size is 31 -template<int FS, int SB, int padding_l, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_firstpass_kernel( +template <int FS, int SB, int padding_l, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_firstpass_kernel( const scalar_t* input, const scalar_t* gradInput, int minibatch, @@ -271,7 +287,6 @@ void lightconv_grad_wrt_weights_firstpass_kernel( int numFeatures, int numFiltersInBlock, float* output) { - assert(blockDim.x == SB); const int tid = threadIdx.x; @@ -287,7 +302,7 @@ void lightconv_grad_wrt_weights_firstpass_kernel( __shared__ scalar_t tempInput[SB + FS]; __shared__ scalar_t tempGradInput[SB + FS]; zeroSharedMem<FS, SB, padding_l>(tempInput); - zeroSharedMem<FS, SB, (FS/2)>(tempGradInput); + zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput); __syncthreads(); float accumWeights[FS]; @@ -296,23 +311,37 @@ void lightconv_grad_wrt_weights_firstpass_kernel( accumWeights[i] = float(0.0); } - const int IOOffset = batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; + const int IOOffset = + batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; const scalar_t* inputFeature = &input[IOOffset]; const scalar_t* gradInputFeature = &gradInput[IOOffset]; - float* tempOutputGradWeight = &output[filterIdx * FS * minibatch * numFiltersInBlock]; + float* tempOutputGradWeight = + &output[filterIdx * FS * minibatch * numFiltersInBlock]; for (int i = 0; i < numIterations; ++i) { const int inputOffset = i * SB; - load_input_to_shared<FS, SB, padding_l>(inputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempInput); - load_input_to_shared<FS, SB, (FS/2)>(gradInputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempGradInput); + load_input_to_shared<FS, SB, padding_l>( + inputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + false, + tempInput); + load_input_to_shared<FS, SB, (FS / 2)>( + gradInputFeature, + inputOffset, + sequenceLength, + i, + numIterations, + false, + tempGradInput); __syncthreads(); - #pragma unroll +#pragma unroll for (int j = 0; j < FS; ++j) { - accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS/2)]; + accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS / 2)]; } __syncthreads(); @@ -320,7 +349,6 @@ void lightconv_grad_wrt_weights_firstpass_kernel( // Row-major sum for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { - // Write to shared memory before reduction if (tid < sequenceLength) { temp = accumWeights[filterWeightIdx]; @@ -331,8 +359,7 @@ void lightconv_grad_wrt_weights_firstpass_kernel( temp = blockReduce(temp); const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock + - batchIdx * numFiltersInBlock + - idxInFilterBlock; + batchIdx * numFiltersInBlock + idxInFilterBlock; if (tid == 0) 
{ tempOutputGradWeight[outputOffset] = temp; @@ -340,14 +367,12 @@ void lightconv_grad_wrt_weights_firstpass_kernel( } } -template<int FS, int SB, typename scalar_t> -__global__ -void lightconv_grad_wrt_weights_secondpass_kernel( +template <int FS, int SB, typename scalar_t> +__global__ void lightconv_grad_wrt_weights_secondpass_kernel( const float* input, const int minibatch, const int numFiltersInBlock, scalar_t* output) { - assert(blockDim.x == SB); const int tid = threadIdx.x; @@ -356,7 +381,7 @@ void lightconv_grad_wrt_weights_secondpass_kernel( const int filterWeightIdx = blockIdx.y; const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock + - filterWeightIdx * minibatch * numFiltersInBlock; + filterWeightIdx * minibatch * numFiltersInBlock; const float* tempInput = &input[inputOffset]; int readIndex = tid; diff --git a/fairseq/modules/linearized_convolution.py b/fairseq/modules/linearized_convolution.py index 09a8f201c0..1c7a9f09ac 100644 --- a/fairseq/modules/linearized_convolution.py +++ b/fairseq/modules/linearized_convolution.py @@ -10,6 +10,9 @@ from .conv_tbc import ConvTBC +from typing import Dict, Optional +from torch import Tensor + @with_incremental_state class LinearizedConvolution(ConvTBC): @@ -38,7 +41,12 @@ def upgrade_state_dict_named(self, state_dict, name): if prefix + "_linearized_weight" in state_dict: del state_dict[prefix + "_linearized_weight"] - def forward(self, input, incremental_state=None): + @torch.jit.export + def forward( + self, + input, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + ): """ Args: incremental_state: Used to buffer signal; if not None, then input is @@ -49,7 +57,7 @@ def forward(self, input, incremental_state=None): Batch x Time x Channel during inference """ if incremental_state is None: - output = super().forward(input) + output = self.conv_tbc(input) if self.kernel_size[0] > 1 and self.padding[0] > 0: # remove future timesteps added by padding output = output[: -self.padding[0], :, :] @@ -76,29 +84,42 @@ def forward(self, input, incremental_state=None): output = F.linear(input.view(bsz, -1), weight, self.bias) return output.view(bsz, 1, -1) - def reorder_incremental_state(self, incremental_state, new_order): + @torch.jit.unused + def reorder_incremental_state( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + new_order, + ): input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: input_buffer = input_buffer.index_select(0, new_order) self._set_input_buffer(incremental_state, input_buffer) - def _get_input_buffer(self, incremental_state): + @torch.jit.unused + def _get_input_buffer( + self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + ): return utils.get_incremental_state(self, incremental_state, "input_buffer") - def _set_input_buffer(self, incremental_state, new_buffer): + @torch.jit.unused + def _set_input_buffer( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + new_buffer, + ): return utils.set_incremental_state( self, incremental_state, "input_buffer", new_buffer ) + @torch.jit.unused def _get_linearized_weight(self): if self._linearized_weight is None: kw = self.kernel_size[0] weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() assert weight.size() == (self.out_channels, kw, self.in_channels) - self._linearized_weight = torch.nn.Parameter( - weight.view(self.out_channels, -1) - ) + return weight.view(self.out_channels, -1) return self._linearized_weight 
+ @torch.jit.unused def _clear_linearized_weight(self, *args): self._linearized_weight = None diff --git a/fairseq/modules/location_attention.py b/fairseq/modules/location_attention.py new file mode 100644 index 0000000000..dbbbfb9f2d --- /dev/null +++ b/fairseq/modules/location_attention.py @@ -0,0 +1,83 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn +import torch +import torch.nn.functional as F + + +class LocationAttention(nn.Module): + """ + Attention-Based Models for Speech Recognition + https://arxiv.org/pdf/1506.07503.pdf + + :param int encoder_dim: # projection-units of encoder + :param int decoder_dim: # units of decoder + :param int attn_dim: attention dimension + :param int conv_dim: # channels of attention convolution + :param int conv_kernel_size: filter size of attention convolution + """ + + def __init__( + self, + attn_dim, + encoder_dim, + decoder_dim, + attn_state_kernel_size, + conv_dim, + conv_kernel_size, + scaling=2.0, + ): + super(LocationAttention, self).__init__() + self.attn_dim = attn_dim + self.decoder_dim = decoder_dim + self.scaling = scaling + self.proj_enc = nn.Linear(encoder_dim, attn_dim) + self.proj_dec = nn.Linear(decoder_dim, attn_dim, bias=False) + self.proj_attn = nn.Linear(conv_dim, attn_dim, bias=False) + self.conv = nn.Conv1d( + attn_state_kernel_size, + conv_dim, + 2 * conv_kernel_size + 1, + padding=conv_kernel_size, + bias=False, + ) + self.proj_out = nn.Sequential(nn.Tanh(), nn.Linear(attn_dim, 1)) + + self.proj_enc_out = None # cache + + def clear_cache(self): + self.proj_enc_out = None + + def forward(self, encoder_out, encoder_padding_mask, decoder_h, attn_state): + """ + :param torch.Tensor encoder_out: padded encoder hidden state B x T x D + :param torch.Tensor encoder_padding_mask: encoder padding mask + :param torch.Tensor decoder_h: decoder hidden state B x D + :param torch.Tensor attn_prev: previous attention weight B x K x T + :return: attention weighted encoder state (B, D) + :rtype: torch.Tensor + :return: previous attention weights (B x T) + :rtype: torch.Tensor + """ + bsz, seq_len, _ = encoder_out.size() + if self.proj_enc_out is None: + self.proj_enc_out = self.proj_enc(encoder_out) + + # B x K x T -> B x C x T + attn = self.conv(attn_state) + # B x C x T -> B x T x C -> B x T x D + attn = self.proj_attn(attn.transpose(1, 2)) + + if decoder_h is None: + decoder_h = encoder_out.new_zeros(bsz, self.decoder_dim) + dec_h = self.proj_dec(decoder_h).view(bsz, 1, self.attn_dim) + + out = self.proj_out(attn + self.proj_enc_out + dec_h).squeeze(2) + out.masked_fill_(encoder_padding_mask, -float("inf")) + + w = F.softmax(self.scaling * out, dim=1) + c = torch.sum(encoder_out * w.view(bsz, seq_len, 1), dim=1) + return c, w diff --git a/fairseq/modules/lstm_cell_with_zoneout.py b/fairseq/modules/lstm_cell_with_zoneout.py new file mode 100644 index 0000000000..273308951f --- /dev/null +++ b/fairseq/modules/lstm_cell_with_zoneout.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch.nn as nn + + +class LSTMCellWithZoneOut(nn.Module): + """ + Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations + https://arxiv.org/abs/1606.01305 + """ + + def __init__( + self, prob: float, input_size: int, hidden_size: int, bias: bool = True + ): + super(LSTMCellWithZoneOut, self).__init__() + self.lstm_cell = nn.LSTMCell(input_size, hidden_size, bias=bias) + self.prob = prob + if prob > 1.0 or prob < 0.0: + raise ValueError( + "zoneout probability must be in the range from " "0.0 to 1.0." + ) + + def zoneout(self, h, next_h, prob): + if isinstance(h, tuple): + return tuple([self.zoneout(h[i], next_h[i], prob) for i in range(len(h))]) + + if self.training: + mask = h.new_zeros(*h.size()).bernoulli_(prob) + return mask * h + (1 - mask) * next_h + + return prob * h + (1 - prob) * next_h + + def forward(self, x, h): + return self.zoneout(h, self.lstm_cell(x, h), self.prob) diff --git a/fairseq/modules/multihead_attention.py b/fairseq/modules/multihead_attention.py index 99f95deb5f..262132dfe7 100644 --- a/fairseq/modules/multihead_attention.py +++ b/fairseq/modules/multihead_attention.py @@ -4,20 +4,63 @@ # LICENSE file in the root directory of this source tree. import math -from typing import Dict, Optional, Tuple +from typing import Dict, List, Optional, Tuple import torch import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn import Parameter + +try: + from xformers.components.attention import build_attention + from xformers.components.attention.utils import maybe_merge_masks + + _xformers_available = True +except ImportError: + _xformers_available = False + from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state from fairseq.modules.fairseq_dropout import FairseqDropout from fairseq.modules.quant_noise import quant_noise -from torch import Tensor, nn -from torch.nn import Parameter +from fairseq.models.fairseq_incremental_decoder import FairseqIncrementalDecoder -@with_incremental_state -class MultiheadAttention(nn.Module): +# TODO: move this into xformers? +# TODO: uint8 input type should just output a bool +def _mask_for_xformers(mask: Tensor, to_dtype: Optional[torch.dtype] = None): + """ + call to pytorch multihead accepts three mask types: + - ByteTensor where non-zero means to mask + - FloatTensor which is an additive mask + - BoolTensor where True means to mask + xFormers currently accepts boolean and additive maks. For boolean masks + the values have opposite meaning. For a BoolTensor True mean to keep the value. + """ + float_types = [torch.float, torch.float16] + # If an input mask is a float it is an additive mask. Otherwise it is either uint8 or bool. + additive = mask.dtype in float_types + # If to_dype is not specified, keep same dtype as mask. + to_dtype = mask.dtype if to_dtype is None else to_dtype + to_additive = to_dtype in float_types + + if additive: + if to_additive: + return mask.to(to_dtype) + mask = mask < 0 + + if to_additive: + # return additive mask + new_mask = torch.zeros_like(mask, dtype=to_dtype) + new_mask = new_mask.masked_fill_(mask, -float("inf")) + return new_mask + + # In xFormers True is value to keep rather than value to mask + mask = ~mask.to(torch.bool) + mask = mask.to(to_dtype) + return mask + + +class MultiheadAttention(FairseqIncrementalDecoder): """Multi-headed attention. See "Attention Is All You Need" for more details. 
@@ -35,10 +78,25 @@ def __init__( add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, + dictionary=None, q_noise=0.0, qn_block_size=8, + # TODO: pass in config rather than string. + # config defined in xformers.components.attention.AttentionConfig + xformers_att_config: Optional[str] = None, + xformers_blocksparse_layout: Optional[ + torch.Tensor + ] = None, # This should be part of the config + xformers_blocksparse_blocksize: Optional[ + int + ] = 16, # This should be part of the config ): - super().__init__() + super().__init__(dictionary) + + xformers_att_config = utils.eval_str_dict(xformers_att_config) + self.use_xformers = xformers_att_config is not None + if self.use_xformers and not _xformers_available: + raise ImportError("\n\n Please install xFormers.") self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim @@ -53,7 +111,7 @@ def __init__( assert ( self.head_dim * num_heads == self.embed_dim ), "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention @@ -83,18 +141,30 @@ def __init__( self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn - + self.beam_size = 1 self.reset_parameters() + if self.use_xformers: + xformers_att_config["dropout"] = xformers_att_config.get("dropout", dropout) + xformers_att_config["num_heads"] = xformers_att_config.get( + "num_heads", num_heads + ) + + if xformers_blocksparse_layout is not None: + # Could be part of a single config passed only once + xformers_att_config["block_size"] = xformers_blocksparse_blocksize + xformers_att_config["layout"] = xformers_blocksparse_layout + xformers_att_config["name"] = "blocksparse" + + self.attention = build_attention(xformers_att_config) + self.onnx_trace = False - self.tpu = False + self.skip_embed_dim_check = False + self.init_incremental_state() def prepare_for_onnx_export_(self): self.onnx_trace = True - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - def reset_parameters(self): if self.qkv_same_dim: # Empirically observed the convergence to be much better with @@ -115,12 +185,293 @@ def reset_parameters(self): if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v) - def forward( + def _get_reserve_head_index(self, num_heads_to_keep: int): + k_proj_heads_norm = [] + q_proj_heads_norm = [] + v_proj_heads_norm = [] + + for i in range(self.num_heads): + start_idx = i * self.head_dim + end_idx = (i + 1) * self.head_dim + k_proj_heads_norm.append( + torch.sum( + torch.abs( + self.k_proj.weight[ + start_idx:end_idx, + ] + ) + ).tolist() + + torch.sum(torch.abs(self.k_proj.bias[start_idx:end_idx])).tolist() + ) + q_proj_heads_norm.append( + torch.sum( + torch.abs( + self.q_proj.weight[ + start_idx:end_idx, + ] + ) + ).tolist() + + torch.sum(torch.abs(self.q_proj.bias[start_idx:end_idx])).tolist() + ) + v_proj_heads_norm.append( + torch.sum( + torch.abs( + self.v_proj.weight[ + start_idx:end_idx, + ] + ) + ).tolist() + + torch.sum(torch.abs(self.v_proj.bias[start_idx:end_idx])).tolist() + ) + + heads_norm = [] + for i in range(self.num_heads): + heads_norm.append( + k_proj_heads_norm[i] + q_proj_heads_norm[i] + v_proj_heads_norm[i] + ) + + sorted_head_index = sorted( + range(self.num_heads), key=lambda k: heads_norm[k], reverse=True + ) + reserve_head_index = [] + for i in range(num_heads_to_keep): + start = sorted_head_index[i] * 
self.head_dim + end = (sorted_head_index[i] + 1) * self.head_dim + reserve_head_index.append((start, end)) + return reserve_head_index + + def _adaptive_prune_heads(self, reserve_head_index: List[Tuple[int, int]]): + new_q_weight = [] + new_q_bias = [] + new_k_weight = [] + new_k_bias = [] + new_v_weight = [] + new_v_bias = [] + new_out_proj_weight = [] + + for ele in reserve_head_index: + start_idx, end_idx = ele + new_q_weight.append( + self.q_proj.weight[ + start_idx:end_idx, + ] + ) + new_q_bias.append(self.q_proj.bias[start_idx:end_idx]) + + new_k_weight.append( + self.k_proj.weight[ + start_idx:end_idx, + ] + ) + + new_k_bias.append(self.k_proj.bias[start_idx:end_idx]) + + new_v_weight.append( + self.v_proj.weight[ + start_idx:end_idx, + ] + ) + new_v_bias.append(self.v_proj.bias[start_idx:end_idx]) + + new_out_proj_weight.append(self.out_proj.weight[:, start_idx:end_idx]) + + new_q_weight = torch.cat(new_q_weight).detach() + new_k_weight = torch.cat(new_k_weight).detach() + new_v_weight = torch.cat(new_v_weight).detach() + new_out_proj_weight = torch.cat(new_out_proj_weight, dim=-1).detach() + new_q_weight.requires_grad = True + new_k_weight.requires_grad = True + new_v_weight.requires_grad = True + new_out_proj_weight.requires_grad = True + + new_q_bias = torch.cat(new_q_bias).detach() + new_q_bias.requires_grad = True + + new_k_bias = torch.cat(new_k_bias).detach() + new_k_bias.requires_grad = True + + new_v_bias = torch.cat(new_v_bias).detach() + new_v_bias.requires_grad = True + + self.q_proj.weight = torch.nn.Parameter(new_q_weight) + self.q_proj.bias = torch.nn.Parameter(new_q_bias) + + self.k_proj.weight = torch.nn.Parameter(new_k_weight) + self.k_proj.bias = torch.nn.Parameter(new_k_bias) + + self.v_proj.weight = torch.nn.Parameter(new_v_weight) + self.v_proj.bias = torch.nn.Parameter(new_v_bias) + + self.out_proj.weight = torch.nn.Parameter(new_out_proj_weight) + + self.num_heads = len(reserve_head_index) + self.embed_dim = self.head_dim * self.num_heads + self.q_proj.out_features = self.embed_dim + self.k_proj.out_features = self.embed_dim + self.v_proj.out_features = self.embed_dim + + def _set_skip_embed_dim_check(self): + self.skip_embed_dim_check = True + + def _pad_masks( + self, + key_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor], + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + if attn_mask is not None: + shape = attn_mask.size()[:-1] + torch.Size([1]) + attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(shape)], dim=-1) + if key_padding_mask is not None: + shape = key_padding_mask.size()[:-1] + torch.Size([1]) + key_padding_mask = torch.cat( + [ + key_padding_mask, + key_padding_mask.new_zeros(shape), + ], + dim=-1, + ) + return key_padding_mask, attn_mask + + def _add_bias( + self, + k: Tensor, + v: Tensor, + key_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor], + bsz: int, + ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: + assert self.bias_k is not None + assert self.bias_v is not None + k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) + key_padding_mask, attn_mask = self._pad_masks( + key_padding_mask=key_padding_mask, attn_mask=attn_mask + ) + return k, v, key_padding_mask, attn_mask + + def _append_zero_attn( + self, + k: Tensor, + v: Tensor, + key_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor], + ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: + zero_attn_shape = k.size()[:-2] + torch.Size([1]) + k.size()[-1:] + k = 
torch.cat( + [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=-2 + ) + v = torch.cat( + [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=-2 + ) + key_padding_mask, attn_mask = self._pad_masks( + key_padding_mask=key_padding_mask, attn_mask=attn_mask + ) + return k, v, key_padding_mask, attn_mask + + def _xformers_attn_forward( self, query, key: Optional[Tensor], value: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + ) -> Tuple[Tensor, Optional[Tensor]]: + + tgt_len, bsz, embed_dim = query.size() + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == tgt_len + + if self.self_attention: + key = query + value = query + elif self.encoder_decoder_attention: + value = key + + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + + if self.bias_k is not None: + assert self.bias_v is not None + k, v, attn_mask, key_padding_mask = self._add_bias( + k, v, attn_mask, key_padding_mask, bsz + ) + + def fold_heads(x): + return ( + x.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + def split_heads(x): + return ( + x.contiguous() + .view(-1, bsz, self.num_heads, self.head_dim) + .transpose(0, 1) + .transpose(1, 2) + ) + + massage = split_heads if self.attention.requires_head_dimension else fold_heads + q = massage(q) + if k is not None: + k = massage(k) + if v is not None: + v = massage(v) + + if self.add_zero_attn: + k, v, key_padding_mask, attn_mask = self._append_zero_attn( + k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask + ) + + kwargs = {} + + if attn_mask is not None and self.attention.supports_attention_mask: + attn_mask = _mask_for_xformers(attn_mask, to_dtype=q.dtype) + kwargs["att_mask"] = attn_mask + + if key_padding_mask is not None: + key_padding_mask = _mask_for_xformers(key_padding_mask, to_dtype=torch.bool) + if not self.attention.requires_separate_masks: + attn_mask = maybe_merge_masks( + attn_mask, + key_padding_mask, + batch_size=bsz, + src_len=k.size(-2), + tgt_len=q.size(-2), + num_heads=self.num_heads, + ) + key_padding_mask = None + kwargs["att_mask"] = attn_mask + if self.attention.supports_key_padding_mask: + kwargs["key_padding_mask"] = key_padding_mask + + y = self.attention(q, k, v, **kwargs) + + y = ( + y.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .flatten(start_dim=2, end_dim=3) + .transpose(0, 1) + ) + assert list(y.size()) == [tgt_len, bsz, embed_dim] + + # Dropout not needed because already applied in attention. + # It is applied to the attention weights before matmul with v. + y = self.out_proj(y) + + # TODO: support returning attention weights if needed. 
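# Illustrative usage sketch, not part of the diff above: how the new
# xformers_att_config string is intended to be passed when constructing
# MultiheadAttention. Assumes fairseq and xFormers are installed and that
# "scaled_dot_product" is a registered xFormers attention name; without
# xFormers the constructor raises the ImportError added above.
import torch
from fairseq.modules.multihead_attention import MultiheadAttention

mha = MultiheadAttention(
    embed_dim=256,
    num_heads=4,
    dropout=0.1,
    self_attention=True,
    # Parsed by utils.eval_str_dict and handed to
    # xformers.components.attention.build_attention().
    xformers_att_config='{"name": "scaled_dot_product"}',
)
x = torch.randn(10, 2, 256)            # (tgt_len, batch, embed_dim)
out, _ = mha(query=x, key=x, value=x)  # the xFormers path returns no attention weights
assert out.shape == (10, 2, 256)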
+ return y, None + + def forward( + self, + query: Tensor, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, need_weights: bool = True, static_kv: bool = False, @@ -148,43 +499,66 @@ def forward( if need_head_weights: need_weights = True + is_tpu = query.device.type == "xla" + tgt_len, bsz, embed_dim = query.size() - assert embed_dim == self.embed_dim + src_len = tgt_len + if not self.skip_embed_dim_check: + assert ( + embed_dim == self.embed_dim + ), f"query dim {embed_dim} != {self.embed_dim}" assert list(query.size()) == [tgt_len, bsz, embed_dim] + if key is not None: + src_len, key_bsz, _ = key.size() + if not torch.jit.is_scripting(): + assert value is not None + assert src_len, key_bsz == value.shape[:2] if ( not self.onnx_trace - and not self.tpu # don't use PyTorch version on TPUs + and not is_tpu # don't use PyTorch version on TPUs and incremental_state is None and not static_kv # A workaround for quantization to work. Otherwise JIT compilation # treats bias in linear module as method. and not torch.jit.is_scripting() + # The Multihead attention implemented in pytorch forces strong dimension check + # for input embedding dimention and K,Q,V projection dimension. + # Since pruning will break the dimension check and it is not easy to modify the pytorch API, + # it is preferred to bypass the pytorch MHA when we need to skip embed_dim_check + and not self.skip_embed_dim_check ): assert key is not None and value is not None - return F.multi_head_attention_forward( - query, - key, - value, - self.embed_dim, - self.num_heads, - torch.empty([0]), - torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), - self.bias_k, - self.bias_v, - self.add_zero_attn, - self.dropout_module.p, - self.out_proj.weight, - self.out_proj.bias, - self.training or self.dropout_module.apply_during_inference, - key_padding_mask, - need_weights, - attn_mask, - use_separate_proj_weight=True, - q_proj_weight=self.q_proj.weight, - k_proj_weight=self.k_proj.weight, - v_proj_weight=self.v_proj.weight, - ) + + if self.use_xformers: + return self._xformers_attn_forward( + query, key, value, key_padding_mask, need_weights, attn_mask + ) + + else: + return F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + torch.empty([0]), + torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout_module.p, + self.out_proj.weight, + self.out_proj.bias, + self.training or self.dropout_module.apply_during_inference, + key_padding_mask.bool() if key_padding_mask is not None else None, + need_weights, + attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + ) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) @@ -208,6 +582,15 @@ def forward( assert value is None k = v = None else: + if self.beam_size > 1 and bsz == key.size(1): + # key is [T, bsz*beam_size, C], reduce to [T, bsz, C] + key = key.view(key.size(0), -1, self.beam_size, key.size(2))[ + :, :, 0, : + ] + if key_padding_mask is not None: + key_padding_mask = key_padding_mask.view( + -1, self.beam_size, key_padding_mask.size(1) + )[:, 0, :] k = self.k_proj(key) v = self.v_proj(key) @@ -220,36 +603,27 @@ def forward( if self.bias_k is not None: assert self.bias_v is not None - k = torch.cat([k, 
self.bias_k.repeat(1, bsz, 1)]) - v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) - if attn_mask is not None: - attn_mask = torch.cat( - [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 - ) - if key_padding_mask is not None: - key_padding_mask = torch.cat( - [ - key_padding_mask, - key_padding_mask.new_zeros(key_padding_mask.size(0), 1), - ], - dim=1, - ) + k, v, attn_mask, key_padding_mask = self._add_bias( + k, v, attn_mask, key_padding_mask, bsz + ) q = ( q.contiguous() .view(tgt_len, bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) + kv_bsz = bsz # need default value for scripting if k is not None: + kv_bsz = k.size(1) k = ( k.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) + .view(-1, kv_bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) if v is not None: v = ( v.contiguous() - .view(-1, bsz * self.num_heads, self.head_dim) + .view(-1, kv_bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) @@ -258,16 +632,21 @@ def forward( if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None - prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) + kv_bsz = _prev_key.size(0) + prev_key = _prev_key.view(kv_bsz * self.num_heads, -1, self.head_dim) if static_kv: k = prev_key else: assert k is not None k = torch.cat([prev_key, k], dim=1) + src_len = k.size(1) if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None - prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) + assert kv_bsz == _prev_value.size(0) + prev_value = _prev_value.view( + kv_bsz * self.num_heads, -1, self.head_dim + ) if static_kv: v = prev_value else: @@ -280,19 +659,21 @@ def forward( key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( key_padding_mask=key_padding_mask, prev_key_padding_mask=prev_key_padding_mask, - batch_size=bsz, + batch_size=kv_bsz, src_len=k.size(1), static_kv=static_kv, ) - saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) - saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) + saved_state["prev_key"] = k.view(kv_bsz, self.num_heads, -1, self.head_dim) + saved_state["prev_value"] = v.view( + kv_bsz, self.num_heads, -1, self.head_dim + ) saved_state["prev_key_padding_mask"] = key_padding_mask # In this branch incremental_state is never None assert incremental_state is not None incremental_state = self._set_input_buffer(incremental_state, saved_state) assert k is not None - src_len = k.size(1) + assert k.size(1) == src_len # This is part of a workaround to get around fork/join parallelism # not supporting Optional types. 
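# Minimal sketch, not part of the diff above, of the beamable encoder-decoder
# attention trick used in this forward: after set_beam_size(beam), encoder
# keys arriving as [T, bsz*beam, C] are collapsed to one copy per sentence,
# since every beam of a sentence shares the same encoder output.
import torch

T, bsz, beam, C = 7, 2, 5, 16
key = torch.randn(T, bsz, C).repeat_interleave(beam, dim=1)   # [T, bsz*beam, C]
reduced = key.view(T, -1, beam, C)[:, :, 0, :]                # [T, bsz, C]
assert reduced.shape == (T, bsz, C)
assert torch.equal(reduced, key.view(T, -1, beam, C)[:, :, 1, :])  # beams are identical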
@@ -300,30 +681,25 @@ def forward( key_padding_mask = None if key_padding_mask is not None: - assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(0) == kv_bsz assert key_padding_mask.size(1) == src_len if self.add_zero_attn: assert v is not None src_len += 1 - k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) - v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) - if attn_mask is not None: - attn_mask = torch.cat( - [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 - ) - if key_padding_mask is not None: - key_padding_mask = torch.cat( - [ - key_padding_mask, - torch.zeros(key_padding_mask.size(0), 1).type_as( - key_padding_mask - ), - ], - dim=1, - ) + k, v, key_padding_mask, attn_mask = self._append_zero_attn( + k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask + ) - attn_weights = torch.bmm(q, k.transpose(1, 2)) + if self.encoder_decoder_attention and bsz != kv_bsz: + attn_weights = torch.einsum( + "bxhtd,bhsd->bxhts", + q.view((kv_bsz, -1, self.num_heads) + q.size()[1:]), + k.view((kv_bsz, self.num_heads) + k.size()[1:]), + ) + attn_weights = attn_weights.reshape((-1,) + attn_weights.size()[-2:]) + else: + attn_weights = torch.bmm(q, k.transpose(1, 2)) attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] @@ -337,9 +713,15 @@ def forward( if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - if not self.tpu: + if not is_tpu: + attn_weights = attn_weights.view( + kv_bsz, -1, self.num_heads, tgt_len, src_len + ) attn_weights = attn_weights.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + key_padding_mask.unsqueeze(1) + .unsqueeze(2) + .unsqueeze(3) + .to(torch.bool), float("-inf"), ) else: @@ -358,14 +740,36 @@ def forward( attn_probs = self.dropout_module(attn_weights) assert v is not None - attn = torch.bmm(attn_probs, v) + attn: Optional[Tensor] = None + if self.encoder_decoder_attention and bsz != kv_bsz: + attn = torch.einsum( + "bxhts,bhsd->bxhtd", + attn_probs.view( + ( + kv_bsz, + -1, + self.num_heads, + ) + + attn_probs.size()[1:] + ), + v.view( + ( + kv_bsz, + self.num_heads, + ) + + v.size()[1:] + ), + ) + attn = attn.reshape((-1,) + attn.size()[-2:]) + else: + attn = torch.bmm(attn_probs, v) assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] if self.onnx_trace and attn.size(1) == 1: # when ONNX tracing a single decoder step (sequence length == 1) # the transpose is a no-op copy before view, thus unnecessary - attn = attn.contiguous().view(tgt_len, bsz, embed_dim) + attn = attn.contiguous().view(tgt_len, bsz, self.embed_dim) else: - attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim) attn = self.out_proj(attn) attn_weights: Optional[Tensor] = None if need_weights: @@ -397,21 +801,27 @@ def _append_prev_key_padding_mask( # leaves the frame, there will be a time when prev or current # is None elif prev_key_padding_mask is not None: - filler = torch.zeros( - (batch_size, src_len - prev_key_padding_mask.size(1)), - device=prev_key_padding_mask.device, - ) - new_key_padding_mask = torch.cat( - [prev_key_padding_mask.float(), filler.float()], dim=1 - ) + if src_len > prev_key_padding_mask.size(1): + filler = torch.zeros( + (batch_size, src_len - 
prev_key_padding_mask.size(1)), + device=prev_key_padding_mask.device, + ) + new_key_padding_mask = torch.cat( + [prev_key_padding_mask.float(), filler.float()], dim=1 + ) + else: + new_key_padding_mask = prev_key_padding_mask.float() elif key_padding_mask is not None: - filler = torch.zeros( - (batch_size, src_len - key_padding_mask.size(1)), - device=key_padding_mask.device, - ) - new_key_padding_mask = torch.cat( - [filler.float(), key_padding_mask.float()], dim=1 - ) + if src_len > key_padding_mask.size(1): + filler = torch.zeros( + (batch_size, src_len - key_padding_mask.size(1)), + device=key_padding_mask.device, + ) + new_key_padding_mask = torch.cat( + [filler.float(), key_padding_mask.float()], dim=1 + ) + else: + new_key_padding_mask = key_padding_mask.float() else: new_key_padding_mask = prev_key_padding_mask return new_key_padding_mask @@ -419,7 +829,7 @@ def _append_prev_key_padding_mask( @torch.jit.export def reorder_incremental_state( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], new_order: Tensor, ): """Reorder buffered internal state (for incremental generation).""" @@ -428,14 +838,26 @@ def reorder_incremental_state( for k in input_buffer.keys(): input_buffer_k = input_buffer[k] if input_buffer_k is not None: - if self.encoder_decoder_attention and input_buffer_k.size( - 0 - ) == new_order.size(0): - break - input_buffer[k] = input_buffer_k.index_select(0, new_order) + if self.encoder_decoder_attention: + if input_buffer_k.size(0) * self.beam_size == new_order.size(0): + return incremental_state + elif self.beam_size > 1: + input_buffer[k] = input_buffer_k.index_select( + 0, + new_order.reshape(-1, self.beam_size)[:, 0] + // self.beam_size, + ) + else: + input_buffer[k] = input_buffer_k.index_select(0, new_order) + else: + input_buffer[k] = input_buffer_k.index_select(0, new_order) incremental_state = self._set_input_buffer(incremental_state, input_buffer) return incremental_state + def set_beam_size(self, beam_size): + """Used for effiecient beamable enc-dec attention""" + self.beam_size = beam_size + def _get_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ) -> Dict[str, Optional[Tensor]]: @@ -448,7 +870,7 @@ def _get_input_buffer( def _set_input_buffer( self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], buffer: Dict[str, Optional[Tensor]], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/fairseq/modules/positional_embedding.py b/fairseq/modules/positional_embedding.py index 8e94e35edb..fbc13d80ac 100644 --- a/fairseq/modules/positional_embedding.py +++ b/fairseq/modules/positional_embedding.py @@ -14,6 +14,7 @@ def PositionalEmbedding( embedding_dim: int, padding_idx: int, learned: bool = False, + auto_expand: bool = True, ): if learned: # if padding_idx is specified then offset the embedding ids by @@ -23,7 +24,7 @@ def PositionalEmbedding( if padding_idx is not None: num_embeddings = num_embeddings + padding_idx + 1 m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) if padding_idx is not None: nn.init.constant_(m.weight[padding_idx], 0) else: @@ -31,5 +32,6 @@ def PositionalEmbedding( embedding_dim, padding_idx, init_size=num_embeddings + padding_idx + 1, + 
auto_expand=auto_expand, ) return m diff --git a/fairseq/modules/positional_encoding.py b/fairseq/modules/positional_encoding.py new file mode 100644 index 0000000000..67f6353539 --- /dev/null +++ b/fairseq/modules/positional_encoding.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn +import math +import torch + + +class PositionalEncoding(nn.Module): + """Positional encoding. + + Args: + d_model: Embedding dimension. + dropout_rate: Dropout rate. + max_len: Maximum input length. + reverse: Whether to reverse the input position. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): + """Construct an PositionalEncoding object.""" + super(PositionalEncoding, self).__init__() + self.d_model = d_model + self.reverse = reverse + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + if self.reverse: + position = torch.arange( + x.size(1) - 1, -1, -1.0, dtype=torch.float32 + ).unsqueeze(1) + else: + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor): + """Add positional encoding. + Args: + x (torch.Tensor): Input tensor B X T X C + Returns: + torch.Tensor: Encoded tensor B X T X C + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class RelPositionalEncoding(nn.Module): + """Relative positional encoding module (new implementation). + + Args: + d_model: Embedding dimension. + dropout_rate: Dropout rate. + max_len: Maximum input length. + """ + + def __init__(self, max_len, d_model): + """Construct an PositionalEncoding object.""" + super(RelPositionalEncoding, self).__init__() + self.d_model = d_model + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i<j). 
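# Quick illustration, not part of the diff: for an input of length T the
# relative table built just below holds 2*T - 1 rows, ordered from relative
# position T-1 (key far to the left of the query) down to -(T-1).
T = 4
rel_positions = list(range(T - 1, -T, -1))
print(rel_positions)                     # [3, 2, 1, 0, -1, -2, -3]
assert len(rel_positions) == 2 * T - 1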
+ pe_positive = torch.zeros(x.size(1), self.d_model) + pe_negative = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.d_model) + ) + pe_positive[:, 0::2] = torch.sin(position * div_term) + pe_positive[:, 1::2] = torch.cos(position * div_term) + pe_negative[:, 0::2] = torch.sin(-1 * position * div_term) + pe_negative[:, 1::2] = torch.cos(-1 * position * div_term) + + # Reserve the order of positive indices and concat both positive and + # negative indices. This is used to support the shifting trick + # as in https://arxiv.org/abs/1901.02860 + pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0) + pe_negative = pe_negative[1:].unsqueeze(0) + pe = torch.cat([pe_positive, pe_negative], dim=1) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor): + """Add positional encoding. + Args: + x : Input tensor T X B X C. + Returns: + torch.Tensor: Encoded tensor T X B X C. + + """ + x = x.transpose(0, 1) # Change TBC to BTC + self.extend_pe(x) + pos_emb = self.pe[ + :, + self.pe.size(1) // 2 - x.size(1) + 1 : self.pe.size(1) // 2 + x.size(1), + ] + pos_emb = pos_emb.transpose(0, 1) # change to TBC + return pos_emb diff --git a/fairseq/modules/quantization/pq/__init__.py b/fairseq/modules/quantization/pq/__init__.py index 5b10b51b1b..c142a802e0 100644 --- a/fairseq/modules/quantization/pq/__init__.py +++ b/fairseq/modules/quantization/pq/__init__.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .utils import SizeTracker, quantize_model_ # NOQA +from .utils import SizeTracker, get_param, attrsetter, quantize_model_ # NOQA diff --git a/fairseq/modules/quantization/pq/utils.py b/fairseq/modules/quantization/pq/utils.py index 03b15e4b1b..eceeef8ba3 100644 --- a/fairseq/modules/quantization/pq/utils.py +++ b/fairseq/modules/quantization/pq/utils.py @@ -6,7 +6,7 @@ import logging import re from operator import attrgetter, itemgetter - +import torch import numpy as np import torch.distributed as dist import torch.nn as nn @@ -25,7 +25,9 @@ def quantize_model_( n_iter=15, eps=1e-6, max_tentatives=100, + remove_weights=False, verbose=True, + state_dict=None, ): """ Quantize a model in-place by stages. All the targeted @@ -58,7 +60,9 @@ def quantize_model_( to layers_to_quantize[step] """ - quantized_layers = get_layers(model, layers_to_quantize[step]) + quantized_layers = get_layers( + model, layers_to_quantize[step], remove_weights=remove_weights + ) for layer in quantized_layers: @@ -96,6 +100,37 @@ def quantize_model_( centroids = quantizer.centroids.contiguous() assignments = quantizer.assignments.contiguous() + # If n_iter = 0 and state_dict is provided, then + # we initialize random assignments and centroids to + # random values of the appropriate dimensions + # because the quantized model parameters will + # overwritten by the state_dict later on. + if n_iter == 0 and state_dict: + # Initialize random centroids of the correct size + centroids = torch.rand(centroids.size()) + centroids.cuda() + # Get counts and assignment keys from layer in loaded checkpoint. + counts_key = layer + "." + "counts" + assignment_key = layer + "." + "assignments" + # Get number of different bins to include. 
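# Hedged illustration, not part of the diff, of the placeholder assignments
# this n_iter == 0 branch builds a few lines below: list every bin index
# once, then pad with random bins up to the length stored in the checkpoint
# (the values are dummies; the loaded state_dict overwrites them anyway).
import torch

counts, num_assignments = 8, 20      # hypothetical checkpoint sizes
assignments_bins = torch.arange(counts)
assignments_rand = torch.randint(0, counts - 1, (num_assignments - counts,))
assignments = torch.cat((assignments_bins, assignments_rand), 0)
assert assignments.numel() == num_assignments
assert set(range(counts)) <= set(assignments.tolist())   # every bin appears at least once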
+ counts = list(state_dict[counts_key].shape)[0] + print(layer) + print(state_dict[counts_key]) + print(counts) + # Initialize random assignments of the correct size + # with an appropriate number of bins. + num_assignments = list(state_dict[assignment_key].shape)[0] + num_extra = num_assignments - counts + print(num_assignments) + print(num_extra) + assignments_bins = torch.arange(counts) + assignments_rand = torch.randint(0, counts - 1, (num_extra,)) + assignments = torch.cat((assignments_bins, assignments_rand), 0) + # assignments = assignments.type(torch.IntTensor) + assignments.cuda() + print("assignments") + print(assignments) + # broadcast results to make sure weights are up-to-date if dist.is_initialized(): dist.broadcast(centroids, 0) @@ -152,7 +187,7 @@ def quantize_model_( return quantized_layers -def get_layers(model, filter_regexp): +def get_layers(model, filter_regexp, remove_weights=False): """ Filters out the layers according to a regexp. Note that we omit biases. @@ -181,6 +216,10 @@ def get_layers(model, filter_regexp): # remove .weight in all other names (or .weight_orig is spectral norm) all_layers = map(lambda x: x.replace(".weight_orig", ""), all_layers) + # remove weights indicates whether the weights extension should be removed, in addition to + # weight_orig and weight extension on names + if remove_weights: + all_layers = map(lambda x: x.replace(".weights", ""), all_layers) all_layers = map(lambda x: x.replace(".weight", ""), all_layers) # return filtered layers diff --git a/fairseq/modules/quantization/scalar/modules/qact.py b/fairseq/modules/quantization/scalar/modules/qact.py index c5dd1d6336..b362c30dc7 100644 --- a/fairseq/modules/quantization/scalar/modules/qact.py +++ b/fairseq/modules/quantization/scalar/modules/qact.py @@ -81,7 +81,7 @@ def quantize_hook(module, x, y): # using straight-through estimator (STE) clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) + clamp_high = self.scale * (2**self.bits - 1 - self.zero_point) return torch.clamp(y, clamp_low.item(), clamp_high.item()) + noise.detach() # register hook diff --git a/fairseq/modules/quantization/scalar/modules/qconv.py b/fairseq/modules/quantization/scalar/modules/qconv.py index 83788c6f71..29744744ec 100644 --- a/fairseq/modules/quantization/scalar/modules/qconv.py +++ b/fairseq/modules/quantization/scalar/modules/qconv.py @@ -119,7 +119,7 @@ def forward(self, input): # using straight-through estimator (STE) clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) + clamp_high = self.scale * (2**self.bits - 1 - self.zero_point) weight = ( torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + noise.detach() diff --git a/fairseq/modules/quantization/scalar/modules/qemb.py b/fairseq/modules/quantization/scalar/modules/qemb.py index d6cf06e587..3b293ac31e 100644 --- a/fairseq/modules/quantization/scalar/modules/qemb.py +++ b/fairseq/modules/quantization/scalar/modules/qemb.py @@ -113,7 +113,7 @@ def forward(self, input): # using straight-through estimator (STE) clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) + clamp_high = self.scale * (2**self.bits - 1 - self.zero_point) weight = ( torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + noise.detach() diff --git a/fairseq/modules/quantization/scalar/modules/qlinear.py b/fairseq/modules/quantization/scalar/modules/qlinear.py index 9db1559386..78606a25b9 100644 --- 
a/fairseq/modules/quantization/scalar/modules/qlinear.py +++ b/fairseq/modules/quantization/scalar/modules/qlinear.py @@ -92,7 +92,7 @@ def forward(self, input): # using straight-through estimator (STE) clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) + clamp_high = self.scale * (2**self.bits - 1 - self.zero_point) weight = ( torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + noise.detach() diff --git a/fairseq/modules/quantization/scalar/ops.py b/fairseq/modules/quantization/scalar/ops.py index 2a855159be..e0f9a0c1f8 100644 --- a/fairseq/modules/quantization/scalar/ops.py +++ b/fairseq/modules/quantization/scalar/ops.py @@ -5,45 +5,55 @@ import torch +try: + import torch.ao.quantization as quantization +except ImportError: + import torch.quantization as quantization + def emulate_int(w, bits, method, scale=None, zero_point=None): - q = globals()[f"emulate_int{bits}_{method}"] - return q(w, scale=scale, zero_point=zero_point) + q = globals()[f"emulate_int8_{method}"] + return q(w, scale=scale, zero_point=zero_point, bits=bits) -def quantize(w, scale, zero_point): +def quantize(w, scale, zero_point, bits=8): + # In the default behavior, max_val = 255. + max_val = 2**bits - 1 return ( - torch.clamp(torch.round(w / scale + zero_point), 0, 255) - zero_point + torch.clamp(torch.round(w / scale + zero_point), 0, max_val) - zero_point ) * scale -def emulate_int8_histogram(w, scale=None, zero_point=None): +def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8): if scale is None: - obs = torch.quantization.observer.HistogramObserver() + obs = quantization.observer.HistogramObserver() + obs.to(device=w.device) _ = obs(w.float()) scale, zero_point = obs.calculate_qparams() scale = scale.cuda().type_as(w) zero_point = zero_point.cuda().type_as(w) - return quantize(w, scale, zero_point), scale, zero_point + return quantize(w, scale, zero_point, bits=bits), scale, zero_point -def emulate_int8_channel(w, scale=None, zero_point=None): +def emulate_int8_channel(w, scale=None, zero_point=None, bits=8): if scale is None: - obs = torch.quantization.observer.PerChannelMinMaxObserver( + obs = quantization.observer.PerChannelMinMaxObserver( ch_axis=-1, qscheme=torch.per_channel_symmetric ) + obs.to(device=w.device) _ = obs(w) scale, zero_point, ch_axis = obs.get_qparams() scale = scale.cuda().type_as(w) zero_point = zero_point.cuda().type_as(w) - return quantize(w, scale, zero_point), scale, zero_point + return quantize(w, scale, zero_point, bits=bits), scale, zero_point -def emulate_int8_tensor(w, scale=None, zero_point=None): +def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8): if scale is None: - obs = torch.quantization.observer.MinMaxObserver() + obs = quantization.observer.MinMaxObserver() + obs.to(device=w.device) _ = obs(w) scale, zero_point = obs.calculate_qparams() scale = scale.cuda().type_as(w) zero_point = zero_point.cuda().type_as(w) - return quantize(w, scale, zero_point), scale, zero_point + return quantize(w, scale, zero_point, bits=bits), scale, zero_point diff --git a/fairseq/modules/quantization/scalar/utils.py b/fairseq/modules/quantization/scalar/utils.py index 32cf616568..d4b1cc255b 100644 --- a/fairseq/modules/quantization/scalar/utils.py +++ b/fairseq/modules/quantization/scalar/utils.py @@ -16,7 +16,9 @@ MAPPING = {nn.Linear: IntLinear, nn.Embedding: IntEmbedding, nn.Conv2d: IntConv2d} -def quantize_model_(model, p=0.2, bits=8, update_step=3000): +def quantize_model_( + model, p=0.2, 
bits=8, update_step=3000, method="histogram", remove_weights=False +): """ Replaces all modules with their scalar quantized counterpart and registers hooks to quantize the post-ativations of those modules. @@ -27,9 +29,10 @@ def quantize_model_(model, p=0.2, bits=8, update_step=3000): - bits: number of bits - update_step: update quantization parameters every update_step steps """ - # quantize all layers - quantized_layers = get_layers(model, "(.*?)") + # remove weights indicates whether the weights extension should be removed, in addition to + # weight_orig and weight extension on names + quantized_layers = get_layers(model, "(.*?)", remove_weights=remove_weights) for layer in quantized_layers: @@ -50,7 +53,7 @@ def quantize_model_(model, p=0.2, bits=8, update_step=3000): "p": p, "update_step": update_step, "bits": bits, - "method": "histogram", + "method": method, "counter": 0, } @@ -68,7 +71,7 @@ def quantize_model_(model, p=0.2, bits=8, update_step=3000): continue # activation quantization - a_q = ActivationQuantizer(quantized_module, p=0, bits=bits, method="histogram") + a_q = ActivationQuantizer(quantized_module, p=0, bits=bits, method=method) # replace layer by its quantized counterpart attrsetter(layer)(model, quantized_module) diff --git a/fairseq/modules/rotary_positional_embedding.py b/fairseq/modules/rotary_positional_embedding.py new file mode 100644 index 0000000000..b74028b011 --- /dev/null +++ b/fairseq/modules/rotary_positional_embedding.py @@ -0,0 +1,50 @@ +import torch + + +class RotaryPositionalEmbedding(torch.nn.Module): + def __init__(self, dim, base=10000, precision=torch.half): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ + Paper: https://arxiv.org/pdf/2104.09864.pdf + Args: + dim: Dimension of embedding + base: Base value for exponential + precision: precision to use for numerical values + """ + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + self.seq_len_cached = 0 + self.cos_cached = torch.empty(self.seq_len_cached, 1, 1, dim) + self.sin_cached = torch.empty(self.seq_len_cached, 1, 1, dim) + self.precision = precision + + def forward(self, x, seq_len: int = 0): + """ + Args: + x: Input x with T X B X C + seq_len: Sequence length of input x + """ + if seq_len > self.seq_len_cached: + self.seq_len_cached = seq_len + t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + self.cos_cached = emb.cos().view(emb.size(0), 1, 1, emb.size(1)) + self.sin_cached = emb.sin().view(emb.size(0), 1, 1, emb.size(1)) + return self.cos_cached, self.sin_cached + +# rotary pos emb helpers: +def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + return torch.cat( + (-x2, x1), dim=x1.ndim - 1 + ) # dim=-1 triggers a bug in earlier torch versions + + +def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): + cos, sin = ( + cos[offset : q.shape[0] + offset, ...], + sin[offset : q.shape[0] + offset, ...], + ) + return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) diff --git a/fairseq/modules/same_pad.py b/fairseq/modules/same_pad.py index b46f94d635..a3ce4131c6 100644 --- a/fairseq/modules/same_pad.py +++ b/fairseq/modules/same_pad.py @@ -8,11 +8,26 @@ class SamePad(nn.Module): + def __init__(self, kernel_size, causal=False): + super().__init__() + if causal: + self.remove = 
kernel_size - 1 + else: + self.remove = 1 if kernel_size % 2 == 0 else 0 + + def forward(self, x): + if self.remove > 0: + x = x[:, :, : -self.remove] + return x + + +class SamePad2d(nn.Module): def __init__(self, kernel_size): super().__init__() - self.remove = kernel_size % 2 == 0 + self.remove = 1 if kernel_size % 2 == 0 else 0 def forward(self, x): - if self.remove: - x = x[:, :, :-1] + assert len(x.size()) == 4 + if self.remove > 0: + x = x[:, :, : -self.remove, : -self.remove] return x diff --git a/fairseq/modules/sinusoidal_positional_embedding.py b/fairseq/modules/sinusoidal_positional_embedding.py index 857830faf7..dd93ddc397 100644 --- a/fairseq/modules/sinusoidal_positional_embedding.py +++ b/fairseq/modules/sinusoidal_positional_embedding.py @@ -9,7 +9,7 @@ import torch import torch.onnx.operators from fairseq import utils -from torch import Tensor, nn +from torch import nn, Tensor class SinusoidalPositionalEmbedding(nn.Module): @@ -18,20 +18,32 @@ class SinusoidalPositionalEmbedding(nn.Module): Padding symbols are ignored. """ - def __init__(self, embedding_dim, padding_idx, init_size=1024): + def __init__(self, embedding_dim, padding_idx, init_size=1024, auto_expand=True): super().__init__() self.embedding_dim = embedding_dim - self.padding_idx = padding_idx - self.weights = SinusoidalPositionalEmbedding.get_embedding( - init_size, embedding_dim, padding_idx + self.padding_idx = padding_idx if padding_idx is not None else 0 + self.register_buffer( + "weights", + SinusoidalPositionalEmbedding.get_embedding( + init_size, embedding_dim, padding_idx + ), + persistent=False, ) - self.onnx_trace = False - self.register_buffer("_float_tensor", torch.FloatTensor(1)) self.max_positions = int(1e5) + self.auto_expand = auto_expand + self.onnx_trace = False def prepare_for_onnx_export_(self): self.onnx_trace = True + def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): + # Ignore some deprecated keys that were used in older versions + deprecated_keys = ["weights", "_float_tensor"] + for key in deprecated_keys: + if prefix + key in state_dict: + del state_dict[prefix + key] + super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) + @staticmethod def get_embedding( num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None @@ -68,29 +80,36 @@ def forward( bspair = torch.onnx.operators.shape_as_tensor(input) bsz, seq_len = bspair[0], bspair[1] max_pos = self.padding_idx + 1 + seq_len - if self.weights is None or max_pos > self.weights.size(0): - # recompute/expand embeddings if needed - self.weights = SinusoidalPositionalEmbedding.get_embedding( + weights = self.weights + + if max_pos > self.weights.size(0): + # If the input is longer than the number of pre-computed embeddings, + # compute the extra embeddings on the fly. + # Only store the expanded embeddings if auto_expand=True. + # In multithreading environments, mutating the weights of a module + # may cause trouble. Set auto_expand=False if this happens. 
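# Hedged usage sketch, not part of the diff (assumes fairseq is importable):
# with auto_expand=False a batch longer than init_size gets its sinusoidal
# table computed on the fly while the registered buffer keeps its original
# size, which is the behaviour the comment above recommends for
# multithreaded settings.
import torch
from fairseq.modules.sinusoidal_positional_embedding import SinusoidalPositionalEmbedding

emb = SinusoidalPositionalEmbedding(
    embedding_dim=64, padding_idx=1, init_size=16, auto_expand=False
)
tokens = torch.full((2, 100), 5, dtype=torch.long)  # sequence length 100 > init_size
out = emb(tokens)
assert out.shape == (2, 100, 64)
assert emb.weights.size(0) == 16                    # buffer was not mutated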
+ weights = SinusoidalPositionalEmbedding.get_embedding( max_pos, self.embedding_dim, self.padding_idx - ) - self.weights = self.weights.to(self._float_tensor) + ).to(self.weights) + if self.auto_expand: + self.weights = weights if incremental_state is not None: # positions is the same for every token when decoding a single step pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len if self.onnx_trace: return ( - self.weights.index_select(index=self.padding_idx + pos, dim=0) + weights.index_select(index=self.padding_idx + pos, dim=0) .unsqueeze(1) .repeat(bsz, 1, 1) ) - return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) + return weights[self.padding_idx + pos, :].expand(bsz, 1, -1) positions = utils.make_positions( input, self.padding_idx, onnx_trace=self.onnx_trace ) if self.onnx_trace: - flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) + flat_embeddings = weights.detach().index_select(0, positions.view(-1)) embedding_shape = torch.cat( (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long)) ) @@ -99,7 +118,5 @@ def forward( ) return embeddings return ( - self.weights.index_select(0, positions.view(-1)) - .view(bsz, seq_len, -1) - .detach() + weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach() ) diff --git a/fairseq/modules/transformer_layer.py b/fairseq/modules/transformer_layer.py index 8775aa7766..19e035dec5 100644 --- a/fairseq/modules/transformer_layer.py +++ b/fairseq/modules/transformer_layer.py @@ -7,14 +7,16 @@ import torch import torch.nn as nn +from torch import Tensor + from fairseq import utils +from fairseq.models.transformer import TransformerConfig from fairseq.modules import LayerNorm, MultiheadAttention from fairseq.modules.fairseq_dropout import FairseqDropout from fairseq.modules.quant_noise import quant_noise -from torch import Tensor -class TransformerEncoderLayer(nn.Module): +class TransformerEncoderLayerBase(nn.Module): """Encoder layer block. In the original paper each operation (multi-head attention or FFN) is @@ -23,47 +25,47 @@ class TransformerEncoderLayer(nn.Module): preprocessing each layer with layernorm and postprocessing with: `dropout -> add residual`. We default to the approach in the paper, but the tensor2tensor approach can be enabled by setting - *args.encoder_normalize_before* to ``True``. + *cfg.encoder.normalize_before* to ``True``. 
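# Hedged construction sketch, not part of the diff (assumes fairseq is
# importable): the encoder layer is now driven by the TransformerConfig
# dataclass instead of an argparse namespace, so it can be built directly
# from default config values.
import torch
from fairseq.models.transformer import TransformerConfig
from fairseq.modules.transformer_layer import TransformerEncoderLayerBase

cfg = TransformerConfig()                      # dataclass defaults (encoder.embed_dim == 512)
layer = TransformerEncoderLayerBase(cfg)
x = torch.randn(20, 2, cfg.encoder.embed_dim)  # (seq_len, batch, embed_dim)
y = layer(x, encoder_padding_mask=None)
assert y.shape == x.shape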
Args: - args (argparse.Namespace): parsed command-line arguments + cfg (argparse.Namespace): parsed command-line arguments """ - def __init__(self, args): + def __init__(self, cfg, return_fc=False): super().__init__() - self.embed_dim = args.encoder_embed_dim - self.quant_noise = getattr(args, 'quant_noise_pq', 0) - self.quant_noise_block_size = getattr(args, 'quant_noise_pq_block_size', 8) or 8 - self.self_attn = self.build_self_attention(self.embed_dim, args) - self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.cfg = cfg + self.return_fc = return_fc + self.embed_dim = cfg.encoder.embed_dim + self.quant_noise = cfg.quant_noise.pq + self.quant_noise_block_size = cfg.quant_noise.pq_block_size + self.self_attn = self.build_self_attention(self.embed_dim, cfg) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.activation_fn = utils.get_activation_fn( - activation=getattr(args, 'activation_fn', 'relu') or "relu" + cfg.dropout, module_name=self.__class__.__name__ ) - activation_dropout_p = getattr(args, "activation_dropout", 0) or 0 + self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn) + activation_dropout_p = cfg.activation_dropout if activation_dropout_p == 0: - # for backwards compatibility with models that use args.relu_dropout - activation_dropout_p = getattr(args, "relu_dropout", 0) or 0 + # for backwards compatibility with models that use cfg.relu_dropout + activation_dropout_p = cfg.relu_dropout or 0 self.activation_dropout_module = FairseqDropout( float(activation_dropout_p), module_name=self.__class__.__name__ ) - self.normalize_before = args.encoder_normalize_before + self.normalize_before = cfg.encoder.normalize_before self.fc1 = self.build_fc1( self.embed_dim, - args.encoder_ffn_embed_dim, + cfg.encoder.ffn_embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.fc2 = self.build_fc2( - args.encoder_ffn_embed_dim, + cfg.encoder.ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size, ) - self.final_layer_norm = LayerNorm(self.embed_dim) + self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise( @@ -75,14 +77,70 @@ def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size ) - def build_self_attention(self, embed_dim, args): + def _get_fc_rank(self, remove_num: int) -> List[int]: + f1_filter_param = [] + for i in range(self.fc1.out_features): + f1_filter_param.append( + torch.sum(torch.abs(self.fc1.weight[i])) + + torch.sum(torch.abs(self.fc2.weight[:, i])) + + torch.abs(self.fc1.bias[i]) + ) + return sorted( + range(len(f1_filter_param)), key=lambda k: f1_filter_param[k], reverse=False + )[0:remove_num] + + def _prune_fc_layer(self, remove_index: List[int]): + new_fc1_weight = [] + new_fc1_bias = [] + for i in range(self.fc1.out_features): + if i not in remove_index: + new_fc1_weight.append(self.fc1.weight[i]) + new_fc1_bias.append(self.fc1.bias[i]) + + new_fc1_weight = torch.stack(new_fc1_weight).detach() + new_fc1_weight.requires_grad = True + + new_fc1_bias = torch.stack(new_fc1_bias).detach() + new_fc1_bias.requires_grad = True + + self.fc1 = quant_noise( + nn.Linear(self.fc1.in_features, self.fc1.out_features - len(remove_index)), + p=self.quant_noise, + block_size=self.quant_noise_block_size, + ) + self.fc1.weight = 
torch.nn.Parameter(new_fc1_weight) + self.fc1.bias = torch.nn.Parameter(new_fc1_bias) + + new_fc2_weight = [] + new_fc2_bias = [] + for i in range(self.fc2.in_features): + if i not in remove_index: + new_fc2_weight.append(self.fc2.weight[:, i]) + new_fc2_bias = self.fc2.bias.detach() + + new_fc2_weight = torch.stack(new_fc2_weight, dim=-1).detach() + new_fc2_weight.requires_grad = True + + new_fc2_bias = self.fc2.bias.detach() + new_fc2_bias.requires_grad = True + + self.fc2 = quant_noise( + nn.Linear(self.fc2.in_features - len(remove_index), self.fc2.out_features), + p=self.quant_noise, + block_size=self.quant_noise_block_size, + ) + self.fc2.weight = torch.nn.Parameter(new_fc2_weight) + self.fc2.bias = torch.nn.Parameter(new_fc2_bias) + + def build_self_attention(self, embed_dim, cfg): return MultiheadAttention( embed_dim, - args.encoder_attention_heads, - dropout=args.attention_dropout, + cfg.encoder.attention_heads, + dropout=cfg.attention_dropout, self_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, + xformers_att_config=cfg.encoder.xformers_att_config, ) def residual_connection(self, x, residual): @@ -102,7 +160,12 @@ def upgrade_state_dict_named(self, state_dict, name): state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] del state_dict[k] - def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None): + def forward( + self, + x, + encoder_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor] = None, + ): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` @@ -124,7 +187,9 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None): # the attention weight (before softmax) for some padded element in query # will become -inf, which results in NaN in model parameters if attn_mask is not None: - attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8) + attn_mask = attn_mask.masked_fill( + attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4 + ) residual = x if self.normalize_before: @@ -134,6 +199,7 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None): key=x, value=x, key_padding_mask=encoder_padding_mask, + need_weights=False, attn_mask=attn_mask, ) x = self.dropout_module(x) @@ -144,18 +210,35 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None): residual = x if self.normalize_before: x = self.final_layer_norm(x) - x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) x = self.fc2(x) + + fc_result = x + x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) + + if self.return_fc and not torch.jit.is_scripting(): + return x, fc_result return x -class TransformerDecoderLayer(nn.Module): +# backward compatible with the legacy argparse format +class TransformerEncoderLayer(TransformerEncoderLayerBase): + def __init__(self, args): + super().__init__(TransformerConfig.from_namespace(args)) + self.args = args + + def build_self_attention(self, embed_dim, args): + return super().build_self_attention( + embed_dim, TransformerConfig.from_namespace(args) + ) + + +class TransformerDecoderLayerBase(nn.Module): """Decoder layer block. In the original paper each operation (multi-head attention, encoder @@ -164,7 +247,7 @@ class TransformerDecoderLayer(nn.Module): robust when preprocessing each layer with layernorm and postprocessing with: `dropout -> add residual`. 
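# Hedged sketch, not part of the diff, mirroring the scoring used by
# TransformerEncoderLayerBase._get_fc_rank above: each FFN hidden unit is
# scored by the L1 norm of its fc1 row, its fc2 column and its fc1 bias, and
# the lowest-scoring units become candidates for pruning.
import torch
import torch.nn as nn

fc1, fc2 = nn.Linear(16, 64), nn.Linear(64, 16)
scores = (
    fc1.weight.abs().sum(dim=1)    # incoming weights of each hidden unit
    + fc2.weight.abs().sum(dim=0)  # outgoing weights of each hidden unit
    + fc1.bias.abs()
)
remove_index = scores.argsort()[:8].tolist()   # the 8 least important units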
We default to the approach in the paper, but the tensor2tensor approach can be enabled by setting - *args.decoder_normalize_before* to ``True``. + *cfg.decoder.normalize_before* to ``True``. Args: args (argparse.Namespace): parsed command-line arguments @@ -173,66 +256,87 @@ class TransformerDecoderLayer(nn.Module): """ def __init__( - self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False + self, cfg, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False ): super().__init__() - self.embed_dim = args.decoder_embed_dim + self.embed_dim = cfg.decoder.embed_dim self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ + cfg.dropout, module_name=self.__class__.__name__ ) - self.quant_noise = getattr(args, "quant_noise_pq", 0) - self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) + self.quant_noise = cfg.quant_noise.pq + self.quant_noise_block_size = cfg.quant_noise.pq_block_size - self.cross_self_attention = getattr(args, "cross_self_attention", False) + self.cross_self_attention = cfg.cross_self_attention self.self_attn = self.build_self_attention( self.embed_dim, - args, + cfg, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) - - self.activation_fn = utils.get_activation_fn( - activation=str(args.activation_fn) - if getattr(args, "activation_fn", None) is not None - else "relu" + self.attn_ln = ( + LayerNorm(self.embed_dim) + if utils.safe_getattr(cfg, "scale_attn", False) + else None + ) + self.nh = self.self_attn.num_heads + self.head_dim = self.self_attn.head_dim + scale_heads = utils.safe_getattr(cfg, "scale_heads", False) + self.c_attn = ( + nn.Parameter(torch.ones((self.nh,)), requires_grad=True) + if scale_heads + else None ) - activation_dropout_p = getattr(args, "activation_dropout", 0) or 0 + + self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn) + activation_dropout_p = cfg.activation_dropout if activation_dropout_p == 0: - # for backwards compatibility with models that use args.relu_dropout - activation_dropout_p = getattr(args, "relu_dropout", 0) or 0 + # for backwards compatibility with models that use cfg.relu_dropout + activation_dropout_p = cfg.relu_dropout or 0 self.activation_dropout_module = FairseqDropout( float(activation_dropout_p), module_name=self.__class__.__name__ ) - self.normalize_before = args.decoder_normalize_before + self.normalize_before = cfg.decoder.normalize_before - # use layerNorm rather than FusedLayerNorm for exporting. - # char_inputs can be used to determint this. 
- # TODO remove this once we update apex with the fix - export = getattr(args, "char_inputs", False) - self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) if no_encoder_attn: self.encoder_attn = None self.encoder_attn_layer_norm = None else: - self.encoder_attn = self.build_encoder_attention(self.embed_dim, args) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) + + self.ffn_layernorm = ( + LayerNorm(cfg.decoder.ffn_embed_dim) + if utils.safe_getattr(cfg, "scale_fc", False) + else None + ) + self.w_resid = ( + nn.Parameter( + torch.ones( + self.embed_dim, + ), + requires_grad=True, + ) + if utils.safe_getattr(cfg, "scale_resids", False) + else None + ) self.fc1 = self.build_fc1( self.embed_dim, - args.decoder_ffn_embed_dim, + cfg.decoder.ffn_embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.fc2 = self.build_fc2( - args.decoder_ffn_embed_dim, + cfg.decoder.ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size, ) - self.final_layer_norm = LayerNorm(self.embed_dim, export=export) + self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) self.need_attn = True self.onnx_trace = False @@ -244,29 +348,31 @@ def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) def build_self_attention( - self, embed_dim, args, add_bias_kv=False, add_zero_attn=False + self, embed_dim, cfg, add_bias_kv=False, add_zero_attn=False ): return MultiheadAttention( embed_dim, - args.decoder_attention_heads, - dropout=args.attention_dropout, + cfg.decoder.attention_heads, + dropout=cfg.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, - self_attention=not getattr(args, "cross_self_attention", False), + self_attention=not cfg.cross_self_attention, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, + xformers_att_config=cfg.decoder.xformers_att_config, ) - def build_encoder_attention(self, embed_dim, args): + def build_encoder_attention(self, embed_dim, cfg): return MultiheadAttention( embed_dim, - args.decoder_attention_heads, - kdim=getattr(args, "encoder_embed_dim", None), - vdim=getattr(args, "encoder_embed_dim", None), - dropout=args.attention_dropout, + cfg.decoder.attention_heads, + kdim=cfg.encoder.embed_dim, + vdim=cfg.encoder.embed_dim, + dropout=cfg.attention_dropout, encoder_decoder_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, + xformers_att_config=cfg.encoder.xformers_att_config, ) def prepare_for_onnx_export_(self): @@ -351,6 +457,13 @@ def forward( need_weights=False, attn_mask=self_attn_mask, ) + if self.c_attn is not None: + tgt_len, bsz = x.size(0), x.size(1) + x = x.view(tgt_len, bsz, self.nh, self.head_dim) + x = torch.einsum("tbhd,h->tbhd", x, self.c_attn) + x = x.reshape(tgt_len, bsz, self.embed_dim) + if self.attn_ln is not None: + x = self.attn_ln(x) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: @@ -392,8 +505,12 @@ def forward( x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) + if self.ffn_layernorm is not None: + x = self.ffn_layernorm(x) x = self.fc2(x) x = self.dropout_module(x) + if self.w_resid is not None: + residual = 
torch.mul(self.w_resid, residual) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) @@ -415,9 +532,31 @@ def make_generation_fast_(self, need_attn: bool = False, **kwargs): self.need_attn = need_attn -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m +# backward compatible with the legacy argparse format +class TransformerDecoderLayer(TransformerDecoderLayerBase): + def __init__( + self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False + ): + super().__init__( + TransformerConfig.from_namespace(args), + no_encoder_attn=no_encoder_attn, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + self.args = args + + def build_self_attention( + self, embed_dim, args, add_bias_kv=False, add_zero_attn=False + ): + return super().build_self_attention( + embed_dim, + TransformerConfig.from_namespace(args), + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + def build_encoder_attention(self, embed_dim, args): + return super().build_encoder_attention( + embed_dim, + TransformerConfig.from_namespace(args), + ) diff --git a/fairseq/modules/transformer_layer_aug.py b/fairseq/modules/transformer_layer_aug.py new file mode 100644 index 0000000000..7eb816978a --- /dev/null +++ b/fairseq/modules/transformer_layer_aug.py @@ -0,0 +1,315 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Optional + +import torch +from numpy.random import uniform +from torch import Tensor + +from fairseq.modules import LayerNorm +from fairseq.modules.transformer_layer import TransformerDecoderLayerBase + + +class AugTransformerDecoderLayerBase(TransformerDecoderLayerBase): + """Decoder layer block augmented with an additional cross-attention. + + This decoder block is processed with the sequence of the following sub-modules. + self-attention -> cross-attention (first) -> cross-attention (second) -> FFN + + Args: + cfg (argparse.Namespace): parsed command-line arguments + encoder_attn_merge_type (str, optional): the way to combine outputs from + two cross-attention modules. If "sequential" is set, two cross-attention + modules are stacked sequentially. If "parallel" is set, they are processed + in parallel and combined before feeding it to FFN (default: sequential). + dropnet_ratio (float, optional): a probability to drop each cross-attention + module during training (default: 0.0). 
+ """ + + def __init__( + self, + cfg, + add_bias_kv=False, + add_zero_attn=False, + encoder_attn_merge_type="sequential", + dropnet_ratio=0.0, + ): + super().__init__( + cfg, + no_encoder_attn=False, + add_bias_kv=add_bias_kv, + add_zero_attn=False, + ) + self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) + self.encoder_attn2 = self.build_encoder_attention(self.embed_dim, cfg) + if encoder_attn_merge_type == "sequential": + self.encoder_attn_layer_norm2 = LayerNorm(self.embed_dim, export=cfg.export) + else: + self.encoder_attn_layer_norm2 = None + + self.encoder_attn_merge_type = encoder_attn_merge_type + self.dropnet_ratio = dropnet_ratio + + def forward( + self, + x, + encoder_out: Optional[torch.Tensor] = None, + encoder_padding_mask: Optional[torch.Tensor] = None, + encoder_out_aug: Optional[torch.Tensor] = None, + encoder_padding_mask2: Optional[torch.Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + prev_self_attn_state: Optional[List[torch.Tensor]] = None, + prev_attn_state: Optional[List[torch.Tensor]] = None, + self_attn_mask: Optional[torch.Tensor] = None, + self_attn_padding_mask: Optional[torch.Tensor] = None, + need_attn: bool = False, + need_head_weights: bool = False, + ): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + if need_head_weights: + need_attn = True + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state, saved_state) + _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) + if self.cross_self_attention and not ( + incremental_state is not None + and _self_attn_input_buffer is not None + and "prev_key" in _self_attn_input_buffer + ): + if self_attn_mask is not None: + assert encoder_out is not None + self_attn_mask = torch.cat( + (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 + ) + if self_attn_padding_mask is not None: + if encoder_padding_mask is None: + assert encoder_out is not None + encoder_padding_mask = self_attn_padding_mask.new_zeros( + encoder_out.size(1), encoder_out.size(0) + ) + self_attn_padding_mask = torch.cat( + (encoder_padding_mask, self_attn_padding_mask), dim=1 + ) + assert encoder_out is not None + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + + x, attn = self.self_attn( + query=x, + key=y, + value=y, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, + ) + if self.c_attn is not None: + tgt_len, bsz = x.size(0), x.size(1) + x = x.view(tgt_len, bsz, self.nh, self.head_dim) + x = torch.einsum("tbhd,h->tbhd", x, self.c_attn) + x = 
x.reshape(tgt_len, bsz, self.embed_dim) + if self.attn_ln is not None: + x = self.attn_ln(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + assert encoder_out is not None + assert encoder_out_aug is not None + + if self.encoder_attn_merge_type == "sequential": + ratios = self.get_dropnet_ratio() + + # first encoder attention + if ratios[0] > 0: + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + x = ratios[0] * x + + # second encoder attention + if ratios[1] > 0: + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm2(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn2._set_input_buffer(incremental_state, saved_state) + + x, attn2 = self.encoder_attn2( + query=x, + key=encoder_out_aug, + value=encoder_out_aug, + key_padding_mask=encoder_padding_mask2, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm2(x) + x = ratios[1] * x + + elif self.encoder_attn_merge_type == "parallel": + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + + x1, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x2, attn2 = self.encoder_attn2( + query=x, + key=encoder_out_aug, + value=encoder_out_aug, + key_padding_mask=encoder_padding_mask2, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x1 = self.dropout_module(x1) + x2 = self.dropout_module(x2) + ratios = 
self.get_dropnet_ratio() + x = ratios[0] * x1 + ratios[1] * x2 + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + else: + raise NotImplementedError(self.encoder_attn_merge_type) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + if self.ffn_layernorm is not None: + x = self.ffn_layernorm(x) + x = self.fc2(x) + x = self.dropout_module(x) + if self.w_resid is not None: + residual = torch.mul(self.w_resid, residual) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + if self.onnx_trace and incremental_state is not None: + saved_state = self.self_attn._get_input_buffer(incremental_state) + assert saved_state is not None + if self_attn_padding_mask is not None: + self_attn_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + saved_state["prev_key_padding_mask"], + ] + else: + self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] + return x, attn, attn2, self_attn_state + return x, attn, attn2, None + + def get_dropnet_ratio(self): + if self.encoder_attn_merge_type == "sequential": + if self.dropnet_ratio > 0: + frand = float(uniform(0, 1)) + if frand < self.dropnet_ratio and self.training: + return [2, 0] + elif frand > 1 - self.dropnet_ratio and self.training: + return [0, 2] + else: + return [1, 1] + else: + return [1, 1] + + elif self.encoder_attn_merge_type == "parallel": + if self.dropnet_ratio > 0: + frand = float(uniform(0, 1)) + if frand < self.dropnet_ratio and self.training: + return [1, 0] + elif frand > 1 - self.dropnet_ratio and self.training: + return [0, 1] + else: + return [0.5, 0.5] + else: + return [0.5, 0.5] diff --git a/fairseq/modules/transformer_sentence_encoder.py b/fairseq/modules/transformer_sentence_encoder.py index 208488f562..5d2db91ad7 100644 --- a/fairseq/modules/transformer_sentence_encoder.py +++ b/fairseq/modules/transformer_sentence_encoder.py @@ -32,18 +32,23 @@ def init_bert_params(module): the normal distribution (to be validated). 
""" + def normal_(data): + # with FSDP, module params will be on CUDA, so we cast them back to CPU + # so that the RNG is consistent with and without FSDP + data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) + if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=0.02) + normal_(module.weight.data) if module.bias is not None: module.bias.data.zero_() if isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=0.02) + normal_(module.weight.data) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() if isinstance(module, MultiheadAttention): - module.q_proj.weight.data.normal_(mean=0.0, std=0.02) - module.k_proj.weight.data.normal_(mean=0.0, std=0.02) - module.v_proj.weight.data.normal_(mean=0.0, std=0.02) + normal_(module.q_proj.weight.data) + normal_(module.k_proj.weight.data) + normal_(module.v_proj.weight.data) class TransformerSentenceEncoder(nn.Module): @@ -113,7 +118,6 @@ def __init__( self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.traceable = traceable - self.tpu = False # whether we're on TPU self.embed_tokens = self.build_embedding( self.vocab_size, self.embedding_dim, self.padding_idx @@ -146,6 +150,11 @@ def __init__( else None ) + if encoder_normalize_before: + self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) + else: + self.emb_layer_norm = None + if self.layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.layerdrop) else: @@ -168,11 +177,6 @@ def __init__( ] ) - if encoder_normalize_before: - self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) - else: - self.emb_layer_norm = None - # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params) @@ -220,9 +224,6 @@ def build_transformer_sentence_encoder_layer( qn_block_size=qn_block_size, ) - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - def forward( self, tokens: torch.Tensor, @@ -230,11 +231,13 @@ def forward( last_state_only: bool = False, positions: Optional[torch.Tensor] = None, token_embeddings: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: + is_tpu = tokens.device.type == "xla" # compute padding mask. 
This is needed for multi-head attention padding_mask = tokens.eq(self.padding_idx) - if not self.traceable and not self.tpu and not padding_mask.any(): + if not self.traceable and not is_tpu and not padding_mask.any(): padding_mask = None if token_embeddings is not None: @@ -271,7 +274,9 @@ def forward( inner_states.append(x) for layer in self.layers: - x, _ = layer(x, self_attn_padding_mask=padding_mask) + x, _ = layer( + x, self_attn_padding_mask=padding_mask, self_attn_mask=attn_mask + ) if not last_state_only: inner_states.append(x) diff --git a/fairseq/modules/transformer_sentence_encoder_layer.py b/fairseq/modules/transformer_sentence_encoder_layer.py index 3589c60fe6..f869c4b2f8 100644 --- a/fairseq/modules/transformer_sentence_encoder_layer.py +++ b/fairseq/modules/transformer_sentence_encoder_layer.py @@ -40,6 +40,11 @@ def __init__( # Initialize parameters self.embedding_dim = embedding_dim + self.num_attention_heads = num_attention_heads + self.attention_dropout = attention_dropout + self.q_noise = q_noise + self.qn_block_size = qn_block_size + self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__ ) diff --git a/fairseq/modules/transpose_last.py b/fairseq/modules/transpose_last.py index e578b3ec50..d7cca9a4bb 100644 --- a/fairseq/modules/transpose_last.py +++ b/fairseq/modules/transpose_last.py @@ -10,11 +10,12 @@ class TransposeLast(nn.Module): - def __init__(self, deconstruct_idx=None): + def __init__(self, deconstruct_idx=None, tranpose_dim=-2): super().__init__() self.deconstruct_idx = deconstruct_idx + self.tranpose_dim = tranpose_dim def forward(self, x): if self.deconstruct_idx is not None: x = x[self.deconstruct_idx] - return x.transpose(-2, -1) + return x.transpose(self.tranpose_dim, -1) diff --git a/fairseq/modules/unfold.py b/fairseq/modules/unfold.py index 138272f1ef..bbaafbd6bf 100644 --- a/fairseq/modules/unfold.py +++ b/fairseq/modules/unfold.py @@ -6,7 +6,7 @@ import torch.nn.functional as F -def unfold1d(x, kernel_size, padding_l, pad_value=0): +def unfold1d(x, kernel_size: int, padding_l: int, pad_value: float = 0): """unfold T x B x C to T x B x C x K""" if kernel_size > 1: T, B, C = x.size() diff --git a/fairseq/nan_detector.py b/fairseq/nan_detector.py index faa8031d46..bd0f911073 100644 --- a/fairseq/nan_detector.py +++ b/fairseq/nan_detector.py @@ -37,8 +37,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback): gradients = {} for name, param in self.named_parameters: if param.grad is not None: - grad_norm = torch.norm(param.grad.data, p=2, dtype=torch.float32) - norm[name] = grad_norm.item() + grad_norm = torch.norm(param.grad.data.float(), p=2) + norm[name] = param.norm().item() if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any(): gradients[name] = param.grad.data if len(gradients) > 0: diff --git a/fairseq/ngram_repeat_block.py b/fairseq/ngram_repeat_block.py new file mode 100644 index 0000000000..4eb5030311 --- /dev/null +++ b/fairseq/ngram_repeat_block.py @@ -0,0 +1,120 @@ +# Originally from Microsoft Corporation. +# Licensed under the MIT License. 
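+#
+# A hedged usage sketch (``tokens``, ``lprobs``, ``bsz``, ``beam_size`` and
+# ``step`` are assumed to be beam-search state supplied by the caller):
+#
+#   blocker = NGramRepeatBlock(no_repeat_ngram_size=3, use_extension=False)
+#   lprobs = blocker(tokens, lprobs, bsz, beam_size, step)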
+ +""" Wrapper for ngram_repeat_block cuda extension """ +import math +import warnings +from typing import List + +import torch +from torch import nn + +try: + from fairseq import ngram_repeat_block_cuda + + EXTENSION_BUILT = True +except ImportError: + EXTENSION_BUILT = False + + +def is_cuda_extension_usable() -> bool: + """Check whether ngram_repeat_block_cuda is built properly""" + if not EXTENSION_BUILT or not torch.cuda.is_available(): + return False + bsz = 2 + tokens = torch.tensor([[4, 4, 3, 2], [1, 2, 3, 4]], dtype=torch.long, device="cuda") + lprobs = torch.rand((8, 12), device="cuda") + try: + outputs = ngram_repeat_block_cuda.forward(tokens, lprobs, bsz, 3, 4, 3) + outputs = outputs + 4 # This line breaks if the extension is built incorrectly. + return True + except RuntimeError: + warnings.warn( + "NGramRepeatBlock extension must be rebuilt." + 'Run TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0" python setup.py build_ext --inplace' + ) + return False + + +class NGramRepeatBlock(nn.Module): + """Wrapper class for calling ngram_repeat_block cuda extension""" + + def __init__(self, no_repeat_ngram_size: int, use_extension: bool = True): + super().__init__() + self.use_extension = is_cuda_extension_usable() if use_extension else False + self.no_repeat_ngram_size = no_repeat_ngram_size + + def reset_parameters(self): + pass + + @torch.jit.unused + def call_cuda_extension( + self, + tokens, + lprobs, + bsz: int, + beam_size: int, + step: int, + ): + return ngram_repeat_block_cuda.forward( + tokens, lprobs, bsz, step, beam_size, self.no_repeat_ngram_size + ) + + def forward( + self, + tokens, + lprobs, + bsz: int, + beam_size: int, + step: int, + ): + """ + Args: + tokens(Tensor): Input tokens(Bsz*beam, seq_len) + lprobs(Tensor): likelihood probability, + Expected to be updated in place.(Bsz*beam, vocab_size) + bsz(int): batch size + step(int): current step + beam_size(int): beam size + no_repeat_ngram_size(int): Ngram size + """ + msg = f"expected {bsz *beam_size} got" + assert tokens.size(0) == bsz * beam_size, f"{msg} {tokens.size(0)}" + assert lprobs.size(0) == bsz * beam_size, f"{msg} {lprobs.size(0)}" + if self.use_extension: + return self.call_cuda_extension(tokens, lprobs, bsz, beam_size, step) + + else: + return self._no_repeat_ngram( + tokens, + lprobs, + bsz, + beam_size, + step, + ) + + def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int): + """For each hypothesis generate a list of previous ngrams and set associated lprobs to -inf""" + banned_tokens = [ + torch.jit.annotate(List[int], []) for bbsz_idx in range(bsz * beam_size) + ] + if step + 2 - self.no_repeat_ngram_size >= 0: + cpu_tokens: List[List[int]] = tokens.cpu().tolist() + check_start_pos = step + 2 - self.no_repeat_ngram_size + for bbsz_idx in range(bsz * beam_size): + ngram_to_check = cpu_tokens[bbsz_idx][ + -(self.no_repeat_ngram_size - 1) : + ] + for i in range(check_start_pos): + if ( + ngram_to_check + == cpu_tokens[bbsz_idx][i : i + self.no_repeat_ngram_size - 1] + ): + banned_tokens[bbsz_idx].append( + cpu_tokens[bbsz_idx][i + self.no_repeat_ngram_size - 1] + ) + for bbsz_idx in range(bsz * beam_size): + lprobs[bbsz_idx][ + torch.tensor(banned_tokens[bbsz_idx], dtype=torch.int64) + ] = torch.tensor(-math.inf).to(lprobs) + return lprobs diff --git a/fairseq/optim/__init__.py b/fairseq/optim/__init__.py index d8e581729e..be783be896 100644 --- a/fairseq/optim/__init__.py +++ b/fairseq/optim/__init__.py @@ -13,11 +13,13 @@ FairseqOptimizer, LegacyFairseqOptimizer, ) +from 
fairseq.optim.amp_optimizer import AMPOptimizer from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer from fairseq.optim.shard import shard_ from omegaconf import DictConfig __all__ = [ + "AMPOptimizer", "FairseqOptimizer", "FP16Optimizer", "MemoryEfficientFP16Optimizer", @@ -32,9 +34,7 @@ ) = registry.setup_registry("--optimizer", base_class=FairseqOptimizer, required=True) -def build_optimizer( - cfg: DictConfig, params, *extra_args, **extra_kwargs -): +def build_optimizer(cfg: DictConfig, params, *extra_args, **extra_kwargs): if all(isinstance(p, dict) for p in params): params = [t for p in params for t in p.values()] params = list(filter(lambda p: p.requires_grad, params)) @@ -42,7 +42,7 @@ def build_optimizer( # automatically import any Python files in the optim/ directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): file_name = file[: file.find(".py")] importlib.import_module("fairseq.optim." + file_name) diff --git a/fairseq/optim/adafactor.py b/fairseq/optim/adafactor.py index 91745ce10e..042ae926b0 100644 --- a/fairseq/optim/adafactor.py +++ b/fairseq/optim/adafactor.py @@ -76,7 +76,7 @@ class Adafactor(torch.optim.Optimizer): schedule you should set `scale_parameter=False` and `relative_step=False`. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): external learning rate (default: None) @@ -168,7 +168,7 @@ def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" @@ -224,7 +224,7 @@ def step(self, closure=None): group["lr"] = self._get_lr(group, state) beta2t = 1.0 - math.pow(state["step"], group["decay_rate"]) - update = (grad ** 2) + group["eps"][0] + update = (grad**2) + group["eps"][0] if factored: exp_avg_sq_row = state["exp_avg_sq_row"] exp_avg_sq_col = state["exp_avg_sq_col"] diff --git a/fairseq/optim/adagrad.py b/fairseq/optim/adagrad.py index a79b6c39da..4f539541c1 100644 --- a/fairseq/optim/adagrad.py +++ b/fairseq/optim/adagrad.py @@ -37,4 +37,4 @@ def optimizer_config(self): @property def supports_flat_params(self): - return True + return False diff --git a/fairseq/optim/adam.py b/fairseq/optim/adam.py index 9b8ddffd7e..678ec7c617 100644 --- a/fairseq/optim/adam.py +++ b/fairseq/optim/adam.py @@ -5,9 +5,9 @@ import logging import math -from collections import Collection +from collections.abc import Collection from dataclasses import dataclass, field -from typing import List +from typing import Any, List import torch import torch.distributed as dist @@ -15,7 +15,7 @@ from fairseq.dataclass import FairseqDataclass from fairseq.optim import FairseqOptimizer, register_optimizer from fairseq.optim.fused_adam import get_fused_adam_class -from omegaconf import II, DictConfig +from omegaconf import II, OmegaConf logger = logging.getLogger(__name__) @@ -23,8 +23,8 @@ @dataclass class FairseqAdamConfig(FairseqDataclass): - adam_betas: str = field( - default="(0.9, 0.999)", metadata={"help": "betas for Adam optimizer"} + adam_betas: Any = field( + default=(0.9, 0.999), metadata={"help": "betas for Adam optimizer"} ) adam_eps: float = field( default=1e-8, metadata={"help": "epsilon for Adam optimizer"} @@ -33,6 +33,9 @@ class FairseqAdamConfig(FairseqDataclass): use_old_adam: bool = field( default=False, metadata={"help": "Use fairseq.optim.adam.Adam"} ) + fp16_adam_stats: bool = field( + default=False, metadata={"help": "use FP16 stats (with automatic scaling)"} + ) # TODO common vars below in parent tpu: bool = II("common.tpu") lr: List[float] = II("optimization.lr") @@ -47,7 +50,7 @@ class FairseqAdam(FairseqOptimizer): analogous to torch.optim.AdamW from PyTorch. 
""" - def __init__(self, cfg: DictConfig, params): + def __init__(self, cfg: FairseqAdamConfig, params): super().__init__(cfg) fused_adam_cls = get_fused_adam_class() use_fused_adam = ( @@ -56,13 +59,21 @@ def __init__(self, cfg: DictConfig, params): and torch.cuda.is_available() ) if getattr(cfg, "tpu", False): + if self.cfg.fp16_adam_stats: + raise NotImplementedError("--fp16-adam-stats is only supported on GPU") # on TPUs we use the Adam defined here, since it # automatically casts gradients to FP32 self._optimizer = Adam(params, **self.optimizer_config) elif use_fused_adam: logger.info("using FusedAdam") - self._optimizer = fused_adam_cls(params, **self.optimizer_config) + self._optimizer = fused_adam_cls( + params, use_fp16_stats=self.cfg.fp16_adam_stats, **self.optimizer_config + ) else: + if self.cfg.fp16_adam_stats: + raise NotImplementedError( + "--fp16-adam-stats is only supported with FusedAdamV1" + ) self._optimizer = Adam(params, **self.optimizer_config) @property @@ -77,7 +88,9 @@ def optimizer_config(self): "lr": self.cfg.lr[0] if isinstance(self.cfg.lr, Collection) else self.cfg.lr, - "betas": eval(self.cfg.adam_betas), + "betas": eval(self.cfg.adam_betas) + if isinstance(self.cfg.adam_betas, str) + else OmegaConf.to_container(self.cfg.adam_betas), "eps": self.cfg.adam_eps, "weight_decay": self.cfg.weight_decay, } @@ -95,7 +108,7 @@ def average_params(self): class Adam(torch.optim.Optimizer): - """Implements Adam algorithm. + r"""Implements Adam algorithm. This implementation is modified from torch.optim.Adam based on: `Fixed Weight Decay Regularization in Adam` @@ -103,7 +116,7 @@ class Adam(torch.optim.Optimizer): It has been proposed in `Adam: A Method for Stochastic Optimization`_. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) @@ -146,7 +159,7 @@ def supports_flat_params(self): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/fairseq/optim/adamax.py b/fairseq/optim/adamax.py index 577a688166..98ff8ad7ad 100644 --- a/fairseq/optim/adamax.py +++ b/fairseq/optim/adamax.py @@ -53,7 +53,7 @@ class Adamax(torch.optim.Optimizer): Compared to the version in PyTorch, this version implements a fix for weight decay. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 2e-3) @@ -107,7 +107,7 @@ def supports_flat_params(self): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/fairseq/optim/amp_optimizer.py b/fairseq/optim/amp_optimizer.py new file mode 100644 index 0000000000..cfe57d07f9 --- /dev/null +++ b/fairseq/optim/amp_optimizer.py @@ -0,0 +1,106 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +from fairseq import optim +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) + + +class AMPOptimizer(optim.FairseqOptimizer): + """ + Wrap an *optimizer* to support AMP (automatic mixed precision) training. 
+ """ + + def __init__(self, cfg: DictConfig, params, fp32_optimizer, **kwargs): + super().__init__(cfg.optimizer) + self.fp32_optimizer = fp32_optimizer + amp_kwargs = {"init_scale": cfg.common.fp16_init_scale} + if getattr(cfg.common, "amp_scale_window", None) is not None: + amp_kwargs["growth_interval"] = cfg.common.amp_init_scale + self._grad_scaler = torch.cuda.amp.GradScaler(**amp_kwargs) + self.min_loss_scale = cfg.common.min_loss_scale + + @classmethod + def build_optimizer(cls, cfg: DictConfig, params, **kwargs): + """ + Args: + cfg (omegaconf.DictConfig): fairseq args + params (iterable): iterable of parameters to optimize + """ + fp32_optimizer = optim.build_optimizer(cfg.optimizer, params) + return cls(cfg, params, fp32_optimizer, **kwargs) + + def backward(self, loss): + """Computes the sum of gradients of the given tensor w.r.t. graph leaves. + + Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this + function additionally dynamically scales the loss to avoid gradient + underflow. + """ + self._grad_scaler.scale(loss).backward() + + def step(self): + self.scaler.step(self.fp32_optimizer) + self.scaler.update() + + def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): + """Clips gradient norm.""" + self.scaler.unscale_(self.optimizer) + grad_norm = self.fp32_optimizer.clip_grad_norm(max_norm, aggregate_norm_fn) + if not torch.isfinite(grad_norm).all(): + new_loss_scale = self.next_loss_scale + if new_loss_scale <= self.min_loss_scale: + raise FloatingPointError( + ( + "AMP: Minimum loss scale reached ({}). Your loss is probably exploding. " + "Try restarting training or use fp32. {}" + ).format(self.min_loss_scale, new_loss_scale) + ) + else: + logger.info( + "AMP: overflow detected, setting scale to " f"to {new_loss_scale}" + ) + return grad_norm + + @property + def scaler(self): + return self._grad_scaler + + @property + def next_loss_scale(self): + return self.scaler.get_scale() * self.scaler.get_backoff_factor() + + @property + def optimizer(self): + return self.fp32_optimizer.optimizer + + @optimizer.setter + def optimizer(self, optimizer): + self.fp32_optimizer.optimizer = optimizer + + @property + def lr_scheduler(self): + return getattr(self.fp32_optimizer, "lr_scheduler", None) + + @property + def optimizer_config(self): + return self.fp32_optimizer.optimizer_config + + def get_lr(self): + return self.fp32_optimizer.get_lr() + + def set_lr(self, lr): + self.fp32_optimizer.set_lr(lr) + + def all_reduce_grads(self, module): + self.fp32_optimizer.all_reduce_grads(module) + + @property + def supports_flat_params(self): + return self.fp32_optimizer.supports_flat_params diff --git a/fairseq/optim/bmuf.py b/fairseq/optim/bmuf.py index 55f225ba6a..d6d0e04e86 100644 --- a/fairseq/optim/bmuf.py +++ b/fairseq/optim/bmuf.py @@ -7,39 +7,9 @@ import torch import torch.distributed as dist -from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.configs import FairseqBMUFConfig from fairseq.dataclass.utils import gen_parser_from_dataclass from fairseq.optim.fairseq_optimizer import FairseqOptimizer -from omegaconf import II, DictConfig - - -@dataclass -class FairseqBMUFConfig(FairseqDataclass): - block_lr: float = field( - default=1, metadata={"help": "block learning rate for bmuf"} - ) - block_momentum: float = field( - default=0.875, metadata={"help": "block momentum for bmuf"} - ) - global_sync_iter: int = field( - default=50, metadata={"help": "Iteration for syncing global model"} - ) - warmup_iterations: int = field( - default=500, 
metadata={"help": "warmup iterations for model to broadcast"} - ) - use_nbm: bool = field( - default=False, - metadata={"help": "Specify whether you want to use classical BM / Nesterov BM"}, - ) - average_sync: bool = field( - default=False, - metadata={ - "help": "Specify whether you want to average the local momentum after each sync" - }, - ) - distributed_world_size: int = II( - "distributed_training.distributed_world_size" - ) class FairseqBMUF(FairseqOptimizer): @@ -52,7 +22,7 @@ class FairseqBMUF(FairseqOptimizer): model-update filtering """ - def __init__(self, cfg: DictConfig, optimizer): + def __init__(self, cfg: FairseqBMUFConfig, optimizer): super().__init__(cfg) self._optimizer = optimizer self._num_updates = 0 diff --git a/fairseq/optim/composite.py b/fairseq/optim/composite.py new file mode 100644 index 0000000000..1ef0114ed6 --- /dev/null +++ b/fairseq/optim/composite.py @@ -0,0 +1,273 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Any, List, Optional + +import torch.optim +from fairseq.dataclass import FairseqDataclass +from fairseq.optim import FairseqOptimizer, register_optimizer, _build_optimizer +from fairseq.optim.lr_scheduler import FairseqLRScheduler, build_lr_scheduler +from omegaconf import II, open_dict +import copy + + +logger = logging.getLogger(__name__) + + +@dataclass +class OptimizerAndSchedulerConfig(FairseqDataclass): + optimizer: Any = None + lr_scheduler: Optional[Any] = None + lr: List = II("optimization.lr") + lr_float: Optional[ + float + ] = None # this makes it easier to sweep on learning rate with auto sweepers + + +@dataclass +class CompositeOptimizerConfig(FairseqDataclass): + groups: Dict[str, Any] = field( + default_factory=lambda: {}, + metadata={ + "help": "optimizer name -> optimizer OptimizerAndSchedulerConfig. 
" + "Configures a different optimizer and (optionally) lr scheduler for each parameter group" + }, + ) + dynamic_groups: bool = field( + default=False, + metadata={ + "help": "create groups dynamically based on parameters, if set to False, all parameters needs to have group_names" + }, + ) + + +@register_optimizer("composite", dataclass=CompositeOptimizerConfig) +class FairseqCompositeOptimizer(FairseqOptimizer): + + optimizers: Dict[str, FairseqOptimizer] = {} + lr_schedulers: Dict[str, FairseqLRScheduler] = {} + lr_scheduler: FairseqLRScheduler = None + _optimizer: torch.optim.Optimizer + + def __init__(self, cfg: CompositeOptimizerConfig, params): + super().__init__(cfg) + + assert ( + len(params) > 1 + ), "Composite optimizer only works when there are multiple parameter groups (try fp16_no_flatten_grads: true)" + + def dict_hash(dictionary: Dict[str, Any]) -> str: + import hashlib + import json + + dhash = hashlib.md5() + encoded = json.dumps(dictionary, sort_keys=True).encode() + dhash.update(encoded) + return dhash.hexdigest() + + groupped_params = defaultdict(list) + overrides = defaultdict(dict) + if not cfg.dynamic_groups: + for p in params: + group = getattr(p, "param_group", "default") + override_config = getattr(p, "optim_overrides", None) + if override_config is not None and bool(override_config): + overrides[group] = override_config + else: + assert ( + override_config == None or override_config == overrides[group] + ), f"For group {group}, different overrides found {override_config} v/s {overrides[group]}" + groupped_params[group].append(p) + + for p, params in groupped_params.items(): + override_config = getattr(params[0], "optim_overrides", None) + if override_config is not None: + for pp in params[1:]: + assert override_config == getattr( + pp, "optim_overrides", None + ), f" {str(override_config)} != {str(getattr(pp, 'optim_overrides', None))}" + else: + for p in params: + group = getattr(p, "param_group", "default") + override_config = getattr(p, "optim_overrides", None) + if override_config is not None: + override_config["group_name"] = group + group_name = dict_hash(override_config) + overrides[group_name] = override_config + else: + group_name = group + groupped_params[group_name].append(p) + + self.optimizers_config = {} + for group, group_params in groupped_params.items(): + p_group = group + if group in overrides and "group_name" in overrides[group]: + p_group = overrides[group]["group_name"] + if group in cfg.groups: + group_cfg = cfg.groups[group] + optimizer_config = copy.deepcopy(group_cfg.optimizer) + scheduler_config = copy.deepcopy(group_cfg.lr_scheduler) + explicit_group_present = True + else: + group_cfg = cfg.groups[p_group] + optimizer_config = copy.deepcopy(group_cfg.optimizer) + scheduler_config = copy.deepcopy(group_cfg.lr_scheduler) + explicit_group_present = False + + if getattr(group_cfg, "lr_float", None) is not None: + with open_dict(optimizer_config): + optimizer_config.lr = [group_cfg.lr_float] + + if group in overrides and "optimizer" in overrides[group]: + with open_dict(optimizer_config): + if "lr_scale" in overrides[group]["optimizer"]: + lr_scale = overrides[group]["optimizer"]["lr_scale"] + optimizer_config.lr = [ + lr * lr_scale for lr in optimizer_config.lr + ] + + if explicit_group_present: + logger.info( + f"For group:{group}, config as well as override present for lr" + ) + + if ( + "weight_decay_scale" in overrides[group]["optimizer"] + and "optimizer_config" in optimizer_config + ): + weight_decay_scale = 
overrides[group]["optimizer"][ + "weight_decay_scale" + ] + optimizer_config.weight_decay = ( + optimizer_config.weight_decay * weight_decay_scale + ) + if explicit_group_present: + logger.info( + f"For group:{group}, config as well as override present for weight_decay" + ) + + with open_dict(scheduler_config): + scheduler_config.lr = optimizer_config.lr + self.optimizers[group] = _build_optimizer(optimizer_config, group_params) + self.optimizers_config[group] = optimizer_config + if scheduler_config is not None: + self.lr_schedulers[group] = build_lr_scheduler( + scheduler_config, self.optimizers[group] + ) + logger.info("Optimizers for different groups are as below") + for group in self.optimizers_config.keys(): + logger.info(f"Group : {group}:{self.optimizers_config[group]}") + if len(self.lr_schedulers) > 0: + assert len(self.lr_schedulers) == len(self.optimizers), ( + f"Please provide an lr scheduler for each optimizer to use pass_through scheduler. " + f"Optimizers: {self.optimizers}; Lr scheds: {self.lr_schedulers}" + ) + self.lr_scheduler = CompositeLRScheduler(self.lr_schedulers) + + self._optimizer = CompositeOptimizer(self.optimizers) + + @property + def supports_groups(self): + return True + + @property + def param_groups(self): + for opt in self.optimizers.values(): + for group in opt.param_groups: + yield group + + def get_lr(self): + """Return the current learning rate.""" + k = ( + "default" + if "default" in self.optimizers + else next(iter(self.optimizers.keys())) + ) + return self.optimizers[k].param_groups[0]["lr"] + + def state_dict(self): + """Return the LR scheduler state dict.""" + return {k: s.state_dict() for k, s in self.optimizers.items()} + + def load_state_dict(self, state_dict, optimizer_overrides=None): + """Load an LR scheduler state dict.""" + for k, state in state_dict.items(): + if k not in self.optimizers: + # skip extra keys like "loss_scale" added by fp16 optimizer + continue + + overrides = ( + optimizer_overrides[k] + if isinstance(optimizer_overrides, dict) and k in optimizer_overrides + else None + ) + self.optimizers[k].load_state_dict(state, optimizer_overrides=overrides) + + +class CompositeOptimizer(torch.optim.Optimizer): + def __init__(self, optimizers: Dict[str, FairseqOptimizer]): + self.optimizers = optimizers + + @property + def supports_memory_efficient_fp16(self): + return all(o.supports_memory_efficient_fp16 for o in self.optimizers.values()) + + @property + def supports_flat_params(self): + return all(o.supports_flat_params for o in self.optimizers.values()) + + def step(self, closure=None, groups=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
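+            groups (collection of str, optional): if given, only the wrapped
+                optimizers whose group names appear in ``groups`` are stepped;
+                when ``None``, every group is stepped.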
+ """ + loss = None + if closure is not None: + loss = closure() + + for k, opt in self.optimizers.items(): + if groups is None or k in groups: + opt.step() + + return loss + + def zero_grad(self): + for opt in self.optimizers.values(): + opt.zero_grad() + + +class CompositeLRScheduler(FairseqLRScheduler): + def __init__(self, lr_schedulers): + super().__init__(None, None) + + self.lr_schedulers = lr_schedulers + + def state_dict(self): + """Return the LR scheduler state dict.""" + return {k: s.state_dict() for k, s in self.lr_schedulers.items()} + + def load_state_dict(self, state_dict): + """Load an LR scheduler state dict.""" + for k, state in state_dict.items(): + self.lr_schedulers[k].load_state_dict(state) + + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" + for s in self.lr_schedulers.values(): + s.step_begin_epoch(epoch) + + def step(self, epoch, val_loss=None): + """Update the learning rate at the end of the given epoch.""" + for s in self.lr_schedulers.values(): + s.step(epoch) + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + return {k: s.step_update(num_updates) for k, s in self.lr_schedulers.items()} diff --git a/fairseq/optim/cpu_adam.py b/fairseq/optim/cpu_adam.py new file mode 100644 index 0000000000..b218934e71 --- /dev/null +++ b/fairseq/optim/cpu_adam.py @@ -0,0 +1,210 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +from collections.abc import Collection +from dataclasses import dataclass, field +from typing import List + +import torch +from fairseq.dataclass import FairseqDataclass +from fairseq.optim import FairseqOptimizer, register_optimizer +from omegaconf import II, DictConfig + + +try: + import deepspeed + + has_deepspeed = True +except ImportError as e: + has_deepspeed = False + + +def _get_cpu_adam(): + try: + from deepspeed.ops.op_builder import CPUAdamBuilder + + return CPUAdamBuilder().load() + except ImportError: + # fbcode + from deepspeed.ops.adam import DeepSpeedCPUAdam as ds_opt_adam + + return ds_opt_adam + + +@dataclass +class FairseqCPUAdamConfig(FairseqDataclass): + adam_betas: str = field( + default="(0.9, 0.999)", metadata={"help": "betas for Adam optimizer"} + ) + adam_eps: float = field( + default=1e-8, metadata={"help": "epsilon for Adam optimizer"} + ) + weight_decay: float = field(default=0.0, metadata={"help": "weight decay"}) + fp16_adam_stats: bool = field( + default=False, metadata={"help": "use FP16 stats (with automatic scaling)"} + ) + # TODO common vars below in parent + lr: List[float] = II("optimization.lr") + + +@register_optimizer("cpu_adam", dataclass=FairseqCPUAdamConfig) +class FairseqCPUAdam(FairseqOptimizer): + """Adam optimizer for fairseq, optimized for CPU tensors. + + Important note: this optimizer corresponds to the "AdamW" variant of + Adam in its weight decay behavior. As such, it is most closely + analogous to torch.optim.AdamW from PyTorch. + """ + + def __init__(self, cfg: DictConfig, params): + super().__init__(cfg) + self._optimizer = CPUAdam(params, **self.optimizer_config) + + @property + def optimizer_config(self): + """ + Return a kwarg dictionary that will be used to override optimizer + args stored in checkpoints. This allows us to load a checkpoint and + resume training using a different set of optimizer args, e.g., with a + different learning rate. 
+ """ + return { + "lr": self.cfg.lr[0] + if isinstance(self.cfg.lr, Collection) + else self.cfg.lr, + "betas": eval(self.cfg.adam_betas), + "eps": self.cfg.adam_eps, + "weight_decay": self.cfg.weight_decay, + "use_fp16_stats": self.cfg.fp16_adam_stats, + } + + +class CPUAdam(torch.optim.Optimizer): + + optimizer_id = 0 + + def __init__( + self, + params, + lr=1e-3, + bias_correction=True, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + use_fp16_stats=False, + ): + defaults = { + "lr": lr, + "bias_correction": bias_correction, + "betas": betas, + "eps": eps, + "weight_decay": weight_decay, + } + super().__init__(params, defaults) + + self.use_fp16_stats = use_fp16_stats + self.FLOAT16_MAX = 65504.0 + + if not has_deepspeed: + raise ImportError("Please install DeepSpeed: pip install deepspeed") + + self.opt_id = CPUAdam.optimizer_id + CPUAdam.optimizer_id = CPUAdam.optimizer_id + 1 + + self.ds_opt_adam = _get_cpu_adam() + adamw_mode = True + self.ds_opt_adam.create_adam( + self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode + ) + + @property + def supports_memory_efficient_fp16(self): + return True + + @property + def supports_flat_params(self): + return True + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + torch.cuda.synchronize() + + for group_id, group in enumerate(self.param_groups): + for param_id, p in enumerate(group["params"]): + if p.grad is None: + continue + + state = self.state[p] + if len(state) == 0: + state["step"] = 0 + dtype = torch.float16 if self.use_fp16_stats else p.data.dtype + # gradient momentums + state["exp_avg"] = torch.zeros_like( + p.data, dtype=dtype, device="cpu" + ) + # gradient variances + state["exp_avg_sq"] = torch.zeros_like( + p.data, dtype=dtype, device="cpu" + ) + if self.use_fp16_stats: + assert torch.is_floating_point(p.data) + state["exp_avg_scale"] = 1.0 + state["exp_avg_sq_scale"] = 1.0 + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + + p_data_bak = p.data # backup of the original data pointer + + p.data = p.data.to(dtype=torch.float32, device="cpu") + p.grad.data = p.grad.data.to(dtype=torch.float32, device="cpu") + + if self.use_fp16_stats: + exp_avg = exp_avg.float() * state["exp_avg_scale"] + exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"] + + state["step"] += 1 + beta1, beta2 = group["betas"] + + self.ds_opt_adam.adam_update( + self.opt_id, + state["step"], + group["lr"], + beta1, + beta2, + group["eps"], + group["weight_decay"], + group["bias_correction"], + p.data, + p.grad.data, + exp_avg, + exp_avg_sq, + ) + + if p_data_bak.data_ptr() != p.data.data_ptr(): + p_data_bak.copy_(p.data) + p.data = p_data_bak + + if self.use_fp16_stats: + + def inf_norm(t): + return torch.norm(t, float("inf")) + + # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py + state["exp_avg_scale"], state["exp_avg_sq_scale"] = ( + 1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX, + 1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX, + ) + state["exp_avg"], state["exp_avg_sq"] = ( + (exp_avg / state["exp_avg_scale"]).half(), + (exp_avg_sq / state["exp_avg_sq_scale"]).half(), + ) + + return loss diff --git a/fairseq/optim/dynamic_loss_scaler.py b/fairseq/optim/dynamic_loss_scaler.py index c5da604220..60c47b8db0 100644 --- a/fairseq/optim/dynamic_loss_scaler.py +++ b/fairseq/optim/dynamic_loss_scaler.py @@ -7,10 +7,10 @@ class DynamicLossScaler(object): def __init__( self, - init_scale=2.0 ** 15, + init_scale=2.0**15, 
scale_factor=2.0, scale_window=2000, - tolerance=0.05, + tolerance=0.0, threshold=None, min_loss_scale=1e-4, ): diff --git a/fairseq/optim/fairseq_optimizer.py b/fairseq/optim/fairseq_optimizer.py index 9c0938331d..73c7c695ee 100644 --- a/fairseq/optim/fairseq_optimizer.py +++ b/fairseq/optim/fairseq_optimizer.py @@ -6,6 +6,7 @@ import torch from fairseq import utils from fairseq.dataclass.utils import gen_parser_from_dataclass +from collections import defaultdict class FairseqOptimizer(object): @@ -94,24 +95,44 @@ def backward(self, loss): """Computes the sum of gradients of the given tensor w.r.t. graph leaves.""" loss.backward() + def all_reduce_grads(self, module): + """Manually all-reduce gradients (if required).""" + if hasattr(module, "all_reduce_grads"): + module.all_reduce_grads() + def multiply_grads(self, c): """Multiplies grads by a constant *c*.""" + per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) for p in self.params: if p.grad is not None: - p.grad.data.mul_(c) + if p.grad.is_sparse: + p.grad.data.mul_(c.to(p.grad.device) if torch.is_tensor(c) else c) + else: + per_device_and_dtype_grads[p.grad.device][p.grad.dtype].append( + p.grad.data + ) + for device, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + torch._foreach_mul_(grads, c.to(device) if torch.is_tensor(c) else c) def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): """Clips gradient norm.""" return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn) - def step(self, closure=None, scale=1.0): + def step(self, closure=None, scale=1.0, groups=None): """Performs a single optimization step.""" if self.supports_step_with_scale: - self.optimizer.step(closure, scale=scale) + if self.supports_groups: + self.optimizer.step(closure, scale=scale, groups=groups) + else: + self.optimizer.step(closure, scale=scale) else: if scale != 1.0: self.multiply_grads(1.0 / scale) - self.optimizer.step(closure) + if self.supports_groups: + self.optimizer.step(closure, groups=groups) + else: + self.optimizer.step(closure) def zero_grad(self): """Clears the gradients of all optimized parameters.""" @@ -131,6 +152,12 @@ def supports_step_with_scale(self): return self.optimizer.supports_step_with_scale return False + @property + def supports_groups(self): + if hasattr(self.optimizer, "supports_groups"): + return self.optimizer.supports_groups + return False + @property def supports_flat_params(self): """ @@ -144,6 +171,16 @@ def supports_flat_params(self): def average_params(self): pass + def broadcast_global_state_dict(self, state_dict): + """ + Broadcasts a global state dict to all ranks. + Useful for optimizers that shard state between ranks. 
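+
+        Falls back to returning ``state_dict`` unchanged when the wrapped
+        optimizer does not implement ``broadcast_global_state_dict``.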
+ """ + if hasattr(self.optimizer, "broadcast_global_state_dict"): + return self.optimizer.broadcast_global_state_dict(state_dict) + else: + return state_dict + class LegacyFairseqOptimizer(FairseqOptimizer): def __init__(self, args): diff --git a/fairseq/optim/fp16_optimizer.py b/fairseq/optim/fp16_optimizer.py index b08a7237a9..6a4da342ca 100644 --- a/fairseq/optim/fp16_optimizer.py +++ b/fairseq/optim/fp16_optimizer.py @@ -7,9 +7,10 @@ from itertools import chain import torch -from fairseq import optim from omegaconf import DictConfig +from fairseq import optim + from .dynamic_loss_scaler import DynamicLossScaler @@ -64,7 +65,15 @@ def build_fp32_params(cls, args, params, flatten=True): fp32_params = [] for p in params: p32 = torch.nn.Parameter(p.data.float()) + if hasattr(p, "expert"): + p32.expert = True + elif hasattr(p, "base_expert"): + p32.base_expert = True p32.grad = torch.zeros_like(p32.data) + if hasattr(p, "param_group"): + p32.param_group = p.param_group + if hasattr(p, "optim_overrides"): + p32.optim_overrides = p.optim_overrides fp32_params.append(p32) return fp32_params @@ -127,7 +136,10 @@ def _sync_fp16_grads_to_fp32(self): if not p.requires_grad: continue if p.grad is not None: - p32.grad.data.copy_(p.grad.data) + if p32.grad is None: + p32.grad = p.grad.data.float() + else: + p32.grad.data.copy_(p.grad.data) else: p32.grad = torch.zeros_like(p.data, dtype=torch.float) @@ -159,7 +171,16 @@ def _sync_fp32_params_to_fp16(self): def _unscale_grads(self): self._sync_fp16_grads_to_fp32() - if self._multiply_factor != 1.0: + if ( + # Skip the multiplication if it's a no-op (i.e., if _multiply_factor + # is 1.0). At the same time, we want to avoid the device-to-host + # transfer by comparing it to 1.0. Since _multiply_factor starts as + # a Python float, we roughly assume that if it's a tensor then it's + # probably not =1.0 anymore and we do the multiplication. Otherwise + # we can safely check the value without a D2H transfer. 
+ torch.is_tensor(self._multiply_factor) + or self._multiply_factor != 1.0 + ): self.fp32_optimizer.multiply_grads(self._multiply_factor) self._multiply_factor = 1.0 @@ -175,6 +196,9 @@ def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): 0, aggregate_norm_fn ) + if torch.is_tensor(self._multiply_factor): + self._multiply_factor = self._multiply_factor.to(grad_norm.device) + if self.scaler is not None: if grad_norm > max_norm > 0.0: self._multiply_factor *= max_norm / grad_norm @@ -186,15 +210,17 @@ def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): return grad_norm - def step(self, closure=None): + def step(self, closure=None, groups=None): """Performs a single optimization step.""" self._sync_fp16_grads_to_fp32() if getattr(self, "supports_step_with_scale", False): - self.fp32_optimizer.step(closure, scale=(1.0 / self._multiply_factor)) + self.fp32_optimizer.step( + closure, scale=(1.0 / self._multiply_factor), groups=groups + ) else: self._unscale_grads() - self.fp32_optimizer.step(closure) + self.fp32_optimizer.step(closure, groups=groups) if self.scaler is not None: self.scaler.update() @@ -215,7 +241,8 @@ def zero_grad(self): raise RuntimeError("self.fp32_params must be a tensor or dict") else: for p32 in self.fp32_params: - p32.grad.zero_() + if p32.grad is not None: + p32.grad.zero_() self._needs_sync = False if self.scaler is not None: @@ -244,7 +271,7 @@ def __init__(self, cfg: DictConfig, params, fp32_optimizer, fp32_params, **kwarg / cfg.common.model_parallel_size ) scale_window = int( - 2 ** 14 / data_parallel_size / cfg.optimization.update_freq[0] + 2**14 / data_parallel_size / cfg.optimization.update_freq[0] ) else: scale_window = cfg.common.fp16_scale_window @@ -290,6 +317,10 @@ def optimizer(self): def optimizer(self, optimizer): self.fp32_optimizer.optimizer = optimizer + @property + def lr_scheduler(self): + return getattr(self.fp32_optimizer, "lr_scheduler", None) + @property def optimizer_config(self): return self.fp32_optimizer.optimizer_config @@ -300,6 +331,13 @@ def get_lr(self): def set_lr(self, lr): self.fp32_optimizer.set_lr(lr) + def all_reduce_grads(self, module): + self.fp32_optimizer.all_reduce_grads(module) + + @property + def supports_flat_params(self): + return self.fp32_optimizer.supports_flat_params + class _MemoryEfficientFP16OptimizerMixin(object): def __init__(self, *args, **kwargs): @@ -363,7 +401,16 @@ def backward(self, loss): loss.backward() def _unscale_grads(self): - if self._multiply_factor != 1.0: + if ( + # Skip the multiplication if it's a no-op (i.e., if _multiply_factor + # is 1.0). At the same time, we want to avoid the device-to-host + # transfer by comparing it to 1.0. Since _multiply_factor starts as + # a Python float, we roughly assume that if it's a tensor then it's + # probably not =1.0 anymore and we do the multiplication. Otherwise + # we can safely check the value without a D2H transfer. 
+ torch.is_tensor(self._multiply_factor) + or self._multiply_factor != 1.0 + ): self.wrapped_optimizer.multiply_grads(self._multiply_factor) self._multiply_factor = 1.0 @@ -391,14 +438,16 @@ def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): return grad_norm - def step(self, closure=None): + def step(self, closure=None, groups=None): """Performs a single optimization step.""" if getattr(self, "supports_step_with_scale", False): # NOTE(msb) optimizer divides by scale factor - self.wrapped_optimizer.step(closure, scale=(1.0 / self._multiply_factor)) + self.wrapped_optimizer.step( + closure, scale=(1.0 / self._multiply_factor), groups=groups + ) else: self._unscale_grads() - self.wrapped_optimizer.step(closure) + self.wrapped_optimizer.step(closure, groups=groups) if self.scaler is not None: self.scaler.update() @@ -411,6 +460,10 @@ def zero_grad(self): else: self._multiply_factor = 1.0 + @property + def supports_flat_params(self): + return self.wrapped_optimizer.supports_flat_params + class MemoryEfficientFP16Optimizer( _MemoryEfficientFP16OptimizerMixin, optim.FairseqOptimizer @@ -430,13 +483,15 @@ class MemoryEfficientFP16Optimizer( *supports_memory_efficient_fp16* property. """ - def __init__(self, cfg: DictConfig, params, optimizer, **kwargs): - if not optimizer.supports_memory_efficient_fp16: + def __init__( + self, cfg: DictConfig, params, optimizer, allow_unsupported=False, **kwargs + ): + if not allow_unsupported and not optimizer.supports_memory_efficient_fp16: raise ValueError( "Unsupported optimizer: {}".format(optimizer.__class__.__name__) ) - super().__init__(cfg.optimizer) + super().__init__(getattr(cfg, "optimizer", None)) self.wrapped_optimizer = optimizer if getattr(cfg.common, "fp16_scale_window", None) is None: @@ -449,8 +504,8 @@ def __init__(self, cfg: DictConfig, params, optimizer, **kwargs): cfg.distributed_training.distributed_world_size / cfg.common.model_parallel_size ) - scale_window = ( - 2 ** 14 / data_parallel_size / cfg.optimization.update_freq[0] + scale_window = int( + 2**14 / data_parallel_size / cfg.optimization.update_freq[0] ) else: scale_window = cfg.common.fp16_scale_window @@ -489,8 +544,15 @@ def optimizer(self, optimizer): def optimizer_config(self): return self.wrapped_optimizer.optimizer_config + @property + def lr_scheduler(self): + return getattr(self.wrapped_optimizer, "lr_scheduler", None) + def get_lr(self): return self.wrapped_optimizer.get_lr() def set_lr(self, lr): self.wrapped_optimizer.set_lr(lr) + + def all_reduce_grads(self, module): + self.wrapped_optimizer.all_reduce_grads(module) diff --git a/fairseq/optim/fused_adam.py b/fairseq/optim/fused_adam.py index 1780f9c0bb..39a2a83694 100644 --- a/fairseq/optim/fused_adam.py +++ b/fairseq/optim/fused_adam.py @@ -27,8 +27,8 @@ def get_fused_adam_class(): except ImportError: try: # fallback to the newer interface - from apex.optimizers import FusedAdam as _FusedAdam # noqa from apex.multi_tensor_apply import multi_tensor_applier + from apex.optimizers import FusedAdam as _FusedAdam # noqa if multi_tensor_applier.available: return FusedAdamV2 @@ -47,7 +47,7 @@ class FusedAdamV1(torch.optim.Optimizer): Compared to the original version in Apex, the fairseq version casts grads and params to FP32 internally to support ``--memory-efficient-fp16``. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups. lr (float, optional): learning rate. 
(default: 1e-3) @@ -80,6 +80,7 @@ def __init__( weight_decay=0.0, max_grad_norm=0.0, amsgrad=False, + use_fp16_stats=False, ): global fused_adam_cuda import importlib @@ -99,6 +100,9 @@ def __init__( super().__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 + self.use_fp16_stats = use_fp16_stats + self.FLOAT16_MAX = 65504.0 + @property def supports_memory_efficient_fp16(self): return True @@ -113,7 +117,7 @@ def supports_step_with_scale(self): def step(self, closure=None, grads=None, scale=1.0, grad_norms=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. grads (list of tensors, optional): weight gradient to use for the @@ -173,29 +177,45 @@ def step(self, closure=None, grads=None, scale=1.0, grad_norms=None): "please consider SparseAdam instead" ) - p_data_fp32 = p.data.float() + if p.device.type == "cpu": + p_data_fp32 = p.data.cuda(non_blocking=True).float() + out_p = torch.tensor([], dtype=torch.float) + else: + p_data_fp32 = p.data.float() + out_p = p.data state = self.state[p] # State initialization + dtype = torch.float16 if self.use_fp16_stats else p_data_fp32.dtype if len(state) == 0: state["step"] = 0 # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg"] = torch.zeros_like(p_data_fp32, dtype=dtype) # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32, dtype=dtype) + if self.use_fp16_stats: + state["exp_avg_scale"] = 1.0 + state["exp_avg_sq_scale"] = 1.0 else: - state["exp_avg"] = state["exp_avg"].to(p_data_fp32) - state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) + device = p_data_fp32.device + state["exp_avg"] = state["exp_avg"].to(device, dtype) + state["exp_avg_sq"] = state["exp_avg_sq"].to(device, dtype) exp_avg = state["exp_avg"] exp_avg_sq = state["exp_avg_sq"] + if self.use_fp16_stats: + assert exp_avg.dtype == torch.float16 + exp_avg = exp_avg.float() * state["exp_avg_scale"] + exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"] beta1, beta2 = group["betas"] + if "step" not in state: + state["step"] = group["step"] + state["step"] += 1 - out_p = p.data - with torch.cuda.device(p.device): + with torch.cuda.device(p_data_fp32.device): fused_adam_cuda.adam( p_data_fp32, out_p, @@ -213,12 +233,30 @@ def step(self, closure=None, grads=None, scale=1.0, grad_norms=None): group["weight_decay"], ) + if p.device.type == "cpu": + p.data.copy_(p_data_fp32, non_blocking=True) + + if self.use_fp16_stats: + + def inf_norm(t): + return torch.norm(t, float("inf")) + + # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py + state["exp_avg_scale"], state["exp_avg_sq_scale"] = ( + 1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX, + 1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX, + ) + state["exp_avg"], state["exp_avg_sq"] = ( + (exp_avg / state["exp_avg_scale"]).half(), + (exp_avg_sq / state["exp_avg_sq_scale"]).half(), + ) + return loss try: - from apex.optimizers import FusedAdam from apex.multi_tensor_apply import multi_tensor_applier + from apex.optimizers import FusedAdam class FusedAdamV2(FusedAdam): """ @@ -226,7 +264,11 @@ class FusedAdamV2(FusedAdam): and params to FP32 internally to support ``--memory-efficient-fp16``. 
""" - def __init__(self, *args, **kwargs): + def __init__(self, *args, use_fp16_stats=False, **kwargs): + if use_fp16_stats: + raise NotImplementedError( + "--fp16-adam-stats is only supported with FusedAdamV1" + ) super().__init__(*args, **kwargs) if not hasattr(self, "multi_tensor_adam"): raise Exception( @@ -343,6 +385,5 @@ def step( return loss - except ImportError: pass diff --git a/fairseq/optim/lr_scheduler/__init__.py b/fairseq/optim/lr_scheduler/__init__.py index f07d43c7c3..5b3dbc023a 100644 --- a/fairseq/optim/lr_scheduler/__init__.py +++ b/fairseq/optim/lr_scheduler/__init__.py @@ -30,7 +30,7 @@ def build_lr_scheduler(cfg: DictConfig, optimizer): # automatically import any Python files in the optim/lr_scheduler/ directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): file_name = file[: file.find(".py")] importlib.import_module("fairseq.optim.lr_scheduler." + file_name) diff --git a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py index c3c6663ece..5fcaea25d4 100644 --- a/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py @@ -4,18 +4,18 @@ # LICENSE file in the root directory of this source tree. import math -from collections import Collection +from collections.abc import Collection from dataclasses import dataclass, field from typing import List -from fairseq.dataclass import FairseqDataclass -from omegaconf import II, DictConfig +from omegaconf import II -from . import FairseqLRScheduler, register_lr_scheduler +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler @dataclass -class CosineConfig(FairseqDataclass): +class CosineLRScheduleConfig(FairseqDataclass): warmup_updates: int = field( default=0, metadata={"help": "warmup the learning rate linearly for the first N updates"}, @@ -23,12 +23,14 @@ class CosineConfig(FairseqDataclass): warmup_init_lr: float = field( default=-1, metadata={ - "help": "initial learning rate during warmup phase; default is args.lr" + "help": "initial learning rate during warmup phase; default is cfg.lr" }, ) - max_lr: float = field( - default=1.0, metadata={"help": "max learning rate, must be more than args.lr"} + lr: List[float] = field( + default=II("optimization.lr"), + metadata={"help": "max learning rate, must be more than cfg.min_lr"}, ) + min_lr: float = field(default=0.0, metadata={"help": "min learning rate"}) t_mult: float = field( default=1.0, metadata={"help": "factor to grow the length of each period"} ) @@ -38,72 +40,62 @@ class CosineConfig(FairseqDataclass): lr_shrink: float = field( default=0.1, metadata={"help": "shrink factor for annealing"} ) - # TODO common var for parent class - lr: List[float] = II("optimization.lr") + # This is not required, but is for convenience in inferring lr_period_updates max_update: int = II("optimization.max_update") -@register_lr_scheduler("cosine", dataclass=CosineConfig) -class CosineSchedule(FairseqLRScheduler): +@register_lr_scheduler("cosine", dataclass=CosineLRScheduleConfig) +class CosineLRSchedule(FairseqLRScheduler): """Assign LR based on a cyclical schedule that follows the cosine function. See https://arxiv.org/pdf/1608.03983.pdf for details. 
We also support a warmup phase where we linearly increase the learning rate from some initial learning rate (``--warmup-init-lr``) until the configured - max learning rate (``--max-lr``). + max learning rate (``--lr``). During warmup:: - lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) + lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates) lr = lrs[update_num] After warmup:: - lr = lr_min + 0.5*(lr_max - lr_min)*(1 + cos(t_curr / t_i)) + lr = cfg.min_lr + 0.5*(cfg.lr - cfg.min_lr)*(1 + cos(t_curr / t_i)) where ``t_curr`` is current percentage of updates within the current period range and ``t_i`` is the current period range, which is scaled by ``t_mul`` after every iteration. """ - def __init__( - self, cfg: DictConfig, fairseq_optimizer - ): + def __init__(self, cfg: CosineLRScheduleConfig, fairseq_optimizer): super().__init__(cfg, fairseq_optimizer) if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1: raise ValueError( "Cannot use a fixed learning rate schedule with cosine." - " Consider --lr-scheduler=fixed instead." + f" Consider --lr-scheduler=fixed instead. ({cfg.lr})" ) - warmup_end_lr = cfg.max_lr - lr = ( - cfg.lr[0] - if isinstance(cfg.lr, Collection) - else cfg.lr - ) - if cfg.warmup_init_lr < 0: - cfg.warmup_init_lr = lr + self.max_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr + if self.max_lr < cfg.min_lr: + cfg.min_lr = self.max_lr - self.min_lr = lr - self.max_lr = cfg.max_lr - assert self.max_lr > self.min_lr, "max_lr must be more than lr" + warmup_end_lr = self.max_lr + if cfg.warmup_init_lr < 0: + cfg.warmup_init_lr = cfg.min_lr self.t_mult = cfg.t_mult self.period = cfg.lr_period_updates if self.period <= 0: assert ( - cfg.max_update >= 0 + cfg.max_update > 0 ), "Either --max_update or --lr-period-updates must be set" self.period = cfg.max_update - cfg.warmup_updates if cfg.warmup_updates > 0: - # linearly warmup for the first args.warmup_updates - self.lr_step = ( - warmup_end_lr - cfg.warmup_init_lr - ) / cfg.warmup_updates + # linearly warmup for the first cfg.warmup_updates + self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates else: self.lr_step = 1 @@ -132,18 +124,18 @@ def step_update(self, num_updates): 1 - curr_updates / self.period * (1 - self.t_mult), self.t_mult ) ) - t_i = self.t_mult ** i * self.period + t_i = self.t_mult**i * self.period t_curr = ( curr_updates - - (1 - self.t_mult ** i) / (1 - self.t_mult) * self.period + - (1 - self.t_mult**i) / (1 - self.t_mult) * self.period ) else: i = math.floor(curr_updates / self.period) t_i = self.period t_curr = curr_updates - (self.period * i) - lr_shrink = self.lr_shrink ** i - min_lr = self.min_lr * lr_shrink + lr_shrink = self.lr_shrink**i + min_lr = self.cfg.min_lr * lr_shrink max_lr = self.max_lr * lr_shrink self.lr = min_lr + 0.5 * (max_lr - min_lr) * ( diff --git a/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py index 569e448262..6c12fa56b8 100644 --- a/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py @@ -6,14 +6,13 @@ from argparse import Namespace from fairseq.dataclass.utils import gen_parser_from_dataclass - -from .. 
import FairseqOptimizer +from fairseq.optim import FairseqOptimizer class FairseqLRScheduler(object): def __init__(self, cfg, optimizer): super().__init__() - if not isinstance(optimizer, FairseqOptimizer): + if optimizer is not None and not isinstance(optimizer, FairseqOptimizer): raise ValueError("optimizer must be an instance of FairseqOptimizer") self.cfg = cfg self.optimizer = optimizer @@ -34,6 +33,10 @@ def load_state_dict(self, state_dict): """Load an LR scheduler state dict.""" self.best = state_dict["best"] + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" + pass + def step(self, epoch, val_loss=None): """Update the learning rate at the end of the given epoch.""" if val_loss is not None: diff --git a/fairseq/optim/lr_scheduler/fixed_schedule.py b/fairseq/optim/lr_scheduler/fixed_schedule.py index 7ca7826ed2..d0e7e14b7e 100644 --- a/fairseq/optim/lr_scheduler/fixed_schedule.py +++ b/fairseq/optim/lr_scheduler/fixed_schedule.py @@ -3,37 +3,44 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import LegacyFairseqLRScheduler, register_lr_scheduler +from dataclasses import dataclass, field +from typing import Optional, List +from omegaconf import II +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler -@register_lr_scheduler("fixed") -class FixedSchedule(LegacyFairseqLRScheduler): - """Decay the LR on a fixed schedule.""" - def __init__(self, args, optimizer): - super().__init__(args, optimizer) +@dataclass +class FixedLRScheduleConfig(FairseqDataclass): + force_anneal: Optional[int] = field( + default=None, + metadata={"help": "force annealing at specified epoch"}, + ) + lr_shrink: float = field( + default=0.1, + metadata={"help": "shrink factor for annealing, lr_new = (lr * lr_shrink)"}, + ) + warmup_updates: int = field( + default=0, + metadata={"help": "warmup the learning rate linearly for the first N updates"}, + ) + lr: List[float] = II("optimization.lr") + - # set defaults - args.warmup_updates = getattr(args, "warmup_updates", 0) or 0 +@register_lr_scheduler("fixed", dataclass=FixedLRScheduleConfig) +class FixedLRSchedule(FairseqLRScheduler): + """Decay the LR on a fixed schedule.""" - self.lr = args.lr[0] - if args.warmup_updates > 0: - self.warmup_factor = 1.0 / args.warmup_updates + def __init__(self, cfg: FixedLRScheduleConfig, optimizer): + super().__init__(cfg, optimizer) + + self.lr = cfg.lr[0] + if cfg.warmup_updates > 0: + self.warmup_factor = 1.0 / cfg.warmup_updates else: self.warmup_factor = 1 - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument('--force-anneal', '--fa', type=int, metavar='N', - help='force annealing at specified epoch') - parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', - help='shrink factor for annealing, lr_new = (lr * lr_shrink)') - parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', - help='warmup the learning rate linearly for the first N updates') - # fmt: on - def state_dict(self): return {"lr": self.lr} @@ -42,28 +49,27 @@ def load_state_dict(self, state_dict): self.lr = state_dict["lr"] def get_next_lr(self, epoch): - lrs = self.args.lr - if self.args.force_anneal is None or epoch < self.args.force_anneal: + lrs = self.cfg.lr + if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal: # 
use fixed LR schedule - next_lr = lrs[min(epoch, len(lrs) - 1)] + next_lr = lrs[min(epoch - 1, len(lrs) - 1)] else: # annneal based on lr_shrink - next_lr = lrs[-1] * self.args.lr_shrink ** ( - epoch + 1 - self.args.force_anneal + next_lr = lrs[-1] * self.cfg.lr_shrink ** ( + epoch + 1 - self.cfg.force_anneal ) return next_lr - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - super().step(epoch, val_loss) + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" self.lr = self.get_next_lr(epoch) self.optimizer.set_lr(self.warmup_factor * self.lr) return self.optimizer.get_lr() def step_update(self, num_updates): """Update the learning rate after each update.""" - if self.args.warmup_updates > 0 and num_updates < self.args.warmup_updates: - self.warmup_factor = (num_updates + 1) / float(self.args.warmup_updates) + if self.cfg.warmup_updates > 0 and num_updates < self.cfg.warmup_updates: + self.warmup_factor = (num_updates + 1) / float(self.cfg.warmup_updates) self.optimizer.set_lr(self.warmup_factor * self.lr) else: self.optimizer.set_lr(self.lr) diff --git a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py index c42e090677..987c905a23 100644 --- a/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py +++ b/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py @@ -3,18 +3,18 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from collections import Collection +from collections.abc import Collection from dataclasses import dataclass, field from typing import List -from fairseq.dataclass import FairseqDataclass -from omegaconf import II, DictConfig +from omegaconf import II -from . import FairseqLRScheduler, register_lr_scheduler +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler @dataclass -class InverseSquareRootScheduleConfig(FairseqDataclass): +class InverseSquareRootLRScheduleConfig(FairseqDataclass): warmup_updates: int = field( default=4000, metadata={"help": "warmup the learning rate linearly for the first N updates"}, @@ -22,14 +22,13 @@ class InverseSquareRootScheduleConfig(FairseqDataclass): warmup_init_lr: float = field( default=-1, metadata={ - "help": "initial learning rate during warmup phase; default is args.lr" + "help": "initial learning rate during warmup phase; default is cfg.lr" }, ) - # TODO common vars at parent class lr: List[float] = II("optimization.lr") -@register_lr_scheduler("inverse_sqrt", dataclass=InverseSquareRootScheduleConfig) +@register_lr_scheduler("inverse_sqrt", dataclass=InverseSquareRootLRScheduleConfig) class InverseSquareRootSchedule(FairseqLRScheduler): """Decay the LR based on the inverse square root of the update number. 
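For the fixed schedule rewritten earlier in this hunk: epochs are 1-based now that the LR is set via step_begin_epoch, so get_next_lr indexes cfg.lr with epoch - 1 and, once force_anneal is reached, shrinks the last configured rate every epoch. A small numeric sketch of that lookup (the values are arbitrary examples):

def fixed_next_lr(epoch, lrs, force_anneal=None, lr_shrink=0.1):
    # mirrors FixedLRSchedule.get_next_lr; epochs are 1-based
    if force_anneal is None or epoch < force_anneal:
        return lrs[min(epoch - 1, len(lrs) - 1)]
    # once force_anneal is reached, anneal the last configured LR
    return lrs[-1] * lr_shrink ** (epoch + 1 - force_anneal)

lrs = [0.1, 0.05, 0.01]
[fixed_next_lr(e, lrs, force_anneal=5) for e in range(1, 8)]
# roughly [0.1, 0.05, 0.01, 0.01, 1e-3, 1e-4, 1e-5]
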
@@ -40,39 +39,31 @@ class InverseSquareRootSchedule(FairseqLRScheduler): During warmup:: - lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) + lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates) lr = lrs[update_num] After warmup:: - decay_factor = args.lr * sqrt(args.warmup_updates) + decay_factor = cfg.lr * sqrt(cfg.warmup_updates) lr = decay_factor / sqrt(update_num) """ - def __init__(self, cfg: DictConfig, optimizer): + def __init__(self, cfg: InverseSquareRootLRScheduleConfig, optimizer): super().__init__(cfg, optimizer) if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1: raise ValueError( "Cannot use a fixed learning rate schedule with inverse_sqrt." " Consider --lr-scheduler=fixed instead." ) - warmup_end_lr = ( - cfg.lr[0] - if isinstance(cfg.lr, Collection) - else cfg.lr - ) + warmup_end_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr if cfg.warmup_init_lr < 0: - cfg.warmup_init_lr = ( - 0 if cfg.warmup_updates > 0 else warmup_end_lr - ) + cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr - # linearly warmup for the first args.warmup_updates - self.lr_step = ( - warmup_end_lr - cfg.warmup_init_lr - ) / cfg.warmup_updates + # linearly warmup for the first cfg.warmup_updates + self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates # then, decay prop. to the inverse square root of the update number - self.decay_factor = warmup_end_lr * cfg.warmup_updates ** 0.5 + self.decay_factor = warmup_end_lr * cfg.warmup_updates**0.5 # initial learning rate self.lr = cfg.warmup_init_lr @@ -89,6 +80,6 @@ def step_update(self, num_updates): if num_updates < self.cfg.warmup_updates: self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step else: - self.lr = self.decay_factor * num_updates ** -0.5 + self.lr = self.decay_factor * num_updates**-0.5 self.optimizer.set_lr(self.lr) return self.lr diff --git a/fairseq/optim/lr_scheduler/manual_lr_scheduler.py b/fairseq/optim/lr_scheduler/manual_lr_scheduler.py new file mode 100644 index 0000000000..57edc256fd --- /dev/null +++ b/fairseq/optim/lr_scheduler/manual_lr_scheduler.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import LegacyFairseqLRScheduler, register_lr_scheduler +import logging +import ast + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + + +@register_lr_scheduler("manual") +class ManualSchedule(LegacyFairseqLRScheduler): + """Decay the LR on a manual schedule.""" + + def __init__(self, args, optimizer): + super().__init__(args, optimizer) + + self.epoch2lr = self.parse_manuallr_args(args.epoch2lr) + self.update2lr = self.parse_manuallr_args(args.update2lr) + logger.info("@@@ ManualSchedule epoch2lr={}".format(self.epoch2lr)) + logger.info("@@@ ManualSchedule update2lr={}".format(self.update2lr)) + + if 1 in self.epoch2lr: + self.lr = self.epoch2lr[1] + elif 1 in self.update2lr: + self.lr = self.update2lr[1] + else: + self.lr = args.lr[0] + self.optimizer.set_lr(self.lr) # Set the beginning of the epoch. 
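The new ManualSchedule takes --epoch2lr and --update2lr as dictionary-valued strings; parse_manuallr_args (below) expands comma lists and start-end ranges in the keys, and the lookup then picks the entry with the largest key not exceeding the current epoch or update. A sketch of the intended argument format, using a simplified re-implementation rather than the fairseq class itself:

import ast

def parse_manual_lr(spec: str):
    # simplified mirror of ManualSchedule.parse_manuallr_args
    out = {}
    for key, val in ast.literal_eval(spec.replace(" ", "")).items():
        if "," in key:                      # "1,2": same LR for the listed epochs
            for k in key.split(","):
                out[int(k)] = float(val)
        elif "-" in key:                    # "3-5": inclusive range of epochs
            lo, hi = (int(x) for x in key.split("-"))
            for k in range(lo, hi + 1):
                out[k] = float(val)
        else:
            out[int(key)] = float(val)
    return out

epoch2lr = parse_manual_lr("{'1,2': 5e-4, '3-5': 2e-4, '6': 1e-4}")
# epoch 4 uses the largest key <= 4, i.e. 2e-4
lr = epoch2lr[max(k for k in epoch2lr if k <= 4)]
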
+ + def parse_manuallr_args(self, lr_args_str): + lr_dict = ast.literal_eval(lr_args_str.replace(" ", "")) + if not isinstance(lr_dict, dict): + raise ValueError("epoch2lr/update2lr must be abel to evaluated to a dict") + + lr_args = {} + logger.info("@@@ after parsing input dictionary lr_dict = {}".format(lr_dict)) + for key, val in lr_dict.items(): + if "," in key: + for k in key.split(","): + lr_args[int(k)] = float(val) + elif "-" in key: + s = int(key.split("-")[0]) + e = int(key.split("-")[1]) + for k in range(s, e + 1, 1): + lr_args[k] = float(val) + else: + lr_args[int(key)] = float(val) + + return lr_args + + @staticmethod + def add_args(parser): + """Add arguments to the parser for this LR scheduler.""" + # fmt: off + parser.add_argument( + "--epoch2lr", + type=str, + metavar="DICT", + default="{}", + help="a dictionary used to set lr for each epoch manually", + ) + parser.add_argument( + "--update2lr", + type=str, + metavar="DICT", + default="{}", + help="a dictionary used to set lr for each update manually", + ) + # fmt: on + + def state_dict(self): + return {"lr": self.lr} + + def load_state_dict(self, state_dict): + if "lr" in state_dict: + self.lr = state_dict["lr"] + + def get_next_lr(self, epoch): + manual_keys = [k for k in self.epoch2lr if k <= epoch] + if manual_keys: + manual_lr = self.epoch2lr[max(manual_keys)] + else: + logger.warning( + "@@@ epoch={} does not exist in manual lr input. epoch2lr={}...".format( + epoch, + list(self.epoch2lr.items())[ + : min(10, len(self.epoch2lr.keys()) - 1) + ], + ) + ) + manual_lr = self.optimizer.get_lr() + return manual_lr + + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" + self.lr = self.get_next_lr(epoch) + self.optimizer.set_lr(self.lr) + return self.optimizer.get_lr() + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + manual_keys = [k for k in self.update2lr if k <= num_updates] + if manual_keys: + manual_lr = self.update2lr[max(manual_keys)] + else: + logger.warning( + "epoch={} does not exist in manual lr input update2lr={}...".format( + num_updates, + list(self.update2lr.items())[ + : min(10, len(self.update2lr.keys()) - 1) + ], + ) + ) + manual_lr = self.optimizer.get_lr() + + self.optimizer.set_lr(manual_lr) + return self.optimizer.get_lr() diff --git a/fairseq/optim/lr_scheduler/pass_through.py b/fairseq/optim/lr_scheduler/pass_through.py new file mode 100644 index 0000000000..2f93db328c --- /dev/null +++ b/fairseq/optim/lr_scheduler/pass_through.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
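Several schedules touched by this patch (fixed, manual, polynomial_decay, pass_through) implement the new step_begin_epoch hook added to FairseqLRScheduler, so the LR for an epoch is set before its first batch rather than only at the end of the previous epoch. A hedged sketch of how a training loop is expected to drive the three hooks; the real driver lives in fairseq's trainer, and the names below are placeholders:

def run_epoch(scheduler, optimizer_step, batches, epoch, num_updates):
    # illustrative only: `scheduler` is any FairseqLRScheduler instance
    scheduler.step_begin_epoch(epoch)        # new hook: LR at the start of the epoch
    for batch in batches:
        optimizer_step(batch)
        num_updates += 1
        scheduler.step_update(num_updates)   # per-update LR (warmup, decay, ...)
    val_loss = None                          # would come from validation
    scheduler.step(epoch, val_loss)          # end-of-epoch hook (e.g. reduce_lr_on_plateau)
    return num_updates
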
+ +from dataclasses import dataclass + +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler + + +@dataclass +class PassThroughScheduleConfig(FairseqDataclass): + pass + + +@register_lr_scheduler("pass_through", dataclass=PassThroughScheduleConfig) +class PassThroughScheduleSchedule(FairseqLRScheduler): + """Delegate lr scheduling to the optimizer.""" + + def __init__(self, cfg: PassThroughScheduleConfig, optimizer): + super().__init__(cfg, optimizer) + assert ( + hasattr(optimizer, "lr_scheduler") and optimizer.lr_scheduler is not None + ), "Pass-through schedule can only be used with optimizers with their own schedulers" + + def state_dict(self): + return self.optimizer.lr_scheduler.state_dict() + + def load_state_dict(self, state_dict): + self.optimizer.lr_scheduler.load_state_dict(state_dict) + + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" + return self.optimizer.lr_scheduler.step_begin_epoch(epoch) + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + return self.optimizer.lr_scheduler.step_update(num_updates) diff --git a/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py index ea8e647668..b8109a7c1e 100644 --- a/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py +++ b/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py @@ -3,53 +3,61 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import LegacyFairseqLRScheduler, register_lr_scheduler +from dataclasses import dataclass, field +from typing import Optional, List +from omegaconf import II +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler -@register_lr_scheduler("polynomial_decay") -class PolynomialDecaySchedule(LegacyFairseqLRScheduler): + +@dataclass +class PolynomialDecayLRScheduleConfig(FairseqDataclass): + warmup_updates: int = field( + default=0, + metadata={"help": "warmup the learning rate linearly for the first N updates"}, + ) + force_anneal: Optional[int] = field( + default=None, + metadata={"help": "force annealing at specified epoch"}, + ) + end_learning_rate: float = field( + default=0.0, + metadata={"help": "learning rate to decay to"}, + ) + power: float = field( + default=1.0, + metadata={"help": "decay exponent"}, + ) + total_num_update: float = field( + default=II("optimization.max_update"), + metadata={"help": "total number of updates over which to decay learning rate"}, + ) + lr: List[float] = II("optimization.lr") + + +@register_lr_scheduler("polynomial_decay", dataclass=PolynomialDecayLRScheduleConfig) +class PolynomialDecayLRSchedule(FairseqLRScheduler): """Decay the LR on a fixed schedule.""" - def __init__(self, args, optimizer): - super().__init__(args, optimizer) + def __init__(self, cfg: PolynomialDecayLRScheduleConfig, optimizer): + super().__init__(cfg, optimizer) - # set defaults - args.warmup_updates = getattr(args, "warmup_updates", 0) or 0 + assert cfg.total_num_update > 0 - self.lr = args.lr[0] - if args.warmup_updates > 0: - self.warmup_factor = 1.0 / args.warmup_updates + self.lr = cfg.lr[0] + if cfg.warmup_updates > 0: + self.warmup_factor = 1.0 / cfg.warmup_updates else: self.warmup_factor = 1 - self.end_learning_rate = args.end_learning_rate - self.total_num_update = args.total_num_update 
- self.power = args.power + self.end_learning_rate = cfg.end_learning_rate + self.total_num_update = cfg.total_num_update + self.power = cfg.power self.optimizer.set_lr(self.warmup_factor * self.lr) - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - parser.add_argument( - "--force-anneal", - "--fa", - type=int, - metavar="N", - help="force annealing at specified epoch", - ) - parser.add_argument( - "--warmup-updates", - default=0, - type=int, - metavar="N", - help="warmup the learning rate linearly for the first N updates", - ) - parser.add_argument("--end-learning-rate", default=0.0, type=float) - parser.add_argument("--power", default=1.0, type=float) - parser.add_argument("--total-num-update", default=1000000, type=int) - def get_next_lr(self, epoch): - lrs = self.args.lr - if self.args.force_anneal is None or epoch < self.args.force_anneal: + lrs = self.cfg.lr + if self.cfg.force_anneal is None or epoch < self.cfg.force_anneal: # use fixed LR schedule next_lr = lrs[min(epoch, len(lrs) - 1)] else: @@ -57,22 +65,21 @@ def get_next_lr(self, epoch): next_lr = self.optimizer.get_lr() return next_lr - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - super().step(epoch, val_loss) + def step_begin_epoch(self, epoch): + """Update the learning rate at the beginning of the given epoch.""" self.lr = self.get_next_lr(epoch) self.optimizer.set_lr(self.warmup_factor * self.lr) return self.optimizer.get_lr() def step_update(self, num_updates): """Update the learning rate after each update.""" - if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates: - self.warmup_factor = num_updates / float(self.args.warmup_updates) + if self.cfg.warmup_updates > 0 and num_updates <= self.cfg.warmup_updates: + self.warmup_factor = num_updates / float(self.cfg.warmup_updates) lr = self.warmup_factor * self.lr elif num_updates >= self.total_num_update: lr = self.end_learning_rate else: - warmup = self.args.warmup_updates + warmup = self.cfg.warmup_updates lr_range = self.lr - self.end_learning_rate pct_remaining = 1 - (num_updates - warmup) / ( self.total_num_update - warmup diff --git a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py index 82bb36efe9..5ee9c1be4a 100644 --- a/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py +++ b/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py @@ -3,13 +3,59 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass, field +from typing import List + import torch.optim.lr_scheduler +from omegaconf import II + +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler + -from . 
import LegacyFairseqLRScheduler, register_lr_scheduler +@dataclass +class ReduceLROnPlateauLRScheduleConfig(FairseqDataclass): + lr_shrink: float = field( + default=0.1, metadata={"help": "shrink factor for annealing"} + ) + lr_threshold: float = field( + default=1e-4, + metadata={ + "help": ( + "threshold for measuring the new optimum, to only focus on " + "significant changes" + ) + }, + ) + lr_patience: int = field( + default=0, + metadata={ + "help": ( + "number of epochs with no improvement after which learning rate will " + "be reduced" + ) + }, + ) + warmup_updates: int = field( + default=0, + metadata={"help": "warmup the learning rate linearly for the first N updates"}, + ) + warmup_init_lr: float = field( + default=-1, + metadata={ + "help": "initial learning rate during warmup phase; default is cfg.lr" + }, + ) + lr: List[float] = II("optimization.lr") + maximize_best_checkpoint_metric: bool = II( + "checkpoint.maximize_best_checkpoint_metric" + ) -@register_lr_scheduler("reduce_lr_on_plateau") -class ReduceLROnPlateau(LegacyFairseqLRScheduler): +@register_lr_scheduler( + "reduce_lr_on_plateau", dataclass=ReduceLROnPlateauLRScheduleConfig +) +class ReduceLROnPlateauLRSchedule(FairseqLRScheduler): """ Decay the LR by a factor every time the validation loss plateaus. Also comes with optional warmup phase, where we linearly increase @@ -21,61 +67,43 @@ class ReduceLROnPlateau(LegacyFairseqLRScheduler): During warmup:: lrs = torch.linspace( - args.warmup_init_lr, args.lr, args.warmup_updates + cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates ) lr = lrs[update_num] """ - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: + def __init__(self, cfg: ReduceLROnPlateauLRScheduleConfig, optimizer): + super().__init__(cfg, optimizer) + if len(cfg.lr) > 1: raise ValueError( "Cannot use a fixed learning rate schedule with reduce_lr_on_plateau." " Consider --lr-scheduler=fixed instead." 
) self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer.optimizer, - patience=args.lr_patience, - factor=args.lr_shrink, - mode="max" if args.maximize_best_checkpoint_metric else "min", - threshold=args.lr_threshold, + patience=cfg.lr_patience, + factor=cfg.lr_shrink, + mode="max" if cfg.maximize_best_checkpoint_metric else "min", + threshold=cfg.lr_threshold, ) - warmup_end_lr = args.lr[0] - # if no warm up, sets initial lr to be args.lr[0] - if args.warmup_init_lr < 0: - args.warmup_init_lr = 0 if args.warmup_updates > 0 else warmup_end_lr + warmup_end_lr = cfg.lr[0] + # if no warm up, sets initial lr to be cfg.lr[0] + if cfg.warmup_init_lr < 0: + cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr - # linearly warmup for the first args.warmup_updates - if args.warmup_updates > 0: - self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates + # linearly warmup for the first cfg.warmup_updates + if cfg.warmup_updates > 0: + self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates # this flag is either set from arg when no warm up, or set by # step_update() when warmup finishes - self.warmup_end = True if args.warmup_updates <= 0 else False + self.warmup_end = True if cfg.warmup_updates <= 0 else False # initial learning rate # this self.lr is used only during init and/or warm up period - self.lr = args.warmup_init_lr + self.lr = warmup_end_lr if self.warmup_end else cfg.warmup_init_lr self.optimizer.set_lr(self.lr) - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', - help='shrink factor for annealing, lr_new = (lr * lr_shrink)') - parser.add_argument('--lr-threshold', default=1e-4, type=float, metavar='LT', - help='threshold for measuring the new optimum, ' - 'to only focus on significant changes') - parser.add_argument('--lr-patience', default=0, type=int, - help='number of epochs with no improvement after which ' - 'learning rate will be reduced') - parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', - help='warmup the learning rate linearly for the first N updates') - parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR', - help='initial learning rate during warmup phase; default is args.lr') - # fmt: on - def state_dict(self): """Return the LR scheduler state dict.""" return { @@ -104,9 +132,9 @@ def step_update(self, num_updates): """ Update the learning rate after each update.""" # if there is warmup - if self.args.warmup_updates > 0: - if num_updates <= self.args.warmup_updates: - self.lr = self.args.warmup_init_lr + num_updates * self.lr_step + if self.cfg.warmup_updates > 0: + if num_updates <= self.cfg.warmup_updates: + self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step self.optimizer.set_lr(self.lr) else: if self.warmup_end is False: diff --git a/fairseq/optim/lr_scheduler/step_lr_scheduler.py b/fairseq/optim/lr_scheduler/step_lr_scheduler.py new file mode 100644 index 0000000000..db99d4eee8 --- /dev/null +++ b/fairseq/optim/lr_scheduler/step_lr_scheduler.py @@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
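As rewritten above, ReduceLROnPlateauLRSchedule wraps torch's ReduceLROnPlateau behind an optional linear warmup: before cfg.warmup_updates the LR is interpolated from warmup_init_lr to cfg.lr[0], and only afterwards does the plateau logic (driven by the validation loss passed to step()) take over. A standalone sketch of the warmup half, assuming warmup_init_lr is left at its default of -1 so it becomes 0 when warmup is enabled:

def plateau_warmup_lr(num_updates, warmup_updates, warmup_init_lr, end_lr):
    # mirrors the warmup branch of ReduceLROnPlateauLRSchedule
    if warmup_init_lr < 0:
        warmup_init_lr = 0.0 if warmup_updates > 0 else end_lr
    lr_step = (end_lr - warmup_init_lr) / warmup_updates
    return warmup_init_lr + num_updates * lr_step

# with end_lr=1e-3 and 4000 warmup updates, the LR ramps linearly:
[plateau_warmup_lr(t, 4000, -1, 1e-3) for t in (0, 1000, 4000)]
# roughly [0.0, 2.5e-4, 1e-3]; beyond 4000 updates, torch's ReduceLROnPlateau decides
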
+ +from collections.abc import Collection +from dataclasses import dataclass, field +from typing import List + +from omegaconf import II + +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler + + +@dataclass +class StepLRScheduleConfig(FairseqDataclass): + warmup_updates: int = field( + default=0, + metadata={"help": "warmup the learning rate linearly for the first N updates"}, + ) + warmup_init_lr: float = field( + default=-1, + metadata={ + "help": "initial learning rate during warmup phase; default is cfg.lr" + }, + ) + lr: List[float] = field( + default=II("optimization.lr"), + metadata={"help": "max learning rate, must be more than cfg.min_lr"}, + ) + min_lr: float = field(default=0.0, metadata={"help": "min learning rate"}) + lr_deacy_period: int = field(default=25000, metadata={"help": "decay period"}) + lr_decay: float = field(default=0.5, metadata={"help": "decay factor"}) + + +@register_lr_scheduler("step", dataclass=StepLRScheduleConfig) +class StepLRSchedule(FairseqLRScheduler): + """Decay learning rate every k updates by a fixed factor""" + + def __init__(self, cfg: StepLRScheduleConfig, fairseq_optimizer): + super().__init__(cfg, fairseq_optimizer) + self.max_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr + self.min_lr = cfg.min_lr + self.lr_deacy_period = cfg.lr_deacy_period + self.lr_decay = cfg.lr_decay + self.warmup_updates = cfg.warmup_updates + self.warmup_init_lr = ( + cfg.warmup_init_lr if cfg.warmup_init_lr >= 0 else self.min_lr + ) + + assert self.lr_deacy_period > 0 + assert self.lr_decay <= 1 + assert self.min_lr >= 0 + assert self.max_lr > self.min_lr + + if cfg.warmup_updates > 0: + # linearly warmup for the first cfg.warmup_updates + self.warmup_lr_step = ( + self.max_lr - self.warmup_init_lr + ) / self.warmup_updates + else: + self.warmup_lr_step = 1 + + # initial learning rate + self.lr = self.warmup_init_lr + self.optimizer.set_lr(self.lr) + + def step(self, epoch, val_loss=None): + """Update the learning rate at the end of the given epoch.""" + super().step(epoch, val_loss) + # we don't change the learning rate at epoch boundaries + return self.optimizer.get_lr() + + def step_update(self, num_updates): + """Update the learning rate after each update.""" + if num_updates < self.cfg.warmup_updates: + self.lr = self.warmup_init_lr + num_updates * self.warmup_lr_step + else: + curr_updates = num_updates - self.cfg.warmup_updates + lr_mult = self.lr_decay ** (curr_updates // self.lr_deacy_period) + self.lr = max(self.max_lr * lr_mult, self.min_lr) + + self.optimizer.set_lr(self.lr) + return self.lr diff --git a/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py index c573237f11..4d5547c39b 100644 --- a/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py @@ -4,12 +4,51 @@ # LICENSE file in the root directory of this source tree. import math - -from . 
import LegacyFairseqLRScheduler, register_lr_scheduler - - -@register_lr_scheduler("tri_stage") -class TriStageLRSchedule(LegacyFairseqLRScheduler): +from dataclasses import dataclass, field +from typing import Optional, List, Tuple +from omegaconf import II + +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler + + +@dataclass +class TriStageLRScheduleConfig(FairseqDataclass): + warmup_steps: int = field( + default=0, + metadata={"help": "warmup the learning rate linearly for the first N updates"}, + ) + hold_steps: int = field( + default=0, + metadata={"help": "steps in hold stage"}, + ) + decay_steps: int = field( + default=0, + metadata={"help": "steps in decay stages"}, + ) + phase_ratio: Optional[Tuple[float, float, float]] = field( + default=None, + metadata={ + "help": ( + "if set, automatically sets warmup/hold/decay steps to the ratio " + "specified here from max_updates. the ratios must add up to 1.0" + ) + }, + ) + init_lr_scale: float = field( + default=0.01, + metadata={"help": "initial learning rate scale during warmup phase"}, + ) + final_lr_scale: float = field( + default=0.01, + metadata={"help": "final learning rate scale"}, + ) + max_update: float = II("optimization.max_update") + lr: List[float] = II("optimization.lr") + + +@register_lr_scheduler("tri_stage", dataclass=TriStageLRScheduleConfig) +class TriStageLRSchedule(FairseqLRScheduler): """Tristage learning rate schedulr Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf @@ -29,92 +68,63 @@ class TriStageLRSchedule(LegacyFairseqLRScheduler): During warmup:: - init_lr = args.init_lr_scale * args.lr - lrs = torch.linspace(init_lr, args.lr, args.warmup_steps) + init_lr = cfg.init_lr_scale * cfg.lr + lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps) lr = lrs[update_num] During hold:: - lr = args.lr + lr = cfg.lr During decay:: - decay_factor = - math.log(args.final_lr_scale) / args.decay_steps - lr = args.lr * exp(- (update_num - warmup_steps - decay_steps) * decay_factor) + decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps + lr = cfg.lr * exp(- (update_num - warmup_steps - decay_steps) * decay_factor) After that:: - lr = args.lr * args.final_lr_scale + lr = cfg.lr * cfg.final_lr_scale """ - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: + def __init__(self, cfg: TriStageLRScheduleConfig, optimizer): + super().__init__(cfg, optimizer) + if len(cfg.lr) > 1: raise ValueError( "Cannot use a fixed learning rate schedule with tri-stage lr." " Consider --lr-scheduler=fixed instead." 
) # calculate LR at each point - self.peak_lr = args.lr[0] - self.init_lr = args.init_lr_scale * args.lr[0] - self.final_lr = args.final_lr_scale * args.lr[0] + self.peak_lr = cfg.lr[0] + self.init_lr = cfg.init_lr_scale * cfg.lr[0] + self.final_lr = cfg.final_lr_scale * cfg.lr[0] + + if cfg.phase_ratio is not None: + assert cfg.max_update > 0 + assert sum(cfg.phase_ratio) == 1, "phase ratios must add up to 1" + self.warmup_steps = int(cfg.max_update * cfg.phase_ratio[0]) + self.hold_steps = int(cfg.max_update * cfg.phase_ratio[1]) + self.decay_steps = int(cfg.max_update * cfg.phase_ratio[2]) + else: + self.warmup_steps = cfg.warmup_steps + self.hold_steps = cfg.hold_steps + self.decay_steps = cfg.decay_steps - # remember the steps at each stage - self.warmup_steps = args.warmup_steps - self.hold_steps = args.hold_steps - self.decay_steps = args.decay_steps + assert ( + self.warmup_steps + self.hold_steps + self.decay_steps > 0 + ), "please specify steps or phase_ratio" self.warmup_rate = ( (self.peak_lr - self.init_lr) / self.warmup_steps if self.warmup_steps != 0 else 0 ) - self.decay_factor = -math.log(args.final_lr_scale) / args.decay_steps + self.decay_factor = -math.log(cfg.final_lr_scale) / self.decay_steps # initial learning rate self.lr = self.init_lr self.optimizer.set_lr(self.lr) - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument( - '--warmup-steps', - default=4000, - type=int, - metavar='N', - help='warmup the learning rate linearly for the first N updates' - ) - parser.add_argument( - '--hold-steps', - default=20000, - type=int, - metavar='N', - help='steps in hold stage.' - ) - parser.add_argument( - '--decay-steps', - default=60000, - type=int, - metavar='N', - help='steps in decay stages' - ) - parser.add_argument( - '--init-lr-scale', - default=0.01, - type=float, - help=""" - initial learning rate scale during warmup phase; default is 0.01""") - parser.add_argument( - '--final-lr-scale', - default=0.01, - type=float, - help="final learning rate scale; default to 0.01" - ) - # fmt: on - def _decide_stage(self, update_step): """ return stage, and the corresponding steps within the current stage diff --git a/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py index 0f3193f2b8..2a32bd10f2 100644 --- a/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py +++ b/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py @@ -4,52 +4,61 @@ # LICENSE file in the root directory of this source tree. import math +from dataclasses import dataclass, field +from typing import List -from . 
import LegacyFairseqLRScheduler, register_lr_scheduler +from omegaconf import II +from fairseq.dataclass import FairseqDataclass +from fairseq.optim.lr_scheduler import FairseqLRScheduler, register_lr_scheduler -@register_lr_scheduler("triangular") -class TriangularSchedule(LegacyFairseqLRScheduler): + +@dataclass +class TriangularLRScheduleConfig(FairseqDataclass): + max_lr: float = field( + default="???", metadata={"help": "max learning rate, must be more than cfg.lr"} + ) + lr_period_updates: float = field( + default=5000, + metadata={"help": "initial number of updates per period (cycle length)"}, + ) + lr_shrink: float = field( + default=0.1, metadata={"help": "shrink factor for annealing"} + ) + shrink_min: bool = field( + default=False, metadata={"help": "if set, also shrinks min lr"} + ) + lr: List[float] = II("optimization.lr") + + +@register_lr_scheduler("triangular", dataclass=TriangularLRScheduleConfig) +class TriangularLRSchedule(FairseqLRScheduler): """Assign LR based on a triangular cyclical schedule. See https://arxiv.org/pdf/1506.01186.pdf for details. """ - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: + def __init__(self, cfg: TriangularLRScheduleConfig, optimizer): + super().__init__(cfg, optimizer) + if len(cfg.lr) > 1: raise ValueError( "Cannot use a fixed learning rate schedule with triangular." " Consider --lr-scheduler=fixed instead." ) - lr = args.lr[0] + lr = cfg.lr[0] - assert args.max_lr > lr, "max_lr must be more than lr" + assert cfg.max_lr > lr, "max_lr must be more than lr" self.min_lr = lr - self.max_lr = args.max_lr - self.stepsize = args.lr_period_updates // 2 - self.lr_shrink = args.lr_shrink - self.shrink_min = args.shrink_min + self.max_lr = cfg.max_lr + self.stepsize = cfg.lr_period_updates // 2 + self.lr_shrink = cfg.lr_shrink + self.shrink_min = cfg.shrink_min # initial learning rate self.lr = self.min_lr self.optimizer.set_lr(self.lr) - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument('--max-lr', required=True, type=float, metavar='LR', - help='max learning rate, must be more than args.lr') - parser.add_argument('--lr-period-updates', default=5000, type=float, metavar='LR', - help='initial number of updates per period (cycle length)') - parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', - help='shrink factor for annealing') - parser.add_argument('--shrink-min', action='store_true', - help='if set, also shrinks min lr') - # fmt: on - def step(self, epoch, val_loss=None): """Update the learning rate at the end of the given epoch.""" super().step(epoch, val_loss) @@ -60,7 +69,7 @@ def step_update(self, num_updates): """Update the learning rate after each update.""" cycle = math.floor(num_updates / (2 * self.stepsize)) - lr_shrink = self.lr_shrink ** cycle + lr_shrink = self.lr_shrink**cycle max_lr = self.max_lr * lr_shrink if self.shrink_min: min_lr = self.min_lr * lr_shrink diff --git a/fairseq/optim/nag.py b/fairseq/optim/nag.py index 3982a8271d..c30a6c0fb1 100644 --- a/fairseq/optim/nag.py +++ b/fairseq/optim/nag.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
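The triangular schedule above cycles the LR between min_lr and max_lr over lr_period_updates, optionally shrinking the envelope by lr_shrink each cycle; only the envelope computation is visible in the hunk. A sketch of the full cyclical rule as described in the cited paper (Smith, 2015); treat the exact wave formula as inferred from that paper rather than copied from the patch:

import math

def triangular_lr(t, min_lr, max_lr, period_updates, lr_shrink=0.1, shrink_min=False):
    stepsize = period_updates // 2
    cycle = math.floor(t / (2 * stepsize))
    shrink = lr_shrink ** cycle                  # shrink the envelope each full cycle
    hi = max_lr * shrink
    lo = min_lr * shrink if shrink_min else min_lr
    # triangular wave: rises for `stepsize` updates, then falls for `stepsize`
    x = abs(t / stepsize - 2 * cycle - 1)
    return lo + (hi - lo) * max(0.0, 1.0 - x)

# starts at min_lr, peaks at max_lr mid-cycle, returns to min_lr at the cycle boundary
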
-from collections import Collection +from collections.abc import Collection from dataclasses import dataclass, field from typing import List @@ -62,7 +62,7 @@ def supports_flat_params(self): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ @@ -75,7 +75,7 @@ def step(self, closure=None): momentum = group["momentum"] lr = group["lr"] lr_old = group.get("lr_old", lr) - lr_correct = lr / lr_old + lr_correct = lr / lr_old if lr_old > 0 else lr for p in group["params"]: if p.grad is None: diff --git a/fairseq/optim/shard.py b/fairseq/optim/shard.py index ecef05b442..9d7f2eb9e5 100644 --- a/fairseq/optim/shard.py +++ b/fairseq/optim/shard.py @@ -3,6 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from typing import Any, Dict + +from fairseq.distributed import utils + try: from fairscale.optim import OSS @@ -30,6 +34,19 @@ def __getattr__(self, name): "'FairseqOSS' object has no attribute {0!r}".format(name) ) + def broadcast_global_state_dict( + self, state_dict: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Broadcasts the entire state_dict to all other ranks + each rank is responsible to load their own partition of data + """ + return utils.broadcast_object( + state_dict, + src_rank=0, + group=self.group, + ) + torch_optimizer = optimizer.optimizer optim_cls = type(torch_optimizer) diff --git a/fairseq/options.py b/fairseq/options.py index 58e5e46190..920591635a 100644 --- a/fairseq/options.py +++ b/fairseq/options.py @@ -4,12 +4,13 @@ # LICENSE file in the root directory of this source tree. import argparse -from typing import Callable, List, Optional +from pathlib import Path +from typing import Callable, List, Optional, Union import torch from fairseq import utils from fairseq.data.indexed_dataset import get_available_dataset_impl -from fairseq.dataclass.data_class import ( +from fairseq.dataclass.configs import ( CheckpointConfig, CommonConfig, CommonEvalConfig, @@ -19,6 +20,7 @@ GenerationConfig, InteractiveConfig, OptimizationConfig, + EMAConfig, ) from fairseq.dataclass.utils import gen_parser_from_dataclass @@ -39,6 +41,7 @@ def get_training_parser(default_task="translation"): add_model_args(parser) add_optimization_args(parser) add_checkpoint_args(parser) + add_ema_args(parser) return parser @@ -53,6 +56,14 @@ def get_generation_parser(interactive=False, default_task="translation"): return parser +def get_speech_generation_parser(default_task="text_to_speech"): + parser = get_parser("Speech Generation", default_task) + add_dataset_args(parser, gen=True) + add_distributed_training_args(parser, default_world_size=1) + add_speech_generation_args(parser) + return parser + + def get_interactive_generation_parser(default_task="translation"): return get_generation_parser(interactive=True, default_task=default_task) @@ -197,6 +208,13 @@ def parse_args_and_arch( else: args.no_seed_provided = False + if getattr(args, "update_epoch_batch_itr", None) is None: + if hasattr(args, "grouped_shuffling"): + args.update_epoch_batch_itr = args.grouped_shuffling + else: + args.grouped_shuffling = False + args.update_epoch_batch_itr = False + # Apply architecture configuration. 
if hasattr(args, "arch") and args.arch in ARCH_CONFIG_REGISTRY: ARCH_CONFIG_REGISTRY[args.arch](args) @@ -249,11 +267,13 @@ def add_preprocess_args(parser): group.add_argument("-t", "--target-lang", default=None, metavar="TARGET", help="target language") group.add_argument("--trainpref", metavar="FP", default=None, - help="train file prefix") + help="train file prefix (also used to build dictionaries)") group.add_argument("--validpref", metavar="FP", default=None, - help="comma separated, valid file prefixes") + help="comma separated, valid file prefixes " + "(words missing from train set are replaced with <unk>)") group.add_argument("--testpref", metavar="FP", default=None, - help="comma separated, test file prefixes") + help="comma separated, test file prefixes " + "(words missing from train set are replaced with <unk>)") group.add_argument("--align-suffix", metavar="FP", default=None, help="alignment file suffix") group.add_argument("--destdir", metavar="DIR", default="data-bin", @@ -283,6 +303,8 @@ def add_preprocess_args(parser): help="Pad dictionary size to be multiple of N") group.add_argument("--workers", metavar="N", default=1, type=int, help="number of parallel workers") + group.add_argument("--dict-only", action='store_true', + help="if true, only builds a dictionary and then exits") # fmt: on return parser @@ -337,6 +359,16 @@ def add_generation_args(parser): return group +def add_speech_generation_args(parser): + group = parser.add_argument_group("Speech Generation") + add_common_eval_args(group) # NOTE: remove_bpe is not needed + # fmt: off + group.add_argument('--eos_prob_threshold', default=0.5, type=float, + help='terminate when eos probability exceeds this') + # fmt: on + return group + + def add_interactive_args(parser): group = parser.add_argument_group("Interactive") gen_parser_from_dataclass(group, InteractiveConfig()) @@ -359,3 +391,23 @@ def add_model_args(parser): help='model architecture') # fmt: on return group + + +def get_args( + data: Union[str, Path], + task: str = "translation", + arch: str = "transformer", + **overrides +): + parser = get_training_parser(task) + args = parse_args_and_arch(parser, [str(data), "--task", task, "--arch", arch]) + + for k, v in overrides.items(): + setattr(args, k, v) + + return args + + +def add_ema_args(parser): + group = parser.add_argument_group("EMA configuration") + gen_parser_from_dataclass(group, EMAConfig()) diff --git a/fairseq/registry.py b/fairseq/registry.py index 4446084d4a..904ffcd602 100644 --- a/fairseq/registry.py +++ b/fairseq/registry.py @@ -7,7 +7,8 @@ from typing import Union from fairseq.dataclass import FairseqDataclass -from fairseq.dataclass.utils import populate_dataclass +from fairseq.dataclass.utils import merge_with_parent +from hydra.core.config_store import ConfigStore from omegaconf import DictConfig REGISTRIES = {} @@ -24,21 +25,32 @@ def setup_registry(registry_name: str, base_class=None, default=None, required=F # maintain a registry of all registries if registry_name in REGISTRIES: return # registry already exists - REGISTRIES[registry_name] = {"registry": REGISTRY, "default": default, "dataclass_registry": DATACLASS_REGISTRY} + REGISTRIES[registry_name] = { + "registry": REGISTRY, + "default": default, + "dataclass_registry": DATACLASS_REGISTRY, + } def build_x(cfg: Union[DictConfig, str, Namespace], *extra_args, **extra_kwargs): if isinstance(cfg, DictConfig): choice = cfg._name + + if choice and choice in DATACLASS_REGISTRY: + from_checkpoint = extra_kwargs.get("from_checkpoint", False) + 
dc = DATACLASS_REGISTRY[choice] + cfg = merge_with_parent(dc(), cfg, remove_missing=from_checkpoint) elif isinstance(cfg, str): choice = cfg + if choice in DATACLASS_REGISTRY: + cfg = DATACLASS_REGISTRY[choice]() else: choice = getattr(cfg, registry_name, None) if choice in DATACLASS_REGISTRY: - cfg = populate_dataclass(cfg, DATACLASS_REGISTRY[choice]()) + cfg = DATACLASS_REGISTRY[choice].from_namespace(cfg) if choice is None: if required: - raise ValueError('{} is required!'.format(registry_name)) + raise ValueError("{} is required!".format(registry_name)) return None cls = REGISTRY[choice] @@ -47,6 +59,9 @@ def build_x(cfg: Union[DictConfig, str, Namespace], *extra_args, **extra_kwargs) else: builder = cls + if "from_checkpoint" in extra_kwargs: + del extra_kwargs["from_checkpoint"] + return builder(cfg, *extra_args, **extra_kwargs) def register_x(name, dataclass=None): @@ -72,9 +87,16 @@ def register_x_cls(cls): ) cls.__dataclass = dataclass - REGISTRY[name] = cls if cls.__dataclass is not None: DATACLASS_REGISTRY[name] = cls.__dataclass + + cs = ConfigStore.instance() + node = dataclass() + node._name = name + cs.store(name=name, group=registry_name, node=node, provider="fairseq") + + REGISTRY[name] = cls + return cls return register_x_cls diff --git a/fairseq/scoring/__init__.py b/fairseq/scoring/__init__.py index 8c706cb585..58f2f563e4 100644 --- a/fairseq/scoring/__init__.py +++ b/fairseq/scoring/__init__.py @@ -18,10 +18,6 @@ def __init__(self, cfg): self.ref = [] self.pred = [] - @staticmethod - def add_args(parser): - pass - def add_string(self, ref, pred): self.ref.append(ref) self.pred.append(pred) @@ -41,10 +37,9 @@ def result_string(self) -> str: def build_scorer(choice, tgt_dict): - if isinstance(choice, DictConfig): - choice = choice._name + _choice = choice._name if isinstance(choice, DictConfig) else choice - if choice == "bleu": + if _choice == "bleu": from fairseq.scoring import bleu return bleu.Scorer( @@ -54,7 +49,7 @@ def build_scorer(choice, tgt_dict): # automatically import any Python files in the current directory -for file in os.listdir(os.path.dirname(__file__)): +for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): module = file[: file.find(".py")] importlib.import_module("fairseq.scoring." + module) diff --git a/fairseq/scoring/bertscore.py b/fairseq/scoring/bertscore.py new file mode 100644 index 0000000000..6d5a8450d3 --- /dev/null +++ b/fairseq/scoring/bertscore.py @@ -0,0 +1,44 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
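With the registry changes above, register_x now pushes every registered dataclass into Hydra's ConfigStore under the registry's group, and build_x merges an incoming DictConfig onto those dataclass defaults (via merge_with_parent) before instantiating the class. A minimal sketch of that pattern outside fairseq, using only Hydra/OmegaConf APIs; the group, node, and config names are invented for illustration:

from dataclasses import dataclass, field

from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf


@dataclass
class MyScorerConfig:  # hypothetical dataclass, standing in for a registered config
    threshold: float = field(default=0.5)


# roughly what the register_x decorator above now does with each dataclass:
cs = ConfigStore.instance()
cs.store(name="my_scorer", group="scoring", node=MyScorerConfig(), provider="example")

# roughly what build_x does when handed a partial DictConfig for this choice:
defaults = OmegaConf.structured(MyScorerConfig())
overrides = OmegaConf.create({"threshold": 0.9})
cfg = OmegaConf.merge(defaults, overrides)   # close in spirit to merge_with_parent
assert cfg.threshold == 0.9
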
+ +from dataclasses import dataclass, field + +import numpy as np + +from fairseq.dataclass import FairseqDataclass +from fairseq.scoring import BaseScorer, register_scorer + + +@dataclass +class BertScoreScorerConfig(FairseqDataclass): + bert_score_lang: str = field(default="en", metadata={"help": "BERTScore language"}) + + +@register_scorer("bert_score", dataclass=BertScoreScorerConfig) +class BertScoreScorer(BaseScorer): + def __init__(self, cfg): + super(BertScoreScorer, self).__init__(cfg) + try: + import bert_score as _bert_score + except ImportError: + raise ImportError("Please install BERTScore: pip install bert-score") + + self.cfg = cfg + self._bert_score = _bert_score + self.scores = None + + def add_string(self, ref, pred): + self.ref.append(ref) + self.pred.append(pred) + + def score(self, order=4): + _, _, self.scores = self._bert_score.score( + self.pred, self.ref, lang=self.cfg.bert_score_lang + ) + self.scores = self.scores.numpy() + return np.mean(self.scores) + + def result_string(self, order=4): + return f"BERTScore: {self.score():.4f}" diff --git a/fairseq/scoring/bleu.py b/fairseq/scoring/bleu.py index 97de5f966e..e55bd2f393 100644 --- a/fairseq/scoring/bleu.py +++ b/fairseq/scoring/bleu.py @@ -59,16 +59,17 @@ def add_string(self, ref, pred): self.ref.append(self.tokenizer.tokenize(ref)) self.pred.append(self.tokenizer.tokenize(pred)) - def score(self, order=4): - return self.result_string(order).score - - def result_string(self, order=4): + def _score(self, order=4): if order != 4: raise NotImplementedError # tokenization and lowercasing are performed by self.tokenizer instead. - return self.sacrebleu.corpus_bleu( - self.pred, [self.ref], tokenize="none" - ).format() + return self.sacrebleu.corpus_bleu(self.pred, [self.ref], tokenize="none") + + def score(self, order=4): + return self._score(order).score + + def result_string(self, order=4): + return self._score(order).format() @dataclass diff --git a/fairseq/scoring/chrf.py b/fairseq/scoring/chrf.py index 0d6cb77383..5df5a1c011 100644 --- a/fairseq/scoring/chrf.py +++ b/fairseq/scoring/chrf.py @@ -3,10 +3,19 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass + +from fairseq.dataclass import FairseqDataclass from fairseq.scoring import BaseScorer, register_scorer -@register_scorer("chrf") +@dataclass +class ChrFScorerConfig(FairseqDataclass): + pass + + +@register_scorer("chrf", dataclass=ChrFScorerConfig) class ChrFScorer(BaseScorer): def __init__(self, args): super(ChrFScorer, self).__init__(args) diff --git a/fairseq/scoring/meteor.py b/fairseq/scoring/meteor.py new file mode 100644 index 0000000000..32719956fe --- /dev/null +++ b/fairseq/scoring/meteor.py @@ -0,0 +1,42 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
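The new bert_score and meteor scorers below both follow the same BaseScorer protocol: add_string() accumulates (ref, pred) pairs and score()/result_string() reduce them. A hedged sketch of a custom scorer in the same shape; the scorer name, config class, and metric here are invented for illustration and are not part of the patch:

from dataclasses import dataclass

import numpy as np

from fairseq.dataclass import FairseqDataclass
from fairseq.scoring import BaseScorer, register_scorer


@dataclass
class ExactMatchScorerConfig(FairseqDataclass):  # hypothetical, mirrors ChrFScorerConfig
    pass


@register_scorer("exact_match", dataclass=ExactMatchScorerConfig)  # invented name
class ExactMatchScorer(BaseScorer):
    """Fraction of hypotheses that exactly match their reference."""

    def add_string(self, ref, pred):
        self.ref.append(ref)
        self.pred.append(pred)

    def score(self, order=4):
        return float(np.mean([r == p for r, p in zip(self.ref, self.pred)]))

    def result_string(self, order=4):
        return f"ExactMatch: {self.score():.4f}"
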
+ +import numpy as np +from dataclasses import dataclass + +from fairseq.dataclass import FairseqDataclass +from fairseq.scoring import BaseScorer, register_scorer + + +@dataclass +class MeteorScorerConfig(FairseqDataclass): + pass + + +@register_scorer("meteor", dataclass=MeteorScorerConfig) +class MeteorScorer(BaseScorer): + def __init__(self, args): + super(MeteorScorer, self).__init__(args) + try: + import nltk + except ImportError: + raise ImportError("Please install nltk to use METEOR scorer") + + self.nltk = nltk + self.scores = [] + + def add_string(self, ref, pred): + self.ref.append(ref) + self.pred.append(pred) + + def score(self, order=4): + self.scores = [ + self.nltk.translate.meteor_score.single_meteor_score(r, p) + for r, p in zip(self.ref, self.pred) + ] + return np.mean(self.scores) + + def result_string(self, order=4): + return f"METEOR: {self.score():.4f}" diff --git a/fairseq/scoring/tokenizer.py b/fairseq/scoring/tokenizer.py index 0d0702bf15..b0cedd5099 100644 --- a/fairseq/scoring/tokenizer.py +++ b/fairseq/scoring/tokenizer.py @@ -5,7 +5,11 @@ import unicodedata -from fairseq.dataclass.utils import ChoiceEnum +import sacrebleu as sb + +from fairseq.dataclass import ChoiceEnum + +SACREBLEU_V2_ABOVE = int(sb.__version__[0]) >= 2 class EvaluationTokenizer(object): @@ -24,7 +28,12 @@ class EvaluationTokenizer(object): SPACE = chr(32) SPACE_ESCAPE = chr(9601) - ALL_TOKENIZER_TYPES = ChoiceEnum(["none", "13a", "intl", "zh", "ja-mecab"]) + _ALL_TOKENIZER_TYPES = ( + sb.BLEU.TOKENIZERS + if SACREBLEU_V2_ABOVE + else ["none", "13a", "intl", "zh", "ja-mecab"] + ) + ALL_TOKENIZER_TYPES = ChoiceEnum(_ALL_TOKENIZER_TYPES) def __init__( self, @@ -33,13 +42,17 @@ def __init__( punctuation_removal: bool = False, character_tokenization: bool = False, ): - from sacrebleu.tokenizers import TOKENIZERS - assert tokenizer_type in TOKENIZERS, f"{tokenizer_type}, {TOKENIZERS}" + assert ( + tokenizer_type in self._ALL_TOKENIZER_TYPES + ), f"{tokenizer_type}, {self._ALL_TOKENIZER_TYPES}" self.lowercase = lowercase self.punctuation_removal = punctuation_removal self.character_tokenization = character_tokenization - self.tokenizer = TOKENIZERS[tokenizer_type] + if SACREBLEU_V2_ABOVE: + self.tokenizer = sb.BLEU(tokenize=str(tokenizer_type)).tokenizer + else: + self.tokenizer = sb.tokenizers.TOKENIZERS[tokenizer_type]() @classmethod def remove_punctuation(cls, sent: str): @@ -51,7 +64,7 @@ def remove_punctuation(cls, sent: str): ) def tokenize(self, sent: str): - tokenized = self.tokenizer()(sent) + tokenized = self.tokenizer(sent) if self.punctuation_removal: tokenized = self.remove_punctuation(tokenized) diff --git a/fairseq/search.py b/fairseq/search.py index d5ea68b4ce..c7378bbb51 100644 --- a/fairseq/search.py +++ b/fairseq/search.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import math + from typing import List, Optional import torch @@ -113,6 +114,7 @@ def step( scores: Optional[Tensor], prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, + candidate_multiple: int = 2, ): bsz, beam_size, vocab_size = lprobs.size() @@ -128,16 +130,16 @@ def step( top_prediction = torch.topk( lprobs.view(bsz, -1), k=min( - # Take the best 2 x beam_size predictions. We'll choose the first + # Take the best `candidate_muliple`(default 2) x beam_size predictions. We'll choose the first # beam_size of these which don't predict eos to continue with. 
- beam_size * 2, + candidate_multiple * beam_size, lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad ), ) scores_buf = top_prediction[0] indices_buf = top_prediction[1] # Project back into relative indices and beams - beams_buf = indices_buf // vocab_size + beams_buf = torch.div(indices_buf, vocab_size, rounding_mode="trunc") indices_buf = indices_buf.fmod(vocab_size) # At this point, beams_buf and indices_buf are single-dim and contain relative indices @@ -554,15 +556,57 @@ class DiverseBeamSearch(Search): See "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models" for details. - We only implement the Hamming Diversity penalty here, which performed best - in the original paper. + We implement cumulative diversity penalty here as default, optionally provide Hamming diversity described + in the original paper, and a way to interpolate between the two through diversity_discount. + + Take the example below for illustration of cumulative diversity implemented. + A) I like dogs. + B) I like ____. + C) There are ___. + And we are at step=2, trying to fill in the blank: + + Hamming diversity: + Penalty for B from A is 1 for "dogs" and 0 for any other words like "cats". + Penalty for C from A is 1 for "dogs" and 0 for any other words like "cats". + + Cumulative diversity (default): + Penalty for B from A is 3 for "dogs" and 0 for any other words like "cats". + Penalty for C from A is 1 for "dogs" and 0 for any other words like "cats". + B and C differ because B matches with A for "I" and "like" at respective steps incurring 2 cumulative penalty. + + Using divesrity_discount to interpolate between the two: + if diverstiy_discount = 0.5, then + Penalty for B from A is 1.75 (1 + 0.5 + 0.25) for "dogs" and 0 for any other words like "cats". + Penalty for C from A is 1 for "dogs" and 0 for any other words like "cats". + "I" and "like" matched for B and A at step 0 and 1 respectively. Since "I" is two steps away and "like" is one step away, they are discounted by (0.5)^2 and 0.5 respectively. + When diversity_discount = 0, we recover Hammning diversity and when diversity_discount = 1, we recover cumulative diversity. + + NB: During beam search for each diversity group, `candidate_mutiple` is set to 1 rather than BeamSearch default(2). + This is to ensure we have final `beam_size` candidates so that no diversity groups would be dropped during final token selection in sequence generation. + For full backwards compatibility, use diversity_discount=0 and candidate_multiple=2. + """ - def __init__(self, tgt_dict, num_groups, diversity_strength): + def __init__( + self, + tgt_dict, + num_groups, + diversity_strength, + diversity_discount=1.0, + candidate_multiple=1, + ): super().__init__(tgt_dict) self.num_groups = num_groups self.diversity_strength = -diversity_strength self.beam = BeamSearch(tgt_dict) + self.diversity_discount = diversity_discount + self.candidate_multiple = candidate_multiple + + # Float tensor to keep track of overlap between groups. + # Each token shared at the same step between two groups is counted as one. + # Then token counts are discounted by `diversity_discount` for every next timestep. + # Once initialized, dimension is batch_size * num_groups * num_groups. 
+ self.group_overlap = torch.empty(0) @torch.jit.export def step( @@ -582,13 +626,38 @@ def step( # initialize diversity penalty diversity_buf = torch.zeros(lprobs[:, 0, :].size()).to(lprobs) - scores_G, indices_G, beams_G = [], [], [] + scores_G, beams_G = [], [] + + # pre-allocating tensor for indices for all groups + indices_G_stacked = torch.empty( + bsz, + int(beam_size / self.num_groups) * self.candidate_multiple, + self.num_groups, + dtype=torch.long, + device=lprobs.device, + ) + for g in range(self.num_groups): lprobs_g = lprobs[:, g :: self.num_groups, :] scores_g = scores[:, g :: self.num_groups, :] if step > 0 else None + diversity_buf.zero_() # apply diversity penalty if g > 0: + indices_ = indices_G_stacked[:, :, :g] + if step > 0: + penalty_val = 1 + self.group_overlap[original_batch_idxs, g, :g] + penalty_val = penalty_val.unsqueeze(1) + else: + penalty_val = torch.ones(bsz, 1, 1) + diversity_buf.scatter_add_( + 1, + indices_.reshape(bsz, -1), + penalty_val.expand(indices_.size()) + .reshape(bsz, -1) + .to(diversity_buf), + ) + lprobs_g = torch.add( lprobs_g, other=diversity_buf.unsqueeze(1), @@ -598,23 +667,32 @@ def step( lprobs_g = lprobs_g.contiguous() scores_buf, indices_buf, beams_buf = self.beam.step( - step, lprobs_g, scores_g + step, lprobs_g, scores_g, candidate_multiple=self.candidate_multiple ) beams_buf.mul_(self.num_groups).add_(g) scores_G.append(scores_buf.clone()) - indices_G.append(indices_buf.clone()) beams_G.append(beams_buf.clone()) - # update diversity penalty - diversity_buf.scatter_add_( - 1, indices_buf, torch.ones(indices_buf.size()).to(diversity_buf) - ) + indices_G_stacked[:, :, g] = indices_buf # interleave results from different groups scores_buf = torch.stack(scores_G, dim=2).view(bsz, -1) - indices_buf = torch.stack(indices_G, dim=2).view(bsz, -1) + indices_buf = indices_G_stacked.view(bsz, -1) beams_buf = torch.stack(beams_G, dim=2).view(bsz, -1) + # find num of overlapped tokens for each group pair + # then discount it for next timestamp + overlap = self.diversity_discount * torch.sum( + indices_G_stacked.unsqueeze(2).eq(indices_G_stacked.unsqueeze(3)), dim=1 + ) + if step == 0: + self.group_overlap = overlap + else: + self.group_overlap[original_batch_idxs] = ( + self.group_overlap[original_batch_idxs] * self.diversity_discount + + overlap + ) + return scores_buf, indices_buf, beams_buf diff --git a/fairseq/sequence_generator.py b/fairseq/sequence_generator.py index 9c5423e2b1..78db504e6c 100644 --- a/fairseq/sequence_generator.py +++ b/fairseq/sequence_generator.py @@ -4,15 +4,17 @@ # LICENSE file in the root directory of this source tree. import math +import sys from typing import Dict, List, Optional import torch import torch.nn as nn +from torch import Tensor + from fairseq import search, utils from fairseq.data import data_utils from fairseq.models import FairseqIncrementalDecoder -from fairseq.models.fairseq_encoder import EncoderOut -from torch import Tensor +from fairseq.ngram_repeat_block import NGramRepeatBlock class SequenceGenerator(nn.Module): @@ -23,6 +25,7 @@ def __init__( beam_size=1, max_len_a=0, max_len_b=200, + max_len=0, min_len=1, normalize_scores=True, len_penalty=1.0, @@ -35,6 +38,7 @@ def __init__( symbols_to_strip_from_output=None, lm_model=None, lm_weight=1.0, + tokens_to_suppress=(), ): """Generates translations of a given source sentence. 
@@ -44,6 +48,8 @@ def __init__( beam_size (int, optional): beam width (default: 1) max_len_a/b (int, optional): generate sequences of maximum length ax + b, where x is the source length + max_len (int, optional): the maximum length of the generated output + (not including end-of-sentence) min_len (int, optional): the minimum length of the generated output (not including end-of-sentence) normalize_scores (bool, optional): normalize scores by the length @@ -72,20 +78,39 @@ def __init__( if symbols_to_strip_from_output is not None else {self.eos} ) + + self.token_indices_to_suppress: Optional[Tensor] = None + token_indices_to_suppress = [] + for token_string in tokens_to_suppress: + token_index = tgt_dict.index(token_string) + assert token_index != self.unk + token_indices_to_suppress.append(token_index) + if len(token_indices_to_suppress) > 0: + self.token_indices_to_suppress = torch.Tensor( + token_indices_to_suppress + ).long() + self.vocab_size = len(tgt_dict) self.beam_size = beam_size # the max beam size is the dictionary size - 1, since we never select pad self.beam_size = min(beam_size, self.vocab_size - 1) + self.model.set_decoder_beam_size(self.beam_size) self.max_len_a = max_len_a self.max_len_b = max_len_b self.min_len = min_len + self.max_len = max_len or self.model.max_decoder_positions() self.normalize_scores = normalize_scores self.len_penalty = len_penalty self.unk_penalty = unk_penalty self.temperature = temperature self.match_source_len = match_source_len - self.no_repeat_ngram_size = no_repeat_ngram_size + + if no_repeat_ngram_size > 0: + self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size) + else: + self.repeat_ngram_blocker = None + assert temperature > 0, "--temperature must be greater than 0" self.search = ( @@ -161,7 +186,9 @@ def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None) yield id, src, ref, hypos[i] @torch.no_grad() - def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs): + def generate( + self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs + ) -> List[List[Dict[str, Tensor]]]: """Generate translations. Match the api of other fairseq generators. Args: @@ -195,9 +222,15 @@ def _generate( if "src_tokens" in net_input: src_tokens = net_input["src_tokens"] # length of the source text being the character length except EndOfSentence and pad - src_lengths = ( - (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1) - ) + # if src_lengths exists in net_input (speech_to_text dataset case), then use it + if "src_lengths" in net_input: + src_lengths = net_input["src_lengths"] + else: + src_lengths = ( + (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)) + .long() + .sum(dim=1) + ) elif "source" in net_input: src_tokens = net_input["source"] src_lengths = ( @@ -205,11 +238,21 @@ def _generate( if net_input["padding_mask"] is not None else torch.tensor(src_tokens.size(-1)).to(src_tokens) ) + elif "features" in net_input: + src_tokens = net_input["features"] + src_lengths = ( + net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) + if net_input["padding_mask"] is not None + else torch.tensor(src_tokens.size(-1)).to(src_tokens) + ) else: - raise Exception("expected src_tokens or source in net input") + raise Exception( + "expected src_tokens or source in net input. input keys: " + + str(net_input.keys()) + ) # bsz: total number of sentences in beam - # Note that src_tokens may have more than 2 dimenions (i.e. 
audio features) + # Note that src_tokens may have more than 2 dimensions (i.e. audio features) bsz, src_len = src_tokens.size()[:2] beam_size = self.beam_size @@ -227,14 +270,14 @@ def _generate( else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), - # exclude the EOS marker - self.model.max_decoder_positions() - 1, + self.max_len - 1, ) assert ( self.min_len <= max_len ), "min_len cannot be larger than max_len, please adjust these!" # compute the encoder output for each beam - encoder_outs = self.model.forward_encoder(net_input) + with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"): + encoder_outs = self.model.forward_encoder(net_input) # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) @@ -270,17 +313,21 @@ def _generate( [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step - finished = [ - False for i in range(bsz) - ] # a boolean array indicating if the sentence at the index is finished or not + # a boolean array indicating if the sentence at the index is finished or not + finished = [False for i in range(bsz)] num_remaining_sent = bsz # number of sentences remaining # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes - bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) - cand_offsets = torch.arange(0, cand_size).type_as(tokens) + bbsz_offsets = ( + (torch.arange(0, bsz) * beam_size) + .unsqueeze(1) + .type_as(tokens) + .to(src_tokens.device) + ) + cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device) reorder_state: Optional[Tensor] = None batch_idxs: Optional[Tensor] = None @@ -307,13 +354,15 @@ def _generate( encoder_outs = self.model.reorder_encoder_out( encoder_outs, reorder_state ) - - lprobs, avg_attn_scores = self.model.forward_decoder( - tokens[:, : step + 1], - encoder_outs, - incremental_states, - self.temperature, - ) + with torch.autograd.profiler.record_function( + "EnsembleModel: forward_decoder" + ): + lprobs, avg_attn_scores = self.model.forward_decoder( + tokens[:, : step + 1], + encoder_outs, + incremental_states, + self.temperature, + ) if self.lm_model is not None: lm_out = self.lm_model(tokens[:, : step + 1]) @@ -342,9 +391,13 @@ def _generate( lprobs, tokens, scores = self._prefix_tokens( step, lprobs, scores, tokens, prefix_tokens, beam_size ) - elif step < self.min_len: - # minimum length constraint (does not apply if using prefix_tokens) - lprobs[:, self.eos] = -math.inf + else: + if step < self.min_len: + # minimum length constraint (does not apply if using prefix_tokens) + lprobs[:, self.eos] = -math.inf + + if self.token_indices_to_suppress is not None: + lprobs[:, self.token_indices_to_suppress] = -math.inf # Record attention scores, only support avg_attn_scores is a Tensor if avg_attn_scores is not None: @@ -365,8 +418,8 @@ def _generate( if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) - if self.no_repeat_ngram_size > 0: - lprobs = self._no_repeat_ngram(tokens, lprobs, bsz, beam_size, step) + if self.repeat_ngram_blocker is not None: + lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( @@ -420,7 
+473,7 @@ def _generate( break if self.search.stop_on_max_len and step >= max_len: break - assert step < max_len + assert step < max_len, f"{step} < {max_len}" # Remove finalized sentences (ones for which {beam_size} # finished hypotheses have been generated) from the batch. @@ -633,44 +686,38 @@ def finalize_hypos( prev += 1 else: cum_unfin.append(prev) + cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx) - # The keys here are of the form "{sent}_{unfin_idx}", where + unfin_idx = torch.div(bbsz_idx, beam_size, rounding_mode="trunc") + sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx) + + # Create a set of "{sent}{unfin_idx}", where # "unfin_idx" is the index in the current (possibly reduced) # list of sentences, and "sent" is the index in the original, # unreduced batch - # set() is not supported in script export - sents_seen: Dict[str, Optional[Tensor]] = {} - # For every finished beam item - for i in range(bbsz_idx.size()[0]): - idx = bbsz_idx[i] - score = eos_scores[i] - # sentence index in the current (possibly reduced) batch - unfin_idx = idx // beam_size - # sentence index in the original (unreduced) batch - sent = unfin_idx + cum_unfin[unfin_idx] - # Cannot create dict for key type '(int, int)' in torchscript. - # The workaround is to cast int to string - seen = str(sent.item()) + "_" + str(unfin_idx.item()) - if seen not in sents_seen: - sents_seen[seen] = None - - if self.match_source_len and step > src_lengths[unfin_idx]: - score = torch.tensor(-math.inf).to(score) + # sentence index in the current (possibly reduced) batch + seen = (sent << 32) + unfin_idx + unique_seen: List[int] = torch.unique(seen).tolist() + if self.match_source_len: + condition = step > torch.index_select(src_lengths, 0, unfin_idx) + eos_scores = torch.where(condition, torch.tensor(-math.inf), eos_scores) + sent_list: List[int] = sent.tolist() + for i in range(bbsz_idx.size()[0]): # An input sentence (among those in a batch) is finished when # beam_size hypotheses have been collected for it - if len(finalized[sent]) < beam_size: + if len(finalized[sent_list[i]]) < beam_size: if attn_clone is not None: # remove padding tokens from attn scores hypo_attn = attn_clone[i] else: hypo_attn = torch.empty(0) - finalized[sent].append( + finalized[sent_list[i]].append( { "tokens": tokens_clone[i], - "score": score, + "score": eos_scores[i], "attention": hypo_attn, # src_len x tgt_len "alignment": torch.empty(0), "positional_scores": pos_scores[i], @@ -678,17 +725,16 @@ def finalize_hypos( ) newly_finished: List[int] = [] - - for seen in sents_seen.keys(): + for unique_s in unique_seen: # check termination conditions for this sentence - sent: int = int(float(seen.split("_")[0])) - unfin_idx: int = int(float(seen.split("_")[1])) + unique_sent: int = unique_s >> 32 + unique_unfin_idx: int = unique_s - (unique_sent << 32) - if not finished[sent] and self.is_finished( - step, unfin_idx, max_len, len(finalized[sent]), beam_size + if not finished[unique_sent] and self.is_finished( + step, unique_unfin_idx, max_len, len(finalized[unique_sent]), beam_size ): - finished[sent] = True - newly_finished.append(unfin_idx) + finished[unique_sent] = True + newly_finished.append(unique_unfin_idx) return newly_finished @@ -710,62 +756,6 @@ def is_finished( return True return False - def calculate_banned_tokens( - self, - tokens, - step: int, - gen_ngrams: List[Dict[str, List[int]]], - no_repeat_ngram_size: int, - bbsz_idx: int, - ): - tokens_list: List[int] = tokens[ - bbsz_idx, step + 2 - 
no_repeat_ngram_size : step + 1 - ].tolist() - # before decoding the next token, prevent decoding of ngrams that have already appeared - ngram_index = ",".join([str(x) for x in tokens_list]) - return gen_ngrams[bbsz_idx].get(ngram_index, torch.jit.annotate(List[int], [])) - - def transpose_list(self, l: List[List[int]]): - # GeneratorExp aren't supported in TS so ignoring the lint - min_len = min([len(x) for x in l]) # noqa - l2 = [[row[i] for row in l] for i in range(min_len)] - return l2 - - def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int): - # for each beam and batch sentence, generate a list of previous ngrams - gen_ngrams: List[Dict[str, List[int]]] = [ - torch.jit.annotate(Dict[str, List[int]], {}) - for bbsz_idx in range(bsz * beam_size) - ] - cpu_tokens = tokens.cpu() - for bbsz_idx in range(bsz * beam_size): - gen_tokens: List[int] = cpu_tokens[bbsz_idx].tolist() - for ngram in self.transpose_list( - [gen_tokens[i:] for i in range(self.no_repeat_ngram_size)] - ): - key = ",".join([str(x) for x in ngram[:-1]]) - gen_ngrams[bbsz_idx][key] = gen_ngrams[bbsz_idx].get( - key, torch.jit.annotate(List[int], []) - ) + [ngram[-1]] - - if step + 2 - self.no_repeat_ngram_size >= 0: - # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - banned_tokens = [ - self.calculate_banned_tokens( - tokens, step, gen_ngrams, self.no_repeat_ngram_size, bbsz_idx - ) - for bbsz_idx in range(bsz * beam_size) - ] - else: - banned_tokens = [ - torch.jit.annotate(List[int], []) for bbsz_idx in range(bsz * beam_size) - ] - for bbsz_idx in range(bsz * beam_size): - lprobs[bbsz_idx][ - torch.tensor(banned_tokens[bbsz_idx]).long() - ] = torch.tensor(-math.inf).to(lprobs) - return lprobs - class EnsembleModel(nn.Module): """A wrapper around an ensemble of models.""" @@ -794,7 +784,21 @@ def has_incremental_states(self): return self.has_incremental def max_decoder_positions(self): - return min([m.max_decoder_positions() for m in self.models]) + return min( + [ + m.max_decoder_positions() + for m in self.models + if hasattr(m, "max_decoder_positions") + ] + + [sys.maxsize] + ) + + def set_decoder_beam_size(self, beam_size): + """Set beam size for efficient beamable enc-dec attention.""" + if beam_size > 1: + for model in self.models: + if hasattr(model, "set_beam_size"): + model.set_beam_size(beam_size) @torch.jit.export def forward_encoder(self, net_input: Dict[str, Tensor]): @@ -806,13 +810,13 @@ def forward_encoder(self, net_input: Dict[str, Tensor]): def forward_decoder( self, tokens, - encoder_outs: List[EncoderOut], + encoder_outs: List[Dict[str, List[Tensor]]], incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], temperature: float = 1.0, ): log_probs = [] avg_attn: Optional[Tensor] = None - encoder_out: Optional[EncoderOut] = None + encoder_out: Optional[Dict[str, List[Tensor]]] = None for i, model in enumerate(self.models): if self.has_encoder(): encoder_out = encoder_outs[i] @@ -824,7 +828,10 @@ def forward_decoder( incremental_state=incremental_states[i], ) else: - decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out) + if hasattr(model, "decoder"): + decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out) + else: + decoder_out = model.forward(tokens) attn: Optional[Tensor] = None decoder_len = len(decoder_out) @@ -844,7 +851,6 @@ def forward_decoder( decoder_out[0][:, -1:, :].div_(temperature), None if decoder_len <= 1 else decoder_out[1], ) - probs = model.get_normalized_probs( decoder_out_tuple, 
log_probs=True, sample=None ) @@ -868,7 +874,9 @@ def forward_decoder( return avg_probs, avg_attn @torch.jit.export - def reorder_encoder_out(self, encoder_outs: Optional[List[EncoderOut]], new_order): + def reorder_encoder_out( + self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order + ): """ Reorder encoder output according to *new_order*. @@ -879,7 +887,7 @@ def reorder_encoder_out(self, encoder_outs: Optional[List[EncoderOut]], new_orde Returns: *encoder_out* rearranged according to *new_order* """ - new_outs: List[EncoderOut] = [] + new_outs: List[Dict[str, List[Tensor]]] = [] if not self.has_encoder(): return new_outs for i, model in enumerate(self.models): @@ -904,7 +912,9 @@ def reorder_incremental_state( class SequenceGeneratorWithAlignment(SequenceGenerator): - def __init__(self, models, tgt_dict, left_pad_target=False, **kwargs): + def __init__( + self, models, tgt_dict, left_pad_target=False, print_alignment="hard", **kwargs + ): """Generates translations of a given source sentence. Produces alignments following "Jointly Learning to Align and @@ -918,6 +928,11 @@ def __init__(self, models, tgt_dict, left_pad_target=False, **kwargs): super().__init__(EnsembleModelWithAlignment(models), tgt_dict, **kwargs) self.left_pad_target = left_pad_target + if print_alignment == "hard": + self.extract_alignment = utils.extract_hard_alignment + elif print_alignment == "soft": + self.extract_alignment = utils.extract_soft_alignment + @torch.no_grad() def generate(self, models, sample, **kwargs): finalized = super()._generate(sample, **kwargs) @@ -946,7 +961,7 @@ def generate(self, models, sample, **kwargs): # Process the attn matrix to extract hard alignments. for i in range(bsz * beam_size): - alignment = utils.extract_hard_alignment( + alignment = self.extract_alignment( attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos ) finalized[i // beam_size][i % beam_size]["alignment"] = alignment diff --git a/fairseq/speech_generator.py b/fairseq/speech_generator.py new file mode 100644 index 0000000000..f2cc8b5e86 --- /dev/null +++ b/fairseq/speech_generator.py @@ -0,0 +1,427 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
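Aside: before the new speech generators below, it may help to see the index-packing trick from the rewritten `finalize_hypos` above in isolation. `(sent << 32) + unfin_idx` folds a pair of small indices into one integer key so that `torch.unique` can replace the old string-keyed dict; the toy values below are made up for illustration.

```python
import torch

# Toy values; illustrates the key-packing used in finalize_hypos above.
sent = torch.tensor([3, 3, 7], dtype=torch.long)       # index in the original batch
unfin_idx = torch.tensor([1, 1, 2], dtype=torch.long)  # index in the reduced batch

seen = (sent << 32) + unfin_idx        # one integer key per (sent, unfin_idx) pair
unique_seen = torch.unique(seen).tolist()

for key in unique_seen:
    s = key >> 32          # recover sent
    u = key - (s << 32)    # recover unfin_idx
    print(s, u)            # prints "3 1" then "7 2" -- duplicates collapsed
```

The shift by 32 bits keeps the two indices from colliding as long as `unfin_idx` stays below 2**32, which comfortably holds for any realistic batch.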
+ +import numpy as np +import torch + +from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig + + +class SpeechGenerator(object): + def __init__(self, model, vocoder, data_cfg: S2TDataConfig): + self.model = model + self.vocoder = vocoder + stats_npz_path = data_cfg.global_cmvn_stats_npz + self.gcmvn_stats = None + if stats_npz_path is not None: + self.gcmvn_stats = np.load(stats_npz_path) + + def gcmvn_denormalize(self, x): + # x: B x T x C + if self.gcmvn_stats is None: + return x + mean = torch.from_numpy(self.gcmvn_stats["mean"]).to(x) + std = torch.from_numpy(self.gcmvn_stats["std"]).to(x) + assert len(x.shape) == 3 and mean.shape[0] == std.shape[0] == x.shape[2] + x = x * std.view(1, 1, -1).expand_as(x) + return x + mean.view(1, 1, -1).expand_as(x) + + def get_waveform(self, feat): + # T x C -> T + return None if self.vocoder is None else self.vocoder(feat).squeeze(0) + + +class AutoRegressiveSpeechGenerator(SpeechGenerator): + def __init__( + self, + model, + vocoder, + data_cfg, + max_iter: int = 6000, + eos_prob_threshold: float = 0.5, + ): + super().__init__(model, vocoder, data_cfg) + self.max_iter = max_iter + self.eos_prob_threshold = eos_prob_threshold + + @torch.no_grad() + def generate(self, model, sample, has_targ=False, **kwargs): + model.eval() + + src_tokens = sample["net_input"]["src_tokens"] + src_lengths = sample["net_input"]["src_lengths"] + bsz, src_len = src_tokens.size()[:2] + n_frames_per_step = model.decoder.n_frames_per_step + out_dim = model.decoder.out_dim + raw_dim = out_dim // n_frames_per_step + + # initialize + encoder_out = model.forward_encoder( + src_tokens, src_lengths, speaker=sample["speaker"] + ) + incremental_state = {} + feat, attn, eos_prob = [], [], [] + finished = src_tokens.new_zeros((bsz,)).bool() + out_lens = src_lengths.new_zeros((bsz,)).long().fill_(self.max_iter) + + prev_feat_out = encoder_out["encoder_out"][0].new_zeros(bsz, 1, out_dim) + for step in range(self.max_iter): + cur_out_lens = out_lens.clone() + cur_out_lens.masked_fill_(cur_out_lens.eq(self.max_iter), step + 1) + _, cur_eos_out, cur_extra = model.forward_decoder( + prev_feat_out, + encoder_out=encoder_out, + incremental_state=incremental_state, + target_lengths=cur_out_lens, + speaker=sample["speaker"], + **kwargs, + ) + cur_eos_prob = torch.sigmoid(cur_eos_out).squeeze(2) + feat.append(cur_extra["feature_out"]) + attn.append(cur_extra["attn"]) + eos_prob.append(cur_eos_prob) + + cur_finished = cur_eos_prob.squeeze(1) > self.eos_prob_threshold + out_lens.masked_fill_((~finished) & cur_finished, step + 1) + finished = finished | cur_finished + if finished.sum().item() == bsz: + break + prev_feat_out = cur_extra["feature_out"] + + feat = torch.cat(feat, dim=1) + feat = model.decoder.postnet(feat) + feat + eos_prob = torch.cat(eos_prob, dim=1) + attn = torch.cat(attn, dim=2) + alignment = attn.max(dim=1)[1] + + feat = feat.reshape(bsz, -1, raw_dim) + feat = self.gcmvn_denormalize(feat) + + eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1) + attn = attn.repeat_interleave(n_frames_per_step, dim=2) + alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) + out_lens = out_lens * n_frames_per_step + + finalized = [ + { + "feature": feat[b, :out_len], + "eos_prob": eos_prob[b, :out_len], + "attn": attn[b, :, :out_len], + "alignment": alignment[b, :out_len], + "waveform": self.get_waveform(feat[b, :out_len]), + } + for b, out_len in zip(range(bsz), out_lens) + ] + + if has_targ: + assert sample["target"].size(-1) == out_dim + tgt_feats = 
sample["target"].view(bsz, -1, raw_dim) + tgt_feats = self.gcmvn_denormalize(tgt_feats) + tgt_lens = sample["target_lengths"] * n_frames_per_step + for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)): + finalized[b]["targ_feature"] = f[:l] + finalized[b]["targ_waveform"] = self.get_waveform(f[:l]) + return finalized + + +class MultiDecoderSpeechGenerator(SpeechGenerator): + def __init__( + self, + models, + args, + vocoder, + data_cfg, + tgt_dict_mt, + max_iter: int = 6000, + eos_prob_threshold: float = 0.5, + eos_mt=None, + symbols_to_strip_from_output=None, + ): + super().__init__(models[0], vocoder, data_cfg) + self.max_iter = max_iter + self.eos_prob_threshold = eos_prob_threshold + + self.tgt_dict_mt = tgt_dict_mt + self.eos_mt = eos_mt + + from examples.speech_to_speech.unity.sequence_generator import SequenceGenerator + from fairseq import search + + self.text_generator = SequenceGenerator( + models, + tgt_dict_mt, + beam_size=max(1, getattr(args, "beam", 5)), + max_len_a=getattr(args, "max_len_a", 0), + max_len_b=getattr(args, "max_len_b", 200), + min_len=getattr(args, "min_len", 1), + normalize_scores=(not getattr(args, "unnormalized", False)), + len_penalty=getattr(args, "lenpen", 1), + unk_penalty=getattr(args, "unkpen", 0), + temperature=getattr(args, "temperature", 1.0), + match_source_len=getattr(args, "match_source_len", False), + no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), + search_strategy=search.BeamSearch(tgt_dict_mt), + eos=eos_mt, + symbols_to_strip_from_output=symbols_to_strip_from_output, + ) + + @torch.no_grad() + def generate(self, model, sample, has_targ=False, **kwargs): + model.eval() + + src_tokens = sample["net_input"]["src_tokens"] + src_lengths = sample["net_input"]["src_lengths"] + bsz, src_len = src_tokens.size()[:2] + n_frames_per_step = model.decoder.n_frames_per_step + out_dim = model.decoder.out_dim + raw_dim = out_dim // n_frames_per_step + + # initialize + encoder_out = model.forward_encoder( + src_tokens, src_lengths, speaker=sample["speaker"] + ) + + prefix_tokens = None + constraints = None + bos_token = None + + mt_decoder = getattr(model, f"{model.mt_task_name}_decoder") + + # 1. 
MT decoder + finalized_mt = self.text_generator.generate_decoder( + [encoder_out], + src_tokens, + src_lengths, + sample, + prefix_tokens, + constraints, + bos_token, + aux_task_name=model.mt_task_name, + ) + + # extract decoder output corresponding to the best hypothesis + max_tgt_len = max([len(hypo[0]["tokens"]) for hypo in finalized_mt]) + prev_output_tokens_mt = ( + src_tokens.new_zeros(src_tokens.shape[0], max_tgt_len) + .fill_(mt_decoder.padding_idx) + .int() + ) # B x T + for i, hypo in enumerate(finalized_mt): + i_beam = 0 + tmp = hypo[i_beam]["tokens"].int() # hyp + eos + prev_output_tokens_mt[i, 0] = self.text_generator.eos + if tmp[-1] == self.text_generator.eos: + tmp = tmp[:-1] + prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp + + text = "".join([self.tgt_dict_mt[c] for c in tmp]) + text = text.replace("_", " ") + text = text.replace("▁", " ") + text = text.replace("<unk>", " ") + text = text.replace("<s>", "") + text = text.replace("</s>", "") + if len(text) > 0 and text[0] == " ": + text = text[1:] + sample_id = sample["id"].tolist()[i] + print("{} (None-{})".format(text, sample_id)) + + mt_decoder_out = mt_decoder( + prev_output_tokens_mt, + encoder_out=encoder_out, + features_only=True, + ) + x = mt_decoder_out[0].transpose(0, 1) + + mt_decoder_padding_mask = None + if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any(): + mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx) + + # 2. TTS encoder + if getattr(model, "synthesizer_encoder", None) is not None: + synthesizer_encoder_out = model.synthesizer_encoder( + x, + mt_decoder_padding_mask, + ) + else: + synthesizer_encoder_out = { + "encoder_out": [x], # T x B x C + "encoder_padding_mask": [mt_decoder_padding_mask] + if mt_decoder_padding_mask is not None + else [], # B x T + "encoder_embedding": [], + "encoder_states": [], + "src_tokens": [], + "src_lengths": [], + } + + # 3. 
TTS decoder + incremental_state = {} + feat, attn, eos_prob = [], [], [] + finished = src_tokens.new_zeros((bsz,)).bool() + out_lens = src_lengths.new_zeros((bsz,)).long().fill_(self.max_iter) + + prev_feat_out = encoder_out["encoder_out"][0].new_zeros(bsz, 1, out_dim) + for step in range(self.max_iter): + cur_out_lens = out_lens.clone() + cur_out_lens.masked_fill_(cur_out_lens.eq(self.max_iter), step + 1) + _, cur_eos_out, cur_extra = model.forward_decoder( + prev_feat_out, + encoder_out=synthesizer_encoder_out, + incremental_state=incremental_state, + target_lengths=cur_out_lens, + speaker=sample["speaker"], + **kwargs, + ) + cur_eos_prob = torch.sigmoid(cur_eos_out).squeeze(2) + feat.append(cur_extra["feature_out"]) + attn.append(cur_extra["attn"]) + eos_prob.append(cur_eos_prob) + + cur_finished = cur_eos_prob.squeeze(1) > self.eos_prob_threshold + out_lens.masked_fill_((~finished) & cur_finished, step + 1) + finished = finished | cur_finished + if finished.sum().item() == bsz: + break + prev_feat_out = cur_extra["feature_out"] + + feat = torch.cat(feat, dim=1) + feat = model.decoder.postnet(feat) + feat + eos_prob = torch.cat(eos_prob, dim=1) + attn = torch.cat(attn, dim=2) + alignment = attn.max(dim=1)[1] + + feat = feat.reshape(bsz, -1, raw_dim) + feat = self.gcmvn_denormalize(feat) + + eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1) + attn = attn.repeat_interleave(n_frames_per_step, dim=2) + alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) + out_lens = out_lens * n_frames_per_step + + finalized = [ + { + "feature": feat[b, :out_len], + "eos_prob": eos_prob[b, :out_len], + "attn": attn[b, :, :out_len], + "alignment": alignment[b, :out_len], + "waveform": self.get_waveform(feat[b, :out_len]), + } + for b, out_len in zip(range(bsz), out_lens) + ] + + if has_targ: + assert sample["target"].size(-1) == out_dim + tgt_feats = sample["target"].view(bsz, -1, raw_dim) + tgt_feats = self.gcmvn_denormalize(tgt_feats) + tgt_lens = sample["target_lengths"] * n_frames_per_step + for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)): + finalized[b]["targ_feature"] = f[:l] + finalized[b]["targ_waveform"] = self.get_waveform(f[:l]) + return finalized + + +class NonAutoregressiveSpeechGenerator(SpeechGenerator): + @torch.no_grad() + def generate(self, model, sample, has_targ=False, **kwargs): + model.eval() + + bsz, max_src_len = sample["net_input"]["src_tokens"].size() + n_frames_per_step = model.encoder.n_frames_per_step + out_dim = model.encoder.out_dim + raw_dim = out_dim // n_frames_per_step + + feat, feat_post, out_lens, log_dur_out, _, _ = model( + src_tokens=sample["net_input"]["src_tokens"], + src_lengths=sample["net_input"]["src_lengths"], + prev_output_tokens=sample["net_input"]["prev_output_tokens"], + incremental_state=None, + target_lengths=sample["target_lengths"], + speaker=sample["speaker"], + ) + if feat_post is not None: + feat = feat_post + + feat = feat.view(bsz, -1, raw_dim) + feat = self.gcmvn_denormalize(feat) + + dur_out = torch.clamp(torch.round(torch.exp(log_dur_out) - 1).long(), min=0) + + def get_dur_plot_data(d): + r = [] + for i, dd in enumerate(d): + r += [i + 1] * dd.item() + return r + + out_lens = out_lens * n_frames_per_step + finalized = [ + { + "feature": feat[b, :l] if l > 0 else feat.new_zeros([1, raw_dim]), + "waveform": self.get_waveform( + feat[b, :l] if l > 0 else feat.new_zeros([1, raw_dim]) + ), + "attn": feat.new_tensor(get_dur_plot_data(dur_out[b])), + } + for b, l in zip(range(bsz), out_lens) + ] + + if has_targ: + 
tgt_feats = sample["target"].view(bsz, -1, raw_dim) + tgt_feats = self.gcmvn_denormalize(tgt_feats) + tgt_lens = sample["target_lengths"] * n_frames_per_step + for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)): + finalized[b]["targ_feature"] = f[:l] + finalized[b]["targ_waveform"] = self.get_waveform(f[:l]) + return finalized + + +class TeacherForcingAutoRegressiveSpeechGenerator(AutoRegressiveSpeechGenerator): + @torch.no_grad() + def generate(self, model, sample, has_targ=False, **kwargs): + model.eval() + + src_tokens = sample["net_input"]["src_tokens"] + src_lens = sample["net_input"]["src_lengths"] + prev_out_tokens = sample["net_input"]["prev_output_tokens"] + tgt_lens = sample["target_lengths"] + n_frames_per_step = model.decoder.n_frames_per_step + raw_dim = model.decoder.out_dim // n_frames_per_step + bsz = src_tokens.shape[0] + + feat, eos_prob, extra = model( + src_tokens, + src_lens, + prev_out_tokens, + incremental_state=None, + target_lengths=tgt_lens, + speaker=sample["speaker"], + ) + + attn = extra["attn"] # B x T_s x T_t + alignment = attn.max(dim=1)[1] + feat = feat.reshape(bsz, -1, raw_dim) + feat = self.gcmvn_denormalize(feat) + eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1) + attn = attn.repeat_interleave(n_frames_per_step, dim=2) + alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) + tgt_lens = sample["target_lengths"] * n_frames_per_step + + finalized = [ + { + "feature": feat[b, :tgt_len], + "eos_prob": eos_prob[b, :tgt_len], + "attn": attn[b, :, :tgt_len], + "alignment": alignment[b, :tgt_len], + "waveform": self.get_waveform(feat[b, :tgt_len]), + } + for b, tgt_len in zip(range(bsz), tgt_lens) + ] + + if has_targ: + tgt_feats = sample["target"].view(bsz, -1, raw_dim) + tgt_feats = self.gcmvn_denormalize(tgt_feats) + for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)): + finalized[b]["targ_feature"] = f[:l] + finalized[b]["targ_waveform"] = self.get_waveform(f[:l]) + return finalized diff --git a/fairseq/tasks/__init__.py b/fairseq/tasks/__init__.py index 41f461f802..6da1f001f0 100644 --- a/fairseq/tasks/__init__.py +++ b/fairseq/tasks/__init__.py @@ -9,7 +9,8 @@ import os from fairseq.dataclass import FairseqDataclass -from omegaconf import DictConfig +from fairseq.dataclass.utils import merge_with_parent +from hydra.core.config_store import ConfigStore from .fairseq_task import FairseqTask, LegacyFairseqTask # noqa @@ -20,10 +21,30 @@ TASK_CLASS_NAMES = set() -def setup_task(cfg: DictConfig, **kwargs): - if isinstance(cfg, DictConfig): - return TASK_REGISTRY[cfg._name].setup_task(cfg, **kwargs) - return TASK_REGISTRY[cfg.task].setup_task(cfg, **kwargs) +def setup_task(cfg: FairseqDataclass, **kwargs): + task = None + task_name = getattr(cfg, "task", None) + + if isinstance(task_name, str): + # legacy tasks + task = TASK_REGISTRY[task_name] + if task_name in TASK_DATACLASS_REGISTRY: + dc = TASK_DATACLASS_REGISTRY[task_name] + cfg = dc.from_namespace(cfg) + else: + task_name = getattr(cfg, "_name", None) + + if task_name and task_name in TASK_DATACLASS_REGISTRY: + remove_missing = "from_checkpoint" in kwargs and kwargs["from_checkpoint"] + dc = TASK_DATACLASS_REGISTRY[task_name] + cfg = merge_with_parent(dc(), cfg, remove_missing=remove_missing) + task = TASK_REGISTRY[task_name] + + assert ( + task is not None + ), f"Could not infer task type from {cfg}. Available argparse tasks: {TASK_REGISTRY.keys()}. 
Available hydra tasks: {TASK_DATACLASS_REGISTRY.keys()}" + + return task.setup_task(cfg, **kwargs) def register_task(name, dataclass=None): @@ -48,7 +69,8 @@ class ClassificationTask(FairseqTask): def register_task_cls(cls): if name in TASK_REGISTRY: - raise ValueError("Cannot register duplicate task ({})".format(name)) + return TASK_REGISTRY[name] + if not issubclass(cls, FairseqTask): raise ValueError( "Task ({}: {}) must extend FairseqTask".format(name, cls.__name__) @@ -71,6 +93,11 @@ def register_task_cls(cls): if dataclass is not None: TASK_DATACLASS_REGISTRY[name] = dataclass + cs = ConfigStore.instance() + node = dataclass() + node._name = name + cs.store(name=name, group="task", node=node, provider="fairseq") + return cls return register_task_cls @@ -80,26 +107,32 @@ def get_task(name): return TASK_REGISTRY[name] +def import_tasks(tasks_dir, namespace): + for file in os.listdir(tasks_dir): + path = os.path.join(tasks_dir, file) + if ( + not file.startswith("_") + and not file.startswith(".") + and (file.endswith(".py") or os.path.isdir(path)) + ): + task_name = file[: file.find(".py")] if file.endswith(".py") else file + importlib.import_module(namespace + "." + task_name) + + # expose `task_parser` for sphinx + if task_name in TASK_REGISTRY: + parser = argparse.ArgumentParser(add_help=False) + group_task = parser.add_argument_group("Task name") + # fmt: off + group_task.add_argument('--task', metavar=task_name, + help='Enable this task with: ``--task=' + task_name + '``') + # fmt: on + group_args = parser.add_argument_group( + "Additional command-line arguments" + ) + TASK_REGISTRY[task_name].add_args(group_args) + globals()[task_name + "_parser"] = parser + + # automatically import any Python files in the tasks/ directory tasks_dir = os.path.dirname(__file__) -for file in os.listdir(tasks_dir): - path = os.path.join(tasks_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - task_name = file[: file.find(".py")] if file.endswith(".py") else file - module = importlib.import_module("fairseq.tasks." + task_name) - - # expose `task_parser` for sphinx - if task_name in TASK_REGISTRY: - parser = argparse.ArgumentParser(add_help=False) - group_task = parser.add_argument_group("Task name") - # fmt: off - group_task.add_argument('--task', metavar=task_name, - help='Enable this task with: ``--task=' + task_name + '``') - # fmt: on - group_args = parser.add_argument_group("Additional command-line arguments") - TASK_REGISTRY[task_name].add_args(group_args) - globals()[task_name + "_parser"] = parser +import_tasks(tasks_dir, "fairseq.tasks") diff --git a/fairseq/tasks/audio_classification.py b/fairseq/tasks/audio_classification.py new file mode 100644 index 0000000000..4c21d23b69 --- /dev/null +++ b/fairseq/tasks/audio_classification.py @@ -0,0 +1,269 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
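Aside: with the registry changes in fairseq/tasks/__init__.py above, a task registered with a dataclass is also stored in Hydra's ConfigStore, and `setup_task` can resolve either a legacy string task name or a dataclass `_name`. The sketch below shows what a downstream task definition might look like under this flow; the `ToyTask` name and its single `data` field are hypothetical.

```python
# Hypothetical example task; only the registration pattern mirrors fairseq.
from dataclasses import dataclass, field

from fairseq.dataclass import FairseqDataclass
from fairseq.tasks import FairseqTask, register_task


@dataclass
class ToyTaskConfig(FairseqDataclass):
    data: str = field(default="", metadata={"help": "path to data directory"})


@register_task("toy_task", dataclass=ToyTaskConfig)
class ToyTask(FairseqTask):
    """Minimal task skeleton; a real task would also implement load_dataset()."""

    @classmethod
    def setup_task(cls, cfg: ToyTaskConfig, **kwargs):
        # setup_task receives the merged dataclass config (see setup_task above)
        return cls(cfg)
```

Once registered, the config node lives in the `task` group, so the task can be selected from Hydra configs as well as through the legacy `--task` flag.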
+ +from collections import OrderedDict +import itertools +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +import torch +from omegaconf import II, MISSING +from sklearn import metrics as sklearn_metrics + +from fairseq.data import AddTargetDataset, Dictionary, FileAudioDataset +from fairseq.data.multi_corpus_dataset import MultiCorpusDataset +from fairseq.data.text_compressor import TextCompressionLevel, TextCompressor +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks.audio_pretraining import AudioPretrainingConfig, AudioPretrainingTask +from fairseq.tasks.audio_finetuning import label_len_fn, LabelEncoder + +from .. import utils +from ..logging import metrics +from . import FairseqTask, register_task + +logger = logging.getLogger(__name__) + +@dataclass +class AudioClassificationConfig(AudioPretrainingConfig): + target_dictionary: Optional[str] = field( + default=None, metadata={"help": "override default dictionary location"} + ) + + +@register_task("audio_classification", dataclass=AudioClassificationConfig) +class AudioClassificationTask(AudioPretrainingTask): + """Task for audio classification tasks.""" + + cfg: AudioClassificationConfig + + def __init__( + self, + cfg: AudioClassificationConfig, + ): + super().__init__(cfg) + self.state.add_factory("target_dictionary", self.load_target_dictionary) + logging.info(f"=== Number of labels = {len(self.target_dictionary)}") + + def load_target_dictionary(self): + if self.cfg.labels: + target_dictionary = self.cfg.data + if self.cfg.target_dictionary: # override dict + target_dictionary = self.cfg.target_dictionary + dict_path = os.path.join(target_dictionary, f"dict.{self.cfg.labels}.txt") + logger.info("Using dict_path : {}".format(dict_path)) + return Dictionary.load(dict_path, add_special_symbols=False) + return None + + def load_dataset( + self, split: str, task_cfg: AudioClassificationConfig = None, **kwargs + ): + super().load_dataset(split, task_cfg, **kwargs) + task_cfg = task_cfg or self.cfg + assert task_cfg.labels is not None + text_compression_level = getattr( + TextCompressionLevel, str(self.cfg.text_compression_level) + ) + data_path = self.cfg.data + if task_cfg.multi_corpus_keys is None: + label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}") + skipped_indices = getattr(self.datasets[split], "skipped_indices", set()) + text_compressor = TextCompressor(level=text_compression_level) + with open(label_path, "r") as f: + labels = [ + text_compressor.compress(l) + for i, l in enumerate(f) + if i not in skipped_indices + ] + + assert len(labels) == len(self.datasets[split]), ( + f"labels length ({len(labels)}) and dataset length " + f"({len(self.datasets[split])}) do not match" + ) + + process_label = LabelEncoder(self.target_dictionary) + + self.datasets[split] = AddTargetDataset( + self.datasets[split], + labels, + pad=self.target_dictionary.pad(), + eos=self.target_dictionary.eos(), + batch_targets=True, + process_label=process_label, + label_len_fn=label_len_fn, + add_to_input=False, + # text_compression_level=text_compression_level, + ) + else: + target_dataset_map = OrderedDict() + + multi_corpus_keys = [ + k.strip() for k in task_cfg.multi_corpus_keys.split(",") + ] + corpus_idx_map = {k: idx for idx, k in enumerate(multi_corpus_keys)} + + data_keys = [k.split(":") for k in split.split(",")] + + multi_corpus_sampling_weights = [ + float(val.strip()) + for val in task_cfg.multi_corpus_sampling_weights.split(",") 
+ ] + data_weights = [] + for key, file_name in data_keys: + k = key.strip() + label_path = os.path.join( + data_path, f"{file_name.strip()}.{task_cfg.labels}" + ) + skipped_indices = getattr( + self.dataset_map[split][k], "skipped_indices", set() + ) + text_compressor = TextCompressor(level=text_compression_level) + with open(label_path, "r") as f: + labels = [ + text_compressor.compress(l) + for i, l in enumerate(f) + if i not in skipped_indices + ] + + assert len(labels) == len(self.dataset_map[split][k]), ( + f"labels length ({len(labels)}) and dataset length " + f"({len(self.dataset_map[split][k])}) do not match" + ) + + process_label = LabelEncoder(self.target_dictionary) + + # TODO: Remove duplication of code from the if block above + target_dataset_map[k] = AddTargetDataset( + self.dataset_map[split][k], + labels, + pad=self.target_dictionary.pad(), + eos=self.target_dictionary.eos(), + batch_targets=True, + process_label=process_label, + label_len_fn=label_len_fn, + add_to_input=False, + # text_compression_level=text_compression_level, + ) + + data_weights.append(multi_corpus_sampling_weights[corpus_idx_map[k]]) + + if len(target_dataset_map) == 1: + self.datasets[split] = list(target_dataset_map.values())[0] + else: + self.datasets[split] = MultiCorpusDataset( + target_dataset_map, + distribution=data_weights, + seed=0, + sort_indices=True, + ) + + @property + def source_dictionary(self): + return None + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.state.target_dictionary + + def train_step(self, sample, model, *args, **kwargs): + sample["target"] = sample["target"].to(dtype=torch.long) + loss, sample_size, logging_output = super().train_step( + sample, model, *args, **kwargs + ) + self._log_metrics(sample, model, logging_output) + return loss, sample_size, logging_output + + def valid_step(self, sample, model, criterion): + sample["target"] = sample["target"].to(dtype=torch.long) + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + self._log_metrics(sample, model, logging_output) + return loss, sample_size, logging_output + + def _log_metrics(self, sample, model, logging_output): + metrics = self._inference_with_metrics( + sample, + model, + ) + """ + logging_output["_precision"] = metrics["precision"] + logging_output["_recall"] = metrics["recall"] + logging_output["_f1"] = metrics["f1"] + logging_output["_eer"] = metrics["eer"] + logging_output["_accuracy"] = metrics["accuracy"] + """ + logging_output["_correct"] = metrics["correct"] + logging_output["_total"] = metrics["total"] + + def _inference_with_metrics(self, sample, model): + def _compute_eer(target_list, lprobs): + # from scipy.optimize import brentq + # from scipy.interpolate import interp1d + + y_one_hot = np.eye(len(self.state.target_dictionary))[target_list] + fpr, tpr, thresholds = sklearn_metrics.roc_curve( + y_one_hot.ravel(), lprobs.ravel() + ) + # Revisit the interpolation approach. 
+ # eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0) + + fnr = 1 - tpr + eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] + + return eer + + with torch.no_grad(): + net_output = model(**sample["net_input"]) + lprobs = ( + model.get_normalized_probs(net_output, log_probs=True).cpu().detach() + ) + target_list = sample["target"][:, 0].detach().cpu() + predicted_list = torch.argmax(lprobs, 1).detach().cpu() # B,C->B + + metrics = { + "correct": torch.sum(target_list == predicted_list).item(), + "total": len(target_list), + } + return metrics + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + zero = torch.scalar_tensor(0.0) + correct, total = 0, 0 + for log in logging_outputs: + correct += log.get("_correct", zero) + total += log.get("_total", zero) + metrics.log_scalar("_correct", correct) + metrics.log_scalar("_total", total) + + if total > 0: + def _fn_accuracy(meters): + if meters["_total"].sum > 0: + return utils.item(meters["_correct"].sum / meters["_total"].sum) + return float("nan") + + metrics.log_derived("accuracy", _fn_accuracy) + """ + prec_sum, recall_sum, f1_sum, acc_sum, eer_sum = 0.0, 0.0, 0.0, 0.0, 0.0 + for log in logging_outputs: + prec_sum += log.get("_precision", zero).item() + recall_sum += log.get("_recall", zero).item() + f1_sum += log.get("_f1", zero).item() + acc_sum += log.get("_accuracy", zero).item() + eer_sum += log.get("_eer", zero).item() + + metrics.log_scalar("avg_precision", prec_sum / len(logging_outputs)) + metrics.log_scalar("avg_recall", recall_sum / len(logging_outputs)) + metrics.log_scalar("avg_f1", f1_sum / len(logging_outputs)) + metrics.log_scalar("avg_accuracy", acc_sum / len(logging_outputs)) + metrics.log_scalar("avg_eer", eer_sum / len(logging_outputs)) + """ \ No newline at end of file diff --git a/fairseq/tasks/audio_finetuning.py b/fairseq/tasks/audio_finetuning.py new file mode 100644 index 0000000000..d79553cb86 --- /dev/null +++ b/fairseq/tasks/audio_finetuning.py @@ -0,0 +1,404 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import logging +import os +from fairseq.data.multi_corpus_dataset import MultiCorpusDataset +import torch +import json + +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Optional, Any, OrderedDict + +from fairseq.data import AddTargetDataset, Dictionary, encoders +from fairseq.tasks.audio_pretraining import AudioPretrainingTask, AudioPretrainingConfig +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.configs import GenerationConfig +from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel + +from . import register_task +from .. import utils +from ..logging import metrics + + +logger = logging.getLogger(__name__) + + +class LabelEncoder(object): + def __init__(self, dictionary): + self.dictionary = dictionary + + def __call__(self, label): + return self.dictionary.encode_line( + label, append_eos=False, add_if_not_exist=False + ) + + +def label_len_fn(label): + return len(label.split(" ")) + + +@dataclass +class AudioFinetuningConfig(AudioPretrainingConfig): + # Options for reporting WER metrics during validation. 
Only applicable to + # Seq2Seq models during fine-tuning + eval_wer: bool = field( + default=False, metadata={"help": "compute WER for Seq2Seq models"} + ) + eval_wer_config: GenerationConfig = field( + default_factory=lambda: GenerationConfig(), + metadata={"help": "beam search config for evaluating wer during training"}, + ) + eval_wer_tokenizer: Any = field( + default=None, + metadata={"help": "tokenizer config for evaluating wer during training"}, + ) + eval_wer_post_process: str = field( + default="letter", + metadata={ + "help": "remove BPE tokens before scoring (can be sentencepiece, letter, and more)" + }, + ) + eval_bleu: bool = field( + default=False, metadata={"help": "evaluation with BLEU scores"} + ) + eval_bleu_detok: Optional[str] = field( + default=None, + metadata={ + "help": "detokenize before computing BLEU (e.g., 'moses'); " + "required if using --eval-bleu; use 'space' to disable " + "detokenization; see fairseq.data.encoders for other options" + }, + ) + eval_bleu_detok_args: str = field( + default="{}", metadata={"help": "args for building the tokenizer, if needed"} + ) + eval_tokenized_bleu: bool = field( + default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"} + ) + eval_bleu_remove_bpe: Optional[str] = field( + default=None, metadata={"help": "remove BPE before computing BLEU"} + ) + eval_bleu_args: str = field( + default="{}", + metadata={ + "help": "generation args for BLUE scoring, e.g., " + '\'{"beam": 4, "lenpen": 0.6}\'' + }, + ) + eval_bleu_print_samples: bool = field( + default=False, metadata={"help": "print sample generations during validation"} + ) + autoregressive: bool = field( + default=False, + metadata={ + "help": "required for autoregressive decoders (like seq2seq models); " + "adds 'prev_output_tokens' to input and appends eos to target" + }, + ) + rebuild_batches: bool = True + target_dictionary: Optional[str] = field( + default=None, + metadata={ + "help": "override default dictionary location" + } + ) + +@register_task("audio_finetuning", dataclass=AudioFinetuningConfig) +class AudioFinetuningTask(AudioPretrainingTask): + """ """ + + cfg: AudioFinetuningConfig + + def __init__( + self, + cfg: AudioFinetuningConfig, + ): + super().__init__(cfg) + self.blank_symbol = "<s>" + + self.state.add_factory("target_dictionary", self.load_target_dictionary) + + def load_target_dictionary(self): + if self.cfg.labels: + target_dictionary = self.cfg.data + if self.cfg.target_dictionary: # override dict + target_dictionary = self.cfg.target_dictionary + dict_path = os.path.join(target_dictionary, f"dict.{self.cfg.labels}.txt") + logger.info('Using dict_path : {}'.format(dict_path)) + return Dictionary.load(dict_path) + return None + + def load_dataset( + self, split: str, task_cfg: AudioFinetuningConfig = None, **kwargs + ): + super().load_dataset(split, task_cfg, **kwargs) + + task_cfg = task_cfg or self.cfg + assert task_cfg.labels is not None + text_compression_level = getattr( + TextCompressionLevel, str(self.cfg.text_compression_level) + ) + data_path = self.cfg.data + if task_cfg.multi_corpus_keys is None: + label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}") + skipped_indices = getattr(self.datasets[split], "skipped_indices", set()) + text_compressor = TextCompressor(level=text_compression_level) + with open(label_path, "r") as f: + labels = [ + text_compressor.compress(l) + for i, l in enumerate(f) + if i not in skipped_indices + ] + + assert len(labels) == len(self.datasets[split]), ( + f"labels length 
({len(labels)}) and dataset length " + f"({len(self.datasets[split])}) do not match" + ) + + process_label = LabelEncoder(self.target_dictionary) + + self.datasets[split] = AddTargetDataset( + self.datasets[split], + labels, + pad=self.target_dictionary.pad(), + eos=self.target_dictionary.eos(), + batch_targets=True, + process_label=process_label, + label_len_fn=label_len_fn, + add_to_input=task_cfg.get("autoregressive", False), + text_compression_level=text_compression_level, + ) + else: + + target_dataset_map = OrderedDict() + + multi_corpus_keys = [k.strip() for k in task_cfg.multi_corpus_keys.split(",")] + corpus_idx_map = {k: idx for idx, k in enumerate(multi_corpus_keys)} + + data_keys = [k.split(":") for k in split.split(",")] + + multi_corpus_sampling_weights = [float(val.strip()) for val in task_cfg.multi_corpus_sampling_weights.split(",")] + data_weights = [] + for key, file_name in data_keys: + k = key.strip() + label_path = os.path.join(data_path, f"{file_name.strip()}.{task_cfg.labels}") + skipped_indices = getattr(self.dataset_map[split][k], "skipped_indices", set()) + text_compressor = TextCompressor(level=text_compression_level) + with open(label_path, "r") as f: + labels = [ + text_compressor.compress(l) + for i, l in enumerate(f) + if i not in skipped_indices + ] + + assert len(labels) == len(self.dataset_map[split][k]), ( + f"labels length ({len(labels)}) and dataset length " + f"({len(self.dataset_map[split][k])}) do not match" + ) + + process_label = LabelEncoder(self.target_dictionary) + + # TODO: Remove duplication of code from the if block above + target_dataset_map[k] = AddTargetDataset( + self.dataset_map[split][k], + labels, + pad=self.target_dictionary.pad(), + eos=self.target_dictionary.eos(), + batch_targets=True, + process_label=process_label, + label_len_fn=label_len_fn, + add_to_input=task_cfg.get("autoregressive", False), + text_compression_level=text_compression_level, + ) + + data_weights.append(multi_corpus_sampling_weights[corpus_idx_map[k]]) + + if len(target_dataset_map) == 1: + self.datasets[split] = list(target_dataset_map.values())[0] + else: + self.datasets[split] = MultiCorpusDataset(target_dataset_map, distribution=data_weights, seed=0, sort_indices=True) + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.state.target_dictionary + + def valid_step(self, sample, model, criterion): + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + if self.cfg.eval_wer and self.cfg.autoregressive: + metrics = self._inference_with_wer(self.sequence_generator, sample, model) + logging_output["_num_char_errors"] = metrics["num_char_errors"] + logging_output["_num_chars"] = metrics["num_chars"] + logging_output["_num_word_errors"] = metrics["num_word_errors"] + logging_output["_num_words"] = metrics["num_words"] + if self.cfg.eval_bleu and self.cfg.autoregressive: + metrics = self._inference_with_bleu(self.sequence_generator, sample, model) + logging_output["_bleu_sys_len"] = metrics.sys_len + logging_output["_bleu_ref_len"] = metrics.ref_len + # we split counts into separate entries so that they can be + # summed efficiently across workers using fast-stat-sync + assert len(metrics.counts) == 4 + for i in range(4): + logging_output[f"_bleu_counts_{i}"] = metrics.counts[i] + logging_output[f"_bleu_totals_{i}"] = metrics.totals[i] + return loss, sample_size, logging_output + + def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False): 
+ model = super().build_model(model_cfg, from_checkpoint) + + if self.cfg.eval_wer and self.cfg.autoregressive: + self.sequence_generator = self.build_generator( + [model], + self.cfg.eval_wer_config, + ) + if self.cfg.eval_wer_tokenizer: + self.tokenizer = encoders.build_tokenizer(self.cfg.eval_wer_tokenizer) + else: + self.tokenizer = None + if self.cfg.eval_bleu and self.cfg.autoregressive: + assert self.cfg.eval_bleu_detok is not None, ( + "--eval-bleu-detok is required if using --eval-bleu; " + "try --eval-bleu-detok=moses (or --eval-bleu-detok=space " + "to disable detokenization, e.g., when using sentencepiece)" + ) + detok_args = json.loads(self.cfg.eval_bleu_detok_args) + self.tokenizer = encoders.build_tokenizer( + Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args) + ) + gen_args = json.loads(self.cfg.eval_bleu_args) + gen_args = Namespace(**gen_args) + self.sequence_generator = self.build_generator([model], gen_args) + + return model + + def _inference_with_wer(self, generator, sample, model): + import editdistance + + def decode(toks): + s = self.target_dictionary.string( + toks.int().cpu(), + self.cfg.eval_wer_post_process, + escape_unk=True, + ) + if self.tokenizer: + s = self.tokenizer.decode(s) + return s + + num_word_errors, num_char_errors = 0, 0 + num_chars, num_words = 0, 0 + gen_out = self.inference_step(generator, [model], sample, None) + for i in range(len(gen_out)): + hyp = decode(gen_out[i][0]["tokens"]) + ref = decode( + utils.strip_pad(sample["target"][i], self.target_dictionary.pad()), + ) + num_char_errors += editdistance.eval(hyp, ref) + num_chars += len(ref) + hyp_words = hyp.split() + ref_words = ref.split() + num_word_errors += editdistance.eval(hyp_words, ref_words) + num_words += len(ref_words) + + return { + "num_char_errors": num_char_errors, + "num_chars": num_chars, + "num_word_errors": num_word_errors, + "num_words": num_words, + } + + def _inference_with_bleu(self, generator, sample, model): + import sacrebleu + + def decode(toks, is_ref): + s = self.target_dictionary.string( + toks.int().cpu(), + self.cfg.eval_bleu_remove_bpe, + # The default unknown string in fairseq is `<unk>`, but + # this is tokenized by sacrebleu as `< unk >`, inflating + # BLEU scores. Instead, we use a somewhat more verbose + # alternative that is unlikely to appear in the real + # reference, but doesn't get split into multiple tokens. 
+ unk_string=("UNKNOWNTOKENINREF" if is_ref else "UNKNOWNTOKENINHYP"), + ) + if self.tokenizer: + s = self.tokenizer.decode(s) + return s + + gen_out = self.inference_step(generator, [model], sample) + hyps, refs = [], [] + for i in range(len(gen_out)): + hyps.append(decode(gen_out[i][0]["tokens"], is_ref=False)) + refs.append( + decode( + utils.strip_pad(sample["target"][i], self.target_dictionary.pad()), + is_ref=True, # don't count <unk> as matches to the hypo + ) + ) + if self.cfg.eval_bleu_print_samples: + logger.info("H-{} {}".format(sample["id"][0], hyps[0])) + logger.info("T-{} {}".format(sample["id"][0], refs[0])) + + eval_tokenization = "none" if self.cfg.eval_tokenized_bleu else "13a" + return sacrebleu.corpus_bleu(hyps, [refs], tokenize=eval_tokenization) + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + if self.cfg.eval_wer: + zero = torch.scalar_tensor(0.0) + num_char_errors = sum( + log.get("_num_char_errors", zero) for log in logging_outputs + ) + num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs) + num_word_errors = sum( + log.get("_num_word_errors", zero) for log in logging_outputs + ) + num_words = sum(log.get("_num_words", zero) for log in logging_outputs) + metrics.log_scalar("_num_char_errors", num_char_errors) + metrics.log_scalar("_num_chars", num_chars) + metrics.log_scalar("_num_word_errors", num_word_errors) + metrics.log_scalar("_num_words", num_words) + if num_chars > 0: + metrics.log_derived( + "uer", + lambda meters: meters["_num_char_errors"].sum + * 100.0 + / meters["_num_chars"].sum + if meters["_num_chars"].sum > 0 + else float("nan"), + ) + if num_words > 0: + metrics.log_derived( + "wer", + lambda meters: meters["_num_word_errors"].sum + * 100.0 + / meters["_num_words"].sum + if meters["_num_words"].sum > 0 + else float("nan"), + ) + if self.cfg.eval_bleu: + len_keys = ["_bleu_sys_len", "_bleu_ref_len"] + count_keys = [f"_bleu_counts_{i}" for i in range(4)] + total_keys = [f"_bleu_totals_{i}" for i in range(4)] + for k in len_keys + count_keys + total_keys: + metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs)) + + import sacrebleu + + metrics.log_derived( + "bleu", + lambda meters: sacrebleu.compute_bleu( + correct=[meters[k].sum for k in count_keys], + total=[meters[k].sum for k in total_keys], + sys_len=meters["_bleu_sys_len"].sum, + ref_len=meters["_bleu_ref_len"].sum, + smooth_method="exp", + ).score, + ) diff --git a/fairseq/tasks/audio_pretraining.py b/fairseq/tasks/audio_pretraining.py index 298bdbe938..3e91303b69 100644 --- a/fairseq/tasks/audio_pretraining.py +++ b/fairseq/tasks/audio_pretraining.py @@ -5,147 +5,249 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. +import logging import os import sys -from fairseq.data import AddTargetDataset, Dictionary, FileAudioDataset +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Optional, OrderedDict +from fairseq.data.multi_corpus_dataset import MultiCorpusDataset +from omegaconf import MISSING, II, OmegaConf + +from fairseq.data import BinarizedAudioDataset, FileAudioDataset, SubsampleDataset +from fairseq.dataclass import FairseqDataclass, ChoiceEnum +from fairseq.data.text_compressor import TextCompressionLevel + +from . 
import FairseqTask, register_task + + +logger = logging.getLogger(__name__) + + +@dataclass +class AudioMaskingConfig: + feature_encoder_spec: str = II("model.modalities.audio.feature_encoder_spec") + mask_prob: float = II("model.modalities.audio.mask_prob") + mask_prob_adjust: float = II("model.modalities.audio.mask_prob_adjust") + mask_length: int = II("model.modalities.audio.mask_length") + inverse_mask: bool = II("model.modalities.audio.inverse_mask") + mask_dropout: float = II("model.modalities.audio.mask_dropout") + clone_batch: int = II("model.clone_batch") + expand_adjacent: bool = False + non_overlapping: bool = False + + +@dataclass +class AudioPretrainingConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + labels: Optional[str] = field( + default=None, + metadata={"help": "extension of the label file to load, used for fine-tuning"}, + ) + multi_corpus_keys: Optional[str] = field( + default=None, + metadata={"help": "Comma separated names for loading multi corpus datasets"}) + multi_corpus_sampling_weights: Optional[str] = field( + default=None, + metadata={"help": "Comma separated string of sampling weights corresponding to the multi_corpus_keys"}) + binarized_dataset: bool = field( + default=False, + metadata={ + "help": "if true, loads binarized dataset (useful for very large datasets). " + "See examples/wav2vec/scripts/binarize_manifest.sh" + }, + ) + sample_rate: int = field( + default=16_000, + metadata={ + "help": "target sample rate. audio files will be up/down sampled to this rate" + }, + ) + normalize: bool = field( + default=False, + metadata={"help": "if set, normalizes input to have 0 mean and unit variance"}, + ) + enable_padding: bool = field( + default=False, metadata={"help": "pad shorter samples instead of cropping"} + ) + max_sample_size: Optional[int] = field( + default=None, metadata={"help": "max sample size to crop to for batching"} + ) + min_sample_size: Optional[int] = field( + default=None, metadata={"help": "min sample size to skip small examples"} + ) + num_batch_buckets: int = field( + default=0, + metadata={"help": "number of buckets"}, + ) + tpu: bool = II("common.tpu") + text_compression_level: ChoiceEnum([x.name for x in TextCompressionLevel]) = field( + default="none", + metadata={ + "help": "compression level for texts (e.g. audio filenames, " + "target texts): none/low/high (default: none). " + }, + ) + + rebuild_batches: bool = True + precompute_mask_config: Optional[AudioMaskingConfig] = None + + post_save_script: Optional[str] = None + + subsample: float = 1 + seed: int = II("common.seed") + + +@register_task("audio_pretraining", dataclass=AudioPretrainingConfig) +class AudioPretrainingTask(FairseqTask): + """ """ + + cfg: AudioPretrainingConfig -from . import LegacyFairseqTask, register_task + @classmethod + def setup_task(cls, cfg: AudioPretrainingConfig, **kwargs): + """Setup the task (e.g., load dictionaries). 
+ Args: + cfg (AudioPretrainingConfig): configuration of this task + """ -class LabelEncoder(object): - def __init__(self, dictionary): - self.dictionary = dictionary + return cls(cfg) - def __call__(self, label): - return self.dictionary.encode_line( - label, append_eos=False, add_if_not_exist=False - ) + def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): + data_path = self.cfg.data + task_cfg = task_cfg or self.cfg + # upgrade old task + if isinstance(task_cfg, Namespace): + if not hasattr(task_cfg, "autoregressive"): + task_cfg.autoregressive = not task_cfg.criterion == "ctc" -@register_task("audio_pretraining") -class AudioPretrainingTask(LegacyFairseqTask): - """""" - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", help="path to data directory") - parser.add_argument( - "--sample-rate", - default=16000, - type=int, - help="target sample rate. audio files will be up/down sampled to this rate", - ) - parser.add_argument( - "--normalize", - action="store_true", - help="if set, normalizes input to have 0 mean and unit variance", - ) - parser.add_argument( - "--max-sample-size", - default=None, - type=int, - help="max sample size to crop to for batching. default = min sample length", - ) - parser.add_argument( - "--min-sample-size", - default=None, - type=int, - help="min sample size to crop to for batching. default = same as --max-sample-size", + text_compression_level = getattr( + TextCompressionLevel, str(self.cfg.text_compression_level) ) - parser.add_argument( - "--enable-padding", - action="store_true", - help="pad shorter samples instead of cropping", - ) - - parser.add_argument( - "--labels", - type=str, - default=None, - help="extension of the label file to load, if any", - ) - - def __init__(self, args, source_dictionary=None, target_dictionary=None): - super().__init__(args) - self._target_dictionary = target_dictionary - self._source_dictionary = source_dictionary - self.is_ctc = args.criterion == "ctc" - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task (e.g., load dictionaries). 
- - Args: - args (omegaconf.DictConfig): parsed command-line arguments - """ - - if args.labels: - dict_path = os.path.join(args.data, f"dict.{args.labels}.txt") - target_dictionary = Dictionary.load(dict_path) + compute_mask = getattr(task_cfg, "precompute_mask_config", None) is not None + mask_args = {} + if compute_mask: + mask_args = task_cfg.precompute_mask_config + + if getattr(task_cfg, "binarized_dataset", False): + self.datasets[split] = BinarizedAudioDataset( + data_path, + split=split, + sample_rate=task_cfg.get("sample_rate", self.cfg.sample_rate), + max_sample_size=self.cfg.max_sample_size, + min_sample_size=self.cfg.min_sample_size, + pad=task_cfg.labels is not None or task_cfg.enable_padding, + normalize=task_cfg.normalize, + num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu), + compute_mask=compute_mask, + **mask_args, + ) else: - target_dictionary = None + if task_cfg.multi_corpus_keys is None: + manifest_path = os.path.join(data_path, "{}.tsv".format(split)) + + self.datasets[split] = FileAudioDataset( + manifest_path=manifest_path, + sample_rate=task_cfg.get("sample_rate", self.cfg.sample_rate), + max_sample_size=self.cfg.max_sample_size, + min_sample_size=self.cfg.min_sample_size, + pad=task_cfg.labels is not None or task_cfg.enable_padding, + normalize=task_cfg.normalize, + num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu), + text_compression_level=text_compression_level, + compute_mask=compute_mask, + **mask_args, + ) + else: + dataset_map = OrderedDict() + self.dataset_map = {} + multi_corpus_keys = [k.strip() for k in task_cfg.multi_corpus_keys.split(",")] + corpus_idx_map = {k: idx for idx, k in enumerate(multi_corpus_keys)} + data_keys = [k.split(":") for k in split.split(",")] + + multi_corpus_sampling_weights = [float(val.strip()) for val in task_cfg.multi_corpus_sampling_weights.split(",")] + data_weights = [] + + for key, file_name in data_keys: + + k = key.strip() + manifest_path = os.path.join(data_path, "{}.tsv".format(file_name.strip())) + + # TODO: Remove duplication of code from the if block above + dataset_map[k] = FileAudioDataset( + manifest_path=manifest_path, + sample_rate=task_cfg.get("sample_rate", self.cfg.sample_rate), + max_sample_size=self.cfg.max_sample_size, + min_sample_size=self.cfg.min_sample_size, + pad=task_cfg.labels is not None or task_cfg.enable_padding, + normalize=task_cfg.normalize, + num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu), + text_compression_level=text_compression_level, + compute_mask=compute_mask, + corpus_key=corpus_idx_map[k], + **mask_args, + ) + + data_weights.append(multi_corpus_sampling_weights[corpus_idx_map[k]]) + + self.dataset_map[split] = dataset_map + + if len(dataset_map) == 1: + self.datasets[split] = list(dataset_map.values())[0] + else: + self.datasets[split] = MultiCorpusDataset(dataset_map, distribution=data_weights, seed=0, sort_indices=True) + + if getattr(task_cfg, "subsample", 1) < 1: + self.datasets[split] = SubsampleDataset( + self.datasets[split], + task_cfg.subsample, + shuffle=True, + seed=task_cfg.seed, + ) - return cls(args, target_dictionary=target_dictionary) + if self.cfg.tpu and task_cfg.inferred_w2v_config.mask_channel_prob == 0.0: + logger.info( + "Pretraining on TPUs may suffer convergence " + "issues when training with `mask_channel_prob` value of " + "0. You may want to set this to a low value close to 0." + ) - def load_dataset(self, split, **kwargs): - """Load a given dataset split. 
+ def max_positions(self): + """Maximum input length supported by the encoder.""" + return sys.maxsize, sys.maxsize - Args: - split (str): name of the split (e.g., train, valid, test) - """ - manifest = os.path.join(self.args.data, "{}.tsv".format(split)) - self.datasets[split] = FileAudioDataset( - manifest, - sample_rate=self.args.sample_rate, - max_sample_size=self.args.max_sample_size, - min_sample_size=self.args.max_sample_size, - min_length=self.args.min_sample_size, - pad=self.args.labels is not None or self.args.enable_padding, - normalize=self.args.normalize, - ) + def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False): + model = super().build_model(model_cfg, from_checkpoint) - if self.args.labels: - label_path = os.path.join(self.args.data, f"{split}.{self.args.labels}") - labels = [] - with open(label_path, "r") as f: - for line in f: - labels.append(line) + actualized_cfg = getattr(model, "cfg", None) + if actualized_cfg is not None: + # if "w2v_args" in actualized_cfg: + if hasattr(actualized_cfg, "w2v_args"): + model_cfg.w2v_args = actualized_cfg.w2v_args - process_label = LabelEncoder(self.target_dictionary) + return model - self.datasets[split] = AddTargetDataset( - self.datasets[split], - labels, - pad=self.target_dictionary.pad(), - eos=self.target_dictionary.eos(), - batch_targets=True, - process_label=process_label, - add_to_input=not self.is_ctc, + def post_save(self, cp_path, num_updates): + if self.cfg.post_save_script is not None: + logger.info(f"launching {self.cfg.post_save_script}") + import os.path as osp + from fairseq.file_io import PathManager + + eval_cp_path = osp.join( + osp.dirname(cp_path), f"checkpoint_eval_{num_updates}.pt" ) - @property - def source_dictionary(self): - return self._source_dictionary + print(cp_path, eval_cp_path, osp.dirname(cp_path)) - @property - def target_dictionary(self): - """Return the :class:`~fairseq.data.Dictionary` for the language - model.""" - return self._target_dictionary + assert PathManager.copy( + cp_path, eval_cp_path, overwrite=True + ), f"Failed to copy {cp_path} to {eval_cp_path}" - def max_positions(self): - """Maximum input length supported by the encoder.""" - return (sys.maxsize, sys.maxsize) - - def filter_indices_by_size( - self, - indices, - dataset, - max_positions=None, - ignore_invalid_inputs=False, - ): - # we do not need to filter by size in this task as dataloaders take care of this - return indices + import subprocess + import shlex + + subprocess.call(shlex.split(f"{self.cfg.post_save_script} {eval_cp_path}")) diff --git a/fairseq/tasks/denoising.py b/fairseq/tasks/denoising.py index 41bddc1a05..57b824d581 100644 --- a/fairseq/tasks/denoising.py +++ b/fairseq/tasks/denoising.py @@ -5,6 +5,11 @@ import logging import os +from dataclasses import dataclass, field +from typing import Any, Optional + +import numpy as np +from omegaconf import II, MISSING from fairseq import utils from fairseq.data import ( @@ -21,130 +26,143 @@ data_utils, ) from fairseq.data.encoders.utils import get_whole_word_mask -from fairseq.tasks import LegacyFairseqTask, register_task -import numpy as np +from fairseq.data.shorten_dataset import maybe_shorten_dataset +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.tasks import FairseqTask, register_task +from ..data.indexed_dataset import get_available_dataset_impl logger = logging.getLogger(__name__) +SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) +SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", 
"truncate", "random_crop"]) +MASK_LENGTH_CHOICES = ChoiceEnum(["subword", "word", "span-poisson"]) + + +@dataclass +class DenoisingConfig(FairseqDataclass): + data: str = field( + default=MISSING, + metadata={"help": "path to data directory"}, + ) + bpe: Optional[str] = field( + default=None, + metadata={"help": "TODO"}, + ) + tokens_per_sample: int = field( + default=512, + metadata={ + "help": "max number of total tokens over all segments " + "per sample for dataset" + }, + ) + sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( + default="complete_doc", + metadata={ + "help": 'If omitted or "none", fills each sample with tokens-per-sample ' + 'tokens. If set to "complete", splits samples only at the end ' + "of sentence, but may include multiple sentences per sample. " + '"complete_doc" is similar but respects doc boundaries. ' + 'If set to "eos", includes only one sentence per sample.' + }, + ) + replace_length: int = field( + default=0, + metadata={"help": "TODO, should only allow -1, 0 and 1"}, + ) + mask: float = field( + default=0.0, + metadata={"help": "fraction of words/subwords that will be masked"}, + ) + mask_random: float = field( + default=0.0, + metadata={"help": "instead of using [MASK], use random token this often"}, + ) + insert: float = field( + default=0.0, + metadata={"help": "insert this percentage of additional random tokens"}, + ) + permute: float = field( + default=0.0, + metadata={"help": "take this proportion of subwords and permute them"}, + ) + rotate: float = field( + default=0.5, + metadata={"help": "rotate this proportion of inputs"}, + ) + poisson_lambda: float = field( + default=3.0, + metadata={"help": "randomly shuffle sentences for this proportion of inputs"}, + ) + shuffle_instance: float = field( + default=0.0, + metadata={"help": "shuffle this proportion of sentences in all inputs"}, + ) + mask_length: MASK_LENGTH_CHOICES = field( + default="subword", + metadata={"help": "mask length to choose"}, + ) + permute_sentences: int = field( + default=-1, + metadata={ + "help": "when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)" + }, + ) + seed: int = II("common.seed") + shorten_method: SHORTEN_METHOD_CHOICES = field( + default="none", + metadata={ + "help": "if not none, shorten sequences that exceed --tokens-per-sample" + }, + ) + shorten_data_split_list: str = field( + default="", + metadata={ + "help": "comma-separated list of dataset splits to apply shortening to, " + 'e.g., "train,valid" (default: all dataset splits)' + }, + ) + max_source_positions: int = field( + default=1024, + metadata={"help": "max number of tokens in the source sequence"}, + ) + max_target_positions: int = field( + default=1024, + metadata={"help": "max number of tokens in the target sequence"}, + ) + dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( + "dataset.dataset_impl" + ) -@register_task("denoising") -class DenoisingTask(LegacyFairseqTask): + +@register_task("denoising", dataclass=DenoisingConfig) +class DenoisingTask(FairseqTask): """ Denoising task for applying sequence to sequence denoising. (ie. 
BART) """ - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", help="path to data directory") - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments" - " per sample for dataset", - ) - parser.add_argument( - "--sample-break-mode", - default="complete_doc", - type=str, - help="mode for breaking sentence", - ) - parser.add_argument( - "--mask", - default=0.0, - type=float, - help="fraction of words/subwords that will be masked", - ) - parser.add_argument( - "--mask-random", - default=0.0, - type=float, - help="instead of using [MASK], use random token this often", - ) - parser.add_argument( - "--insert", - default=0.0, - type=float, - help="insert this percentage of additional random tokens", - ) - parser.add_argument( - "--permute", - default=0.0, - type=float, - help="take this proportion of subwords and permute them", - ) - parser.add_argument( - "--rotate", - default=0.5, - type=float, - help="rotate this proportion of inputs", - ) - parser.add_argument( - "--poisson-lambda", - default=3.0, - type=float, - help="randomly shuffle sentences for this proportion of inputs", - ) - parser.add_argument( - "--permute-sentences", - default=0.0, - type=float, - help="shuffle this proportion of sentences in all inputs", - ) - parser.add_argument( - "--mask-length", - default="subword", - type=str, - choices=["subword", "word", "span-poisson"], - help="mask length to choose", - ) - parser.add_argument( - "--replace-length", - default=-1, - type=int, - help="when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)", - ) - parser.add_argument( - "--max-source-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the source sequence", - ) - parser.add_argument( - "--max-target-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the target sequence", - ) + cfg: DenoisingConfig - def __init__(self, args, dictionary): - super().__init__(args) + def __init__(self, cfg, dictionary): + super().__init__(cfg) self.dictionary = dictionary - self.seed = args.seed # add mask token self.mask_idx = self.dictionary.add_symbol("<mask>") @classmethod - def setup_task(cls, args, **kwargs): + def setup_task(cls, cfg: DenoisingConfig, **kwargs): """Setup the task.""" - dictionary = Dictionary.load(os.path.join(args.data, "dict.txt")) + paths = utils.split_paths(cfg.data) + assert len(paths) > 0 + dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) - if not hasattr(args, "shuffle_instance"): - args.shuffle_instance = False - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
+ if not hasattr(cfg, "shuffle_instance"): + cfg.shuffle_instance = False + return cls(cfg, dictionary) - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) + def _load_dataset_split(self, split, epoch, combine): + paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) @@ -152,7 +170,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, - self.args.dataset_impl, + self.cfg.dataset_impl, combine=combine, ) if dataset is None: @@ -162,24 +180,44 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): dataset = StripTokenDataset(dataset, self.dictionary.eos()) + dataset = maybe_shorten_dataset( + dataset, + split, + self.cfg.shorten_data_split_list, + self.cfg.shorten_method, + self.cfg.tokens_per_sample, + self.cfg.seed, + ) + # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, - self.args.tokens_per_sample - 2, # one less for <s> and one for </s> + self.cfg.tokens_per_sample - 2, + # one less for <s> and one for </s> pad=self.dictionary.pad(), eos=self.dictionary.eos(), - break_mode=self.args.sample_break_mode, + break_mode=self.cfg.sample_break_mode, document_sep_len=0, ) + logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) + return dataset + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + dataset = self._load_dataset_split(split, epoch, combine) mask_whole_words = ( - get_whole_word_mask(self.args, self.source_dictionary) - if self.args.mask_length != "subword" + get_whole_word_mask(self.cfg.bpe, self.source_dictionary) + if self.cfg.mask_length != "subword" else None ) @@ -189,9 +227,17 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): self.dictionary, self.mask_idx, mask_whole_words, - shuffle=self.args.shuffle_instance, - seed=self.seed, - args=self.args, + shuffle=self.cfg.shuffle_instance, + seed=self.cfg.seed, + mask=self.cfg.mask, + mask_random=self.cfg.mask_random, + insert=self.cfg.insert, + rotate=self.cfg.rotate, + permute_sentences=self.cfg.permute_sentences, + bpe=self.cfg.bpe, + replace_length=self.cfg.replace_length, + mask_length=self.cfg.mask_length, + poisson_lambda=self.cfg.poisson_lambda, ) logger.info( "Split: {0}, Loaded {1} samples of denoising_dataset".format( @@ -210,10 +256,10 @@ def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): src_dataset = TokenBlockDataset( src_tokens, src_lengths, - block_size=self.args.tokens_per_sample - 2, # for <s> and </s> + block_size=self.cfg.tokens_per_sample - 2, # for <s> and </s> pad=pad, eos=eos, - break_mode=self.args.sample_break_mode, + break_mode=self.cfg.sample_break_mode, document_sep_len=0, ) prev_output_tokens = PrependTokenDataset( @@ -237,7 +283,7 @@ def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): def max_positions(self): """Return the max sentence length allowed by the task.""" - return (self.args.max_source_positions, self.args.max_target_positions) + return (self.cfg.max_source_positions, self.cfg.max_target_positions) @property def source_dictionary(self): diff --git a/fairseq/tasks/fairseq_task.py b/fairseq/tasks/fairseq_task.py index 3cdb64cfae..e39d1d6848 100644 --- a/fairseq/tasks/fairseq_task.py +++ b/fairseq/tasks/fairseq_task.py @@ -7,21 +7,60 @@ import os import warnings from argparse import Namespace +from typing import Any, Callable, Dict, List import torch -from fairseq import metrics, search, tokenizer, utils +from fairseq import search, tokenizer, utils +from fairseq.logging import metrics from fairseq.data import Dictionary, FairseqDataset, data_utils, encoders, iterators +from fairseq.dataclass import FairseqDataclass from fairseq.dataclass.utils import gen_parser_from_dataclass +from fairseq.optim.amp_optimizer import AMPOptimizer from omegaconf import DictConfig logger = logging.getLogger(__name__) +class StatefulContainer(object): + def __init__(self): + self._state = dict() + self._factories = dict() + + def add_factory(self, name, factory: Callable[[], Any]): + self._factories[name] = factory + + def merge_state_dict(self, state_dict: Dict[str, Any]): + self._state.update(state_dict) + + @property + def state_dict(self) -> Dict[str, Any]: + return self._state + + def __getattr__(self, name): + if name not in self._state and name in self._factories: + self._state[name] = self._factories[name]() + + if name in self._state: + return self._state[name] + + raise AttributeError(f"Task state has no factory for attribute {name}") + + class FairseqTask(object): """ Tasks store dictionaries and provide helpers for loading/iterating over Datasets, initializing the Model/Criterion and calculating the loss. + + Tasks have limited statefulness. 
In particular, state that needs to be + saved to/loaded from checkpoints needs to be stored in the `self.state` + :class:`StatefulContainer` object. For example:: + + self.state.add_factory("dictionary", self.load_dictionary) + print(self.state.dictionary) # calls self.load_dictionary() + + This is necessary so that when loading checkpoints, we can properly + recreate the task state after initializing the task instance. """ @classmethod @@ -40,10 +79,11 @@ def logging_outputs_can_be_summed(criterion) -> bool: """ return criterion.logging_outputs_can_be_summed() - def __init__(self, cfg: DictConfig, **kwargs): + def __init__(self, cfg: FairseqDataclass, **kwargs): self.cfg = cfg - self.datasets = {} - self.dataset_to_epoch_iter = {} + self.datasets = dict() + self.dataset_to_epoch_iter = dict() + self.state = StatefulContainer() @classmethod def load_dictionary(cls, filename): @@ -90,11 +130,20 @@ def setup_task(cls, cfg: DictConfig, **kwargs): def has_sharded_data(self, split): return os.pathsep in getattr(self.cfg, "data", "") - def load_dataset(self, split, combine=False, **kwargs): + def load_dataset( + self, + split: str, + combine: bool = False, + task_cfg: FairseqDataclass = None, + **kwargs, + ): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) + combine (bool): combines a split segmented into pieces into one dataset + task_cfg (FairseqDataclass): optional task configuration stored in the checkpoint that can be used + to load datasets """ raise NotImplementedError @@ -143,7 +192,7 @@ def filter_indices_by_size( ) logger.warning( ( - "{} samples have invalid sizes and will be skipped, " + "{:,} samples have invalid sizes and will be skipped, " "max_positions={}, first few sample ids={}" ).format(len(ignored), max_positions, ignored[:10]) ) @@ -171,6 +220,9 @@ def get_batch_iterator( epoch=1, data_buffer_size=0, disable_iterator_cache=False, + skip_remainder_batch=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, ): """ Get an iterator that yields batches of data from the given dataset. @@ -203,13 +255,25 @@ def get_batch_iterator( disable_iterator_cache (bool, optional): don't cache the EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`) (default: False). + skip_remainder_batch (bool, optional): if set, discard the last + batch in each training epoch, as the last batch is often smaller than + local_batch_size * distributed_world_size (default: ``False``). + grouped_shuffling (bool, optional): group batches with each group + containing num_shards batches and shuffle groups. Reduces difference + between sequence lengths among workers for batches sorted by length.
+ update_epoch_batch_itr (bool, optional): if true then do not use the cached + batch iterator for the epoch + + Returns: ~fairseq.iterators.EpochBatchIterator: a batched iterator over the given dataset split """ - can_reuse_epoch_itr = not disable_iterator_cache and self.can_reuse_epoch_itr( - dataset + can_reuse_epoch_itr = ( + not disable_iterator_cache + and not update_epoch_batch_itr + and self.can_reuse_epoch_itr(dataset) ) + logger.info(f"can_reuse_epoch_itr = {can_reuse_epoch_itr}") if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter: logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch)) return self.dataset_to_epoch_iter[dataset] @@ -219,23 +283,39 @@ def get_batch_iterator( # initialize the dataset with the correct starting epoch dataset.set_epoch(epoch) - # get indices ordered by example size - with data_utils.numpy_seed(seed): - indices = dataset.ordered_indices() + def make_batches(dataset, epoch): + logger.info(f"creating new batches for epoch {epoch}") - # filter examples that are too large - if max_positions is not None: - indices = self.filter_indices_by_size( - indices, dataset, max_positions, ignore_invalid_inputs + # get indices ordered by example size + with data_utils.numpy_seed(seed + epoch): + indices = dataset.ordered_indices() + + # filter examples that are too large + if max_positions is not None: + indices = self.filter_indices_by_size( + indices, dataset, max_positions, ignore_invalid_inputs + ) + + # create mini-batches with given size constraints + batches = dataset.batch_by_size( + indices, + max_tokens=max_tokens, + max_sentences=max_sentences, + required_batch_size_multiple=required_batch_size_multiple, ) + return batches - # create mini-batches with given size constraints - batch_sampler = dataset.batch_by_size( - indices, - max_tokens=max_tokens, - max_sentences=max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - ) + reuse_dataloader = getattr(self.cfg, "reuse_dataloader", True) + persistent_workers = getattr(self.cfg, "persistent_workers", True) + rebuild_batches = getattr(self.cfg, "rebuild_batches", False) + logger.info(f"reuse_dataloader = {reuse_dataloader}") + logger.info(f"rebuild_batches = {rebuild_batches}") + + if rebuild_batches: + logger.info("batches will be rebuilt for each epoch") + batch_sampler = make_batches + else: + batch_sampler = make_batches(dataset, epoch) # return a reusable, sharded iterator epoch_iter = iterators.EpochBatchIterator( @@ -248,6 +328,10 @@ def get_batch_iterator( num_workers=num_workers, epoch=epoch, buffer_size=data_buffer_size, + skip_remainder_batch=skip_remainder_batch, + grouped_shuffling=grouped_shuffling, + reuse_dataloader=reuse_dataloader, + persistent_workers=persistent_workers, ) if can_reuse_epoch_itr: @@ -255,26 +339,24 @@ def get_batch_iterator( return epoch_iter - def build_model(self, cfg: DictConfig): + def build_model(self, cfg: FairseqDataclass, from_checkpoint=False): """ Build the :class:`~fairseq.models.BaseFairseqModel` instance for this task.
Args: - cfg (omegaconf.DictConfig): configuration object + cfg (FairseqDataclass): configuration object Returns: a :class:`~fairseq.models.BaseFairseqModel` instance """ from fairseq import models, quantization_utils - model = models.build_model(cfg, self) - if getattr(cfg, "tpu", False): - model.prepare_for_tpu_() + model = models.build_model(cfg, self, from_checkpoint) model = quantization_utils.quantize_model_scalar(model, cfg) return model - def build_criterion(self, cfg: DictConfig): + def build_criterion(self, cfg: DictConfig, from_checkpoint=False): """ Build the :class:`~fairseq.criterions.FairseqCriterion` instance for this task. @@ -287,11 +369,40 @@ def build_criterion(self, cfg: DictConfig): """ from fairseq import criterions - return criterions.build_criterion(cfg, self) + return criterions.build_criterion(cfg, self, from_checkpoint=from_checkpoint) def build_generator( - self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None + self, + models, + args, + seq_gen_cls=None, + extra_gen_cls_kwargs=None, + prefix_allowed_tokens_fn=None, ): + """ + Build a :class:`~fairseq.SequenceGenerator` instance for this + task. + + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models + args (fairseq.dataclass.configs.GenerationConfig): + configuration object (dataclass) for generation + extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass + through to SequenceGenerator + prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]): + If provided, this function constrains the beam search to + allowed tokens only at each step. The provided function + should take 2 arguments: the batch ID (`batch_id: int`) + and a unidimensional tensor of token ids (`inputs_ids: + torch.Tensor`). It has to return a `List[int]` with the + allowed tokens for the next generation step conditioned + on the previously generated tokens (`inputs_ids`) and + the batch ID (`batch_id`). This argument is useful for + constrained generation conditioned on the prefix, as + described in "Autoregressive Entity Retrieval" + (https://arxiv.org/abs/2010.00904) and + https://github.com/facebookresearch/GENRE. 
+ """ if getattr(args, "score_reference", False): from fairseq.sequence_scorer import SequenceScorer @@ -314,7 +425,8 @@ def build_generator( match_source_len = getattr(args, "match_source_len", False) diversity_rate = getattr(args, "diversity_rate", -1) constrained = getattr(args, "constraints", False) - prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None) + if prefix_allowed_tokens_fn is None: + prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None) if ( sum( int(cond) @@ -365,12 +477,14 @@ def build_generator( else: search_strategy = search.BeamSearch(self.target_dictionary) + extra_gen_cls_kwargs = extra_gen_cls_kwargs or {} if seq_gen_cls is None: if getattr(args, "print_alignment", False): seq_gen_cls = SequenceGeneratorWithAlignment + extra_gen_cls_kwargs["print_alignment"] = args.print_alignment else: seq_gen_cls = SequenceGenerator - extra_gen_cls_kwargs = extra_gen_cls_kwargs or {} + return seq_gen_cls( models, self.target_dictionary, @@ -414,7 +528,8 @@ def train_step( model.train() model.set_num_updates(update_num) with torch.autograd.profiler.record_function("forward"): - loss, sample_size, logging_output = criterion(model, sample) + with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))): + loss, sample_size, logging_output = criterion(model, sample) if ignore_grad: loss *= 0 with torch.autograd.profiler.record_function("backward"): @@ -427,6 +542,14 @@ def valid_step(self, sample, model, criterion): loss, sample_size, logging_output = criterion(model, sample) return loss, sample_size, logging_output + def optimizer_step(self, optimizer, model, update_num): + optimizer.step() + + def build_dataset_for_inference( + self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs + ) -> torch.utils.data.Dataset: + raise NotImplementedError + def inference_step( self, generator, models, sample, prefix_tokens=None, constraints=None ): @@ -489,6 +612,15 @@ def reduce_metrics(self, logging_outputs, criterion): criterion.__class__.reduce_metrics(logging_outputs) + def state_dict(self): + if self.state is not None: + return self.state.state_dict + return {} + + def load_state_dict(self, state_dict: Dict[str, Any]): + if self.state is not None: + self.state.merge_state_dict(state_dict) + def max_positions(self): """Return the max input length allowed by the task.""" return None @@ -497,13 +629,13 @@ def max_positions(self): def source_dictionary(self): """Return the source :class:`~fairseq.data.Dictionary` (if applicable for this task).""" - raise NotImplementedError + return None @property def target_dictionary(self): """Return the target :class:`~fairseq.data.Dictionary` (if applicable for this task).""" - raise NotImplementedError + return None def build_tokenizer(self, args): """Build the pre-tokenizer for this task.""" @@ -513,9 +645,20 @@ def build_bpe(self, args): """Build the tokenizer for this task.""" return encoders.build_bpe(args) + def get_interactive_tokens_and_lengths(self, lines, encode_fn): + tokens = [ + self.source_dictionary.encode_line( + encode_fn(src_str), add_if_not_exist=False + ).long() + for src_str in lines + ] + lengths = [t.numel() for t in tokens] + return tokens, lengths + class LegacyFairseqTask(FairseqTask): def __init__(self, args: Namespace): + super().__init__(None) self.args = args self.datasets = {} self.dataset_to_epoch_iter = {} @@ -532,7 +675,7 @@ def setup_task(cls, args: Namespace, **kwargs): def has_sharded_data(self, split): return os.pathsep in getattr(self.args, "data", 
"") - def build_model(self, args: Namespace): + def build_model(self, args: Namespace, from_checkpoint=False): """ Build the :class:`~fairseq.models.BaseFairseqModel` instance for this task. @@ -545,9 +688,7 @@ def build_model(self, args: Namespace): """ from fairseq import models, quantization_utils - model = models.build_model(args, self) - if getattr(args, "tpu", False): - model.prepare_for_tpu_() + model = models.build_model(args, self, from_checkpoint) model = quantization_utils.quantize_model_scalar(model, args) return model diff --git a/fairseq/tasks/frm_text_to_speech.py b/fairseq/tasks/frm_text_to_speech.py new file mode 100644 index 0000000000..667f5f8ee4 --- /dev/null +++ b/fairseq/tasks/frm_text_to_speech.py @@ -0,0 +1,55 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from fairseq.data.audio.frm_text_to_speech_dataset import FrmTextToSpeechDatasetCreator +from fairseq.tasks import register_task +from fairseq.tasks.text_to_speech import TextToSpeechTask + + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +@register_task("frm_text_to_speech") +class FrmTextToSpeechTask(TextToSpeechTask): + @staticmethod + def add_args(parser): + TextToSpeechTask.add_args(parser) + parser.add_argument("--do_chunk", action="store_true", help="train on chunks") + parser.add_argument("--chunk_bound", default=-1, type=int) + parser.add_argument("--chunk_init", default=50, type=int) + parser.add_argument("--chunk_incr", default=5, type=int) + parser.add_argument("--add_eos", action="store_true") + parser.add_argument("--dedup", action="store_true") + parser.add_argument("--ref_fpu", default=-1, type=float) + + def load_dataset(self, split, **unused_kwargs): + is_train_split = split.startswith("train") + pre_tokenizer = self.build_tokenizer(self.args) + bpe_tokenizer = self.build_bpe(self.args) + self.datasets[split] = FrmTextToSpeechDatasetCreator.from_tsv( + self.args.data, + self.data_cfg, + split, + self.src_dict, + pre_tokenizer, + bpe_tokenizer, + is_train_split=is_train_split, + n_frames_per_step=self.args.n_frames_per_step, + speaker_to_id=self.speaker_to_id, + do_chunk=self.args.do_chunk, + chunk_bound=self.args.chunk_bound, + chunk_init=self.args.chunk_init, + chunk_incr=self.args.chunk_incr, + add_eos=self.args.add_eos, + dedup=self.args.dedup, + ref_fpu=self.args.ref_fpu, + ) diff --git a/fairseq/tasks/hubert_pretraining.py b/fairseq/tasks/hubert_pretraining.py new file mode 100644 index 0000000000..1a3605f14d --- /dev/null +++ b/fairseq/tasks/hubert_pretraining.py @@ -0,0 +1,191 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import logging +import os +import sys +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from dataclasses import dataclass, field +from fairseq.data import Dictionary, HubertDataset +from fairseq.dataclass.configs import FairseqDataclass +from fairseq.tasks import register_task +from fairseq.tasks.fairseq_task import FairseqTask +from omegaconf import MISSING + +logger = logging.getLogger(__name__) + + +class LabelEncoder(object): + def __init__(self, dictionary: Dictionary) -> None: + self.dictionary = dictionary + + def __call__(self, label: str) -> List[str]: + return self.dictionary.encode_line( + label, + append_eos=False, + add_if_not_exist=False, + ) + + +@dataclass +class HubertPretrainingConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + fine_tuning: bool = field( + default=False, metadata={"help": "set to true if fine-tuning Hubert"} + ) + labels: List[str] = field( + default_factory=lambda: ["ltr"], + metadata={ + "help": ( + "extension of the label files to load, frame-level labels for" + " pre-training, and sequence-level label for fine-tuning" + ) + }, + ) + label_dir: Optional[str] = field( + default=None, + metadata={ + "help": "if set, looks for labels in this directory instead", + }, + ) + label_rate: float = field( + default=-1.0, + metadata={"help": "label frame rate. -1.0 for sequence label"}, + ) + sample_rate: int = field( + default=16_000, + metadata={ + "help": "target sample rate. audio files will be up/down " + "sampled to this rate" + }, + ) + normalize: bool = field( + default=False, + metadata={"help": "if set, normalizes input to have 0 mean and unit variance"}, + ) + enable_padding: bool = field( + default=False, + metadata={"help": "pad shorter samples instead of cropping"}, + ) + max_keep_size: Optional[int] = field( + default=None, + metadata={"help": "exclude sample longer than this"}, + ) + max_sample_size: Optional[int] = field( + default=None, + metadata={"help": "max sample size to crop to for batching"}, + ) + min_sample_size: Optional[int] = field( + default=None, + metadata={"help": "min sample size to crop to for batching"}, + ) + single_target: Optional[bool] = field( + default=False, + metadata={ + "help": "if set, AddTargetDatasets outputs same keys " "as AddTargetDataset" + }, + ) + random_crop: Optional[bool] = field( + default=True, + metadata={"help": "always crop from the beginning if false"}, + ) + pad_audio: Optional[bool] = field( + default=False, + metadata={"help": "pad audio to the longest one in the batch if true"}, + ) + + +@register_task("hubert_pretraining", dataclass=HubertPretrainingConfig) +class HubertPretrainingTask(FairseqTask): + + cfg: HubertPretrainingConfig + + def __init__( + self, + cfg: HubertPretrainingConfig, + ) -> None: + super().__init__(cfg) + + logger.info(f"current directory is {os.getcwd()}") + logger.info(f"HubertPretrainingTask Config {cfg}") + + self.cfg = cfg + self.fine_tuning = cfg.fine_tuning + + if cfg.fine_tuning: + self.state.add_factory("target_dictionary", self.load_dictionaries) + else: + self.state.add_factory("dictionaries", self.load_dictionaries) + + self.blank_symbol = "<s>" + + @property + def source_dictionary(self) -> Optional[Dictionary]: + return None + + @property + def target_dictionary(self) -> Optional[Dictionary]: + return self.state.target_dictionary + + @property + def dictionaries(self) -> List[Dictionary]: + return self.state.dictionaries + + @classmethod + def setup_task( + cls, cfg: 
HubertPretrainingConfig, **kwargs + ) -> "HubertPretrainingTask": + return cls(cfg) + + def load_dictionaries(self): + label_dir = self.cfg.data if self.cfg.label_dir is None else self.cfg.label_dir + dictionaries = [ + Dictionary.load(f"{label_dir}/dict.{label}.txt") + for label in self.cfg.labels + ] + return dictionaries[0] if self.cfg.fine_tuning else dictionaries + + def get_label_dir(self) -> str: + if self.cfg.label_dir is None: + return self.cfg.data + return self.cfg.label_dir + + def load_dataset(self, split: str, **kwargs) -> None: + manifest = f"{self.cfg.data}/{split}.tsv" + dicts = [self.target_dictionary] if self.cfg.fine_tuning else self.dictionaries + pad_list = [dict.pad() for dict in dicts] + eos_list = [dict.eos() for dict in dicts] + procs = [LabelEncoder(dict) for dict in dicts] + paths = [f"{self.get_label_dir()}/{split}.{l}" for l in self.cfg.labels] + + # hubert v1: pad_audio=True, random_crop=False; + self.datasets[split] = HubertDataset( + manifest, + sample_rate=self.cfg.sample_rate, + label_paths=paths, + label_rates=self.cfg.label_rate, + pad_list=pad_list, + eos_list=eos_list, + label_processors=procs, + max_keep_sample_size=self.cfg.max_keep_size, + min_keep_sample_size=self.cfg.min_sample_size, + max_sample_size=self.cfg.max_sample_size, + pad_audio=self.cfg.pad_audio, + normalize=self.cfg.normalize, + store_labels=False, + random_crop=self.cfg.random_crop, + single_target=self.cfg.single_target, + ) + + def max_positions(self) -> Tuple[int, int]: + return (sys.maxsize, sys.maxsize) + + def filter_indices_by_size(self, indices: np.array, *args, **kwargs) -> np.array: + return indices diff --git a/fairseq/tasks/language_modeling.py b/fairseq/tasks/language_modeling.py index 6e85417ff5..44d5324b3d 100644 --- a/fairseq/tasks/language_modeling.py +++ b/fairseq/tasks/language_modeling.py @@ -15,6 +15,7 @@ AppendTokenDataset, Dictionary, IdDataset, + LMContextWindowDataset, MonolingualDataset, NestedDictionaryDataset, NumelDataset, @@ -39,7 +40,6 @@ @dataclass class LanguageModelingConfig(FairseqDataclass): - # TODO common var add to parent data: Optional[str] = field( default=None, metadata={"help": "path to data directory"} ) @@ -84,13 +84,26 @@ class LanguageModelingConfig(FairseqDataclass): 'e.g., "train,valid" (default: all dataset splits)' }, ) + pad_to_fixed_length: Optional[bool] = field( + default=False, + metadata={"help": "pad to fixed length"}, + ) + pad_to_fixed_bsz: Optional[bool] = field( + default=False, + metadata={"help": "boolean to pad to fixed batch size"}, + ) + # TODO common vars below add to parent seed: int = II("common.seed") + batch_size: Optional[int] = II("dataset.batch_size") + batch_size_valid: Optional[int] = II("dataset.batch_size_valid") dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( "dataset.dataset_impl" ) data_buffer_size: int = II("dataset.data_buffer_size") tpu: bool = II("common.tpu") + use_plasma_view: bool = II("common.use_plasma_view") + plasma_path: str = II("common.plasma_path") @register_task("language_modeling", dataclass=LanguageModelingConfig) @@ -158,8 +171,8 @@ def setup_task(cls, args, **kwargs): dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs) # upgrade old checkpoints - if hasattr(args, "exclude_self_target"): - args.self_target = not args.exclude_self_target + if getattr(args, "exclude_self_target", False): + args.self_target = False targets = [] if getattr(args, "self_target", False): @@ -174,9 +187,8 @@ def setup_task(cls, args, **kwargs): return cls(args, 
dictionary, output_dictionary, targets=targets) - def build_model(self, args): - model = super().build_model(args) - + def build_model(self, args, from_checkpoint=False): + model = super().build_model(args, from_checkpoint) for target in self.targets: if target not in model.supported_targets: raise ValueError( @@ -185,11 +197,13 @@ def build_model(self, args): return model - def load_dataset(self, split, epoch=1, combine=False, **kwargs): + def load_dataset( + self, split: str, epoch=1, combine=False, **kwargs + ) -> MonolingualDataset: """Load a given dataset split. Args: - split (str): name of the split (e.g., train, valid, test) + split (str): name of the split (e.g., train, valid, valid1, test) """ paths = utils.split_paths(self.args.data) assert len(paths) > 0 @@ -197,13 +211,12 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) + # each process has its own copy of the raw data (likely to be an np.memmap) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.args.dataset_impl, combine=combine ) if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) + raise FileNotFoundError(f"Dataset not found: {split} ({split_path})") dataset = maybe_shorten_dataset( dataset, @@ -213,7 +226,6 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): self.args.tokens_per_sample, self.args.seed, ) - dataset = TokenBlockDataset( dataset, dataset.sizes, @@ -222,14 +234,26 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, + use_plasma_view=self.args.use_plasma_view, + split_path=split_path, + plasma_path=self.args.plasma_path, ) add_eos_for_other_targets = ( self.args.sample_break_mode is not None and self.args.sample_break_mode != "none" ) + fixed_pad_length = None + if self.args.pad_to_fixed_length: + fixed_pad_length = self.args.tokens_per_sample + + pad_to_bsz = None + if self.args.pad_to_fixed_bsz: + pad_to_bsz = ( + self.args.batch_size_valid if "valid" in split else self.args.batch_size + ) - self.datasets[split] = self._initialize_dataset( + self.datasets[split] = MonolingualDataset( dataset=dataset, sizes=dataset.sizes, src_vocab=self.dictionary, @@ -238,11 +262,10 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): shuffle=True, targets=self.targets, add_bos_token=self.args.add_bos_token, + fixed_pad_length=fixed_pad_length, + pad_to_bsz=pad_to_bsz, ) - def _initialize_dataset(self, **kwargs): - return MonolingualDataset(**kwargs) - def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): """ Generate batches for inference. 
We prepend an eos token to src_tokens @@ -314,6 +337,39 @@ def inference_step( models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token ) + def eval_lm_dataloader( + self, + dataset, + max_tokens: Optional[int] = 36000, + batch_size: Optional[int] = None, + max_positions: Optional[int] = None, + num_shards: int = 1, + shard_id: int = 0, + num_workers: int = 1, + data_buffer_size: int = 10, + # ensures that every evaluated token has access to a context of at least + # this size, if possible + context_window: int = 0, + ): + if context_window > 0: + dataset = LMContextWindowDataset( + dataset=dataset, + tokens_per_sample=self.args.tokens_per_sample, + context_window=context_window, + pad_idx=self.source_dictionary.pad(), + ) + return self.get_batch_iterator( + dataset=dataset, + max_tokens=max_tokens, + max_sentences=batch_size, + max_positions=max_positions, + ignore_invalid_inputs=True, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + data_buffer_size=data_buffer_size, + ).next_epoch_itr(shuffle=False) + @property def source_dictionary(self): """Return the :class:`~fairseq.data.Dictionary` for the language diff --git a/fairseq/tasks/masked_lm.py b/fairseq/tasks/masked_lm.py index 56086f5e81..b064907a50 100644 --- a/fairseq/tasks/masked_lm.py +++ b/fairseq/tasks/masked_lm.py @@ -5,8 +5,11 @@ import logging import os +from dataclasses import dataclass, field import numpy as np +from omegaconf import II, MISSING, OmegaConf + from fairseq import utils from fairseq.data import ( Dictionary, @@ -17,113 +20,140 @@ NumSamplesDataset, PrependTokenDataset, RightPadDataset, + RightPaddingMaskDataset, SortDataset, TokenBlockDataset, data_utils, ) from fairseq.data.encoders.utils import get_whole_word_mask from fairseq.data.shorten_dataset import maybe_shorten_dataset -from fairseq.tasks import LegacyFairseqTask, register_task +from fairseq.dataclass import FairseqDataclass +from fairseq.tasks import FairseqTask, register_task +from .language_modeling import SAMPLE_BREAK_MODE_CHOICES, SHORTEN_METHOD_CHOICES logger = logging.getLogger(__name__) -@register_task("masked_lm") -class MaskedLMTask(LegacyFairseqTask): - """Task for training masked language models (e.g., BERT, RoBERTa).""" - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument( - "data", - help="colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner", - ) - parser.add_argument( - "--sample-break-mode", - default="complete", - choices=["none", "complete", "complete_doc", "eos"], - help='If omitted or "none", fills each sample with tokens-per-sample ' +@dataclass +class MaskedLMConfig(FairseqDataclass): + data: str = field( + default=MISSING, + metadata={ + "help": "colon separated path to data directories list, \ + will be iterated upon during epochs in round-robin manner" + }, + ) + sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( + default="none", + metadata={ + "help": 'If omitted or "none", fills each sample with tokens-per-sample ' 'tokens. If set to "complete", splits samples only at the end ' "of sentence, but may include multiple sentences per sample. " '"complete_doc" is similar but respects doc boundaries. 
' - 'If set to "eos", includes only one sentence per sample.', - ) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) - parser.add_argument( - "--mask-prob", - default=0.15, - type=float, - help="probability of replacing a token with mask", - ) - parser.add_argument( - "--leave-unmasked-prob", - default=0.1, - type=float, - help="probability that a masked token is unmasked", - ) - parser.add_argument( - "--random-token-prob", - default=0.1, - type=float, - help="probability of replacing a token with a random token", - ) - parser.add_argument( - "--freq-weighted-replacement", - default=False, - action="store_true", - help="sample random replacement words based on word frequencies", - ) - parser.add_argument( - "--mask-whole-words", - default=False, - action="store_true", - help="mask whole words; you may also want to set --bpe", - ) - parser.add_argument( - "--shorten-method", - default="none", - choices=["none", "truncate", "random_crop"], - help="if not none, shorten sequences that exceed --tokens-per-sample", - ) - parser.add_argument( - "--shorten-data-split-list", - default="", - help="comma-separated list of dataset splits to apply shortening to, " - 'e.g., "train,valid" (default: all dataset splits)', - ) + 'If set to "eos", includes only one sentence per sample.' + }, + ) + tokens_per_sample: int = field( + default=1024, + metadata={"help": "max number of tokens per sample for LM dataset"}, + ) + mask_prob: float = field( + default=0.15, + metadata={"help": "probability of replacing a token with mask"}, + ) + leave_unmasked_prob: float = field( + default=0.1, + metadata={"help": "probability that a masked token is unmasked"}, + ) + random_token_prob: float = field( + default=0.1, + metadata={"help": "probability of replacing a token with a random token"}, + ) + freq_weighted_replacement: bool = field( + default=False, + metadata={"help": "sample random replacement words based on word frequencies"}, + ) + mask_whole_words: bool = field( + default=False, + metadata={"help": "mask whole words; you may also want to set --bpe"}, + ) + mask_multiple_length: int = field( + default=1, + metadata={"help": "repeat the mask indices multiple times"}, + ) + mask_stdev: float = field( + default=0.0, + metadata={"help": "stdev of the mask length"}, + ) + shorten_method: SHORTEN_METHOD_CHOICES = field( + default="none", + metadata={ + "help": "if not none, shorten sequences that exceed --tokens-per-sample" + }, + ) + shorten_data_split_list: str = field( + default="", + metadata={ + "help": "comma-separated list of dataset splits to apply shortening to, " + 'e.g., "train,valid" (default: all dataset splits)' + }, + ) + seed: int = II("common.seed") + + include_target_tokens: bool = field( + default=False, + metadata={ + "help": "include target tokens in model input. this is used for data2vec" + }, + ) + include_index: bool = field( + default=True, + metadata={"help": "include index in model input. 
this is used for data2vec"}, + ) + skip_masking: bool = field( + default=False, + metadata={"help": "skip masking at dataset"}, + ) + # subsample_train: float = field( + # default=1, + # metadata={"help": "shorten training set for debugging"}, + # ) + d2v2_multi: bool = field( + default=False, + metadata={"help": "prepare dataset for data2vec_multi"}, + ) - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed + +@register_task("masked_lm", dataclass=MaskedLMConfig) +class MaskedLMTask(FairseqTask): + + cfg: MaskedLMConfig + + """Task for training masked language models (e.g., BERT, RoBERTa).""" + + def __init__(self, cfg: MaskedLMConfig, dictionary=None): + super().__init__(cfg) + self.dictionary = dictionary or self.load_dict(cfg) # add mask token - self.mask_idx = dictionary.add_symbol("<mask>") + self.mask_idx = self.dictionary.add_symbol("<mask>") + + @classmethod + def setup_task(cls, cfg: MaskedLMConfig, **kwargs): + dictionary = cls.load_dict(cfg) + return cls(cfg, dictionary) @classmethod - def setup_task(cls, args, **kwargs): - paths = utils.split_paths(args.data) + def load_dict(cls, cfg): + paths = utils.split_paths(cfg.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) + return dictionary - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) + def _load_dataset_split(self, split, epoch, combine): + paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) @@ -131,7 +161,6 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, - self.args.dataset_impl, combine=combine, ) if dataset is None: @@ -142,30 +171,38 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): dataset = maybe_shorten_dataset( dataset, split, - self.args.shorten_data_split_list, - self.args.shorten_method, - self.args.tokens_per_sample, - self.args.seed, + self.cfg.shorten_data_split_list, + self.cfg.shorten_method, + self.cfg.tokens_per_sample, + self.cfg.seed, ) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, - self.args.tokens_per_sample - 1, # one less for <s> + self.cfg.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), - break_mode=self.args.sample_break_mode, + break_mode=self.cfg.sample_break_mode, ) logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) - dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) + return PrependTokenDataset(dataset, self.source_dictionary.bos()) + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + dataset = self._load_dataset_split(split, epoch, combine) # create masked input and targets mask_whole_words = ( get_whole_word_mask(self.args, self.source_dictionary) - if self.args.mask_whole_words + if self.cfg.mask_whole_words else None ) @@ -174,49 +211,86 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): self.source_dictionary, pad_idx=self.source_dictionary.pad(), mask_idx=self.mask_idx, - seed=self.args.seed, - mask_prob=self.args.mask_prob, - leave_unmasked_prob=self.args.leave_unmasked_prob, - random_token_prob=self.args.random_token_prob, - freq_weighted_replacement=self.args.freq_weighted_replacement, + seed=self.cfg.seed, + mask_prob=self.cfg.mask_prob, + leave_unmasked_prob=self.cfg.leave_unmasked_prob, + random_token_prob=self.cfg.random_token_prob, + freq_weighted_replacement=self.cfg.freq_weighted_replacement, mask_whole_words=mask_whole_words, + mask_multiple_length=self.cfg.mask_multiple_length, + mask_stdev=self.cfg.mask_stdev, + skip_masking=self.cfg.skip_masking, ) - with data_utils.numpy_seed(self.args.seed + epoch): + with data_utils.numpy_seed(self.cfg.seed): shuffle = np.random.permutation(len(src_dataset)) + target_dataset = RightPadDataset( + tgt_dataset, + pad_idx=self.source_dictionary.pad(), + ) + + if self.cfg.d2v2_multi: + dataset = self._d2v2_multi_dataset(src_dataset) + else: + dataset = self._regular_dataset(src_dataset, target_dataset) + self.datasets[split] = SortDataset( - NestedDictionaryDataset( - { - "id": IdDataset(), - "net_input": { - "src_tokens": RightPadDataset( - src_dataset, - pad_idx=self.source_dictionary.pad(), - ), - "src_lengths": NumelDataset(src_dataset, reduce=False), - }, - "target": RightPadDataset( - tgt_dataset, - pad_idx=self.source_dictionary.pad(), - ), - "nsentences": NumSamplesDataset(), - "ntokens": NumelDataset(src_dataset, reduce=True), - }, - sizes=[src_dataset.sizes], + dataset, sort_order=[shuffle, src_dataset.sizes] + ) + + def _regular_dataset(self, src_dataset, target_dataset): + input_dict = { + "src_tokens": RightPadDataset( + src_dataset, + pad_idx=self.source_dictionary.pad(), ), - sort_order=[ - shuffle, - src_dataset.sizes, - ], + "src_lengths": NumelDataset(src_dataset, reduce=False), + } + if self.cfg.include_target_tokens: + input_dict["target_tokens"] = target_dataset + if self.cfg.include_index: + input_dict["src_id"] = IdDataset() + + dataset = NestedDictionaryDataset( + { + "id": IdDataset(), + "net_input": input_dict, + "target": target_dataset, + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(src_dataset, reduce=True), + }, + sizes=[src_dataset.sizes], ) + return dataset + + def _d2v2_multi_dataset(self, src_dataset): + input_dict = { + "source": RightPadDataset( + src_dataset, + pad_idx=self.source_dictionary.pad(), + ), + "id": IdDataset(), + "padding_mask": RightPaddingMaskDataset(src_dataset), + } + + dataset = NestedDictionaryDataset( + { + "id": IdDataset(), + "net_input": input_dict, + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(src_dataset, reduce=True), + }, + sizes=[src_dataset.sizes], + ) + return dataset def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): src_dataset = RightPadDataset( TokenBlockDataset( src_tokens, src_lengths, - self.args.tokens_per_sample - 1, # one less for <s> + self.cfg.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode="eos", @@ -245,3 +319,9 @@ def 
source_dictionary(self): @property def target_dictionary(self): return self.dictionary + + def begin_epoch(self, epoch, model): + model.set_epoch(epoch) + + def max_positions(self): + return self.cfg.tokens_per_sample diff --git a/fairseq/tasks/multilingual_denoising.py b/fairseq/tasks/multilingual_denoising.py index d1c914917f..cb5ee34554 100644 --- a/fairseq/tasks/multilingual_denoising.py +++ b/fairseq/tasks/multilingual_denoising.py @@ -2,11 +2,14 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - import logging import os +from dataclasses import dataclass, field +from typing import Optional import numpy as np +from omegaconf import II + from fairseq.data import ( AppendTokenDataset, ConcatDataset, @@ -21,44 +24,49 @@ from fairseq.data.encoders.utils import get_whole_word_mask from fairseq.tasks import register_task -from .denoising import DenoisingTask - +from .denoising import DenoisingConfig, DenoisingTask logger = logging.getLogger(__name__) -@register_task("multilingual_denoising") +@dataclass +class MultilingualDenoisingConfig(DenoisingConfig): + multilang_sampling_alpha: float = field( + default=1.0, + metadata={"help": "smoothing alpha for sample ratios across multiple datasets"}, + ) + add_lang_token: bool = field( + default=False, + metadata={"help": ""}, + ) + langs: Optional[str] = field( + default=None, + metadata={"help": "language ids we are considering"}, + ) + no_whole_word_mask_langs: str = field( + default="", + metadata={ + "help": "languages without spacing between words don't support whole word masking" + }, + ) + train_subset: str = II("common.train_subset") + valid_subset: str = II("common.valid_subset") + + +@register_task("multilingual_denoising", dataclass=MultilingualDenoisingConfig) class MultilingualDenoisingTask(DenoisingTask): - @staticmethod - def add_args(parser): - DenoisingTask.add_args(parser) - parser.add_argument( - "--multilang-sampling-alpha", - type=float, - default=1.0, - help="smoothing alpha for sample ratios across multiple datasets", - ) - parser.add_argument("--add-lang-token", default=False, action="store_true") - parser.add_argument( - "--langs", type=str, help="language ids we are considering", default=None - ) - parser.add_argument( - "--no-whole-word-mask-langs", - type=str, - default="", - metavar="N", - help="languages without spacing between words dont support whole word masking", - ) + + cfg: MultilingualDenoisingConfig @classmethod - def setup_task(cls, args, **kwargs): + def setup_task(cls, cfg: MultilingualDenoisingConfig, **kwargs): """Setup the task.""" - paths = args.data.split(":") + paths = cfg.data.split(":") assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) data_path = paths[0] - if args.langs is None: + if cfg.langs is None: languages = sorted( [ name @@ -67,34 +75,32 @@ def setup_task(cls, args, **kwargs): ] ) else: - languages = args.langs.split(",") + languages = cfg.langs.split(",") - if args.add_lang_token: + if cfg.add_lang_token: for lang in languages: dictionary.add_symbol("[{}]".format(lang)) logger.info("dictionary: {} types".format(len(dictionary))) - if not hasattr(args, "shuffle_instance"): - args.shuffle_instance = False - return cls(args, dictionary) + if not hasattr(cfg, "shuffle_instance"): + cfg.shuffle_instance = False + return cls(cfg, dictionary) - def __init__(self, args, dictionary): - super().__init__(args, dictionary) + def __init__(self, cfg: MultilingualDenoisingConfig, 
dictionary): + super().__init__(cfg, dictionary) self.dictionary = dictionary - self.seed = args.seed # add mask token self.mask_idx = self.dictionary.add_symbol("<mask>") - self.langs = args.langs - self.args = args + self.cfg = cfg def _get_sample_prob(self, dataset_lens): """ - Get smoothed sampling porbability by languages. This helps low resource + Get smoothed sampling probability by languages. This helps low resource languages by upsampling them. """ prob = dataset_lens / dataset_lens.sum() - smoothed_prob = prob ** self.args.multilang_sampling_alpha + smoothed_prob = prob**self.cfg.multilang_sampling_alpha smoothed_prob = smoothed_prob / smoothed_prob.sum() return smoothed_prob @@ -104,12 +110,12 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): Args: split (str): name of the split (e.g., train, valid, test) """ - paths = self.args.data.split(":") + paths = self.cfg.data.split(":") assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) - if self.langs is None: + if self.cfg.langs is None: languages = sorted( [ name @@ -118,7 +124,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): ] ) else: - languages = self.langs.split(",") + languages = self.cfg.langs.split(",") for name in languages: p = os.path.join(data_path, name) assert os.path.exists(p), "data not found: {}".format(p) @@ -128,8 +134,8 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): "Language to id mapping: ", {lang: id for id, lang in enumerate(languages)} ) - mask_whole_words = get_whole_word_mask(self.args, self.dictionary) - language_without_segmentations = self.args.no_whole_word_mask_langs.split(",") + mask_whole_words = get_whole_word_mask(self.cfg.bpe, self.dictionary) + language_without_segmentations = self.cfg.no_whole_word_mask_langs.split(",") lang_datasets = [] for language in languages: split_path = os.path.join(data_path, language, split) @@ -137,7 +143,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, - self.args.dataset_impl, + self.cfg.dataset_impl, combine=combine, ) if dataset is None: @@ -147,7 +153,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): end_token = ( self.source_dictionary.index("[{}]".format(language)) - if self.args.add_lang_token + if self.cfg.add_lang_token else self.source_dictionary.eos() ) @@ -155,10 +161,10 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): dataset = TokenBlockDataset( dataset, dataset.sizes, - self.args.tokens_per_sample - 2, # one less for <s> + self.cfg.tokens_per_sample - 2, # one less for <s> pad=self.source_dictionary.pad(), eos=end_token, - break_mode=self.args.sample_break_mode, + break_mode=self.cfg.sample_break_mode, ) logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) @@ -177,11 +183,19 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): self.dictionary, self.mask_idx, lang_mask_whole_words, - shuffle=self.args.shuffle_instance, - seed=self.seed, - args=self.args, + shuffle=self.cfg.shuffle_instance, + seed=self.cfg.seed, + mask=self.cfg.mask, + mask_random=self.cfg.mask_random, + insert=self.cfg.insert, + rotate=self.cfg.rotate, + permute_sentences=self.cfg.permute_sentences, + bpe=self.cfg.bpe, + replace_length=self.cfg.replace_length, + mask_length=self.cfg.mask_length, + poisson_lambda=self.cfg.poisson_lambda, eos=None - if not self.args.add_lang_token + if not 
self.cfg.add_lang_token else self.source_dictionary.index("[{}]".format(language)), ) lang_datasets.append(lang_dataset) @@ -195,7 +209,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): int(dataset_lengths.sum()), ) ) - if split == self.args.train_subset: + if split == self.cfg.train_subset: # For train subset, additionally up or down sample languages. sample_probs = self._get_sample_prob(dataset_lengths) logger.info( @@ -220,7 +234,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): ResamplingDataset( lang_datasets[i], size_ratio=size_ratio[i], - seed=self.args.seed, + seed=self.cfg.seed, epoch=epoch, replace=size_ratio[i] >= 1.0, ) @@ -237,12 +251,12 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): lang_splits.append(split_name) self.datasets[split_name] = lang_dataset - if split in self.args.valid_subset: - self.args.valid_subset = self.args.valid_subset.replace( + if split in self.cfg.valid_subset: + self.cfg.valid_subset = self.cfg.valid_subset.replace( split, ",".join(lang_splits) ) - with data_utils.numpy_seed(self.args.seed + epoch): + with data_utils.numpy_seed(self.cfg.seed + epoch): shuffle = np.random.permutation(len(dataset)) self.datasets[split] = SortDataset( diff --git a/fairseq/tasks/multilingual_language_modeling.py b/fairseq/tasks/multilingual_language_modeling.py new file mode 100644 index 0000000000..8fd5e5954d --- /dev/null +++ b/fairseq/tasks/multilingual_language_modeling.py @@ -0,0 +1,627 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +import torch +from omegaconf import II + +from fairseq import utils +from fairseq.data import ( + AppendTokenDataset, + ConcatDataset, + Dictionary, + IdDataset, + LMContextWindowDataset, + MonolingualDataset, + NestedDictionaryDataset, + NumelDataset, + PadDataset, + PrependTokenDataset, + ResamplingDataset, + SortDataset, + StripTokenDataset, + TokenBlockDataset, + TruncatedDictionary, + data_utils, +) +from fairseq.data.indexed_dataset import get_available_dataset_impl +from fairseq.data.shorten_dataset import maybe_shorten_dataset +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.tasks import LegacyFairseqTask, register_task + +SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) +SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) +logger = logging.getLogger(__name__) + + +def lang_token(lang): + return f"<{lang}>" + + +@dataclass +class MultilingualLanguageModelingConfig(FairseqDataclass): + # TODO common var add to parent + data: Optional[str] = field( + default=None, metadata={"help": "path to data directory"} + ) + sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( + default="none", + metadata={ + "help": 'If omitted or "none", fills each sample with tokens-per-sample ' + 'tokens. If set to "complete", splits samples only at the end ' + "of sentence, but may include multiple sentences per sample. " + '"complete_doc" is similar but respects doc boundaries. ' + 'If set to "eos", includes only one sentence per sample.' 
+        },
+    )
+    tokens_per_sample: int = field(
+        default=1024,
+        metadata={"help": "max number of tokens per sample for LM dataset"},
+    )
+    output_dictionary_size: int = field(
+        default=-1, metadata={"help": "limit the size of output dictionary"}
+    )
+    self_target: bool = field(default=False, metadata={"help": "include self target"})
+    future_target: bool = field(
+        default=False, metadata={"help": "include future target"}
+    )
+    past_target: bool = field(default=False, metadata={"help": "include past target"})
+    add_bos_token: bool = field(
+        default=False, metadata={"help": "prepend lang id token <dialect>"}
+    )
+    max_source_positions: Optional[int] = field(
+        default=None, metadata={"help": "max number of tokens in the source sequence"}
+    )
+    max_target_positions: Optional[int] = field(
+        default=None, metadata={"help": "max number of tokens in the target sequence"}
+    )
+    pad_to_fixed_length: Optional[bool] = field(
+        default=False, metadata={"help": "pad to fixed length"}
+    )
+    pad_to_fixed_bsz: Optional[bool] = field(
+        default=False, metadata={"help": "boolean to pad to fixed batch size"}
+    )
+
+    multilang_sampling_alpha: Optional[float] = field(
+        default=1.0,
+        metadata={
+            "help": "smoothing alpha for sample ratios across multiple datasets"
+        },
+    )
+
+    shorten_method: SHORTEN_METHOD_CHOICES = field(
+        default="none",
+        metadata={
+            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
+        },
+    )
+    shorten_data_split_list: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of dataset splits to apply shortening to, "
+            'e.g., "train,valid" (default: all dataset splits)'
+        },
+    )
+
+    langs: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of languages (default: all directories in data path)"
+        },
+    )
+    baseline_model_langs: str = field(
+        default="",
+        metadata={
+            "help": "comma-separated list of languages in the baseline model (default: none)"
+        },
+    )
+    # TODO: legacy parameter kept for compatibility
+    baseline_model: str = field(
+        default="",
+        metadata={"help": "path to the baseline model (default: none)"},
+    )
+
+    lang_to_offline_shard_ratio: str = field(
+        default="",
+        metadata={
+            "help": "absolute path to a tsv file that maps each language to its offline shard ratio",
+        },
+    )
+    # TODO common vars below add to parent
+    seed: int = II("common.seed")
+    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
+        "dataset.dataset_impl"
+    )
+    data_buffer_size: int = II("dataset.data_buffer_size")
+    tpu: bool = II("common.tpu")
+    batch_size: Optional[int] = II("dataset.batch_size")
+    batch_size_valid: Optional[int] = II("dataset.batch_size_valid")
+    train_subset: str = II("common.train_subset")
+    valid_subset: str = II("common.valid_subset")
+
+
+@register_task(
+    "multilingual_language_modeling", dataclass=MultilingualLanguageModelingConfig
+)
+class MultilingualLanguageModelingTask(LegacyFairseqTask):
+    """
+    Train a language model.
+
+    Args:
+        dictionary (~fairseq.data.Dictionary): the dictionary for the input of
+            the language model
+        output_dictionary (~fairseq.data.Dictionary): the dictionary for the
+            output of the language model. In most cases it will be the same as
+            *dictionary*, but could possibly be a more limited version of the
+            dictionary (if ``--output-dictionary-size`` is used).
+        targets (List[str]): list of the target types that the language model
+            should predict. Can be one of "self", "future", and "past".
+            Defaults to "future".
+
+    .. note::
+
+        The language modeling task is compatible with :mod:`fairseq-train`,
+        :mod:`fairseq-generate`, :mod:`fairseq-interactive` and
+        :mod:`fairseq-eval-lm`.
+
+    The language modeling task provides the following additional command-line
+    arguments:
+
+    .. argparse::
+        :ref: fairseq.tasks.language_modeling_parser
+        :prog:
+    """
+
+    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
+        super().__init__(args)
+        self.dictionary = dictionary
+        self.output_dictionary = output_dictionary or dictionary
+
+        if targets is None:
+            targets = ["future"]
+        self.targets = targets
+
+    @staticmethod
+    def _get_langs(args, epoch=1):
+        paths = utils.split_paths(args.data)
+        assert len(paths) > 0
+        data_path = paths[(epoch - 1) % len(paths)]
+
+        languages = sorted(
+            name
+            for name in os.listdir(data_path)
+            if os.path.isdir(os.path.join(data_path, name))
+        )
+        if args.langs:
+            keep_langs = set(args.langs.split(","))
+            languages = [lang for lang in languages if lang in keep_langs]
+            assert len(languages) == len(keep_langs)
+
+        return languages, data_path
+
+    @classmethod
+    def setup_dictionary(cls, args, **kwargs):
+        dictionary = None
+        output_dictionary = None
+        if args.data:
+            paths = utils.split_paths(args.data)
+            assert len(paths) > 0
+            dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
+            if args.add_bos_token:
+                languages, _ = cls._get_langs(args)
+                logger.info("----------------")
+                for lang in languages:
+                    dictionary.add_symbol(lang_token(lang))
+                    logger.info(f"add language token: {lang_token(lang)}")
+                logger.info("----------------")
+
+            logger.info("dictionary: {} types".format(len(dictionary)))
+            output_dictionary = dictionary
+            if args.output_dictionary_size >= 0:
+                output_dictionary = TruncatedDictionary(
+                    dictionary, args.output_dictionary_size
+                )
+        return (dictionary, output_dictionary)
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Setup the task (e.g., load dictionaries).
+
+        Args:
+            args (argparse.Namespace): parsed command-line arguments
+        """
+        dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)
+
+        # upgrade old checkpoints
+        if hasattr(args, "exclude_self_target"):
+            args.self_target = not args.exclude_self_target
+
+        targets = []
+        if getattr(args, "self_target", False):
+            targets.append("self")
+        if getattr(args, "future_target", False):
+            targets.append("future")
+        if getattr(args, "past_target", False):
+            targets.append("past")
+        if len(targets) == 0:
+            # standard language modeling
+            targets = ["future"]
+
+        return cls(args, dictionary, output_dictionary, targets=targets)
+
+    def build_model(self, args, from_checkpoint=False):
+        model = super().build_model(args, from_checkpoint)
+        for target in self.targets:
+            if target not in model.supported_targets:
+                raise ValueError(
+                    f"Unsupported language modeling target: {target} not in {model.supported_targets}"
+                )
+
+        return model
+
+    def _get_sample_prob(self, dataset_lens):
+        """
+        Get smoothed sampling probability by languages. This helps low resource
+        languages by upsampling them.
+        """
+        prob = dataset_lens / dataset_lens.sum()
+        smoothed_prob = prob**self.args.multilang_sampling_alpha
+        smoothed_prob = smoothed_prob / smoothed_prob.sum()
+        return smoothed_prob
+
+    def load_dataset(self, split: str, epoch=1, combine=False, **kwargs):
+        """Load a given dataset split.
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + languages, data_path = MultilingualLanguageModelingTask._get_langs( + self.args, epoch + ) + lang_to_offline_shard_ratio = None + if self.args.lang_to_offline_shard_ratio != "": + lang_to_offline_shard_ratio = {} + assert os.path.exists( + self.args.lang_to_offline_shard_ratio + ), "provided offline shard ratio file doesn't exist: {0}".format( + self.args.lang_to_offline_shard_ratio + ) + with open(self.args.lang_to_offline_shard_ratio) as fin: + for line in fin: + lang, ratio = line.strip().split("\t") + ratio = float(ratio) + lang_to_offline_shard_ratio[lang] = ratio + + logger.info( + "Found offline sharded ratio: %s", + lang_to_offline_shard_ratio, + ) + + if split == self.args.train_subset: + logger.info( + "Training on {0} languages: {1}".format(len(languages), languages) + ) + else: + logger.info( + "Evaluating on {0} languages: {1}".format(len(languages), languages) + ) + + tokens_per_sample = self.args.tokens_per_sample - int(self.args.add_bos_token) + + fixed_pad_length = None + if self.args.pad_to_fixed_length: + fixed_pad_length = self.args.tokens_per_sample + + pad_to_bsz = None + if self.args.pad_to_fixed_bsz: + pad_to_bsz = ( + self.args.batch_size_valid if "valid" in split else self.args.batch_size + ) + + lang_datasets = [] + for lang_id, language in enumerate(languages): + split_path = os.path.join(data_path, language, split) + dataset = data_utils.load_indexed_dataset( + split_path, self.dictionary, self.args.dataset_impl, combine=combine + ) + # print('len(dataset) =', len(dataset)) + if dataset is None: + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, split_path) + ) + + dataset = maybe_shorten_dataset( + dataset, + split, + self.args.shorten_data_split_list, + self.args.shorten_method, + tokens_per_sample, + self.args.seed, + ) + + dataset = TokenBlockDataset( + dataset, + dataset.sizes, + tokens_per_sample, + pad=self.dictionary.pad(), + eos=self.dictionary.eos(), + break_mode=self.args.sample_break_mode, + include_targets=True, + ) + + add_eos_for_other_targets = ( + self.args.sample_break_mode is not None + and self.args.sample_break_mode != "none" + ) + src_lang_idx, tgt_lang_idx = None, None + if self.args.add_bos_token: + src_lang_idx = self.dictionary.index(lang_token(language)) + tgt_lang_idx = self.output_dictionary.index(lang_token(language)) + + lang_datasets.append( + MonolingualDataset( + dataset=dataset, + sizes=dataset.sizes, + src_vocab=self.dictionary, + tgt_vocab=self.output_dictionary, + add_eos_for_other_targets=add_eos_for_other_targets, + shuffle=True, + targets=self.targets, + fixed_pad_length=fixed_pad_length, + pad_to_bsz=pad_to_bsz, + add_bos_token=self.args.add_bos_token, + src_lang_idx=src_lang_idx, + tgt_lang_idx=tgt_lang_idx, + ) + ) + + dataset_lengths = np.array( + [len(d) for d in lang_datasets], + dtype=float, + ) + logger.info( + "loaded total {} blocks for all languages".format( + dataset_lengths.sum(), + ) + ) + if split == self.args.train_subset: + dataset_lengths_ratio_multiplier = np.ones(len(dataset_lengths)) + if lang_to_offline_shard_ratio is not None: + dataset_lengths_ratio_multiplier = [] + for lang in languages: + assert ( + lang in lang_to_offline_shard_ratio + ), "Lang: {0} missing in offline shard ratio file: {1}".format( + lang, + self.args.lang_to_offline_shard_ratio, + ) + dataset_lengths_ratio_multiplier.append( + lang_to_offline_shard_ratio[lang] + ) + dataset_lengths_ratio_multiplier = np.array( + 
dataset_lengths_ratio_multiplier + ) + true_dataset_lengths = ( + dataset_lengths * dataset_lengths_ratio_multiplier + ) + else: + true_dataset_lengths = dataset_lengths + # For train subset, additionally up or down sample languages. + sample_probs = self._get_sample_prob(true_dataset_lengths) + + logger.info( + "Sample probability by language: %s", + { + lang: "{0:.4f}".format(sample_probs[id]) + for id, lang in enumerate(languages) + }, + ) + size_ratio = (sample_probs * true_dataset_lengths.sum()) / dataset_lengths + # TODO: add an option for shrinking all size ratios to below 1 + # if self.args.multilang_sampling_alpha != 1: + # size_ratio /= size_ratio.max() + + # Fix numeric errors in size ratio computation + # 0.999999999999999999 -> 1 + # 1.000000000000000002 -> 1 + for i in range(len(size_ratio)): + size_ratio[i] = round(size_ratio[i], 8) + + logger.info( + "Up/Down Sampling ratio by language: %s", + { + lang: "{0:.2f}".format(size_ratio[id]) + for id, lang in enumerate(languages) + }, + ) + logger.info( + "Actual dataset size by language: %s", + { + lang: "{0:.2f}".format(len(lang_datasets[id])) + for id, lang in enumerate(languages) + }, + ) + resampled_lang_datasets = [ + ResamplingDataset( + lang_datasets[i], + size_ratio=size_ratio[i], + seed=self.args.seed, + epoch=epoch, + replace=size_ratio[i] > 1.0, + ) + for i, d in enumerate(lang_datasets) + ] + logger.info( + "Resampled dataset size by language: %s", + { + lang: "{0:.2f}".format(len(resampled_lang_datasets[id])) + for id, lang in enumerate(languages) + }, + ) + dataset = ConcatDataset(resampled_lang_datasets) + else: + dataset = ConcatDataset(lang_datasets) + lang_splits = [split] + for lang_id, lang_dataset in enumerate(lang_datasets): + split_name = split + "_" + languages[lang_id] + lang_splits.append(split_name) + self.datasets[split_name] = lang_dataset + + # [TODO]: This is hacky for now to print validation ppl for each + # language individually. Maybe need task API changes to allow it + # in more generic ways. + if split in self.args.valid_subset: + self.args.valid_subset = self.args.valid_subset.replace( + split, ",".join(lang_splits) + ) + + with data_utils.numpy_seed(self.args.seed + epoch): + shuffle = np.random.permutation(len(dataset)) + + self.datasets[split] = SortDataset( + dataset, + sort_order=[ + shuffle, + dataset.sizes, + ], + ) + + def build_dataset_for_inference( + self, src_tokens, src_lengths, language="en_XX", **kwargs + ): + """ + Generate batches for inference. We prepend an eos token to src_tokens + (or bos if `--add-bos-token` is set) and we append a <pad> to target. + This is convenient both for generation with a prefix and LM scoring. 
+ """ + dataset = StripTokenDataset( + TokenBlockDataset( + src_tokens, + src_lengths, + block_size=None, # ignored for "eos" break mode + pad=self.source_dictionary.pad(), + eos=self.source_dictionary.eos(), + break_mode="eos", + ), + # remove eos from (end of) target sequence + self.source_dictionary.eos(), + ) + + src_lang_idx = self.dictionary.index(lang_token(language)) + src_dataset = PrependTokenDataset( + dataset, + token=( + (src_lang_idx or self.source_dictionary.bos()) + if getattr(self.args, "add_bos_token", False) + else self.source_dictionary.eos() + ), + ) + + max_seq_len = max(src_lengths) + 1 + tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad()) + return NestedDictionaryDataset( + { + "id": IdDataset(), + "net_input": { + "src_tokens": PadDataset( + src_dataset, + pad_idx=self.source_dictionary.pad(), + left_pad=False, + pad_length=max_seq_len, + ), + "src_lengths": NumelDataset(src_dataset, reduce=False), + }, + "target": PadDataset( + tgt_dataset, + pad_idx=self.source_dictionary.pad(), + left_pad=False, + pad_length=max_seq_len, + ), + }, + sizes=[np.array(src_lengths)], + ) + + @torch.no_grad() + def inference_step( + self, + generator, + models, + sample, + language="en_XX", + prefix_tokens=None, + constraints=None, + ): + # Generation will always be conditioned on bos_token + if getattr(self.args, "add_bos_token", False): + src_lang_idx = self.dictionary.index(lang_token(language)) + bos_token = src_lang_idx or self.source_dictionary.bos() + else: + bos_token = self.source_dictionary.eos() + + if constraints is not None: + raise NotImplementedError( + "Constrained decoding with the language_modeling task is not supported" + ) + + # SequenceGenerator doesn't use src_tokens directly, we need to + # pass the `prefix_tokens` argument instead + if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement(): + prefix_tokens = sample["net_input"]["src_tokens"] + if prefix_tokens[:, 0].eq(bos_token).all(): + prefix_tokens = prefix_tokens[:, 1:] + + return generator.generate( + models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token + ) + + def eval_lm_dataloader( + self, + dataset, + max_tokens: Optional[int] = 36000, + batch_size: Optional[int] = None, + max_positions: Optional[int] = None, + num_shards: int = 1, + shard_id: int = 0, + num_workers: int = 1, + data_buffer_size: int = 10, + # ensures that every evaluated token has access to a context of at least + # this size, if possible + context_window: int = 0, + ): + if context_window > 0: + dataset = LMContextWindowDataset( + dataset=dataset, + tokens_per_sample=self.args.tokens_per_sample, + context_window=context_window, + pad_idx=self.source_dictionary.pad(), + ) + return self.get_batch_iterator( + dataset=dataset, + max_tokens=max_tokens, + max_sentences=batch_size, + max_positions=max_positions, + ignore_invalid_inputs=True, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + data_buffer_size=data_buffer_size, + ) + + @property + def source_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.dictionary + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.output_dictionary diff --git a/fairseq/tasks/multilingual_masked_lm.py b/fairseq/tasks/multilingual_masked_lm.py index 9e6ce4b8a2..156d085aa4 100644 --- a/fairseq/tasks/multilingual_masked_lm.py +++ b/fairseq/tasks/multilingual_masked_lm.py @@ -8,6 +8,7 @@ 
import numpy as np import torch + from fairseq import utils from fairseq.data import ( ConcatDataset, @@ -28,7 +29,6 @@ ) from fairseq.tasks import LegacyFairseqTask, register_task - logger = logging.getLogger(__name__) @@ -144,7 +144,7 @@ def _get_sample_prob(self, dataset_lens): languages by upsampling them. """ prob = dataset_lens / dataset_lens.sum() - smoothed_prob = prob ** self.args.multilang_sampling_alpha + smoothed_prob = prob**self.args.multilang_sampling_alpha smoothed_prob = smoothed_prob / smoothed_prob.sum() return smoothed_prob diff --git a/fairseq/tasks/multilingual_translation.py b/fairseq/tasks/multilingual_translation.py index 26e0b529d5..cef7656691 100644 --- a/fairseq/tasks/multilingual_translation.py +++ b/fairseq/tasks/multilingual_translation.py @@ -7,9 +7,11 @@ import logging import os from collections import OrderedDict +from argparse import ArgumentError import torch -from fairseq import metrics, options, utils +from fairseq import options, utils +from fairseq.logging import metrics from fairseq.data import ( Dictionary, LanguagePairDataset, @@ -77,10 +79,14 @@ def add_args(parser): help='pad the source on the left (default: True)') parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', help='pad the target on the left (default: False)') - parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the source sequence') - parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the target sequence') + try: + parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', + help='max number of tokens in the source sequence') + parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', + help='max number of tokens in the target sequence') + except ArgumentError: + # this might have already been defined. Once we transition this to hydra it should be fine to add it here. + pass parser.add_argument('--upsample-primary', default=1, type=int, help='amount to upsample primary dataset') parser.add_argument('--encoder-langtok', default=None, type=str, choices=['src', 'tgt'], @@ -276,7 +282,7 @@ def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None) eval_key=lang_pair, ) - def build_model(self, args): + def build_model(self, args, from_checkpoint=False): def check_args(): messages = [] if ( @@ -311,7 +317,7 @@ def check_args(): from fairseq import models - model = models.build_model(args, self) + model = models.build_model(args, self, from_checkpoint) if not isinstance(model, FairseqMultiModel): raise ValueError( "MultilingualTranslationTask requires a FairseqMultiModel architecture" diff --git a/fairseq/tasks/multires_hubert_pretraining.py b/fairseq/tasks/multires_hubert_pretraining.py new file mode 100644 index 0000000000..cfed147cb5 --- /dev/null +++ b/fairseq/tasks/multires_hubert_pretraining.py @@ -0,0 +1,204 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
+ +import logging +import os +import sys +from typing import Dict, List, Optional, Tuple + +import numpy as np + +from dataclasses import dataclass, field +from fairseq.data import Dictionary, HubertDataset +from fairseq.dataclass.configs import FairseqDataclass +from fairseq.tasks import register_task +from fairseq.tasks.fairseq_task import FairseqTask +from omegaconf import MISSING + +logger = logging.getLogger(__name__) + + +class LabelEncoder(object): + def __init__(self, dictionary: Dictionary) -> None: + self.dictionary = dictionary + + def __call__(self, label: str) -> List[str]: + return self.dictionary.encode_line( + label, + append_eos=False, + add_if_not_exist=False, + ) + + +@dataclass +class MultiresHubertPretrainingConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + fine_tuning: bool = field( + default=False, metadata={"help": "set to true if fine-tuning Hubert"} + ) + labels: List[str] = field( + default_factory=lambda: ["ltr50", "ltr25"], + metadata={ + "help": ( + "extension of the label files to load, frame-level labels for" + " pre-training, and sequence-level label for fine-tuning" + ) + }, + ) + label_dir: Optional[str] = field( + default=None, + metadata={ + "help": "if set, looks for labels in this directory instead", + }, + ) + label_rate: float = field( + default=-1.0, + metadata={"help": "label frame rate. -1.0 for sequence label"}, + ) + # label_rate: 1,2,2,5 + # (imply (1,2), (2,5)) + # if base label_rate = 50 + # (1,2), (2,5) --> label rates 50, 25, 10 + label_rate_ratios: List[int] = field(default=MISSING, metadata={"help": "tuple for label rates e.g., [(1,2), (2,5)]"}) + sample_rate: int = field( + default=16_000, + metadata={ + "help": "target sample rate. audio files will be up/down " + "sampled to this rate" + }, + ) + normalize: bool = field( + default=False, + metadata={"help": "if set, normalizes input to have 0 mean and unit variance"}, + ) + enable_padding: bool = field( + default=False, + metadata={"help": "pad shorter samples instead of cropping"}, + ) + max_keep_size: Optional[int] = field( + default=None, + metadata={"help": "exclude sample longer than this"}, + ) + max_sample_size: Optional[int] = field( + default=None, + metadata={"help": "max sample size to crop to for batching"}, + ) + min_sample_size: Optional[int] = field( + default=None, + metadata={"help": "min sample size to crop to for batching"}, + ) + random_crop: Optional[bool] = field( + default=True, + metadata={"help": "always crop from the beginning if false"}, + ) + pad_audio: Optional[bool] = field( + default=False, + metadata={"help": "pad audio to the longest one in the batch if true"}, + ) + + +@register_task("multires_hubert_pretraining", dataclass=MultiresHubertPretrainingConfig) +class MultiresHubertPretrainingTask(FairseqTask): + """ + Multiresolution HuBERT Pretraining Task. + The task is based on `HubertPretrainingTask` but extended to multiresolution. 
+    """
+
+    cfg: MultiresHubertPretrainingConfig
+
+    def __init__(
+        self,
+        cfg: MultiresHubertPretrainingConfig,
+    ) -> None:
+        super().__init__(cfg)
+
+        logger.info(f"current directory is {os.getcwd()}")
+        logger.info(f"MultiresHubertPretrainingTask Config {cfg}")
+
+        self.cfg = cfg
+        self.fine_tuning = cfg.fine_tuning
+
+        if cfg.fine_tuning:
+            self.state.add_factory("target_dictionary", self.load_dictionaries)
+            self.res_number = 1
+        else:
+            self.state.add_factory("dictionaries", self.load_dictionaries)
+
+        self.blank_symbol = "<s>"
+
+    @property
+    def source_dictionary(self) -> Optional[Dictionary]:
+        return None
+
+    @property
+    def target_dictionary(self) -> Optional[Dictionary]:
+        return self.state.target_dictionary
+
+    @property
+    def dictionaries(self) -> List[Dictionary]:
+        return self.state.dictionaries
+
+    @classmethod
+    def setup_task(
+        cls, cfg: MultiresHubertPretrainingConfig, **kwargs
+    ) -> "MultiresHubertPretrainingTask":
+        return cls(cfg)
+
+    def load_dictionaries(self):
+        label_dir = self.cfg.data if self.cfg.label_dir is None else self.cfg.label_dir
+        self.res_number = len(label_dir)
+        dictionaries = [(Dictionary.load(f"{label_dir}/dict.{label}.txt") if label != "" else None) for label in self.cfg.labels]
+        return dictionaries[0] if self.cfg.fine_tuning else dictionaries
+
+    def get_label_dir(self) -> str:
+        if self.cfg.label_dir is None:
+            return self.cfg.data
+        return self.cfg.label_dir
+
+    def load_dataset(self, split: str, **kwargs) -> None:
+        manifest = f"{self.cfg.data}/{split}.tsv"
+        dicts = [self.target_dictionary] if self.cfg.fine_tuning else self.dictionaries
+        pad_list = [(dict.pad() if dict is not None else None) for dict in dicts]
+        eos_list = [(dict.eos() if dict is not None else None) for dict in dicts]
+        procs = [LabelEncoder(dict) for dict in dicts]
+        paths = [(f"{self.get_label_dir()}/{split}.{l}" if l != "" else None) for l in self.cfg.labels]
+
+        base_rate = self.cfg.label_rate
+        self.label_rates = [base_rate]
+        label_rate_ratios = self.cfg.label_rate_ratios
+        self.label_rate_ratios = []
+        for i in range(len(label_rate_ratios) // 2):
+
+            upsample_rate, downsample_rate = label_rate_ratios[i * 2], label_rate_ratios[i * 2 + 1]
+            # parse label rate ratios
+            self.label_rate_ratios.append((upsample_rate, downsample_rate))
+            base_rate = base_rate * upsample_rate // downsample_rate
+            self.label_rates.append(base_rate)
+
+        # hubert v1: pad_audio=True, random_crop=False;
+        self.datasets[split] = HubertDataset(
+            manifest,
+            sample_rate=self.cfg.sample_rate,
+            label_paths=paths,
+            label_rates=self.label_rates,
+            pad_list=pad_list,
+            eos_list=eos_list,
+            label_processors=procs,
+            max_keep_sample_size=self.cfg.max_keep_size,
+            min_keep_sample_size=self.cfg.min_sample_size,
+            max_sample_size=self.cfg.max_sample_size,
+            pad_audio=self.cfg.pad_audio,
+            normalize=self.cfg.normalize,
+            store_labels=False,
+            random_crop=self.cfg.random_crop,
+        )
+
+    def max_positions(self) -> Tuple[int, int]:
+        return (sys.maxsize, sys.maxsize)
+
+    def filter_indices_by_size(self, indices: np.array, *args, **kwargs) -> np.array:
+        return indices
diff --git a/fairseq/tasks/nlu_finetuning.py b/fairseq/tasks/nlu_finetuning.py
new file mode 100644
index 0000000000..a335021335
--- /dev/null
+++ b/fairseq/tasks/nlu_finetuning.py
@@ -0,0 +1,477 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import logging
+import os
+import torch
+import json
+
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Optional, Any
+
+from fairseq.data import AddTargetDataset, Dictionary, encoders
+from fairseq.tasks.audio_pretraining import AudioPretrainingTask, AudioPretrainingConfig
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.configs import GenerationConfig
+from fairseq.data.text_compressor import TextCompressor, TextCompressionLevel
+
+from . import register_task
+from .. import utils
+from ..logging import metrics
+
+
+logger = logging.getLogger(__name__)
+
+
+class LabelEncoder(object):
+    def __init__(self, dictionary):
+        self.dictionary = dictionary
+
+    def __call__(self, label):
+        return self.dictionary.encode_line(
+            label, append_eos=False, add_if_not_exist=False
+        )
+
+
+def label_len_fn(label):
+    return len(label.split(" "))
+
+
+@dataclass
+class NLUFinetuningConfig(AudioPretrainingConfig):
+    # Options for reporting WER metrics during validation. Only applicable to
+    # Seq2Seq models during fine-tuning
+    eval_wer: bool = field(
+        default=False, metadata={"help": "compute WER for Seq2Seq models"}
+    )
+    eval_wer_parse: bool = field(
+        default=False, metadata={"help": "compute WER for Seq2Seq models"}
+    )
+    eval_wer_config: GenerationConfig = field(
+        default_factory=lambda: GenerationConfig(),
+        metadata={"help": "beam search config for evaluating wer during training"},
+    )
+    eval_wer_tokenizer: Any = field(
+        default=None,
+        metadata={"help": "tokenizer config for evaluating wer during training"},
+    )
+    eval_wer_post_process: str = field(
+        default="letter",
+        metadata={
+            "help": "remove BPE tokens before scoring (can be sentencepiece, letter, and more)"
+        },
+    )
+    eval_bleu: bool = field(
+        default=False, metadata={"help": "evaluation with BLEU scores"}
+    )
+    eval_bleu_detok: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "detokenize before computing BLEU (e.g., 'moses'); "
+            "required if using --eval-bleu; use 'space' to disable "
+            "detokenization; see fairseq.data.encoders for other options"
+        },
+    )
+    eval_bleu_detok_args: str = field(
+        default="{}", metadata={"help": "args for building the tokenizer, if needed"}
+    )
+    eval_tokenized_bleu: bool = field(
+        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
+    )
+    eval_bleu_remove_bpe: Optional[str] = field(
+        default=None, metadata={"help": "remove BPE before computing BLEU"}
+    )
+    eval_bleu_args: str = field(
+        default="{}",
+        metadata={
+            "help": "generation args for BLEU scoring, e.g., "
+            '\'{"beam": 4, "lenpen": 0.6}\''
+        },
+    )
+    eval_bleu_print_samples: bool = field(
+        default=False, metadata={"help": "print sample generations during validation"}
+    )
+    autoregressive: bool = field(
+        default=False,
+        metadata={
+            "help": "required for autoregressive decoders (like seq2seq models); "
+            "adds 'prev_output_tokens' to input and appends eos to target"
+        },
+    )
+
+
+@register_task("nlu_finetuning", dataclass=NLUFinetuningConfig)
+class NLUFinetuningTask(AudioPretrainingTask):
+    """ """
+
+    cfg: NLUFinetuningConfig
+
+    def __init__(
+        self,
+        cfg: NLUFinetuningConfig,
+    ):
+        super().__init__(cfg)
+        self.blank_symbol = "<s>"
+
+        self.state.add_factory("target_dictionary", self.load_target_dictionary)
+
+    def load_target_dictionary(self):
+        if self.cfg.labels:
+            dict_path = os.path.join(self.cfg.data,
f"dict.{self.cfg.labels}.txt") + return Dictionary.load(dict_path) + return None + + def load_dataset(self, split: str, task_cfg: NLUFinetuningConfig = None, **kwargs): + super().load_dataset(split, task_cfg, **kwargs) + + task_cfg = task_cfg or self.cfg + assert task_cfg.labels is not None + text_compression_level = getattr( + TextCompressionLevel, str(self.cfg.text_compression_level) + ) + data_path = self.cfg.data + label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}") + skipped_indices = getattr(self.datasets[split], "skipped_indices", set()) + text_compressor = TextCompressor(level=text_compression_level) + with open(label_path, "r") as f: + labels = [ + text_compressor.compress(l) + for i, l in enumerate(f) + if i not in skipped_indices + ] + + assert len(labels) == len(self.datasets[split]), ( + f"labels length ({len(labels)}) and dataset length " + f"({len(self.datasets[split])}) do not match" + ) + + process_label = LabelEncoder(self.target_dictionary) + + self.datasets[split] = AddTargetDataset( + self.datasets[split], + labels, + pad=self.target_dictionary.pad(), + eos=self.target_dictionary.eos(), + batch_targets=True, + process_label=process_label, + label_len_fn=label_len_fn, + add_to_input=task_cfg.get("autoregressive", False), + text_compression_level=text_compression_level, + ) + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.state.target_dictionary + + def valid_step(self, sample, model, criterion): + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + if self.cfg.eval_wer_parse and self.cfg.autoregressive: + metrics = self._inference_with_wer_parse( + self.sequence_generator, sample, model + ) + logging_output["_num_char_errors"] = metrics["num_char_errors"] + logging_output["_num_chars"] = metrics["num_chars"] + logging_output["_num_word_errors"] = metrics["num_word_errors"] + logging_output["_num_words"] = metrics["num_words"] + logging_output["_num_em_errors"] = metrics["num_em_errors"] + logging_output["_num_ems"] = metrics["num_ems"] + logging_output["_num_tree_errors"] = metrics["num_tree_errors"] + logging_output["_num_trees"] = metrics["num_trees"] + if self.cfg.eval_wer and self.cfg.autoregressive: + metrics = self._inference_with_wer(self.sequence_generator, sample, model) + logging_output["_num_char_errors"] = metrics["num_char_errors"] + logging_output["_num_chars"] = metrics["num_chars"] + logging_output["_num_word_errors"] = metrics["num_word_errors"] + logging_output["_num_words"] = metrics["num_words"] + if self.cfg.eval_bleu and self.cfg.autoregressive: + metrics = self._inference_with_bleu(self.sequence_generator, sample, model) + logging_output["_bleu_sys_len"] = metrics.sys_len + logging_output["_bleu_ref_len"] = metrics.ref_len + # we split counts into separate entries so that they can be + # summed efficiently across workers using fast-stat-sync + assert len(metrics.counts) == 4 + for i in range(4): + logging_output[f"_bleu_counts_{i}"] = metrics.counts[i] + logging_output[f"_bleu_totals_{i}"] = metrics.totals[i] + return loss, sample_size, logging_output + + def build_model(self, model_cfg: FairseqDataclass): + model = super().build_model(model_cfg) + + if (self.cfg.eval_wer or self.cfg.eval_wer_parse) and self.cfg.autoregressive: + self.sequence_generator = self.build_generator( + [model], + self.cfg.eval_wer_config, + ) + if self.cfg.eval_wer_tokenizer: + self.tokenizer = 
encoders.build_tokenizer(self.cfg.eval_wer_tokenizer) + else: + self.tokenizer = None + if self.cfg.eval_bleu and self.cfg.autoregressive: + assert self.cfg.eval_bleu_detok is not None, ( + "--eval-bleu-detok is required if using --eval-bleu; " + "try --eval-bleu-detok=moses (or --eval-bleu-detok=space " + "to disable detokenization, e.g., when using sentencepiece)" + ) + detok_args = json.loads(self.cfg.eval_bleu_detok_args) + self.tokenizer = encoders.build_tokenizer( + Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args) + ) + gen_args = json.loads(self.cfg.eval_bleu_args) + gen_args = Namespace(**gen_args) + self.sequence_generator = self.build_generator([model], gen_args) + + return model + + def _inference_with_wer_parse(self, generator, sample, model): + import editdistance + + def decode(toks): + s = self.target_dictionary.string( + toks.int().cpu(), + self.cfg.eval_wer_post_process, + escape_unk=True, + ) + if self.tokenizer: + s = self.tokenizer.decode(s) + return s + + def decode_to_list(toks): + def token_string(i): + if i == self.target_dictionary.unk(): + return self.target_dictionary.unk_string(False) + else: + return self.target_dictionary[i] + + return [token_string(i) for i in toks] + + def is_ont_token(token): + return "[" in token or "]" in token + + def post_process(l): + o = [] + for w in l: + if w == self.target_dictionary.eos_word or w == "|": + continue + if w == "_": + o.append(" ") + else: + o.append(w) + if is_ont_token(w): + o.append(" ") + return o + + num_word_errors, num_char_errors = 0, 0 + num_chars, num_words = 0, 0 + num_em_errors, num_ems = 0, 0 + num_tree_errors, num_trees = 0, 0 + gen_out = self.inference_step(generator, [model], sample, None) + for i in range(len(gen_out)): + hyp_tokens = gen_out[i][0]["tokens"] + # hyp = decode(hyp_tokens) + ref_tokens = utils.strip_pad( + sample["target"][i], self.target_dictionary.pad() + ) + # ref = decode(ref_tokens) + hyp_list = decode_to_list(hyp_tokens) + ref_list = decode_to_list(ref_tokens) + + hyp_list = post_process(hyp_list) + ref_list = post_process(ref_list) + + hyp = "".join(hyp_list).strip() + ref = "".join(ref_list).strip() + num_chars += len(ref) + num_char_errors += editdistance.eval(hyp, ref) + hyp_words = hyp.split() + ref_words = ref.split() + hyp_tree = [word for word in hyp_list if ("[" in word or "]" in word)] + ref_tree = [word for word in ref_list if ("[" in word or "]" in word)] + # num_word_errors += editdistance.eval(hyp_words, ref_words) + hyp_before = decode(hyp_tokens).split() + ref_before = decode(ref_tokens).split() + + num_word_errors += editdistance.eval(hyp_before, ref_before) + num_words += len(ref_before) + if hyp != ref: + num_em_errors += 1 + if hyp_tree != ref_tree: + num_tree_errors += 1 + num_ems += 1 + num_trees += 1 + + return { + "num_char_errors": num_char_errors, + "num_chars": num_chars, + "num_word_errors": num_word_errors, + "num_words": num_words, + "num_ems": num_ems, + "num_em_errors": num_em_errors, + "num_trees": num_trees, + "num_tree_errors": num_tree_errors, + } + + def _inference_with_wer(self, generator, sample, model): + import editdistance + + def decode(toks): + s = self.target_dictionary.string( + toks.int().cpu(), + self.cfg.eval_wer_post_process, + escape_unk=True, + ) + if self.tokenizer: + s = self.tokenizer.decode(s) + return s + + num_word_errors, num_char_errors = 0, 0 + num_chars, num_words = 0, 0 + gen_out = self.inference_step(generator, [model], sample, None) + for i in range(len(gen_out)): + hyp = decode(gen_out[i][0]["tokens"]) + 
ref = decode( + utils.strip_pad(sample["target"][i], self.target_dictionary.pad()), + ) + num_char_errors += editdistance.eval(hyp, ref) + num_chars += len(ref) + hyp_words = hyp.split() + ref_words = ref.split() + num_word_errors += editdistance.eval(hyp_words, ref_words) + num_words += len(ref_words) + + return { + "num_char_errors": num_char_errors, + "num_chars": num_chars, + "num_word_errors": num_word_errors, + "num_words": num_words, + } + + def _inference_with_bleu(self, generator, sample, model): + import sacrebleu + + def decode(toks, is_ref): + s = self.target_dictionary.string( + toks.int().cpu(), + self.cfg.eval_bleu_remove_bpe, + # The default unknown string in fairseq is `<unk>`, but + # this is tokenized by sacrebleu as `< unk >`, inflating + # BLEU scores. Instead, we use a somewhat more verbose + # alternative that is unlikely to appear in the real + # reference, but doesn't get split into multiple tokens. + unk_string=("UNKNOWNTOKENINREF" if is_ref else "UNKNOWNTOKENINHYP"), + ) + if self.tokenizer: + s = self.tokenizer.decode(s) + return s + + gen_out = self.inference_step(generator, [model], sample) + hyps, refs = [], [] + for i in range(len(gen_out)): + hyps.append(decode(gen_out[i][0]["tokens"], is_ref=False)) + refs.append( + decode( + utils.strip_pad(sample["target"][i], self.target_dictionary.pad()), + is_ref=True, # don't count <unk> as matches to the hypo + ) + ) + if self.cfg.eval_bleu_print_samples: + logger.info("H-{} {}".format(sample["id"][0], hyps[0])) + logger.info("T-{} {}".format(sample["id"][0], refs[0])) + + eval_tokenization = "none" if self.cfg.eval_tokenized_bleu else "13a" + return sacrebleu.corpus_bleu(hyps, [refs], tokenize=eval_tokenization) + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + + if self.cfg.eval_wer or self.cfg.eval_wer_parse: + zero = torch.scalar_tensor(0.0) + num_char_errors = sum( + log.get("_num_char_errors", zero) for log in logging_outputs + ) + num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs) + num_word_errors = sum( + log.get("_num_word_errors", zero) for log in logging_outputs + ) + num_words = sum(log.get("_num_words", zero) for log in logging_outputs) + metrics.log_scalar("_num_char_errors", num_char_errors) + metrics.log_scalar("_num_chars", num_chars) + metrics.log_scalar("_num_word_errors", num_word_errors) + metrics.log_scalar("_num_words", num_words) + if num_chars > 0: + metrics.log_derived( + "uer", + lambda meters: meters["_num_char_errors"].sum + * 100.0 + / meters["_num_chars"].sum + if meters["_num_chars"].sum > 0 + else float("nan"), + ) + if num_words > 0: + metrics.log_derived( + "wer", + lambda meters: meters["_num_word_errors"].sum + * 100.0 + / meters["_num_words"].sum + if meters["_num_words"].sum > 0 + else float("nan"), + ) + if self.cfg.eval_wer_parse: + num_em_errors = sum( + log.get("_num_em_errors", zero) for log in logging_outputs + ) + num_ems = sum(log.get("_num_ems", zero) for log in logging_outputs) + metrics.log_scalar("_num_em_errors", num_em_errors) + metrics.log_scalar("_num_ems", num_ems) + num_tree_errors = sum( + log.get("_num_tree_errors", zero) for log in logging_outputs + ) + num_trees = sum(log.get("_num_trees", zero) for log in logging_outputs) + metrics.log_scalar("_num_tree_errors", num_tree_errors) + metrics.log_scalar("_num_trees", num_trees) + + if num_ems > 0: + metrics.log_derived( + "em_error", + lambda meters: meters["_num_em_errors"].sum + * 100.0 + / meters["_num_ems"].sum + if 
meters["_num_ems"].sum > 0 + else float("nan"), + ) + if num_trees > 0: + metrics.log_derived( + "tree_error", + lambda meters: meters["_num_tree_errors"].sum + * 100.0 + / meters["_num_trees"].sum + if meters["_num_trees"].sum > 0 + else float("nan"), + ) + + if self.cfg.eval_bleu: + len_keys = ["_bleu_sys_len", "_bleu_ref_len"] + count_keys = [f"_bleu_counts_{i}" for i in range(4)] + total_keys = [f"_bleu_totals_{i}" for i in range(4)] + for k in len_keys + count_keys + total_keys: + metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs)) + + import sacrebleu + + metrics.log_derived( + "bleu", + lambda meters: sacrebleu.compute_bleu( + correct=[meters[k].sum for k in count_keys], + total=[meters[k].sum for k in total_keys], + sys_len=meters["_bleu_sys_len"].sum, + ref_len=meters["_bleu_ref_len"].sum, + smooth_method="exp", + ).score, + ) diff --git a/fairseq/tasks/online_backtranslation.py b/fairseq/tasks/online_backtranslation.py new file mode 100644 index 0000000000..da24fe8981 --- /dev/null +++ b/fairseq/tasks/online_backtranslation.py @@ -0,0 +1,683 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import json +import logging +import math +import os +from argparse import Namespace +from collections import OrderedDict, defaultdict +from pathlib import Path +from typing import Dict, Sequence, Tuple +from argparse import ArgumentError + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +import fairseq +from fairseq import options, utils +from fairseq.logging import metrics +from fairseq.data import ( + FairseqDataset, + LanguagePairDataset, + NoisingDataset, + PrependTokenDataset, + RoundRobinZipDatasets, + TransformEosLangPairDataset, + data_utils, + encoders, +) +from fairseq.sequence_generator import SequenceGenerator +from fairseq.tasks import register_task +from fairseq.tasks.translation import TranslationTask, load_langpair_dataset + +logger = logging.getLogger(__name__) + + +class PiecewiseLinearFn: + """Piecewise linear function. Can be configured with a string.""" + + def __init__(self, pieces: Sequence[Tuple[int, float]]): + assert pieces == sorted( + pieces + ), f"PiecewiseLinearFn configuration should be sorted, received: {pieces}" + + self.pieces = pieces + + def __call__(self, x: int) -> float: + for i, (x_a, y_a) in enumerate(self.pieces[:-1]): + x_b, y_b = self.pieces[i + 1] + if x_a <= x <= x_b: + return y_a + (x - x_a) * (y_b - y_a) / (x_b - x_a) + + return self.pieces[-1][1] + + @staticmethod + def from_string(configuration: str) -> "PiecewiseLinearFn": + """ + Parse the configuration of lambda coefficient (for scheduling). 
+ x = "3" # lambda will be a constant equal to x + x = "0:1,1000:0" # lambda will start from 1 and linearly decrease + # to 0 during the first 1000 iterations + x = "0:0,1000:0,2000:1" # lambda will be equal to 0 for the first 1000 + # iterations, then will linearly increase to 1 until iteration 2000 + """ + if isinstance(configuration, float): + return PiecewiseLinearFn([(0, configuration)]) + + try: + parts = configuration.split(",") + if len(parts) == 1: + v = float(configuration) + return PiecewiseLinearFn([(0, v)]) + + split = [s.split(":") for s in parts] + pieces = [(int(t), float(v)) for t, v in split] + return PiecewiseLinearFn(pieces) + except Exception: + raise ValueError( + f"Invalid PiecewiseLinearFn configuration: {configuration!r}" + ) + + @staticmethod + def one() -> "PiecewiseLinearFn": + return PiecewiseLinearFn([(0, 1.0)]) + + +@register_task("online_backtranslation") +class OnlineBackTranslationTask(TranslationTask): + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + # fmt: off + # Generic translation args + parser.add_argument('data', help='colon separated path to data directories list, \ + will be iterated upon during epochs in round-robin manner; \ + however, valid and test data are always in the first directory to \ + avoid the need for repeating them in all directories') + parser.add_argument('--mono-langs', metavar='MONO_LANGS', + help='monolingual languages for training') + parser.add_argument('--valid-lang-pairs', default=None, metavar='VALID_LANG_PAIRS', + help='language pairs for validation') + parser.add_argument('--load-alignments', action='store_true', + help='load the binarized alignments') + parser.add_argument('--left-pad-source', default='False', type=str, metavar='BOOL', + help='pad the source on the left') + parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', + help='pad the target on the left') + parser.add_argument('--upsample-primary', default=1, type=int, + help='amount to upsample primary dataset') + try: + parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', + help='max number of tokens in the source sequence') + parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', + help='max number of tokens in the target sequence') + except ArgumentError: + # this might have already been defined. Once we transition this to hydra it should be fine to add it here. 
+ pass + parser.add_argument('--truncate-source', action='store_true', default=False, + help='truncate source to max-source-positions') + parser.add_argument('--num-batch-buckets', default=0, type=int, metavar='N', + help='if >0, then bucket source and target lengths into N ' + 'buckets and pad accordingly; this is useful on TPUs ' + 'to minimize the number of compilations') + + # Denoising args + parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N', + help='maximum word shuffle distance for denoising autoencoding data generation') + parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N', + help='word dropout probability for denoising autoencoding data generation') + parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N', + help='word blanking probability for denoising autoencoding data generation') + + # Backtranslation args + parser.add_argument('--lambda-bt', default="1.0", type=str, metavar='N', + help='back-translation weight') + parser.add_argument('--lambda-dae', default="1.0", type=str, metavar='N', + help='denoising auto-encoder weight') + + # Evaluation args + parser.add_argument('--generate-one-by-one', action='store_true', + help='generate one sentence at a time for backtranslation') + + parser.add_argument('--eval-bleu', action='store_true', + help='evaluation with BLEU scores') + parser.add_argument('--eval-bleu-detok', type=str, default="space", + help='detokenize before computing BLEU (e.g., "moses"); ' + 'required if using --eval-bleu; use "space" to ' + 'disable detokenization; see fairseq.data.encoders ' + 'for other options') + parser.add_argument('--eval-bleu-detok-args', type=str, metavar='JSON', + help='args for building the tokenizer, if needed') + parser.add_argument('--eval-tokenized-bleu', action='store_true', default=False, + help='compute tokenized BLEU instead of sacrebleu') + parser.add_argument('--eval-bleu-remove-bpe', nargs='?', const='@@ ', default=None, + help='remove BPE before computing BLEU') + parser.add_argument('--eval-bleu-args', type=str, metavar='JSON', + help='generation args for BLUE scoring, ' + 'e.g., \'{"beam": 4, "lenpen": 0.6}\'') + parser.add_argument('--eval-bleu-print-samples', action='store_true', + help='print sample generations during validation') + # fmt: on + + def __init__(self, args, common_dict, mono_langs, valid_lang_pairs): + super().__init__(args, common_dict, common_dict) + self.common_dict = common_dict + self.mono_langs = mono_langs + self.valid_lang_pairs = valid_lang_pairs + + self.SHOW_SAMPLES_INTERVAL = 1000 + # Start by showing samples + self._show_samples_ctr = self.SHOW_SAMPLES_INTERVAL + self.SHOW_SAMPLES_NUMBER = 5 + self.lambda_bt = PiecewiseLinearFn.from_string(args.lambda_bt) + self.lambda_dae = PiecewiseLinearFn.from_string(args.lambda_dae) + + self.args = args + self.data = utils.split_paths(self.args.data) + if len(self.data) == 1: + shards = list(Path(self.data[0]).glob("shard*")) + if len(shards) > 0: + # keep this as strings, since it can also be a manifold path + old_data = self.data + self.data = [str(shard) for shard in shards] + logging.warning(f"Expanded data directory {old_data} to {self.data}") + + @classmethod + def setup_task(cls, args, **kwargs): + """Setup the task (e.g., load dictionaries). 
+ + Args: + args (argparse.Namespace): parsed command-line arguments + """ + args.left_pad_source = options.eval_bool(args.left_pad_source) + args.left_pad_target = options.eval_bool(args.left_pad_target) + + paths = utils.split_paths(args.data) + assert len(paths) > 0 + assert args.mono_langs is not None + + mono_langs = args.mono_langs.split(",") + valid_lang_pairs = args.valid_lang_pairs.split(",") + + # load dictionary + dict_path = os.path.join(paths[0], "dict.txt") + common_dict = cls.load_dictionary(dict_path) + + return cls(args, common_dict, mono_langs, valid_lang_pairs) + + def load_dataset(self, split, epoch=1, combine=False, **kwargs) -> FairseqDataset: + """Load a given dataset split. + + Args: + split (str): name of the split (e.g., train, valid, test) + """ + if split == "train": + data_path = self.data[(epoch - 1) % len(self.data)] + dataset = self.load_train_dataset(data_path) + else: + # valid/test should always be the same. + dataset = self.load_translation_dataset(split, self.data[0]) + + self.datasets[split] = dataset + return dataset + + def load_train_dataset(self, data_path: str) -> FairseqDataset: + """The training dataset is made of backtranslation dataset and denoising dataset.""" + data = [] + for lang in self.mono_langs: + train_path = os.path.join(data_path, lang, "train") + # TODO: could we do the BT using denoise sample ? + # this would half the data loading work + data.append((f"{lang}-BT", self.load_bt_dataset(train_path, lang))) + data.append( + (f"{lang}-DENOISE", self.load_denoise_dataset(train_path, lang)) + ) + + return RoundRobinZipDatasets(OrderedDict(data)) + + def _langpair_dataset( + self, src: FairseqDataset, tgt: FairseqDataset + ) -> LanguagePairDataset: + return LanguagePairDataset( + src, + src.sizes, + self.dictionary, + tgt=tgt, + tgt_sizes=tgt.sizes, + tgt_dict=self.dictionary, + left_pad_source=self.args.left_pad_source, + left_pad_target=self.args.left_pad_target, + # TODO: should we shuffle ? we are already sorting batch by sizes so ? + # shuffle=True, + ) + + def _prepend_lang_bos_to_target( + self, dataset: LanguagePairDataset, lang: str + ) -> LanguagePairDataset: + bos = _lang_token_index(self.dictionary, lang) + return TransformEosLangPairDataset( + dataset, + src_eos=self.dictionary.eos(), + new_src_eos=self.dictionary.eos(), + tgt_bos=self.dictionary.eos(), + new_tgt_bos=bos, + ) + + def load_bt_dataset(self, data_path: str, lang: str) -> FairseqDataset: + """The BT dataset is generated with (tgt, tgt) pairs. + The actual translation to a (generated_src, tgt) pair + is done on the fly during training. 
+ """ + mono_dataset = data_utils.load_indexed_dataset( + data_path, self.common_dict, self.args.dataset_impl + ) + assert mono_dataset is not None, f"No dataset found for {lang}" + + mono_dataset_src = PrependTokenDataset( + mono_dataset, _lang_token_index(self.dictionary, lang) + ) + + mono_dataset_bt = self._langpair_dataset(mono_dataset_src, mono_dataset) + logger.info( + f"mono_lang = {lang} " + f"lang token index = {_lang_token_index(self.dictionary, lang)} " + f"lang token = {_lang_token(lang)}" + ) + + mono_dataset_bt = self._prepend_lang_bos_to_target(mono_dataset_bt, lang) + return mono_dataset_bt + + def load_denoise_dataset(self, data_path: str, lang: str) -> FairseqDataset: + """Classic denoising dataset""" + dataset = data_utils.load_indexed_dataset( + data_path, self.common_dict, self.args.dataset_impl + ) + noisy_dataset = NoisingDataset( + dataset, + self.dictionary, + seed=1, + max_word_shuffle_distance=self.args.max_word_shuffle_distance, + word_dropout_prob=self.args.word_dropout_prob, + word_blanking_prob=self.args.word_blanking_prob, + ) + noisy_dataset = PrependTokenDataset( + noisy_dataset, _lang_token_index(self.dictionary, lang) + ) + + clean_dataset = data_utils.load_indexed_dataset( + data_path, self.common_dict, self.args.dataset_impl + ) + denoising_dataset = self._langpair_dataset(noisy_dataset, clean_dataset) + denoising_dataset = self._prepend_lang_bos_to_target(denoising_dataset, lang) + return denoising_dataset + + def load_translation_dataset( + self, split: str, data_path: str, combine: bool = False + ): + # only judging with one language pair for the moment, + # since ConcatDataset doesn't work as expected + assert len(self.valid_lang_pairs) == 1, "For now..." + valid_lang_pair = self.valid_lang_pairs[0] + src, tgt = valid_lang_pair.split("-") + + # use the same function than TranslationTask + src_tgt_dt = load_langpair_dataset( + data_path, + split, + src, + self.common_dict, + tgt, + self.common_dict, + combine=combine, + dataset_impl=self.args.dataset_impl, + upsample_primary=self.args.upsample_primary, + left_pad_source=self.args.left_pad_source, + left_pad_target=self.args.left_pad_target, + max_source_positions=self.args.max_source_positions, + max_target_positions=self.args.max_target_positions, + load_alignments=self.args.load_alignments, + truncate_source=self.args.truncate_source, + num_buckets=self.args.num_batch_buckets, + shuffle=(split != "test"), + prepend_bos_src=_lang_token_index(self.dictionary, src), + ) + + src_tgt_eos_dt = self._prepend_lang_bos_to_target(src_tgt_dt, tgt) + src_tgt_eos_dt.args = self.args + return src_tgt_eos_dt + + def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): + raise NotImplementedError + + def build_model(self, args, from_checkpoint=False): + # torch.autograd.set_detect_anomaly(True) + model = super().build_model(args, from_checkpoint) + + add_secial_tokens_to_dict_and_model(self.common_dict, model, self.mono_langs) + + self.sequence_generators = {} + for mono_lang in self.mono_langs: + self.sequence_generators[mono_lang] = SequenceGenerator( + [model], + tgt_dict=self.dictionary, + beam_size=1, + max_len_a=1.3, + max_len_b=5, + min_len=5, + # keep 1 to be able to prepend bos + max_len=model.max_decoder_positions() - 1, + ) + + if getattr(args, "eval_bleu", False): + assert getattr(args, "eval_bleu_detok", None) is not None, ( + "--eval-bleu-detok is required if using --eval-bleu; " + "try --eval-bleu-detok=moses (or --eval-bleu-detok=space " + "to disable detokenization, 
e.g., when using sentencepiece)" + ) + detok_args = json.loads(getattr(args, "eval_bleu_detok_args", "{}") or "{}") + self.tokenizer = encoders.build_tokenizer( + Namespace( + tokenizer=getattr(args, "eval_bleu_detok", None), **detok_args + ) + ) + + gen_args = json.loads(getattr(args, "eval_bleu_args", "{}") or "{}") + self.bleu_sequence_generator = self.build_generator( + [model], Namespace(**gen_args) + ) + + return model + + def max_positions(self): + """Return the max sentence length allowed by the task.""" + return (self.args.max_source_positions, self.args.max_target_positions) + + @property + def dictionary(self): + """Return the source :class:`~fairseq.data.Dictionary`.""" + return self.common_dict + + def display_samples_once_in_a_while(self, smp, mono_lang, other_lang): + self._show_samples_ctr += 1 + if self._show_samples_ctr < self.SHOW_SAMPLES_INTERVAL: + return + self._show_samples_ctr = 0 + + ln = smp["net_input"]["src_tokens"].shape[0] + + logger.info( + f"(r:{self.args.distributed_rank}) : " + f"{other_lang} ---> {mono_lang} " + f"({other_lang} was generated by back-translation.) {ln} samples" + ) + + for i in range(min(ln, self.SHOW_SAMPLES_NUMBER)): + src_tokens = smp["net_input"]["src_tokens"][i] + tgt_tokens = smp["target"][i] + + src_str = self.dictionary.string(src_tokens, "sentencepiece") + tgt_str = self.dictionary.string(tgt_tokens, "sentencepiece") + logger.info( + f"\n{i}\t\t[{other_lang} generated] {src_str}\n" + f"\t\t[{mono_lang} original ] {tgt_str}\n" + f"\t\t[ src tokens] {src_tokens}\n" + ) + + def backtranslate_sample(self, smp, orig_lang, other_lang) -> None: + """ + * WARNING: smp is modified in place. + * At the start of this function, `smp` has the same input and target: + |--------------------------------------------------------| + | smp['net_input']['src_tokens'] | smp['target'] | + | (from data) __en__ hello world | __en__ hello world | + |--------------------------------------------------------| + + * We call generator.generate(smp, bos_token = token("ro")), + and copy the result as input + * At the end, `smp` has the translation to other language. 
+ |--------------------------------------------------------| + | smp['net_input']['src_tokens'] | smp['target'] | + | (generated) __ro__ salut lume | __en__ hello world | + |--------------------------------------------------------| + + """ + bos_token = _lang_token_index(self.dictionary, other_lang) + generated = self.sequence_generators[orig_lang].generate( + models=[], sample=smp, bos_token=bos_token + ) + + max_lngth = max([gn[0]["tokens"].size(0) for gn in generated]) + net_input = smp["net_input"] + n_src_tokens = torch.empty( + size=(len(generated), max_lngth + 1), dtype=net_input["src_tokens"].dtype + ) + n_src_lengths = torch.empty( + len(generated), dtype=net_input["src_lengths"].dtype + ) + + for i, gn in enumerate(generated): + tokens = gn[0]["tokens"] + tokens_size = tokens.size(0) + padding_needed = max_lngth - tokens_size + tokens = torch.cat([tokens.new([bos_token]), tokens]) + tokens = F.pad(tokens, (0, padding_needed), value=self.dictionary.pad()) + n_src_tokens[i] = tokens + n_src_lengths[i] = tokens_size + 1 + + device = net_input["src_tokens"].device + # This seems to be important + del net_input["src_tokens"] + del net_input["src_lengths"] + net_input["src_tokens"] = n_src_tokens.to(device) + net_input["src_lengths"] = n_src_lengths.to(device) + + def generate(self, smp, model): + model.eval() + orig_lang = ( + self.dictionary[smp["net_input"]["src_tokens"][0][0]] + .replace(" ", "") + .replace("_", "") + ) + bos_token = smp["net_input"]["prev_output_tokens"][0][0] + with torch.no_grad(): + generated = self.sequence_generators[orig_lang].generate( + models=[model], sample=smp, bos_token=bos_token + ) + return generated + + def get_other_lang(self, lang): + # TODO: allow more complex mapping + if lang != self.mono_langs[0]: + return self.mono_langs[0] + if len(self.mono_langs) == 2: + return self.mono_langs[1] + return self.mono_langs[np.random.randint(1, len(self.mono_langs))] + + def train_step( + self, sample, model, criterion, optimizer, update_num, ignore_grad=False + ): + + model.train() + model.set_num_updates(update_num) + + agg_loss, agg_sample_size = 0.0, 0.0 + agg_logging_output: Dict[str, float] = defaultdict(float) + + dataset_keys = self.datasets["train"].datasets.keys() + + weights = { + "BT": self.lambda_bt(update_num), + "DENOISE": self.lambda_dae(update_num), + } + log_keys = {"BT": "bt_", "DENOISE": "dae_"} + + for dataset_key in dataset_keys: + smp = sample[dataset_key] + mono_lang, task_subtype = dataset_key.split("-") + if weights[task_subtype] == 0: + continue + + if task_subtype == "BT": + with torch.autograd.profiler.record_function("backtranslation"): + model.eval() + # TODO: Could we translate to several language at once ? + # this would allow to share encoder_out and maximize GPU usage. 
+ other_lang = self.get_other_lang(mono_lang) + self.backtranslate_sample(smp, mono_lang, other_lang) + self.display_samples_once_in_a_while(smp, mono_lang, other_lang) + model.train() + + # Like in FairseqTask.train_step + with torch.autograd.profiler.record_function("forward"): + loss, sample_size, logging_output = criterion(model, smp) + loss *= weights[task_subtype] + if ignore_grad: + loss *= 0 + with torch.autograd.profiler.record_function("backward"): + optimizer.backward(loss) + + agg_loss += loss.item() + agg_sample_size += sample_size + for k in logging_output: + agg_logging_output[log_keys[task_subtype] + k] += logging_output[k] + agg_logging_output[k] += logging_output[k] + + return agg_loss, agg_sample_size, agg_logging_output + + def get_bos_token_from_sample(self, sample): + net_input = sample["net_input"] + source_lang_token_id = torch.unique(net_input["src_tokens"][:, 0]).item() + source_lang_token = self.dictionary[source_lang_token_id].replace("_", "") + target_lang_token_id = _lang_token_index( + self.dictionary, self.get_other_lang(source_lang_token) + ) + + return target_lang_token_id + + def reduce_metrics(self, logging_outputs, criterion): + super().reduce_metrics(logging_outputs, criterion) + bt_sample_size = sum(x.get("bt_sample_size", 0) for x in logging_outputs) + if bt_sample_size: + bt_loss_sum = sum(x.get("bt_loss", 0) for x in logging_outputs) + bt_loss_sum *= 1 / bt_sample_size / math.log(2) + metrics.log_scalar("bt_loss", bt_loss_sum, bt_sample_size, round=3) + + bt_nll_loss_sum = sum(x.get("bt_nll_loss", 0) for x in logging_outputs) + bt_ntokens = sum(x.get("bt_ntokens", 0) for x in logging_outputs) + bt_nll_loss_sum *= 1 / bt_ntokens / math.log(2) + metrics.log_scalar("bt_nll_loss", bt_nll_loss_sum, bt_ntokens, round=3) + metrics.log_derived( + "bt_ppl", lambda meters: utils.get_perplexity(meters["bt_nll_loss"].avg) + ) + + dae_sample_size = sum(x.get("dae_sample_size", 0) for x in logging_outputs) + if dae_sample_size: + dae_loss_sum = sum(x.get("dae_loss", 0) for x in logging_outputs) + dae_loss_sum *= 1 / dae_sample_size / math.log(2) + metrics.log_scalar("dae_loss", dae_loss_sum, dae_sample_size, round=3) + + dae_nll_loss_sum = sum(x.get("dae_nll_loss", 0) for x in logging_outputs) + dae_ntokens = sum(x.get("dae_ntokens", 0) for x in logging_outputs) + dae_nll_loss_sum *= 1 / dae_ntokens / math.log(2) + metrics.log_scalar("dae_nll_loss", dae_nll_loss_sum, dae_ntokens, round=3) + metrics.log_derived( + "dae_ppl", + lambda meters: utils.get_perplexity(meters["dae_nll_loss"].avg), + ) + + +@torch.no_grad() +def extend_embedding( + emb: nn.Module, new_vocab_size: int, copy_from_token_id: int +) -> None: + old_emb_data = emb.weight.data + (old_vocab_size, dim) = old_emb_data.shape + assert new_vocab_size >= old_vocab_size + + if new_vocab_size > old_vocab_size: + emb.weight.data = torch.zeros((new_vocab_size, dim)) + emb.weight.data[:old_vocab_size, :] = old_emb_data + # initialize new embeddings + emb.weight.data[old_vocab_size:, :] = old_emb_data[copy_from_token_id] + if hasattr(emb, "num_embeddings"): + emb.num_embeddings = new_vocab_size + if hasattr(emb, "out_features"): + emb.out_features = new_vocab_size + + if getattr(emb, "bias", None) is None: + return + + # Fix the bias. + # Bias shape can be different from the previous vocab size + # if the weight matrix was shared and alread extended but not the bias. 
+ (old_vocab_size,) = emb.bias.shape + assert new_vocab_size >= old_vocab_size + if new_vocab_size > old_vocab_size: + old_bias = emb.bias.data + new_bias = torch.zeros( + (new_vocab_size,), dtype=old_bias.dtype, device=old_bias.device + ) + new_bias[:old_vocab_size] = old_bias + emb.bias.data = new_bias + + +def add_secial_tokens_to_dict_and_model( + dictionary: "fairseq.data.Dictionary", + model: nn.Module, + mono_langs: Sequence[str], +) -> None: + embs = model.encoder.embed_tokens + vocab_size, embedding_dim = embs.weight.shape + + # The model may or may not have a '<mask>' embedding yet + assert ( + len(dictionary) <= vocab_size <= len(dictionary) + 1 + ), f"Dictionary len ({len(dictionary)}) doesn't match embs shape ({embs.weight.shape})" + # TODO: we should reuse the pretrained model dict which already has <mask> + dictionary.add_symbol("<mask>") + + for lang in mono_langs: + lang_token = _lang_token(lang) + dictionary.add_symbol(lang_token) + logger.info( + f"dictionary: {len(dictionary)} -> {vocab_size} tokens " + f"after adding {len(mono_langs)} lang tokens." + ) + + if len(dictionary) <= vocab_size: + return + + extend_embedding(embs, len(dictionary), dictionary.bos()) + dec_embs = model.decoder.embed_tokens + extend_embedding(dec_embs, len(dictionary), dictionary.bos()) + lm_head = model.decoder.output_projection + extend_embedding(lm_head, len(dictionary), dictionary.bos()) + assert lm_head.weight.shape == (len(dictionary), embedding_dim) + + +def _lang_token(lang: str) -> str: + return f"__{lang}__" + + +def _lang_token_index(dictionary, lang: str) -> int: + return dictionary.index(_lang_token(lang)) + + +@contextlib.contextmanager +def assert_weights_have_changed(model: nn.Module): + def checksum(model: nn.Module) -> float: + return sum(p.sum().item() for p in model.parameters()) + + initial_checksum = checksum(model) + yield model + final_checksum = checksum(model) + logger.info( + f"initial_checksum={initial_checksum} -> final_checksum={final_checksum}" + ) + assert initial_checksum != final_checksum, "Model hasn't changed !" 
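
Editor's note (not part of the patch): the PiecewiseLinearFn defined above is what turns the --lambda-bt and --lambda-dae strings into per-update weights inside train_step. As a quick, self-contained illustration of the schedule format documented in from_string(), the sketch below reproduces the parse-and-interpolate behaviour shown in the patch; the helper names parse_schedule / piecewise_linear and the step values are mine, chosen only for illustration.

from typing import List, Tuple


def parse_schedule(configuration: str) -> List[Tuple[int, float]]:
    # "3"          -> constant weight 3.0
    # "0:1,1000:0" -> anneal from 1.0 to 0.0 over the first 1000 updates
    parts = configuration.split(",")
    if len(parts) == 1:
        return [(0, float(configuration))]
    return [(int(step), float(value)) for step, value in (p.split(":") for p in parts)]


def piecewise_linear(pieces: List[Tuple[int, float]], x: int) -> float:
    # Linear interpolation between consecutive breakpoints, clamped to the
    # last value afterwards; this mirrors PiecewiseLinearFn.__call__ above.
    for (x_a, y_a), (x_b, y_b) in zip(pieces, pieces[1:]):
        if x_a <= x <= x_b:
            return y_a + (x - x_a) * (y_b - y_a) / (x_b - x_a)
    return pieces[-1][1]


if __name__ == "__main__":
    lambda_bt = parse_schedule("0:1,1000:0")
    for step in (0, 250, 500, 1000, 2000):
        print(step, piecewise_linear(lambda_bt, step))
    # prints 1.0, 0.75, 0.5, 0.0, 0.0

With "0:1,1000:0" the back-translation weight decays linearly to zero over the first 1000 updates and then stays at zero, which is exactly the clamping behaviour of the class registered with the task.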
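
A second editor's note (also not part of the patch): backtranslate_sample() above rewrites net_input in place, prepending the target-language token to each generated hypothesis, right-padding the batch to the longest hypothesis, and recording lengths of hypothesis length + 1. The sketch below isolates just that tensor surgery with dummy data; the token ids and the __ro__ index are made up for illustration, whereas in the task they come from _lang_token_index and the sequence generator output.

import torch

pad_idx = 1  # hypothetical <pad> id
ro_bos = 7   # hypothetical id of the __ro__ language token
# two generated hypotheses of different lengths, eos-terminated
hyps = [torch.tensor([23, 11, 2]), torch.tensor([5, 9, 11, 41, 2])]

max_length = max(h.size(0) for h in hyps)
src_tokens = torch.full((len(hyps), max_length + 1), pad_idx, dtype=torch.long)
src_lengths = torch.empty(len(hyps), dtype=torch.long)
for i, h in enumerate(hyps):
    prefixed = torch.cat([h.new([ro_bos]), h])    # __ro__ + generated tokens
    src_tokens[i, : prefixed.size(0)] = prefixed  # right-pad to max_length + 1
    src_lengths[i] = h.size(0) + 1

# src_tokens / src_lengths would then replace the originals in smp["net_input"],
# while smp["target"] keeps the clean monolingual sentence.

This mirrors the padding logic in backtranslate_sample; the real method additionally deletes the old entries and moves the new tensors to the original device before reassigning them.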
diff --git a/fairseq/tasks/semisupervised_translation.py b/fairseq/tasks/semisupervised_translation.py index b2f9bf9a73..432b8a52ca 100644 --- a/fairseq/tasks/semisupervised_translation.py +++ b/fairseq/tasks/semisupervised_translation.py @@ -353,10 +353,10 @@ def language_pair_dataset(lang_pair): else "%s-%s" % (self.args.source_lang, self.args.target_lang), ) - def build_model(self, args): + def build_model(self, args, from_checkpoint=False): from fairseq import models - model = models.build_model(args, self) + model = models.build_model(args, self, from_checkpoint) if not isinstance(model, FairseqMultiModel): raise ValueError( "SemisupervisedTranslationTask requires a FairseqMultiModel architecture" diff --git a/fairseq/tasks/sentence_prediction.py b/fairseq/tasks/sentence_prediction.py index 69dc996e6a..de80addaf2 100644 --- a/fairseq/tasks/sentence_prediction.py +++ b/fairseq/tasks/sentence_prediction.py @@ -6,8 +6,12 @@ import logging import os +import contextlib +from dataclasses import dataclass, field +from typing import Optional +from omegaconf import MISSING, II, open_dict, OmegaConf + import numpy as np -from fairseq import utils from fairseq.data import ( ConcatSentencesDataset, Dictionary, @@ -19,20 +23,75 @@ PrependTokenDataset, RawLabelDataset, RightPadDataset, + RightPaddingMaskDataset, RollDataset, SortDataset, StripTokenDataset, data_utils, ) from fairseq.data.shorten_dataset import maybe_shorten_dataset -from fairseq.tasks import LegacyFairseqTask, register_task +from fairseq.tasks import FairseqDataclass, FairseqTask, register_task +from fairseq.dataclass import ChoiceEnum logger = logging.getLogger(__name__) - - -@register_task("sentence_prediction") -class SentencePredictionTask(LegacyFairseqTask): +SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) + + +@dataclass +class SentencePredictionConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "path to data directory"}) + num_classes: int = field( + default=-1, + metadata={"help": "number of classes or regression targets"}, + ) + init_token: Optional[int] = field( + default=None, + metadata={"help": "add token at the beginning of each batch item"}, + ) + separator_token: Optional[int] = field( + default=None, + metadata={"help": "add separator token between inputs"}, + ) + no_shuffle: bool = field( + default=False, + ) + shorten_method: SHORTEN_METHOD_CHOICES = field( + default="none", + metadata={ + "help": "if not none, shorten sequences that exceed tokens_per_sample" + }, + ) + shorten_data_split_list: str = field( + default="", + metadata={ + "help": "comma-separated list of dataset splits to apply shortening to, " + 'e.g., "train,valid" (default: all dataset splits)' + }, + ) + add_prev_output_tokens: bool = field( + default=False, + metadata={ + "help": "add prev_output_tokens to sample, used for encoder-decoder arch" + }, + ) + max_positions: int = field( + default=512, + metadata={"help": "max tokens per example"}, + ) + + regression_target: bool = II("criterion.regression_target") + classification_head_name: str = II("criterion.classification_head_name") + seed: int = II("common.seed") + + d2v2_multi: bool = field( + default=False, + metadata={"help": "prepare dataset for data2vec_multi"}, + ) + + +@register_task("sentence_prediction", dataclass=SentencePredictionConfig) +class SentencePredictionTask(FairseqTask): """ Sentence (or sentence pair) prediction (classification or regression) task. 
@@ -40,64 +99,13 @@ class SentencePredictionTask(LegacyFairseqTask): dictionary (Dictionary): the dictionary for the input of the task """ - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", metavar="FILE", help="file prefix for data") - parser.add_argument( - "--num-classes", - type=int, - default=-1, - help="number of classes or regression targets", - ) - parser.add_argument( - "--init-token", - type=int, - default=None, - help="add token at the beginning of each batch item", - ) - parser.add_argument( - "--separator-token", - type=int, - default=None, - help="add separator token between inputs", - ) - parser.add_argument("--regression-target", action="store_true", default=False) - parser.add_argument("--no-shuffle", action="store_true", default=False) - parser.add_argument( - "--shorten-method", - default="none", - choices=["none", "truncate", "random_crop"], - help="if not none, shorten sequences that exceed --tokens-per-sample", - ) - parser.add_argument( - "--shorten-data-split-list", - default="", - help="comma-separated list of dataset splits to apply shortening to, " - 'e.g., "train,valid" (default: all dataset splits)', - ) - parser.add_argument( - "--add-prev-output-tokens", - action="store_true", - default=False, - help="add prev_output_tokens to sample, used for encoder-decoder arch", - ) - - def __init__(self, args, data_dictionary, label_dictionary): - super().__init__(args) + def __init__(self, cfg, data_dictionary, label_dictionary): + super().__init__(cfg) self.dictionary = data_dictionary self._label_dictionary = label_dictionary - if not hasattr(args, "max_positions"): - self._max_positions = ( - args.max_source_positions, - args.max_target_positions, - ) - else: - self._max_positions = args.max_positions - args.tokens_per_sample = self._max_positions @classmethod - def load_dictionary(cls, args, filename, source=True): + def load_dictionary(cls, filename): """Load the dictionary from the filename Args: @@ -108,99 +116,111 @@ def load_dictionary(cls, args, filename, source=True): return dictionary @classmethod - def setup_task(cls, args, **kwargs): - assert args.num_classes > 0, "Must set --num-classes" + def setup_task(cls, cfg, **kwargs): + assert cfg.num_classes > 0, "Must set task.num_classes" # load data dictionary data_dict = cls.load_dictionary( - args, - os.path.join(args.data, "input0", "dict.txt"), - source=True, + os.path.join(cfg.data, "input0", "dict.txt"), ) logger.info("[input] dictionary: {} types".format(len(data_dict))) - label_dict = None - if not args.regression_target: - # load label dictionary + # load label dictionary + if not cfg.regression_target: label_dict = cls.load_dictionary( - args, - os.path.join(args.data, "label", "dict.txt"), - source=False, + os.path.join(cfg.data, "label", "dict.txt"), ) logger.info("[label] dictionary: {} types".format(len(label_dict))) else: label_dict = data_dict - return cls(args, data_dict, label_dict) + return cls(cfg, data_dict, label_dict) def load_dataset(self, split, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" - def get_path(type, split): - return os.path.join(self.args.data, type, split) + def get_path(key, split): + return os.path.join(self.cfg.data, key, split) - def make_dataset(type, dictionary): - split_path = get_path(type, split) + def make_dataset(key, dictionary): + split_path = get_path(key, split) - dataset = data_utils.load_indexed_dataset( - split_path, - dictionary, - 
self.args.dataset_impl, - combine=combine, - ) + try: + dataset = data_utils.load_indexed_dataset( + split_path, + dictionary, + combine=combine, + ) + except Exception as e: + if "StorageException: [404] Path not found" in str(e): + logger.warning(f"dataset {e} not found") + dataset = None + else: + raise e return dataset input0 = make_dataset("input0", self.source_dictionary) assert input0 is not None, "could not find dataset: {}".format( - get_path(type, split) + get_path("input0", split) ) input1 = make_dataset("input1", self.source_dictionary) - if self.args.init_token is not None: - input0 = PrependTokenDataset(input0, self.args.init_token) + if self.cfg.init_token is not None: + input0 = PrependTokenDataset(input0, self.cfg.init_token) if input1 is None: src_tokens = input0 else: - if self.args.separator_token is not None: - input1 = PrependTokenDataset(input1, self.args.separator_token) + if self.cfg.separator_token is not None: + input1 = PrependTokenDataset(input1, self.cfg.separator_token) src_tokens = ConcatSentencesDataset(input0, input1) - with data_utils.numpy_seed(self.args.seed): + with data_utils.numpy_seed(self.cfg.seed): shuffle = np.random.permutation(len(src_tokens)) src_tokens = maybe_shorten_dataset( src_tokens, split, - self.args.shorten_data_split_list, - self.args.shorten_method, - self.args.max_positions, - self.args.seed, + self.cfg.shorten_data_split_list, + self.cfg.shorten_method, + self.max_positions(), + self.cfg.seed, ) - dataset = { - "id": IdDataset(), - "net_input": { + if self.cfg.d2v2_multi: + net_input = { + "source": RightPadDataset( + src_tokens, + pad_idx=self.source_dictionary.pad(), + ), + "id": IdDataset(), + "padding_mask": RightPaddingMaskDataset(src_tokens), + } + else: + net_input = { "src_tokens": RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), "src_lengths": NumelDataset(src_tokens, reduce=False), - }, + } + if self.cfg.add_prev_output_tokens: + prev_tokens_dataset = RightPadDataset( + RollDataset(src_tokens, 1), + pad_idx=self.dictionary.pad(), + ) + net_input.update( + prev_output_tokens=prev_tokens_dataset, + ) + + dataset = { + "id": IdDataset(), + "net_input": net_input, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_tokens, reduce=True), } - if self.args.add_prev_output_tokens: - prev_tokens_dataset = RightPadDataset( - RollDataset(src_tokens, 1), - pad_idx=self.dictionary.pad(), - ) - dataset["net_input"].update( - prev_output_tokens=prev_tokens_dataset, - ) - - if not self.args.regression_target: + if not self.cfg.regression_target: label_dataset = make_dataset("label", self.label_dictionary) if label_dataset is not None: dataset.update( @@ -219,8 +239,8 @@ def make_dataset(type, dictionary): def parse_regression_target(i, line): values = line.split() assert ( - len(values) == self.args.num_classes - ), f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"' + len(values) == self.cfg.num_classes + ), f'expected num_classes={self.cfg.num_classes} regression target values on line {i}, found: "{line}"' return [float(x) for x in values] with open(label_path) as h: @@ -238,7 +258,7 @@ def parse_regression_target(i, line): sizes=[src_tokens.sizes], ) - if self.args.no_shuffle: + if self.cfg.no_shuffle: dataset = nested_dataset else: dataset = SortDataset( @@ -252,20 +272,23 @@ def parse_regression_target(i, line): self.datasets[split] = dataset return self.datasets[split] - def build_model(self, args): + def build_model(self, cfg, 
from_checkpoint=False): from fairseq import models - model = models.build_model(args, self) + with open_dict(cfg) if OmegaConf.is_config(cfg) else contextlib.ExitStack(): + cfg.max_positions = self.cfg.max_positions + + model = models.build_model(cfg, self, from_checkpoint) model.register_classification_head( - getattr(args, "classification_head_name", "sentence_classification_head"), - num_classes=self.args.num_classes, + self.cfg.classification_head_name, + num_classes=self.cfg.num_classes, ) return model def max_positions(self): - return self._max_positions + return self.cfg.max_positions @property def source_dictionary(self): diff --git a/fairseq/tasks/sentence_prediction_adapters.py b/fairseq/tasks/sentence_prediction_adapters.py new file mode 100644 index 0000000000..afe5569626 --- /dev/null +++ b/fairseq/tasks/sentence_prediction_adapters.py @@ -0,0 +1,56 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import contextlib +from omegaconf import open_dict, OmegaConf + +from fairseq.tasks import register_task +from fairseq.tasks.sentence_prediction import ( + SentencePredictionTask, + SentencePredictionConfig, +) + + +logger = logging.getLogger(__name__) + + +@register_task("sentence_prediction_adapters", dataclass=SentencePredictionConfig) +class SentencePredictionAdapterTask(SentencePredictionTask): + def build_model(self, cfg): + from fairseq import models + + with open_dict(cfg) if OmegaConf.is_config(cfg) else contextlib.ExitStack(): + cfg.max_positions = self.cfg.max_positions + + model = models.build_model(cfg, self) + + model.register_classification_head( + self.cfg.classification_head_name, + num_classes=self.cfg.num_classes, + ) + + logger.info("Freezing Embedding Parameters") + for parameter in model.encoder.sentence_encoder.embed_positions.parameters(): + parameter.requires_grad = False + for ( + parameter + ) in model.encoder.sentence_encoder.layernorm_embedding.parameters(): + parameter.requires_grad = False + for parameter in model.encoder.sentence_encoder.embed_tokens.parameters(): + parameter.requires_grad = False + + logger.info("Freezing Adapters") + for k, v in model.encoder.sentence_encoder.layers._modules.items(): + logger.info("Freezing Adapters in Layer " + str(k)) + if hasattr(v, "adapter_layer_norm"): + logger.info("Freezing Adapter LN") + for parameter in v.adapter_layer_norm.parameters(): + parameter.requires_grad = False + for parameter in v.adapter_modules.parameters(): + parameter.requires_grad = False + + return model diff --git a/fairseq/tasks/sentence_ranking.py b/fairseq/tasks/sentence_ranking.py index bed44f34e5..57f63aab67 100644 --- a/fairseq/tasks/sentence_ranking.py +++ b/fairseq/tasks/sentence_ranking.py @@ -195,10 +195,10 @@ def make_dataset(type, dictionary): self.datasets[split] = dataset return self.datasets[split] - def build_model(self, args): + def build_model(self, args, from_checkpoint=False): from fairseq import models - model = models.build_model(args, self) + model = models.build_model(args, self, from_checkpoint) model.register_classification_head( getattr(args, "ranking_head_name", "sentence_classification_head"), diff --git a/fairseq/tasks/simultaneous_translation.py b/fairseq/tasks/simultaneous_translation.py new file mode 100644 index 0000000000..9576b26801 --- /dev/null +++ b/fairseq/tasks/simultaneous_translation.py @@ -0,0 +1,41 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from fairseq.tasks import register_task +from fairseq.tasks.speech_to_text import SpeechToTextTask +from fairseq.tasks.translation import TranslationTask, TranslationConfig + +try: + import examples.simultaneous_translation # noqa + + import_successful = True +except BaseException: + import_successful = False + + +logger = logging.getLogger(__name__) + + +def check_import(flag): + if not flag: + raise ImportError( + "'examples.simultaneous_translation' is not correctly imported. " + "Please considering `pip install -e $FAIRSEQ_DIR`." + ) + + +@register_task("simul_speech_to_text") +class SimulSpeechToTextTask(SpeechToTextTask): + def __init__(self, args, tgt_dict): + check_import(import_successful) + super().__init__(args, tgt_dict) + + +@register_task("simul_text_to_text", dataclass=TranslationConfig) +class SimulTextToTextTask(TranslationTask): + def __init__(self, cfg, src_dict, tgt_dict): + check_import(import_successful) + super().__init__(cfg, src_dict, tgt_dict) diff --git a/fairseq/tasks/span_masked_lm.py b/fairseq/tasks/span_masked_lm.py new file mode 100644 index 0000000000..d746aa154c --- /dev/null +++ b/fairseq/tasks/span_masked_lm.py @@ -0,0 +1,243 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +from omegaconf import II, MISSING + +from fairseq import utils +from fairseq.data import ( + AppendTokenDataset, + Dictionary, + IdDataset, + NestedDictionaryDataset, + NumelDataset, + PadDataset, + PrependTokenDataset, + StripTokenDataset, + TokenBlockDataset, + data_utils, +) +from fairseq.data.shorten_dataset import maybe_shorten_dataset +from fairseq.data.span_mask_tokens_dataset import SpanMaskedTokensDataset +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.tasks import FairseqTask, register_task + +from ..data.indexed_dataset import get_available_dataset_impl + +logger = logging.getLogger(__name__) + +SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) +SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) + + +@dataclass +class SpanMaskedLMConfig(FairseqDataclass): + shuffle: bool = field( + default=False, + ) + noise_density: float = field( + default=0.15, + metadata={"help": "What fraction of the tokens to select as noise"}, + ) + mean_noise_span_length: float = field( + default=3, + metadata={"help": "Mean noise span length, must be >= 1"}, + ) + data: str = field( + default=MISSING, + metadata={ + "help": "colon separated path to data directories list, " + "will be iterated upon during epochs in round-robin manner" + }, + ) + sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( + default="none", + metadata={ + "help": 'If omitted or "none", fills each sample with tokens-per-sample ' + 'tokens. If set to "complete", splits samples only at the end ' + "of sentence, but may include multiple sentences per sample. " + '"complete_doc" is similar but respects doc boundaries. ' + 'If set to "eos", includes only one sentence per sample.' 
+ }, + ) + tokens_per_sample: int = field( + default=1024, + metadata={"help": "max number of tokens per sample for LM dataset"}, + ) + shorten_method: SHORTEN_METHOD_CHOICES = field( + default="none", + metadata={ + "help": "if not none, shorten sequences that exceed --tokens-per-sample" + }, + ) + shorten_data_split_list: str = field( + default="", + metadata={ + "help": "comma-separated list of dataset splits to apply shortening to, " + 'e.g., "train,valid" (default: all dataset splits)' + }, + ) + seed: int = II("common.seed") + dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( + "dataset.dataset_impl" + ) + max_source_positions: int = field( + default=1024, metadata={"help": "max number of tokens in the source sequence"} + ) + max_target_positions: int = field( + default=1024, metadata={"help": "max number of tokens in the target sequence"} + ) + include_target_tokens: bool = field( + default=False, + metadata={ + "help": "include target tokens in model input. this is used for data2vec" + }, + ) + + +@register_task("span_masked_lm", dataclass=SpanMaskedLMConfig) +class SpanMaskedLMTask(FairseqTask): + """ + Span masked language modeling task. (ie. T5) + """ + + cfg: SpanMaskedLMConfig + + def __init__(self, cfg, dictionary): + super().__init__(cfg) + self.dictionary = dictionary + + @classmethod + def setup_task(cls, cfg: SpanMaskedLMConfig, **kwargs): + """Setup the task.""" + paths = utils.split_paths(cfg.data) + assert len(paths) > 0 + dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) + logger.info("dictionary: {} types".format(len(dictionary))) + if not hasattr(cfg, "shuffle"): + cfg.shuffle = False + return cls(cfg, dictionary) + + def _load_dataset_split(self, split, epoch, combine): + paths = utils.split_paths(self.cfg.data) + assert len(paths) > 0 + data_path = paths[(epoch - 1) % len(paths)] + split_path = os.path.join(data_path, split) + + dataset = data_utils.load_indexed_dataset( + split_path, + self.dictionary, + self.cfg.dataset_impl, + combine=combine, + ) + if dataset is None: + raise FileNotFoundError( + "Dataset not found: {} ({})".format(split, split_path) + ) + + dataset = StripTokenDataset(dataset, self.dictionary.eos()) + + dataset = maybe_shorten_dataset( + dataset, + split, + self.cfg.shorten_data_split_list, + self.cfg.shorten_method, + self.cfg.tokens_per_sample, + self.cfg.seed, + ) + + # create continuous blocks of tokens + dataset = TokenBlockDataset( + dataset, + dataset.sizes, + self.cfg.tokens_per_sample - 2, # one less for <s> and one for </s> + pad=self.dictionary.pad(), + eos=self.dictionary.eos(), + break_mode=self.cfg.sample_break_mode, + document_sep_len=0, + ) + logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) + + # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) + dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) + dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) + return dataset + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + dataset = self._load_dataset_split(split, epoch, combine) + + self.datasets[split] = SpanMaskedTokensDataset( + dataset, + self.dictionary, + noise_density=self.cfg.noise_density, + mean_noise_span_length=self.cfg.mean_noise_span_length, + shuffle=self.cfg.shuffle, + seed=self.cfg.seed, + ) + logger.info( + "Split: {0}, Loaded {1} samples of span_masked_tokens_dataset".format( + split, + len(self.datasets[split]), + ) + ) + + def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): + """ + Generate batches for inference. We assume that the input begins with a + bos symbol (`<s>`) and ends with an eos symbol (`</s>`). + """ + pad = self.source_dictionary.pad() + eos = self.source_dictionary.eos() + src_dataset = TokenBlockDataset( + src_tokens, + src_lengths, + block_size=self.cfg.tokens_per_sample - 2, # for <s> and </s> + pad=pad, + eos=eos, + break_mode=self.cfg.sample_break_mode, + document_sep_len=0, + ) + prev_output_tokens = PrependTokenDataset( + StripTokenDataset(src_dataset, eos), eos + ) + src_dataset = PadDataset(src_dataset, pad_idx=pad, left_pad=False) + return NestedDictionaryDataset( + { + "id": IdDataset(), + "net_input": { + "src_tokens": src_dataset, + "src_lengths": NumelDataset(src_dataset, reduce=False), + "prev_output_tokens": PadDataset( + prev_output_tokens, pad_idx=pad, left_pad=False + ), + }, + "target": src_dataset, + }, + sizes=[np.array(src_lengths)], + ) + + def max_positions(self): + """Return the max sentence length allowed by the task.""" + return (self.cfg.max_source_positions, self.cfg.max_target_positions) + + @property + def source_dictionary(self): + """Return the source :class:`~fairseq.data.Dictionary`.""" + return self.dictionary + + @property + def target_dictionary(self): + """Return the target :class:`~fairseq.data.Dictionary`.""" + return self.dictionary diff --git a/fairseq/tasks/speech_dlm_task.py b/fairseq/tasks/speech_dlm_task.py new file mode 100644 index 0000000000..340732b928 --- /dev/null +++ b/fairseq/tasks/speech_dlm_task.py @@ -0,0 +1,561 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import os +from dataclasses import dataclass, field +from typing import Optional +from collections import OrderedDict + +import numpy as np +import torch +from fairseq import utils +from fairseq.data import ( + AppendTokenDataset, + Dictionary, + IdDataset, + LMContextWindowDataset, + MonolingualDataset, + NestedDictionaryDataset, + NumelDataset, + PadDataset, + PrependTokenDataset, + SpeechDLMDataset, + StripTokenDataset, + TokenBlockDataset, + TruncatedDictionary, + data_utils, +) +from fairseq.data.indexed_dataset import get_available_dataset_impl +from fairseq.data.shorten_dataset import maybe_shorten_dataset +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.tasks import LegacyFairseqTask, register_task +from omegaconf import II + + +SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) +SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) +logger = logging.getLogger(__name__) + + +@dataclass +class SpeechDLMConfig(FairseqDataclass): + data: Optional[str] = field( + default=None, metadata={"help": "path to data directory"} + ) + channels: Optional[str] = field( + default=None, + metadata={ + "help": 'comma-separated list of channels to load e.g., "unitA,unitB"' + "(default: load all possible channels in the data path)" + }, + ) + channel_weights: Optional[str] = field( + default=None, + metadata={ + "help": "comma-separated list of weights for different losses" + "(default: None, which means all losses are treated equally)" + }, + ) + sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( + default="none", + metadata={ + "help": 'If omitted or "none", fills each sample with tokens-per-sample ' + 'tokens. If set to "complete", splits samples only at the end ' + "of sentence, but may include multiple sentences per sample. " + '"complete_doc" is similar but respects doc boundaries. ' + 'If set to "eos", includes only one sentence per sample.' 
+ }, + ) + tokens_per_sample: int = field( + default=1024, + metadata={"help": "max number of tokens per sample for LM dataset"}, + ) + output_dictionary_size: int = field( + default=-1, metadata={"help": "limit the size of output dictionary"} + ) + # str type is a workaround to put **default=True** here + next_unit_prediction: str = field( + default="False", + metadata={ + "help": "Perform Next Unit Prediction, expected str input ('True' or 'False')" + }, + ) + edge_unit_prediction: str = field( + default="True", + metadata={ + "help": "Perform Edge Unit Prediction, expected str input ('True' or 'False')" + }, + ) + duration_prediction: str = field( + default="True", + metadata={ + "help": "Perform Duration Prediction, expected str input ('True' or 'False')" + }, + ) + delayed_duration_target: str = field( + default="True", + metadata={ + "help": "Perform Delayed Duration Prediction, expected str input ('True' or 'False')" + "(default: 'True')" + }, + ) + max_target_durations: Optional[int] = field( + default=256, + metadata={"help": "max duration considered (cut off to this value)"}, + ) + add_bos_token: bool = field( + default=False, metadata={"help": "prepend beginning of sentence token (<s>)"} + ) + max_target_positions: Optional[int] = field( + default=None, metadata={"help": "max number of tokens in the target sequence"} + ) + shorten_method: SHORTEN_METHOD_CHOICES = field( + default="none", + metadata={ + "help": "if not none, shorten sequences that exceed --tokens-per-sample" + }, + ) + shorten_data_split_list: str = field( + default="", + metadata={ + "help": "comma-separated list of dataset splits to apply shortening to, " + 'e.g., "train,valid" (default: all dataset splits)' + }, + ) + # TODO common vars below add to parent + seed: int = II("common.seed") + dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( + "dataset.dataset_impl" + ) + data_buffer_size: int = II("dataset.data_buffer_size") + tpu: bool = II("common.tpu") + + +@register_task("speech_dlm_task", dataclass=SpeechDLMConfig) +class SpeechDLMTask(LegacyFairseqTask): + """Task for the SpeechDLM model as described in the paper: + https://arxiv.org/pdf/2203.16502.pdf + + It create a multi-channel dataset (SpeechDLMDataset) from multiple + dictionaries. + + Args: + dictionaries (Dict[str, ~fairseq.data.Dictionary]): the dictionaries for + each input channel of the SpeechDLM model + output_dictionaries (Dict[str, ~fairseq.data.Dictionary]): the dictionaries + for the output of each channel of the SpeechDLM model. In most cases it + will be the same as *dictionaries*. + targets (List[str]): list of the target types that the SpeechDLM model + should predict. Can be one of "next", "edge", "duration". + Defaults to "next". + + .. note:: + + The SpeechDLM task is only compatible with + :mod:`fairseq-train` and :mod:`fairseq-validate`. + To generate new samples, please refer to example codes + at examples/textless_nlp/dgslm . 
+ """ + + def __init__(self, args, dicts, output_dicts=None, targets=None): + super().__init__(args) + self.dicts = dicts + self.output_dicts = output_dicts or dicts + + if targets is None: + targets = ["next"] + self.targets = targets + + self.channels = list(dicts.keys()) + + if args.channel_weights is not None: + self.channel_weights = [float(w) for w in args.channel_weights.split(",")] + else: + self.channel_weights = [1.0 for _ in self.channels] + assert len(self.channel_weights) == len( + self.channels + ), "number of channel_weights must be the same as number of channels" + + assert str(args.next_unit_prediction).lower() in [ + "true", + "false", + ], f"Expected to be a string of boolean, found {args.next_unit_prediction}" + assert str(args.edge_unit_prediction).lower() in [ + "true", + "false", + ], f"Expected to be a string of boolean, found {args.edge_unit_prediction}" + assert str(args.duration_prediction).lower() in [ + "true", + "false", + ], f"Expected to be a string of boolean, found {args.duration_prediction}" + assert str(args.delayed_duration_target).lower() in [ + "true", + "false", + ], f"Expected to be a string of boolean, found {args.delayed_duration_target}" + self.next_unit_prediction = bool( + str(args.next_unit_prediction).lower() == "true" + ) + self.edge_unit_prediction = bool( + str(args.edge_unit_prediction).lower() == "true" + ) + self.duration_prediction = bool(str(args.duration_prediction).lower() == "true") + self.delayed_duration_target = bool( + str(args.delayed_duration_target).lower() == "true" + ) + + self.max_target_durations = args.max_target_durations + + @classmethod + def setup_dictionary(cls, args, **kwargs): + """The dictionaries will be a dict over channel keys and values of type + ~fairseq.data.Dictionary. + """ + paths = utils.split_paths(args.data) + assert len(paths) > 0 + data_path = paths[0] + + dicts = None + output_dicts = None + if args.channels is None: + sorted_channels = sorted( + name[5:-4] + for name in os.listdir(data_path) + if name[:5] == "dict." and name[-4:] == ".txt" + ) + else: + sorted_channels = sorted(args.channels.split(",")) + logger.info("channels: {}".format(sorted_channels)) + # load dictionaries + dicts = OrderedDict() + output_dicts = OrderedDict() + for channel in sorted_channels: + dictionary = Dictionary.load( + os.path.join(data_path, "dict.{}.txt".format(channel)) + ) + logger.info("[{}] dictionary: {} types".format(channel, len(dictionary))) + output_dictionary = dictionary + if args.output_dictionary_size >= 0: + output_dictionary = TruncatedDictionary( + dictionary, args.output_dictionary_size + ) + dicts[channel] = dictionary + output_dicts[channel] = output_dictionary + if len(dicts) > 0: + assert dicts[channel].pad() == dicts[sorted_channels[0]].pad() + assert dicts[channel].bos() == dicts[sorted_channels[0]].bos() + assert dicts[channel].eos() == dicts[sorted_channels[0]].eos() + assert dicts[channel].unk() == dicts[sorted_channels[0]].unk() + return (dicts, output_dicts) + + @classmethod + def setup_task(cls, args, **kwargs): + """Setup the task (e.g., load dictionaries). 
+ + Args: + args (argparse.Namespace): parsed command-line arguments + """ + dicts, output_dicts = cls.setup_dictionary(args, **kwargs) + + targets = [] + if str(getattr(args, "next_unit_prediction", "false")).lower() == "true": + targets.append("next") + if str(getattr(args, "edge_unit_prediction", "false")).lower() == "true": + targets.append("edge") + if str(getattr(args, "duration_prediction", "false")).lower() == "true": + targets.append("duration") + if len(targets) == 0: + # standard language modeling + targets = ["next"] + + return cls(args, dicts, output_dicts, targets=targets) + + def build_model(self, args): + model = super().build_model(args) + for target in self.targets: + if target not in model.supported_targets: + raise ValueError("Unsupported SpeechDLM target: {}".format(target)) + return model + + def load_dataset( + self, split: str, epoch=1, combine=False, **kwargs + ) -> SpeechDLMDataset: + """Load a given dataset split. + + Args: + split (str): name of the split (e.g., train, valid, test) + """ + paths = utils.split_paths(self.args.data) + assert len(paths) > 0 + + data_path = paths[(epoch - 1) % len(paths)] + + channel_datasets = {} + for channel in self.channels: + split_path = os.path.join(data_path, split + "." + channel) + dictionary = self.dicts[channel] + output_dictionary = self.output_dicts[channel] + + dataset = data_utils.load_indexed_dataset( + split_path, dictionary, self.args.dataset_impl, combine=combine + ) + + if dataset is None: + raise FileNotFoundError( + "[{}] Dataset not found: {} ({})".format(channel, split, split_path) + ) + + dataset = maybe_shorten_dataset( + dataset, + split, + self.args.shorten_data_split_list, + self.args.shorten_method, + self.args.tokens_per_sample, + self.args.seed, + ) + + dataset = TokenBlockDataset( + dataset, + dataset.sizes, + self.args.tokens_per_sample, + pad=dictionary.pad(), + eos=dictionary.eos(), + break_mode=self.args.sample_break_mode, + include_targets=True, + ) + + add_eos_for_other_targets = ( + self.args.sample_break_mode is not None + and self.args.sample_break_mode != "none" + ) + + channel_datasets[channel] = MonolingualDataset( + dataset=dataset, + sizes=dataset.sizes, + src_vocab=dictionary, + tgt_vocab=output_dictionary, + add_eos_for_other_targets=add_eos_for_other_targets, + shuffle=False, + targets=["future"], + add_bos_token=self.args.add_bos_token, + ) + + self.datasets[split] = SpeechDLMDataset( + datasets=channel_datasets, + targets=self.targets, + max_target_durations=self.max_target_durations, + shuffle=True, + ) + + def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): + """ + Generate batches for inference. We prepend an eos token to src_tokens + (or bos if `--add-bos-token` is set) and we append a <pad> to target. + This is convenient both for generation with a prefix and LM scoring. 
+ """ + src_datasets = {} + tgt_datasets = {} + for channel in src_tokens[0]: + dataset = StripTokenDataset( + TokenBlockDataset( + [src_tokens[i][channel] for i in range(len(src_tokens))], + src_lengths, + block_size=None, # ignored for "eos" break mode + pad=self.source_dictionaries[channel].pad(), + eos=self.source_dictionaries[channel].eos(), + break_mode="eos", + ), + # remove eos from (end of) target sequence + self.source_dictionaries[channel].eos(), + ) + src_dataset = PrependTokenDataset( + dataset, + token=( + self.source_dictionaries[channel].bos() + if getattr(self.args, "add_bos_token", False) + else self.source_dictionaries[channel].eos() + ), + ) + tgt_dataset = AppendTokenDataset( + dataset, token=self.source_dictionaries[channel].pad() + ) + + src_datasets[channel] = src_dataset + tgt_datasets[channel] = tgt_dataset + + return NestedDictionaryDataset( + { + "id": IdDataset(), + "net_input": { + "src_tokens": OrderedDict( + [ + ( + channel, + PadDataset( + src_datasets[channel], + pad_idx=self.source_dictionaries[channel].pad(), + left_pad=False, + ), + ) + for channel in src_datasets + ] + ), + "src_lengths": NumelDataset( + next(iter(src_datasets.values())), reduce=False + ), + }, + "target": OrderedDict( + [ + ( + channel, + PadDataset( + tgt_datasets[channel], + pad_idx=self.source_dictionaries[channel].pad(), + left_pad=False, + ), + ) + for channel in tgt_datasets + ] + ), + }, + sizes=[np.array(src_lengths)], + ) + + def inference_step( + self, generator, models, sample, prefix_tokens=None, constraints=None + ): + with torch.no_grad(): + # Generation will always be conditioned on bos_token + if getattr(self.args, "add_bos_token", False): + bos_token = self.source_dictionary.bos() + else: + bos_token = self.source_dictionary.eos() + + if constraints is not None: + raise NotImplementedError( + "Constrained decoding with the SpeechDLM task is not supported" + ) + # SequenceGenerator doesn't use src_tokens directly, we need to + # pass the `prefix_tokens` argument instead + if prefix_tokens is None: + prefix_tokens = {} + for channel in sample["net_input"]["src_tokens"]: + if sample["net_input"]["src_tokens"][channel].nelement(): + prefix_tokens_channel = sample["net_input"]["src_tokens"][ + channel + ] + if prefix_tokens_channel[:, 0].eq(bos_token).all(): + prefix_tokens_channel = prefix_tokens_channel[:, 1:] + prefix_tokens[channel] = prefix_tokens_channel + else: + prefix_tokens = None + break + return generator.generate( + models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token + ) + + def eval_lm_dataloader( + self, + dataset, + max_tokens: Optional[int] = 36000, + batch_size: Optional[int] = None, + max_positions: Optional[int] = None, + num_shards: int = 1, + shard_id: int = 0, + num_workers: int = 1, + data_buffer_size: int = 10, + # ensures that every evaluated token has access to a context of at least + # this size, if possible + context_window: int = 0, + ): + if context_window > 0: + dataset = LMContextWindowDataset( + dataset=dataset, + tokens_per_sample=self.args.tokens_per_sample, + context_window=context_window, + pad_idx=self.source_dictionary.pad(), + ) + return self.get_batch_iterator( + dataset=dataset, + max_tokens=max_tokens, + max_sentences=batch_size, + max_positions=max_positions, + ignore_invalid_inputs=True, + num_shards=num_shards, + shard_id=shard_id, + num_workers=num_workers, + data_buffer_size=data_buffer_size, + ).next_epoch_itr(shuffle=False) + + @property + def source_dictionary(self): + """Return the 
:class:`~fairseq.data.Dictionary` for the language + model.""" + return self.dicts[self.channels[0]] + + @property + def target_dictionary(self): + """Return the :class:`~fairseq.data.Dictionary` for the language + model.""" + return self.output_dicts[self.channels[0]] + + @property + def source_dictionaries(self): + """Return the dict of :class:`~fairseq.data.Dictionary` for the + multichannel language model.""" + return self.dicts + + @property + def target_dictionaries(self): + """Return the dict of :class:`~fairseq.data.Dictionary` for the + multichannel language model.""" + return self.output_dicts + + def build_generator(self, models, args, extra_gen_cls_kwargs=None): + + from fairseq.models.speech_dlm.sequence_generator import ( + multichannel_search, + MultichannelSequenceGenerator, + ) + + # Choose search strategy. Defaults to Beam Search. + sampling = getattr(args, "sampling", False) + sampling_topk = getattr(args, "sampling_topk", -1) + sampling_topp = getattr(args, "sampling_topp", -1.0) + assert ( + sampling_topk < 0 or sampling + ), "--sampling-topk requires sampling (not beam search)" + assert ( + sampling_topp < 0 or sampling + ), "--sampling-topp requires sampling (not beam search)" + + if sampling: + search_strategy = multichannel_search.ContiguousMultichannelSampling( + self.target_dictionaries, sampling_topk, sampling_topp + ) + else: + search_strategy = multichannel_search.ContiguousMultichannelBeamSearch( + self.target_dictionaries + ) + + extra_gen_cls_kwargs = extra_gen_cls_kwargs or {} + + return MultichannelSequenceGenerator( + models, + self.target_dictionaries, + beam_size=getattr(args, "beam", 5), + max_len_a=getattr(args, "max_len_a", 0), + max_len_b=getattr(args, "max_len_b", 500), + min_len=getattr(args, "min_len", 1), + normalize_scores=(not getattr(args, "unnormalized", False)), + len_penalty=getattr(args, "lenpen", 1), + unk_penalty=getattr(args, "unkpen", 0), + temperature=getattr(args, "temperature", 1.0), + match_source_len=getattr(args, "match_source_len", False), + no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), + search_strategy=search_strategy, + duration_temperature=getattr(args, "duration_temperature", 1.0), + **extra_gen_cls_kwargs, + ) diff --git a/fairseq/tasks/speech_to_speech.py b/fairseq/tasks/speech_to_speech.py new file mode 100644 index 0000000000..5aaaa95a90 --- /dev/null +++ b/fairseq/tasks/speech_to_speech.py @@ -0,0 +1,597 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
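As a quick aside before the file body: a minimal, self-contained sketch of the base-`vocab_size` packing that `StackUnitSequenceGenerator.pack_units` below applies to stacked discrete units, so the arithmetic is easy to follow. The toy sizes (a 10-unit vocabulary, an offset of 4 special symbols, 2 frames per step) are illustrative assumptions, not values taken from this patch.

import torch

# Illustrative values only: 10 units, 4 special symbols ahead of them in the
# dictionary, and 2 stacked frames per decoder step.
vocab_size, offset, n_frames_per_step = 10, 4, 2

# One batch entry, one decoder step, two stacked unit ids in dictionary space.
stacked = torch.LongTensor([[[offset + 3, offset + 7]]])  # (bsz, steps, n_frames_per_step)

# Positional weights [vocab_size**1, vocab_size**0], mirroring pack_units().
scale = torch.LongTensor(
    [pow(vocab_size, n_frames_per_step - 1 - i) for i in range(n_frames_per_step)]
)

mask = stacked >= offset  # ids below the offset are special symbols and contribute nothing
packed = ((stacked - offset) * scale * mask).sum(dim=2) + offset
print(packed)  # tensor([[41]]): units (3, 7) -> 3 * 10 + 7 = 37, shifted back up by the offset of 4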
+ +import json +import logging +import math +from argparse import Namespace +from pathlib import Path +from typing import List + +import torch +import torch.nn as nn + +from fairseq import utils +from fairseq.data import Dictionary +from fairseq.data.audio.data_cfg import MultitaskConfig, S2SDataConfig +from fairseq.data.audio.speech_to_speech_dataset import SpeechToSpeechDatasetCreator +from fairseq.data.audio.speech_to_text_dataset import ( + SpeechToTextDataset, + TextTargetMultitaskData, +) +from fairseq.tasks import LegacyFairseqTask, register_task +from fairseq.tasks.speech_to_text import DummyMultiTask +from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion + +logger = logging.getLogger(__name__) + + +class StackUnitSequenceGenerator(nn.Module): + def __init__(self, tgt_dict, vocab_size): + super().__init__() + self.pad = tgt_dict.pad() + self.eos = tgt_dict.eos() + self.unk = tgt_dict.unk() + self.offset = len(tgt_dict) - vocab_size + self.vocab_size = vocab_size + + def pack_units(self, input: torch.Tensor, n_frames_per_step) -> torch.Tensor: + if n_frames_per_step <= 1: + return input + + bsz, _, n = input.shape + assert n == n_frames_per_step + + scale = [ + pow(self.vocab_size, n_frames_per_step - 1 - i) + for i in range(n_frames_per_step) + ] + scale = torch.LongTensor(scale).squeeze(0).to(input.device) + mask = input >= self.offset + res = ((input - self.offset) * scale * mask).sum(dim=2) + self.offset + return res + + @torch.no_grad() + def generate(self, models, sample, **kwargs): + # currently only support viterbi search for stacked units + model = models[0] + model.eval() + + max_len = model.max_decoder_positions() + # TODO: incorporate max_len_a and max_len_b + + src_tokens = sample["net_input"]["src_tokens"] + src_lengths = sample["net_input"]["src_lengths"] + bsz, src_len, _ = src_tokens.size() + n_frames_per_step = model.decoder.n_frames_per_step + + # initialize + encoder_out = model.forward_encoder( + src_tokens, src_lengths, speaker=sample["speaker"] + ) + incremental_state = {} + pred_out, attn, scores = [], [], [] + finished = src_tokens.new_zeros((bsz,)).bool() + + prev_output_tokens = src_lengths.new_zeros((bsz, 1)).long().fill_(self.eos) + for _ in range(max_len): + cur_out, cur_extra = model.forward_decoder( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + ) + + lprobs = model.get_normalized_probs([cur_out], log_probs=True) + # never select pad, unk + lprobs[:, :, self.pad] = -math.inf + lprobs[:, :, self.unk] = -math.inf + + cur_pred_lprob, cur_pred_out = torch.max(lprobs, dim=2) + scores.append(cur_pred_lprob) + pred_out.append(cur_pred_out) + + prev_output_tokens = torch.cat( + ( + prev_output_tokens, + self.pack_units( + cur_pred_out.view(bsz, 1, n_frames_per_step), n_frames_per_step + ), + ), + dim=1, + ) + + attn.append(cur_extra["attn"][0]) + + cur_finished = torch.any(cur_pred_out.squeeze(1) == self.eos, dim=1) + finished = finished | cur_finished + if finished.sum().item() == bsz: + break + + pred_out = torch.cat(pred_out, dim=1).view(bsz, -1) + attn = torch.cat(attn, dim=2) + alignment = attn.max(dim=1)[1] + attn = attn.repeat_interleave(n_frames_per_step, dim=2) + alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) + scores = torch.cat(scores, dim=1) + eos_idx = (pred_out == self.eos).nonzero(as_tuple=True) + out_lens = src_lengths.new_zeros((bsz,)).long().fill_(max_len) + for b, l in zip(eos_idx[0], eos_idx[1]): + out_lens[b] = min(l, out_lens[b]) + + hypos = [ + [ + { + 
"tokens": pred_out[b, :out_len], + "attn": attn[b, :, :out_len], + "alignment": alignment[b, :out_len], + "positional_scores": scores[b, :out_len], + "score": utils.item(scores[b, :out_len].sum().data), + } + ] + for b, out_len in zip(range(bsz), out_lens) + ] + + return hypos + + +@register_task("speech_to_speech") +class SpeechToSpeechTask(LegacyFairseqTask): + @classmethod + def add_args(cls, parser): + parser.add_argument("data", help="manifest root path") + parser.add_argument( + "--config-yaml", + type=str, + default="config.yaml", + help="Configuration YAML filename (under manifest root)", + ) + parser.add_argument( + "--multitask-config-yaml", + type=str, + default=None, + help="Configuration YAML filename for the multitasks (under manifest root)", + ) + parser.add_argument( + "--max-source-positions", + default=6000, + type=int, + metavar="N", + help="max number of tokens in the source sequence", + ) + parser.add_argument( + "--max-target-positions", + default=1024, + type=int, + metavar="N", + help="max number of tokens in the target sequence", + ) + parser.add_argument( + "--target-is-code", + action="store_true", + help="set if target is discrete unit instead of spectrogram", + ) + parser.add_argument( + "--target-code-size", type=int, default=None, help="# discrete units" + ) + parser.add_argument( + "--n-frames-per-step", + type=int, + default=1, + help="# stacked frames, use 0 for reduced discrete unit sequence", + ) + parser.add_argument("--eval-inference", action="store_true") + parser.add_argument( + "--eval-args", + type=str, + default="{}", + help='generation args for speech-to-unit model , e.g., \'{"beam": 5, "max_len_a": 1}\', as JSON string', + ) + parser.add_argument("--eos-prob-threshold", type=float, default=0.5) + parser.add_argument( + "--mcd-normalize-type", + type=str, + default="targ", + choices=["targ", "pred", "path"], + ) + parser.add_argument( + "--vocoder", + type=str, + default="griffin_lim", + choices=["griffin_lim", "hifigan", "code_hifigan"], + ) + parser.add_argument("--spec-bwd-max-iter", type=int, default=8) + parser.add_argument( + "--infer-target-lang", + type=str, + default="", + help="target language for inference", + ) + + def __init__(self, args, tgt_dict, infer_tgt_lang_id=None): + super().__init__(args) + self.tgt_dict = tgt_dict + self.data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml) + + self.multitask_tasks = {} + self.tgt_dict_mt = None + self.eos_token_mt = None + if getattr(args, "multitask_config_yaml", None) is not None: + multitask_cfg = MultitaskConfig( + Path(args.data) / args.multitask_config_yaml + ) + first_pass_task_idx = multitask_cfg.first_pass_decoder_task_index + for i, (task_name, task_config) in enumerate( + multitask_cfg.get_all_tasks().items() + ): + task_obj = DummyMultiTask( + task_config, + task_config.tgt_dict, + first_pass=i == first_pass_task_idx, + ) + self.multitask_tasks[task_name] = task_obj + if task_obj.is_first_pass_decoder: + self.tgt_dict_mt = task_obj.target_dictionary + if task_config.prepend_bos_and_append_tgt_lang_tag: + self.eos_token_mt = task_config.eos_token + assert not isinstance(self.eos_token_mt, List) + + if not self.eos_token_mt: + raise Warning( + "Please provide eos_token in --multitask-config-yaml to replace eos in sequence generator" + ) + + self._infer_tgt_lang_id = infer_tgt_lang_id + + @classmethod + def setup_task(cls, args, **kwargs): + data_cfg = data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml) + tgt_dict = None + infer_tgt_lang_id = None + if 
args.target_is_code: + if data_cfg.prepend_tgt_lang_tag_as_bos: + # dictionary with language tags + dict_path = Path(args.data) / data_cfg.vocab_filename + if not dict_path.is_file(): + raise FileNotFoundError( + f"Dict has to be provided when setting prepend_tgt_lang_tag_as_bos: true, but dict not found: {dict_path}" + ) + tgt_dict = Dictionary.load(dict_path.as_posix()) + + # target language for inference + if args.infer_target_lang != "": + tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format( + args.infer_target_lang + ) + infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag) + assert infer_tgt_lang_id != tgt_dict.unk() + else: + assert args.target_code_size is not None + + tgt_dict = Dictionary() + for i in range(args.target_code_size): + tgt_dict.add_symbol(str(i)) + logger.info(f"dictionary size: " f"{len(tgt_dict):,}") + + if getattr(args, "train_subset", None) is not None: + if not all(s.startswith("train") for s in args.train_subset.split(",")): + raise ValueError('Train splits should be named like "train*".') + + assert args.n_frames_per_step >= 1 + assert ( + not args.eval_inference + or (args.target_is_code and args.vocoder == "code_hifigan") + or (not args.target_is_code and args.vocoder != "code_hifigan") + ) + + return cls(args, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id) + + def build_criterion(self, args): + from fairseq import criterions + + if len(self.multitask_tasks) > 0: + if self.args.target_is_code and not args._name.startswith("speech_to_unit"): + raise ValueError( + "set --criterion speech_to_unit for speech-to-unit loss with multitask" + ) + elif not self.args.target_is_code and not args._name.startswith( + "speech_to_spectrogram" + ): + raise ValueError( + "set --criterion speech_to_spectrogram for speech-to-spectrogram loss with multitask" + ) + + return criterions.build_criterion(args, self) + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + self.datasets[split] = SpeechToSpeechDatasetCreator.from_tsv( + root=self.args.data, + data_cfg=self.data_cfg, + splits=split, + is_train_split=split.startswith("train"), + epoch=epoch, + seed=self.args.seed, + target_is_code=self.args.target_is_code, + tgt_dict=self.target_dictionary, + n_frames_per_step=self.args.n_frames_per_step, + multitask=self.multitask_tasks, + ) + + @property + def target_dictionary(self): + return self.tgt_dict + + @property + def target_dictionary_mt(self): + return self.tgt_dict_mt + + @property + def source_dictionary(self): + return None + + def max_positions(self): + return self.args.max_source_positions, self.args.max_target_positions + + def build_model(self, args, from_checkpoint=False): + args.input_feat_per_channel = self.data_cfg.input_feat_per_channel + args.input_channels = self.data_cfg.input_transformed_channels + args.target_speaker_embed = self.data_cfg.target_speaker_embed is not None + args.n_frames_per_step = self.args.n_frames_per_step + + model = super().build_model(args, from_checkpoint) + + if len(self.multitask_tasks) > 0: + from fairseq.models.speech_to_speech.s2s_transformer import ( + S2STransformerMultitaskModelBase, + ) + + assert isinstance(model, S2STransformerMultitaskModelBase) + + if self.args.eval_inference: + self.eval_gen_args = json.loads(self.args.eval_args) + self.generator = self.build_generator( + [model], Namespace(**self.eval_gen_args) + ) + + return model + + def build_generator_dual_decoder( + self, + models, + args, + extra_gen_cls_kwargs=None, + ): + from examples.speech_to_speech.unity.sequence_generator_multi_decoder 
import ( + MultiDecoderSequenceGenerator, + ) + + return MultiDecoderSequenceGenerator( + models, + self.target_dictionary, + self.target_dictionary_mt, + beam_size=max(1, getattr(args, "beam", 1)), + beam_size_mt=max(1, getattr(args, "beam_mt", 1)), + max_len_a=getattr(args, "max_len_a", 0), + max_len_b=getattr(args, "max_len_b", 200), + max_len_a_mt=getattr(args, "max_len_a_mt", 0), + max_len_b_mt=getattr(args, "max_len_b_mt", 200), + min_len=getattr(args, "min_len", 1), + normalize_scores=(not getattr(args, "unnormalized", False)), + len_penalty=getattr(args, "lenpen", 1), + unk_penalty=getattr(args, "unkpen", 0), + temperature=getattr(args, "temperature", 1.0), + match_source_len=getattr(args, "match_source_len", False), + no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), + **extra_gen_cls_kwargs, + ) + + def build_generator( + self, + models, + args, + seq_gen_cls=None, + extra_gen_cls_kwargs=None, + ): + + if not self.args.target_is_code or self.args.eval_inference: + from fairseq.models.text_to_speech.vocoder import get_vocoder + + self.vocoder = get_vocoder(self.args, self.data_cfg) + self.vocoder = ( + self.vocoder.cuda() + if torch.cuda.is_available() and not self.args.cpu + else self.vocoder.cpu() + ) + + has_dual_decoder = getattr(models[0], "mt_task_name", None) is not None + + if self.args.target_is_code: + if self.args.n_frames_per_step == 1: + if has_dual_decoder: + seq_generator = self.build_generator_dual_decoder( + models, + args, + extra_gen_cls_kwargs=extra_gen_cls_kwargs, + ) + else: + seq_generator = super().build_generator( + models, + args, + seq_gen_cls=None, + extra_gen_cls_kwargs=extra_gen_cls_kwargs, + ) + else: + assert ( + getattr(args, "beam", 1) == 1 and getattr(args, "nbest", 1) == 1 + ), "only support viterbi search for stacked units" + seq_generator = StackUnitSequenceGenerator( + self.tgt_dict, + self.args.target_code_size, + ) + else: + if has_dual_decoder: + if getattr(args, "teacher_forcing", False): + raise NotImplementedError + else: + from fairseq.speech_generator import MultiDecoderSpeechGenerator + + generator = MultiDecoderSpeechGenerator + + lang_token_ids_aux = { + i + for s, i in self.tgt_dict_mt.indices.items() + if TextTargetMultitaskData.is_lang_tag(s) + } + + if extra_gen_cls_kwargs is None: + extra_gen_cls_kwargs = {} + extra_gen_cls_kwargs[ + "symbols_to_strip_from_output" + ] = lang_token_ids_aux + + eos_id_mt = ( + self.tgt_dict_mt.index(self.eos_token_mt) + if self.eos_token_mt + else None + ) + assert eos_id_mt != self.tgt_dict_mt.unk() + extra_gen_cls_kwargs["eos_mt"] = eos_id_mt + + seq_generator = generator( + models, + args, + self.vocoder, + self.data_cfg, + self.target_dictionary_mt, + max_iter=self.args.max_target_positions, + eos_prob_threshold=self.args.eos_prob_threshold, + **extra_gen_cls_kwargs, + ) + else: + if getattr(args, "teacher_forcing", False): + from fairseq.speech_generator import ( + TeacherForcingAutoRegressiveSpeechGenerator, + ) + + generator = TeacherForcingAutoRegressiveSpeechGenerator + logger.info("Teacher forcing mode for generation") + else: + from fairseq.speech_generator import AutoRegressiveSpeechGenerator + + generator = AutoRegressiveSpeechGenerator + + seq_generator = generator( + models[0], + self.vocoder, + self.data_cfg, + max_iter=self.args.max_target_positions, + eos_prob_threshold=self.args.eos_prob_threshold, + ) + + return seq_generator + + def train_step( + self, sample, model, criterion, optimizer, update_num, ignore_grad=False + ): + for task_name, task_obj in 
self.multitask_tasks.items(): + criterion.set_multitask_loss_weight( + task_name, task_obj.args.get_loss_weight(update_num) + ) + if task_name in model.multitask_decoders: + model.multitask_decoders[task_name].train() + + loss, sample_size, logging_output = super().train_step( + sample, model, criterion, optimizer, update_num, ignore_grad + ) + return loss, sample_size, logging_output + + def valid_step(self, sample, model, criterion): + for task_name in self.multitask_tasks.keys(): + if task_name in model.multitask_decoders: + model.multitask_decoders[task_name].eval() + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + + if self.args.eval_inference: + hypos, inference_losses = self.valid_step_with_inference( + sample, model, self.generator + ) + for k, v in inference_losses.items(): + assert k not in logging_output + logging_output[k] = v + + return loss, sample_size, logging_output + + def valid_step_with_inference(self, sample, model, generator): + if self.args.target_is_code: + hypos = generator.generate([model], sample) + tgt_lens = ( + sample["target_lengths"] - 1 + ) * self.args.n_frames_per_step # strip <eos> + for b, (f, l) in enumerate(zip(sample["target"], tgt_lens)): + hypos[b][0]["targ_waveform"] = self.vocoder( + {"code": f[:l] - 4}, # remove <bos>, <pad>, <eos>, <unk> + dur_prediction=self.eval_gen_args.get("dur_prediction", False), + ) + if len(hypos[b][0]["tokens"]) > 0: + hypos[b][0]["waveform"] = self.vocoder( + {"code": hypos[b][0]["tokens"] - 4}, + dur_prediction=self.eval_gen_args.get("dur_prediction", False), + ) + else: + hypos[b][0]["waveform"] = torch.flip( + hypos[b][0]["targ_waveform"], dims=[0] + ) + else: + hypos = [ + [hypo] for hypo in generator.generate(model, sample, has_targ=True) + ] + + losses = { + "mcd_loss": 0.0, + "targ_frames": 0.0, + "pred_frames": 0.0, + "path_frames": 0.0, + "nins": 0.0, + "ndel": 0.0, + } + rets = batch_mel_cepstral_distortion( + [hypo[0]["targ_waveform"] for hypo in hypos], + [hypo[0]["waveform"] for hypo in hypos], + self.data_cfg.output_sample_rate, + normalize_type=None, + ) + for d, extra in rets: + pathmap = extra[-1] + losses["mcd_loss"] += d.item() + losses["targ_frames"] += pathmap.size(0) + losses["pred_frames"] += pathmap.size(1) + losses["path_frames"] += pathmap.sum().item() + losses["nins"] += (pathmap.sum(dim=1) - 1).sum().item() + losses["ndel"] += (pathmap.sum(dim=0) - 1).sum().item() + losses["norm_frames"] = losses[ + f"{getattr(self.args, 'mcd_normalize_type', 'targ')}_frames" + ] + + return hypos, losses + + def inference_step( + self, generator, models, sample, prefix_tokens=None, constraints=None + ): + with torch.no_grad(): + if self._infer_tgt_lang_id is not None: + return generator.generate( + models, + sample, + prefix_tokens=prefix_tokens, + constraints=constraints, + bos_token=self._infer_tgt_lang_id, + ) + else: + return super().inference_step( + generator, + models, + sample, + prefix_tokens=prefix_tokens, + constraints=constraints, + ) diff --git a/fairseq/tasks/speech_to_text.py b/fairseq/tasks/speech_to_text.py index 8fb341b0c5..884082112a 100644 --- a/fairseq/tasks/speech_to_text.py +++ b/fairseq/tasks/speech_to_text.py @@ -4,25 +4,28 @@ # LICENSE file in the root directory of this source tree. 
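One note on the `- 4` offset used in `valid_step_with_inference` above (`{"code": f[:l] - 4}`): a freshly constructed fairseq `Dictionary` assigns indices 0-3 to `<s>`, `<pad>`, `</s>` and `<unk>`, and `setup_task` adds the unit symbols after them, so subtracting 4 maps dictionary indices back to the raw unit ids the code-HiFiGAN vocoder expects. A small sanity check of that layout (the 100-unit codebook size is an illustrative assumption):

from fairseq.data import Dictionary

tgt_dict = Dictionary()          # bos/pad/eos/unk are registered first, in that order
for i in range(100):             # e.g., a 100-unit codebook
    tgt_dict.add_symbol(str(i))

assert (tgt_dict.bos(), tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) == (0, 1, 2, 3)
assert tgt_dict.index("0") == 4 and tgt_dict.index("57") == 61
# hence dictionary_index - 4 == raw discrete unit id passed to the vocoder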
import logging -import os.path as op from argparse import Namespace +from pathlib import Path +from typing import List from fairseq.data import Dictionary, encoders +from fairseq.data.audio.audio_utils import get_features_or_waveform +from fairseq.data.audio.data_cfg import MultitaskConfig from fairseq.data.audio.speech_to_text_dataset import ( S2TDataConfig, SpeechToTextDataset, SpeechToTextDatasetCreator, + TextTargetMultitaskData, ) from fairseq.tasks import LegacyFairseqTask, register_task - logger = logging.getLogger(__name__) @register_task("speech_to_text") class SpeechToTextTask(LegacyFairseqTask): - @staticmethod - def add_args(parser): + @classmethod + def add_args(cls, parser): parser.add_argument("data", help="manifest root path") parser.add_argument( "--config-yaml", @@ -30,6 +33,12 @@ def add_args(parser): default="config.yaml", help="Configuration YAML filename (under manifest root)", ) + parser.add_argument( + "--multitask-config-yaml", + type=str, + default=None, + help="Configuration YAML filename for the multitasks (under manifest root)", + ) parser.add_argument( "--max-source-positions", default=6000, @@ -48,15 +57,60 @@ def add_args(parser): def __init__(self, args, tgt_dict): super().__init__(args) self.tgt_dict = tgt_dict - self.data_cfg = S2TDataConfig(op.join(args.data, args.config_yaml)) + self.data_cfg = S2TDataConfig(Path(args.data) / args.config_yaml) + self.speaker_to_id = self._get_speaker_to_id() + if ( + self.data_cfg.prepend_tgt_lang_tag + and self.data_cfg.prepend_bos_and_append_tgt_lang_tag + ): + raise ValueError( + "Please set only one of the two options to avoid adding target token multiple times" + ) + + self.multitask_tasks = {} + self.tgt_dict_mt = None + self.eos_token_mt = None + if getattr(args, "multitask_config_yaml", None) is not None: + multitask_cfg = MultitaskConfig( + Path(args.data) / args.multitask_config_yaml + ) + first_pass_task_idx = multitask_cfg.first_pass_decoder_task_index + for i, (task_name, task_config) in enumerate( + multitask_cfg.get_all_tasks().items() + ): + task_obj = DummyMultiTask( + task_config, + task_config.tgt_dict, + first_pass=i == first_pass_task_idx, + ) + self.multitask_tasks[task_name] = task_obj + if task_obj.is_first_pass_decoder: + self.tgt_dict_mt = task_obj.target_dictionary + if task_config.prepend_bos_and_append_tgt_lang_tag: + self.eos_token_mt = task_config.eos_token + assert not isinstance(self.eos_token_mt, List) + + if not self.eos_token_mt: + raise Warning( + "Please provide eos_token in --multitask-config-yaml to replace eos in sequence generator" + ) + + def _get_speaker_to_id(self): + speaker_to_id = None + speaker_set_filename = self.data_cfg.config.get("speaker_set_filename") + if speaker_set_filename is not None: + speaker_set_path = Path(self.args.data) / speaker_set_filename + with open(speaker_set_path) as f: + speaker_to_id = {r.strip(): i for i, r in enumerate(f)} + return speaker_to_id @classmethod def setup_task(cls, args, **kwargs): - data_cfg = S2TDataConfig(op.join(args.data, args.config_yaml)) - dict_path = op.join(args.data, data_cfg.vocab_filename) - if not op.isfile(dict_path): - raise FileNotFoundError(f"Dict not found: {dict_path}") - tgt_dict = Dictionary.load(dict_path) + data_cfg = S2TDataConfig(Path(args.data) / args.config_yaml) + dict_path = Path(args.data) / data_cfg.vocab_filename + if not dict_path.is_file(): + raise FileNotFoundError(f"Dict not found: {dict_path.as_posix()}") + tgt_dict = Dictionary.load(dict_path.as_posix()) logger.info( f"dictionary size 
({data_cfg.vocab_filename}): " f"{len(tgt_dict):,}" ) @@ -81,21 +135,27 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): pre_tokenizer = self.build_tokenizer(self.args) bpe_tokenizer = self.build_bpe(self.args) self.datasets[split] = SpeechToTextDatasetCreator.from_tsv( - self.args.data, - self.data_cfg, - split, - self.tgt_dict, - pre_tokenizer, - bpe_tokenizer, + root=self.args.data, + cfg=self.data_cfg, + splits=split, + tgt_dict=self.tgt_dict, + pre_tokenizer=pre_tokenizer, + bpe_tokenizer=bpe_tokenizer, is_train_split=is_train_split, epoch=epoch, seed=self.args.seed, + speaker_to_id=self.speaker_to_id, + multitask=self.multitask_tasks, ) @property def target_dictionary(self): return self.tgt_dict + @property + def target_dictionary_mt(self): + return self.tgt_dict_mt + @property def source_dictionary(self): return None @@ -103,10 +163,56 @@ def source_dictionary(self): def max_positions(self): return self.args.max_source_positions, self.args.max_target_positions - def build_model(self, args): + def build_model(self, args, from_checkpoint=False): args.input_feat_per_channel = self.data_cfg.input_feat_per_channel args.input_channels = self.data_cfg.input_channels - return super(SpeechToTextTask, self).build_model(args) + args.speaker_to_id = self.speaker_to_id + return super(SpeechToTextTask, self).build_model(args, from_checkpoint) + + def build_generator_dual_decoder( + self, + models, + args, + extra_gen_cls_kwargs, + ): + from examples.speech_to_speech.unity.sequence_generator_multi_decoder import ( + MultiDecoderSequenceGenerator, + ) + + lang_token_ids_aux = { + i + for s, i in self.tgt_dict_mt.indices.items() + if TextTargetMultitaskData.is_lang_tag(s) + } + + extra_gen_cls_kwargs["symbols_to_strip_from_output"].update(lang_token_ids_aux) + + eos_id_mt = ( + self.tgt_dict_mt.index(self.eos_token_mt) if self.eos_token_mt else None + ) + assert eos_id_mt != self.tgt_dict_mt.unk() + extra_gen_cls_kwargs["eos_mt"] = eos_id_mt + + return MultiDecoderSequenceGenerator( + models, + self.target_dictionary, + self.target_dictionary_mt, + beam_size=max(1, getattr(args, "beam", 1)), + beam_size_mt=max(1, getattr(args, "beam_mt", 1)), + max_len_a=getattr(args, "max_len_a", 0), + max_len_b=getattr(args, "max_len_b", 200), + max_len_a_mt=getattr(args, "max_len_a_mt", 0), + max_len_b_mt=getattr(args, "max_len_b_mt", 0), + min_len=getattr(args, "min_len", 1), + normalize_scores=(not getattr(args, "unnormalized", False)), + len_penalty=getattr(args, "lenpen", 1), + len_penalty_mt=getattr(args, "lenpen_mt", 1), + unk_penalty=getattr(args, "unkpen", 0), + temperature=getattr(args, "temperature", 1.0), + match_source_len=getattr(args, "match_source_len", False), + no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), + **extra_gen_cls_kwargs, + ) def build_generator( self, @@ -125,11 +231,64 @@ def build_generator( for s, i in self.tgt_dict.indices.items() if SpeechToTextDataset.is_lang_tag(s) } - extra_gen_cls_kwargs = {"symbols_to_strip_from_output": lang_token_ids} - return super().build_generator( - models, args, seq_gen_cls=None, extra_gen_cls_kwargs=extra_gen_cls_kwargs + + if extra_gen_cls_kwargs is None: + extra_gen_cls_kwargs = {} + extra_gen_cls_kwargs["symbols_to_strip_from_output"] = lang_token_ids + + eos_token = ( + args.eos_token + if "eos_token" in args and args.eos_token is not None + else self.data_cfg.config.get("eos_token", None) ) + if self.data_cfg.prepend_bos_and_append_tgt_lang_tag and not eos_token: + raise Warning( + "Please provide --eos_token 
to replace eos in sequence generator" + ) + + eos_id = self.tgt_dict.index(eos_token) if eos_token else None + extra_gen_cls_kwargs["eos"] = eos_id + + has_dual_decoder = getattr(models[0], "mt_task_name", None) is not None + + if has_dual_decoder: + return self.build_generator_dual_decoder( + models, + args, + extra_gen_cls_kwargs=extra_gen_cls_kwargs, + ) + else: + return super().build_generator( + models, + args, + seq_gen_cls=None, + extra_gen_cls_kwargs=extra_gen_cls_kwargs, + ) + + def train_step( + self, sample, model, criterion, optimizer, update_num, ignore_grad=False + ): + for task_name, task_obj in self.multitask_tasks.items(): + criterion.set_multitask_loss_weight( + task_name, task_obj.args.get_loss_weight(update_num) + ) + if task_name in model.multitask_decoders: + model.multitask_decoders[task_name].train() + + loss, sample_size, logging_output = super().train_step( + sample, model, criterion, optimizer, update_num, ignore_grad + ) + return loss, sample_size, logging_output + + def valid_step(self, sample, model, criterion): + for task_name, task_obj in self.multitask_tasks.items(): + if task_name in model.multitask_decoders: + model.multitask_decoders[task_name].eval() + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + + return loss, sample_size, logging_output + def build_tokenizer(self, args): logger.info(f"pre-tokenizer: {self.data_cfg.pre_tokenizer}") return encoders.build_tokenizer(Namespace(**self.data_cfg.pre_tokenizer)) @@ -138,6 +297,54 @@ def build_bpe(self, args): logger.info(f"tokenizer: {self.data_cfg.bpe_tokenizer}") return encoders.build_bpe(Namespace(**self.data_cfg.bpe_tokenizer)) - @classmethod - def build_dataset_for_inference(cls, audio_paths, n_frames): - return SpeechToTextDataset("interactive", False, {}, audio_paths, n_frames) + def get_interactive_tokens_and_lengths(self, lines, encode_fn): + n_frames = [get_features_or_waveform(p).shape[0] for p in lines] + return lines, n_frames + + def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): + return SpeechToTextDataset( + "interactive", False, self.data_cfg, src_tokens, src_lengths + ) + + +class DummyMultiTask(LegacyFairseqTask): + def __init__(self, args, tgt_dict, first_pass=False): + super().__init__(args) + self.tgt_dict = tgt_dict + self.first_pass = first_pass + + @property + def target_dictionary(self): + return self.tgt_dict + + @property + def is_first_pass_decoder(self): + return self.first_pass + + def inference_step( + self, generator, models, sample, prefix_tokens=None, constraints=None + ): + if self.args.decoder_type == "ctc": + model = models[0] # only support single model + encoder_out = model(**sample) + if hasattr(model, "get_logits"): + emissions = model.get_logits( + encoder_out + ) # no need to normalize emissions + else: + emissions = model.get_normalized_probs(encoder_out, log_probs=True) + return generator.decode( + emissions.transpose(0, 1).float().cpu().contiguous() + ) + else: + raise NotImplementedError("only ctc decoder is supported at the moment") + + def build_generator( + self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None + ): + if self.args.decoder_type == "ctc": + from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder + + return W2lViterbiDecoder(args, self.tgt_dict) + else: + raise NotImplementedError("only ctc decoder is supported at the moment") diff --git a/fairseq/tasks/speech_ulm_task.py b/fairseq/tasks/speech_ulm_task.py new file mode 100644 index 0000000000..b9d3019d50 --- 
/dev/null +++ b/fairseq/tasks/speech_ulm_task.py @@ -0,0 +1,224 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import logging +import sys +import torch +from dataclasses import dataclass, field +from typing import List, Optional, Tuple + +from fairseq.data import Dictionary +from fairseq.data.codedataset import ExpressiveCodeDataConfig, CodeDataset +from fairseq.dataclass.configs import FairseqDataclass +from fairseq.tasks import register_task +from fairseq.tasks.fairseq_task import FairseqTask +from omegaconf import MISSING, DictConfig + + +logger = logging.getLogger(__name__) + + +class UnitDictionary(Dictionary): + """ + A fixed-size Dictionary that operates on integer-valued tokens + with a trivial (identity) token <-> id mapping. + Special symbols (bos, eos, ...) have ids above n_units. + """ + + def __init__( + self, + *, # begin keyword-only arguments + n_units, + bos="<s>", + pad="<pad>", + eos="</s>", + unk="<unk>", + extra_special_symbols=None, + clip=False, + ): + self.n_units = n_units + self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos + self.clip = clip + + self.symbols = [] + self.count = [] + self.indices = {} + for i in range(n_units): + self.add_symbol(str(i)) + + self.bos_index = self.add_symbol(bos) + self.pad_index = self.add_symbol(pad) + self.eos_index = self.add_symbol(eos) + self.unk_index = self.add_symbol(unk) + + if extra_special_symbols: + for s in extra_special_symbols: + self.add_symbol(s) + self.nspecial = len(self.symbols) + + def encode_line(self, line, append_eos=True, prepend_bos=False) -> torch.IntTensor: + words = [int(x) for x in line.split()] + if self.clip: + words = [min(self.n_units - 1, word) for word in words] + if prepend_bos: + words = [self.bos_index] + words + if append_eos: + words.append(self.eos_index) + ids = torch.IntTensor(words) + return ids + + +@dataclass +class SpeechUnitModelingConfig(FairseqDataclass): + data: str = field(default=MISSING, metadata={"help": "Path to data config.json"}) + max_token_duration: int = field( + default=20, metadata={"help": "all token durations are capped to this value"} + ) + tokens_per_sample: int = field( + default=1024, metadata={"help": "tokens in a sample"} + ) + max_target_positions: int = field( + default=1024, metadata={"help": "max target positions"} + ) + + # duration modeling + ignore_duration_input: bool = field( + default=False, metadata={"help": "whether token durations should be zeroed out"} + ) + discrete_duration: bool = field( + default=False, metadata={"help": "treat duration as discrete variable"} + ) + # F0 modeling + ignore_f0_input: bool = field( + default=False, metadata={"help": "whether F0 should be zeroed out"} + ) + discrete_f0: bool = field( + default=False, metadata={"help": "load quantized f0. 
get bin from config"} + ) + log_f0: bool = field( + default=False, metadata={"help": "whether f0 should be modeled in log space"} + ) + normalize_f0_mean: bool = field( + default=False, metadata={"help": "whether normalize f0 by speaker mean"} + ) + normalize_f0_std: bool = field( + default=False, metadata={"help": "whether normalize f0 by speaker stddev"} + ) + interpolate_f0: bool = field( + default=False, + metadata={"help": "whether interpolate f0 for non-voiced segments"}, + ) + + # input/output streams + stream_shifts: str = field( + default="0,0", + metadata={ + "help": ( + "comma-separated integer list denoting right-shift for " + "duration and pitch streams" + ) + }, + ) + + +@register_task("speech_unit_modeling", dataclass=SpeechUnitModelingConfig) +class SpeechUnitLanguageModelingTask(FairseqTask): + def __init__(self, cfg: SpeechUnitModelingConfig) -> None: + super().__init__(cfg) + assert not self.cfg.normalize_f0_std or self.cfg.normalize_f0_mean + + self.data_config = ExpressiveCodeDataConfig(cfg.data) + self._source_dictionary = self._target_dictionary = UnitDictionary( + n_units=self.data_config.n_units + ) + self._source_duration_dictionary = self._target_duration_dictionary = ( + UnitDictionary(n_units=self.cfg.max_token_duration + 1, clip=True) + if self.cfg.discrete_duration + else None + ) + self._source_f0_dictionary = self._target_f0_dictionary = ( + UnitDictionary(n_units=self.data_config.f0_vq_n_units) + if self.cfg.discrete_f0 + else None + ) + + self._channel_names = ["token", "duration", "f0"] + self._channel_sizes = [ + len(self.target_dictionary), + len(self.target_duration_dictionary) if self.cfg.discrete_duration else 1, + len(self.target_f0_dictionary) if self.cfg.discrete_f0 else 1, + ] + + @property + def source_dictionary(self) -> Optional[Dictionary]: + return self._source_dictionary + + @property + def source_duration_dictionary(self) -> Optional[Dictionary]: + return self._source_duration_dictionary + + @property + def source_f0_dictionary(self) -> Optional[Dictionary]: + return self._source_f0_dictionary + + @property + def channel_names(self) -> List[str]: + return self._channel_names + + @property + def channel_sizes(self) -> List[int]: + return self._channel_sizes + + @property + def dictionary(self) -> Optional[Dictionary]: + return self._source_dictionary + + @property + def target_dictionary(self) -> Optional[Dictionary]: + return self._target_dictionary + + @property + def target_duration_dictionary(self) -> Optional[Dictionary]: + return self._target_duration_dictionary + + @property + def target_f0_dictionary(self) -> Optional[Dictionary]: + return self._target_f0_dictionary + + @property + def dictionaries(self) -> List[Dictionary]: + return [self._dictionaries[l] for l in self.cfg.labels] + + @classmethod + def setup_task( + cls, cfg: SpeechUnitModelingConfig, **kwargs + ) -> "SpeechUnitLanguageModelingTask": + return cls(cfg) + + def load_dataset(self, split: str, **kwargs) -> None: + self.datasets[split] = CodeDataset( + manifest=self.data_config.manifests[split], + dictionary=self.source_dictionary, + dur_dictionary=self.source_duration_dictionary, + f0_dictionary=self.source_f0_dictionary, + config=self.data_config, + discrete_dur=self.cfg.discrete_duration, + discrete_f0=self.cfg.discrete_f0, + log_f0=self.cfg.log_f0, + normalize_f0_mean=self.cfg.normalize_f0_mean, + normalize_f0_std=self.cfg.normalize_f0_std, + interpolate_f0=self.cfg.interpolate_f0, + shifts=self.cfg.stream_shifts, + ) + + def max_positions(self) -> Tuple[int, 
int]: + return (sys.maxsize, sys.maxsize) + + def build_criterion(self, cfg: DictConfig): + import fairseq.criterions + + return fairseq.criterions.build_criterion(cfg, self) diff --git a/fairseq/tasks/text_to_speech.py b/fairseq/tasks/text_to_speech.py new file mode 100644 index 0000000000..82e7e6643a --- /dev/null +++ b/fairseq/tasks/text_to_speech.py @@ -0,0 +1,501 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import os.path as op + +import torch +import torch.nn.functional as F +import numpy as np + +from fairseq.data.audio.text_to_speech_dataset import TextToSpeechDatasetCreator +from fairseq.tasks import register_task +from fairseq.tasks.speech_to_text import SpeechToTextTask +from fairseq.speech_generator import ( + AutoRegressiveSpeechGenerator, + NonAutoregressiveSpeechGenerator, + TeacherForcingAutoRegressiveSpeechGenerator, +) + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +try: + from tensorboardX import SummaryWriter +except ImportError: + logger.info("Please install tensorboardX: pip install tensorboardX") + SummaryWriter = None + + +@register_task("text_to_speech") +class TextToSpeechTask(SpeechToTextTask): + @staticmethod + def add_args(parser): + parser.add_argument("data", help="manifest root path") + parser.add_argument( + "--config-yaml", + type=str, + default="config.yaml", + help="Configuration YAML filename (under manifest root)", + ) + parser.add_argument( + "--max-source-positions", + default=1024, + type=int, + metavar="N", + help="max number of tokens in the source sequence", + ) + parser.add_argument( + "--max-target-positions", + default=1200, + type=int, + metavar="N", + help="max number of tokens in the target sequence", + ) + parser.add_argument("--n-frames-per-step", type=int, default=1) + parser.add_argument("--eos-prob-threshold", type=float, default=0.5) + parser.add_argument("--eval-inference", action="store_true") + parser.add_argument("--eval-tb-nsample", type=int, default=8) + parser.add_argument("--vocoder", type=str, default="griffin_lim") + parser.add_argument("--spec-bwd-max-iter", type=int, default=8) + + def __init__(self, args, src_dict): + super().__init__(args, src_dict) + self.src_dict = src_dict + self.sr = self.data_cfg.config.get("features").get("sample_rate") + + self.tensorboard_writer = None + self.tensorboard_dir = "" + if args.tensorboard_logdir and SummaryWriter is not None: + self.tensorboard_dir = os.path.join(args.tensorboard_logdir, "valid_extra") + + def load_dataset(self, split, epoch=1, combine=False, **kwargs): + is_train_split = split.startswith("train") + pre_tokenizer = self.build_tokenizer(self.args) + bpe_tokenizer = self.build_bpe(self.args) + self.datasets[split] = TextToSpeechDatasetCreator.from_tsv( + self.args.data, + self.data_cfg, + split, + self.src_dict, + pre_tokenizer, + bpe_tokenizer, + is_train_split=is_train_split, + epoch=epoch, + seed=self.args.seed, + n_frames_per_step=self.args.n_frames_per_step, + speaker_to_id=self.speaker_to_id, + ) + + @property + def target_dictionary(self): + return None + + @property + def source_dictionary(self): + return self.src_dict + + def get_speaker_embeddings_path(self): + speaker_emb_path = None + if self.data_cfg.config.get("speaker_emb_filename") is not None: + 
speaker_emb_path = op.join( + self.args.data, self.data_cfg.config.get("speaker_emb_filename") + ) + return speaker_emb_path + + @classmethod + def get_speaker_embeddings(cls, args): + embed_speaker = None + if args.speaker_to_id is not None: + if args.speaker_emb_path is None: + embed_speaker = torch.nn.Embedding( + len(args.speaker_to_id), args.speaker_embed_dim + ) + else: + speaker_emb_mat = np.load(args.speaker_emb_path) + assert speaker_emb_mat.shape[1] == args.speaker_embed_dim + embed_speaker = torch.nn.Embedding.from_pretrained( + torch.from_numpy(speaker_emb_mat), + freeze=True, + ) + logger.info( + f"load speaker embeddings from {args.speaker_emb_path}. " + f"train embedding? {embed_speaker.weight.requires_grad}\n" + f"embeddings:\n{speaker_emb_mat}" + ) + return embed_speaker + + def build_model(self, cfg, from_checkpoint=False): + cfg.pitch_min = self.data_cfg.config["features"].get("pitch_min", None) + cfg.pitch_max = self.data_cfg.config["features"].get("pitch_max", None) + cfg.energy_min = self.data_cfg.config["features"].get("energy_min", None) + cfg.energy_max = self.data_cfg.config["features"].get("energy_max", None) + cfg.speaker_emb_path = self.get_speaker_embeddings_path() + model = super().build_model(cfg, from_checkpoint) + self.generator = None + if getattr(cfg, "eval_inference", False): + self.generator = self.build_generator([model], cfg) + return model + + def build_generator(self, models, cfg, vocoder=None, **unused): + if vocoder is None: + vocoder = self.build_default_vocoder() + model = models[0] + if getattr(model, "NON_AUTOREGRESSIVE", False): + return NonAutoregressiveSpeechGenerator(model, vocoder, self.data_cfg) + else: + generator = AutoRegressiveSpeechGenerator + if getattr(cfg, "teacher_forcing", False): + generator = TeacherForcingAutoRegressiveSpeechGenerator + logger.info("Teacher forcing mode for generation") + return generator( + model, + vocoder, + self.data_cfg, + max_iter=self.args.max_target_positions, + eos_prob_threshold=self.args.eos_prob_threshold, + ) + + def build_default_vocoder(self): + from fairseq.models.text_to_speech.vocoder import get_vocoder + + vocoder = get_vocoder(self.args, self.data_cfg) + if torch.cuda.is_available() and not self.args.cpu: + vocoder = vocoder.cuda() + else: + vocoder = vocoder.cpu() + return vocoder + + def valid_step(self, sample, model, criterion): + loss, sample_size, logging_output = super().valid_step(sample, model, criterion) + + if getattr(self.args, "eval_inference", False): + hypos, inference_losses = self.valid_step_with_inference( + sample, model, self.generator + ) + for k, v in inference_losses.items(): + assert k not in logging_output + logging_output[k] = v + + picked_id = 0 + if self.tensorboard_dir and (sample["id"] == picked_id).any(): + self.log_tensorboard( + sample, + hypos[: self.args.eval_tb_nsample], + model._num_updates, + is_na_model=getattr(model, "NON_AUTOREGRESSIVE", False), + ) + return loss, sample_size, logging_output + + def valid_step_with_inference(self, sample, model, generator): + hypos = generator.generate(model, sample, has_targ=True) + + losses = { + "mcd_loss": 0.0, + "targ_frames": 0.0, + "pred_frames": 0.0, + "nins": 0.0, + "ndel": 0.0, + } + rets = batch_mel_cepstral_distortion( + [hypo["targ_waveform"] for hypo in hypos], + [hypo["waveform"] for hypo in hypos], + self.sr, + normalize_type=None, + ) + for d, extra in rets: + pathmap = extra[-1] + losses["mcd_loss"] += d.item() + losses["targ_frames"] += pathmap.size(0) + losses["pred_frames"] += pathmap.size(1) 
+ losses["nins"] += (pathmap.sum(dim=1) - 1).sum().item() + losses["ndel"] += (pathmap.sum(dim=0) - 1).sum().item() + + return hypos, losses + + def log_tensorboard(self, sample, hypos, num_updates, is_na_model=False): + if self.tensorboard_writer is None: + self.tensorboard_writer = SummaryWriter(self.tensorboard_dir) + tb_writer = self.tensorboard_writer + for b in range(len(hypos)): + idx = sample["id"][b] + text = sample["src_texts"][b] + targ = hypos[b]["targ_feature"] + pred = hypos[b]["feature"] + attn = hypos[b]["attn"] + + if is_na_model: + data = plot_tts_output( + [targ.transpose(0, 1), pred.transpose(0, 1)], + [f"target (idx={idx})", "output"], + attn, + "alignment", + ret_np=True, + suptitle=text, + ) + else: + eos_prob = hypos[b]["eos_prob"] + data = plot_tts_output( + [targ.transpose(0, 1), pred.transpose(0, 1), attn], + [f"target (idx={idx})", "output", "alignment"], + eos_prob, + "eos prob", + ret_np=True, + suptitle=text, + ) + + tb_writer.add_image( + f"inference_sample_{b}", data, num_updates, dataformats="HWC" + ) + + if hypos[b]["waveform"] is not None: + targ_wave = hypos[b]["targ_waveform"].detach().cpu().float() + pred_wave = hypos[b]["waveform"].detach().cpu().float() + tb_writer.add_audio( + f"inference_targ_{b}", targ_wave, num_updates, sample_rate=self.sr + ) + tb_writer.add_audio( + f"inference_pred_{b}", pred_wave, num_updates, sample_rate=self.sr + ) + + +def save_figure_to_numpy(fig): + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + return data + + +DEFAULT_V_MIN = np.log(1e-5) + + +def plot_tts_output( + data_2d, + title_2d, + data_1d, + title_1d, + figsize=(24, 4), + v_min=DEFAULT_V_MIN, + v_max=3, + ret_np=False, + suptitle="", +): + try: + import matplotlib.pyplot as plt + from mpl_toolkits.axes_grid1 import make_axes_locatable + except ImportError: + raise ImportError("Please install Matplotlib: pip install matplotlib") + + data_2d = [ + x.detach().cpu().float().numpy() if isinstance(x, torch.Tensor) else x + for x in data_2d + ] + fig, axes = plt.subplots(1, len(data_2d) + 1, figsize=figsize) + if suptitle: + fig.suptitle(suptitle[:400]) # capped at 400 chars + axes = [axes] if len(data_2d) == 0 else axes + for ax, x, name in zip(axes, data_2d, title_2d): + ax.set_title(name) + divider = make_axes_locatable(ax) + cax = divider.append_axes("right", size="5%", pad=0.05) + im = ax.imshow( + x, + origin="lower", + aspect="auto", + vmin=max(x.min(), v_min), + vmax=min(x.max(), v_max), + ) + fig.colorbar(im, cax=cax, orientation="vertical") + + if isinstance(data_1d, torch.Tensor): + data_1d = data_1d.detach().cpu().numpy() + axes[-1].plot(data_1d) + axes[-1].set_title(title_1d) + plt.tight_layout() + + if ret_np: + fig.canvas.draw() + data = save_figure_to_numpy(fig) + plt.close(fig) + return data + + +def antidiag_indices(offset, min_i=0, max_i=None, min_j=0, max_j=None): + """ + for a (3, 4) matrix with min_i=1, max_i=3, min_j=1, max_j=4, outputs + + offset=2 (1, 1), + offset=3 (2, 1), (1, 2) + offset=4 (2, 2), (1, 3) + offset=5 (2, 3) + + constraints: + i + j = offset + min_j <= j < max_j + min_i <= offset - j < max_i + """ + if max_i is None: + max_i = offset + 1 + if max_j is None: + max_j = offset + 1 + min_j = max(min_j, offset - max_i + 1, 0) + max_j = min(max_j, offset - min_i + 1, offset + 1) + j = torch.arange(min_j, max_j) + i = offset - j + return torch.stack([i, j]) + + +def batch_dynamic_time_warping(distance, shapes=None): + """full batched DTW 
without any constraints + + distance: (batchsize, max_M, max_N) matrix + shapes: (batchsize,) vector specifying (M, N) for each entry + """ + # ptr: 0=left, 1=up-left, 2=up + ptr2dij = {0: (0, -1), 1: (-1, -1), 2: (-1, 0)} + + bsz, m, n = distance.size() + cumdist = torch.zeros_like(distance) + backptr = torch.zeros_like(distance).type(torch.int32) - 1 + + # initialize + cumdist[:, 0, :] = distance[:, 0, :].cumsum(dim=-1) + cumdist[:, :, 0] = distance[:, :, 0].cumsum(dim=-1) + backptr[:, 0, :] = 0 + backptr[:, :, 0] = 2 + + # DP with optimized anti-diagonal parallelization, O(M+N) steps + for offset in range(2, m + n - 1): + ind = antidiag_indices(offset, 1, m, 1, n) + c = torch.stack( + [ + cumdist[:, ind[0], ind[1] - 1], + cumdist[:, ind[0] - 1, ind[1] - 1], + cumdist[:, ind[0] - 1, ind[1]], + ], + dim=2, + ) + v, b = c.min(axis=-1) + backptr[:, ind[0], ind[1]] = b.int() + cumdist[:, ind[0], ind[1]] = v + distance[:, ind[0], ind[1]] + + # backtrace + pathmap = torch.zeros_like(backptr) + for b in range(bsz): + i = m - 1 if shapes is None else (shapes[b][0] - 1).item() + j = n - 1 if shapes is None else (shapes[b][1] - 1).item() + dtwpath = [(i, j)] + while (i != 0 or j != 0) and len(dtwpath) < 10000: + assert i >= 0 and j >= 0 + di, dj = ptr2dij[backptr[b, i, j].item()] + i, j = i + di, j + dj + dtwpath.append((i, j)) + dtwpath = dtwpath[::-1] + indices = torch.from_numpy(np.array(dtwpath)) + pathmap[b, indices[:, 0], indices[:, 1]] = 1 + + return cumdist, backptr, pathmap + + +def compute_l2_dist(x1, x2): + """compute an (m, n) L2 distance matrix from (m, d) and (n, d) matrices""" + return torch.cdist(x1.unsqueeze(0), x2.unsqueeze(0), p=2).squeeze(0).pow(2) + + +def compute_rms_dist(x1, x2): + l2_dist = compute_l2_dist(x1, x2) + return (l2_dist / x1.size(1)).pow(0.5) + + +def get_divisor(pathmap, normalize_type): + if normalize_type is None: + return 1 + elif normalize_type == "len1": + return pathmap.size(0) + elif normalize_type == "len2": + return pathmap.size(1) + elif normalize_type == "path": + return pathmap.sum().item() + else: + raise ValueError(f"normalize_type {normalize_type} not supported") + + +def batch_compute_distortion(y1, y2, sr, feat_fn, dist_fn, normalize_type): + d, s, x1, x2 = [], [], [], [] + for cur_y1, cur_y2 in zip(y1, y2): + assert cur_y1.ndim == 1 and cur_y2.ndim == 1 + cur_x1 = feat_fn(cur_y1) + cur_x2 = feat_fn(cur_y2) + x1.append(cur_x1) + x2.append(cur_x2) + + cur_d = dist_fn(cur_x1, cur_x2) + d.append(cur_d) + s.append(d[-1].size()) + max_m = max(ss[0] for ss in s) + max_n = max(ss[1] for ss in s) + d = torch.stack( + [F.pad(dd, (0, max_n - dd.size(1), 0, max_m - dd.size(0))) for dd in d] + ) + s = torch.LongTensor(s).to(d.device) + cumdists, backptrs, pathmaps = batch_dynamic_time_warping(d, s) + + rets = [] + itr = zip(s, x1, x2, d, cumdists, backptrs, pathmaps) + for (m, n), cur_x1, cur_x2, dist, cumdist, backptr, pathmap in itr: + cumdist = cumdist[:m, :n] + backptr = backptr[:m, :n] + pathmap = pathmap[:m, :n] + divisor = get_divisor(pathmap, normalize_type) + + distortion = cumdist[-1, -1] / divisor + ret = distortion, (cur_x1, cur_x2, dist, cumdist, backptr, pathmap) + rets.append(ret) + return rets + + +def batch_mel_cepstral_distortion(y1, y2, sr, normalize_type="path", mfcc_fn=None): + """ + https://arxiv.org/pdf/2011.03568.pdf + + The root mean squared error computed on 13-dimensional MFCC using DTW for + alignment. MFCC features are computed from an 80-channel log-mel + spectrogram using a 50ms Hann window and hop of 12.5ms. 
+ + y1: list of waveforms + y2: list of waveforms + sr: sampling rate + """ + + try: + import torchaudio + except ImportError: + raise ImportError("Please install torchaudio: pip install torchaudio") + + if mfcc_fn is None or mfcc_fn.sample_rate != sr: + melkwargs = { + "n_fft": int(0.05 * sr), + "win_length": int(0.05 * sr), + "hop_length": int(0.0125 * sr), + "f_min": 20, + "n_mels": 80, + "window_fn": torch.hann_window, + } + mfcc_fn = torchaudio.transforms.MFCC( + sr, n_mfcc=13, log_mels=True, melkwargs=melkwargs + ).to(y1[0].device) + return batch_compute_distortion( + y1, + y2, + sr, + lambda y: mfcc_fn(y).transpose(-1, -2), + compute_rms_dist, + normalize_type, + ) diff --git a/fairseq/tasks/translation.py b/fairseq/tasks/translation.py index 79007a6d9f..6897ebe116 100644 --- a/fairseq/tasks/translation.py +++ b/fairseq/tasks/translation.py @@ -3,14 +3,18 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass, field import itertools import json import logging import os +from typing import Optional from argparse import Namespace +from omegaconf import II import numpy as np -from fairseq import metrics, options, utils +from fairseq import utils +from fairseq.logging import metrics from fairseq.data import ( AppendTokenDataset, ConcatDataset, @@ -22,7 +26,9 @@ encoders, indexed_dataset, ) -from fairseq.tasks import LegacyFairseqTask, register_task +from fairseq.data.indexed_dataset import get_available_dataset_impl +from fairseq.dataclass import ChoiceEnum, FairseqDataclass +from fairseq.tasks import FairseqTask, register_task EVAL_BLEU_ORDER = 4 @@ -52,6 +58,7 @@ def load_langpair_dataset( num_buckets=0, shuffle=True, pad_to_multiple=1, + prepend_bos_src=None, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) @@ -123,6 +130,9 @@ def split_exists(split, src, tgt, lang, data_path): src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) + elif prepend_bos_src is not None: + logger.info(f"prepending src bos: {prepend_bos_src}") + src_dataset = PrependTokenDataset(src_dataset, prepend_bos_src) eos = None if append_source_id: @@ -161,8 +171,102 @@ def split_exists(split, src, tgt, lang, data_path): ) -@register_task("translation") -class TranslationTask(LegacyFairseqTask): +@dataclass +class TranslationConfig(FairseqDataclass): + data: Optional[str] = field( + default=None, + metadata={ + "help": "colon separated path to data directories list, will be iterated upon during epochs " + "in round-robin manner; however, valid and test data are always in the first directory " + "to avoid the need for repeating them in all directories" + }, + ) + source_lang: Optional[str] = field( + default=None, + metadata={ + "help": "source language", + "argparse_alias": "-s", + }, + ) + target_lang: Optional[str] = field( + default=None, + metadata={ + "help": "target language", + "argparse_alias": "-t", + }, + ) + load_alignments: bool = field( + default=False, metadata={"help": "load the binarized alignments"} + ) + left_pad_source: bool = field( + default=True, metadata={"help": "pad the source on the left"} + ) + left_pad_target: bool = field( + default=False, metadata={"help": "pad the target on the left"} + ) + max_source_positions: int = field( + default=1024, metadata={"help": "max number of tokens in the source 
sequence"} + ) + max_target_positions: int = field( + default=1024, metadata={"help": "max number of tokens in the target sequence"} + ) + upsample_primary: int = field( + default=-1, metadata={"help": "the amount of upsample primary dataset"} + ) + truncate_source: bool = field( + default=False, metadata={"help": "truncate source to max-source-positions"} + ) + num_batch_buckets: int = field( + default=0, + metadata={ + "help": "if >0, then bucket source and target lengths into " + "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations" + }, + ) + train_subset: str = II("dataset.train_subset") + dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( + "dataset.dataset_impl" + ) + required_seq_len_multiple: int = II("dataset.required_seq_len_multiple") + + # options for reporting BLEU during validation + eval_bleu: bool = field( + default=False, metadata={"help": "evaluation with BLEU scores"} + ) + eval_bleu_args: Optional[str] = field( + default="{}", + metadata={ + "help": 'generation args for BLUE scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string' + }, + ) + eval_bleu_detok: str = field( + default="space", + metadata={ + "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; " + "use 'space' to disable detokenization; see fairseq.data.encoders for other options" + }, + ) + eval_bleu_detok_args: Optional[str] = field( + default="{}", + metadata={"help": "args for building the tokenizer, if needed, as JSON string"}, + ) + eval_tokenized_bleu: bool = field( + default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"} + ) + eval_bleu_remove_bpe: Optional[str] = field( + default=None, + metadata={ + "help": "remove BPE before computing BLEU", + "argparse_const": "@@ ", + }, + ) + eval_bleu_print_samples: bool = field( + default=False, metadata={"help": "print sample generations during validation"} + ) + + +@register_task("translation", dataclass=TranslationConfig) +class TranslationTask(FairseqTask): """ Translate from one (source) language to another (target) language. @@ -174,108 +278,47 @@ class TranslationTask(LegacyFairseqTask): The translation task is compatible with :mod:`fairseq-train`, :mod:`fairseq-generate` and :mod:`fairseq-interactive`. - - The translation task provides the following additional command-line - arguments: - - .. 
argparse:: - :ref: fairseq.tasks.translation_parser - :prog: """ - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - parser.add_argument('data', help='colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner; \ - however, valid and test data are always in the first directory to \ - avoid the need for repeating them in all directories') - parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', - help='source language') - parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', - help='target language') - parser.add_argument('--load-alignments', action='store_true', - help='load the binarized alignments') - parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL', - help='pad the source on the left') - parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', - help='pad the target on the left') - parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the source sequence') - parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the target sequence') - parser.add_argument('--upsample-primary', default=1, type=int, - help='amount to upsample primary dataset') - parser.add_argument('--truncate-source', action='store_true', default=False, - help='truncate source to max-source-positions') - parser.add_argument('--num-batch-buckets', default=0, type=int, metavar='N', - help='if >0, then bucket source and target lengths into N ' - 'buckets and pad accordingly; this is useful on TPUs ' - 'to minimize the number of compilations') - - # options for reporting BLEU during validation - parser.add_argument('--eval-bleu', action='store_true', - help='evaluation with BLEU scores') - parser.add_argument('--eval-bleu-detok', type=str, default="space", - help='detokenize before computing BLEU (e.g., "moses"); ' - 'required if using --eval-bleu; use "space" to ' - 'disable detokenization; see fairseq.data.encoders ' - 'for other options') - parser.add_argument('--eval-bleu-detok-args', type=str, metavar='JSON', - help='args for building the tokenizer, if needed') - parser.add_argument('--eval-tokenized-bleu', action='store_true', default=False, - help='compute tokenized BLEU instead of sacrebleu') - parser.add_argument('--eval-bleu-remove-bpe', nargs='?', const='@@ ', default=None, - help='remove BPE before computing BLEU') - parser.add_argument('--eval-bleu-args', type=str, metavar='JSON', - help='generation args for BLUE scoring, ' - 'e.g., \'{"beam": 4, "lenpen": 0.6}\'') - parser.add_argument('--eval-bleu-print-samples', action='store_true', - help='print sample generations during validation') - # fmt: on - - def __init__(self, args, src_dict, tgt_dict): - super().__init__(args) + cfg: TranslationConfig + + def __init__(self, cfg: TranslationConfig, src_dict, tgt_dict): + super().__init__(cfg) self.src_dict = src_dict self.tgt_dict = tgt_dict @classmethod - def setup_task(cls, args, **kwargs): + def setup_task(cls, cfg: TranslationConfig, **kwargs): """Setup the task (e.g., load dictionaries). 
Args: args (argparse.Namespace): parsed command-line arguments """ - args.left_pad_source = utils.eval_bool(args.left_pad_source) - args.left_pad_target = utils.eval_bool(args.left_pad_target) - paths = utils.split_paths(args.data) + paths = utils.split_paths(cfg.data) assert len(paths) > 0 # find language pair automatically - if args.source_lang is None or args.target_lang is None: - args.source_lang, args.target_lang = data_utils.infer_language_pair( - paths[0] - ) - if args.source_lang is None or args.target_lang is None: + if cfg.source_lang is None or cfg.target_lang is None: + cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(paths[0]) + if cfg.source_lang is None or cfg.target_lang is None: raise Exception( "Could not infer language pair, please provide it explicitly" ) # load dictionaries src_dict = cls.load_dictionary( - os.path.join(paths[0], "dict.{}.txt".format(args.source_lang)) + os.path.join(paths[0], "dict.{}.txt".format(cfg.source_lang)) ) tgt_dict = cls.load_dictionary( - os.path.join(paths[0], "dict.{}.txt".format(args.target_lang)) + os.path.join(paths[0], "dict.{}.txt".format(cfg.target_lang)) ) assert src_dict.pad() == tgt_dict.pad() assert src_dict.eos() == tgt_dict.eos() assert src_dict.unk() == tgt_dict.unk() - logger.info("[{}] dictionary: {} types".format(args.source_lang, len(src_dict))) - logger.info("[{}] dictionary: {} types".format(args.target_lang, len(tgt_dict))) + logger.info("[{}] dictionary: {} types".format(cfg.source_lang, len(src_dict))) + logger.info("[{}] dictionary: {} types".format(cfg.target_lang, len(tgt_dict))) - return cls(args, src_dict, tgt_dict) + return cls(cfg, src_dict, tgt_dict) def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. @@ -283,15 +326,15 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): Args: split (str): name of the split (e.g., train, valid, test) """ - paths = utils.split_paths(self.args.data) + paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 - if split != getattr(self.args, "train_subset", None): + if split != self.cfg.train_subset: # if not training data set, use the first shard for valid and test paths = paths[:1] data_path = paths[(epoch - 1) % len(paths)] # infer langcode - src, tgt = self.args.source_lang, self.args.target_lang + src, tgt = self.cfg.source_lang, self.cfg.target_lang self.datasets[split] = load_langpair_dataset( data_path, @@ -301,17 +344,17 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): tgt, self.tgt_dict, combine=combine, - dataset_impl=self.args.dataset_impl, - upsample_primary=self.args.upsample_primary, - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - max_source_positions=self.args.max_source_positions, - max_target_positions=self.args.max_target_positions, - load_alignments=self.args.load_alignments, - truncate_source=self.args.truncate_source, - num_buckets=self.args.num_batch_buckets, + dataset_impl=self.cfg.dataset_impl, + upsample_primary=self.cfg.upsample_primary, + left_pad_source=self.cfg.left_pad_source, + left_pad_target=self.cfg.left_pad_target, + max_source_positions=self.cfg.max_source_positions, + max_target_positions=self.cfg.max_target_positions, + load_alignments=self.cfg.load_alignments, + truncate_source=self.cfg.truncate_source, + num_buckets=self.cfg.num_batch_buckets, shuffle=(split != "test"), - pad_to_multiple=self.args.required_seq_len_multiple, + pad_to_multiple=self.cfg.required_seq_len_multiple, ) def 
build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): @@ -323,22 +366,15 @@ def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None) constraints=constraints, ) - def build_model(self, args): - model = super().build_model(args) - if getattr(args, "eval_bleu", False): - assert getattr(args, "eval_bleu_detok", None) is not None, ( - "--eval-bleu-detok is required if using --eval-bleu; " - "try --eval-bleu-detok=moses (or --eval-bleu-detok=space " - "to disable detokenization, e.g., when using sentencepiece)" - ) - detok_args = json.loads(getattr(args, "eval_bleu_detok_args", "{}") or "{}") + def build_model(self, cfg, from_checkpoint=False): + model = super().build_model(cfg, from_checkpoint) + if self.cfg.eval_bleu: + detok_args = json.loads(self.cfg.eval_bleu_detok_args) self.tokenizer = encoders.build_tokenizer( - Namespace( - tokenizer=getattr(args, "eval_bleu_detok", None), **detok_args - ) + Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args) ) - gen_args = json.loads(getattr(args, "eval_bleu_args", "{}") or "{}") + gen_args = json.loads(self.cfg.eval_bleu_args) self.sequence_generator = self.build_generator( [model], Namespace(**gen_args) ) @@ -346,7 +382,7 @@ def build_model(self, args): def valid_step(self, sample, model, criterion): loss, sample_size, logging_output = super().valid_step(sample, model, criterion) - if self.args.eval_bleu: + if self.cfg.eval_bleu: bleu = self._inference_with_bleu(self.sequence_generator, sample, model) logging_output["_bleu_sys_len"] = bleu.sys_len logging_output["_bleu_ref_len"] = bleu.ref_len @@ -360,10 +396,15 @@ def valid_step(self, sample, model, criterion): def reduce_metrics(self, logging_outputs, criterion): super().reduce_metrics(logging_outputs, criterion) - if self.args.eval_bleu: + if self.cfg.eval_bleu: def sum_logs(key): - return sum(log.get(key, 0) for log in logging_outputs) + import torch + + result = sum(log.get(key, 0) for log in logging_outputs) + if torch.is_tensor(result): + result = result.cpu() + return result counts, totals = [], [] for i in range(EVAL_BLEU_ORDER): @@ -379,19 +420,28 @@ def sum_logs(key): def compute_bleu(meters): import inspect - import sacrebleu - fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0] + try: + from sacrebleu.metrics import BLEU + + comp_bleu = BLEU.compute_bleu + except ImportError: + # compatibility API for sacrebleu 1.x + import sacrebleu + + comp_bleu = sacrebleu.compute_bleu + + fn_sig = inspect.getfullargspec(comp_bleu)[0] if "smooth_method" in fn_sig: smooth = {"smooth_method": "exp"} else: smooth = {"smooth": "exp"} - bleu = sacrebleu.compute_bleu( + bleu = comp_bleu( correct=meters["_bleu_counts"].sum, total=meters["_bleu_totals"].sum, - sys_len=meters["_bleu_sys_len"].sum, - ref_len=meters["_bleu_ref_len"].sum, - **smooth + sys_len=int(meters["_bleu_sys_len"].sum), + ref_len=int(meters["_bleu_ref_len"].sum), + **smooth, ) return round(bleu.score, 2) @@ -399,7 +449,7 @@ def compute_bleu(meters): def max_positions(self): """Return the max sentence length allowed by the task.""" - return (self.args.max_source_positions, self.args.max_target_positions) + return (self.cfg.max_source_positions, self.cfg.max_target_positions) @property def source_dictionary(self): @@ -417,7 +467,7 @@ def _inference_with_bleu(self, generator, sample, model): def decode(toks, escape_unk=False): s = self.tgt_dict.string( toks.int().cpu(), - self.args.eval_bleu_remove_bpe, + self.cfg.eval_bleu_remove_bpe, # The default unknown string in fairseq is 
`<unk>`, but # this is tokenized by sacrebleu as `< unk >`, inflating # BLEU scores. Instead, we use a somewhat more verbose @@ -439,10 +489,10 @@ def decode(toks, escape_unk=False): escape_unk=True, # don't count <unk> as matches to the hypo ) ) - if self.args.eval_bleu_print_samples: + if self.cfg.eval_bleu_print_samples: logger.info("example hypothesis: " + hyps[0]) logger.info("example reference: " + refs[0]) - if self.args.eval_tokenized_bleu: + if self.cfg.eval_tokenized_bleu: return sacrebleu.corpus_bleu(hyps, [refs], tokenize="none") else: return sacrebleu.corpus_bleu(hyps, [refs]) diff --git a/fairseq/tasks/translation_from_pretrained_bart.py b/fairseq/tasks/translation_from_pretrained_bart.py index 8710b7fe7d..0fd7a5b29f 100644 --- a/fairseq/tasks/translation_from_pretrained_bart.py +++ b/fairseq/tasks/translation_from_pretrained_bart.py @@ -38,7 +38,7 @@ def add_args(parser): """Add task-specific arguments to the parser.""" # fmt: off TranslationTask.add_args(parser) - parser.add_argument('--langs', required=True, metavar='LANG', + parser.add_argument('--langs', type=str, metavar='LANG', help='comma-separated list of monolingual language, ' 'for example, "en,de,fr". These should match the ' 'langs from pretraining (and be in the same order). ' diff --git a/fairseq/tasks/translation_from_pretrained_xlm.py b/fairseq/tasks/translation_from_pretrained_xlm.py index 347a6eccb7..a05f289152 100644 --- a/fairseq/tasks/translation_from_pretrained_xlm.py +++ b/fairseq/tasks/translation_from_pretrained_xlm.py @@ -3,13 +3,21 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from dataclasses import dataclass from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary -from fairseq.tasks.translation import TranslationTask +from fairseq.tasks.translation import TranslationConfig, TranslationTask from . import register_task -@register_task("translation_from_pretrained_xlm") +@dataclass +class TranslationFromPretrainedXLMConfig(TranslationConfig): + pass + + +@register_task( + "translation_from_pretrained_xlm", dataclass=TranslationFromPretrainedXLMConfig +) class TranslationFromPretrainedXLMTask(TranslationTask): """ Same as TranslationTask except use the MaskedLMDictionary class so that diff --git a/fairseq/tasks/translation_lev.py b/fairseq/tasks/translation_lev.py index 4678774922..b45fecd1f4 100644 --- a/fairseq/tasks/translation_lev.py +++ b/fairseq/tasks/translation_lev.py @@ -3,33 +3,39 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
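The translation_from_pretrained_xlm change above and the translation_lev change below follow the same migration pattern as translation.py itself: declare a config dataclass that subclasses TranslationConfig and pass it to register_task, so typed fields replace the old add_args() boilerplate. A minimal sketch of that pattern for a hypothetical downstream task (the names MyTranslationConfig / MyTranslationTask and the extra field are illustrative, not part of this patch):

    from dataclasses import dataclass, field

    from fairseq.tasks import register_task
    from fairseq.tasks.translation import TranslationConfig, TranslationTask


    @dataclass
    class MyTranslationConfig(TranslationConfig):
        # hypothetical extra option; the help metadata is surfaced on the CLI
        report_sample_hypotheses: bool = field(
            default=False,
            metadata={"help": "log a few sample hypotheses during validation"},
        )


    @register_task("my_translation", dataclass=MyTranslationConfig)
    class MyTranslationTask(TranslationTask):
        cfg: MyTranslationConfig  # typed access replaces getattr(self.args, ...)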
-import os - +from dataclasses import dataclass, field import torch from fairseq import utils from fairseq.data import LanguagePairDataset +from fairseq.dataclass import ChoiceEnum from fairseq.tasks import register_task -from fairseq.tasks.translation import TranslationTask, load_langpair_dataset +from fairseq.tasks.translation import ( + TranslationConfig, + TranslationTask, + load_langpair_dataset, +) from fairseq.utils import new_arange -@register_task("translation_lev") +NOISE_CHOICES = ChoiceEnum(["random_delete", "random_mask", "no_noise", "full_mask"]) + + +@dataclass +class TranslationLevenshteinConfig(TranslationConfig): + noise: NOISE_CHOICES = field( + default="random_delete", + metadata={"help": "type of noise"}, + ) + + +@register_task("translation_lev", dataclass=TranslationLevenshteinConfig) class TranslationLevenshteinTask(TranslationTask): """ Translation (Sequence Generation) task for Levenshtein Transformer See `"Levenshtein Transformer" <https://arxiv.org/abs/1905.11006>`_. """ - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - TranslationTask.add_args(parser) - parser.add_argument( - '--noise', - default='random_delete', - choices=['random_delete', 'random_mask', 'no_noise', 'full_mask']) - # fmt: on + cfg: TranslationLevenshteinConfig def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. @@ -37,12 +43,12 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): Args: split (str): name of the split (e.g., train, valid, test) """ - paths = utils.split_paths(self.args.data) + paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] # infer langcode - src, tgt = self.args.source_lang, self.args.target_lang + src, tgt = self.cfg.source_lang, self.cfg.target_lang self.datasets[split] = load_langpair_dataset( data_path, @@ -52,12 +58,12 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): tgt, self.tgt_dict, combine=combine, - dataset_impl=self.args.dataset_impl, - upsample_primary=self.args.upsample_primary, - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - max_source_positions=self.args.max_source_positions, - max_target_positions=self.args.max_target_positions, + dataset_impl=self.cfg.dataset_impl, + upsample_primary=self.cfg.upsample_primary, + left_pad_source=self.cfg.left_pad_source, + left_pad_target=self.cfg.left_pad_target, + max_source_positions=self.cfg.max_source_positions, + max_target_positions=self.cfg.max_target_positions, prepend_bos=True, ) @@ -133,13 +139,13 @@ def _full_mask(target_tokens): ) return target_tokens.masked_fill(~target_mask, unk) - if self.args.noise == "random_delete": + if self.cfg.noise == "random_delete": return _random_delete(target_tokens) - elif self.args.noise == "random_mask": + elif self.cfg.noise == "random_mask": return _random_mask(target_tokens) - elif self.args.noise == "full_mask": + elif self.cfg.noise == "full_mask": return _full_mask(target_tokens) - elif self.args.noise == "no_noise": + elif self.cfg.noise == "no_noise": return target_tokens else: raise NotImplementedError diff --git a/fairseq/tasks/translation_multi_simple_epoch.py b/fairseq/tasks/translation_multi_simple_epoch.py index 95a2d162c0..5db36a7c79 100644 --- a/fairseq/tasks/translation_multi_simple_epoch.py +++ b/fairseq/tasks/translation_multi_simple_epoch.py @@ -96,25 +96,37 @@ def __init__(self, args, langs, dicts, training): # 
models.build_model(). This allows multitask type of sub-class can # build models other than the input lang_pairs self.model_lang_pairs = self.lang_pairs + self.source_langs = [d.split("-")[0] for d in self.lang_pairs] + self.target_langs = [d.split("-")[1] for d in self.lang_pairs] + self.check_dicts(self.dicts, self.source_langs, self.target_langs) + self.sampling_method = SamplingMethod.build_sampler(args, self) self.data_manager = MultilingualDatasetManager.setup_data_manager( args, self.lang_pairs, langs, dicts, self.sampling_method ) + def check_dicts(self, dicts, source_langs, target_langs): + if self.args.source_dict is not None or self.args.target_dict is not None: + # no need to check whether the source side and target side are sharing dictionaries + return + src_dict = dicts[source_langs[0]] + tgt_dict = dicts[target_langs[0]] + for src_lang in source_langs: + assert ( + src_dict == dicts[src_lang] + ), "Diffrent dictionary are specified for different source languages; " + "TranslationMultiSimpleEpochTask only supports one shared dictionary across all source languages" + for tgt_lang in target_langs: + assert ( + tgt_dict == dicts[tgt_lang] + ), "Diffrent dictionary are specified for different target languages; " + "TranslationMultiSimpleEpochTask only supports one shared dictionary across all target languages" + @classmethod def setup_task(cls, args, **kwargs): langs, dicts, training = MultilingualDatasetManager.prepare( cls.load_dictionary, args, **kwargs ) - dict0 = None - for _, lang_dict in dicts.items(): - if dict0 is None: - dict0 = lang_dict - else: - assert ( - dict0 == lang_dict - ), "Diffrent dictionary are specified for different languages; " - "TranslationMultiSimpleEpochTask only supports one shared dictionary across all languages" return cls(args, langs, dicts, training) def has_sharded_data(self, split): @@ -128,12 +140,16 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): """ if split in self.datasets: dataset = self.datasets[split] - if self.has_sharded_data(split) and dataset.load_next_shard: - shard_epoch = dataset.shard_epoch - else: - # no need to load next shard so skip loading - # also this avoid always loading from beginning of the data - return + if self.has_sharded_data(split): + if self.args.virtual_epoch_size is not None: + if dataset.load_next_shard: + shard_epoch = dataset.shard_epoch + else: + # no need to load next shard so skip loading + # also this avoid always loading from beginning of the data + return + else: + shard_epoch = epoch else: # estimate the shard epoch from virtual data size and virtual epoch size shard_epoch = self.data_manager.estimate_global_pass_epoch(epoch) @@ -143,7 +159,7 @@ def load_dataset(self, split, epoch=1, combine=False, **kwargs): del self.datasets[split] logger.info("old dataset deleted manually") logger.info(f"mem usage: {data_utils.get_mem_usage()}") - self.datasets[split] = self.data_manager.load_sampled_multi_epoch_dataset( + self.datasets[split] = self.data_manager.load_dataset( split, self.training, epoch=epoch, @@ -200,8 +216,8 @@ def build_generator( models, args, seq_gen_cls=None, extra_gen_cls_kwargs=extra_gen_cls_kwargs ) - def build_model(self, args): - return super().build_model(args) + def build_model(self, args, from_checkpoint=False): + return super().build_model(args, from_checkpoint) def valid_step(self, sample, model, criterion): loss, sample_size, logging_output = super().valid_step(sample, model, criterion) @@ -249,11 +265,11 @@ def max_positions(self): @property def 
source_dictionary(self): - return next(iter(self.dicts.values())) + return self.data_manager.get_source_dictionary(self.source_langs[0]) @property def target_dictionary(self): - return next(iter(self.dicts.values())) + return self.data_manager.get_target_dictionary(self.target_langs[0]) def create_batch_sampler_func( self, @@ -333,6 +349,9 @@ def get_batch_iterator( epoch=1, data_buffer_size=0, disable_iterator_cache=False, + skip_remainder_batch=False, + grouped_shuffling=False, + update_epoch_batch_itr=False, ): """ Get an iterator that yields batches of data from the given dataset. @@ -365,6 +384,12 @@ def get_batch_iterator( disable_iterator_cache (bool, optional): don't cache the EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`) (default: False). + grouped_shuffling (bool, optional): group batches with each groups + containing num_shards batches and shuffle groups. Reduces difference + between sequence lengths among workers for batches sorted by length. + update_epoch_batch_itr (bool optional): if true then donot use the cached + batch iterator for the epoch + Returns: ~fairseq.iterators.EpochBatchIterator: a batched iterator over the given dataset split @@ -388,6 +413,8 @@ def get_batch_iterator( epoch=epoch, data_buffer_size=data_buffer_size, disable_iterator_cache=disable_iterator_cache, + skip_remainder_batch=skip_remainder_batch, + update_epoch_batch_itr=update_epoch_batch_itr, ) self.dataset_to_epoch_iter[dataset] = batch_iter return batch_iter diff --git a/fairseq/trainer.py b/fairseq/trainer.py index 8b00e8b431..16b1b91697 100644 --- a/fairseq/trainer.py +++ b/fairseq/trainer.py @@ -9,6 +9,7 @@ import contextlib import logging +import os import sys import time from argparse import Namespace @@ -16,14 +17,18 @@ from typing import Any, Dict, List import torch -from fairseq import checkpoint_utils, distributed_utils, models, optim, utils +from omegaconf import OmegaConf + +from fairseq import checkpoint_utils, models, optim, utils +from fairseq.dataclass.configs import FairseqConfig from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.distributed import utils as distributed_utils from fairseq.file_io import PathManager from fairseq.logging import meters, metrics +from fairseq.models.ema import build_ema from fairseq.nan_detector import NanDetector from fairseq.optim import lr_scheduler -from omegaconf import DictConfig - +from fairseq.utils import safe_hasattr logger = logging.getLogger(__name__) @@ -38,7 +43,7 @@ class Trainer(object): communication of the gradients across workers. 
""" - def __init__(self, cfg: DictConfig, task, model, criterion, quantizer=None): + def __init__(self, cfg: FairseqConfig, task, model, criterion, quantizer=None): if isinstance(cfg, Namespace): logger.warning( @@ -60,20 +65,53 @@ def __init__(self, cfg: DictConfig, task, model, criterion, quantizer=None): else: self.device = torch.device("cpu") + if self.is_fsdp: + import fairscale + + if self.cfg.common.bf16: + raise ValueError( + "FullyShardedDataParallel is not compatible with --bf16 or " + "--memory-efficient-bf16" + ) + if self.cfg.distributed_training.zero_sharding != "none": + raise ValueError( + "FullyShardedDataParallel is not compatible with --zero-sharding " + "option (it's already built in)" + ) + if ( + max(self.cfg.optimization.update_freq) > 1 + and fairscale.__version__ < "0.4.0" + ): + raise RuntimeError( + "Please update to fairscale 0.4.0 or newer when combining " + "--update-freq with FullyShardedDataParallel" + ) + else: + if ( + hasattr(self.cfg.distributed_training, "cpu_offload") + and self.cfg.distributed_training.cpu_offload + ): + raise ValueError("--cpu-offload requires --ddp-backend=fully_sharded") + # copy model and criterion to current device/dtype self._criterion = criterion self._model = model - if self.tpu: - import torch_xla.core.xla_model as xm - - self._model = xm.send_cpu_data_to_device(self._model, self.device) - if cfg.common.fp16: - self._criterion = self._criterion.half() - self._model = self._model.half() - elif cfg.common.bf16: - self._criterion = self._criterion.to(dtype=torch.bfloat16) - self._model = self._model.to(dtype=torch.bfloat16) - if not cfg.distributed_training.pipeline_model_parallel: + if not self.is_fsdp: + if cfg.common.fp16: + assert not cfg.common.amp, "Cannot use fp16 and AMP together" + self._criterion = self._criterion.half() + self._model = self._model.half() + elif cfg.common.bf16: + self._criterion = self._criterion.to(dtype=torch.bfloat16) + self._model = self._model.to(dtype=torch.bfloat16) + elif cfg.common.amp: + self._amp_retries = 0 + if ( + not cfg.distributed_training.pipeline_model_parallel + # the DistributedFairseqModel wrapper will handle moving to device, + # so only handle cases which don't use the wrapper + and not self.use_distributed_wrapper + ): self._criterion = self._criterion.to(device=self.device) self._model = self._model.to(device=self.device) self.pipeline_model_parallel = cfg.distributed_training.pipeline_model_parallel @@ -101,6 +139,7 @@ def __init__(self, cfg: DictConfig, task, model, criterion, quantizer=None): self._warn_once = set() self._wrapped_criterion = None self._wrapped_model = None + self._ema = None # TODO(myleott): support tpu if self.cuda and self.data_parallel_world_size > 1: @@ -116,7 +155,9 @@ def __init__(self, cfg: DictConfig, task, model, criterion, quantizer=None): if self.cuda: self.cuda_env = utils.CudaEnvironment() if self.data_parallel_world_size > 1: - self.cuda_env_arr = distributed_utils.all_gather_list(self.cuda_env) + self.cuda_env_arr = distributed_utils.all_gather_list( + self.cuda_env, group=distributed_utils.get_global_group() + ) else: self.cuda_env_arr = [self.cuda_env] if self.data_parallel_rank == 0: @@ -140,36 +181,69 @@ def reinitialize(self): @property def data_parallel_world_size(self): - return self.cfg.distributed_training.distributed_world_size + if self.cfg.distributed_training.distributed_world_size == 1: + return 1 + return distributed_utils.get_data_parallel_world_size() @property def data_parallel_process_group(self): - if self.tpu: - return 
("tpu", None) - else: - return None + return distributed_utils.get_data_parallel_group() @property def data_parallel_rank(self): - return self.cfg.distributed_training.distributed_rank + if self.cfg.distributed_training.distributed_world_size == 1: + return 0 + return distributed_utils.get_data_parallel_rank() @property def is_data_parallel_master(self): - return distributed_utils.is_master(self.cfg.distributed_training) + # NOTE: this returns true for all model parallel replicas with data + # parallel rank 0 + return self.data_parallel_rank == 0 + + @property + def use_distributed_wrapper(self) -> bool: + return ( + self.data_parallel_world_size > 1 and not self.cfg.optimization.use_bmuf + ) or (self.is_fsdp and self.cfg.distributed_training.cpu_offload) + + @property + def should_save_checkpoint_on_current_rank(self) -> bool: + """Indicates whether to save checkpoints on the current DDP rank.""" + if ( + self.is_fsdp and self.cfg.distributed_training.use_sharded_state + ) or getattr(self.cfg.model, "base_layers", 0) > 0: + return True + else: + return self.is_data_parallel_master + + @property + def always_call_state_dict_during_save_checkpoint(self) -> bool: + if self.is_fsdp and not self.cfg.distributed_training.use_sharded_state: + # FSDP calls communication collective when consolidating checkpoints + return True + else: + return False + + @property + def checkpoint_suffix(self) -> str: + """Suffix to add to the checkpoint file name.""" + if self.is_fsdp and self.cfg.distributed_training.use_sharded_state: + return self.cfg.checkpoint.checkpoint_suffix + "-shard{0}".format( + self.data_parallel_rank + ) + else: + return self.cfg.checkpoint.checkpoint_suffix or "" @property def criterion(self): if self._wrapped_criterion is None: - if ( - utils.has_parameters(self._criterion) - and self.data_parallel_world_size > 1 - and not self.cfg.optimization.use_bmuf - and not self.tpu - ): + if utils.has_parameters(self._criterion) and self.use_distributed_wrapper: self._wrapped_criterion = models.DistributedFairseqModel( self.cfg.distributed_training, self._criterion, process_group=self.data_parallel_process_group, + device=self.device, ) else: self._wrapped_criterion = self._criterion @@ -178,20 +252,28 @@ def criterion(self): @property def model(self): if self._wrapped_model is None: - if ( - self.data_parallel_world_size > 1 - and not self.cfg.optimization.use_bmuf - and not self.tpu - ): + if self.use_distributed_wrapper: self._wrapped_model = models.DistributedFairseqModel( self.cfg.distributed_training, self._model, process_group=self.data_parallel_process_group, + device=self.device, ) else: self._wrapped_model = self._model return self._wrapped_model + @property + def ema(self): + if self._ema is None: + self._build_ema() + return self._ema + + def _build_ema(self): + if self.cfg.ema.store_ema: + self._ema = build_ema(self._model, self.cfg.ema, self.device) + logger.info("Exponential Moving Average Shadow Model is initialized.") + @property def optimizer(self): if self._optimizer is None: @@ -205,17 +287,42 @@ def lr_scheduler(self): return self._lr_scheduler def _build_optimizer(self): - params = list( - filter( - lambda p: p.requires_grad, - chain(self.model.parameters(), self.criterion.parameters()), + + if ( + self.cfg.optimization.debug_param_names + and self.cfg.common.fp16_no_flatten_grads + ): + params = [] + self.param_names = [] + + for n, p in chain( + self.model.named_parameters(), self.criterion.named_parameters() + ): + if p.requires_grad: + params.append(p) + 
self.param_names.append(n) + else: + params = list( + filter( + lambda p: p.requires_grad, + chain(self.model.parameters(), self.criterion.parameters()), + ) ) - ) - if self.cfg.common.fp16 or self.cfg.common.bf16: + if self.is_fsdp and self.cfg.common.fp16: + # FullyShardedDataParallel always uses MemoryEfficientFP16 wrapper, + # mostly for the grad scaling. But if we don't have the + # --memory-efficient-fp16 flag set, then we're effectively doing + # regular --fp16 and can allow the use of optimizers that would + # otherwise be unsupported by MemoryEfficientFP16Optimizer. + allow_unsupported = not self.cfg.common.memory_efficient_fp16 + self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer( + self.cfg, params, allow_unsupported=allow_unsupported + ) + elif self.cfg.common.fp16 or self.cfg.common.bf16 or self.cfg.common.amp: if self.cuda and torch.cuda.get_device_capability(0)[0] < 7: logger.info( - "NOTE: your device does NOT support faster training with --fp16, " + "NOTE: your device does NOT support faster training with --fp16 or --amp, " "please switch to FP32 which is likely to be faster" ) if ( @@ -225,13 +332,28 @@ def _build_optimizer(self): self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer( self.cfg, params ) + elif self.cfg.common.amp: + self._optimizer = optim.AMPOptimizer.build_optimizer(self.cfg, params) else: self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params) else: if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7: - logger.info("NOTE: your device may support faster training with --fp16") + logger.info( + "NOTE: your device may support faster training with --fp16 or --amp" + ) self._optimizer = optim.build_optimizer(self.cfg.optimizer, params) + if self.is_fsdp: + assert ( + not self.cfg.optimization.use_bmuf + ), "--ddp-backend=fully_sharded is not compatible with BMUF" + assert self._optimizer.supports_flat_params, ( + "--ddp-backend=fully_sharded is only compatible with pointwise " + "optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.). 
" + "However, the sharding will result in slightly different results when " + "using non-pointwise optimizers (e.g., Adagrad, Adafactor, LAMB)" + ) + if self.cfg.optimization.use_bmuf: self._optimizer = optim.FairseqBMUF( self.cfg.bmuf, @@ -259,27 +381,86 @@ def _build_optimizer(self): ) self._lr_scheduler.step_update(0) + @property + def is_fsdp(self): + return self.cfg.distributed_training.ddp_backend == "fully_sharded" + def consolidate_optimizer(self): """For OSS, we need to consolidate the state dict.""" + if self.cfg.checkpoint.no_save_optimizer_state: + return + self._gathered_optim_state = None if hasattr(self.optimizer.optimizer, "consolidate_state_dict"): self.optimizer.optimizer.consolidate_state_dict() + elif self.is_fsdp and not self.model.use_sharded_state: + st = self.model.gather_full_optim_state_dict( + self.optimizer + ) # only returns on rank 0 + self._gathered_optim_state = st + + def state_dict(self): + state_dict = { + "args": None, # legacy + "cfg": ( + OmegaConf.to_container(self.cfg, resolve=True, enum_to_str=True) + if OmegaConf.is_config(self.cfg) + else self.cfg + ), + "model": self.model.state_dict(), + "criterion": ( + self.criterion.state_dict() + if utils.has_parameters(self.criterion) + else None + ), + "optimizer_history": (self._optim_history or []) + + [ + { + "criterion_name": self.get_criterion().__class__.__name__, + "optimizer_name": self.optimizer.__class__.__name__, + "lr_scheduler_state": self.lr_scheduler.state_dict(), + "num_updates": self.get_num_updates(), + } + ], + "task_state": self.task.state_dict() if self.task is not None else {}, + "extra_state": { + "metrics": metrics.state_dict(), + "previous_training_time": self.cumulative_training_time(), + }, + } + if self.cfg.ema.store_ema: + # Save EMA model state as extra state + state_dict["extra_state"]["ema"] = self.ema.get_model().state_dict() + if self.cfg.ema.ema_fp32: + # Save EMA params in fp32 + state_dict["extra_state"]["ema_fp32_params"] = self.ema.fp32_params + if not self.cfg.checkpoint.no_save_optimizer_state: + if self._gathered_optim_state is not None: + state_dict["last_optimizer_state"] = self._gathered_optim_state + self._gathered_optim_state = None + else: + state_dict["last_optimizer_state"] = self.optimizer.state_dict() + if self.is_fsdp: + # save meta data for recombining checkpoint upon loading + state_dict["fsdp_metadata"] = self.model.local_metadata_dict() + return state_dict def save_checkpoint(self, filename, extra_state): """Save all training state in a checkpoint file.""" - if self.is_data_parallel_master: # only save one checkpoint - extra_state["metrics"] = metrics.state_dict() - extra_state["previous_training_time"] = self.cumulative_training_time() - checkpoint_utils.save_state( + if self.should_save_checkpoint_on_current_rank: + + logger.info(f"Saving checkpoint to {os.path.abspath(filename)}") + # call state_dict on all ranks in case it needs internal communication + state_dict = utils.move_to_cpu(self.state_dict()) + state_dict["extra_state"].update(extra_state) + + checkpoint_utils.torch_persistent_save( + state_dict, filename, - self.cfg, - self.get_model().state_dict(), - self.get_criterion(), - self.optimizer, - self.lr_scheduler, - self.get_num_updates(), - self._optim_history, - extra_state, + async_write=self.cfg.checkpoint.write_checkpoints_asynchronously, ) + logger.info(f"Finished saving checkpoint to {os.path.abspath(filename)}") + return os.path.abspath(filename) + return None def load_checkpoint( self, @@ -289,30 +470,136 @@ def load_checkpoint( 
optimizer_overrides=None, reset_meters=False, ): - """Load all training state from a checkpoint file.""" + """ + Load all training state from a checkpoint file. + rank = 0 will load the checkpoint, and then broadcast it to all + other ranks. + """ extra_state, self._optim_history, last_optim_state = None, [], None + logger.info(f"Preparing to load checkpoint {filename}") + is_distributed = self.data_parallel_world_size > 1 bexists = PathManager.isfile(filename) if bexists: - state = checkpoint_utils.load_checkpoint_to_cpu(filename) + load_on_all_ranks = ( + self.cfg.checkpoint.load_checkpoint_on_all_dp_ranks + # TPUs don't support broadcast yet, so load checkpoints + # on every worker for now + or self.tpu + # FSDP requires loading checkpoint shards on all ranks + or (self.is_fsdp and self.cfg.distributed_training.use_sharded_state) + or getattr(self.cfg.model, "base_layers", 0) > 0 + ) + + if load_on_all_ranks or self.data_parallel_rank == 0: + state = checkpoint_utils.load_checkpoint_to_cpu( + filename, load_on_all_ranks=load_on_all_ranks + ) + last_optim_state = state.get("last_optimizer_state", None) + + # If doing zero_sharding, do not broadcast global optimizer + # state. Later we will broadcast sharded states to each rank + # to avoid memory from exploding. + if ( + not load_on_all_ranks + and self.cfg.distributed_training.zero_sharding == "os" + and "last_optimizer_state" in state + and is_distributed + ): + state["last_optimizer_state"] = "SHARDED" + else: + last_optim_state = None + state = None + + if is_distributed and not load_on_all_ranks: + state = distributed_utils.broadcast_object( + state, + src_rank=0, + group=self.data_parallel_process_group, + dist_device=self.device, + ) + if self.data_parallel_rank > 0: + last_optim_state = state.get("last_optimizer_state", None) + # load model parameters try: - self.get_model().load_state_dict( + if ( + "optimizer_history" in state + and len(state["optimizer_history"]) > 0 + and "num_updates" in state["optimizer_history"][-1] + ): + self.model.set_num_updates( + state["optimizer_history"][-1]["num_updates"] + ) + + # this is the code related to AdaPrune + # In short, it removes redundant heads in multi-head attention module based on heads importance provided + # For more info, please refer to the paper: https://openreview.net/forum?id=_CMSV7FTzGI + # The idea of prune in mha can be summarized as + # Fine tune model (e.g. roberta encoder) on a certain datasets with regularization + # After the model is trained. User could use get_reserve_head_index and _adaptive_prune_heads functions to get the top X heads with most importance. + # Then user uses the rank to prune a new roberta encoder and save the pruned ckpt manually. + # User will fine tune the the new roberta encoder via the ckpt saved above + # To get rid of registering different pruned version of Roberta, I use the argument --mha-heads-to-keep to prune the Roberta model into a pruned version which matches the pruned ckpt. 
+ if ( + safe_hasattr(self.model, "args") + and safe_hasattr(self.model.args, "mha_heads_to_keep") + and self.model.args.mha_heads_to_keep != -1 + ): + logger.info( + f"Prune model: keep {self.model.args.mha_heads_to_keep} heads for each multihead attention module" + ) + for layer in self.model.encoder.sentence_encoder.layers: + reserve_head_index = layer.self_attn._get_reserve_head_index( + num_heads_to_keep=self.model.args.mha_heads_to_keep + ) + layer.self_attn._adaptive_prune_heads( + reserve_head_index=reserve_head_index + ) + layer.self_attn._set_skip_embed_dim_check() + logger.info(self.model) + # this is the code related to AdaPrune + # In short, it removes redundant units in feedforward layer in each transformer layer based on importance + # For more info, please refer to the paper: https://openreview.net/forum?id=_CMSV7FTzGI + # The idea of prune in ffn can be summarized as + # Fine tune model (e.g. roberta encoder) on a certain datasets with regularization + # After the model is trained. User could use _get_fc_rank and _prune_fc_layer functions to get the top X units with most importance. + # Then user uses the rank to prune a new roberta encoder and save the pruned ckpt manually. + # User will fine tune the the new roberta encoder via the ckpt saved above + # To get rid of registering different pruned version of Roberta, I use the argument --ffn-blocks-to-remove to prune the Roberta model into a pruned version which matches the pruned ckpt. + if ( + safe_hasattr(self.model, "args") + and safe_hasattr(self.model.args, "ffn_blocks_to_remove") + and self.model.args.ffn_blocks_to_remove != -1 + ): + logger.info( + f"Prune model: remove {self.model.args.ffn_blocks_to_remove} ffn blocks for each transformer layer" + ) + for layer in self.model.encoder.sentence_encoder.layers: + remove_index = layer._get_fc_rank( + remove_num=self.model.args.ffn_blocks_to_remove + ) + layer._prune_fc_layer(remove_index=remove_index) + logger.info(self.model) + + self.model.load_state_dict( state["model"], strict=True, model_cfg=self.cfg.model ) + # save memory for later steps + del state["model"] if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict( state["criterion"], strict=True ) + del state["criterion"] + except Exception: raise Exception( "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match.".format(filename) ) - extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] - last_optim_state = state.get("last_optimizer_state", None) if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed @@ -322,24 +609,31 @@ def load_checkpoint( last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ - ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." + ), f"Criterion does not match; please reset the optimizer (--reset-optimizer). {last_optim['criterion_name']} vs {self.get_criterion().__class__.__name__}" assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ - ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." + ), f"Optimizer does not match; please reset the optimizer (--reset-optimizer). 
{last_optim['optimizer_name']} vs {self.optimizer.__class__.__name__}" if not reset_lr_scheduler: self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"]) + + if self.is_fsdp and not self.model.use_sharded_state: + # if use_sharded_state, the last_optim_state is already sharded, skip this + last_optim_state = self.model.get_shard_from_optim_state_dict( + last_optim_state + ) + elif not load_on_all_ranks and is_distributed: + last_optim_state = self.optimizer.broadcast_global_state_dict( + last_optim_state + ) + self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: - epoch = extra_state["train_iterator"]["epoch"] - logger.info( - "loaded checkpoint {} (epoch {} @ {} updates)".format( - filename, epoch, self.get_num_updates() - ) - ) + itr_state = extra_state["train_iterator"] + epoch = itr_state["epoch"] if "previous_training_time" in extra_state: self._previous_training_time = extra_state["previous_training_time"] @@ -347,6 +641,13 @@ def load_checkpoint( self.lr_step(epoch) + if ( + itr_state.get("version", 1) >= 2 + and itr_state["iterations_in_epoch"] == 0 + ): + # reset meters at start of epoch + reset_meters = True + if "metrics" in extra_state and not reset_meters: metrics.load_state_dict(extra_state["metrics"]) @@ -354,8 +655,38 @@ def load_checkpoint( for meter in metrics.get_meters("default"): if isinstance(meter, meters.TimeMeter): meter.reset() + + if self.cfg.ema.store_ema: + if "ema" not in extra_state: + logger.warn( + "EMA not found in checkpoint. But store_ema is True. " + "EMA is re-initialized from checkpoint." + ) + self.ema.restore( + state["model"], build_fp32_params=self.cfg.ema.ema_fp32 + ) + else: + logger.info("Loading EMA from checkpoint") + self.ema.restore(extra_state["ema"], build_fp32_params=False) + + if self.cfg.ema.ema_fp32: + if "ema_fp32_params" in extra_state: + logger.info("Loading EMA fp32 params from checkpoint") + self.ema.build_fp32_params(extra_state["ema_fp32_params"]) + else: + logger.info( + "Building EMA fp32 params from EMA model in checkpoint" + ) + self.ema.build_fp32_params() + + logger.info( + "Loaded checkpoint {} (epoch {} @ {} updates)".format( + filename, epoch, self.get_num_updates() + ) + ) + else: - logger.info("no existing checkpoint found {}".format(filename)) + logger.info("No existing checkpoint found {}".format(filename)) return extra_state @@ -376,6 +707,7 @@ def get_train_iterator( epoch=epoch, combine=combine, data_selector=data_selector, + tpu=self.tpu, ) batch_iterator = self.task.get_batch_iterator( dataset=self.task.dataset(self.cfg.dataset.train_subset), @@ -388,13 +720,18 @@ def get_train_iterator( ), ignore_invalid_inputs=True, required_batch_size_multiple=self.cfg.dataset.required_batch_size_multiple, - seed=self.cfg.common.seed, + seed=(self.cfg.common.seed + epoch) + if self.cfg.dataset.update_ordered_indices_seed + else self.cfg.common.seed, num_shards=self.data_parallel_world_size if shard_batch_itr else 1, shard_id=self.data_parallel_rank if shard_batch_itr else 0, num_workers=self.cfg.dataset.num_workers, epoch=epoch, data_buffer_size=self.cfg.dataset.data_buffer_size, disable_iterator_cache=disable_iterator_cache, + skip_remainder_batch=self.cfg.optimization.skip_remainder_batch, + grouped_shuffling=self.cfg.dataset.grouped_shuffling, + update_epoch_batch_itr=self.cfg.dataset.update_epoch_batch_itr, ) self.reset_dummy_batch(batch_iterator.first_batch) return batch_iterator @@ -419,8 +756,12 @@ def 
get_valid_iterator( num_shards=self.data_parallel_world_size, shard_id=self.data_parallel_rank, num_workers=self.cfg.dataset.num_workers, + # always pass a fixed "epoch" to keep validation data consistent + # across training epochs + epoch=1, data_buffer_size=self.cfg.dataset.data_buffer_size, disable_iterator_cache=disable_iterator_cache, + skip_remainder_batch=False, ) self.reset_dummy_batch(batch_iterator.first_batch) return batch_iterator @@ -429,6 +770,8 @@ def begin_epoch(self, epoch): """Called at the beginning of each epoch.""" logger.info("begin training epoch {}".format(epoch)) + self.lr_step_begin_epoch(epoch) + if self.quantizer is not None: self.quantizer.begin_epoch(epoch) @@ -460,19 +803,19 @@ def train_step(self, samples, raise_oom=False): metrics.log_start_time("train_wall", priority=800, round=0) + # If EMA is enabled through store_ema=True + # and task.uses_ema is True, pass the EMA model as a keyword + # argument to the task. + extra_kwargs = {} + if self.cfg.ema.store_ema and getattr(self.task, "uses_ema", False): + extra_kwargs["ema_model"] = self.ema.get_model() + + has_oom = False + # forward and backward pass logging_outputs, sample_size, ooms = [], 0, 0 - for i, sample in enumerate(samples): - sample = self._prepare_sample(sample) - if sample is None: - # when sample is None, run forward/backward on a dummy batch - # and ignore the resulting gradients - sample = self._prepare_sample(self._dummy_batch) - is_dummy_batch = True - else: - if self._dummy_batch == "DUMMY": - self._dummy_batch = sample - is_dummy_batch = False + for i, sample in enumerate(samples): # delayed update loop + sample, is_dummy_batch = self._prepare_sample(sample) def maybe_no_sync(): """ @@ -484,6 +827,11 @@ def maybe_no_sync(): self.data_parallel_world_size > 1 and hasattr(self.model, "no_sync") and i < len(samples) - 1 + # The no_sync context manager results in increased memory + # usage with FSDP, since full-size gradients will be + # accumulated on each GPU. It's typically a better tradeoff + # to do the extra communication with FSDP. + and not self.is_fsdp ): return self.model.no_sync() else: @@ -499,6 +847,7 @@ def maybe_no_sync(): optimizer=self.optimizer, update_num=self.get_num_updates(), ignore_grad=is_dummy_batch, + **extra_kwargs, ) del loss @@ -512,19 +861,29 @@ def maybe_no_sync(): except RuntimeError as e: if "out of memory" in str(e): self._log_oom(e) + has_oom = True if raise_oom: raise e - logger.warning( - "attempting to recover from OOM in forward/backward pass" - ) - ooms += 1 - self.zero_grad() - if self.cuda: - torch.cuda.empty_cache() - if self.cfg.distributed_training.distributed_world_size == 1: - return None else: raise e + except Exception: + self.consolidate_optimizer() + self.save_checkpoint( + os.path.join(self.cfg.checkpoint.save_dir, "crash.pt"), {} + ) + raise + + if has_oom: + logger.warning( + "attempting to recover from OOM in forward/backward pass" + ) + ooms += 1 + self.zero_grad() + if self.cuda: + torch.cuda.empty_cache() + + if self.cfg.distributed_training.distributed_world_size == 1: + return None if self.tpu and i < len(samples) - 1: # tpu-comment: every XLA operation before marking step is @@ -532,9 +891,7 @@ def maybe_no_sync(): # before marking step can lead to OOM errors. 
# To handle gradient accumulation use case, we explicitly # mark step here for every forward pass without a backward pass - import torch_xla.core.xla_model as xm - - xm.mark_step() + self._xla_markstep_and_send_to_cpu() if is_dummy_batch: if torch.is_tensor(sample_size): @@ -550,78 +907,119 @@ def maybe_no_sync(): # gather logging outputs from all replicas if self._sync_stats(): train_time = self._local_cumulative_training_time() - logging_outputs, ( - sample_size, - ooms, - total_train_time, - ) = self._aggregate_logging_outputs( + ( logging_outputs, - sample_size, - ooms, - train_time, - ignore=is_dummy_batch, + ( + sample_size, + ooms, + total_train_time, + ), + ) = self._aggregate_logging_outputs( + logging_outputs, sample_size, ooms, train_time, ignore=is_dummy_batch ) self._cumulative_training_time = ( total_train_time / self.data_parallel_world_size ) - if hasattr(self.model, "all_reduce"): - self.model.all_reduce() - overflow = False try: - if self.tpu and self.data_parallel_world_size > 1: - import torch_xla.core.xla_model as xm - - gradients = xm._fetch_gradients(self.optimizer.optimizer) - xm.all_reduce( - "sum", gradients, scale=1.0 / self.data_parallel_world_size - ) + with torch.autograd.profiler.record_function("reduce-grads"): + # reduce gradients across workers + self.optimizer.all_reduce_grads(self.model) + if utils.has_parameters(self.criterion): + self.optimizer.all_reduce_grads(self.criterion) with torch.autograd.profiler.record_function("multiply-grads"): - # multiply gradients by (# GPUs / sample_size) since DDP - # already normalizes by the number of GPUs. Thus we get - # (sum_of_gradients / sample_size). - if not self.cfg.optimization.use_bmuf: - self.optimizer.multiply_grads( - self.data_parallel_world_size / sample_size - ) - elif sample_size > 0: # BMUF needs to check sample size - num = self.data_parallel_world_size if self._sync_stats() else 1 - self.optimizer.multiply_grads(num / sample_size) + # multiply gradients by (data_parallel_size / sample_size) since + # DDP normalizes by the number of data parallel workers for + # improved fp16 precision. + # Thus we get (sum_of_gradients / sample_size) at the end. + # In case of fp16, this step also undoes loss scaling. + # (Debugging note: Some optimizers perform this scaling on the + # fly, so inspecting model.parameters() or optimizer.params may + # still show the original, unscaled gradients.) + numer = ( + self.data_parallel_world_size + if not self.cfg.optimization.use_bmuf or self._sync_stats() + else 1 + ) + self.optimizer.multiply_grads(numer / (sample_size or 1.0)) + # Note: (sample_size or 1.0) handles the case of a zero gradient, in a + # way that avoids CPU/device transfers in case sample_size is a GPU or + # TPU object. The assumption is that the gradient itself is also 0. 
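A toy numeric check of the scaling described in the comment above (standalone sketch with made-up numbers, not fairseq's optimizer wrapper): after DDP has averaged gradients over the N data-parallel workers, multiplying by N / sample_size leaves sum_of_gradients / sample_size, i.e. a per-sample (or per-token) gradient.

    import torch

    N, sample_size = 8, 4096.0            # data-parallel world size; e.g. target tokens in the batch
    grad = torch.full((3,), 2.0)          # pretend DDP already averaged: sum_of_worker_grads / N
    grad.mul_(N / (sample_size or 1.0))   # now equals sum_of_worker_grads / sample_size
    assert torch.allclose(grad, torch.full((3,), 2.0 * 8 / 4096.0))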
with torch.autograd.profiler.record_function("clip-grads"): # clip grads grad_norm = self.clip_grad_norm(self.cfg.optimization.clip_norm) # check that grad norms are consistent across workers - if ( - not self.cfg.optimization.use_bmuf - and self.cfg.distributed_training.distributed_wrapper != "SlowMo" - and not self.tpu - ): - self._check_grad_norms(grad_norm) + # on tpu check tensor is slow + if not self.tpu: + if ( + not self.cfg.optimization.use_bmuf + and self.cfg.distributed_training.ddp_backend != "slowmo" + ): + self._check_grad_norms(grad_norm) + if not torch.isfinite(grad_norm).all(): + # in case of AMP, if gradients are Nan/Inf then + # optimizer step is still required + if self.cfg.common.amp: + overflow = True + else: + # check local gradnorm single GPU case, trigger NanDetector + raise FloatingPointError("gradients are Nan/Inf") with torch.autograd.profiler.record_function("optimizer"): # take an optimization step - self.optimizer.step() + self.task.optimizer_step( + self.optimizer, model=self.model, update_num=self.get_num_updates() + ) + if self.cfg.common.amp and overflow: + if self._amp_retries == self.cfg.common.amp_batch_retries: + logger.info("AMP: skipping this batch.") + self._amp_retries = 0 + else: + self._amp_retries += 1 + return self.train_step( + samples, raise_oom + ) # recursion to feed in same batch except FloatingPointError: + + self.consolidate_optimizer() + self.save_checkpoint( + os.path.join(self.cfg.checkpoint.save_dir, "crash.pt"), {} + ) + # re-run the forward and backward pass with hooks attached to print # out where it fails + self.zero_grad() with NanDetector(self.get_model()): - self.task.train_step( - sample, - self.model, - self.criterion, - self.optimizer, - self.get_num_updates(), - ignore_grad=False, - ) + for _, sample in enumerate(samples): + sample, _ = self._prepare_sample(sample) + self.task.train_step( + sample, + self.model, + self.criterion, + self.optimizer, + self.get_num_updates(), + ignore_grad=False, + **extra_kwargs, + ) raise except OverflowError as e: overflow = True - logger.info("NOTE: overflow detected, " + str(e)) + logger.info( + f"NOTE: gradient overflow detected, ignoring gradient, {str(e)}" + ) + + if hasattr(self, "param_names") and hasattr( + self.optimizer, "fp32_optimizer" + ): + for p, n in zip(self.optimizer.fp32_optimizer.params, self.param_names): + if torch.isinf(p.grad).any() or torch.isnan(p.grad).any(): + logger.info(f"overflow in param {n}") + grad_norm = torch.tensor(0.0).cuda() self.zero_grad() except RuntimeError as e: @@ -630,28 +1028,36 @@ def maybe_no_sync(): logger.error("OOM during optimization, irrecoverable") raise e - # Some distributed wrappers (e.g., SlowMo) need access to the optimizer after the step - if hasattr(self.model, "perform_additional_optimizer_actions"): - if hasattr(self.optimizer, "fp32_params"): - self.model.perform_additional_optimizer_actions( - self.optimizer.optimizer, self.optimizer.fp32_params - ) - else: - self.model.perform_additional_optimizer_actions( - self.optimizer.optimizer - ) + # Some distributed wrappers (e.g., SlowMo) need access to the optimizer + # after the step + if hasattr(self.model, "perform_slowmo"): + self.model.perform_slowmo( + self.optimizer.optimizer, getattr(self.optimizer, "fp32_params", None) + ) - if ( - not overflow - or self.cfg.distributed_training.distributed_wrapper == "SlowMo" - ): + logging_output = None + if not overflow or self.cfg.distributed_training.ddp_backend == "slowmo": self.set_num_updates(self.get_num_updates() + 1) + if 
self.cfg.ema.store_ema: + # Step EMA forward with new model. + self.ema.step( + self.get_model(), + self.get_num_updates(), + ) + metrics.log_scalar( + "ema_decay", + self.ema.get_decay(), + priority=10000, + round=5, + weight=0, + ) + if self.tpu: - # mark step on TPUs import torch_xla.core.xla_model as xm - xm.mark_step() + # mark step on TPUs + self._xla_markstep_and_send_to_cpu() # only log stats every log_interval steps # this causes wps to be misreported when log_interval > 1 @@ -662,24 +1068,16 @@ def maybe_no_sync(): gb_free = mem_info["kb_free"] / 1024 / 1024 gb_total = mem_info["kb_total"] / 1024 / 1024 metrics.log_scalar( - "gb_free", - gb_free, - priority=1500, - round=1, - weight=0, + "gb_free", gb_free, priority=1500, round=1, weight=0 ) metrics.log_scalar( - "gb_total", - gb_total, - priority=1600, - round=1, - weight=0, + "gb_total", gb_total, priority=1600, round=1, weight=0 + ) + logging_outputs = self._xla_markstep_and_send_to_cpu( + logging_outputs ) - logging_output = self._reduce_and_log_stats( - logging_outputs, - sample_size, - grad_norm, + logging_outputs, sample_size, grad_norm ) # log whenever there's an XLA compilation, since these @@ -687,11 +1085,18 @@ def maybe_no_sync(): # optimization self._check_xla_compilation() else: + if self.cuda and self.cuda_env is not None: + # log minimum free memory over the iteration + gb_used = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024 + torch.cuda.reset_peak_memory_stats() + gb_free = self.cuda_env.total_memory_in_GB - gb_used + metrics.log_scalar( + "gb_free", gb_free, priority=1500, round=1, weight=0 + ) + # log stats logging_output = self._reduce_and_log_stats( - logging_outputs, - sample_size, - grad_norm, + logging_outputs, sample_size, grad_norm ) # clear CUDA cache to reduce memory fragmentation @@ -706,10 +1111,14 @@ def maybe_no_sync(): ): torch.cuda.empty_cache() - if self.cfg.common.fp16: + if self.cfg.common.fp16 or self.cfg.common.amp: metrics.log_scalar( "loss_scale", - self.optimizer.scaler.loss_scale, + ( + self.optimizer.scaler.loss_scale + if self.cfg.common.fp16 + else self.optimizer.scaler.get_scale() + ), priority=700, round=4, weight=0, @@ -725,24 +1134,23 @@ def valid_step(self, sample, raise_oom=False): import torch_xla.core.xla_model as xm xm.rendezvous("valid_step") # wait for all workers - xm.mark_step() + + # If EMA is enabled through store_ema=True + # and task.uses_ema is True, pass the EMA model as a keyword + # argument to the task. 
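The EMA object referenced here — stepped in train_step above and handed to the task in valid_step just below — is conceptually a decayed shadow copy of the model. A minimal sketch of that idea, assuming a deep-copied shadow module and a simple warmup schedule; this is not fairseq's EMAModule:

```python
# Illustrative EMA of model parameters (a sketch, not fairseq's implementation):
# step() blends the live weights into a frozen shadow copy after each update.
import copy
import torch


class SimpleEMA:
    def __init__(self, model: torch.nn.Module, decay: float = 0.999):
        self.decay = decay
        self.shadow = copy.deepcopy(model).eval()
        for p in self.shadow.parameters():
            p.requires_grad_(False)

    def get_decay(self) -> float:
        return self.decay

    @torch.no_grad()
    def step(self, model: torch.nn.Module, num_updates: int) -> None:
        # assumed warmup: use a smaller effective decay early in training
        decay = min(self.decay, (1 + num_updates) / (10 + num_updates))
        for ema_p, p in zip(self.shadow.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)

    def get_model(self) -> torch.nn.Module:
        return self.shadow
```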
+ extra_kwargs = {} + if self.cfg.ema.store_ema and getattr(self.task, "uses_ema", False): + extra_kwargs["ema_model"] = self.ema.get_model() with torch.no_grad(): self.model.eval() self.criterion.eval() - sample = self._prepare_sample(sample) - if sample is None: - sample = self._prepare_sample(self._dummy_batch) - is_dummy_batch = True - else: - if self._dummy_batch == "DUMMY": - self._dummy_batch = sample - is_dummy_batch = False + sample, is_dummy_batch = self._prepare_sample(sample) try: _loss, sample_size, logging_output = self.task.valid_step( - sample, self.model, self.criterion + sample, self.model, self.criterion, **extra_kwargs ) except RuntimeError as e: if "out of memory" in str(e): @@ -775,6 +1183,8 @@ def valid_step(self, sample, raise_oom=False): ) # log validation stats + if self.tpu: + logging_outputs = self._xla_markstep_and_send_to_cpu(logging_outputs) logging_output = self._reduce_and_log_stats(logging_outputs, sample_size) return logging_output @@ -782,6 +1192,12 @@ def valid_step(self, sample, raise_oom=False): def zero_grad(self): self.optimizer.zero_grad() + def lr_step_begin_epoch(self, epoch): + """Adjust the learning rate at the beginning of the epoch.""" + self.lr_scheduler.step_begin_epoch(epoch) + # prefer updating the LR based on the number of steps + return self.lr_step_update() + def lr_step(self, epoch, val_loss=None): """Adjust the learning rate at the end of the epoch.""" self.lr_scheduler.step(epoch, val_loss) @@ -791,7 +1207,12 @@ def lr_step(self, epoch, val_loss=None): def lr_step_update(self): """Update the learning rate after each update.""" new_lr = self.lr_scheduler.step_update(self.get_num_updates()) - metrics.log_scalar("lr", new_lr, weight=0, priority=300) + if isinstance(new_lr, dict): + for k, v in new_lr.items(): + metrics.log_scalar(f"lr_{k}", v, weight=0, priority=300) + new_lr = new_lr.get("default", next(iter(new_lr.values()))) + else: + metrics.log_scalar("lr", new_lr, weight=0, priority=300) return new_lr def get_lr(self): @@ -860,7 +1281,20 @@ def set_num_updates(self, num_updates): metrics.log_scalar("num_updates", self._num_updates, weight=0, priority=200) def clip_grad_norm(self, clip_norm): - return self.optimizer.clip_grad_norm(clip_norm, aggregate_norm_fn=None) + def agg_norm_fn(total_norm): + total_norm = total_norm.cuda().float() ** 2 + total_norm = distributed_utils.all_reduce( + total_norm, group=self.data_parallel_process_group + ) + return total_norm**0.5 + + should_agg_norm = self.is_fsdp and ( + self.data_parallel_process_group is not None + or torch.distributed.is_initialized() + ) + return self.optimizer.clip_grad_norm( + clip_norm, aggregate_norm_fn=agg_norm_fn if should_agg_norm else None + ) def cumulative_training_time(self): if self._cumulative_training_time is None: @@ -873,7 +1307,26 @@ def _local_cumulative_training_time(self): """Aggregate training time in seconds.""" return time.time() - self._start_time + self._previous_training_time - def _prepare_sample(self, sample): + def _fp_convert_sample(self, sample): + def apply_half(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.half) + return t + + def apply_bfloat16(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.bfloat16) + return t + + if self.cfg.common.fp16: + sample = utils.apply_to_sample(apply_half, sample) + + if self.cfg.common.bf16: + sample = utils.apply_to_sample(apply_bfloat16, sample) + + return sample + + def _prepare_sample(self, sample, is_dummy=False): if sample == "DUMMY": raise Exception( "Trying to use an 
uninitialized 'dummy' batch. This usually indicates " @@ -882,7 +1335,18 @@ def _prepare_sample(self, sample): ) if sample is None or len(sample) == 0: - return None + assert ( + self._dummy_batch is not None and len(self._dummy_batch) > 0 + ), "Invalid dummy batch: {}".format(self._dummy_batch) + sample, _ = self._prepare_sample(self._dummy_batch, is_dummy=True) + return sample, True + + # Given that PCIe/NVLink bandwidth is significantly smaller than DRAM bandwidth + # it makes sense to do the format conversion on the CPU and then transfer + # a smaller buffer to the device. This also saves GPU memory capacity. + + if self.cfg.common.on_cpu_convert_precision: + sample = self._fp_convert_sample(sample) if self.cuda: if self.pipeline_model_parallel: @@ -892,24 +1356,17 @@ def _prepare_sample(self, sample): ) else: sample = utils.move_to_cuda(sample) + elif self.tpu and is_dummy: + # the dummy batch may not be on the appropriate device + sample = utils.move_to_cuda(sample, device=self.device) - def apply_half(t): - if t.dtype is torch.float32: - return t.half() - return t + if not self.cfg.common.on_cpu_convert_precision: + sample = self._fp_convert_sample(sample) - def apply_bfloat16(t): - if t.dtype is torch.float32: - return t.to(dtype=torch.bfloat16) - return t - - if self.cfg.common.fp16: - sample = utils.apply_to_sample(apply_half, sample) - - if self.cfg.common.bf16: - sample = utils.apply_to_sample(apply_bfloat16, sample) + if self._dummy_batch == "DUMMY": + self._dummy_batch = sample - return sample + return sample, False def _set_seed(self): # Set seed based on args.seed and the update number so that we get @@ -1034,8 +1491,12 @@ def _check_grad_norms(self, grad_norm): def is_consistent(tensor): max_abs_diff = torch.max(torch.abs(tensor - tensor[0])) return ( - not torch.isfinite(tensor).any() - or (max_abs_diff / (tensor[0] + 1e-6) < 1e-6).all() + ( + torch.isfinite(tensor).all() + and (max_abs_diff / (tensor[0] + 1e-6) < 1e-6).all() + ) + or (self.cfg.common.amp and not torch.isfinite(tensor).all()) + # in case of amp non-finite grads are fine ) if not is_consistent(self._grad_norm_buf): @@ -1046,9 +1507,10 @@ def is_consistent(tensor): error_detail = "grad_norm across the workers:\n{}\n".format( pretty_detail ) - raise RuntimeError( + # use FloatingPointError to trigger NanDetector + raise FloatingPointError( "Fatal error: gradients are inconsistent between workers. " - "Try --ddp-backend=no_c10d. " + "Try --ddp-backend=legacy_ddp. " "Or are you mixing up different generation of GPUs in training?" 
+ "\n" + "-" * 80 @@ -1057,7 +1519,9 @@ def is_consistent(tensor): ) def _reduce_and_log_stats(self, logging_outputs, sample_size, grad_norm=None): - if grad_norm is not None: + if grad_norm is not None and ( + not torch.is_tensor(grad_norm) or torch.isfinite(grad_norm) + ): metrics.log_speed("ups", 1.0, priority=100, round=2) metrics.log_scalar("gnorm", grad_norm, priority=400, round=3) if self.cfg.optimization.clip_norm > 0: @@ -1114,6 +1578,15 @@ def _check_xla_compilation(self): ) self._num_xla_compiles = num_xla_compiles + def _xla_markstep_and_send_to_cpu(self, data=None): + import torch_xla.core.xla_model as xm + + xm.mark_step() + if data is not None: + from fairseq.utils import xla_device_to_cpu + + return xla_device_to_cpu(data) + def _catalog_shared_params(module, memo=None, prefix=""): if memo is None: diff --git a/fairseq/utils.py b/fairseq/utils.py index 0044d76f3d..4d4b350523 100644 --- a/fairseq/utils.py +++ b/fairseq/utils.py @@ -4,26 +4,23 @@ # LICENSE file in the root directory of this source tree. import argparse +import collections import contextlib import copy import importlib import logging import os import sys -import tempfile import warnings from itertools import accumulate -from typing import Callable, Dict, List, Optional +from typing import TYPE_CHECKING, Callable, Dict, List, Optional import torch import torch.nn.functional as F -from fairseq.data import iterators -from fairseq.file_io import PathManager -from fairseq.logging.meters import safe_round -from fairseq.modules import gelu, gelu_accurate -from fairseq.modules.multihead_attention import MultiheadAttention from torch import Tensor +if TYPE_CHECKING: + from fairseq.modules.multihead_attention import MultiheadAttention try: from amp_C import multi_tensor_l2norm @@ -32,6 +29,11 @@ except ImportError: multi_tensor_l2norm_available = False +try: + import torch_xla.core.xla_model as xm +except ImportError: + xm = None + logger = logging.getLogger(__name__) @@ -46,6 +48,8 @@ def __init__(self, option_strings, dest, nargs=None, **kwargs): super(FileContentsAction, self).__init__(option_strings, dest, **kwargs) def __call__(self, parser, namespace, values, option_string=None): + from fairseq.file_io import PathManager + if PathManager.isfile(values): with PathManager.open(values) as f: argument = f.read().strip() @@ -54,11 +58,9 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, argument) -def split_paths(paths: str) -> List[str]: +def split_paths(paths: str, separator=os.pathsep) -> List[str]: return ( - paths.split(os.pathsep) - if "://" not in paths - else paths.split(MANIFOLD_PATH_SEP) + paths.split(separator) if "://" not in paths else paths.split(MANIFOLD_PATH_SEP) ) @@ -81,6 +83,13 @@ def apply_to_sample(f, sample): def _apply(x): if torch.is_tensor(x): return f(x) + elif isinstance(x, collections.OrderedDict): + # OrderedDict has attributes that needs to be preserved + od = collections.OrderedDict( + (key, _apply(value)) for key, value in x.items() + ) + od.__dict__ = x.__dict__ + return od elif isinstance(x, dict): return {key: _apply(value) for key, value in x.items()} elif isinstance(x, list): @@ -101,7 +110,7 @@ def move_to_cuda(sample, device=None): def _move_to_cuda(tensor): # non_blocking is ignored if tensor is not pinned, so we can always set # to True (see github.com/PyTorchLightning/pytorch-lightning/issues/620) - return tensor.cuda(device=device, non_blocking=True) + return tensor.to(device=device, non_blocking=True) return 
apply_to_sample(_move_to_cuda, sample) @@ -117,8 +126,20 @@ def _move_to_cpu(tensor): return apply_to_sample(_move_to_cpu, sample) +def move_to_tpu(sample): + + import torch_xla.core.xla_model as xm + + device = xm.xla_device() + + def _move_to_tpu(tensor): + return tensor.to(device) + + return apply_to_sample(_move_to_tpu, sample) + + def get_incremental_state( - module: MultiheadAttention, + module: "MultiheadAttention", incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], key: str, ) -> Optional[Dict[str, Optional[Tensor]]]: @@ -127,7 +148,7 @@ def get_incremental_state( def set_incremental_state( - module: MultiheadAttention, + module: "MultiheadAttention", incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], key: str, value: Dict[str, Optional[Tensor]], @@ -249,9 +270,9 @@ def strip_pad(tensor, pad): return tensor[tensor.ne(pad)] -def buffered_arange(max): +def buffered_arange(max, device="cpu"): if not hasattr(buffered_arange, "buf"): - buffered_arange.buf = torch.LongTensor() + buffered_arange.buf = torch.LongTensor().to(device) if max > buffered_arange.buf.numel(): buffered_arange.buf.resize_(max) torch.arange(max, out=buffered_arange.buf) @@ -286,6 +307,9 @@ def convert_padding_direction( def item(tensor): + # tpu-comment: making this a no-op for xla devices. + if torch.is_tensor(tensor) and tensor.device.type == "xla": + return tensor.detach() if hasattr(tensor, "item"): return tensor.item() if hasattr(tensor, "__getitem__"): @@ -321,10 +345,19 @@ def multi_tensor_total_norm(grads, chunk_size=2048 * 32) -> torch.Tensor: @torch.no_grad() def clip_grad_norm_(params, max_norm, aggregate_norm_fn=None) -> torch.Tensor: + def grad_exists(p): + return p is not None and getattr(p, "grad", None) is not None + if isinstance(params, torch.Tensor): params = [params] params = list(params) - grads = [p.grad.detach() for p in filter(lambda p: p.grad is not None, params)] + grads = [ + p.grad.detach() for p in params if grad_exists(p) and not hasattr(p, "expert") + ] + expert_grads = [ + p.grad.detach() for p in params if grad_exists(p) and hasattr(p, "expert") + ] + if len(grads) == 0: if len(params) > 0: return params[0].new_tensor(0.0) @@ -359,8 +392,8 @@ def clip_grad_norm_(params, max_norm, aggregate_norm_fn=None) -> torch.Tensor: if max_norm > 0: max_norm = float(max_norm) clip_coef = (max_norm / (total_norm + 1e-6)).clamp_(max=1) - for g in grads: - g.mul_(clip_coef) + torch._foreach_mul_(grads + expert_grads, clip_coef) + return total_norm @@ -432,7 +465,9 @@ def import_user_module(args): module_path = getattr(args, "user_dir", None) if module_path is not None: module_path = os.path.abspath(args.user_dir) - if not os.path.exists(module_path): + if not os.path.exists(module_path) and not os.path.isfile( + os.path.dirname(module_path) + ): fairseq_rel_path = os.path.join(os.path.dirname(__file__), args.user_dir) if os.path.exists(fairseq_rel_path): module_path = fairseq_rel_path @@ -445,18 +480,35 @@ def import_user_module(args): else: raise FileNotFoundError(module_path) - # We want to import the module under a unique name so that it doesn't - # collide with existing modules. At the same time we don't want to - # import the module multiple times. The solution is to create a - # temporary directory and symlink the user_dir under a new name, which is - # a deterministic hash of the original module_path. 
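The removed comment above describes the old tempfile/symlink trick for importing `--user-dir` under a unique name; the replacement just below instead imports each user directory exactly once by memoizing its path and importing the directory name as a module. A small self-contained sketch of that idea (the function and set names here are illustrative, not fairseq's):

```python
# Sketch: import each user directory at most once per process by remembering
# absolute paths and putting the parent directory on sys.path, so the
# directory name itself becomes the module name.
import importlib
import os
import sys

_imported_user_dirs = set()


def import_user_dir(module_path: str) -> None:
    module_path = os.path.abspath(module_path)
    if module_path in _imported_user_dirs:
        return  # already imported in this process
    _imported_user_dirs.add(module_path)

    module_parent, module_name = os.path.split(module_path)
    if module_name not in sys.modules:
        sys.path.insert(0, module_parent)
        importlib.import_module(module_name)
```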
- with tempfile.TemporaryDirectory() as tmpdirname: - unique_mod_name = "fairseq_user_dir_{}".format(hash(module_path) % 100000) - os.symlink(module_path, os.path.join(tmpdirname, unique_mod_name)) + # ensure that user modules are only imported once + import_user_module.memo = getattr(import_user_module, "memo", set()) + if module_path not in import_user_module.memo: + import_user_module.memo.add(module_path) + + module_parent, module_name = os.path.split(module_path) + if module_name not in sys.modules: + sys.path.insert(0, module_parent) + importlib.import_module(module_name) + + tasks_path = os.path.join(module_path, "tasks") + if os.path.exists(tasks_path): + from fairseq.tasks import import_tasks - sys.path.insert(0, tmpdirname) - importlib.import_module(unique_mod_name) - sys.path.remove(tmpdirname) + import_tasks(tasks_path, f"{module_name}.tasks") + + models_path = os.path.join(module_path, "models") + if os.path.exists(models_path): + from fairseq.models import import_models + + import_models(models_path, f"{module_name}.models") + elif module_path in sys.modules[module_name].__path__: + logger.info(f"--user-dir={module_path} has already been imported.") + else: + raise ImportError( + "Failed to import --user-dir={} because the corresponding module name " + "({}) is not globally unique. Please rename the directory to " + "something unique and try again.".format(module_path, module_name) + ) def softmax(x, dim: int, onnx_trace: bool = False): @@ -474,10 +526,12 @@ def log_softmax(x, dim: int, onnx_trace: bool = False): def get_perplexity(loss, round=2, base=2): + from fairseq.logging.meters import safe_round + if loss is None: return 0.0 try: - return safe_round(base ** loss, round) + return safe_round(base**loss, round) except OverflowError: return float("inf") @@ -487,10 +541,18 @@ def deprecation_warning(message, stacklevel=3): warnings.warn(message, stacklevel=stacklevel) +def relu_squared(x: torch.Tensor): + return F.relu(x).pow(2) + + def get_activation_fn(activation: str) -> Callable: - """ Returns the activation function corresponding to `activation` """ + """Returns the activation function corresponding to `activation`""" + from fairseq.modules import gelu, gelu_accurate + if activation == "relu": return F.relu + elif activation == "relu_squared": + return relu_squared elif activation == "gelu": return gelu elif activation == "gelu_fast": @@ -504,6 +566,8 @@ def get_activation_fn(activation: str) -> Callable: return torch.tanh elif activation == "linear": return lambda x: x + elif activation == "swish": + return torch.nn.SiLU else: raise RuntimeError("--activation-fn {} not supported".format(activation)) @@ -535,23 +599,39 @@ def has_parameters(module): return False -def set_torch_seed(seed): - # Set seed based on args.seed and the update number so that we get - # reproducible results when resuming from checkpoints - assert isinstance(seed, int) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) +def get_rng_state(): + state = {"torch_rng_state": torch.get_rng_state()} + if xm is not None: + state["xla_rng_state"] = xm.get_rng_state() + if torch.cuda.is_available(): + state["cuda_rng_state"] = torch.cuda.get_rng_state() + return state -@contextlib.contextmanager -def with_torch_seed(seed): - assert isinstance(seed, int) - rng_state = torch.get_rng_state() - cuda_rng_state = torch.cuda.get_rng_state() - set_torch_seed(seed) - yield - torch.set_rng_state(rng_state) - torch.cuda.set_rng_state(cuda_rng_state) +def set_rng_state(state): + 
torch.set_rng_state(state["torch_rng_state"]) + if xm is not None: + xm.set_rng_state(state["xla_rng_state"]) + if torch.cuda.is_available(): + torch.cuda.set_rng_state(state["cuda_rng_state"]) + + +class set_torch_seed(object): + def __init__(self, seed): + assert isinstance(seed, int) + self.rng_state = get_rng_state() + + torch.manual_seed(seed) + if xm is not None: + xm.set_rng_state(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + + def __enter__(self): + return self + + def __exit__(self, *exc): + set_rng_state(self.rng_state) def parse_alignment(line): @@ -607,6 +687,18 @@ def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): return alignment +def extract_soft_alignment(attn, src_sent, tgt_sent, pad, eos): + tgt_valid = ((tgt_sent != pad)).nonzero(as_tuple=False) + src_valid = ((src_sent != pad)).nonzero(as_tuple=False).squeeze(dim=-1) + alignment = [] + if len(tgt_valid) != 0 and len(src_valid) != 0: + attn_valid = attn[tgt_valid, src_valid] + alignment = [ + ["{:.6f}".format(p) for p in src_probs.tolist()] for src_probs in attn_valid + ] + return alignment + + def new_arange(x, *size): """ Return a Tensor of `size` filled with a range function on the device of x. @@ -617,9 +709,7 @@ def new_arange(x, *size): return torch.arange(size[-1], device=x.device).expand(*size).contiguous() -def get_tpu_device(args): - import torch_xla.core.xla_model as xm - +def get_tpu_device(): return xm.xla_device() @@ -627,6 +717,8 @@ def tpu_data_loader(itr): import torch_xla.core.xla_model as xm import torch_xla.distributed.parallel_loader as pl + from fairseq.data import iterators + xm.rendezvous("tpu_data_loader") # wait for all workers xm.mark_step() device = xm.xla_device() @@ -637,6 +729,28 @@ def tpu_data_loader(itr): ) +def is_xla_tensor(tensor): + return torch.is_tensor(tensor) and tensor.device.type == "xla" + + +def index_put(tensor, indices, value): + if is_xla_tensor(tensor): + for _ in range(indices.dim(), tensor.dim()): + indices = indices.unsqueeze(-1) + if indices.size(-1) < tensor.size(-1): + indices = indices.expand_as(tensor) + tensor = torch.mul(tensor, ~indices) + torch.mul(value, indices) + else: + tensor[indices] = value + return tensor + + +def xla_device_to_cpu(dat): + import torch_xla.core.xla_model as xm + + return xm._maybe_convert_to_cpu(dat) + + class CudaEnvironment(object): def __init__(self): cur_device = torch.cuda.current_device() @@ -696,3 +810,142 @@ def eval_bool(x, default=False): return bool(eval(x)) except TypeError: return default + + +def reset_logging(): + root = logging.getLogger() + for handler in root.handlers: + root.removeHandler(handler) + root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper()) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter( + logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + root.addHandler(handler) + + +def safe_getattr(obj, k, default=None): + """Returns obj[k] if it exists and is not None, otherwise returns default.""" + from omegaconf import OmegaConf + + if OmegaConf.is_config(obj): + return obj[k] if k in obj and obj[k] is not None else default + + return getattr(obj, k, default) + + +def safe_hasattr(obj, k): + """Returns True if the given key exists and is not None.""" + return getattr(obj, k, None) is not None + + +def hotreload_function(name=None): + """ + Decorator to function to enable hot-reload for debugging. 
+ It allows you to debug a function without having reloading all heavy models, dataset loading and + preprocessing, allow faster debugging. + If you want to change model or dataset loading, consider relaunching your code + ----------------------------------- + This will run the decorated function func: + if func run successful: + It will pause, allow user to edit code, and prompt user to: + Press enter to re-run the function with updated code + Type "done" to finish the function, return output + Type "disable" to stop pausing this function and let code continue without pause + Ctril + C to terminal + if func raise error: + it will prompt user to + 1. Edit code, and press enter to retry + 2. Ctrl + C to terminate + 3. Type "raise" to raise that exception + * Requirements: + 0. Fairseq was installed with `pip install --editable .` + 1. pip install jurigged[develoop] + 2. set environment HOTRELOAD_PAUSE=1 CUDA_LAUNCH_BLOCKING=1 + 3. Run on only 1 GPU (no distributed) + * How to use: + 1. in python, import and decorate the top-level function to be re-run after code edits: + ```python + from fairseq.utils import hotreload_function + .... + @hotreload_function("train_step") + def train_step(self, sample ....): + .... + .... + ``` + 2. in bash run scripts: + ```bash + watch_dir=<home>/fairseq-py/fairseq/tasks # directory to watch for file changes + export CUDA_VISIBLE_DEVICES=0 # single-gpu + HOTRELOAD_PAUSE=1 CUDA_LAUNCH_BLOCKING=1 python -m jurigged -w ${watch_dir} --poll 2 -v train.py ...... + ``` + * NOTE: + 1. -w ${watch_dir} specify all the files to be watched for changes + once functions, class, ... code are changed, all instances in the process will get updated (hot-reload) + * Limitation: + * Currently distributed debugging not working + * Need to launch train.py locally (cannot submit jobs) + """ + try: + import jurigged + except ImportError as e: + logger.warning("Please install jurigged: pip install jurigged[develoop]") + raise e + from fairseq.distributed import utils as distributed_utils + import traceback + + def hotreload_decorator(func): + assert callable(func), f"not callable: {func}" + jname = name or func.__name__ + logger.info(f"jurigged-hotreload:Apply jurigged on {jname}:{func.__name__}") + HOTRELOAD_PAUSE = bool(os.environ.get("HOTRELOAD_PAUSE", 0)) + cublk = bool(os.environ.get("CUDA_LAUNCH_BLOCKING", 0)) + prefix = f"HOTRELOAD:{jname}:[cublk={cublk}]" + hot_reload_state = {"disable": False} + + def func_wrapper(*args, **kwargs): + if not HOTRELOAD_PAUSE or hot_reload_state["disable"]: + return func(*args, **kwargs) + world_size = distributed_utils.get_global_world_size() + assert ( + world_size <= 1 + ), f"HOTRELOAD_PAUSE:{jname} currently cannot do distributed training" + success = False + while not success: + try: + output = func(*args, **kwargs) + # success = True + end_action = input( + f"{prefix}: PAUSE, you may edit code now. Enter to re-run, ctrl+C to terminate, " + f'type "done" to continue (function still being watched), or type "disable" to stop pausing this function :' + ) + if end_action.strip().lower() in ["disable", "done"]: + success = True + else: + logger.warning( + f"{prefix}: action={end_action} function will re-run now." 
+ ) + except Exception as e: + action = input( + f"{prefix}:ERROR: \n{traceback.format_exc()}\n" + f'Edit code to try again: enter to continue, ctrl+C to terminate, or type "raise" to raise the exception: ' + ) + if action.strip().lower() == "raise": + raise e + + if end_action.strip().lower() == "disable": + logger.warning( + f"{prefix}: Stop pausing {jname}. The function is still being watched and newly editted code will take effect " + f"if the {jname} is called again later." + f' "unset HOTRELOAD_PAUSE" before relaunch to disable hotreload and' + f" remove @hotreload_function decorator in the code." + ) + hot_reload_state["disable"] = True + return output + + return func_wrapper + + return hotreload_decorator diff --git a/fairseq/version.txt b/fairseq/version.txt index 41432f00d9..26acbf080b 100644 --- a/fairseq/version.txt +++ b/fairseq/version.txt @@ -1 +1 @@ -1.0.0a0 +0.12.2 diff --git a/fairseq_cli/eval_lm.py b/fairseq_cli/eval_lm.py index b70c0d3a77..dbd1450a9e 100644 --- a/fairseq_cli/eval_lm.py +++ b/fairseq_cli/eval_lm.py @@ -11,164 +11,92 @@ import logging import math import os +import sys from argparse import Namespace +from typing import Iterable, List, Optional import torch +from omegaconf import DictConfig + +import fairseq from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils -from fairseq.data import LMContextWindowDataset -from fairseq.dataclass.data_class import register_hydra_cfg from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging import progress_bar -from fairseq.logging.meters import StopwatchMeter, TimeMeter +from fairseq.logging.meters import StopwatchMeter from fairseq.sequence_scorer import SequenceScorer -from hydra.core.config_store import ConfigStore -from hydra.experimental import initialize -from omegaconf import DictConfig - logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, ) logger = logging.getLogger("fairseq_cli.eval_lm") -class WordStat(object): - def __init__(self, word, is_bpe): - self.word = word - self.is_bpe = is_bpe - self.log_prob = 0 - self.next_word_prob = 0 - self.count = 0 - self.missing_next_words = 0 - - def add(self, log_prob, next_word_prob): - """increments counters for the sum of log probs of current word and next - word (given context ending at current word). 
Since the next word might be at the end of the example, - or it might be not counted because it is not an ending subword unit, - also keeps track of how many of those we have seen""" - if next_word_prob is not None: - self.next_word_prob += next_word_prob - else: - self.missing_next_words += 1 - self.log_prob += log_prob - self.count += 1 - - def __str__(self): - return "{}\t{}\t{}\t{}\t{}\t{}".format( - self.word, - self.count, - self.log_prob, - self.is_bpe, - self.next_word_prob, - self.count - self.missing_next_words, - ) - - -def main(cfg: DictConfig, override_args=None, **unused_kwargs): - if isinstance(cfg, Namespace): - cfg = convert_namespace_to_omegaconf(cfg) - - utils.import_user_module(cfg.common) - - use_fp16 = cfg.common.fp16 - use_cuda = torch.cuda.is_available() and not cfg.common.cpu - - if use_cuda: - torch.cuda.set_device(cfg.distributed_training.device_id) - - if override_args is not None: - overrides = vars(override_args) - overrides.update(eval(getattr(override_args, "model_overrides", "{}"))) - else: - overrides = None - - logger.info(cfg) - - # Load ensemble - logger.info("loading model(s) from {}".format(cfg.common_eval.path)) - - # reduce tokens per sample by the required context window size - cfg.task.tokens_per_sample -= cfg.eval_lm.context_window - - models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( - [cfg.common_eval.path], - arg_overrides=overrides, - suffix=cfg.checkpoint.checkpoint_suffix, - strict=(cfg.checkpoint.checkpoint_shard_count == 1), - num_shards=cfg.checkpoint.checkpoint_shard_count, - ) - - # Load dataset splits - gen_subset = cfg.dataset.gen_subset - task.load_dataset(gen_subset) - dataset = task.dataset(gen_subset) - if cfg.eval_lm.context_window > 0: - dataset = LMContextWindowDataset( - dataset=dataset, - tokens_per_sample=cfg.task.tokens_per_sample, - context_window=cfg.eval_lm.context_window, - pad_idx=task.source_dictionary.pad(), - ) - logger.info("{} {} {} examples".format(cfg.task.data, gen_subset, len(dataset))) - - # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer) - for model in models: - if use_fp16: - model.half() - if use_cuda and not cfg.distributed_training.pipeline_model_parallel: - model.cuda() - model.prepare_for_inference_(cfg) - - assert len(models) > 0 - - logger.info( - "num. 
model params: {}".format(sum(p.numel() for p in models[0].parameters())) - ) - - itr = task.get_batch_iterator( - dataset=dataset, - max_tokens=cfg.dataset.max_tokens or 36000, - max_sentences=cfg.dataset.batch_size, - max_positions=utils.resolve_max_positions( - *[model.max_positions() for model in models] - ), - ignore_invalid_inputs=True, - num_shards=max( - cfg.dataset.num_shards, - cfg.distributed_training.distributed_world_size, - ), - shard_id=max( - cfg.dataset.shard_id, - cfg.distributed_training.distributed_rank, - ), - num_workers=cfg.dataset.num_workers, - data_buffer_size=cfg.dataset.data_buffer_size, - ).next_epoch_itr(shuffle=False) - progress = progress_bar.progress_bar( - itr, - log_format=cfg.common.log_format, - log_interval=cfg.common.log_interval, - default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), - ) +def eval_lm( + models: List[fairseq.models.FairseqModel], + source_dictionary: fairseq.data.Dictionary, + batch_iterator: Iterable, + post_process: Optional[str] = None, + output_word_probs: bool = False, + output_word_stats: bool = False, + target_dictionary: Optional[fairseq.data.Dictionary] = None, + softmax_batch: int = 0, + remove_bos_token: bool = False, + device: Optional[torch.device] = None, +): + """ + Args: + models (List[~fairseq.models.FairseqModel]): list of models to + evaluate. Models are essentially `nn.Module` instances, but + must be compatible with fairseq's `SequenceScorer`. + source_dictionary (~fairseq.data.Dictionary): dictionary for + applying any relevant post processing or outputing word + probs/stats. + batch_iterator (Iterable): yield batches of data + post_process (Optional[str]): post-process text by removing BPE, + letter segmentation, etc. Valid options can be found in + fairseq.data.utils.post_process, although not all options + are implemented here. 
+ output_word_probs (Optional[bool]): output words and their + predicted log probabilities + output_word_stats (Optional[bool]): output word statistics such + as word count and average probability + target_dictionary (Optional[~fairseq.data.Dictionary]): output + dictionary (defaults to *source_dictionary*) + softmax_batch (Optional[bool]): if BxT is more than this, will + batch the softmax over vocab to this amount of tokens, in + order to fit into GPU memory + remove_bos_token (Optional[bool]): if True, confirm that the + first token is the beginning-of-sentence symbol (according + to the relevant dictionary) and remove it from the output + device (Optional[torch.device]): device to use for evaluation + (defaults to device of first model parameter) + """ + if target_dictionary is None: + target_dictionary = source_dictionary + if device is None: + device = next(models[0].parameters()).device gen_timer = StopwatchMeter() - scorer = SequenceScorer(task.target_dictionary, cfg.eval_lm.softmax_batch) + scorer = SequenceScorer(target_dictionary, softmax_batch) score_sum = 0.0 count = 0 - if cfg.common_eval.post_process is not None: - if cfg.common_eval.post_process == "sentencepiece": - raise NotImplementedError - else: - bpe_cont = cfg.common_eval.post_process.rstrip() + if post_process is not None: + if post_process in {"subword_nmt", "@@ "}: + bpe_cont = post_process.rstrip() bpe_toks = { i - for i in range(len(task.source_dictionary)) - if task.source_dictionary[i].endswith(bpe_cont) + for i in range(len(source_dictionary)) + if source_dictionary[i].endswith(bpe_cont) } + else: + raise NotImplementedError( + f"--post-process={post_process} is not implemented" + ) bpe_len = len(bpe_cont) else: bpe_toks = None @@ -176,13 +104,11 @@ def main(cfg: DictConfig, override_args=None, **unused_kwargs): word_stats = dict() - wps_meter = TimeMeter() - - for sample in progress: + for sample in batch_iterator: if "net_input" not in sample: continue - sample = utils.move_to_cuda(sample) if use_cuda else sample + sample = utils.move_to_cuda(sample, device=device) gen_timer.start() hypos = scorer.generate(models, sample) @@ -196,8 +122,8 @@ def main(cfg: DictConfig, override_args=None, **unused_kwargs): tgt_len = tokens.numel() pos_scores = hypo["positional_scores"].float() - if cfg.task.add_bos_token: - assert hypo["tokens"][0].item() == task.target_dictionary.bos() + if remove_bos_token: + assert hypo["tokens"][0].item() == target_dictionary.bos() tokens = tokens[1:] pos_scores = pos_scores[1:] @@ -213,19 +139,19 @@ def main(cfg: DictConfig, override_args=None, **unused_kwargs): if inf_scores.any(): logger.info( "skipping tokens with inf scores:", - task.target_dictionary.string(tokens[inf_scores.nonzero()]), + target_dictionary.string(tokens[inf_scores.nonzero()]), ) pos_scores = pos_scores[(~inf_scores).nonzero()] score_sum += pos_scores.sum().cpu() count += pos_scores.numel() - skipped_toks - if cfg.eval_lm.output_word_probs or cfg.eval_lm.output_word_stats: + if output_word_probs or output_word_stats: w = "" word_prob = [] is_bpe = False for i in range(len(tokens)): w_ind = tokens[i].item() - w += task.source_dictionary[w_ind] + w += source_dictionary[w_ind] if bpe_toks is not None and w_ind in bpe_toks: w = w[:-bpe_len] is_bpe = True @@ -245,7 +171,7 @@ def main(cfg: DictConfig, override_args=None, **unused_kwargs): ) is_bpe = False w = "" - if cfg.eval_lm.output_word_probs: + if output_word_probs: logger.info( str(int(sample_id)) + " " @@ -256,39 +182,166 @@ def main(cfg: DictConfig, 
override_args=None, **unused_kwargs): ) ) - wps_meter.update(sample["ntokens"]) - progress.log({"wps": round(wps_meter.avg)}) + avg_nll_loss = ( + -score_sum / count / math.log(2) if count > 0 else 0 + ) # convert to base 2 + logger.info( + "Evaluated {:,} tokens in {:.1f}s ({:.2f} tokens/s)".format( + gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg if gen_timer.avg > 0 else 0 + ) + ) + + if output_word_stats: + for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): + logger.info(ws) + + return { + "loss": avg_nll_loss, + "perplexity": 2**avg_nll_loss, + } + + +class WordStat(object): + def __init__(self, word, is_bpe): + self.word = word + self.is_bpe = is_bpe + self.log_prob = 0 + self.next_word_prob = 0 + self.count = 0 + self.missing_next_words = 0 + + def add(self, log_prob, next_word_prob): + """increments counters for the sum of log probs of current word and next + word (given context ending at current word). Since the next word might be at the end of the example, + or it might be not counted because it is not an ending subword unit, + also keeps track of how many of those we have seen""" + if next_word_prob is not None: + self.next_word_prob += next_word_prob + else: + self.missing_next_words += 1 + self.log_prob += log_prob + self.count += 1 + + def __str__(self): + return "{}\t{}\t{}\t{}\t{}\t{}".format( + self.word, + self.count, + self.log_prob, + self.is_bpe, + self.next_word_prob, + self.count - self.missing_next_words, + ) + + +def main(cfg: DictConfig, **unused_kwargs): + if isinstance(cfg, Namespace): + cfg = convert_namespace_to_omegaconf(cfg) + + utils.import_user_module(cfg.common) + + logger.info(cfg) + + if cfg.eval_lm.context_window > 0: + # reduce tokens per sample by the required context window size + cfg.task.tokens_per_sample -= cfg.eval_lm.context_window + + # Initialize the task using the current *cfg* + task = tasks.setup_task(cfg.task) + + # Load ensemble + logger.info("loading model(s) from {}".format(cfg.common_eval.path)) + models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( + [cfg.common_eval.path], + arg_overrides=eval(cfg.common_eval.model_overrides), + suffix=cfg.checkpoint.checkpoint_suffix, + strict=(cfg.checkpoint.checkpoint_shard_count == 1), + num_shards=cfg.checkpoint.checkpoint_shard_count, + task=task, + ) + + use_fp16 = cfg.common.fp16 + use_cuda = torch.cuda.is_available() and not cfg.common.cpu + if use_cuda: + torch.cuda.set_device(cfg.distributed_training.device_id) + + # Optimize ensemble for generation and set the source and dest dicts on the model + # (required by scorer) + for model in models: + if use_fp16: + model.half() + if use_cuda and not cfg.distributed_training.pipeline_model_parallel: + model.cuda() + model.prepare_for_inference_(cfg) + + assert len(models) > 0 - avg_nll_loss = -score_sum / count / math.log(2) # convert to base 2 logger.info( - "Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)".format( - gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg + "num. 
model params: {:,}".format(sum(p.numel() for p in models[0].parameters())) + ) + + # Load dataset splits + task.load_dataset(cfg.dataset.gen_subset) + dataset = task.dataset(cfg.dataset.gen_subset) + logger.info( + "{} {} {:,} examples".format( + cfg.task.data, cfg.dataset.gen_subset, len(dataset) ) ) + + itr = task.eval_lm_dataloader( + dataset=dataset, + max_tokens=cfg.dataset.max_tokens or 36000, + batch_size=cfg.dataset.batch_size, + max_positions=utils.resolve_max_positions( + *[model.max_positions() for model in models] + ), + num_shards=max( + cfg.dataset.num_shards, + cfg.distributed_training.distributed_world_size, + ), + shard_id=max( + cfg.dataset.shard_id, + cfg.distributed_training.distributed_rank, + ), + num_workers=cfg.dataset.num_workers, + data_buffer_size=cfg.dataset.data_buffer_size, + context_window=cfg.eval_lm.context_window, + ) + + itr = progress_bar.progress_bar( + itr, + log_format=cfg.common.log_format, + log_interval=cfg.common.log_interval, + default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), + ) + + results = eval_lm( + models=models, + source_dictionary=task.source_dictionary, + batch_iterator=itr, + post_process=cfg.common_eval.post_process, + output_word_probs=cfg.eval_lm.output_word_probs, + output_word_stats=cfg.eval_lm.output_word_stats, + target_dictionary=task.target_dictionary, + softmax_batch=cfg.eval_lm.softmax_batch, + remove_bos_token=getattr(cfg.task, "add_bos_token", False), + ) + logger.info( "Loss (base 2): {:.4f}, Perplexity: {:.2f}".format( - avg_nll_loss, 2 ** avg_nll_loss + results["loss"], results["perplexity"] ) ) - if cfg.eval_lm.output_word_stats: - for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): - logger.info(ws) + return results def cli_main(): parser = options.get_eval_lm_parser() args = options.parse_args_and_arch(parser) - # only override args that are explicitly given on the command line - override_parser = options.get_validation_parser() - override_args = options.parse_args_and_arch(override_parser, suppress_defaults=True) - - distributed_utils.call_main(args, main, override_args=override_args) + distributed_utils.call_main(convert_namespace_to_omegaconf(args), main) if __name__ == "__main__": - cs = ConfigStore.instance() - register_hydra_cfg(cs) - initialize(config_path="../config", strict=True) cli_main() diff --git a/fairseq_cli/generate.py b/fairseq_cli/generate.py index f7260e125e..b8757835d4 100644 --- a/fairseq_cli/generate.py +++ b/fairseq_cli/generate.py @@ -17,15 +17,12 @@ import numpy as np import torch +from omegaconf import DictConfig + from fairseq import checkpoint_utils, options, scoring, tasks, utils -from fairseq.data import encoders -from fairseq.dataclass.data_class import register_hydra_cfg from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging import progress_bar from fairseq.logging.meters import StopwatchMeter, TimeMeter -from hydra.core.config_store import ConfigStore -from hydra.experimental import initialize -from omegaconf import DictConfig def main(cfg: DictConfig): @@ -84,7 +81,6 @@ def _main(cfg: DictConfig, output_file): # Load dataset splits task = tasks.setup_task(cfg.task) - task.load_dataset(cfg.dataset.gen_subset) # Set dictionaries try: @@ -97,7 +93,7 @@ def _main(cfg: DictConfig, output_file): # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) - models, _model_args = checkpoint_utils.load_model_ensemble( + models, saved_cfg = checkpoint_utils.load_model_ensemble( 
utils.split_paths(cfg.common_eval.path), arg_overrides=overrides, task=task, @@ -106,6 +102,9 @@ def _main(cfg: DictConfig, output_file): num_shards=cfg.checkpoint.checkpoint_shard_count, ) + # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config + task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) + if cfg.generation.lm_path is not None: overrides["data"] = cfg.task.data @@ -166,12 +165,12 @@ def _main(cfg: DictConfig, output_file): extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": cfg.generation.lm_weight} generator = task.build_generator( - models, cfg.task, extra_gen_cls_kwargs=extra_gen_cls_kwargs + models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs ) # Handle tokenization and BPE - tokenizer = encoders.build_tokenizer(cfg.tokenizer) - bpe = encoders.build_bpe(cfg.bpe) + tokenizer = task.build_tokenizer(cfg.tokenizer) + bpe = task.build_bpe(cfg.bpe) def decode_fn(x): if bpe is not None: @@ -299,7 +298,7 @@ def decode_fn(x): file=output_file, ) - if cfg.generation.print_alignment: + if cfg.generation.print_alignment == "hard": print( "A-{}\t{}".format( sample_id, @@ -312,6 +311,16 @@ def decode_fn(x): ), file=output_file, ) + if cfg.generation.print_alignment == "soft": + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [",".join(src_probs) for src_probs in alignment] + ), + ), + file=output_file, + ) if cfg.generation.print_step: print( @@ -336,7 +345,10 @@ def decode_fn(x): # Score only the top hypothesis if has_target and j == 0: - if align_dict is not None or cfg.common_eval.post_process is not None: + if ( + align_dict is not None + or cfg.common_eval.post_process is not None + ): # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True @@ -357,7 +369,7 @@ def decode_fn(x): logger.info("NOTE: hypothesis and token scores are output in base 2") logger.info( - "Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( + "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( num_sentences, gen_timer.n, gen_timer.sum, @@ -388,12 +400,18 @@ def decode_fn(x): def cli_main(): parser = options.get_generation_parser() + # TODO: replace this workaround with refactoring of `AudioPretraining` + parser.add_argument( + "--arch", + "-a", + metavar="ARCH", + default="wav2vec2", + help="Model architecture. For constructing tasks that rely on " + "model args (e.g. `AudioPretraining`)", + ) args = options.parse_args_and_arch(parser) main(args) if __name__ == "__main__": - cs = ConfigStore.instance() - register_hydra_cfg(cs) - initialize(config_path="../config", strict=True) cli_main() diff --git a/fairseq_cli/hydra_train.py b/fairseq_cli/hydra_train.py new file mode 100644 index 0000000000..607340af0d --- /dev/null +++ b/fairseq_cli/hydra_train.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
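Looking back at the `decode_fn` used in generate.py above: post-processing runs BPE decoding first and detokenization second. A hedged sketch of that ordering, assuming the tokenizer/bpe wrappers expose a `decode(str) -> str` method as fairseq's encoder objects do:

```python
# Sketch of generate.py's post-processing order: undo BPE, then detokenize.
from typing import Callable, Optional


def make_decode_fn(bpe: Optional[object], tokenizer: Optional[object]) -> Callable[[str], str]:
    def decode_fn(x: str) -> str:
        if bpe is not None:
            x = bpe.decode(x)        # e.g. subword-nmt: "the@@ re" -> "there"
        if tokenizer is not None:
            x = tokenizer.decode(x)  # e.g. Moses detokenization
        return x

    return decode_fn
```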
+ +import logging +import os + +import hydra +import torch +from hydra.core.hydra_config import HydraConfig +from omegaconf import OmegaConf, open_dict + +from fairseq import distributed_utils, metrics +from fairseq.dataclass.configs import FairseqConfig +from fairseq.dataclass.initialize import add_defaults, hydra_init +from fairseq.dataclass.utils import omegaconf_no_object_check +from fairseq.utils import reset_logging +from fairseq_cli.train import main as pre_main + +logger = logging.getLogger("fairseq_cli.hydra_train") + + +@hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config") +def hydra_main(cfg: FairseqConfig) -> float: + _hydra_main(cfg) + + +def _hydra_main(cfg: FairseqConfig, **kwargs) -> float: + add_defaults(cfg) + + if cfg.common.reset_logging: + reset_logging() # Hydra hijacks logging, fix that + else: + # check if directly called or called through hydra_main + if HydraConfig.initialized(): + with open_dict(cfg): + # make hydra logging work with ddp (see # see https://github.com/facebookresearch/hydra/issues/1126) + cfg.job_logging_cfg = OmegaConf.to_container( + HydraConfig.get().job_logging, resolve=True + ) + + with omegaconf_no_object_check(): + cfg = OmegaConf.create( + OmegaConf.to_container(cfg, resolve=True, enum_to_str=True) + ) + OmegaConf.set_struct(cfg, True) + + try: + if cfg.common.profile: + with torch.cuda.profiler.profile(): + with torch.autograd.profiler.emit_nvtx(): + distributed_utils.call_main(cfg, pre_main, **kwargs) + else: + distributed_utils.call_main(cfg, pre_main, **kwargs) + except BaseException as e: + if not cfg.common.suppress_crashes: + raise + else: + logger.error("Crashed! " + str(e)) + + # get best val and return - useful for sweepers + try: + best_val = metrics.get_smoothed_value( + "valid", cfg.checkpoint.best_checkpoint_metric + ) + except: + best_val = None + + if best_val is None: + best_val = float("inf") + + return best_val + + +def cli_main(): + try: + from hydra._internal.utils import get_args + + cfg_name = get_args().config_name or "config" + except: + logger.warning("Failed to get config name from hydra args") + cfg_name = "config" + + hydra_init(cfg_name) + hydra_main() + + +if __name__ == "__main__": + cli_main() diff --git a/fairseq_cli/hydra_validate.py b/fairseq_cli/hydra_validate.py new file mode 100644 index 0000000000..cb6f7612d0 --- /dev/null +++ b/fairseq_cli/hydra_validate.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
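Both new hydra entry points (hydra_train above, hydra_validate below) materialize the structured config the same way before handing it to `distributed_utils.call_main`: resolve interpolations into plain containers, re-create the config, and lock its structure. A small self-contained sketch of that step; `ExampleConfig` is a stand-in dataclass, not a fairseq config:

```python
# Sketch of the config resolution performed in _hydra_main: convert to plain
# Python containers (resolving interpolations and enums), rebuild the config,
# and freeze its structure so unknown keys raise instead of silently appearing.
from dataclasses import dataclass

from omegaconf import OmegaConf


@dataclass
class ExampleConfig:
    seed: int = 1
    log_interval: int = 100


cfg = OmegaConf.structured(ExampleConfig(seed=7))
cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True))
OmegaConf.set_struct(cfg, True)
print(cfg.seed)  # 7
```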
+ +import logging +import os +import sys +from itertools import chain + +import torch +from hydra.core.hydra_config import HydraConfig +from omegaconf import OmegaConf, open_dict +import hydra + +from fairseq import checkpoint_utils, distributed_utils, utils +from fairseq.dataclass.configs import FairseqConfig +from fairseq.dataclass.initialize import add_defaults, hydra_init +from fairseq.dataclass.utils import omegaconf_no_object_check +from fairseq.distributed import utils as distributed_utils +from fairseq.logging import metrics, progress_bar +from fairseq.utils import reset_logging + +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=sys.stdout, +) +logger = logging.getLogger("fairseq_cli.validate") + + +@hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config") +def hydra_main(cfg: FairseqConfig) -> float: + return _hydra_main(cfg) + + +def _hydra_main(cfg: FairseqConfig, **kwargs) -> float: + add_defaults(cfg) + + if cfg.common.reset_logging: + reset_logging() # Hydra hijacks logging, fix that + else: + # check if directly called or called through hydra_main + if HydraConfig.initialized(): + with open_dict(cfg): + # make hydra logging work with ddp (see # see https://github.com/facebookresearch/hydra/issues/1126) + cfg.job_logging_cfg = OmegaConf.to_container( + HydraConfig.get().job_logging, resolve=True + ) + + with omegaconf_no_object_check(): + cfg = OmegaConf.create( + OmegaConf.to_container(cfg, resolve=True, enum_to_str=True) + ) + OmegaConf.set_struct(cfg, True) + + assert ( + cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None + ), "Must specify batch size either with --max-tokens or --batch-size" + + distributed_utils.call_main(cfg, validate, **kwargs) + + +def validate(cfg): + utils.import_user_module(cfg.common) + + use_fp16 = cfg.common.fp16 + use_cuda = torch.cuda.is_available() and not cfg.common.cpu + + if use_cuda: + torch.cuda.set_device(cfg.distributed_training.device_id) + + if cfg.distributed_training.distributed_world_size > 1: + data_parallel_world_size = distributed_utils.get_data_parallel_world_size() + data_parallel_rank = distributed_utils.get_data_parallel_rank() + else: + data_parallel_world_size = 1 + data_parallel_rank = 0 + + overrides = {"task": {"data": cfg.task.data}} + + # Load ensemble + logger.info("loading model(s) from {}".format(cfg.common_eval.path)) + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( + [cfg.common_eval.path], + arg_overrides=overrides, + suffix=cfg.checkpoint.checkpoint_suffix, + ) + model = models[0] + + # Move models to GPU + for model in models: + model.eval() + if use_fp16: + model.half() + if use_cuda: + model.cuda() + + # Print args + logger.info(saved_cfg) + + # Build criterion + criterion = task.build_criterion(saved_cfg.criterion, from_checkpoint=True) + criterion.eval() + + for subset in cfg.dataset.valid_subset.split(","): + try: + task.load_dataset(subset, combine=False, epoch=1, task_cfg=saved_cfg.task) + dataset = task.dataset(subset) + except KeyError: + raise Exception("Cannot find dataset: " + subset) + + # Initialize data iterator + itr = task.get_batch_iterator( + dataset=dataset, + max_tokens=cfg.dataset.max_tokens, + max_sentences=cfg.dataset.batch_size, + max_positions=utils.resolve_max_positions( + task.max_positions(), + *[m.max_positions() for m in models], + ), + 
ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, + seed=cfg.common.seed, + num_shards=data_parallel_world_size, + shard_id=data_parallel_rank, + num_workers=cfg.dataset.num_workers, + data_buffer_size=cfg.dataset.data_buffer_size, + ).next_epoch_itr(shuffle=False) + progress = progress_bar.progress_bar( + itr, + log_format=cfg.common.log_format, + log_interval=cfg.common.log_interval, + prefix=f"valid on '{subset}' subset", + default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), + ) + + def apply_half(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.half) + return t + + log_outputs = [] + for i, sample in enumerate(progress): + sample = utils.move_to_cuda(sample) if use_cuda else sample + + if use_fp16: + sample = utils.apply_to_sample(apply_half, sample) + + _loss, _sample_size, log_output = task.valid_step(sample, model, criterion) + with metrics.aggregate() as agg: + task.reduce_metrics([log_output], criterion) + progress.log(agg.get_smoothed_values(), step=i) + # progress.log(log_output, step=i) from vision + log_outputs.append(log_output) + + if data_parallel_world_size > 1: + log_outputs = distributed_utils.all_gather_list( + log_outputs, + max_size=cfg.common.all_gather_list_size, + group=distributed_utils.get_data_parallel_group(), + ) + log_outputs = list(chain.from_iterable(log_outputs)) + + with metrics.aggregate() as agg: + task.reduce_metrics(log_outputs, criterion) + log_output = agg.get_smoothed_values() + + progress.print(log_output, tag=subset, step=i) + + +def cli_main(): + try: + from hydra._internal.utils import get_args + + cfg_name = get_args().config_name or "config" + except: + logger.warning("Failed to get config name from hydra args") + cfg_name = "config" + + hydra_init(cfg_name) + hydra_main() + + +if __name__ == "__main__": + cli_main() diff --git a/fairseq_cli/interactive.py b/fairseq_cli/interactive.py index 85607d8f44..03265d00e8 100644 --- a/fairseq_cli/interactive.py +++ b/fairseq_cli/interactive.py @@ -19,16 +19,12 @@ import numpy as np import torch + from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils -from fairseq.data import encoders -from fairseq.dataclass.data_class import register_hydra_cfg +from fairseq.dataclass.configs import FairseqConfig from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.token_generation_constraints import pack_constraints, unpack_constraints from fairseq_cli.generate import get_symbols_to_strip_from_output -from hydra.core.config_store import ConfigStore -from hydra.experimental import initialize -from omegaconf import DictConfig - logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", @@ -79,19 +75,13 @@ def encode_fn_target(x): for constraint in constraint_list ] - tokens = [ - task.source_dictionary.encode_line( - encode_fn(src_str), add_if_not_exist=False - ).long() - for src_str in lines - ] - if cfg.generation.constraints: constraints_tensor = pack_constraints(batch_constraints) else: constraints_tensor = None - lengths = [t.numel() for t in tokens] + tokens, lengths = task.get_interactive_tokens_and_lengths(lines, encode_fn) + itr = task.get_batch_iterator( dataset=task.build_dataset_for_inference( tokens, lengths, constraints=constraints_tensor @@ -115,7 +105,7 @@ def encode_fn_target(x): ) -def main(cfg: DictConfig): +def main(cfg: FairseqConfig): if isinstance(cfg, Namespace): cfg = 
convert_namespace_to_omegaconf(cfg) @@ -176,11 +166,11 @@ def main(cfg: DictConfig): model.prepare_for_inference_(cfg) # Initialize generator - generator = task.build_generator(models, cfg.task) + generator = task.build_generator(models, cfg.generation) # Handle tokenization and BPE - tokenizer = encoders.build_tokenizer(cfg.tokenizer) - bpe = encoders.build_bpe(cfg.bpe) + tokenizer = task.build_tokenizer(cfg.tokenizer) + bpe = task.build_bpe(cfg.bpe) def encode_fn(x): if tokenizer is not None: @@ -259,6 +249,7 @@ def decode_fn(x): # sort output to match input order for id_, src_tokens, hypos, info in sorted(results, key=lambda x: x[0]): + src_str = "" if src_dict is not None: src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) print("S-{}\t{}".format(id_, src_str)) @@ -266,7 +257,8 @@ def decode_fn(x): for constraint in info["constraints"]: print( "C-{}\t{}".format( - id_, tgt_dict.string(constraint, cfg.common_eval.post_process) + id_, + tgt_dict.string(constraint, cfg.common_eval.post_process), ) ) @@ -322,7 +314,4 @@ def cli_main(): if __name__ == "__main__": - cs = ConfigStore.instance() - register_hydra_cfg(cs) - initialize(config_path="../config", strict=True) cli_main() diff --git a/fairseq_cli/preprocess.py b/fairseq_cli/preprocess.py index fa77da8dba..2ba9e09338 100644 --- a/fairseq_cli/preprocess.py +++ b/fairseq_cli/preprocess.py @@ -11,14 +11,17 @@ import os import shutil import sys -from collections import Counter +import typing as tp +from argparse import Namespace from itertools import zip_longest -from multiprocessing import Pool from fairseq import options, tasks, utils -from fairseq.binarizer import Binarizer -from fairseq.data import indexed_dataset - +from fairseq.binarizer import ( + AlignmentDatasetBinarizer, + FileBinarizer, + VocabularyDatasetBinarizer, +) +from fairseq.data import Dictionary logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", @@ -28,8 +31,251 @@ ) logger = logging.getLogger("fairseq_cli.preprocess") +##################################################################### +# file name tools +##################################################################### + + +def _train_path(lang, trainpref): + return "{}{}".format(trainpref, ("." 
+ lang) if lang else "") + + +def _file_name(prefix, lang): + fname = prefix + if lang is not None: + fname += ".{lang}".format(lang=lang) + return fname + + +def _dest_path(prefix, lang, destdir): + return os.path.join(destdir, _file_name(prefix, lang)) + + +def _dict_path(lang, destdir): + return _dest_path("dict", lang, destdir) + ".txt" + + +def dataset_dest_prefix(args, output_prefix, lang): + base = os.path.join(args.destdir, output_prefix) + if lang is not None: + lang_part = f".{args.source_lang}-{args.target_lang}.{lang}" + elif args.only_source: + lang_part = "" + else: + lang_part = f".{args.source_lang}-{args.target_lang}" + + return "{}{}".format(base, lang_part) + + +def dataset_dest_file(args, output_prefix, lang, extension): + return "{}.{}".format(dataset_dest_prefix(args, output_prefix, lang), extension) + + +##################################################################### +# dictionary tools +##################################################################### + + +def _build_dictionary( + filenames, + task, + args, + src=False, + tgt=False, +): + assert src ^ tgt + return task.build_dictionary( + filenames, + workers=args.workers, + threshold=args.thresholdsrc if src else args.thresholdtgt, + nwords=args.nwordssrc if src else args.nwordstgt, + padding_factor=args.padding_factor, + ) + + +##################################################################### +# bin file creation logic +##################################################################### + + +def _make_binary_dataset( + vocab: Dictionary, + input_prefix: str, + output_prefix: str, + lang: tp.Optional[str], + num_workers: int, + args: Namespace, +): + logger.info("[{}] Dictionary: {} types".format(lang, len(vocab))) + + binarizer = VocabularyDatasetBinarizer( + vocab, + append_eos=True, + ) + + input_file = "{}{}".format(input_prefix, ("." 
+ lang) if lang is not None else "") + full_output_prefix = dataset_dest_prefix(args, output_prefix, lang) + + final_summary = FileBinarizer.multiprocess_dataset( + input_file, + args.dataset_impl, + binarizer, + full_output_prefix, + vocab_size=len(vocab), + num_workers=num_workers, + ) + + logger.info(f"[{lang}] {input_file}: {final_summary} (by {vocab.unk_word})") + + +def _make_binary_alignment_dataset( + input_prefix: str, output_prefix: str, num_workers: int, args: Namespace +): + + binarizer = AlignmentDatasetBinarizer(utils.parse_alignment) + + input_file = input_prefix + full_output_prefix = dataset_dest_prefix(args, output_prefix, lang=None) + + final_summary = FileBinarizer.multiprocess_dataset( + input_file, + args.dataset_impl, + binarizer, + full_output_prefix, + vocab_size=None, + num_workers=num_workers, + ) + + logger.info( + "[alignments] {}: parsed {} alignments".format( + input_file, final_summary.num_seq + ) + ) + + +##################################################################### +# routing logic +##################################################################### + + +def _make_dataset( + vocab: Dictionary, + input_prefix: str, + output_prefix: str, + lang: tp.Optional[str], + args: Namespace, + num_workers: int, +): + if args.dataset_impl == "raw": + # Copy original text file to destination folder + output_text_file = _dest_path( + output_prefix + ".{}-{}".format(args.source_lang, args.target_lang), + lang, + args.destdir, + ) + shutil.copyfile(_file_name(input_prefix, lang), output_text_file) + else: + _make_binary_dataset( + vocab, input_prefix, output_prefix, lang, num_workers, args + ) + + +def _make_all(lang, vocab, args): + if args.trainpref: + _make_dataset( + vocab, args.trainpref, "train", lang, args=args, num_workers=args.workers + ) + if args.validpref: + for k, validpref in enumerate(args.validpref.split(",")): + outprefix = "valid{}".format(k) if k > 0 else "valid" + _make_dataset( + vocab, validpref, outprefix, lang, args=args, num_workers=args.workers + ) + if args.testpref: + for k, testpref in enumerate(args.testpref.split(",")): + outprefix = "test{}".format(k) if k > 0 else "test" + _make_dataset( + vocab, testpref, outprefix, lang, args=args, num_workers=args.workers + ) + + +def _make_all_alignments(args): + if args.trainpref and os.path.exists(args.trainpref + "." + args.align_suffix): + _make_binary_alignment_dataset( + args.trainpref + "." + args.align_suffix, + "train.align", + num_workers=args.workers, + args=args, + ) + if args.validpref and os.path.exists(args.validpref + "." + args.align_suffix): + _make_binary_alignment_dataset( + args.validpref + "." + args.align_suffix, + "valid.align", + num_workers=args.workers, + args=args, + ) + if args.testpref and os.path.exists(args.testpref + "." + args.align_suffix): + _make_binary_alignment_dataset( + args.testpref + "." 
+ args.align_suffix, + "test.align", + num_workers=args.workers, + args=args, + ) + + +##################################################################### +# align +##################################################################### + + +def _align_files(args, src_dict, tgt_dict): + assert args.trainpref, "--trainpref must be set if --alignfile is specified" + src_file_name = _train_path(args.source_lang, args.trainpref) + tgt_file_name = _train_path(args.target_lang, args.trainpref) + freq_map = {} + with open(args.alignfile, "r", encoding="utf-8") as align_file: + with open(src_file_name, "r", encoding="utf-8") as src_file: + with open(tgt_file_name, "r", encoding="utf-8") as tgt_file: + for a, s, t in zip_longest(align_file, src_file, tgt_file): + si = src_dict.encode_line(s, add_if_not_exist=False) + ti = tgt_dict.encode_line(t, add_if_not_exist=False) + ai = list(map(lambda x: tuple(x.split("-")), a.split())) + for sai, tai in ai: + srcidx = si[int(sai)] + tgtidx = ti[int(tai)] + if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): + assert srcidx != src_dict.pad() + assert srcidx != src_dict.eos() + assert tgtidx != tgt_dict.pad() + assert tgtidx != tgt_dict.eos() + if srcidx not in freq_map: + freq_map[srcidx] = {} + if tgtidx not in freq_map[srcidx]: + freq_map[srcidx][tgtidx] = 1 + else: + freq_map[srcidx][tgtidx] += 1 + align_dict = {} + for srcidx in freq_map.keys(): + align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) + with open( + os.path.join( + args.destdir, + "alignment.{}-{}.txt".format(args.source_lang, args.target_lang), + ), + "w", + encoding="utf-8", + ) as f: + for k, v in align_dict.items(): + print("{} {}".format(src_dict[k], tgt_dict[v]), file=f) + + +##################################################################### +# MAIN +##################################################################### + def main(args): + # setup some basic things utils.import_user_module(args) os.makedirs(args.destdir, exist_ok=True) @@ -41,39 +287,25 @@ def main(args): ) logger.info(args) - task = tasks.get_task(args.task) + assert ( + args.dataset_impl != "huffman" + ), "preprocessing.py doesn't support Huffman yet, use HuffmanCodeBuilder directly." - def train_path(lang): - return "{}{}".format(args.trainpref, ("." 
+ lang) if lang else "") - - def file_name(prefix, lang): - fname = prefix - if lang is not None: - fname += ".{lang}".format(lang=lang) - return fname - - def dest_path(prefix, lang): - return os.path.join(args.destdir, file_name(prefix, lang)) - - def dict_path(lang): - return dest_path("dict", lang) + ".txt" - - def build_dictionary(filenames, src=False, tgt=False): - assert src ^ tgt - return task.build_dictionary( - filenames, - workers=args.workers, - threshold=args.thresholdsrc if src else args.thresholdtgt, - nwords=args.nwordssrc if src else args.nwordstgt, - padding_factor=args.padding_factor, - ) + # build dictionaries target = not args.only_source - if not args.srcdict and os.path.exists(dict_path(args.source_lang)): - raise FileExistsError(dict_path(args.source_lang)) - if target and not args.tgtdict and os.path.exists(dict_path(args.target_lang)): - raise FileExistsError(dict_path(args.target_lang)) + if not args.srcdict and os.path.exists(_dict_path(args.source_lang, args.destdir)): + raise FileExistsError(_dict_path(args.source_lang, args.destdir)) + + if ( + target + and not args.tgtdict + and os.path.exists(_dict_path(args.target_lang, args.destdir)) + ): + raise FileExistsError(_dict_path(args.target_lang, args.destdir)) + + task = tasks.get_task(args.task) if args.joined_dictionary: assert ( @@ -88,8 +320,13 @@ def build_dictionary(filenames, src=False, tgt=False): assert ( args.trainpref ), "--trainpref must be set if --srcdict is not specified" - src_dict = build_dictionary( - {train_path(lang) for lang in [args.source_lang, args.target_lang]}, + src_dict = _build_dictionary( + { + _train_path(lang, args.trainpref) + for lang in [args.source_lang, args.target_lang] + }, + task=task, + args=args, src=True, ) tgt_dict = src_dict @@ -100,7 +337,12 @@ def build_dictionary(filenames, src=False, tgt=False): assert ( args.trainpref ), "--trainpref must be set if --srcdict is not specified" - src_dict = build_dictionary([train_path(args.source_lang)], src=True) + src_dict = _build_dictionary( + [_train_path(args.source_lang, args.trainpref)], + task=task, + args=args, + src=True, + ) if target: if args.tgtdict: @@ -109,283 +351,36 @@ def build_dictionary(filenames, src=False, tgt=False): assert ( args.trainpref ), "--trainpref must be set if --tgtdict is not specified" - tgt_dict = build_dictionary([train_path(args.target_lang)], tgt=True) + tgt_dict = _build_dictionary( + [_train_path(args.target_lang, args.trainpref)], + task=task, + args=args, + tgt=True, + ) else: tgt_dict = None - src_dict.save(dict_path(args.source_lang)) - if target and tgt_dict is not None: - tgt_dict.save(dict_path(args.target_lang)) - - def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers): - logger.info("[{}] Dictionary: {} types".format(lang, len(vocab))) - n_seq_tok = [0, 0] - replaced = Counter() - - def merge_result(worker_result): - replaced.update(worker_result["replaced"]) - n_seq_tok[0] += worker_result["nseq"] - n_seq_tok[1] += worker_result["ntok"] - - input_file = "{}{}".format( - input_prefix, ("." 
+ lang) if lang is not None else "" - ) - offsets = Binarizer.find_offsets(input_file, num_workers) - pool = None - if num_workers > 1: - pool = Pool(processes=num_workers - 1) - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - pool.apply_async( - binarize, - ( - args, - input_file, - vocab, - prefix, - lang, - offsets[worker_id], - offsets[worker_id + 1], - ), - callback=merge_result, - ) - pool.close() + # save dictionaries - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, lang, "bin"), - impl=args.dataset_impl, - vocab_size=len(vocab), - ) - merge_result( - Binarizer.binarize( - input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1] - ) - ) - if num_workers > 1: - pool.join() - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - temp_file_path = dataset_dest_prefix(args, prefix, lang) - ds.merge_file_(temp_file_path) - os.remove(indexed_dataset.data_file_path(temp_file_path)) - os.remove(indexed_dataset.index_file_path(temp_file_path)) - - ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) - - logger.info( - "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format( - lang, - input_file, - n_seq_tok[0], - n_seq_tok[1], - 100 * sum(replaced.values()) / n_seq_tok[1], - vocab.unk_word, - ) - ) + src_dict.save(_dict_path(args.source_lang, args.destdir)) + if target and tgt_dict is not None: + tgt_dict.save(_dict_path(args.target_lang, args.destdir)) - def make_binary_alignment_dataset(input_prefix, output_prefix, num_workers): - nseq = [0] - - def merge_result(worker_result): - nseq[0] += worker_result["nseq"] - - input_file = input_prefix - offsets = Binarizer.find_offsets(input_file, num_workers) - pool = None - if num_workers > 1: - pool = Pool(processes=num_workers - 1) - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - pool.apply_async( - binarize_alignments, - ( - args, - input_file, - utils.parse_alignment, - prefix, - offsets[worker_id], - offsets[worker_id + 1], - ), - callback=merge_result, - ) - pool.close() + if args.dict_only: + return - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, None, "bin"), impl=args.dataset_impl - ) - - merge_result( - Binarizer.binarize_alignments( - input_file, - utils.parse_alignment, - lambda t: ds.add_item(t), - offset=0, - end=offsets[1], - ) - ) - if num_workers > 1: - pool.join() - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - temp_file_path = dataset_dest_prefix(args, prefix, None) - ds.merge_file_(temp_file_path) - os.remove(indexed_dataset.data_file_path(temp_file_path)) - os.remove(indexed_dataset.index_file_path(temp_file_path)) - - ds.finalize(dataset_dest_file(args, output_prefix, None, "idx")) - - logger.info("[alignments] {}: parsed {} alignments".format(input_file, nseq[0])) - - def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1): - if args.dataset_impl == "raw": - # Copy original text file to destination folder - output_text_file = dest_path( - output_prefix + ".{}-{}".format(args.source_lang, args.target_lang), - lang, - ) - shutil.copyfile(file_name(input_prefix, lang), output_text_file) - else: - make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers) - - def make_all(lang, vocab): - if args.trainpref: - make_dataset(vocab, args.trainpref, "train", lang, num_workers=args.workers) - if args.validpref: - for k, validpref in 
enumerate(args.validpref.split(",")): - outprefix = "valid{}".format(k) if k > 0 else "valid" - make_dataset( - vocab, validpref, outprefix, lang, num_workers=args.workers - ) - if args.testpref: - for k, testpref in enumerate(args.testpref.split(",")): - outprefix = "test{}".format(k) if k > 0 else "test" - make_dataset(vocab, testpref, outprefix, lang, num_workers=args.workers) - - def make_all_alignments(): - if args.trainpref and os.path.exists(args.trainpref + "." + args.align_suffix): - make_binary_alignment_dataset( - args.trainpref + "." + args.align_suffix, - "train.align", - num_workers=args.workers, - ) - if args.validpref and os.path.exists(args.validpref + "." + args.align_suffix): - make_binary_alignment_dataset( - args.validpref + "." + args.align_suffix, - "valid.align", - num_workers=args.workers, - ) - if args.testpref and os.path.exists(args.testpref + "." + args.align_suffix): - make_binary_alignment_dataset( - args.testpref + "." + args.align_suffix, - "test.align", - num_workers=args.workers, - ) - - make_all(args.source_lang, src_dict) + _make_all(args.source_lang, src_dict, args) if target: - make_all(args.target_lang, tgt_dict) + _make_all(args.target_lang, tgt_dict, args) + + # align the datasets if needed if args.align_suffix: - make_all_alignments() + _make_all_alignments(args) logger.info("Wrote preprocessed data to {}".format(args.destdir)) if args.alignfile: - assert args.trainpref, "--trainpref must be set if --alignfile is specified" - src_file_name = train_path(args.source_lang) - tgt_file_name = train_path(args.target_lang) - freq_map = {} - with open(args.alignfile, "r", encoding="utf-8") as align_file: - with open(src_file_name, "r", encoding="utf-8") as src_file: - with open(tgt_file_name, "r", encoding="utf-8") as tgt_file: - for a, s, t in zip_longest(align_file, src_file, tgt_file): - si = src_dict.encode_line(s, add_if_not_exist=False) - ti = tgt_dict.encode_line(t, add_if_not_exist=False) - ai = list(map(lambda x: tuple(x.split("-")), a.split())) - for sai, tai in ai: - srcidx = si[int(sai)] - tgtidx = ti[int(tai)] - if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): - assert srcidx != src_dict.pad() - assert srcidx != src_dict.eos() - assert tgtidx != tgt_dict.pad() - assert tgtidx != tgt_dict.eos() - - if srcidx not in freq_map: - freq_map[srcidx] = {} - if tgtidx not in freq_map[srcidx]: - freq_map[srcidx][tgtidx] = 1 - else: - freq_map[srcidx][tgtidx] += 1 - - align_dict = {} - for srcidx in freq_map.keys(): - align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) - - with open( - os.path.join( - args.destdir, - "alignment.{}-{}.txt".format(args.source_lang, args.target_lang), - ), - "w", - encoding="utf-8", - ) as f: - for k, v in align_dict.items(): - print("{} {}".format(src_dict[k], tgt_dict[v]), file=f) - - -def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True): - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, lang, "bin"), - impl=args.dataset_impl, - vocab_size=len(vocab), - ) - - def consumer(tensor): - ds.add_item(tensor) - - res = Binarizer.binarize( - filename, vocab, consumer, append_eos=append_eos, offset=offset, end=end - ) - ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) - return res - - -def binarize_alignments(args, filename, parse_alignment, output_prefix, offset, end): - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, None, "bin"), - impl=args.dataset_impl, - vocab_size=None, - ) - - def 
consumer(tensor): - ds.add_item(tensor) - - res = Binarizer.binarize_alignments( - filename, parse_alignment, consumer, offset=offset, end=end - ) - ds.finalize(dataset_dest_file(args, output_prefix, None, "idx")) - return res - - -def dataset_dest_prefix(args, output_prefix, lang): - base = "{}/{}".format(args.destdir, output_prefix) - if lang is not None: - lang_part = ".{}-{}.{}".format(args.source_lang, args.target_lang, lang) - elif args.only_source: - lang_part = "" - else: - lang_part = ".{}-{}".format(args.source_lang, args.target_lang) - - return "{}{}".format(base, lang_part) - - -def dataset_dest_file(args, output_prefix, lang, extension): - base = dataset_dest_prefix(args, output_prefix, lang) - return "{}.{}".format(base, extension) - - -def get_offsets(input_file, num_workers): - return Binarizer.find_offsets(input_file, num_workers) + _align_files(args, src_dict=src_dict, tgt_dict=tgt_dict) def cli_main(): diff --git a/fairseq_cli/score.py b/fairseq_cli/score.py index e06d67259d..0b207be959 100644 --- a/fairseq_cli/score.py +++ b/fairseq_cli/score.py @@ -58,7 +58,7 @@ def readlines(fd): def score(fdsys): with open(args.ref) as fdref: - print(sacrebleu.corpus_bleu(fdsys, [fdref])) + print(sacrebleu.corpus_bleu(fdsys, [fdref]).format()) elif args.sentence_bleu: diff --git a/fairseq_cli/train.py b/fairseq_cli/train.py index 4c00761060..f771bff654 100644 --- a/fairseq_cli/train.py +++ b/fairseq_cli/train.py @@ -12,30 +12,9 @@ import math import os import sys -from typing import Dict, Optional, Any, List, Tuple, Callable - -import numpy as np -import torch -from hydra.core.config_store import ConfigStore - -from fairseq import ( - checkpoint_utils, - distributed_utils, - options, - quantization_utils, - tasks, - utils, -) -from fairseq.data import iterators -from fairseq.dataclass.utils import convert_namespace_to_omegaconf -from fairseq.logging import meters, metrics, progress_bar -from fairseq.model_parallel.megatron_trainer import MegatronTrainer -from omegaconf import DictConfig -from hydra.experimental import initialize -from fairseq.dataclass.data_class import register_hydra_cfg -from fairseq.trainer import Trainer - +from typing import Any, Callable, Dict, List, Optional, Tuple +# We need to setup root logger before importing any fairseq libraries. 
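Aside (not part of the patch): a likely reason for configuring the root logger before any fairseq imports is that logging.basicConfig() becomes a no-op once the root logger already has handlers, so whichever side configures logging first wins. A minimal stdlib-only sketch of that behaviour, assuming nothing beyond the format string used here:

    import logging
    import sys

    # Configure the root logger first, mirroring the format set up in train.py.
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        stream=sys.stdout,
    )

    # A later basicConfig() call (e.g. issued by an imported library) is a
    # no-op because the root logger already has a handler, so the format and
    # level above stay in effect.
    logging.basicConfig(format="%(message)s", level=logging.WARNING)

    logger = logging.getLogger("fairseq_cli.train")
    logger.info("still formatted with the timestamp | level | name prefix")
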
logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", @@ -44,17 +23,47 @@ ) logger = logging.getLogger("fairseq_cli.train") +import numpy as np +import torch +from omegaconf import DictConfig, OmegaConf -def main(cfg: DictConfig) -> None: +from fairseq import checkpoint_utils, options, quantization_utils, tasks, utils +from fairseq.data import data_utils, iterators +from fairseq.data.plasma_utils import PlasmaStore +from fairseq.dataclass.configs import FairseqConfig +from fairseq.dataclass.initialize import add_defaults +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.distributed import fsdp_enable_wrap, fsdp_wrap +from fairseq.distributed import utils as distributed_utils +from fairseq.file_io import PathManager +from fairseq.logging import meters, metrics, progress_bar +from fairseq.model_parallel.megatron_trainer import MegatronTrainer +from fairseq.trainer import Trainer + + +def main(cfg: FairseqConfig) -> None: if isinstance(cfg, argparse.Namespace): cfg = convert_namespace_to_omegaconf(cfg) utils.import_user_module(cfg.common) + add_defaults(cfg) + + if ( + distributed_utils.is_master(cfg.distributed_training) + and "job_logging_cfg" in cfg + ): + # make hydra logging work with ddp (see # see https://github.com/facebookresearch/hydra/issues/1126) + logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_cfg)) - assert cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None, \ - 'Must specify batch size either with --max-tokens or --batch-size' + assert ( + cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None + ), "Must specify batch size either with --max-tokens or --batch-size" metrics.reset() + if cfg.common.log_file is not None: + handler = logging.FileHandler(filename=cfg.common.log_file) + logger.addHandler(handler) + np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) @@ -64,25 +73,65 @@ def main(cfg: DictConfig) -> None: # Print args logger.info(cfg) + if cfg.checkpoint.write_checkpoints_asynchronously: + try: + import iopath # noqa: F401 + except ImportError: + logging.exception( + "Asynchronous checkpoint writing is specified but iopath is " + "not installed: `pip install iopath`" + ) + return + # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(cfg.task) - # Load valid dataset (we load training data below, based on the latest checkpoint) - for valid_sub_split in cfg.dataset.valid_subset.split(','): - task.load_dataset(valid_sub_split, combine=False, epoch=1) + + assert cfg.criterion, "Please specify criterion to train a model" # Build model and criterion - model = task.build_model(cfg.model) + if cfg.distributed_training.ddp_backend == "fully_sharded": + with fsdp_enable_wrap(cfg.distributed_training): + model = fsdp_wrap(task.build_model(cfg.model)) + else: + model = task.build_model(cfg.model) criterion = task.build_criterion(cfg.criterion) logger.info(model) - logger.info("task: {} ({})".format(cfg.task._name, task.__class__.__name__)) - logger.info("model: {} ({})".format(cfg.model._name, model.__class__.__name__)) + logger.info("task: {}".format(task.__class__.__name__)) + logger.info("model: {}".format(model.__class__.__name__)) + logger.info("criterion: {}".format(criterion.__class__.__name__)) + logger.info( + "num. shared model params: {:,} (num. 
trained: {:,})".format( + sum( + p.numel() for p in model.parameters() if not getattr(p, "expert", False) + ), + sum( + p.numel() + for p in model.parameters() + if not getattr(p, "expert", False) and p.requires_grad + ), + ) + ) + logger.info( - "criterion: {} ({})".format(cfg.criterion._name, criterion.__class__.__name__) + "num. expert model params: {} (num. trained: {})".format( + sum(p.numel() for p in model.parameters() if getattr(p, "expert", False)), + sum( + p.numel() + for p in model.parameters() + if getattr(p, "expert", False) and p.requires_grad + ), + ) ) - logger.info("num. model params: {} (num. trained: {})".format( - sum(p.numel() for p in model.parameters()), - sum(p.numel() for p in model.parameters() if p.requires_grad), - )) + + # Load valid dataset (we load training data below, based on the latest checkpoint) + # We load the valid dataset AFTER building the model + if not cfg.dataset.disable_validation: + data_utils.raise_if_valid_subsets_unintentionally_ignored(cfg) + if cfg.dataset.combine_valid_subsets: + task.load_dataset("valid", combine=True, epoch=1) + else: + for valid_sub_split in cfg.dataset.valid_subset.split(","): + task.load_dataset(valid_sub_split, combine=False, epoch=1) # (optionally) Configure quantization if cfg.common.quantization_config_path is not None: @@ -99,12 +148,17 @@ def main(cfg: DictConfig) -> None: trainer = Trainer(cfg, task, model, criterion, quantizer) else: trainer = MegatronTrainer(cfg, task, model, criterion) - - logger.info('training on {} devices (GPUs/TPUs)'.format(cfg.distributed_training.distributed_world_size)) - logger.info('max tokens per GPU = {} and batch size per GPU = {}'.format( - cfg.dataset.max_tokens, - cfg.dataset.batch_size, - )) + logger.info( + "training on {} devices (GPUs/TPUs)".format( + cfg.distributed_training.distributed_world_size + ) + ) + logger.info( + "max tokens per device = {} and max sentences per device = {}".format( + cfg.dataset.max_tokens, + cfg.dataset.batch_size, + ) + ) # Load the latest checkpoint if one is available and restore the # corresponding train iterator @@ -114,15 +168,39 @@ def main(cfg: DictConfig) -> None: # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) + if cfg.common.tpu: + import torch_xla.core.xla_model as xm + + xm.rendezvous("load_checkpoint") # wait for all workers max_epoch = cfg.optimization.max_epoch or math.inf lr = trainer.get_lr() + + # TODO: a dry run on validation set to pin the memory + valid_subsets = cfg.dataset.valid_subset.split(",") + if not cfg.dataset.disable_validation: + for subset in valid_subsets: + logger.info('begin dry-run validation on "{}" subset'.format(subset)) + itr = trainer.get_valid_iterator(subset).next_epoch_itr( + shuffle=False, set_dataset_epoch=False # use a fixed valid set + ) + if cfg.common.tpu: + itr = utils.tpu_data_loader(itr) + for _ in itr: + pass + # TODO: end of dry run section + train_meter = meters.StopwatchMeter() train_meter.start() - while ( - lr > cfg.optimization.min_lr - and epoch_itr.next_epoch_idx <= max_epoch - ): + while epoch_itr.next_epoch_idx <= max_epoch: + if lr <= cfg.optimization.stop_min_lr: + logger.info( + f"stopping training because current learning rate ({lr}) is smaller " + "than or equal to minimum learning rate " + f"(--stop-min-lr={cfg.optimization.stop_min_lr})" + ) + break + # train for one epoch valid_losses, should_stop = train(cfg, trainer, task, epoch_itr) if should_stop: @@ -141,6 +219,15 @@ def main(cfg: DictConfig) -> None: 
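Aside (not part of the patch): after this refactor the training loop can stop for any of four reasons: the update budget is exhausted, the wall-clock budget (--stop-time-hours) is exceeded, the validation metric has not improved for --patience checks (should_stop_early, further down in this diff), or the learning rate falls to --stop-min-lr (handled in the while loop above). A condensed, hypothetical restatement of those checks; the function name and arguments are illustrative, not fairseq API:

    def should_stop(num_updates: int, max_update: float,
                    hours_trained: float, stop_time_hours: float,
                    runs_without_improvement: int, patience: int,
                    lr: float, stop_min_lr: float) -> bool:
        # update budget (cfg.optimization.max_update)
        if num_updates >= max_update:
            return True
        # wall-clock budget (cfg.optimization.stop_time_hours)
        if stop_time_hours > 0 and hours_trained > stop_time_hours:
            return True
        # patience on the best validation metric (cfg.checkpoint.patience)
        if patience > 0 and runs_without_improvement >= patience:
            return True
        # learning-rate floor (cfg.optimization.stop_min_lr)
        if lr <= stop_min_lr:
            return True
        return False
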
train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum)) + # ioPath implementation to wait for all asynchronous file writes to complete. + if cfg.checkpoint.write_checkpoints_asynchronously: + logger.info( + "ioPath PathManager waiting for all asynchronous checkpoint " + "writes to finish." + ) + PathManager.async_close() + logger.info("ioPath PathManager finished waiting.") + def should_stop_early(cfg: DictConfig, valid_loss: float) -> bool: # skip check if no validation was done in the current epoch @@ -160,14 +247,20 @@ def is_better(a, b): else: should_stop_early.num_runs += 1 if should_stop_early.num_runs >= cfg.checkpoint.patience: - logger.info('early stop since valid performance hasn\'t improved for last {} runs'.format(cfg.checkpoint.patience)) + logger.info( + "early stop since valid performance hasn't improved for last {} runs".format( + cfg.checkpoint.patience + ) + ) return True else: return False @metrics.aggregate("train") -def train(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr) -> Tuple[List[Optional[float]], bool]: +def train( + cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr +) -> Tuple[List[Optional[float]], bool]: """Train the model for one epoch and return validation losses.""" # Initialize data iterator itr = epoch_itr.next_epoch_itr( @@ -179,25 +272,58 @@ def train(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr) if epoch_itr.epoch <= len(cfg.optimization.update_freq) else cfg.optimization.update_freq[-1] ) - itr = iterators.GroupedIterator(itr, update_freq) - if getattr(cfg.common, "tpu", False): + itr = iterators.GroupedIterator( + itr, + update_freq, + skip_remainder_batch=cfg.optimization.skip_remainder_batch, + ) + if cfg.common.tpu: itr = utils.tpu_data_loader(itr) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, + log_file=cfg.common.log_file, log_interval=cfg.common.log_interval, epoch=epoch_itr.epoch, + aim_repo=( + cfg.common.aim_repo + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + aim_run_hash=( + cfg.common.aim_run_hash + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + aim_param_checkpoint_dir=cfg.checkpoint.save_dir, tensorboard_logdir=( - cfg.common.tensorboard_logdir if distributed_utils.is_master(cfg.distributed_training) else None + cfg.common.tensorboard_logdir + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), + wandb_project=( + cfg.common.wandb_project + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + wandb_run_name=os.environ.get( + "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir) + ), + azureml_logging=( + cfg.common.azureml_logging + if distributed_utils.is_master(cfg.distributed_training) + else False ), - default_log_format=('tqdm' if not cfg.common.no_progress_bar else 'simple'), ) + progress.update_config(_flatten_config(cfg)) trainer.begin_epoch(epoch_itr.epoch) - valid_subsets = cfg.dataset.valid_subset.split(',') + valid_subsets = cfg.dataset.valid_subset.split(",") should_stop = False num_updates = trainer.get_num_updates() + logger.info("Start iterating over samples") for i, samples in enumerate(progress): with metrics.aggregate("train_inner"), torch.autograd.profiler.record_function( "train_step-%d" % i @@ -233,12 +359,55 @@ def train(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr) return 
valid_losses, should_stop -def validate_and_save(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr, valid_subsets: List[str], end_of_epoch: bool) -> Tuple[List[Optional[float]], bool]: +def _flatten_config(cfg: DictConfig): + config = OmegaConf.to_container(cfg) + # remove any legacy Namespaces and replace with a single "args" + namespace = None + for k, v in list(config.items()): + if isinstance(v, argparse.Namespace): + namespace = v + del config[k] + if namespace is not None: + config["args"] = vars(namespace) + return config + + +def validate_and_save( + cfg: DictConfig, + trainer: Trainer, + task: tasks.FairseqTask, + epoch_itr, + valid_subsets: List[str], + end_of_epoch: bool, +) -> Tuple[List[Optional[float]], bool]: num_updates = trainer.get_num_updates() max_update = cfg.optimization.max_update or math.inf + + # Stopping conditions (and an additional one based on validation loss later + # on) + should_stop = False + if num_updates >= max_update: + should_stop = True + logger.info( + f"Stopping training due to " + f"num_updates: {num_updates} >= max_update: {max_update}" + ) + + training_time_hours = trainer.cumulative_training_time() / (60 * 60) + if ( + cfg.optimization.stop_time_hours > 0 + and training_time_hours > cfg.optimization.stop_time_hours + ): + should_stop = True + logger.info( + f"Stopping training due to " + f"cumulative_training_time: {training_time_hours} > " + f"stop_time_hours: {cfg.optimization.stop_time_hours} hour(s)" + ) + do_save = ( (end_of_epoch and epoch_itr.epoch % cfg.checkpoint.save_interval == 0) - or num_updates >= max_update + or should_stop or ( cfg.checkpoint.save_interval_updates > 0 and num_updates > 0 @@ -247,35 +416,34 @@ def validate_and_save(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask ) ) do_validate = ( - (not end_of_epoch and do_save) # validate during mid-epoch saves - or (end_of_epoch and epoch_itr.epoch % cfg.dataset.validate_interval == 0) - or num_updates >= max_update - or ( - cfg.dataset.validate_interval_updates > 0 - and num_updates > 0 - and num_updates % cfg.dataset.validate_interval_updates == 0 + ( + (not end_of_epoch and do_save) # validate during mid-epoch saves + or (end_of_epoch and epoch_itr.epoch % cfg.dataset.validate_interval == 0) + or should_stop + or ( + cfg.dataset.validate_interval_updates > 0 + and num_updates > 0 + and num_updates % cfg.dataset.validate_interval_updates == 0 + ) ) - ) and not cfg.dataset.disable_validation + and not cfg.dataset.disable_validation + and num_updates >= cfg.dataset.validate_after_updates + ) # Validate valid_losses = [None] if do_validate: valid_losses = validate(cfg, trainer, task, epoch_itr, valid_subsets) - # Stopping conditions - should_stop = ( - should_stop_early(cfg, valid_losses[0]) - or num_updates >= max_update - or ( - cfg.optimization.stop_time_hours > 0 - and trainer.cumulative_training_time() / (60 * 60) > cfg.optimization.stop_time_hours - ) - ) + should_stop |= should_stop_early(cfg, valid_losses[0]) # Save checkpoint if do_save or should_stop: - logger.info("begin save checkpoint") - checkpoint_utils.save_checkpoint(cfg.checkpoint, trainer, epoch_itr, valid_losses[0]) + cp_path = checkpoint_utils.save_checkpoint( + cfg.checkpoint, trainer, epoch_itr, valid_losses[0] + ) + if cp_path is not None and hasattr(task, "post_save"): + task.post_save(cp_path, num_updates) return valid_losses, should_stop @@ -285,7 +453,13 @@ def get_training_stats(stats: Dict[str, Any]) -> Dict[str, Any]: return stats -def validate(cfg: DictConfig, 
trainer: Trainer, task: tasks.FairseqTask, epoch_itr, subsets: List[str]) -> List[Optional[float]]: +def validate( + cfg: DictConfig, + trainer: Trainer, + task: tasks.FairseqTask, + epoch_itr, + subsets: List[str], +) -> List[Optional[float]]: """Evaluate the model on the validation set(s) and return the losses.""" if cfg.dataset.fixed_validation_seed is not None: @@ -294,11 +468,13 @@ def validate(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_i trainer.begin_valid_epoch(epoch_itr.epoch) valid_losses = [] - for subset in subsets: + for subset_idx, subset in enumerate(subsets): logger.info('begin validation on "{}" subset'.format(subset)) # Initialize data iterator - itr = trainer.get_valid_iterator(subset).next_epoch_itr(shuffle=False) + itr = trainer.get_valid_iterator(subset).next_epoch_itr( + shuffle=False, set_dataset_epoch=False # use a fixed valid set + ) if cfg.common.tpu: itr = utils.tpu_data_loader(itr) progress = progress_bar.progress_bar( @@ -307,43 +483,89 @@ def validate(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_i log_interval=cfg.common.log_interval, epoch=epoch_itr.epoch, prefix=f"valid on '{subset}' subset", + aim_repo=( + cfg.common.aim_repo + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + aim_run_hash=( + cfg.common.aim_run_hash + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + aim_param_checkpoint_dir=cfg.checkpoint.save_dir, tensorboard_logdir=( - cfg.common.tensorboard_logdir if distributed_utils.is_master(cfg.distributed_training) else None + cfg.common.tensorboard_logdir + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), + wandb_project=( + cfg.common.wandb_project + if distributed_utils.is_master(cfg.distributed_training) + else None + ), + wandb_run_name=os.environ.get( + "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir) ), - default_log_format=('tqdm' if not cfg.common.no_progress_bar else 'simple'), ) # create a new root metrics aggregator so validation metrics # don't pollute other aggregators (e.g., train meters) with metrics.aggregate(new_root=True) as agg: - for sample in progress: + for i, sample in enumerate(progress): + if ( + cfg.dataset.max_valid_steps is not None + and i > cfg.dataset.max_valid_steps + ): + break trainer.valid_step(sample) # log validation stats - stats = get_valid_stats(cfg, trainer, agg.get_smoothed_values()) + # only tracking the best metric on the 1st validation subset + tracking_best = subset_idx == 0 + stats = get_valid_stats(cfg, trainer, agg.get_smoothed_values(), tracking_best) + + if hasattr(task, "post_validate"): + task.post_validate(trainer.get_model(), stats, agg) + progress.print(stats, tag=subset, step=trainer.get_num_updates()) valid_losses.append(stats[cfg.checkpoint.best_checkpoint_metric]) return valid_losses -def get_valid_stats(cfg: DictConfig, trainer: Trainer, stats: Dict[str, Any]) -> Dict[str, Any]: +def get_valid_stats( + cfg: DictConfig, + trainer: Trainer, + stats: Dict[str, Any], + tracking_best: bool, +) -> Dict[str, Any]: stats["num_updates"] = trainer.get_num_updates() - if hasattr(checkpoint_utils.save_checkpoint, "best"): + if tracking_best and hasattr(checkpoint_utils.save_checkpoint, "best"): key = "best_{0}".format(cfg.checkpoint.best_checkpoint_metric) best_function = max if cfg.checkpoint.maximize_best_checkpoint_metric else min stats[key] = best_function( - 
checkpoint_utils.save_checkpoint.best, stats[cfg.checkpoint.best_checkpoint_metric] + checkpoint_utils.save_checkpoint.best, + stats[cfg.checkpoint.best_checkpoint_metric], ) return stats -def cli_main(modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None) -> None: +def cli_main( + modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None +) -> None: parser = options.get_training_parser() args = options.parse_args_and_arch(parser, modify_parser=modify_parser) cfg = convert_namespace_to_omegaconf(args) + if cfg.common.use_plasma_view: + server = PlasmaStore(path=cfg.common.plasma_path) + logger.info( + f"Started plasma server pid {server.server.pid} {cfg.common.plasma_path}" + ) + if args.profile: with torch.cuda.profiler.profile(): with torch.autograd.profiler.emit_nvtx(): @@ -351,9 +573,9 @@ def cli_main(modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] else: distributed_utils.call_main(cfg, main) + # if cfg.common.use_plasma_view: + # server.server.kill() + -if __name__ == '__main__': - cs = ConfigStore.instance() - register_hydra_cfg(cs) - initialize(config_path="../config", strict=True) +if __name__ == "__main__": cli_main() diff --git a/fairseq_cli/validate.py b/fairseq_cli/validate.py index 368c9cb581..4617b6d542 100644 --- a/fairseq_cli/validate.py +++ b/fairseq_cli/validate.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -u -# !/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the @@ -12,14 +11,12 @@ from itertools import chain import torch +from omegaconf import DictConfig + from fairseq import checkpoint_utils, distributed_utils, options, utils -from fairseq.dataclass.data_class import register_hydra_cfg from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging import metrics, progress_bar -from hydra.core.config_store import ConfigStore -from hydra.experimental import initialize -from omegaconf import DictConfig - +from fairseq.utils import reset_logging logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", @@ -36,6 +33,8 @@ def main(cfg: DictConfig, override_args=None): utils.import_user_module(cfg.common) + reset_logging() + assert ( cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None ), "Must specify batch size either with --max-tokens or --batch-size" @@ -46,6 +45,13 @@ def main(cfg: DictConfig, override_args=None): if use_cuda: torch.cuda.set_device(cfg.distributed_training.device_id) + if cfg.distributed_training.distributed_world_size > 1: + data_parallel_world_size = distributed_utils.get_data_parallel_world_size() + data_parallel_rank = distributed_utils.get_data_parallel_rank() + else: + data_parallel_world_size = 1 + data_parallel_rank = 0 + if override_args is not None: overrides = vars(override_args) overrides.update(eval(getattr(override_args, "model_overrides", "{}"))) @@ -54,7 +60,7 @@ def main(cfg: DictConfig, override_args=None): # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) - models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( + models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( [cfg.common_eval.path], arg_overrides=overrides, suffix=cfg.checkpoint.checkpoint_suffix, @@ -63,21 +69,22 @@ def main(cfg: DictConfig, override_args=None): # Move models to GPU for model in models: + model.eval() if use_fp16: model.half() if use_cuda: model.cuda() # Print args - 
logger.info(model_args) + logger.info(saved_cfg) # Build criterion - criterion = task.build_criterion(model_args.criterion) + criterion = task.build_criterion(saved_cfg.criterion) criterion.eval() for subset in cfg.dataset.valid_subset.split(","): try: - task.load_dataset(subset, combine=False, epoch=1) + task.load_dataset(subset, combine=False, epoch=1, task_cfg=saved_cfg.task) dataset = task.dataset(subset) except KeyError: raise Exception("Cannot find dataset: " + subset) @@ -94,8 +101,8 @@ def main(cfg: DictConfig, override_args=None): ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, seed=cfg.common.seed, - num_shards=cfg.distributed_training.distributed_world_size, - shard_id=cfg.distributed_training.distributed_rank, + num_shards=data_parallel_world_size, + shard_id=data_parallel_rank, num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) @@ -114,10 +121,11 @@ def main(cfg: DictConfig, override_args=None): progress.log(log_output, step=i) log_outputs.append(log_output) - if cfg.distributed_training.distributed_world_size > 1: + if data_parallel_world_size > 1: log_outputs = distributed_utils.all_gather_list( log_outputs, max_size=cfg.common.all_gather_list_size, + group=distributed_utils.get_data_parallel_group(), ) log_outputs = list(chain.from_iterable(log_outputs)) @@ -136,11 +144,10 @@ def cli_main(): override_parser = options.get_validation_parser() override_args = options.parse_args_and_arch(override_parser, suppress_defaults=True) - distributed_utils.call_main(args, main, override_args=override_args) + distributed_utils.call_main( + convert_namespace_to_omegaconf(args), main, override_args=override_args + ) if __name__ == "__main__": - cs = ConfigStore.instance() - register_hydra_cfg(cs) - initialize(config_path="../config", strict=True) cli_main() diff --git a/hubconf.py b/hubconf.py index ce7d76cfe1..5949e274ed 100644 --- a/hubconf.py +++ b/hubconf.py @@ -2,21 +2,17 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +"""isort:skip_file""" import functools import importlib -from fairseq.hub_utils import ( # noqa; noqa - BPEHubInterface as bpe, - TokenizerHubInterface as tokenizer, -) -from fairseq.models import MODEL_REGISTRY # noqa - dependencies = [ "dataclasses", "hydra", "numpy", + "omegaconf", "regex", "requests", "torch", @@ -39,6 +35,14 @@ raise RuntimeError("Missing dependencies: {}".format(", ".join(missing_deps))) +# only do fairseq imports after checking for dependencies +from fairseq.hub_utils import ( # noqa; noqa + BPEHubInterface as bpe, + TokenizerHubInterface as tokenizer, +) +from fairseq.models import MODEL_REGISTRY # noqa + + # torch.hub doesn't build Cython components, so if they are not found then try # to build them here try: diff --git a/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/__init__.py b/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/__init__.py new file mode 100644 index 0000000000..4884f5bdcb --- /dev/null +++ b/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +__version__ = "0.1" diff --git a/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/config.py b/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/config.py new file mode 100644 index 0000000000..91926c4abc --- /dev/null +++ b/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/config.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from dataclasses import dataclass, field + +from hydra.core.config_store import ConfigStore + +from hydra_plugins.hydra_submitit_launcher.config import SlurmQueueConf + + +@dataclass +class DependencySubmititConf(SlurmQueueConf): + """Slurm configuration overrides and specific parameters""" + + _target_: str = ( + "hydra_plugins.dependency_submitit_launcher.launcher.DependencySubmititLauncher" + ) + + +ConfigStore.instance().store( + group="hydra/launcher", + name="dependency_submitit_slurm", + node=DependencySubmititConf(), + provider="dependency_submitit_slurm", +) diff --git a/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/launcher.py b/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/launcher.py new file mode 100644 index 0000000000..b3fcf79e17 --- /dev/null +++ b/hydra_plugins/dependency_submitit_launcher/hydra_plugins/dependency_submitit_launcher/launcher.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import os +import subprocess +from pathlib import Path +from typing import Any, List, Sequence + +from hydra.core.singleton import Singleton +from hydra.core.utils import JobReturn, filter_overrides +from omegaconf import OmegaConf + +log = logging.getLogger(__name__) + +from .config import DependencySubmititConf +from hydra_plugins.hydra_submitit_launcher.submitit_launcher import BaseSubmititLauncher + + +class DependencySubmititLauncher(BaseSubmititLauncher): + _EXECUTOR = "slurm" + + def launch( + self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int + ) -> Sequence[JobReturn]: + + # lazy import to ensure plugin discovery remains fast + import submitit + + assert self.config is not None + + num_jobs = len(job_overrides) + assert num_jobs > 0 + + next_script = None + + for jo in job_overrides: + if next_script is None: + for item in jo: + if "next_script=" in item: + next_script = item + break + assert ( + next_script is not None + ), "job overrides must contain +next_script=path/to/next/script" + jo.remove(next_script) + + idx = next_script.find("=") + next_script = next_script[idx + 1 :] + + params = self.params + # build executor + init_params = {"folder": self.params["submitit_folder"]} + specific_init_keys = {"max_num_timeout"} + + init_params.update( + **{ + f"{self._EXECUTOR}_{x}": y + for x, y in params.items() + if x in specific_init_keys + } + ) + init_keys = specific_init_keys | {"submitit_folder"} + executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **init_params) + + # specify resources/parameters + baseparams = set(OmegaConf.structured(DependencySubmititConf).keys()) + params = { + x if x in baseparams else f"{self._EXECUTOR}_{x}": y + for x, y in params.items() + if x not in init_keys + } + executor.update_parameters(**params) + + log.info( + f"Submitit '{self._EXECUTOR}' sweep output dir : " + f"{self.config.hydra.sweep.dir}" + ) + sweep_dir = Path(str(self.config.hydra.sweep.dir)) + sweep_dir.mkdir(parents=True, 
exist_ok=True) + if "mode" in self.config.hydra.sweep: + mode = int(str(self.config.hydra.sweep.mode), 8) + os.chmod(sweep_dir, mode=mode) + + job_params: List[Any] = [] + for idx, overrides in enumerate(job_overrides): + idx = initial_job_idx + idx + lst = " ".join(filter_overrides(overrides)) + log.info(f"\t#{idx} : {lst}") + job_params.append( + ( + list(overrides), + "hydra.sweep.dir", + idx, + f"job_id_for_{idx}", + Singleton.get_state(), + ) + ) + + jobs = executor.map_array(self, *zip(*job_params)) + + for j, jp in zip(jobs, job_params): + job_id = str(j.job_id) + task_id = "0" if "_" not in job_id else job_id.split("_")[1] + sweep_config = self.config_loader.load_sweep_config(self.config, jp[0]) + dir = sweep_config.hydra.sweep.dir + + dir = ( + dir.replace("[", "") + .replace("]", "") + .replace("{", "") + .replace("}", "") + .replace(",", "_") + .replace("'", "") + .replace('"', "") + ) + + subprocess.call( + [next_script, job_id, task_id, dir], + shell=False, + ) + + return [j.results()[0] for j in jobs] diff --git a/hydra_plugins/dependency_submitit_launcher/setup.py b/hydra_plugins/dependency_submitit_launcher/setup.py new file mode 100644 index 0000000000..bf795462bd --- /dev/null +++ b/hydra_plugins/dependency_submitit_launcher/setup.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# type: ignore +from pathlib import Path + +from read_version import read_version +from setuptools import find_namespace_packages, setup + +setup( + name="dependency-submitit-launcher", + version=read_version("hydra_plugins/dependency_submitit_launcher", "__init__.py"), + author="Alexei Baevski", + author_email="abaevski@fb.com", + description="Dependency-supporting Submitit Launcher for Hydra apps", + packages=find_namespace_packages(include=["hydra_plugins.*"]), + classifiers=[ + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Development Status :: 4 - Beta", + ], + install_requires=[ + "hydra-core>=1.0.4", + "submitit>=1.0.0", + ], + include_package_data=True, +) diff --git a/pyproject.toml b/pyproject.toml index 6d1b4c5b6f..4d84c9bc36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,23 @@ [build-system] -requires = ["setuptools", "wheel", "cython"] +requires = [ + "setuptools>=18.0", + "wheel", + "cython", + "numpy>=1.21.3", + "torch>=1.10", +] build-backend = "setuptools.build_meta" + +[tool.black] +extend-exclude = ''' +( +^/examples/| +^/fairseq/model_parallel/megatron| +^/build/ +) +''' + +[tool.isort] +profile = "black" +known_third_party = "_cffi_backend,agg_results,aml,bitarray,boto3,botocore,dump_hubert_feature,dynamicconv_cuda,editdistance,faiss,fasttext,feature_utils,ffmpeg,g2p_en,h5py,hydra,hypothesis,indicnlp,inflect,iopath,joblib,kaldi_io,kenlm,libfb,librosa,lightconv_cuda,matplotlib,misc,mmpt,mmpt_cli,model,nltk,npy_append_array,numpy,omegaconf,pandas,pathbuilder,preprocessing,progressbar,pythainlp,random_sequence_shuffler,regex,sacrebleu,sacremoses,scipy,sentencepiece,setuptools,six,sklearn,soundfile,sweep,sweep_wmt_en2de_transformer_big_common,tabulate,torch,torchaudio,tqdm,unidecode,utils,videoreader,wav2vec_cluster_faiss,wget,yaml" +skip_gitignore = true diff --git a/release_utils.py b/release_utils.py new file mode 100644 index 0000000000..69a5e8dda3 --- /dev/null +++ b/release_utils.py @@ -0,0 +1,72 @@ +import argparse +from 
typing import Tuple + + +def get_next_version(release_type) -> Tuple[Tuple[int, int, int], str, str]: + current_ver = find_version("fairseq/version.txt") + version_list = [int(x) for x in current_ver.strip("'").split(".")] + major, minor, patch = version_list[0], version_list[1], version_list[2] + if release_type == "patch": + patch += 1 + elif release_type == "minor": + minor += 1 + patch = 0 + elif release_type == "major": + major += 1 + minor = patch = 0 + else: + raise ValueError( + "Incorrect release type specified. Acceptable types are major, minor and patch." + ) + + new_version_tuple = (major, minor, patch) + new_version_str = ".".join([str(x) for x in new_version_tuple]) + new_tag_str = "v" + new_version_str + return new_version_tuple, new_version_str, new_tag_str + + +def find_version(version_file_path) -> str: + with open(version_file_path) as f: + version = f.read().strip() + return version + + +def update_version(new_version_str) -> None: + """ + given the current version, update the version to the + next version depending on the type of release. + """ + + with open("fairseq/version.txt", "w") as writer: + writer.write(new_version_str) + + +def main(args): + if args.release_type in ["major", "minor", "patch"]: + new_version_tuple, new_version, new_tag = get_next_version(args.release_type) + else: + raise ValueError("Incorrect release type specified") + + if args.update_version: + update_version(new_version) + + print(new_version, new_tag) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Versioning utils") + parser.add_argument( + "--release-type", + type=str, + required=True, + help="type of release = major/minor/patch", + ) + parser.add_argument( + "--update-version", + action="store_true", + required=False, + help="updates the version in fairseq/version.txt", + ) + + args = parser.parse_args() + main(args) diff --git a/scripts/average_checkpoints.py b/scripts/average_checkpoints.py index c512f802bc..49f4f9d912 100644 --- a/scripts/average_checkpoints.py +++ b/scripts/average_checkpoints.py @@ -10,6 +10,7 @@ import re import torch + from fairseq.file_io import PathManager @@ -108,16 +109,21 @@ def main(): help='Write the new checkpoint containing the averaged weights to this path.') num_group = parser.add_mutually_exclusive_group() num_group.add_argument('--num-epoch-checkpoints', type=int, - help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, ' - 'and average last this many of them.') + help='if set, will try to find checkpoints with names checkpoint_xx.pt in the ' + 'path specified by input, and average last this many of them.') num_group.add_argument('--num-update-checkpoints', type=int, - help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, ' - 'and average last this many of them.') + help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by' + ' input, and average last this many of them.') + num_group.add_argument('--num-best-checkpoints', type=int, default=0, + help='if set, will try to find checkpoints with names checkpoint_best_ee_xx.pt in the path specified by' + ' input, and average last this many of them.') parser.add_argument('--checkpoint-upper-bound', type=int, help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, ' 'when using --num-update-checkpoints, this will set an upper bound on which update to use' - 'e.g., with --num-epoch-checkpoints=10 
--checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.' - 'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500' + 'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be' + ' averaged.' + 'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would' + ' be averaged assuming --save-interval-updates 500' ) # fmt: on args = parser.parse_args() @@ -148,6 +154,18 @@ def main(): ) print("averaging checkpoints: ", args.inputs) + if args.num_best_checkpoints > 0: + args.inputs = list( + sorted( + args.inputs, + key=lambda x: float( + os.path.basename(x).split("_")[-1].replace(".pt", "") + ), + ) + ) + args.inputs = args.inputs[: args.num_best_checkpoints] + for path in args.inputs: + print(os.path.basename(path)) new_state = average_checkpoints(args.inputs) with PathManager.open(args.output, "wb") as f: torch.save(new_state, f) diff --git a/scripts/check_installation.py b/scripts/check_installation.py new file mode 100644 index 0000000000..e5a9d9dd46 --- /dev/null +++ b/scripts/check_installation.py @@ -0,0 +1,36 @@ +from pathlib import Path +import os + +cwd = Path(".").resolve() +print("running 'check_installation.py' from:", cwd) + +# Old versions of numpy/torch can prevent loading the .so files +import torch + +print("torch:", torch.__version__) +import numpy + +print("numpy:", numpy.__version__) + +import fairseq + +print("Fairseq installed at:", fairseq.__file__) +import fairseq.criterions +import fairseq.dataclass.configs + +import _imp + +print("Should load following .so suffixes:", _imp.extension_suffixes()) + +so_files = list(Path(fairseq.__file__).parent.glob("*.so")) +so_files.extend(Path(fairseq.__file__).parent.glob("data/*.so")) +print("Found following .so files:") +for so_file in so_files: + print(f"- {so_file}") + +from fairseq import libbleu + +print("Found libbleu at", libbleu.__file__) +from fairseq.data import data_utils_fast + +print("Found data_utils_fast at", data_utils_fast.__file__) diff --git a/scripts/constraints/extract.py b/scripts/constraints/extract.py index f6155d0a05..437b373856 100755 --- a/scripts/constraints/extract.py +++ b/scripts/constraints/extract.py @@ -11,8 +11,6 @@ import random import sys -from sacrebleu import extract_ngrams - def get_phrase(words, index, length): assert index < len(words) - length + 1 diff --git a/scripts/spm_decode.py b/scripts/spm_decode.py index 1c18b1d2a7..7d7b68b240 100644 --- a/scripts/spm_decode.py +++ b/scripts/spm_decode.py @@ -26,13 +26,13 @@ def main(): if args.input_format == "piece": - def decode(l): - return "".join(sp.DecodePieces(l)) + def decode(input): + return "".join(sp.DecodePieces(input)) elif args.input_format == "id": - def decode(l): - return "".join(sp.DecodeIds(l)) + def decode(input): + return "".join(sp.DecodeIds(input)) else: raise NotImplementedError diff --git a/scripts/spm_encode.py b/scripts/spm_encode.py index 83facfb3b1..f91e0bb728 100644 --- a/scripts/spm_encode.py +++ b/scripts/spm_encode.py @@ -49,13 +49,13 @@ def main(): if args.output_format == "piece": - def encode(l): - return sp.EncodeAsPieces(l) + def encode(input): + return sp.EncodeAsPieces(input) elif args.output_format == "id": - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) + def encode(input): + return list(map(str, sp.EncodeAsIds(input))) else: raise NotImplementedError diff --git a/scripts/test_fsdp.sh b/scripts/test_fsdp.sh new file 
mode 100755 index 0000000000..1f428a035e --- /dev/null +++ b/scripts/test_fsdp.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +rm -rf fsdp_dummy +mkdir -p fsdp_dummy +CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \ + --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ + --cpu-offload --checkpoint-activations \ + --task language_modeling --tokens-per-sample 256 --batch-size 8 \ + --arch transformer_lm_gpt2_tiny \ + --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ + --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ + --max-update 5 --log-format json --log-interval 1 \ + --save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \ + --restore-file x.pt "$@" + +# Now we try to load the checkpoint +CUDA_VISIBLE_DEVICES=0,1 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \ + --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ + --cpu-offload --checkpoint-activations \ + --task language_modeling --tokens-per-sample 256 --batch-size 8 \ + --arch transformer_lm_gpt2_tiny \ + --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ + --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ + --max-update 2 --log-format json --log-interval 1 \ + --save-interval-updates 2 --save-dir fsdp_dummy diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000..3fa679ddf1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[flake8] +max-line-length = 127 +extend-ignore = E203, W503 +extend-exclude = fairseq/model_parallel/megatron diff --git a/setup.py b/setup.py index 7b13f13e4c..dae06080c5 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,9 @@ import os import subprocess import sys -from setuptools import setup, find_packages, Extension from setuptools import Extension, find_packages, setup - +from torch.utils import cpp_extension if sys.version_info < (3, 6): sys.exit("Sorry, Python >= 3.6 is required for fairseq.") @@ -20,16 +19,9 @@ def write_version_py(): with open(os.path.join("fairseq", "version.txt")) as f: version = f.read().strip() - # append latest commit hash to version string - try: - sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("ascii").strip() - version += "+" + sha[:7] - except Exception: - pass - # write version info to fairseq/version.py with open(os.path.join("fairseq", "version.py"), "w") as f: - f.write("__version__ = \"{}\"\n".format(version)) + f.write('__version__ = "{}"\n'.format(version)) return version @@ -88,41 +80,56 @@ def include_dirs(self, dirs): ] -cmdclass = {} - - -try: - # torch is not available when generating docs - from torch.utils import cpp_extension - +extensions.extend( + [ + cpp_extension.CppExtension( + "fairseq.libbase", + sources=[ + "fairseq/clib/libbase/balanced_assignment.cpp", + ], + ), + cpp_extension.CppExtension( + "fairseq.libnat", + sources=[ + "fairseq/clib/libnat/edit_dist.cpp", + ], + ), + cpp_extension.CppExtension( + "alignment_train_cpu_binding", + sources=[ + "examples/operators/alignment_train_cpu.cpp", + ], + ), + ] +) +if "CUDA_HOME" in os.environ: extensions.extend( [ cpp_extension.CppExtension( - "fairseq.libnat", + "fairseq.libnat_cuda", sources=[ - "fairseq/clib/libnat/edit_dist.cpp", + "fairseq/clib/libnat_cuda/edit_dist.cu", + "fairseq/clib/libnat_cuda/binding.cpp", ], - ) + ), + cpp_extension.CppExtension( + "fairseq.ngram_repeat_block_cuda", + sources=[ + "fairseq/clib/cuda/ngram_repeat_block_cuda.cpp", + "fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu", + ], + ), + 
cpp_extension.CppExtension( + "alignment_train_cuda_binding", + sources=[ + "examples/operators/alignment_train_kernel.cu", + "examples/operators/alignment_train_cuda.cpp", + ], + ), ] ) - if "CUDA_HOME" in os.environ: - extensions.extend( - [ - cpp_extension.CppExtension( - "fairseq.libnat_cuda", - sources=[ - "fairseq/clib/libnat_cuda/edit_dist.cu", - "fairseq/clib/libnat_cuda/binding.cpp", - ], - ) - ] - ) - cmdclass["build_ext"] = cpp_extension.BuildExtension - -except ImportError: - pass - +cmdclass = {"build_ext": cpp_extension.BuildExtension} if "READTHEDOCS" in os.environ: # don't build extensions when generating docs @@ -132,7 +139,7 @@ def include_dirs(self, dirs): # use CPU build of PyTorch dependency_links = [ - "https://download.pytorch.org/whl/cpu/torch-1.3.0%2Bcpu-cp36-cp36m-linux_x86_64.whl" + "https://download.pytorch.org/whl/cpu/torch-1.7.0%2Bcpu-cp36-cp36m-linux_x86_64.whl" ] else: dependency_links = [] @@ -141,7 +148,6 @@ def include_dirs(self, dirs): if "clean" in sys.argv[1:]: # Source: https://bit.ly/2NLVsgE print("deleting Cython files...") - import subprocess subprocess.run( ["rm -f fairseq/*.so fairseq/**/*.so fairseq/*.pyd fairseq/**/*.pyd"], @@ -149,6 +155,11 @@ def include_dirs(self, dirs): ) +extra_packages = [] +if os.path.exists(os.path.join("fairseq", "model_parallel", "megatron", "mpu")): + extra_packages.append("fairseq.model_parallel.megatron.mpu") + + def do_setup(package_data): setup( name="fairseq", @@ -159,27 +170,31 @@ def do_setup(package_data): "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], long_description=readme, long_description_content_type="text/markdown", - setup_requires=[ - "cython", - "numpy", - "setuptools>=18.0", - ], install_requires=[ "cffi", "cython", - "dataclasses", - "editdistance", - "hydra-core", - "numpy", + "hydra-core>=1.0.7,<1.1", + "omegaconf<2.1", + "numpy>=1.21.3", "regex", "sacrebleu>=1.4.12", - "torch", + "torch>=1.13", "tqdm", + "bitarray", + "torchaudio>=0.8.0", + "scikit-learn", + "packaging", ], + extras_require={ + "dev": ["flake8", "pytest", "black==22.3.0"], + "docs": ["sphinx", "sphinx-argparse"], + }, dependency_links=dependency_links, packages=find_packages( exclude=[ @@ -190,7 +205,8 @@ def do_setup(package_data): "tests", "tests.*", ] - ), + ) + + extra_packages, package_data=package_data, ext_modules=extensions, test_suite="tests", @@ -198,6 +214,7 @@ def do_setup(package_data): "console_scripts": [ "fairseq-eval-lm = fairseq_cli.eval_lm:cli_main", "fairseq-generate = fairseq_cli.generate:cli_main", + "fairseq-hydra-train = fairseq_cli.hydra_train:cli_main", "fairseq-interactive = fairseq_cli.interactive:cli_main", "fairseq-preprocess = fairseq_cli.preprocess:cli_main", "fairseq-score = fairseq_cli.score:cli_main", @@ -221,16 +238,20 @@ def get_files(path, relative_to="fairseq"): return all_files -try: - # symlink config and examples into fairseq package so package_data accepts them - if "build_ext" not in sys.argv[1:]: - os.symlink(os.path.join("..", "config"), "fairseq/config") - os.symlink(os.path.join("..", "examples"), "fairseq/examples") - package_data = { - "fairseq": get_files("fairseq/config") + get_files("fairseq/examples"), - } - do_setup(package_data) -finally: - if "build_ext" not in sys.argv[1:]: - os.unlink("fairseq/config") - os.unlink("fairseq/examples") +if __name__ 
== "__main__": + try: + # symlink examples into fairseq package so package_data accepts them + fairseq_examples = os.path.join("fairseq", "examples") + if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples): + os.symlink(os.path.join("..", "examples"), fairseq_examples) + + package_data = { + "fairseq": ( + get_files(fairseq_examples) + + get_files(os.path.join("fairseq", "config")) + ) + } + do_setup(package_data) + finally: + if "build_ext" not in sys.argv[1:] and os.path.islink(fairseq_examples): + os.unlink(fairseq_examples) diff --git a/tests/distributed/__init__.py b/tests/distributed/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_bmuf.py b/tests/distributed/test_bmuf.py similarity index 92% rename from tests/test_bmuf.py rename to tests/distributed/test_bmuf.py index e7aa6da1ca..995d0db180 100644 --- a/tests/test_bmuf.py +++ b/tests/distributed/test_bmuf.py @@ -4,15 +4,19 @@ # LICENSE file in the root directory of this source tree. import argparse +import functools import random import unittest from multiprocessing import Manager import torch import torch.nn as nn -from fairseq import distributed_utils, optim from omegaconf import OmegaConf +from fairseq import optim +from fairseq.distributed import utils as distributed_utils + + class Model(nn.Module): def __init__(self, input_size, output_size): super(Model, self).__init__() @@ -39,10 +43,7 @@ def setup_model_loss_criterion(cfg, args, rank, is_cuda): loss_fn = loss_fn.cuda() optimizer = optim.sgd.SGD(args, model.parameters()) - optimizer = optim.FairseqBMUF( - cfg=cfg.bmuf, - optimizer=optimizer - ) + optimizer = optim.FairseqBMUF(cfg=cfg.bmuf, optimizer=optimizer) return model, loss_fn, optimizer @@ -139,18 +140,13 @@ def setup_args(): @unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs") class TestBMUF(unittest.TestCase): def bmuf_process(self, cfg, args, iterations): - processes = [] results = Manager().dict() - ctx = torch.multiprocessing.get_context("spawn") - for rank in range(args.distributed_world_size): - p = ctx.Process( - target=single_gpu_training, args=(cfg, args, rank, iterations, results) - ) - p.start() - processes.append(p) - - for p in processes: - p.join() + torch.multiprocessing.spawn( + fn=functools.partial(single_gpu_training, cfg, args), + args=(iterations, results), + nprocs=args.distributed_world_size, + join=True, + ) return results def test_bmuf_sync(self): diff --git a/tests/distributed/test_distributed_timeout_wrapper.py b/tests/distributed/test_distributed_timeout_wrapper.py new file mode 100644 index 0000000000..996093cb2d --- /dev/null +++ b/tests/distributed/test_distributed_timeout_wrapper.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import signal +import time +import unittest + +import torch +from torch import nn + +from fairseq.distributed import DistributedTimeoutWrapper + + +class ModuleWithDelay(nn.Module): + def __init__(self, delay): + super().__init__() + self.delay = delay + + def forward(self, x): + time.sleep(self.delay) + return x + + +class TestDistributedTimeoutWrapper(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + def test_no_timeout(self): + module = DistributedTimeoutWrapper(ModuleWithDelay(1), 0, signal.SIGINT) + module(torch.rand(5)) + module.stop_timeout() + + def test_timeout_safe(self): + module = DistributedTimeoutWrapper(ModuleWithDelay(1), 10, signal.SIGINT) + module(torch.rand(5)) + module.stop_timeout() + + def test_timeout_killed(self): + with self.assertRaises(KeyboardInterrupt): + module = DistributedTimeoutWrapper(ModuleWithDelay(5), 1, signal.SIGINT) + module(torch.rand(5)) + module.stop_timeout() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/distributed/test_module_proxy_wrapper.py b/tests/distributed/test_module_proxy_wrapper.py new file mode 100644 index 0000000000..2ac1a877c3 --- /dev/null +++ b/tests/distributed/test_module_proxy_wrapper.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from torch import nn + +from fairseq.distributed import ModuleProxyWrapper + +from .utils import objects_are_equal + + +class MockDDPWrapper(nn.Module): + """A simple wrapper with an interface similar to DistributedDataParallel.""" + + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, x): + return self.module(x) + + +class Model(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 10) + self.xyz = "hello" + + def forward(self, x): + return self.linear(x) + + def get_xyz(self): + return self.xyz + + +class TestModuleProxyWrapper(unittest.TestCase): + def _get_module(self): + module = Model() + wrapped_module = MockDDPWrapper(module) + wrapped_module = ModuleProxyWrapper(wrapped_module) + return wrapped_module, module + + def test_getattr_forwarding(self): + wrapped_module, module = self._get_module() + assert module.xyz == "hello" + assert module.get_xyz() == "hello" + assert wrapped_module.xyz == "hello" + + wrapped_module.xyz = "world" + assert wrapped_module.xyz == "world" + assert module.get_xyz() == "hello" + + def test_state_dict(self): + wrapped_module, module = self._get_module() + assert objects_are_equal(wrapped_module.state_dict(), module.state_dict()) + + def test_load_state_dict(self): + wrapped_module, module = self._get_module() + wrapped_module.load_state_dict(module.state_dict()) + input = torch.rand(4, 5) + torch.testing.assert_allclose(wrapped_module(input), module(input)) + + def test_forward(self): + wrapped_module, module = self._get_module() + input = torch.rand(4, 5) + torch.testing.assert_allclose(wrapped_module(input), module(input)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py new file mode 100644 index 0000000000..30f995b67a --- /dev/null +++ b/tests/distributed/test_utils.py @@ -0,0 +1,124 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import functools +import sys +import unittest + +import torch + +from fairseq.distributed import utils as dist_utils + +from .utils import objects_are_equal, spawn_and_init + + +class DistributedTest(unittest.TestCase): + def setUp(self): + if not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA not available, skipping test") + if sys.platform == "win32": + raise unittest.SkipTest("NCCL doesn't support Windows, skipping test") + if torch.cuda.device_count() < 2: + raise unittest.SkipTest("distributed tests require 2+ GPUs, skipping") + + +class TestBroadcastObject(DistributedTest): + def test_str(self): + spawn_and_init( + functools.partial( + TestBroadcastObject._test_broadcast_object, "hello world" + ), + world_size=2, + ) + + def test_tensor(self): + spawn_and_init( + functools.partial( + TestBroadcastObject._test_broadcast_object, + torch.rand(5), + ), + world_size=2, + ) + + def test_complex(self): + spawn_and_init( + functools.partial( + TestBroadcastObject._test_broadcast_object, + { + "a": "1", + "b": [2, torch.rand(2, 3), 3], + "c": (torch.rand(2, 3), 4), + "d": {5, torch.rand(5)}, + "e": torch.rand(5), + "f": torch.rand(5).int().cuda(), + }, + ), + world_size=2, + ) + + @staticmethod + def _test_broadcast_object(ref_obj, rank, group): + obj = dist_utils.broadcast_object( + ref_obj if rank == 0 else None, src_rank=0, group=group + ) + assert objects_are_equal(ref_obj, obj) + + +class TestAllGatherList(DistributedTest): + def test_str_equality(self): + spawn_and_init( + functools.partial( + TestAllGatherList._test_all_gather_list_equality, + "hello world", + ), + world_size=2, + ) + + def test_tensor_equality(self): + spawn_and_init( + functools.partial( + TestAllGatherList._test_all_gather_list_equality, + torch.rand(5), + ), + world_size=2, + ) + + def test_complex_equality(self): + spawn_and_init( + functools.partial( + TestAllGatherList._test_all_gather_list_equality, + { + "a": "1", + "b": [2, torch.rand(2, 3), 3], + "c": (torch.rand(2, 3), 4), + "d": {5, torch.rand(5)}, + "e": torch.rand(5), + "f": torch.rand(5).int(), + }, + ), + world_size=2, + ) + + @staticmethod + def _test_all_gather_list_equality(ref_obj, rank, group): + objs = dist_utils.all_gather_list(ref_obj, group) + for obj in objs: + assert objects_are_equal(ref_obj, obj) + + def test_rank_tensor(self): + spawn_and_init( + TestAllGatherList._test_all_gather_list_rank_tensor, world_size=2 + ) + + @staticmethod + def _test_all_gather_list_rank_tensor(rank, group): + obj = torch.tensor([rank]) + objs = dist_utils.all_gather_list(obj, group) + for i, obj in enumerate(objs): + assert obj.item() == i + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/distributed/utils.py b/tests/distributed/utils.py new file mode 100644 index 0000000000..be4e19cd1e --- /dev/null +++ b/tests/distributed/utils.py @@ -0,0 +1,65 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import functools +import tempfile + +import torch + + +def spawn_and_init(fn, world_size, args=None): + if args is None: + args = () + with tempfile.NamedTemporaryFile(delete=False) as tmp_file: + torch.multiprocessing.spawn( + fn=functools.partial(init_and_run, fn, args), + args=( + world_size, + tmp_file.name, + ), + nprocs=world_size, + join=True, + ) + + +def distributed_init(rank, world_size, tmp_file): + torch.distributed.init_process_group( + backend="nccl", + init_method="file://{}".format(tmp_file), + world_size=world_size, + rank=rank, + ) + torch.cuda.set_device(rank) + + +def init_and_run(fn, args, rank, world_size, tmp_file): + distributed_init(rank, world_size, tmp_file) + group = torch.distributed.new_group() + fn(rank, group, *args) + + +def objects_are_equal(a, b) -> bool: + if type(a) is not type(b): + return False + if isinstance(a, dict): + if set(a.keys()) != set(b.keys()): + return False + for k in a.keys(): + if not objects_are_equal(a[k], b[k]): + return False + return True + elif isinstance(a, (list, tuple, set)): + if len(a) != len(b): + return False + return all(objects_are_equal(x, y) for x, y in zip(a, b)) + elif torch.is_tensor(a): + return ( + a.size() == b.size() + and a.dtype == b.dtype + and a.device == b.device + and torch.all(a == b) + ) + else: + return a == b diff --git a/tests/gpu/test_binaries_gpu.py b/tests/gpu/test_binaries_gpu.py index 2ac60a0934..5caf94cde7 100644 --- a/tests/gpu/test_binaries_gpu.py +++ b/tests/gpu/test_binaries_gpu.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import contextlib +import json import logging import os import tempfile @@ -11,6 +12,7 @@ from io import StringIO import torch + from fairseq import options from fairseq_cli import train from tests.utils import ( @@ -18,10 +20,123 @@ generate_main, preprocess_lm_data, preprocess_translation_data, + train_language_model, train_translation_model, ) +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestMultiGPU(unittest.TestCase): + @staticmethod + def parse_logs(logfile): + logs = [] + for ln in open(logfile, "r").readlines(): + try: + logs.append(json.loads(ln)) + except json.JSONDecodeError: + continue + return logs + + @property + def world_size(self): + return torch.cuda.device_count() + + def train_flags(self, mu): + return [ + "--memory-efficient-fp16", + "--update-freq", + "1", + "--seed", + "1", + "--log-format", + "json", + "--max-update", + str(mu), + "--tokens-per-sample", + "20", + "--batch-size", + "2", + "--share-decoder-input-output-embed", + "--optimizer", + "adam", + "--max-valid-steps", + "1", + "--pad-to-fixed-length", + "--sample-break-mode", + "none", + ] + + def _test_resume_multilingual_training( + self, extra_clargs, arch="transformer_lm_gpt2_tiny" + ): + languages = ["en_XX", "fr_XX", "zh_CN"] + save_interval = 5 + mu = 10 + flags = ( + self.train_flags(mu) + + ["--save-interval-updates", str(save_interval), "--log-interval", "1"] + + extra_clargs + ) + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_fp16") as data_dir: + log = os.path.join(data_dir, "train.log") + create_dummy_data( + data_dir, + num_examples=int( + mu * 20 * self.world_size * 1.5 + ), # make sure enough data for max updates + languages=languages, + ) + preprocess_lm_data(data_dir, languages) + train_language_model( + data_dir, + arch, + flags + ["--log-file", log], + task="multilingual_language_modeling", + world_size=self.world_size, + ) + log2 = os.path.join(data_dir, 
"resume.log") + ckpt_name = f"checkpoint_1_{save_interval}.pt" + restore_file = os.path.join(data_dir, ckpt_name) + train_language_model( + data_dir, + arch, + flags + + ["--log-file", log2, "--restore-file", restore_file, "--no-save"], + task="multilingual_language_modeling", + world_size=self.world_size, + ) + + l1 = self.parse_logs(log) + assert ( + int(l1[-1]["train_num_updates"]) == mu + ), f"The first run did not complete {mu} updates. Add more data" + l2 = self.parse_logs(log2) + + if int(l2[0]["num_updates"]) != save_interval + 1: + all_ckpt_files = [ + x for x in os.listdir(data_dir) if x.endswith(".pt") + ] + import shutil + + shutil.move(data_dir, "last_failed_resume") + raise AssertionError( + f"Likely failed to load {ckpt_name}. {all_ckpt_files} \n LOGS: {l1} \n\n {l2}. " + ) + for k in [ + "train_loss", + "train_num_updates", + "train_ppl", + "train_gnorm", + ]: + from_scratch, resumed = float(l1[-1][k]), float(l2[-1][k]) + # This fails without rounding! + assert ( + from_scratch == resumed + ), f"difference at {k} {from_scratch} != {resumed}" + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestTranslationGPU(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) @@ -29,16 +144,103 @@ def setUp(self): def tearDown(self): logging.disable(logging.NOTSET) - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_fp16(self): + def test_fp16_multigpu(self): + self._test_multigpu("test_fp16", ["--fp16"]) + + def test_slowmo_multigpu(self): + self._test_multigpu( + "test_slowmo", ["--ddp-backend", "slowmo", "--nprocs-per-node", "1"] + ) + + def test_slowmo_single_node_multigpu(self): + self._test_multigpu( + "test_slowmo_single_node", + ["--ddp-backend", "slowmo", "--nprocs-per-node", "2"], + ) + + def _test_multigpu(self, test_name, test_args): with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_fp16") as data_dir: + with tempfile.TemporaryDirectory(test_name) as data_dir: + log = os.path.join(data_dir, "train.log") create_dummy_data(data_dir) preprocess_translation_data(data_dir) - train_translation_model(data_dir, "fconv_iwslt_de_en", ["--fp16"]) + train_translation_model( + data_dir, + "fconv_iwslt_de_en", + test_args + ["--log-file", log], + world_size=min(torch.cuda.device_count(), 2), + ) generate_main(data_dir) + assert os.path.exists(log) + + @staticmethod + def parse_logs(logfile): + logs = [] + for ln in open(logfile, "r").readlines(): + try: + logs.append(json.loads(ln)) + except json.JSONDecodeError: + continue + return logs + + def test_resume_training_fsdp(self): + self._test_resume_training(["--ddp-backend", "fully_sharded"]) + + def test_resume_training_fsdp_sharded_state(self): + self._test_resume_training( + ["--ddp-backend", "fully_sharded", "--use-sharded-state"] + ) + + def test_resume_training_noc10d(self): + self._test_resume_training([]) + + def _test_resume_training(self, extra_clargs, arch="fconv_iwslt_de_en"): + flags = [ + "--fp16", + "--log-format", + "json", + "--max-update", + "10", + "--save-interval-updates", + "2", + "--log-interval", + "1", + ] + extra_clargs + world_size = min(torch.cuda.device_count(), 2) + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_fp16") as data_dir: + log = os.path.join(data_dir, "train.log") + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model( + data_dir, + arch, + flags + ["--log-file", log], + world_size=world_size, + ) + log2 = 
os.path.join(data_dir, "resume.log") + restore_file = os.path.join(data_dir, "checkpoint_1_2.pt") + train_translation_model( + data_dir, + arch, + flags + ["--log-file", log2, "--restore-file", restore_file], + world_size=world_size, + ) + + l1 = self.parse_logs(log) + l2 = self.parse_logs(log2) + assert int(l2[0]["num_updates"]) == 3, f"{l1}\n\n {l2}" + for k in [ + "train_loss", + "train_num_updates", + "train_ppl", + "train_gnorm", + ]: + from_scratch, resumed = l1[-1][k], l2[-1][k] + assert ( + from_scratch == resumed + ), f"difference at {k} {from_scratch} != {resumed}" - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_memory_efficient_fp16(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir: @@ -49,7 +251,6 @@ def test_memory_efficient_fp16(self): ) generate_main(data_dir) - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_transformer_fp16(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer") as data_dir: @@ -73,6 +274,39 @@ def test_transformer_fp16(self): ) generate_main(data_dir) + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_amp(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_amp") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, "fconv_iwslt_de_en", ["--amp"]) + generate_main(data_dir) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_transformer_amp(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_transformer") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model( + data_dir, + "transformer_iwslt_de_en", + [ + "--encoder-layers", + "2", + "--decoder-layers", + "2", + "--encoder-embed-dim", + "64", + "--decoder-embed-dim", + "64", + "--amp", + ], + run_validation=True, + ) + generate_main(data_dir) + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_levenshtein_transformer(self): with contextlib.redirect_stdout(StringIO()): @@ -93,18 +327,66 @@ def test_levenshtein_transformer(self): ], task="translation_lev", ) + gen_config = [ + "--task", + "translation_lev", + "--iter-decode-max-iter", + "9", + "--iter-decode-eos-penalty", + "0", + "--print-step", + ] + # non-ensemble generation + generate_main(data_dir, gen_config) + # ensemble generation generate_main( data_dir, + gen_config, + path=os.pathsep.join( + [ + os.path.join(data_dir, "checkpoint_last.pt"), + os.path.join(data_dir, "checkpoint_last.pt"), + ] + ), + ) + + def test_fsdp_checkpoint_generate(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir: + log = os.path.join(data_dir, "train.log") + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + world_size = min(torch.cuda.device_count(), 2) + train_translation_model( + data_dir, + "fconv_iwslt_de_en", + ["--log-file", log, "--ddp-backend", "fully_sharded"], + world_size=world_size, + ) + generate_main(data_dir) + assert os.path.exists(log) + + def test_fsdp_sharded_checkpoint_generate(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir: + log = os.path.join(data_dir, "train.log") + create_dummy_data(data_dir) + 
preprocess_translation_data(data_dir) + world_size = min(torch.cuda.device_count(), 2) + train_translation_model( + data_dir, + "fconv_iwslt_de_en", [ - "--task", - "translation_lev", - "--iter-decode-max-iter", - "9", - "--iter-decode-eos-penalty", - "0", - "--print-step", + "--log-file", + log, + "--ddp-backend", + "fully_sharded", + "--use-sharded-state", ], + world_size=world_size, ) + generate_main(data_dir, ["--checkpoint-shard-count", str(world_size)]) + assert os.path.exists(log) def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False): @@ -229,6 +511,10 @@ def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=Fa train.main(quantize_args) +@unittest.skipIf( + int(torch.__version__[2]) < 10, reason="quantized kernels are only supported on CPU" +) +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestQuantization(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) @@ -236,7 +522,6 @@ def setUp(self): def tearDown(self): logging.disable(logging.NOTSET) - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_quantization(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_quantization") as data_dir: @@ -246,6 +531,7 @@ def test_quantization(self): _quantize_language_model(data_dir, "transformer_lm") +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestOptimizersGPU(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) @@ -253,7 +539,6 @@ def setUp(self): def tearDown(self): logging.disable(logging.NOTSET) - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_flat_grads(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_flat_grads") as data_dir: diff --git a/tests/gpu/test_ema_gpu.py b/tests/gpu/test_ema_gpu.py new file mode 100644 index 0000000000..33fb5607b4 --- /dev/null +++ b/tests/gpu/test_ema_gpu.py @@ -0,0 +1,215 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional + +import torch + +from fairseq.models.ema import EMA + + +class DummyModule(torch.nn.Module): + def __init__(self) -> None: + """LightningModule for testing purposes + + Args: + epoch_min_loss_override (int, optional): Pass in an epoch that will be set to the minimum + validation loss for testing purposes (zero based). If None this is ignored. Defaults to None. 
+ """ + super().__init__() + self.layer = torch.nn.Linear(in_features=32, out_features=2) + self.another_layer = torch.nn.Linear(in_features=2, out_features=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.layer(x) + return self.another_layer(x) + + +@dataclass +class EMAConfig(object): + ema_decay: float = 0.99 + ema_start_update: int = 0 + ema_fp32: bool = False + ema_seed_model: Optional[str] = None + ema_update_freq: int = 1 + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestEMAGPU(unittest.TestCase): + def assertTorchAllClose(self, x, y, atol=1e-8, rtol=1e-5, msg=None): + diff = x.float() - y.float() + diff_norm = torch.norm(diff) + other_norm = torch.norm(y.float()) + + if msg is None: + msg = "|input - other| > {} + {} * |other|".format(atol, rtol) + + self.assertLessEqual( + diff_norm, + atol + rtol * other_norm, + msg=msg, + ) + + def test_ema(self): + model = DummyModule().cuda() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig() + ema = EMA(model, config) + + # set decay + ema._set_decay(config.ema_decay) + self.assertEqual(ema.get_decay(), config.ema_decay) + + # get model + self.assertEqual(ema.get_model(), ema.model) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + # EMA step + x = torch.randn(32).cuda() + y = model(x) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model) + + ema_state_dict = ema.get_model().state_dict() + + for key, param in model.state_dict().items(): + prev_param = state[key] + ema_param = ema_state_dict[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + self.assertTorchAllClose( + ema_param, + config.ema_decay * prev_param + (1 - config.ema_decay) * param, + ) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + # Load EMA into model + model2 = DummyModule().cuda() + ema.reverse(model2) + + for key, param in model2.state_dict().items(): + ema_param = ema_state_dict[key] + self.assertTrue(torch.allclose(ema_param, param)) + + def test_ema_fp32(self): + model = DummyModule().cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig(ema_fp32=True) + ema = EMA(model, config) + + x = torch.randn(32).cuda() + y = model(x.half()) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model) + + for key, param in model.state_dict().items(): + prev_param = state[key] + ema_param = ema.get_model().state_dict()[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + self.assertIn(key, ema.fp32_params) + + # EMA update is done in fp32, and hence the EMA param must be + # closer to the EMA update done in fp32 than in fp16. 
+ self.assertLessEqual( + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ) + .half() + .float() + ), + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param + (1 - config.ema_decay) * param + ).float() + ), + ) + self.assertTorchAllClose( + ema_param, + ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ).half(), + ) + + def test_ema_fp16(self): + model = DummyModule().cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig(ema_fp32=False) + ema = EMA(model, config) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + x = torch.randn(32).cuda() + y = model(x.half()) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model) + + for key, param in model.state_dict().items(): + prev_param = state[key] + ema_param = ema.get_model().state_dict()[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + + # EMA update is done in fp16, and hence the EMA param must be + # closer to the EMA update done in fp16 than in fp32. + self.assertLessEqual( + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param + (1 - config.ema_decay) * param + ).float() + ), + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ) + .half() + .float() + ), + ) + self.assertTorchAllClose( + ema_param, + config.ema_decay * prev_param + (1 - config.ema_decay) * param, + ) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/__init__.py b/tests/speech/__init__.py new file mode 100644 index 0000000000..dba99e4d93 --- /dev/null +++ b/tests/speech/__init__.py @@ -0,0 +1,210 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from argparse import Namespace +import os +import re +import unittest +from pathlib import Path +from tqdm import tqdm +from typing import List, Dict, Optional +import torch +from fairseq.checkpoint_utils import load_model_ensemble_and_task +from fairseq.scoring.wer import WerScorer +from fairseq.scoring.bleu import SacrebleuScorer +from fairseq import utils +import zipfile + +S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq" + + +class TestFairseqSpeech(unittest.TestCase): + @classmethod + def download(cls, base_url: str, out_root: Path, filename: str): + url = f"{base_url}/{filename}" + path = out_root / filename + if not path.exists(): + torch.hub.download_url_to_file(url, path.as_posix(), progress=True) + return path + + def _set_up(self, dataset_id: str, s3_dir: str, data_filenames: List[str]): + self.use_cuda = torch.cuda.is_available() + self.root = Path.home() / ".cache" / "fairseq" / dataset_id + self.root.mkdir(exist_ok=True, parents=True) + os.chdir(self.root) + self.base_url = ( + s3_dir if re.search("^https:", s3_dir) else f"{S3_BASE_URL}/{s3_dir}" + ) + for filename in data_filenames: + self.download(self.base_url, self.root, filename) + + def set_up_librispeech(self): + self._set_up( + "librispeech", + "s2t/librispeech", + [ + "cfg_librispeech.yaml", + "spm_librispeech_unigram10000.model", + "spm_librispeech_unigram10000.txt", + "librispeech_test-other.tsv", + "librispeech_test-other.zip", + ], + ) + + def set_up_ljspeech(self): + self._set_up( + "ljspeech", + "s2/ljspeech", + [ + "cfg_ljspeech_g2p.yaml", + "ljspeech_g2p_gcmvn_stats.npz", + "ljspeech_g2p.txt", + "ljspeech_test.tsv", + "ljspeech_test.zip", + ], + ) + + def set_up_sotasty_es_en(self): + self._set_up( + "sotasty_es_en", + "s2t/big/es-en", + [ + "cfg_es_en.yaml", + "spm_bpe32768_es_en.model", + "spm_bpe32768_es_en.txt", + "sotasty_es_en_test_ted.tsv", + "sotasty_es_en_test_ted.zip", + ], + ) + + def set_up_mustc_de_fbank(self): + self._set_up( + "mustc_de_fbank", + "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de", + [ + "config.yaml", + "spm.model", + "dict.txt", + "src_dict.txt", + "tgt_dict.txt", + "tst-COMMON.tsv", + "tst-COMMON.zip", + ], + ) + + def download_and_load_checkpoint( + self, + checkpoint_filename: str, + arg_overrides: Optional[Dict[str, str]] = None, + strict: bool = True, + ): + path = self.download(self.base_url, self.root, checkpoint_filename) + _arg_overrides = arg_overrides or {} + _arg_overrides["data"] = self.root.as_posix() + models, cfg, task = load_model_ensemble_and_task( + [path.as_posix()], arg_overrides=_arg_overrides, strict=strict + ) + if self.use_cuda: + for model in models: + model.cuda() + + return models, cfg, task, self.build_generator(task, models, cfg) + + def build_generator( + self, + task, + models, + cfg, + ): + return task.build_generator(models, cfg) + + @classmethod + def get_batch_iterator(cls, task, test_split, max_tokens, max_positions): + task.load_dataset(test_split) + return task.get_batch_iterator( + dataset=task.dataset(test_split), + max_tokens=max_tokens, + max_positions=max_positions, + num_workers=1, + ).next_epoch_itr(shuffle=False) + + @classmethod + def get_wer_scorer( + cls, tokenizer="none", lowercase=False, remove_punct=False, char_level=False + ): + scorer_args = { + "wer_tokenizer": tokenizer, + "wer_lowercase": lowercase, + "wer_remove_punct": remove_punct, + "wer_char_level": char_level, + } + return WerScorer(Namespace(**scorer_args)) + + @classmethod + def get_bleu_scorer(cls, tokenizer="13a", lowercase=False, 
char_level=False): + scorer_args = { + "sacrebleu_tokenizer": tokenizer, + "sacrebleu_lowercase": lowercase, + "sacrebleu_char_level": char_level, + } + return SacrebleuScorer(Namespace(**scorer_args)) + + @torch.no_grad() + def base_test( + self, + ckpt_name, + reference_score, + score_delta=0.3, + dataset="librispeech_test-other", + max_tokens=65_536, + max_positions=(4_096, 1_024), + arg_overrides=None, + strict=True, + score_type="wer", + ): + models, _, task, generator = self.download_and_load_checkpoint( + ckpt_name, arg_overrides=arg_overrides, strict=strict + ) + if not self.use_cuda: + return + + batch_iterator = self.get_batch_iterator( + task, dataset, max_tokens, max_positions + ) + if score_type == "bleu": + scorer = self.get_bleu_scorer() + elif score_type == "wer": + scorer = self.get_wer_scorer() + else: + raise Exception(f"Unsupported score type {score_type}") + + progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator)) + for batch_idx, sample in progress: + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + hypo = task.inference_step(generator, models, sample) + for i, sample_id in enumerate(sample["id"].tolist()): + tgt_str, hypo_str = self.postprocess_tokens( + task, + sample["target"][i, :], + hypo[i][0]["tokens"].int().cpu(), + ) + if batch_idx == 0 and i < 3: + print(f"T-{sample_id} {tgt_str}") + print(f"H-{sample_id} {hypo_str}") + scorer.add_string(tgt_str, hypo_str) + + print(scorer.result_string() + f" (reference: {reference_score})") + self.assertAlmostEqual(scorer.score(), reference_score, delta=score_delta) + + def postprocess_tokens(self, task, target, hypo_tokens): + tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu() + tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece") + hypo_str = task.tgt_dict.string(hypo_tokens, "sentencepiece") + return tgt_str, hypo_str + + def unzip_files(self, zip_file_name): + zip_file_path = self.root / zip_file_name + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(self.root / zip_file_name.strip(".zip")) diff --git a/tests/speech/test_convtransformer_simul_trans.py b/tests/speech/test_convtransformer_simul_trans.py new file mode 100644 index 0000000000..0562404f52 --- /dev/null +++ b/tests/speech/test_convtransformer_simul_trans.py @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from tests.speech import TestFairseqSpeech + +S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq/" + + +class TestConvtransformerSimulTrans(TestFairseqSpeech): + def setUp(self): + self._set_up( + "simul", + "speech_tests/simul", + ["config_gcmvn_specaug.yaml", "dict.txt", "dev.tsv"], + ) + + def test_waitk_checkpoint(self): + """Only test model loading since fairseq currently doesn't support inference of simultaneous models""" + _, _, _, _ = self.download_and_load_checkpoint( + "checkpoint_best.pt", + arg_overrides={ + "config_yaml": "config_gcmvn_specaug.yaml", + "load_pretrained_encoder_from": None, + }, + ) + return + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_dual_input_wav_transformer.py b/tests/speech/test_dual_input_wav_transformer.py new file mode 100644 index 0000000000..3581bc1991 --- /dev/null +++ b/tests/speech/test_dual_input_wav_transformer.py @@ -0,0 +1,76 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from collections import namedtuple +from pathlib import Path + +import torch +from tqdm import tqdm + +import fairseq +from fairseq import utils +from fairseq.checkpoint_utils import load_model_ensemble_and_task +from fairseq.scoring.bleu import SacrebleuScorer +from fairseq.tasks import import_tasks +from tests.speech import S3_BASE_URL, TestFairseqSpeech + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestLibrispeechDualInputWavTransformer(TestFairseqSpeech): + def setUp(self): + dataset_id = "librispeech_wvtrasnformer" + base_url = "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned" + data_filenames = [ + "checkpoint_ave_10.pt", + "spm.model", + "src_dict.txt", + "tgt_dict.txt", + "config.yaml", + ] + self._set_up( + dataset_id, + "s2t", + [ + "librispeech_flac_test-other.tsv", + "librispeech_flac_test-other.zip", + ], + ) + for filename in data_filenames: + self.download(base_url, self.root, filename) + + def import_user_module(self): + user_dir = ( + Path(fairseq.__file__).parent.parent / "examples/speech_text_joint_to_text" + ) + Arg = namedtuple("Arg", ["user_dir"]) + arg = Arg(user_dir.__str__()) + utils.import_user_module(arg) + + @torch.no_grad() + def test_librispeech_dualinput_wav_transformer_checkpoint(self): + self.import_user_module() + checkpoint_filename = "checkpoint_ave_10.pt" + arg_overrides = { + "config_yaml": "config.yaml", + "load_pretrained_speech_text_encoder": "", + "load_pretrained_speech_text_decoder": "", + "beam": 10, + "nbest": 1, + "lenpen": 1.0, + "load_speech_only": True, + } + self.base_test( + checkpoint_filename, + 4.6, + dataset="librispeech_flac_test-other", + max_tokens=800000, + max_positions=(800000, 1024), + arg_overrides=arg_overrides, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_dualinput_s2t_transformer.py b/tests/speech/test_dualinput_s2t_transformer.py new file mode 100644 index 0000000000..76675b9823 --- /dev/null +++ b/tests/speech/test_dualinput_s2t_transformer.py @@ -0,0 +1,110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from argparse import Namespace +from collections import namedtuple +from pathlib import Path + +import torch +from tqdm import tqdm + +import fairseq +from fairseq import utils +from fairseq.checkpoint_utils import load_model_ensemble_and_task +from fairseq.scoring.bleu import SacrebleuScorer +from fairseq.tasks import import_tasks +from tests.speech import TestFairseqSpeech + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestDualInputS2TTransformer(TestFairseqSpeech): + def setUp(self): + self.set_up_mustc_de_fbank() + + def import_user_module(self): + user_dir = ( + Path(fairseq.__file__).parent.parent / "examples/speech_text_joint_to_text" + ) + Arg = namedtuple("Arg", ["user_dir"]) + arg = Arg(user_dir.__str__()) + utils.import_user_module(arg) + + @torch.no_grad() + def test_mustc_de_fbank_dualinput_s2t_transformer_checkpoint(self): + self.import_user_module() + checkpoint_filename = "checkpoint_ave_10.pt" + path = self.download(self.base_url, self.root, checkpoint_filename) + models, cfg, task = load_model_ensemble_and_task( + [path.as_posix()], + arg_overrides={ + "data": self.root.as_posix(), + "config_yaml": "config.yaml", + "load_pretrain_speech_encoder": "", + "load_pretrain_text_encoder_last": "", + "load_pretrain_decoder": "", + "beam": 10, + "nbest": 1, + "lenpen": 1.0, + "load_speech_only": True, + }, + ) + if self.use_cuda: + for model in models: + model.cuda() + generator = task.build_generator(models, cfg) + test_split = "tst-COMMON" + task.load_dataset(test_split) + batch_iterator = task.get_batch_iterator( + dataset=task.dataset(test_split), + max_tokens=250_000, + max_positions=(10_000, 1_024), + num_workers=1, + ).next_epoch_itr(shuffle=False) + + tokenizer = task.build_tokenizer(cfg.tokenizer) + bpe = task.build_bpe(cfg.bpe) + + def decode_fn(x): + if bpe is not None: + x = bpe.decode(x) + if tokenizer is not None: + x = tokenizer.decode(x) + return x + + scorer_args = { + "sacrebleu_tokenizer": "13a", + "sacrebleu_lowercase": False, + "sacrebleu_char_level": False, + } + scorer = SacrebleuScorer(Namespace(**scorer_args)) + progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator)) + for batch_idx, sample in progress: + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + hypo = task.inference_step(generator, models, sample) + for i, sample_id in enumerate(sample["id"].tolist()): + tgt_tokens = ( + utils.strip_pad(sample["target"][i, :], task.tgt_dict.pad()) + .int() + .cpu() + ) + + tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece") + hypo_str = task.tgt_dict.string( + hypo[i][0]["tokens"].int().cpu(), "sentencepiece" + ) + if batch_idx == 0 and i < 3: + print(f"T-{sample_id} {tgt_str}") + print(f"D-{sample_id} {hypo_str}") + scorer.add_string(tgt_str, hypo_str) + reference_bleu = 27.3 + result = scorer.result_string() + print(result + f" (reference: {reference_bleu})") + res_bleu = float(result.split()[2]) + self.assertAlmostEqual(res_bleu, reference_bleu, delta=0.3) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_fastspeech2.py b/tests/speech/test_fastspeech2.py new file mode 100644 index 0000000000..7150a3bda2 --- /dev/null +++ b/tests/speech/test_fastspeech2.py @@ -0,0 +1,53 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch +from tqdm import tqdm + +from fairseq import utils +from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion +from tests.speech import TestFairseqSpeech + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestFastSpeech2(TestFairseqSpeech): + def setUp(self): + self.set_up_ljspeech() + + @torch.no_grad() + def test_ljspeech_fastspeech2_checkpoint(self): + models, cfg, task, generator = self.download_and_load_checkpoint( + "ljspeech_fastspeech2_g2p.pt", + arg_overrides={ + "config_yaml": "cfg_ljspeech_g2p.yaml", + "vocoder": "griffin_lim", + "fp16": False, + }, + ) + + batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 4_096) + progress = tqdm(batch_iterator, total=len(batch_iterator)) + mcd, n_samples = 0.0, 0 + for sample in progress: + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + hypos = generator.generate(models[0], sample, has_targ=True) + rets = batch_mel_cepstral_distortion( + [hypo["targ_waveform"] for hypo in hypos], + [hypo["waveform"] for hypo in hypos], + sr=task.sr, + ) + mcd += sum(d.item() for d, _ in rets) + n_samples += len(sample["id"].tolist()) + + mcd = round(mcd / n_samples, 1) + reference_mcd = 3.2 + print(f"MCD: {mcd} (reference: {reference_mcd})") + self.assertAlmostEqual(mcd, reference_mcd, delta=0.1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_s2s_transformer.py b/tests/speech/test_s2s_transformer.py new file mode 100644 index 0000000000..180f46307c --- /dev/null +++ b/tests/speech/test_s2s_transformer.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from tests.speech import TestFairseqSpeech +from fairseq import utils + +S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq/" + + +class TestS2STransformer(TestFairseqSpeech): + def setUp(self): + self._set_up( + "s2s", + "speech_tests/s2s", + [ + "dev_shuf200.tsv", + "src_feat.zip", + "config_specaug_lb.yaml", + "vocoder", + "vocoder_config.json", + ], + ) + + def test_s2s_transformer_checkpoint(self): + self.base_test( + ckpt_name="s2u_transformer_reduced_fisher.pt", + reference_score=38.3, + dataset="dev_shuf200", + arg_overrides={ + "config_yaml": "config_specaug_lb.yaml", + "multitask_config_yaml": None, + "target_is_code": True, + "target_code_size": 100, + "eval_inference": False, + }, + score_type="bleu", + strict=False, + ) + + def postprocess_tokens(self, task, target, hypo_tokens): + tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu() + tgt_str = task.tgt_dict.string(tgt_tokens) + hypo_str = task.tgt_dict.string(hypo_tokens) + return tgt_str, hypo_str + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_s2t_conformer.py b/tests/speech/test_s2t_conformer.py new file mode 100644 index 0000000000..5aaa4a0ed6 --- /dev/null +++ b/tests/speech/test_s2t_conformer.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from tests.speech import TestFairseqSpeech + + +class TestS2TConformer(TestFairseqSpeech): + def setUp(self): + self.set_up_librispeech() + + def test_librispeech_s2t_conformer_s_checkpoint(self): + self.base_test( + ckpt_name="librispeech_conformer_rel_pos_s.pt", + reference_score=12, + arg_overrides={"config_yaml": "cfg_librispeech.yaml"}, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_s2t_transformer.py b/tests/speech/test_s2t_transformer.py new file mode 100644 index 0000000000..172f5484a0 --- /dev/null +++ b/tests/speech/test_s2t_transformer.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from tests.speech import TestFairseqSpeech + + +class TestS2TTransformer(TestFairseqSpeech): + def setUp(self): + self.set_up_librispeech() + + def test_librispeech_s2t_transformer_s_checkpoint(self): + self.base_test( + ckpt_name="librispeech_transformer_s.pt", + reference_score=9, + arg_overrides={"config_yaml": "cfg_librispeech.yaml"}, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_tts_transformer.py b/tests/speech/test_tts_transformer.py new file mode 100644 index 0000000000..b6330c6077 --- /dev/null +++ b/tests/speech/test_tts_transformer.py @@ -0,0 +1,53 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from tqdm import tqdm + +from fairseq import utils +from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion +from tests.speech import TestFairseqSpeech + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestTTSTransformer(TestFairseqSpeech): + def setUp(self): + self.set_up_ljspeech() + + @torch.no_grad() + def test_ljspeech_tts_transformer_checkpoint(self): + models, cfg, task, generator = self.download_and_load_checkpoint( + "ljspeech_transformer_g2p.pt", + arg_overrides={ + "config_yaml": "cfg_ljspeech_g2p.yaml", + "vocoder": "griffin_lim", + "fp16": False, + }, + ) + + batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 1024) + progress = tqdm(batch_iterator, total=len(batch_iterator)) + mcd, n_samples = 0.0, 0 + for sample in progress: + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + hypos = generator.generate(models[0], sample, has_targ=True) + rets = batch_mel_cepstral_distortion( + [hypo["targ_waveform"] for hypo in hypos], + [hypo["waveform"] for hypo in hypos], + sr=task.sr, + ) + mcd += sum(d.item() for d, _ in rets) + n_samples += len(sample["id"].tolist()) + + mcd = round(mcd / n_samples, 1) + reference_mcd = 3.3 + print(f"MCD: {mcd} (reference: {reference_mcd})") + self.assertAlmostEqual(mcd, reference_mcd, delta=0.1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_wav2vec2.py b/tests/speech/test_wav2vec2.py new file mode 100644 index 0000000000..eff6114c8e --- /dev/null +++ b/tests/speech/test_wav2vec2.py @@ -0,0 +1,90 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +import torch +from tests.speech import TestFairseqSpeech +from fairseq.data.data_utils import post_process +from fairseq import utils +from omegaconf import open_dict + +S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq" + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestWav2Vec2(TestFairseqSpeech): + def setUp(self): + self._set_up( + "librispeech_w2v2", + "conformer/wav2vec2/librispeech", + [ + "test_librispeech-other.ltr", + "test_librispeech-other.tsv", + "test_librispeech-other_small.ltr_100", + "test_librispeech-other_small.tsv", + "test-other.zip", + "dict.ltr.txt", + "dict.ltr_100.txt", + ], + ) + self.unzip_files( + "test-other.zip", + ) + + def test_transformer_w2v2(self): + self.base_test( + ckpt_name="transformer_oss_small_100h.pt", + reference_score=38, + score_delta=1, + dataset="test_librispeech-other", + max_tokens=1000000, + max_positions=(700000, 1000), + arg_overrides={ + "task": "audio_finetuning", + "labels": "ltr", + "nbest": 1, + "tpu": False, + }, + strict=False, + ) + + def test_conformer_w2v2(self): + self.base_test( + ckpt_name="conformer_LS_PT_LS_FT_rope.pt", + reference_score=4.5, + score_delta=1, + dataset="test_librispeech-other_small", + max_tokens=1000000, + max_positions=(700000, 1000), + arg_overrides={ + "task": "audio_finetuning", + "labels": "ltr_100", + "nbest": 1, + "tpu": False, + }, + strict=True, + ) + + def build_generator(self, task, models, cfg): + try: + from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder + except Exception: + raise Exception("Cannot run this test without flashlight dependency") + with open_dict(cfg): + cfg.nbest = 1 + return W2lViterbiDecoder(cfg, task.target_dictionary) + + def postprocess_tokens(self, task, target, hypo_tokens): + tgt_tokens = utils.strip_pad(target, task.target_dictionary.pad()).int().cpu() + tgt_str = task.target_dictionary.string(tgt_tokens) + tgt_str = post_process(tgt_str, "letter") + + hypo_pieces = task.target_dictionary.string(hypo_tokens) + hypo_str = post_process(hypo_pieces, "letter") + return tgt_str, hypo_str + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/speech/test_xm_transformer.py b/tests/speech/test_xm_transformer.py new file mode 100644 index 0000000000..0a55094151 --- /dev/null +++ b/tests/speech/test_xm_transformer.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from tests.speech import TestFairseqSpeech + + +class TestXMTransformer(TestFairseqSpeech): + def setUp(self): + self.set_up_sotasty_es_en() + + # TODO: investigate increases BLEU score (30.42 -> 31.74) + def test_sotasty_es_en_600m_checkpoint(self): + self.base_test( + ckpt_name="xm_transformer_600m_es_en_md.pt", + reference_score=31.74, + score_delta=0.2, + max_tokens=3_000_000, + max_positions=(1_000_000, 1_024), + dataset="sotasty_es_en_test_ted", + arg_overrides={"config_yaml": "cfg_es_en.yaml"}, + score_type="bleu", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tasks/test_denoising.py b/tests/tasks/test_denoising.py new file mode 100644 index 0000000000..5c22168352 --- /dev/null +++ b/tests/tasks/test_denoising.py @@ -0,0 +1,96 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import unittest +from tempfile import TemporaryDirectory + +from fairseq import options +from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.tasks.denoising import DenoisingTask +from tests.utils import build_vocab, make_data + + +class TestDenoising(unittest.TestCase): + def test_denoising(self): + with TemporaryDirectory() as dirname: + + # prep input file + raw_file = os.path.join(dirname, "raw") + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + + # binarize + binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) + split = "train" + bin_file = os.path.join(dirname, split) + dataset_impl = "mmap" + FileBinarizer.multiprocess_dataset( + input_file=raw_file, + binarizer=binarizer, + dataset_impl=dataset_impl, + vocab_size=len(vocab), + output_prefix=bin_file, + ) + + # setup task + train_args = options.parse_args_and_arch( + options.get_training_parser(), + [ + "--task", + "denoising", + "--arch", + "bart_base", + "--seed", + "42", + "--mask-length", + "word", + "--permute-sentences", + "1", + "--rotate", + "0", + "--replace-length", + "-1", + "--mask", + "0.2", + dirname, + ], + ) + cfg = convert_namespace_to_omegaconf(train_args) + task = DenoisingTask(cfg.task, binarizer.dict) + + # load datasets + original_dataset = task._load_dataset_split(bin_file, 1, False) + task.load_dataset(split) + masked_dataset = task.dataset(split) + + iterator = task.get_batch_iterator( + dataset=masked_dataset, + max_tokens=65_536, + max_positions=4_096, + ).next_epoch_itr(shuffle=False) + mask_index = task.source_dictionary.index("<mask>") + for batch in iterator: + for sample in range(len(batch)): + net_input = batch["net_input"] + masked_src_tokens = net_input["src_tokens"][sample] + masked_src_length = net_input["src_lengths"][sample] + masked_tgt_tokens = batch["target"][sample] + + sample_id = batch["id"][sample] + original_tokens = original_dataset[sample_id] + original_tokens = original_tokens.masked_select( + masked_src_tokens[:masked_src_length] == mask_index + ) + masked_tokens = masked_tgt_tokens.masked_select( + masked_src_tokens == mask_index + ) + + assert masked_tokens.equal(original_tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tasks/test_masked_lm.py b/tests/tasks/test_masked_lm.py new file mode 100644 index 0000000000..215cd355b0 --- /dev/null +++ b/tests/tasks/test_masked_lm.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
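+
+# Runs MaskedLMTask on a small synthetic corpus with random-token and
+# leave-unmasked probabilities set to zero, so tokens at <mask> positions can be
+# compared exactly against the original dataset.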
+ +import os +import unittest +from tempfile import TemporaryDirectory + +from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer +from fairseq.tasks.masked_lm import MaskedLMConfig, MaskedLMTask +from tests.utils import build_vocab, make_data + + +class TestMaskedLM(unittest.TestCase): + def test_masks_tokens(self): + with TemporaryDirectory() as dirname: + + # prep input file + raw_file = os.path.join(dirname, "raw") + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + + # binarize + binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) + split = "train" + bin_file = os.path.join(dirname, split) + FileBinarizer.multiprocess_dataset( + input_file=raw_file, + binarizer=binarizer, + dataset_impl="mmap", + vocab_size=len(vocab), + output_prefix=bin_file, + ) + + # setup task + cfg = MaskedLMConfig( + data=dirname, + seed=42, + mask_prob=0.5, # increasing the odds of masking + random_token_prob=0, # avoiding random tokens for exact match + leave_unmasked_prob=0, # always masking for exact match + ) + task = MaskedLMTask(cfg, binarizer.dict) + + original_dataset = task._load_dataset_split(bin_file, 1, False) + + # load datasets + task.load_dataset(split) + masked_dataset = task.dataset(split) + + mask_index = task.source_dictionary.index("<mask>") + iterator = task.get_batch_iterator( + dataset=masked_dataset, + max_tokens=65_536, + max_positions=4_096, + ).next_epoch_itr(shuffle=False) + for batch in iterator: + for sample in range(len(batch)): + net_input = batch["net_input"] + masked_src_tokens = net_input["src_tokens"][sample] + masked_src_length = net_input["src_lengths"][sample] + masked_tgt_tokens = batch["target"][sample] + + sample_id = batch["id"][sample] + original_tokens = original_dataset[sample_id] + original_tokens = original_tokens.masked_select( + masked_src_tokens[:masked_src_length] == mask_index + ) + masked_tokens = masked_tgt_tokens.masked_select( + masked_tgt_tokens != task.source_dictionary.pad() + ) + + assert masked_tokens.equal(original_tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tasks/test_multilingual_denoising.py b/tests/tasks/test_multilingual_denoising.py new file mode 100644 index 0000000000..a0227f69b5 --- /dev/null +++ b/tests/tasks/test_multilingual_denoising.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
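+
+# Same check as test_denoising.py, but with the data placed in a per-language
+# subdirectory ("en") as expected by the multilingual_denoising task.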
+ +import os +import unittest +from tempfile import TemporaryDirectory + +from fairseq import options +from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.tasks.multilingual_denoising import MultilingualDenoisingTask +from tests.utils import build_vocab, make_data + + +class TestMultilingualDenoising(unittest.TestCase): + def test_multilingual_denoising(self): + with TemporaryDirectory() as dirname: + + # prep input file + lang_dir = os.path.join(dirname, "en") + os.mkdir(lang_dir) + raw_file = os.path.join(lang_dir, "raw") + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + + # binarize + binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) + split = "train" + bin_file = os.path.join(lang_dir, split) + dataset_impl = "mmap" + FileBinarizer.multiprocess_dataset( + input_file=raw_file, + binarizer=binarizer, + dataset_impl=dataset_impl, + vocab_size=len(vocab), + output_prefix=bin_file, + ) + + # setup task + train_args = options.parse_args_and_arch( + options.get_training_parser(), + [ + "--task", + "multilingual_denoising", + "--arch", + "bart_base", + "--seed", + "42", + "--mask-length", + "word", + "--permute-sentences", + "1", + "--rotate", + "0", + "--replace-length", + "-1", + "--mask", + "0.2", + dirname, + ], + ) + cfg = convert_namespace_to_omegaconf(train_args) + task = MultilingualDenoisingTask(cfg.task, binarizer.dict) + + # load datasets + original_dataset = task._load_dataset_split(bin_file, 1, False) + task.load_dataset(split) + masked_dataset = task.dataset(split) + + iterator = task.get_batch_iterator( + dataset=masked_dataset, + max_tokens=65_536, + max_positions=4_096, + ).next_epoch_itr(shuffle=False) + mask_index = task.source_dictionary.index("<mask>") + for batch in iterator: + for sample in range(len(batch)): + net_input = batch["net_input"] + masked_src_tokens = net_input["src_tokens"][sample] + masked_src_length = net_input["src_lengths"][sample] + masked_tgt_tokens = batch["target"][sample] + + sample_id = batch["id"][sample] + original_tokens = original_dataset[sample_id] + original_tokens = original_tokens.masked_select( + masked_src_tokens[:masked_src_length] == mask_index + ) + masked_tokens = masked_tgt_tokens.masked_select( + masked_src_tokens == mask_index + ) + + assert masked_tokens.equal(original_tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tasks/test_span_masked_lm.py b/tests/tasks/test_span_masked_lm.py new file mode 100644 index 0000000000..d289cf843e --- /dev/null +++ b/tests/tasks/test_span_masked_lm.py @@ -0,0 +1,106 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
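+
+# Walks the span-masked source and target in parallel: spans replaced by
+# <extra_id_*> sentinel tokens in the source must be recoverable from the target,
+# and unmasked tokens must match the original sequence.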
+ +import os +import unittest +from tempfile import TemporaryDirectory + +from fairseq import options +from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.tasks.span_masked_lm import SpanMaskedLMTask +from tests.utils import build_vocab, make_data + + +class TestSpanMaskedLM(unittest.TestCase): + def test_masks_token_spans(self): + with TemporaryDirectory() as dirname: + + # prep input file + raw_file = os.path.join(dirname, "raw") + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + + # binarize + binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) + split = "train" + bin_file = os.path.join(dirname, split) + dataset_impl = "mmap" + + FileBinarizer.multiprocess_dataset( + input_file=raw_file, + binarizer=binarizer, + dataset_impl=dataset_impl, + vocab_size=len(vocab), + output_prefix=bin_file, + ) + + # adding sentinel tokens + for i in range(100): + vocab.add_symbol(f"<extra_id_{i}>") + + # setup task + train_args = options.parse_args_and_arch( + options.get_training_parser(), + [ + "--task", + "span_masked_lm", + "--arch", + "bart_base", + "--seed", + "42", + dirname, + ], + ) + cfg = convert_namespace_to_omegaconf(train_args) + task = SpanMaskedLMTask(cfg.task, binarizer.dict) + + # load datasets + original_dataset = task._load_dataset_split(bin_file, 1, False) + task.load_dataset(split) + masked_dataset = task.dataset(split) + + iterator = task.get_batch_iterator( + dataset=masked_dataset, + max_tokens=65_536, + max_positions=4_096, + ).next_epoch_itr(shuffle=False) + num_tokens = len(vocab) + for batch in iterator: + for sample in range(len(batch)): + sample_id = batch["id"][sample] + original_tokens = original_dataset[sample_id] + masked_src_tokens = batch["net_input"]["src_tokens"][sample] + masked_src_length = batch["net_input"]["src_lengths"][sample] + masked_tgt_tokens = batch["target"][sample] + + original_offset = 0 + masked_tgt_offset = 0 + extra_id_token = len(vocab) - 1 + for masked_src_token in masked_src_tokens[:masked_src_length]: + if masked_src_token == extra_id_token: + assert ( + masked_src_token == masked_tgt_tokens[masked_tgt_offset] + ) + extra_id_token -= 1 + masked_tgt_offset += 1 + while ( + original_offset < len(original_tokens) + and masked_tgt_tokens[masked_tgt_offset] + != extra_id_token + ): + assert ( + original_tokens[original_offset] + == masked_tgt_tokens[masked_tgt_offset] + ) + original_offset += 1 + masked_tgt_offset += 1 + else: + assert original_tokens[original_offset] == masked_src_token + original_offset += 1 + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_activation_checkpointing.py b/tests/test_activation_checkpointing.py new file mode 100644 index 0000000000..647a957288 --- /dev/null +++ b/tests/test_activation_checkpointing.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
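+
+# Verifies that fairseq's checkpoint_wrapper (with and without CPU offloading) and
+# torch.utils.checkpoint give the same loss and gradient norm as running the small
+# FFN model without activation checkpointing, on both CPU and GPU.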
+ +import unittest + +import torch +import torch.nn as nn +from fairseq.modules.checkpoint_activations import checkpoint_wrapper +from torch.utils.checkpoint import checkpoint + + +class Model(nn.Module): + def __init__( + self, use_pytorch_checkpoint=False, use_fairseq_checkpoint=False, **kwargs + ): + super().__init__() + torch.manual_seed(0) + self.use_pytorch_checkpoint = use_pytorch_checkpoint + self.ffn = nn.Sequential( + nn.Linear(32, 128), + # add a Dropout layer to test RNG save/restore + nn.Dropout(p=0.5), + nn.Linear(128, 32), + ) + if use_fairseq_checkpoint: + self.ffn = checkpoint_wrapper(self.ffn, **kwargs) + self.out = nn.Linear(32, 1) + + def forward(self, x): + if self.use_pytorch_checkpoint: + x = checkpoint(self.ffn, x) + else: + x = self.ffn(x) + return self.out(x) + + +class TestActivationCheckpointing(unittest.TestCase): + def _test_checkpoint_wrapper(self, device, log_memory_usage=False): + def get_loss_and_gnorm(model): + torch.manual_seed(1) + input = torch.rand(2, 16, 32).requires_grad_(True).to(device) + model.zero_grad() + loss = model(input).sum() + loss.backward() + gnorm = torch.norm( + torch.stack([torch.norm(p.grad.detach()) for p in model.parameters()]) + ) + return {"loss": loss, "gnorm": gnorm} + + model = Model().to(device) + no_cpt = get_loss_and_gnorm(model) + + model = Model(use_pytorch_checkpoint=True).to(device) + pyt_cpt = get_loss_and_gnorm(model) + torch.testing.assert_allclose(no_cpt["loss"], pyt_cpt["loss"]) + torch.testing.assert_allclose(no_cpt["gnorm"], pyt_cpt["gnorm"]) + + model = Model(use_fairseq_checkpoint=True).to(device) + fairseq_cpt = get_loss_and_gnorm(model) + torch.testing.assert_allclose(no_cpt["loss"], fairseq_cpt["loss"]) + torch.testing.assert_allclose(no_cpt["gnorm"], fairseq_cpt["gnorm"]) + + model = Model(use_fairseq_checkpoint=True, offload_to_cpu=True).to(device) + fairseq_cpt_offload = get_loss_and_gnorm(model) + torch.testing.assert_allclose(no_cpt["loss"], fairseq_cpt_offload["loss"]) + torch.testing.assert_allclose(no_cpt["gnorm"], fairseq_cpt_offload["gnorm"]) + + def test_checkpoint_wrapper_cpu(self): + self._test_checkpoint_wrapper(device=torch.device("cpu")) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_checkpoint_wrapper_cuda(self): + self._test_checkpoint_wrapper(device=torch.device("cuda")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_amp_optimizer.py b/tests/test_amp_optimizer.py new file mode 100644 index 0000000000..4d6073a926 --- /dev/null +++ b/tests/test_amp_optimizer.py @@ -0,0 +1,75 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
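+
+# Runs a single mixed-precision (torch.cuda.amp) update of a 1-D linear model with
+# a fairseq-built Adam optimizer and checks the loss, gradient norm from
+# clip_grad_norm, updated parameters, and the GradScaler scale.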
+ +import argparse +import copy +import unittest + +import torch +from torch.cuda.amp import GradScaler, autocast + +from fairseq.optim import build_optimizer + + +@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") +class TestGradientScalingAMP(unittest.TestCase): + def setUp(self): + self.x = torch.tensor([2.0]).cuda().half() + weight = 3.0 + bias = 5.0 + self.error = 1.0 + self.target = torch.tensor([self.x * weight + bias + self.error]).cuda() + self.loss_fn = torch.nn.L1Loss() + + self.model = torch.nn.Linear(1, 1) + self.model.weight.data = torch.tensor([[weight]]) + self.model.bias.data = torch.tensor([bias]) + self.model.cuda() + self.params = list(self.model.parameters()) + + self.namespace_dls = argparse.Namespace( + optimizer="adam", + lr=[0.1], + adam_betas="(0.9, 0.999)", + adam_eps=1e-8, + weight_decay=0.0, + threshold_loss_scale=1, + min_loss_scale=1e-4, + ) + self.scaler = GradScaler( + init_scale=1, + growth_interval=1, + ) + + def run_iter(self, model, params, optimizer): + optimizer.zero_grad() + with autocast(): + y = model(self.x) + loss = self.loss_fn(y, self.target) + self.scaler.scale(loss).backward() + self.assertEqual(loss, torch.tensor(1.0, device="cuda:0", dtype=torch.float16)) + + self.scaler.unscale_(optimizer) + grad_norm = optimizer.clip_grad_norm(0) + self.assertAlmostEqual(grad_norm.item(), 2.2361, 4) + + self.scaler.step(optimizer) + self.scaler.update() + self.assertEqual( + model.weight, + torch.tensor([[3.1]], device="cuda:0", requires_grad=True), + ) + self.assertEqual( + model.bias, + torch.tensor([5.1], device="cuda:0", requires_grad=True), + ) + self.assertEqual(self.scaler.get_scale(), 2.0) + + def test_automatic_mixed_precision(self): + model = copy.deepcopy(self.model) + params = list(model.parameters()) + optimizer = build_optimizer(self.namespace_dls, params) + + self.run_iter(model, params, optimizer) diff --git a/tests/test_binaries.py b/tests/test_binaries.py index c6722402a1..41d9210e7c 100644 --- a/tests/test_binaries.py +++ b/tests/test_binaries.py @@ -4,26 +4,39 @@ # LICENSE file in the root directory of this source tree. 
import contextlib +import json import logging import os import random import sys import tempfile import unittest +from packaging import version from io import StringIO +from typing import Dict, List import torch + from fairseq import options -from fairseq_cli import eval_lm, train, validate +from fairseq_cli import eval_lm, train from tests.utils import ( create_dummy_data, + create_laser_data_and_config_json, generate_main, preprocess_lm_data, preprocess_summarization_data, preprocess_translation_data, + train_language_model, train_translation_model, ) +try: + import transformers # noqa + + has_hf_transformers = True +except ImportError: + has_hf_transformers = False + class TestTranslation(unittest.TestCase): def setUp(self): @@ -295,7 +308,9 @@ def test_multilingual_transformer(self): + dec_ltok_flag, ) - @unittest.skipIf(sys.platform.lower() == "darwin", "skip latent depth test on MacOS") + @unittest.skipIf( + sys.platform.lower() == "darwin", "skip latent depth test on MacOS" + ) def test_multilingual_translation_latent_depth(self): # test with latent depth in encoder, decoder, or both encoder_latent_layer = [[], ["--encoder-latent-layer"]] @@ -320,7 +335,7 @@ def test_multilingual_translation_latent_depth(self): task="multilingual_translation_latent_depth", extra_flags=[ "--user-dir", - "examples/latent_depth/src", + "examples/latent_depth/latent_depth_src", "--encoder-layers", "2", "--decoder-layers", @@ -340,7 +355,7 @@ def test_multilingual_translation_latent_depth(self): run_validation=True, extra_valid_flags=[ "--user-dir", - "examples/latent_depth/src", + "examples/latent_depth/latent_depth_src", ] + enc_ll_flag + dec_ll_flag, @@ -349,7 +364,7 @@ def test_multilingual_translation_latent_depth(self): data_dir, extra_flags=[ "--user-dir", - "examples/latent_depth/src", + "examples/latent_depth/latent_depth_src", "--task", "multilingual_translation_latent_depth", "--lang-pairs", @@ -425,6 +440,164 @@ def test_translation_multi_simple_epoch(self): + dec_ltok_flag, ) + def test_translation_multi_simple_epoch_no_vepoch(self): + # test with all combinations of encoder/decoder lang tokens + with contextlib.redirect_stdout(StringIO()): + enc_ltok_flag = ["--encoder-langtok", "src"] + dec_ltok_flag = ["--decoder-langtok"] + with tempfile.TemporaryDirectory( + "test_translation_multi_simple_epoch_dict" + ) as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir, extra_flags=[]) + train_translation_model( + data_dir, + arch="transformer", + task="translation_multi_simple_epoch", + extra_flags=[ + "--encoder-layers", + "2", + "--decoder-layers", + "2", + "--encoder-embed-dim", + "8", + "--decoder-embed-dim", + "8", + "--sampling-method", + "temperature", + "--sampling-temperature", + "1.5", + ] + + enc_ltok_flag + + dec_ltok_flag, + lang_flags=["--lang-pairs", "in-out"], + run_validation=True, + extra_valid_flags=enc_ltok_flag + dec_ltok_flag, + ) + generate_main( + data_dir, + extra_flags=[ + "--task", + "translation_multi_simple_epoch", + "--lang-pairs", + "in-out", + "--source-lang", + "in", + "--target-lang", + "out", + ] + + enc_ltok_flag + + dec_ltok_flag, + ) + + def test_translation_multi_simple_epoch_dicts(self): + # test with all combinations of encoder/decoder lang tokens + with contextlib.redirect_stdout(StringIO()): + enc_ltok_flag = ["--encoder-langtok", "src"] + dec_ltok_flag = ["--decoder-langtok"] + with tempfile.TemporaryDirectory( + "test_translation_multi_simple_epoch_dict" + ) as data_dir: + create_dummy_data(data_dir) + 
preprocess_translation_data(data_dir, extra_flags=[]) + train_translation_model( + data_dir, + arch="transformer", + task="translation_multi_simple_epoch", + extra_flags=[ + "--encoder-layers", + "2", + "--decoder-layers", + "2", + "--encoder-embed-dim", + "8", + "--decoder-embed-dim", + "8", + "--sampling-method", + "temperature", + "--sampling-temperature", + "1.5", + "--virtual-epoch-size", + "1000", + ] + + enc_ltok_flag + + dec_ltok_flag, + lang_flags=["--lang-pairs", "in-out"], + run_validation=True, + extra_valid_flags=enc_ltok_flag + dec_ltok_flag, + ) + generate_main( + data_dir, + extra_flags=[ + "--task", + "translation_multi_simple_epoch", + "--lang-pairs", + "in-out", + "--source-lang", + "in", + "--target-lang", + "out", + ] + + enc_ltok_flag + + dec_ltok_flag, + ) + + def test_translation_multi_simple_epoch_src_tgt_dict_spec(self): + # test the specification of explicit --src-dict and --tgt-dict + with contextlib.redirect_stdout(StringIO()): + enc_ltok_flag = ["--encoder-langtok", "src"] + dec_ltok_flag = ["--decoder-langtok"] + with tempfile.TemporaryDirectory( + "test_translation_multi_simple_epoch_dict" + ) as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir, extra_flags=[]) + train_translation_model( + data_dir, + arch="transformer", + task="translation_multi_simple_epoch", + extra_flags=[ + "--source-dict", + f"{data_dir}/dict.in.txt", + "--target-dict", + f"{data_dir}/dict.out.txt", + "--encoder-layers", + "2", + "--decoder-layers", + "2", + "--encoder-embed-dim", + "8", + "--decoder-embed-dim", + "8", + "--sampling-method", + "temperature", + "--sampling-temperature", + "1.5", + "--virtual-epoch-size", + "1000", + ] + + enc_ltok_flag + + dec_ltok_flag, + lang_flags=["--lang-pairs", "in-out"], + run_validation=True, + extra_valid_flags=enc_ltok_flag + dec_ltok_flag, + ) + generate_main( + data_dir, + extra_flags=[ + "--task", + "translation_multi_simple_epoch", + "--lang-pairs", + "in-out", + "--source-lang", + "in", + "--target-lang", + "out", + ] + + enc_ltok_flag + + dec_ltok_flag, + ) + def test_transformer_cross_self_attention(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( @@ -453,6 +626,10 @@ def test_transformer_cross_self_attention(self): ) generate_main(data_dir, extra_flags=[]) + @unittest.skipIf( + version.parse(torch.__version__) > version.parse("1.8"), + "skip for latest torch versions", + ) def test_transformer_pointer_generator(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( @@ -465,7 +642,7 @@ def test_transformer_pointer_generator(self): "transformer_pointer_generator", extra_flags=[ "--user-dir", - "examples/pointer_generator/src", + "examples/pointer_generator/pointer_generator_src", "--encoder-layers", "2", "--decoder-layers", @@ -482,11 +659,17 @@ def test_transformer_pointer_generator(self): "0", ], run_validation=True, - extra_valid_flags=["--user-dir", "examples/pointer_generator/src"], + extra_valid_flags=[ + "--user-dir", + "examples/pointer_generator/pointer_generator_src", + ], ) generate_main( data_dir, - extra_flags=["--user-dir", "examples/pointer_generator/src"], + extra_flags=[ + "--user-dir", + "examples/pointer_generator/pointer_generator_src", + ], ) def test_lightconv(self): @@ -700,7 +883,7 @@ def test_mixture_of_experts(self): "--task", "translation_moe", "--user-dir", - "examples/translation_moe/src", + "examples/translation_moe/translation_moe_src", "--method", "hMoElp", "--mean-pool-gating-network", @@ -722,7 +905,7 @@ def 
test_mixture_of_experts(self): "--task", "translation_moe", "--user-dir", - "examples/translation_moe/src", + "examples/translation_moe/translation_moe_src", "--method", "hMoElp", "--mean-pool-gating-network", @@ -760,6 +943,65 @@ def test_alignment(self): ) generate_main(data_dir) + def test_laser_lstm(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_laser_lstm") as data_dir: + laser_config_file = create_laser_data_and_config_json(data_dir) + train_translation_model( + laser_config_file.name, + "laser_lstm", + [ + "--user-dir", + "examples/laser/laser_src", + "--weighting-alpha", + "0.3", + "--encoder-bidirectional", + "--encoder-hidden-size", + "512", + "--encoder-layers", + "5", + "--decoder-layers", + "1", + "--encoder-embed-dim", + "320", + "--decoder-embed-dim", + "320", + "--decoder-lang-embed-dim", + "32", + "--save-dir", + data_dir, + "--disable-validation", + ], + task="laser", + lang_flags=[], + ) + + def test_laser_transformer(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_laser_transformer") as data_dir: + laser_config_file = create_laser_data_and_config_json(data_dir) + train_translation_model( + laser_config_file.name, + "laser_transformer", + [ + "--user-dir", + "examples/laser/laser_src", + "--weighting-alpha", + "0.3", + "--encoder-embed-dim", + "320", + "--decoder-embed-dim", + "320", + "--decoder-lang-embed-dim", + "32", + "--save-dir", + data_dir, + "--disable-validation", + ], + task="laser", + lang_flags=[], + ) + def test_alignment_full_context(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_alignment") as data_dir: @@ -788,6 +1030,38 @@ def test_alignment_full_context(self): ) generate_main(data_dir) + def test_transformer_layerdrop(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_transformer_layerdrop") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model( + data_dir, + "transformer_iwslt_de_en", + [ + "--encoder-layers", + "3", + "--decoder-layers", + "3", + "--encoder-embed-dim", + "8", + "--decoder-embed-dim", + "8", + "--encoder-layerdrop", + "0.01", + "--decoder-layerdrop", + "0.01", + ], + ) + generate_main(data_dir) + generate_main( + data_dir, + [ + "--model-overrides", + "{'encoder_layers_to_keep':'0,2','decoder_layers_to_keep':'1'}", + ], + ) + class TestStories(unittest.TestCase): def setUp(self): @@ -893,7 +1167,73 @@ def test_transformer_lm(self): train_language_model( data_dir, "transformer_lm", - ["--add-bos-token"], + ["--add-bos-token", "--nval", "1"], + run_validation=True, + ) + eval_lm_main(data_dir) + eval_lm_main(data_dir, extra_flags=["--context-window", "25"]) + generate_main( + data_dir, + [ + "--task", + "language_modeling", + "--sample-break-mode", + "eos", + "--tokens-per-sample", + "500", + ], + ) + + def test_normformer_lm(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir: + create_dummy_data(data_dir) + preprocess_lm_data(data_dir) + train_language_model( + data_dir, + "transformer_lm", + [ + "--add-bos-token", + "--nval", + "1", + "--scale-fc", + "--scale-heads", + "--scale-attn", + "--scale-fc", + ], + run_validation=True, + ) + eval_lm_main(data_dir) + eval_lm_main(data_dir, extra_flags=["--context-window", "25"]) + generate_main( + data_dir, + [ + "--task", + "language_modeling", + "--sample-break-mode", + "eos", + 
"--tokens-per-sample", + "500", + ], + ) + + def test_transformer_lm_with_adaptive_softmax(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory( + "test_transformer_lm_with_adaptive_softmax" + ) as data_dir: + create_dummy_data(data_dir) + preprocess_lm_data(data_dir) + train_language_model( + data_dir, + "transformer_lm", + [ + "--add-bos-token", + "--criterion", + "adaptive_loss", + "--adaptive-softmax-cutoff", + "5,10,15", + ], run_validation=True, ) eval_lm_main(data_dir) @@ -981,6 +1321,50 @@ def test_lstm_lm_residuals(self): ], ) + @unittest.skipIf(not has_hf_transformers, "skip test if transformers is missing") + def test_transformer_xl_bptt_lm(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_transformer_xl_bptt_lm") as data_dir: + create_dummy_data(data_dir) + preprocess_lm_data(data_dir) + task_flags = [ + "--user-dir", + "examples/truncated_bptt", + "--task", + "truncated_bptt_lm", + "--batch-size", + "2", + "--tokens-per-sample", + "50", + ] + train_language_model( + data_dir=data_dir, + arch="transformer_xl", + extra_flags=task_flags + + [ + "--n-layer", + "2", + ], + task="truncated_bptt_lm", + run_validation=True, + extra_valid_flags=task_flags, + ) + eval_lm_main(data_dir, extra_flags=task_flags) + # Train with activation offloading + train_language_model( + data_dir=data_dir, + arch="transformer_xl", + extra_flags=task_flags + + [ + "--n-layer", + "2", + "--offload-activations", + ], + task="truncated_bptt_lm", + run_validation=True, + extra_valid_flags=task_flags, + ) + class TestMaskedLanguageModel(unittest.TestCase): def setUp(self): @@ -1058,7 +1442,7 @@ def test_linformer_roberta_masked_lm(self): "linformer_roberta_base", extra_flags=[ "--user-dir", - "examples/linformer/src", + "examples/linformer/linformer_src", "--encoder-layers", "2", ], @@ -1075,7 +1459,7 @@ def test_linformer_roberta_sentence_prediction(self): data_dir, "linformer_roberta_base", num_classes=num_classes, - extra_flags=["--user-dir", "examples/linformer/src"], + extra_flags=["--user-dir", "examples/linformer/linformer_src"], ) def test_linformer_roberta_regression_single(self): @@ -1095,7 +1479,7 @@ def test_linformer_roberta_regression_single(self): extra_flags=[ "--regression-target", "--user-dir", - "examples/linformer/src", + "examples/linformer/linformer_src", ], ) @@ -1116,7 +1500,7 @@ def test_linformer_roberta_regression_multiple(self): extra_flags=[ "--regression-target", "--user-dir", - "examples/linformer/src", + "examples/linformer/linformer_src", ], ) @@ -1198,7 +1582,7 @@ def test_r4f_roberta(self): num_classes=num_classes, extra_flags=[ "--user-dir", - "examples/rxf/src", + "examples/rxf/rxf_src", "--criterion", "sentence_prediction_r3f", "--spectral-norm-classification-head", @@ -1226,7 +1610,7 @@ def train_legacy_masked_language_model(data_dir, arch, extra_args=()): "0.5", "--lr", "0.0001", - "--min-lr", + "--stop-min-lr", "1e-09", # dropout, attention args "--dropout", @@ -1309,6 +1693,90 @@ def test_optimizers(self): generate_main(data_dir) +def read_last_log_entry( + logs: List[logging.LogRecord], logger_name: str +) -> Dict[str, float]: + for x in reversed(logs): + if x.name == logger_name: + return json.loads(x.message) + raise ValueError(f"No entries from {logger_name} found in captured logs") + + +class TestActivationCheckpointing(unittest.TestCase): + base_flags = [ + "--encoder-layers", + "2", + "--decoder-layers", + "2", + "--encoder-embed-dim", + "8", + "--decoder-embed-dim", + "8", + 
"--restore-file", + "x.pt", + "--log-format", + "json", + "--log-interval", + "1", + "--max-update", + "2", + ] + + def _train(self, data_dir, extra_flags): + with self.assertLogs() as logs: + train_translation_model( + data_dir, + "transformer_iwslt_de_en", + self.base_flags + extra_flags, + run_validation=True, + extra_valid_flags=["--log-format", "json"], + ) + return logs.records + + def test_activation_offloading_does_not_change_metrics(self): + """Neither ----checkpoint-activations nor --offload-activations should change loss""" + with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir: + + with self.assertLogs(): + create_dummy_data(data_dir, num_examples=20) + preprocess_translation_data(data_dir) + offload_logs = self._train(data_dir, ["--offload-activations"]) + baseline_logs = self._train(data_dir, []) + + assert len(baseline_logs) == len(offload_logs) + + baseline_valid_stats = read_last_log_entry(baseline_logs, "valid") + offload_valid_stats = read_last_log_entry(offload_logs, "valid") + baseline_train_stats = read_last_log_entry(baseline_logs, "train") + offload_train_stats = read_last_log_entry(offload_logs, "train") + + assert ( + baseline_train_stats["train_loss"] == offload_train_stats["train_loss"] + ) + assert ( + baseline_valid_stats["valid_loss"] == offload_valid_stats["valid_loss"] + ) + + def test_activation_checkpointing_does_not_change_metrics(self): + """--checkpoint-activations should not change loss""" + + with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir: + with self.assertLogs(): + create_dummy_data(data_dir, num_examples=20) + preprocess_translation_data(data_dir) + ckpt_logs = self._train(data_dir, ["--checkpoint-activations"]) + baseline_logs = self._train(data_dir, []) + assert len(baseline_logs) == len(ckpt_logs) + + baseline_train_stats = read_last_log_entry(baseline_logs, "train") + ckpt_train_stats = read_last_log_entry(ckpt_logs, "train") + assert baseline_train_stats["train_loss"] == ckpt_train_stats["train_loss"] + + baseline_valid_stats = read_last_log_entry(baseline_logs, "valid") + ckpt_valid_stats = read_last_log_entry(ckpt_logs, "valid") + assert baseline_valid_stats["valid_loss"] == ckpt_valid_stats["valid_loss"] + + def create_dummy_roberta_head_data( data_dir, num_examples=100, maxlen=10, num_classes=2, regression=False ): @@ -1364,6 +1832,8 @@ def train_masked_lm(data_dir, arch, extra_flags=None): "masked_lm", "--batch-size", "500", + "--required-batch-size-multiple", + "1", "--save-dir", data_dir, "--max-epoch", @@ -1424,68 +1894,7 @@ def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None): train.main(train_args) -def train_language_model(data_dir, arch, extra_flags=None, run_validation=False): - train_parser = options.get_training_parser() - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "language_modeling", - data_dir, - "--arch", - arch, - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "adaptive_loss", - "--adaptive-softmax-cutoff", - "5,10,15", - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - ] - + (extra_flags or []), - ) - train.main(train_args) - - if run_validation: - # test validation - validate_parser = options.get_validation_parser() - validate_args = options.parse_args_and_arch( - validate_parser, - [ - "--task", - 
"language_modeling", - data_dir, - "--path", - os.path.join(data_dir, "checkpoint_last.pt"), - "--valid-subset", - "valid", - "--max-tokens", - "500", - "--no-progress-bar", - "--num-workers", - "0", - ], - ) - validate.main(validate_args) - - -def eval_lm_main(data_dir): +def eval_lm_main(data_dir, extra_flags=None): eval_lm_parser = options.get_eval_lm_parser() eval_lm_args = options.parse_args_and_arch( eval_lm_parser, @@ -1496,75 +1905,10 @@ def eval_lm_main(data_dir): "--no-progress-bar", "--num-workers", "0", - ], - ) - eval_lm.main(eval_lm_args) - - -def train_masked_language_model(data_dir, arch, extra_args=()): - train_parser = options.get_training_parser() - # TODO: langs should be in and out right? - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "cross_lingual_lm", - data_dir, - "--arch", - arch, - # Optimizer args - "--optimizer", - "adam", - "--lr-scheduler", - "reduce_lr_on_plateau", - "--lr-shrink", - "0.5", - "--lr", - "0.0001", - "--min-lr", - "1e-09", - # dropout, attention args - "--dropout", - "0.1", - "--attention-dropout", - "0.1", - # MLM args - "--criterion", - "masked_lm_loss", - "--masked-lm-only", - "--monolingual-langs", - "in,out", - "--num-segment", - "5", - # Transformer args: use a small transformer model for fast training - "--encoder-layers", - "1", - "--encoder-embed-dim", - "32", - "--encoder-attention-heads", - "1", - "--encoder-ffn-embed-dim", - "32", - # Other training args - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--dataset-impl", - "raw", - "--num-workers", - "0", ] - + list(extra_args), + + (extra_flags or []), ) - train.main(train_args) + eval_lm.main(eval_lm_args) if __name__ == "__main__": diff --git a/tests/test_binarizer.py b/tests/test_binarizer.py new file mode 100644 index 0000000000..50075eabcc --- /dev/null +++ b/tests/test_binarizer.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import os +import typing as tp +import unittest +from tempfile import TemporaryDirectory + +from fairseq.binarizer import BinarizeSummary, FileBinarizer, VocabularyDatasetBinarizer +from fairseq.data import Dictionary, indexed_dataset +from tests.utils import make_data, sizes + + +def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary: + d = Dictionary() + for s in data: + for token in s: + d.add_symbol(token) + d.finalize() + return d + + +class TestBinarizer(unittest.TestCase): + def compare_ds_data(self, summary, data, prefix, impl, vocab): + self.assertEqual(summary.num_seq, len(data)) + self.assertEqual(summary.num_tok, sum([len(s) for s in data])) + + dataset = indexed_dataset.make_dataset(prefix, impl) + + self.assertEqual(len(dataset), len(data)) + decoded = [vocab.string(dataset[i]).split() for i in range(0, len(dataset))] + + self.assertEqual(decoded, data) + data_sizes = [i.item() for i in dataset.sizes] + self.assertEqual(data_sizes, sizes(data)) + + def test_can_binarize_line(self): + data = make_data(length=1) + vocab = build_vocab(data) + + binarizer = VocabularyDatasetBinarizer( + vocab, + ) + + sentence = data[0] + summary = BinarizeSummary() + + tensor = binarizer.binarize_line( + " ".join(sentence), + summary, + ) + + self.assertEqual(len(tensor), len(sentence) + 1) + + self.assertEqual(summary.num_tok, len(sentence) + 1) + self.assertEqual(summary.num_seq, 1) + + def test_can_binarize_file_chunk(self): + # test without multiprocess logic + with TemporaryDirectory() as dirname: + raw_file = os.path.join(dirname, "raw1") + prefix = os.path.join(dirname, "test1") + impl = "mmap" + + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + + binarizer = VocabularyDatasetBinarizer( + vocab, + append_eos=False, + ) + + summary = FileBinarizer._binarize_chunk_and_finalize( + binarizer, + raw_file, + offset_start=0, + offset_end=-1, + output_prefix=prefix, + dataset_impl=impl, + vocab_size=len(vocab), + ) + + self.compare_ds_data(summary, data, prefix, impl, vocab) + + def test_can_multiprocess(self): + with TemporaryDirectory() as dirname: + raw_file = os.path.join(dirname, "raw1") + prefix = os.path.join(dirname, "test1") + impl = "mmap" + data = make_data(out_file=raw_file) + vocab = build_vocab(data) + binarizer = VocabularyDatasetBinarizer( + vocab, + append_eos=False, + ) + # with one worker + summary = FileBinarizer.multiprocess_dataset( + raw_file, + impl, + binarizer, + output_prefix=prefix, + vocab_size=len(vocab), + num_workers=1, + ) + + self.compare_ds_data(summary, data, prefix, impl, vocab) + + # with multiple worker + prefix_multi = os.path.join(dirname, "test2") + summary = FileBinarizer.multiprocess_dataset( + raw_file, + impl, + binarizer, + output_prefix=prefix_multi, + vocab_size=len(vocab), + num_workers=3, + ) + + self.compare_ds_data(summary, data, prefix_multi, impl, vocab) diff --git a/tests/test_checkpoint_utils.py b/tests/test_checkpoint_utils.py new file mode 100644 index 0000000000..f8cd943cfa --- /dev/null +++ b/tests/test_checkpoint_utils.py @@ -0,0 +1,125 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
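+
+# Covers checkpoint_utils: loading a model ensemble with its task, pruning layers
+# from a layerdrop-trained transformer via arg_overrides, asynchronous
+# torch_persistent_save, and loading EMA weights from a checkpoint.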
+ +import contextlib +import logging +import os +import tempfile +import unittest +from io import StringIO +from unittest.mock import patch + +from fairseq import checkpoint_utils +from tests.utils import ( + create_dummy_data, + preprocess_translation_data, + train_translation_model, +) +import torch + + +class TestCheckpointUtils(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + @contextlib.contextmanager + def _train_transformer(self, seed, extra_args=None): + if extra_args is None: + extra_args = [] + with tempfile.TemporaryDirectory(f"_train_transformer_seed{seed}") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model( + data_dir, + "transformer_iwslt_de_en", + [ + "--encoder-layers", + "3", + "--decoder-layers", + "3", + "--encoder-embed-dim", + "8", + "--decoder-embed-dim", + "8", + "--seed", + str(seed), + ] + + extra_args, + ) + yield os.path.join(data_dir, "checkpoint_last.pt") + + def test_load_model_ensemble_and_task(self): + # with contextlib.redirect_stdout(StringIO()): + with self._train_transformer(seed=123) as model1: + with self._train_transformer(seed=456) as model2: + ensemble, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + filenames=[model1, model2] + ) + self.assertEqual(len(ensemble), 2) + + # after Transformer has been migrated to Hydra, this will probably + # become cfg.common.seed + self.assertEqual(ensemble[0].args.seed, 123) + self.assertEqual(ensemble[1].args.seed, 456) + + # the task from the first model should be returned + self.assertTrue("seed123" in task.cfg.data) + + # last cfg is saved + self.assertEqual(cfg.common.seed, 456) + + def test_prune_state_dict(self): + with contextlib.redirect_stdout(StringIO()): + extra_args = ["--encoder-layerdrop", "0.01", "--decoder-layerdrop", "0.01"] + with self._train_transformer(seed=1, extra_args=extra_args) as model: + ensemble, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + filenames=[model], + arg_overrides={ + "encoder_layers_to_keep": "0,2", + "decoder_layers_to_keep": "1", + }, + ) + self.assertEqual(len(ensemble), 1) + self.assertEqual(len(ensemble[0].encoder.layers), 2) + self.assertEqual(len(ensemble[0].decoder.layers), 1) + + def test_torch_persistent_save_async(self): + state_dict = {} + filename = "async_checkpoint.pt" + + with patch(f"{checkpoint_utils.__name__}.PathManager.opena") as mock_opena: + with patch( + f"{checkpoint_utils.__name__}._torch_persistent_save" + ) as mock_save: + checkpoint_utils.torch_persistent_save( + state_dict, filename, async_write=True + ) + mock_opena.assert_called_with(filename, "wb") + mock_save.assert_called() + + def test_load_ema_from_checkpoint(self): + dummy_state = {"a": torch.tensor([1]), "b": torch.tensor([0.1])} + with patch(f"{checkpoint_utils.__name__}.PathManager.open") as mock_open, patch( + f"{checkpoint_utils.__name__}.torch.load" + ) as mock_load: + + mock_load.return_value = {"extra_state": {"ema": dummy_state}} + filename = "ema_checkpoint.pt" + state = checkpoint_utils.load_ema_from_checkpoint(filename) + + mock_open.assert_called_with(filename, "rb") + mock_load.assert_called() + + self.assertIn("a", state["model"]) + self.assertIn("b", state["model"]) + self.assertTrue(torch.allclose(dummy_state["a"], state["model"]["a"])) + self.assertTrue(torch.allclose(dummy_state["b"], state["model"]["b"])) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/tests/test_checkpoint_utils_for_task_level_attributes.py b/tests/test_checkpoint_utils_for_task_level_attributes.py new file mode 100644 index 0000000000..53ab401f03 --- /dev/null +++ b/tests/test_checkpoint_utils_for_task_level_attributes.py @@ -0,0 +1,172 @@ +#!/usr/bin/env fbpython +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +import contextlib +import logging +import unittest +from io import StringIO +from unittest.mock import MagicMock, patch + +import torch +from fairseq import checkpoint_utils, data +from omegaconf import OmegaConf + + +def mock_trainer(epoch, num_updates, iterations_in_epoch): + trainer = MagicMock() + trainer.load_checkpoint.return_value = { + "train_iterator": { + "epoch": epoch, + "iterations_in_epoch": iterations_in_epoch, + "shuffle": False, + }, + "FakeTask": checkpoint_dict()["FakeTask"], + } + trainer.get_num_updates.return_value = num_updates + trainer.task.__class__.__name__ = "FakeTask" + trainer.task.get_checkpoint_dict.return_value = checkpoint_dict() + trainer.task.set_checkpoint_dict = MagicMock() + + return trainer + + +def checkpoint_dict(): + return { + "FakeTask": { + "observer_stats": { + ( + 4, + 16, + "MovingAveragePerChannelMinMax", + "MovingAveragePerChannelMinMax", + ): {"mod1": 1, "mod2": 2, "mod3": 3} + } + } + } + + +def mock_dict(): + d = MagicMock() + d.pad.return_value = 1 + d.eos.return_value = 2 + d.unk.return_value = 3 + return d + + +def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch): + tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1) + tokens_ds = data.TokenBlockDataset( + tokens, + sizes=[tokens.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + trainer = mock_trainer(epoch, num_updates, iterations_in_epoch) + dataset = data.LanguagePairDataset( + tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False + ) + epoch_itr = data.EpochBatchIterator( + dataset=dataset, + collate_fn=dataset.collater, + batch_sampler=[[i] for i in range(epoch_size)], + ) + return trainer, epoch_itr + + +def get_mock_cfg(finetune_from_model): + cfg_mock = OmegaConf.create( + { + "checkpoint": { + "save_dir": None, + "optimizer_overrides": "{}", + "reset_dataloader": False, + "reset_meters": False, + "reset_optimizer": False, + "reset_lr_scheduler": False, + "finetune_from_model": finetune_from_model, + "model_parallel_size": 1, + "restore_file": "checkpoint_last.pt", + "no_save": False, + "save_interval_updates": 0, + "no_last_checkpoints": False, + "keep_interval_updates": 0, + "keep_last_epochs": 0, + "keep_best_checkpoints": 0, + }, + "common": { + "model_parallel_size": 1, + }, + } + ) + return cfg_mock + + +class TestCheckpointsForTaskLevelAttributes(unittest.TestCase): + def setUp(self) -> None: + self.cfg_mock = get_mock_cfg(None) + self.patches = { + "os.makedirs": MagicMock(), + "os.path.join": MagicMock(), + "os.path.isfile": MagicMock(return_value=True), + "os.path.isabs": MagicMock(return_value=False), + "fairseq.file_io.PathManager.exists": MagicMock(return_value=False), + } + self.applied_patches = [patch(p, d) for p, d in self.patches.items()] + [p.start() for p in self.applied_patches] + logging.disable(logging.CRITICAL) + + self.trainer, self.epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50) + self.trainer.get_train_iterator = MagicMock(return_value=self.epoch_itr) + self.epoch_itr.next_epoch_itr(shuffle=False) + + checkpoint_utils.save_checkpoint( + self.cfg_mock.checkpoint, self.trainer, self.epoch_itr, None + ) + + def 
tearDown(self): + patch.stopall() + logging.disable(logging.NOTSET) + + def test_verify_checkpoint(self) -> None: + cp_dict = self.trainer.task.get_checkpoint_dict() + self.assertTrue(len(cp_dict) == 1) + self.assertTrue("FakeTask" in cp_dict) + self.assertTrue("observer_stats" in cp_dict["FakeTask"]) + self.assertTrue(len(cp_dict["FakeTask"]["observer_stats"]) == 1) + self.assertTrue( + ( + 4, + 16, + "MovingAveragePerChannelMinMax", + "MovingAveragePerChannelMinMax", + ) + in cp_dict["FakeTask"]["observer_stats"] + ) + self.assertTrue( + cp_dict["FakeTask"]["observer_stats"][ + ( + 4, + 16, + "MovingAveragePerChannelMinMax", + "MovingAveragePerChannelMinMax", + ) + ] + == {"mod1": 1, "mod2": 2, "mod3": 3} + ) + + def test_load_checkpoint(self) -> None: + with contextlib.redirect_stdout(StringIO()): + # Now, load checkpoint to ensure the respective logic works as expected + _, epoch_itr = checkpoint_utils.load_checkpoint( + self.cfg_mock.checkpoint, self.trainer + ) + + self.trainer.task.set_checkpoint_dict.assert_called_once_with( + checkpoint_dict()["FakeTask"] + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_constraints.py b/tests/test_constraints.py index 1c37f7e1fb..d766d5130f 100755 --- a/tests/test_constraints.py +++ b/tests/test_constraints.py @@ -3,11 +3,17 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import sys import unittest +from typing import List import torch -from fairseq.token_generation_constraints import * + +from fairseq.token_generation_constraints import ( + ConstraintNode, + OrderedConstraintState, + UnorderedConstraintState, + pack_constraints, +) def tensorize(constraints: List[List[int]]) -> torch.Tensor: @@ -53,7 +59,7 @@ def setUp(self): self.examples = [ ( tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))", + "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))", # noqa {1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1}, ), ([], "[None].False#0", {}), diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py new file mode 100644 index 0000000000..c48d02c5c6 --- /dev/null +++ b/tests/test_data_utils.py @@ -0,0 +1,136 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
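+
+# Compares the fast batch_by_size_vec / batch_by_size_fn implementations against a
+# simple Python baseline over a dense sweep of max_tokens, max_sentences and
+# bsz_mult values.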
+ +import unittest + +import numpy as np + +from fairseq.data.data_utils_fast import batch_by_size_fn, batch_by_size_vec + + +class TestBatchBySize(unittest.TestCase): + @classmethod + def batch_by_size_baseline( + cls, + indices, + num_tokens_vec, + max_tokens, + max_sentences, + bsz_mult, + ): + """Simple, reliable and slow implementation of batch by size""" + batches = [] + start = 0 + while start < len(indices): + for end in range(start + 1, len(indices) + 1): + max_val = max(num_tokens_vec[pos] for pos in range(start, end)) + sent_count = end - start + num_tokens = max_val * sent_count + overflow = num_tokens > max_tokens > 0 or sent_count > max_sentences > 0 + terminate = overflow or end == len(indices) + if overflow: + sent_count -= 1 + if terminate: + if sent_count > bsz_mult: + sent_count = sent_count - sent_count % bsz_mult + batches.append(indices[start : start + sent_count]) + start = start + sent_count + break + return batches + + @classmethod + def _get_error_message( + cls, max_sentences, max_tokens, bsz_mult, num_tokens_vec, validation, results + ): + return f"""Reference batch_by_size implementation should produce + same output as the baseline method. + Params: + max_sentences={max_sentences}, + max_tokens={max_tokens}, + bsz_mult={bsz_mult}, + num_tokens_vec={num_tokens_vec}, + expected_batches={validation}, + returned_batches={results}""" + + def _compare_results( + self, + indices_len, + batch_by_size_impl, + max_sentences, + max_tokens, + bsz_mult, + num_tokens_vec, + ): + indices = np.array(list(range(indices_len))) + validation = self.batch_by_size_baseline( + indices, + num_tokens_vec, + max_tokens=max_tokens, + max_sentences=max_sentences, + bsz_mult=bsz_mult, + ) + results = batch_by_size_impl( + indices, + num_tokens_vec, + max_tokens=max_tokens, + max_sentences=max_sentences, + bsz_mult=bsz_mult, + ) + error_msg = self._get_error_message( + max_sentences, max_tokens, bsz_mult, num_tokens_vec, validation, results + ) + self.assertEqual(len(validation), len(results), error_msg) + for first, second in zip(validation, results): + self.assertTrue(np.array_equal(first, second), error_msg) + + def _run_compare_with_baseline_sweep(self, batch_by_size_impl): + """Compare reference batch_by_size implementation with batch_by_size_baseline + across a dense grid of hyperparam values""" + MAX_MAX_TOKENS = 10 + NUM_TOKENS_VECS_COUNT = 5 + for indices_len in [10, 11]: # try odd and even len of indices + for max_sentences in range(0, indices_len + 2): + for max_tokens in range(0, MAX_MAX_TOKENS): + for bsz_mult in range(1, max(MAX_MAX_TOKENS, indices_len) + 2): + for _ in range(NUM_TOKENS_VECS_COUNT): + num_tokens_vec = np.random.randint( + 0, max_tokens + 1, size=indices_len + ) + self._compare_results( + indices_len, + batch_by_size_impl, + max_sentences, + max_tokens, + bsz_mult, + num_tokens_vec, + ) + + +class TestBatchBySizeVec(TestBatchBySize): + def test_compare_with_baseline(self): + self._run_compare_with_baseline_sweep(batch_by_size_vec) + + +class TestBatchBySizeFn(TestBatchBySize): + def test_compare_with_baseline(self): + def batch_by_size_fn_wrapper( + indices, + num_tokens_vec, + max_tokens, + max_sentences, + bsz_mult, + ): + def num_tokens_fn(idx): + return num_tokens_vec[idx] + + return batch_by_size_fn( + indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult + ) + + self._run_compare_with_baseline_sweep(batch_by_size_fn_wrapper) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dataclass_utils.py b/tests/test_dataclass_utils.py 
new file mode 100644 index 0000000000..231f86b6ee --- /dev/null +++ b/tests/test_dataclass_utils.py @@ -0,0 +1,87 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from argparse import ArgumentParser +from dataclasses import dataclass, field + +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.utils import gen_parser_from_dataclass + + +@dataclass +class A(FairseqDataclass): + data: str = field(default="test", metadata={"help": "the data input"}) + num_layers: int = field(default=200, metadata={"help": "more layers is better?"}) + + +@dataclass +class B(FairseqDataclass): + bar: A = field(default=A()) + foo: int = field(default=0, metadata={"help": "not a bar"}) + + +@dataclass +class D(FairseqDataclass): + arch: A = field(default=A()) + foo: int = field(default=0, metadata={"help": "not a bar"}) + + +@dataclass +class C(FairseqDataclass): + data: str = field(default="test", metadata={"help": "root level data input"}) + encoder: D = field(default=D()) + decoder: A = field(default=A()) + lr: int = field(default=0, metadata={"help": "learning rate"}) + + +class TestDataclassUtils(unittest.TestCase): + def test_argparse_convert_basic(self): + parser = ArgumentParser() + gen_parser_from_dataclass(parser, A(), True) + args = parser.parse_args(["--num-layers", "10", "the/data/path"]) + self.assertEqual(args.num_layers, 10) + self.assertEqual(args.data, "the/data/path") + + def test_argparse_recursive(self): + parser = ArgumentParser() + gen_parser_from_dataclass(parser, B(), True) + args = parser.parse_args(["--num-layers", "10", "--foo", "10", "the/data/path"]) + self.assertEqual(args.num_layers, 10) + self.assertEqual(args.foo, 10) + self.assertEqual(args.data, "the/data/path") + + def test_argparse_recursive_prefixing(self): + self.maxDiff = None + parser = ArgumentParser() + gen_parser_from_dataclass(parser, C(), True, "") + args = parser.parse_args( + [ + "--encoder-arch-data", + "ENCODER_ARCH_DATA", + "--encoder-arch-num-layers", + "10", + "--encoder-foo", + "10", + "--decoder-data", + "DECODER_DATA", + "--decoder-num-layers", + "10", + "--lr", + "10", + "the/data/path", + ] + ) + self.assertEqual(args.encoder_arch_data, "ENCODER_ARCH_DATA") + self.assertEqual(args.encoder_arch_num_layers, 10) + self.assertEqual(args.encoder_foo, 10) + self.assertEqual(args.decoder_data, "DECODER_DATA") + self.assertEqual(args.decoder_num_layers, 10) + self.assertEqual(args.lr, 10) + self.assertEqual(args.data, "the/data/path") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 0000000000..a3e3970028 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
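+
+# Tests RoundRobinZipDatasets: pairing of items from datasets of different lengths
+# and filtering by per-dataset or shared maximum sizes.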
+ +import logging +import unittest +from typing import Sequence + +from fairseq.data import LanguagePairDataset, ListDataset, RoundRobinZipDatasets +from tests.test_train import mock_dict + + +def lang_pair_dataset(lengths: Sequence[int]) -> LanguagePairDataset: + tokens = [[i] * l for i, l in enumerate(lengths)] + return LanguagePairDataset(ListDataset(tokens), lengths, mock_dict()) + + +def sample(id: int, length: int): + return {"id": id, "source": [id] * length, "target": None} + + +class TestDataset(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + def test_round_robin_zip_datasets(self): + long_dataset = lang_pair_dataset([10, 9, 8, 11]) + short_dataset = lang_pair_dataset([11, 9]) + + dataset = RoundRobinZipDatasets({"a": long_dataset, "b": short_dataset}) + # Dataset is now sorted by sentence length + dataset.ordered_indices() + assert dataset.longest_dataset is long_dataset + self.assertEqual(dict(dataset[0]), {"a": sample(2, 8), "b": sample(1, 9)}) + # The item 2 of dataset 'a' is with item (2 % 2 = 0) of dataset 'b' + self.assertEqual(dict(dataset[2]), {"a": sample(0, 10), "b": sample(1, 9)}) + + def test_round_robin_zip_datasets_filtered(self): + long_dataset = lang_pair_dataset([10, 20, 8, 11, 1000, 7, 12]) + short_dataset = lang_pair_dataset([11, 20, 9, 1000]) + + dataset = RoundRobinZipDatasets({"a": long_dataset, "b": short_dataset}) + # Dataset is now sorted by sentence length + idx = dataset.ordered_indices() + idx, _ = dataset.filter_indices_by_size(idx, {"a": 19, "b": 900}) + self.assertEqual(list(idx), [0, 1, 2, 3, 4]) + self.assertEqual(dict(dataset[0]), {"a": sample(5, 7), "b": sample(2, 9)}) + self.assertEqual(dict(dataset[2]), {"a": sample(0, 10), "b": sample(1, 20)}) + self.assertEqual(dict(dataset[4]), {"a": sample(6, 12), "b": sample(0, 11)}) + + def test_round_robin_zip_datasets_filtered_with_tuple(self): + long_dataset = lang_pair_dataset([10, 20, 8, 11, 1000, 7, 12]) + short_dataset = lang_pair_dataset([11, 20, 9, 1000]) + + dataset = RoundRobinZipDatasets({"a": long_dataset, "b": short_dataset}) + # Dataset is now sorted by sentence length + idx = dataset.ordered_indices() + idx, _ = dataset.filter_indices_by_size(idx, 19) + self.assertEqual(list(idx), [0, 1, 2, 3, 4]) + self.assertEqual(dict(dataset[0]), {"a": sample(5, 7), "b": sample(2, 9)}) + self.assertEqual(dict(dataset[2]), {"a": sample(0, 10), "b": sample(2, 9)}) + self.assertEqual(dict(dataset[4]), {"a": sample(6, 12), "b": sample(2, 9)}) diff --git a/tests/test_dictionary.py b/tests/test_dictionary.py index 81ce102f4f..dc9d71b3c7 100644 --- a/tests/test_dictionary.py +++ b/tests/test_dictionary.py @@ -4,10 +4,13 @@ # LICENSE file in the root directory of this source tree. 
import io +import os +import string import tempfile import unittest import torch +from fairseq import tokenizer from fairseq.data import Dictionary @@ -111,6 +114,32 @@ def test_space(self): self.assertEqual(d.index("a"), 5) self.assertEqual(d.index("b"), 6) + def test_add_file_to_dict(self): + counts = {} + num_lines = 100 + per_line = 10 + with tempfile.TemporaryDirectory("test_sampling") as data_dir: + filename = os.path.join(data_dir, "dummy.txt") + with open(filename, "w", encoding="utf-8") as data: + for c in string.ascii_letters: + line = f"{c} " * per_line + for _ in range(num_lines): + data.write(f"{line}\n") + counts[c] = per_line * num_lines + per_line += 5 + + dict = Dictionary() + Dictionary.add_file_to_dictionary( + filename, dict, tokenizer.tokenize_line, 10 + ) + dict.finalize(threshold=0, nwords=-1, padding_factor=8) + + for c in string.ascii_letters: + count = dict.get_count(dict.index(c)) + self.assertEqual( + counts[c], count, f"{c} count is {count} but should be {counts[c]}" + ) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_ema.py b/tests/test_ema.py new file mode 100644 index 0000000000..bd2cf2c78c --- /dev/null +++ b/tests/test_ema.py @@ -0,0 +1,275 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from copy import deepcopy +from dataclasses import dataclass +import pytest +from typing import Optional +from unittest.mock import patch + +import torch + +from fairseq.models.ema import EMA + + +class DummyModule(torch.nn.Module): + def __init__(self) -> None: + """LightningModule for testing purposes + + Args: + epoch_min_loss_override (int, optional): Pass in an epoch that will be set to the minimum + validation loss for testing purposes (zero based). If None this is ignored. Defaults to None. 
+ """ + super().__init__() + self.layer = torch.nn.Linear(in_features=32, out_features=2) + self.another_layer = torch.nn.Linear(in_features=2, out_features=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.layer(x) + return self.another_layer(x) + + +@dataclass +class EMAConfig(object): + ema_decay: float = 0.99 + ema_start_update: int = 0 + ema_fp32: bool = False + ema_seed_model: Optional[str] = None + ema_update_freq: int = 1 + + +class TestEMA(unittest.TestCase): + def assertTorchAllClose(self, x, y, atol=1e-8, rtol=1e-5, msg=None): + diff = x.float() - y.float() + diff_norm = torch.norm(diff) + other_norm = torch.norm(y.float()) + + if msg is None: + msg = "|input - other| > {} + {} * |other|".format(atol, rtol) + + self.assertLessEqual( + diff_norm, + atol + rtol * other_norm, + msg=msg, + ) + + def test_ema(self): + model = DummyModule() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig() + ema = EMA(model, config) + + # set decay + ema._set_decay(config.ema_decay) + self.assertEqual(ema.get_decay(), config.ema_decay) + + # get model + self.assertEqual(ema.get_model(), ema.model) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + # EMA step + x = torch.randn(32) + y = model(x) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model) + + ema_state_dict = ema.get_model().state_dict() + + for key, param in model.state_dict().items(): + prev_param = state[key] + ema_param = ema_state_dict[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + self.assertTorchAllClose( + ema_param, + config.ema_decay * prev_param + (1 - config.ema_decay) * param, + ) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + # Load EMA into model + model2 = DummyModule() + ema.reverse(model2) + + for key, param in model2.state_dict().items(): + ema_param = ema_state_dict[key] + self.assertTrue(torch.allclose(ema_param, param)) + + # Check that step_internal is called once + with patch.object(ema, "_step_internal", return_value=None) as mock_method: + ema.step(model) + mock_method.assert_called_once_with(model, None) + + def _test_ema_start_update(self, updates): + model = DummyModule() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig(ema_start_update=1) + ema = EMA(model, config) + + # EMA step + x = torch.randn(32) + y = model(x) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model, updates=updates) + ema_state_dict = ema.get_model().state_dict() + + self.assertEqual(ema.get_decay(), 0 if updates == 0 else config.ema_decay) + + for key, param in model.state_dict().items(): + ema_param = ema_state_dict[key] + prev_param = state[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + if updates == 0: + self.assertTorchAllClose( + ema_param, + param, + ) + else: + self.assertTorchAllClose( + ema_param, + config.ema_decay * prev_param + (1 - config.ema_decay) * param, + ) + + # Check that step_internal is called once + with patch.object(ema, "_step_internal", return_value=None) as mock_method: + ema.step(model, updates=updates) + mock_method.assert_called_once_with(model, updates) + + def test_ema_before_start_update(self): + self._test_ema_start_update(updates=0) + + def test_ema_after_start_update(self): + 
self._test_ema_start_update(updates=1) + + def test_ema_fp32(self): + dtype = torch.float + + model = DummyModule().to(dtype) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig(ema_fp32=True) + ema = EMA(model, config) + + x = torch.randn(32) + y = model(x.to(dtype)) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model) + + for key, param in model.state_dict().items(): + prev_param = state[key] + ema_param = ema.get_model().state_dict()[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + self.assertIn(key, ema.fp32_params) + + # EMA update is done in fp32, and hence the EMA param must be + # closer to the EMA update done in fp32 than in fp16. + self.assertLessEqual( + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ) + .to(dtype) + .float() + ), + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param + (1 - config.ema_decay) * param + ).float() + ), + ) + self.assertTorchAllClose( + ema_param, + ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ).to(dtype), + ) + + @pytest.mark.skipif( + not torch.cuda.is_available(), + reason="CPU no longer supports Linear in half precision", + ) + def test_ema_fp16(self): + model = DummyModule().cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + state = deepcopy(model.state_dict()) + config = EMAConfig(ema_fp32=False) + ema = EMA(model, config) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + x = torch.randn(32).cuda() + y = model(x.half()) + loss = y.sum() + loss.backward() + optimizer.step() + + ema.step(model) + + for key, param in model.state_dict().items(): + prev_param = state[key] + ema_param = ema.get_model().state_dict()[key] + + if "version" in key: + # Do not decay a model.version pytorch param + continue + + # EMA update is done in fp16, and hence the EMA param must be + # closer to the EMA update done in fp16 than in fp32. 
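+            # Concretely: the first norm below measures the distance to the
+            # update decay * prev + (1 - decay) * param computed directly in
+            # fp16, the second the distance to the same update computed in
+            # fp32 and then rounded back to fp16; the stored EMA parameter has
+            # to be at least as close to the former.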
+ self.assertLessEqual( + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param + (1 - config.ema_decay) * param + ).float() + ), + torch.norm( + ema_param.float() + - ( + config.ema_decay * prev_param.float() + + (1 - config.ema_decay) * param.float() + ) + .half() + .float() + ), + ) + self.assertTorchAllClose( + ema_param, + config.ema_decay * prev_param + (1 - config.ema_decay) * param, + ) + + # Since fp32 params is not used, it should be of size 0 + self.assertEqual(len(ema.fp32_params), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_espnet_multihead_attention.py b/tests/test_espnet_multihead_attention.py new file mode 100644 index 0000000000..ee71dd0e98 --- /dev/null +++ b/tests/test_espnet_multihead_attention.py @@ -0,0 +1,176 @@ +import torch +import numpy as np +import unittest +from fairseq.modules import ( + ESPNETMultiHeadedAttention, + RelPositionMultiHeadedAttention, + RotaryPositionMultiHeadedAttention, +) + +torch.use_deterministic_algorithms(True) + + +class TestESPNETMultiHeadedAttention(unittest.TestCase): + def setUp(self) -> None: + self.T = 3 + self.B = 1 + self.C = 2 + torch.manual_seed(0) + self.sample = torch.randn(self.T, self.B, self.C) # TBC + self.sample_scores = torch.randn(self.B, 1, self.T, self.T) + self.MHA = ESPNETMultiHeadedAttention(self.C, 1, dropout=0) + + def test_forward(self): + expected_scores = torch.tensor( + [[[0.1713, -0.3776]], [[0.2263, -0.4486]], [[0.2243, -0.4538]]] + ) + scores, _ = self.MHA(self.sample, self.sample, self.sample) + self.assertTrue( + np.allclose( + expected_scores.cpu().detach().numpy(), + scores.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + def test_forward_qkv(self): + expected_query = torch.tensor( + [[[[-1.0235, 0.0409], [0.4008, 1.3077], [0.5396, 2.0698]]]] + ) + expected_key = torch.tensor( + [[[[0.5053, -0.4965], [-0.3730, -0.9473], [-0.7019, -0.1935]]]] + ) + expected_val = torch.tensor( + [[[[-0.9940, 0.5403], [0.5924, -0.7619], [0.7504, -1.0892]]]] + ) + sample_t = self.sample.transpose(0, 1) + query, key, val = self.MHA.forward_qkv(sample_t, sample_t, sample_t) + self.assertTrue( + np.allclose( + expected_query.cpu().detach().numpy(), + query.cpu().detach().numpy(), + atol=1e-4, + ) + ) + self.assertTrue( + np.allclose( + expected_key.cpu().detach().numpy(), + key.cpu().detach().numpy(), + atol=1e-4, + ) + ) + self.assertTrue( + np.allclose( + expected_val.cpu().detach().numpy(), + val.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + def test_forward_attention(self): + expected_scores = torch.tensor( + [[[0.1627, -0.6249], [-0.2547, -0.6487], [-0.0711, -0.8545]]] + ) + scores = self.MHA.forward_attention( + self.sample.transpose(0, 1).view(self.B, 1, self.T, self.C), + self.sample_scores, + mask=None, + ) + self.assertTrue( + np.allclose( + expected_scores.cpu().detach().numpy(), + scores.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + +class TestRelPositionMultiHeadedAttention(unittest.TestCase): + def setUp(self) -> None: + self.T = 3 + self.B = 1 + self.C = 2 + torch.manual_seed(0) + self.sample = torch.randn(self.T, self.B, self.C) # TBC + self.sample_x = torch.randn(self.B, 1, self.T, self.T * 2 - 1) + self.sample_pos = torch.randn(self.B, self.T * 2 - 1, self.C) + self.MHA = RelPositionMultiHeadedAttention(self.C, 1, dropout=0) + + def test_rel_shift(self): + expected_x = torch.tensor( + [ + [ + [ + [-0.7193, -0.4033, -0.5966], + [-0.8567, 1.1006, -1.0712], + [-0.5663, 0.3731, -0.8920], + ] + ] + ] + ) + x = self.MHA.rel_shift(self.sample_x) + 
self.assertTrue( + np.allclose( + expected_x.cpu().detach().numpy(), + x.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + def test_forward(self): + expected_scores = torch.tensor( + [ + [[-0.9609, -0.5020]], + [[-0.9308, -0.4890]], + [[-0.9473, -0.4948]], + [[-0.9609, -0.5020]], + [[-0.9308, -0.4890]], + [[-0.9473, -0.4948]], + [[-0.9609, -0.5020]], + [[-0.9308, -0.4890]], + [[-0.9473, -0.4948]], + [[-0.9609, -0.5020]], + [[-0.9308, -0.4890]], + [[-0.9473, -0.4948]], + [[-0.9609, -0.5020]], + [[-0.9308, -0.4890]], + [[-0.9473, -0.4948]], + ] + ) + scores, _ = self.MHA(self.sample, self.sample, self.sample, self.sample_pos) + self.assertTrue( + np.allclose( + expected_scores.cpu().detach().numpy(), + scores.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + +class TestRotaryPositionMultiHeadedAttention(unittest.TestCase): + def setUp(self) -> None: + self.T = 3 + self.B = 1 + self.C = 2 + torch.manual_seed(0) + self.sample = torch.randn(self.T, self.B, self.C) # TBC + self.MHA = RotaryPositionMultiHeadedAttention( + self.C, 1, dropout=0, precision=None + ) + + def test_forward(self): + expected_scores = torch.tensor( + [[[-0.3220, -0.4726]], [[-1.2813, -0.0979]], [[-0.3138, -0.4758]]] + ) + scores, _ = self.MHA(self.sample, self.sample, self.sample) + self.assertTrue( + np.allclose( + expected_scores.cpu().detach().numpy(), + scores.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_export.py b/tests/test_export.py index 87e52bd7c1..3e9a48d187 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -9,12 +9,12 @@ import unittest import torch + from fairseq.data.dictionary import Dictionary from fairseq.models.transformer import TransformerModel from fairseq.modules import multihead_attention, sinusoidal_positional_embedding from fairseq.tasks.fairseq_task import LegacyFairseqTask - DEFAULT_TEST_VOCAB_SIZE = 100 @@ -103,6 +103,18 @@ def test_export_transformer(self): scripted = torch.jit.script(model) _test_save_and_load(scripted) + @unittest.skipIf( + torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" + ) + def test_export_transformer_no_token_pos_emb(self): + task, parser = get_dummy_task_and_parser() + TransformerModel.add_args(parser) + args = parser.parse_args([]) + args.no_token_positional_embeddings = True + model = TransformerModel.build_model(args, task) + scripted = torch.jit.script(model) + _test_save_and_load(scripted) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_file_chunker_utils.py b/tests/test_file_chunker_utils.py new file mode 100644 index 0000000000..5cded04572 --- /dev/null +++ b/tests/test_file_chunker_utils.py @@ -0,0 +1,63 @@ +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import shutil +import tempfile +import unittest +from typing import Optional + + +class TestFileChunker(unittest.TestCase): + _tmpdir: Optional[str] = None + _tmpfile: Optional[str] = None + _line_content = "Hello, World\n" + _num_bytes = None + _num_lines = 200 + _num_splits = 20 + + @classmethod + def setUpClass(cls) -> None: + cls._num_bytes = len(cls._line_content.encode("utf-8")) + cls._tmpdir = tempfile.mkdtemp() + with open(os.path.join(cls._tmpdir, "test.txt"), "w") as f: + cls._tmpfile = f.name + for _i in range(cls._num_lines): + f.write(cls._line_content) + f.flush() + + @classmethod + def tearDownClass(cls) -> None: + # Cleanup temp working dir. 
+ if cls._tmpdir is not None: + shutil.rmtree(cls._tmpdir) # type: ignore + + def test_find_offsets(self): + from fairseq.file_chunker_utils import find_offsets + + offsets = find_offsets(self._tmpfile, self._num_splits) + self.assertEqual(len(offsets), self._num_splits + 1) + (zero, *real_offsets, last) = offsets + self.assertEqual(zero, 0) + for i, o in enumerate(real_offsets): + self.assertEqual( + o, + self._num_bytes + + ((i + 1) * self._num_bytes * self._num_lines / self._num_splits), + ) + self.assertEqual(last, self._num_bytes * self._num_lines) + + def test_readchunks(self): + from fairseq.file_chunker_utils import Chunker, find_offsets + + offsets = find_offsets(self._tmpfile, self._num_splits) + for start, end in zip(offsets, offsets[1:]): + with Chunker(self._tmpfile, start, end) as lines: + all_lines = list(lines) + num_lines = self._num_lines / self._num_splits + self.assertAlmostEqual( + len(all_lines), num_lines, delta=1 + ) # because we split on the bites, we might end up with one more/less line in a chunk + self.assertListEqual( + all_lines, [self._line_content for _ in range(len(all_lines))] + ) diff --git a/tests/test_file_io.py b/tests/test_file_io.py index aef5b80d18..af7c4cedb8 100644 --- a/tests/test_file_io.py +++ b/tests/test_file_io.py @@ -38,10 +38,22 @@ def test_file_io(self): self.assertEqual(s, self._tmpfile_contents) def test_file_io_oss(self): - # Mock fvcore to simulate oss environment. - sys.modules["fvcore"] = MagicMock() + # Mock iopath to simulate oss environment. + sys.modules["iopath"] = MagicMock() from fairseq.file_io import PathManager with PathManager.open(os.path.join(self._tmpdir, "test.txt"), "r") as f: s = f.read() self.assertEqual(s, self._tmpfile_contents) + + def test_file_io_async(self): + # ioPath `PathManager` is initialized after the first `opena` call. + try: + from fairseq.file_io import PathManager + + _asyncfile = os.path.join(self._tmpdir, "async.txt") + f = PathManager.opena(_asyncfile, "wb") + f.close() + + finally: + self.assertTrue(PathManager.async_close()) diff --git a/tests/test_fp16_optimizer.py b/tests/test_fp16_optimizer.py index aa6a863d32..27085a12da 100644 --- a/tests/test_fp16_optimizer.py +++ b/tests/test_fp16_optimizer.py @@ -3,8 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import argparse import copy +import logging import unittest import torch @@ -30,6 +30,9 @@ def setUp(self): self.cfg_dls = OmegaConf.create( { + "optimization": { + "lr": [0.1], + }, "optimizer": { "_name": "adam", "lr": [0.1], @@ -43,9 +46,14 @@ def setUp(self): "fp16_scale_tolerance": 1, "threshold_loss_scale": 1, "min_loss_scale": 1e-4, + "tpu": False, }, } ) + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) def run_iter(self, model, params, optimizer): optimizer.zero_grad() diff --git a/tests/test_hf_hub.py b/tests/test_hf_hub.py new file mode 100644 index 0000000000..5cfef70d06 --- /dev/null +++ b/tests/test_hf_hub.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch + +try: + import huggingface_hub +except ImportError: + huggingface_hub = None + +from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub + + +@unittest.skipIf(not huggingface_hub, "Requires huggingface_hub install") +class TestHuggingFaceHub(unittest.TestCase): + @torch.no_grad() + def test_hf_fastspeech2(self): + hf_model_id = "facebook/fastspeech2-en-ljspeech" + models, cfg, task = load_model_ensemble_and_task_from_hf_hub(hf_model_id) + self.assertTrue(len(models) > 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_huffman.py b/tests/test_huffman.py new file mode 100644 index 0000000000..85d0c72a76 --- /dev/null +++ b/tests/test_huffman.py @@ -0,0 +1,179 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import typing as tp +import unittest +from collections import Counter +from tempfile import NamedTemporaryFile, TemporaryDirectory + +from fairseq.data import Dictionary, indexed_dataset +from fairseq.data.huffman import ( + HuffmanCodeBuilder, + HuffmanCoder, + HuffmanMMapIndexedDataset, + HuffmanMMapIndexedDatasetBuilder, +) +from tests.utils import POPULATION, make_data, sizes + + +def make_counts(data: tp.List[tp.List[str]]) -> Counter: + return Counter([symbol for sentence in data for symbol in sentence]) + + +def make_code_builder(data: tp.List[tp.List[str]]) -> HuffmanCodeBuilder: + builder = HuffmanCodeBuilder() + for sentence in data: + builder.add_symbols(*sentence) + return builder + + +class TestCodeBuilder(unittest.TestCase): + def test_code_builder_can_count(self): + data = make_data() + counts = make_counts(data) + builder = make_code_builder(data) + + self.assertEqual(builder.symbols, counts) + + def test_code_builder_can_add(self): + data = make_data() + counts = make_counts(data) + builder = make_code_builder(data) + + new_builder = builder + builder + + self.assertEqual(new_builder.symbols, counts + counts) + + def test_code_builder_can_io(self): + data = make_data() + builder = make_code_builder(data) + + with NamedTemporaryFile() as tmp_fp: + builder.to_file(tmp_fp.name) + other_builder = HuffmanCodeBuilder.from_file(tmp_fp.name) + + self.assertEqual(builder.symbols, other_builder.symbols) + + +class TestCoder(unittest.TestCase): + def test_coder_can_io(self): + data = make_data() + builder = make_code_builder(data) + coder = builder.build_code() + + with NamedTemporaryFile() as tmp_fp: + coder.to_file(tmp_fp.name) + other_coder = HuffmanCoder.from_file(tmp_fp.name) + + self.assertEqual(coder, other_coder) + + def test_coder_can_encode_decode(self): + data = make_data() + builder = make_code_builder(data) + coder = builder.build_code() + + encoded = [coder.encode(sentence) for sentence in data] + decoded = [[n.symbol for n in coder.decode(enc)] for enc in encoded] + + self.assertEqual(decoded, data) + + unseen_data = make_data() + unseen_encoded = [coder.encode(sentence) for sentence in unseen_data] + unseen_decoded = [ + [n.symbol for n in coder.decode(enc)] for enc in unseen_encoded + ] + self.assertEqual(unseen_decoded, unseen_data) + + +def build_dataset(prefix, data, coder): + with HuffmanMMapIndexedDatasetBuilder(prefix, coder) as builder: + for sentence in data: + builder.add_item(sentence) + + +class TestHuffmanDataset(unittest.TestCase): + def test_huffman_can_encode_decode(self): + data = make_data() + builder = 
make_code_builder(data) + coder = builder.build_code() + + with TemporaryDirectory() as dirname: + prefix = os.path.join(dirname, "test1") + build_dataset(prefix, data, coder) + dataset = HuffmanMMapIndexedDataset(prefix) + + self.assertEqual(len(dataset), len(data)) + decoded = [list(dataset.get_symbols(i)) for i in range(0, len(dataset))] + + self.assertEqual(decoded, data) + data_sizes = [i.item() for i in dataset.sizes] + self.assertEqual(data_sizes, sizes(data)) + + def test_huffman_compresses(self): + data = make_data() + builder = make_code_builder(data) + coder = builder.build_code() + + with TemporaryDirectory() as dirname: + prefix = os.path.join(dirname, "huffman") + build_dataset(prefix, data, coder) + + prefix_mmap = os.path.join(dirname, "mmap") + mmap_builder = indexed_dataset.make_builder( + indexed_dataset.data_file_path(prefix_mmap), + "mmap", + vocab_size=len(POPULATION), + ) + dictionary = Dictionary() + for c in POPULATION: + dictionary.add_symbol(c) + dictionary.finalize() + for sentence in data: + mmap_builder.add_item(dictionary.encode_line(" ".join(sentence))) + mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap)) + + huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size + mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size + self.assertLess(huff_size, mmap_size) + + def test_huffman_can_append(self): + data1 = make_data() + builder = make_code_builder(data1) + coder = builder.build_code() + + with TemporaryDirectory() as dirname: + prefix1 = os.path.join(dirname, "test1") + build_dataset(prefix1, data1, coder) + + data2 = make_data() + prefix2 = os.path.join(dirname, "test2") + build_dataset(prefix2, data2, coder) + + prefix3 = os.path.join(dirname, "test3") + + with HuffmanMMapIndexedDatasetBuilder(prefix3, coder) as builder: + builder.append(prefix1) + builder.append(prefix2) + + dataset = HuffmanMMapIndexedDataset(prefix3) + + self.assertEqual(len(dataset), len(data1) + len(data2)) + + decoded1 = [list(dataset.get_symbols(i)) for i in range(0, len(data1))] + self.assertEqual(decoded1, data1) + + decoded2 = [ + list(dataset.get_symbols(i)) for i in range(len(data1), len(dataset)) + ] + self.assertEqual(decoded2, data2) + + data_sizes = [i.item() for i in dataset.sizes] + self.assertEqual(data_sizes[: len(data1)], sizes(data1)) + self.assertEqual(data_sizes[len(data1) : len(dataset)], sizes(data2)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_iopath.py b/tests/test_iopath.py new file mode 100644 index 0000000000..48230a6379 --- /dev/null +++ b/tests/test_iopath.py @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
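+
+# These tests rerun the end-to-end reproducibility tests while simulating an
+# environment in which the optional `iopath` dependency is missing.  The
+# pattern they rely on is the standard-library one sketched here (illustrative
+# only -- the body is elided):
+#
+#     from unittest import mock
+#
+#     with mock.patch.dict("sys.modules", {"iopath": None}):
+#         # inside this block any `import iopath` raises ImportError, so the
+#         # code under test has to take its non-iopath fallback path
+#         ...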
+ +import unittest +from unittest import mock + + +class TestIOPath(unittest.TestCase): + def test_no_iopath(self): + from .test_reproducibility import TestReproducibility + + with mock.patch.dict("sys.modules", {"iopath": None}): + # reuse reproducibility tests, which are e2e tests that should cover + # most checkpoint related functionality + TestReproducibility._test_reproducibility(self, "test_reproducibility") + + def test_no_supports_rename(self): + from .test_reproducibility import TestReproducibility + + with mock.patch("fairseq.file_io.PathManager.supports_rename") as mock_fn: + mock_fn.return_value = False + TestReproducibility._test_reproducibility(self, "test_reproducibility") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_iterators.py b/tests/test_iterators.py index 3d2c4d6251..2e2eb2f0a8 100644 --- a/tests/test_iterators.py +++ b/tests/test_iterators.py @@ -5,11 +5,12 @@ import unittest -from fairseq.data import iterators +from fairseq.data import iterators, ListDataset class TestIterators(unittest.TestCase): - def test_counting_iterator(self, ref=None, itr=None): + def test_counting_iterator_index(self, ref=None, itr=None): + # Test the indexing functionality of CountingIterator if ref is None: assert itr is None ref = list(range(10)) @@ -17,6 +18,7 @@ def test_counting_iterator(self, ref=None, itr=None): else: assert len(ref) == 10 assert itr is not None + self.assertTrue(itr.has_next()) self.assertEqual(itr.n, 0) self.assertEqual(next(itr), ref[0]) @@ -26,9 +28,36 @@ def test_counting_iterator(self, ref=None, itr=None): itr.skip(3) self.assertEqual(itr.n, 5) self.assertEqual(next(itr), ref[5]) - itr.skip(3) - self.assertEqual(itr.n, 9) - self.assertEqual(next(itr), ref[9]) + itr.skip(2) + self.assertEqual(itr.n, 8) + self.assertEqual(list(itr), [ref[8], ref[9]]) + self.assertFalse(itr.has_next()) + + def test_counting_iterator_length_mismatch(self): + ref = list(range(10)) + # When the underlying iterable is longer than the CountingIterator, + # the remaining items in the iterable should be ignored + itr = iterators.CountingIterator(ref, total=8) + self.assertEqual(list(itr), ref[:8]) + # When the underlying iterable is shorter than the CountingIterator, + # raise an IndexError when the underlying iterable is exhausted + itr = iterators.CountingIterator(ref, total=12) + self.assertRaises(IndexError, list, itr) + + def test_counting_iterator_take(self): + # Test the "take" method of CountingIterator + ref = list(range(10)) + itr = iterators.CountingIterator(ref) + itr.take(5) + self.assertEqual(len(itr), len(list(iter(itr)))) + self.assertEqual(len(itr), 5) + + itr = iterators.CountingIterator(ref) + itr.take(5) + self.assertEqual(next(itr), ref[0]) + self.assertEqual(next(itr), ref[1]) + itr.skip(2) + self.assertEqual(next(itr), ref[4]) self.assertFalse(itr.has_next()) def test_grouped_iterator(self): @@ -41,11 +70,11 @@ def test_grouped_iterator(self): itr = iterators.GroupedIterator(x, 5) self.assertEqual(list(itr), [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) - # test CountingIterator functionality + # test the GroupIterator also works correctly as a CountingIterator x = list(range(30)) ref = list(iterators.GroupedIterator(x, 3)) itr = iterators.GroupedIterator(x, 3) - self.test_counting_iterator(ref, itr) + self.test_counting_iterator_index(ref, itr) def test_sharded_iterator(self): # test correctness @@ -67,22 +96,7 @@ def test_sharded_iterator(self): x = list(range(30)) ref = list(iterators.ShardedIterator(x, num_shards=3, shard_id=0)) itr = 
iterators.ShardedIterator(x, num_shards=3, shard_id=0) - self.test_counting_iterator(ref, itr) - - def test_counting_iterator_take(self): - ref = list(range(10)) - itr = iterators.CountingIterator(ref) - itr.take(5) - self.assertEqual(len(itr), len(list(iter(itr)))) - self.assertEqual(len(itr), 5) - - itr = iterators.CountingIterator(ref) - itr.take(5) - self.assertEqual(next(itr), ref[0]) - self.assertEqual(next(itr), ref[1]) - itr.skip(2) - self.assertEqual(next(itr), ref[4]) - self.assertFalse(itr.has_next()) + self.test_counting_iterator_index(ref, itr) def test_counting_iterator_buffered_iterator_take(self): ref = list(range(10)) @@ -118,6 +132,63 @@ def test_counting_iterator_buffered_iterator_take(self): self.assertFalse(itr.has_next()) self.assertRaises(StopIteration, next, buffered_itr) + def test_epoch_batch_iterator_skip_remainder_batch(self): + reference = [1, 2, 3] + itr1 = _get_epoch_batch_itr(reference, 2, True) + self.assertEqual(len(itr1), 1) + itr2 = _get_epoch_batch_itr(reference, 2, False) + self.assertEqual(len(itr2), 2) + itr3 = _get_epoch_batch_itr(reference, 1, True) + self.assertEqual(len(itr3), 2) + itr4 = _get_epoch_batch_itr(reference, 1, False) + self.assertEqual(len(itr4), 3) + itr5 = _get_epoch_batch_itr(reference, 4, True) + self.assertEqual(len(itr5), 0) + self.assertFalse(itr5.has_next()) + itr6 = _get_epoch_batch_itr(reference, 4, False) + self.assertEqual(len(itr6), 1) + + def test_grouped_iterator_skip_remainder_batch(self): + reference = [1, 2, 3, 4, 5, 6, 7, 8, 9] + itr1 = _get_epoch_batch_itr(reference, 3, False) + grouped_itr1 = iterators.GroupedIterator(itr1, 2, True) + self.assertEqual(len(grouped_itr1), 1) + + itr2 = _get_epoch_batch_itr(reference, 3, False) + grouped_itr2 = iterators.GroupedIterator(itr2, 2, False) + self.assertEqual(len(grouped_itr2), 2) + + itr3 = _get_epoch_batch_itr(reference, 3, True) + grouped_itr3 = iterators.GroupedIterator(itr3, 2, True) + self.assertEqual(len(grouped_itr3), 1) + + itr4 = _get_epoch_batch_itr(reference, 3, True) + grouped_itr4 = iterators.GroupedIterator(itr4, 2, False) + self.assertEqual(len(grouped_itr4), 1) + + itr5 = _get_epoch_batch_itr(reference, 5, True) + grouped_itr5 = iterators.GroupedIterator(itr5, 2, True) + self.assertEqual(len(grouped_itr5), 0) + + itr6 = _get_epoch_batch_itr(reference, 5, True) + grouped_itr6 = iterators.GroupedIterator(itr6, 2, False) + self.assertEqual(len(grouped_itr6), 1) + + +def _get_epoch_batch_itr(ref, bsz, skip_remainder_batch): + dsz = len(ref) + indices = range(dsz) + starts = indices[::bsz] + batch_sampler = [indices[s : s + bsz] for s in starts] + dataset = ListDataset(ref) + itr = iterators.EpochBatchIterator( + dataset=dataset, + collate_fn=dataset.collater, + batch_sampler=batch_sampler, + skip_remainder_batch=skip_remainder_batch, + ) + return itr.next_epoch_itr() + if __name__ == "__main__": unittest.main() diff --git a/tests/test_lm_context_window.py b/tests/test_lm_context_window.py new file mode 100644 index 0000000000..165e04ac3a --- /dev/null +++ b/tests/test_lm_context_window.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
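+
+# In short, the assertions below encode the following behaviour: with
+# tokens_per_sample=4 and context_window=2, every eval batch after the first
+# carries up to two trailing tokens of the previous batch as extra left
+# context, and those context positions are replaced by the pad index (1) in
+# the target so they do not contribute to the loss.  For the second batch:
+#
+#     src_tokens: [6, 7,  8,  9, 10, 11]   # 6, 7 are context from batch 1
+#     target:     [1, 1,  8,  9, 10, 11]   # context positions padded out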
+ +import unittest + +import torch + +from fairseq.data import MonolingualDataset +from fairseq.tasks.language_modeling import LanguageModelingConfig, LanguageModelingTask +from tests import utils as test_utils + + +class TestLMContextWindow(unittest.TestCase): + def test_eval_dataloader(self): + dictionary = test_utils.dummy_dictionary(10) + assert len(dictionary) == 14 # 4 extra special symbols + assert dictionary.pad() == 1 + + dataset = test_utils.TestDataset( + [ + torch.tensor([4, 5, 6, 7], dtype=torch.long), + torch.tensor([8, 9, 10, 11], dtype=torch.long), + torch.tensor([12, 13], dtype=torch.long), + ] + ) + dataset = MonolingualDataset(dataset, sizes=[4, 4, 2], src_vocab=dictionary) + + config = LanguageModelingConfig(tokens_per_sample=4) + task = LanguageModelingTask(config, dictionary) + + eval_dataloader = task.eval_lm_dataloader( + dataset=dataset, + batch_size=1, + context_window=2, + num_workers=0, + ) + + batch = next(eval_dataloader) + assert batch["net_input"]["src_tokens"][0].tolist() == [4, 5, 6, 7, 1, 1] + assert batch["target"][0].tolist() == [4, 5, 6, 7, 1, 1] + + batch = next(eval_dataloader) + assert batch["net_input"]["src_tokens"][0].tolist() == [6, 7, 8, 9, 10, 11] + assert batch["target"][0].tolist() == [1, 1, 8, 9, 10, 11] + + batch = next(eval_dataloader) + assert batch["net_input"]["src_tokens"][0].tolist() == [10, 11, 12, 13] + assert batch["target"][0].tolist() == [1, 1, 12, 13] + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 2de6969cf4..fc93b48088 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -6,7 +6,7 @@ import unittest import uuid -from fairseq import metrics +from fairseq.logging import metrics class TestMetrics(unittest.TestCase): diff --git a/tests/test_multi_corpus_dataset.py b/tests/test_multi_corpus_dataset.py new file mode 100644 index 0000000000..79900abf61 --- /dev/null +++ b/tests/test_multi_corpus_dataset.py @@ -0,0 +1,82 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
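+
+# The test below builds one corpus of odd token values and one of even token
+# values so that every drawn sample can be attributed to its source corpus,
+# then checks that the empirical sampling rate matches the requested
+# `distribution` to within 1%.  The helper below is a minimal sketch of that
+# bookkeeping (the name is made up for illustration and unused by the test):
+def fraction_from_first_corpus(token_values):
+    """Fraction of drawn samples whose odd token marks them as corpus 0."""
+    hits = sum(1 for v in token_values if v % 2 == 1)
+    return hits / len(token_values)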
+ +import unittest +from collections import OrderedDict + +import torch + +from fairseq.data import LanguagePairDataset, TokenBlockDataset +from fairseq.data.multi_corpus_dataset import MultiCorpusDataset +from tests.test_train import mock_dict + + +class TestMultiCorpusDataset(unittest.TestCase): + def setUp(self): + d = mock_dict() + tokens_1 = torch.LongTensor([i for i in range(1, 5000, 2)]).view(1, -1) + tokens_ds1 = TokenBlockDataset( + tokens_1, + sizes=[tokens_1.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + self.dataset_1 = LanguagePairDataset( + tokens_ds1, tokens_ds1.sizes, d, shuffle=False + ) + tokens_2 = torch.LongTensor([i for i in range(0, 5000, 2)]).view(1, -1) + tokens_ds2 = TokenBlockDataset( + tokens_2, + sizes=[tokens_2.size(-1)], + block_size=1, + pad=0, + eos=1, + include_targets=False, + ) + self.dataset_2 = LanguagePairDataset( + tokens_ds2, tokens_ds2.sizes, d, shuffle=False + ) + + def _test_sample_helper( + self, + distribution, + ): + m = MultiCorpusDataset( + OrderedDict({0: self.dataset_1, 1: self.dataset_2}), + distribution=distribution, + seed=0, + sort_indices=True, + ) + m.set_epoch(1) + indices = m.ordered_indices() + count_sample_from_first_dataset = 0 + items = set() + for i in indices: + item = m[i]["source"].item() + if item % 2 == 1: + count_sample_from_first_dataset += 1 + + items.add(item) + sample_from_first_ds_percentage = ( + 1.0 * count_sample_from_first_dataset / len(indices) + ) + self.assertLess( + abs(sample_from_first_ds_percentage - distribution[0]), + 0.01, + ) + self.assertEqual( + len(items), + int( + min(len(self.dataset_1), len(indices) * distribution[0]) + + min(len(self.dataset_1), len(indices) * distribution[1]) + ), + ) + print(distribution) + + def test_multi_corpus_dataset(self): + for distribution in [[0.5, 0.5], [0.1, 0.9], [0.9, 0.1], [0.0, 1.0]]: + self._test_sample_helper(distribution=distribution) diff --git a/tests/test_multi_corpus_sampled_dataset.py b/tests/test_multi_corpus_sampled_dataset.py index 05b20328c5..88f0817a54 100644 --- a/tests/test_multi_corpus_sampled_dataset.py +++ b/tests/test_multi_corpus_sampled_dataset.py @@ -79,7 +79,7 @@ def test_multi_corpus_sampled_dataset_uniform_sample(self): def test_multi_corpus_sampled_dataset_weighted_sample(self): def naive_weighted_sample(weights): - def f(l): + def f(input): v = np.random.random() agg = 0 for i, weight in enumerate(weights): diff --git a/tests/test_multihead_attention.py b/tests/test_multihead_attention.py index 9aa9cb2f87..4a0b430b6f 100644 --- a/tests/test_multihead_attention.py +++ b/tests/test_multihead_attention.py @@ -3,10 +3,410 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
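+
+# Several tests below convert between boolean, uint8 and additive float
+# attention masks.  As a reminder of the additive convention the expected
+# values use (sketch only -- `_mask_for_xformers` additionally flips the
+# boolean/uint8 polarity expected by xformers and handles dtype conversion,
+# which is exactly what the assertions pin down):
+#
+#     bool_mask = torch.tensor([False, True])   # True = position is masked out
+#     additive = torch.zeros(bool_mask.shape).masked_fill(bool_mask, float("-inf"))
+#     # -> tensor([0., -inf]); adding this to the raw attention scores removes
+#     #    the masked position after the softmax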
+import random import unittest +import pytest import torch -from fairseq.modules.multihead_attention import MultiheadAttention + +from fairseq.modules.multihead_attention import MultiheadAttention, _mask_for_xformers + +BATCH = [20, 41, 97] +SEQ = [64] +EMB = [48] +HEADS = [4] +DROP = 0.1 +DEVICE = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"] +ATTN_MASK_DTYPE = [None, torch.uint8, torch.bool, torch.float] +KEY_PADDING_MASK_DTYPE = [None, torch.uint8, torch.bool] + + +# FIXME: some tests fail when decimal=2, fix this and set decimal to 2 +def assert_almost_equal(x, y, decimal=1, err_msg=""): + import numpy.testing as npt + + if isinstance(x, torch.Tensor): + x = x.cpu().detach().numpy() + if isinstance(y, torch.Tensor): + y = y.cpu().detach().numpy() + npt.assert_array_almost_equal(x, y, err_msg=err_msg, decimal=decimal) + + +def _reset_seeds(): + torch.manual_seed(0) + torch.random.manual_seed(0) + random.seed(0) + torch.cuda.manual_seed_all(0) + + +def _get_mask(to_dtype: torch.dtype, dim0: int, dim1: int): + if to_dtype == torch.float: + mask = torch.randint(0, 2, (dim0, dim1)).to(dtype=torch.bool) + return mask.to(dtype=to_dtype).masked_fill(mask, -float("inf")) + return torch.randint(0, 2, (dim0, dim1)).to(dtype=to_dtype) + + +def test_mask_for_xformers(): + # Additive Mask + m_float_add = torch.tensor([float("-inf"), 0]).to(torch.float) + m_float_add_flipped = torch.tensor([0, float("-inf")]).to(torch.float) + m_float16_add = torch.tensor([float("-inf"), 0]).to(torch.float16) + m_float16_add_flipped = torch.tensor([0, float("-inf")]).to(torch.float16) + m_uint = torch.tensor([1, 0]).to(torch.uint8) + m_uint_flipped = torch.tensor([0, 1]).to(torch.uint8) + m_bool = torch.tensor([False, True]) + + assert torch.equal(_mask_for_xformers(m_float_add), m_float_add) + assert torch.equal(_mask_for_xformers(m_float16_add), m_float16_add) + assert torch.equal(_mask_for_xformers(m_uint), m_uint_flipped) + assert torch.equal(_mask_for_xformers(m_bool), ~m_bool) + + assert torch.equal( + _mask_for_xformers(m_float_add, to_dtype=torch.float16), m_float16_add + ) + assert torch.equal( + _mask_for_xformers(m_float_add, to_dtype=torch.float), m_float_add + ) + assert torch.equal(_mask_for_xformers(m_float_add, to_dtype=torch.bool), m_bool) + assert torch.equal( + _mask_for_xformers(m_float_add, to_dtype=torch.uint8), m_uint_flipped + ) + + assert torch.equal( + _mask_for_xformers(m_float16_add, to_dtype=torch.float16), m_float16_add + ) + assert torch.equal( + _mask_for_xformers(m_float16_add, to_dtype=torch.float), m_float_add + ) + assert torch.equal(_mask_for_xformers(m_float16_add, to_dtype=torch.bool), m_bool) + assert torch.equal( + _mask_for_xformers(m_float16_add, to_dtype=torch.uint8), m_uint_flipped + ) + + assert torch.equal( + _mask_for_xformers(m_bool, to_dtype=torch.float16), m_float16_add_flipped + ) + assert torch.equal( + _mask_for_xformers(m_bool, to_dtype=torch.float), m_float_add_flipped + ) + assert torch.equal(_mask_for_xformers(m_bool, to_dtype=torch.bool), ~m_bool) + assert torch.equal(_mask_for_xformers(m_bool, to_dtype=torch.uint8), m_uint) + + assert torch.equal( + _mask_for_xformers(m_uint, to_dtype=torch.float16), m_float16_add + ) + assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.float), m_float_add) + assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.bool), m_bool) + assert torch.equal(_mask_for_xformers(m_uint, to_dtype=torch.uint8), m_uint_flipped) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="blocksparse requires 
gpu") +@pytest.mark.skip(reason="not part of latest xformers") +@pytest.mark.parametrize("device", ["cuda"]) +@pytest.mark.parametrize("add_zero_attn", [False]) +@pytest.mark.parametrize("batch_size", [20]) +@pytest.mark.parametrize("embedding", [64]) +@pytest.mark.parametrize("seq_len", [64]) +@pytest.mark.parametrize("num_heads", [4]) +def test_xformers_blocksparse_parity( + device, + add_zero_attn, + batch_size, + embedding, + seq_len, + num_heads, +): + + xformers_att_config = '{"name": "scaled_dot_product"}' + xformers_blocksparse_blocksize = 16 + xformers_blocksparse_layout = torch.ones( + seq_len // xformers_blocksparse_blocksize, + seq_len // xformers_blocksparse_blocksize, + dtype=torch.int32, + ) + + q = torch.rand(seq_len, batch_size, embedding).to(device).half() + q.requires_grad = True + k = torch.rand(seq_len, batch_size, embedding).to(device).half() + k.requires_grad = True + v = torch.rand(seq_len, batch_size, embedding).to(device).half() + v.requires_grad = True + + q_ = q.detach().clone().half() + q_.requires_grad = True + k_ = k.detach().clone().half() + k_.requires_grad = True + v_ = v.detach().clone().half() + v_.requires_grad = True + + _reset_seeds() + xf_blocksparse_mha = ( + MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + add_zero_attn=add_zero_attn, + xformers_att_config=xformers_att_config, + xformers_blocksparse_layout=xformers_blocksparse_layout, + xformers_blocksparse_blocksize=xformers_blocksparse_blocksize, + ) + .to(device) + .half() + ) + + xf_blocksparse_output, _ = xf_blocksparse_mha( + q, + k, + v, + ) + + _reset_seeds() + xformers_mha = ( + MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + add_zero_attn=add_zero_attn, + xformers_att_config=xformers_att_config, + xformers_blocksparse_layout=None, + ) + .to(device) + .half() + ) + + xformers_output, _ = xformers_mha( + q_, + k_, + v_, + ) + + # # account for when nan != nan + rand = random.uniform(0, 1) + xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand) + xf_blocksparse_output = xf_blocksparse_output.masked_fill( + xf_blocksparse_output.isnan(), rand + ) + + assert_almost_equal(xformers_output, xf_blocksparse_output) + + loss_blocksparse = torch.norm(xformers_output) + loss_original = torch.norm(xf_blocksparse_output) + loss_blocksparse.backward() + loss_original.backward() + + q.masked_fill(q.isnan(), rand) + q_.masked_fill(q_.isnan(), rand) + k.masked_fill(k.isnan(), rand) + k_.masked_fill(k_.isnan(), rand) + v.masked_fill(v.isnan(), rand) + v_.masked_fill(v_.isnan(), rand) + + assert_almost_equal(q.grad, q_.grad) + assert_almost_equal(k.grad, k_.grad) + assert_almost_equal(v.grad, v_.grad) + + +@pytest.mark.parametrize("device", DEVICE) +@pytest.mark.parametrize("attn_dtype", ATTN_MASK_DTYPE) +@pytest.mark.parametrize("key_padding_dtype", KEY_PADDING_MASK_DTYPE) +@pytest.mark.parametrize("add_bias_kv", [True, False]) +@pytest.mark.parametrize("add_zero_attn", [True, False]) +# TODO: test with static_kv True +@pytest.mark.parametrize("static_kv", [False]) +@pytest.mark.parametrize("batch_size", BATCH) +@pytest.mark.parametrize("embedding", EMB) +@pytest.mark.parametrize("seq_len", SEQ) +@pytest.mark.parametrize("num_heads", HEADS) +def test_xformers_single_forward_parity( + device, + attn_dtype, + key_padding_dtype, + add_bias_kv, + add_zero_attn, + static_kv, + batch_size, + embedding, + seq_len, + num_heads, +): + + xformers_att_config = '{"name": "scaled_dot_product"}' + + attn_mask = ( + None + if attn_dtype is None + else 
_get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len).to(device) + ) + key_padding_mask = ( + None + if key_padding_dtype is None + else _get_mask(to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len).to( + device + ) + ) + + q = torch.rand(seq_len, batch_size, embedding).to(device) + q.requires_grad = True + k = torch.rand(seq_len, batch_size, embedding).to(device) + k.requires_grad = True + v = torch.rand(seq_len, batch_size, embedding).to(device) + v.requires_grad = True + + q_ = q.detach().clone() + q_.requires_grad = True + k_ = k.detach().clone() + k_.requires_grad = True + v_ = v.detach().clone() + v_.requires_grad = True + + # TODO: dropouts in the two implementations lead to different entries dropped. + _reset_seeds() + xformers_mha = MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + xformers_att_config=xformers_att_config, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ).to(device) + xformers_output, _ = xformers_mha( + q, + k, + v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + + _reset_seeds() + original_mha = MultiheadAttention( + embedding, + num_heads, + dropout=0.0, + xformers_att_config=None, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ).to(device) + original_output, _ = original_mha( + q_, + k_, + v_, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + static_kv=static_kv, + ) + + # account for when nan != nan + if xformers_output.isnan().any() or original_output.isnan().any(): + rand = random.uniform(0, 1) + xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand) + original_output = original_output.masked_fill(original_output.isnan(), rand) + + # torch.equal works for cpu, on cuda allclose is needed. + assert torch.allclose( + xformers_output, original_output, atol=1e-06 + ), f"max diff is {torch.max(torch.abs(xformers_output - original_output))}" + + loss_xformers = torch.norm(xformers_output) + loss_original = torch.norm(original_output) + loss_xformers.backward() + loss_original.backward() + + # torch.equal works for cpu, on cuda allclose is needed. + assert torch.allclose( + q.grad, q_.grad + ), f"max diff is {torch.max(torch.abs(q.grad - q_.grad))}" + assert torch.allclose( + k.grad, k_.grad + ), f"max diff is {torch.max(torch.abs(k.grad - k_.grad))}" + assert torch.allclose( + v.grad, v_.grad + ), f"max diff is {torch.max(torch.abs(v.grad - v_.grad))}" + + +def test_mask_padding_parity(): + def old_padding_code(key_padding_mask, attn_mask): + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask), + ], + dim=1, + ) + return key_padding_mask, attn_mask + + # values don't matter for this test. + mha = MultiheadAttention( + embed_dim=8, + num_heads=2, + dropout=0.0, + add_bias_kv=True, + add_zero_attn=True, + ) + + key_padding_mask = torch.rand((8, 64)) + attn_mask = torch.rand((64, 64)) + + kp_mask_orig, a_mask_orig = old_padding_code(key_padding_mask, attn_mask) + kp_mask_new, a_mask_new = mha._pad_masks(key_padding_mask, attn_mask) + + assert kp_mask_orig.size() == kp_mask_new.size() + assert a_mask_orig.size() == a_mask_new.size() + assert torch.equal(kp_mask_orig, kp_mask_new) + assert torch.equal(a_mask_orig, a_mask_new) + + +def test_add_bias_parity(): + # values don't matter for this test. 
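+    # MultiheadAttention._add_bias appends the learned bias_k / bias_v vectors
+    # as one extra key/value time step (repeated across the batch) and pads
+    # attn_mask / key_padding_mask with one zero column to match; the
+    # old_bias_code helper defined below reproduces the previous inline version
+    # so the two implementations can be compared element-wise.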
+ mha = MultiheadAttention( + embed_dim=8, + num_heads=2, + dropout=0.0, + add_bias_kv=True, + add_zero_attn=True, + ) + + def old_bias_code(k, v, key_padding_mask, attn_mask, bsz): + k = torch.cat([k, mha.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, mha.bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + key_padding_mask.new_zeros(key_padding_mask.size(0), 1), + ], + dim=1, + ) + return k, v, key_padding_mask, attn_mask + + seq_len = 64 + bsz = 8 + embedding = 8 + key_padding_mask = torch.rand((bsz, seq_len)) + attn_mask = torch.rand((seq_len, seq_len)) + k = torch.rand((seq_len, bsz, embedding)) + v = torch.rand((seq_len, bsz, embedding)) + + k_orig, v_orig, kp_mask_orig, a_mask_orig = old_bias_code( + k, v, key_padding_mask, attn_mask, bsz + ) + k_new, v_new, kp_mask_new, a_mask_new = mha._add_bias( + k, v, key_padding_mask, attn_mask, bsz + ) + + assert torch.equal(k_orig, k_new) + assert torch.equal(v_orig, v_new) + assert torch.equal(kp_mask_orig, kp_mask_new) + assert torch.equal(a_mask_orig, a_mask_new) class TestMultiheadAttention(unittest.TestCase): @@ -35,6 +435,18 @@ def test_append_prev_key_padding_mask(self): torch.tensor([[0, 1, 0]]).bool(), torch.tensor([[0, 1, 0, 1]]).bool(), ), + # prev_key_padding_mask already full + ( + torch.tensor([[0, 1, 0, 1]]).bool(), + None, + torch.tensor([[0, 1, 0, 1]]).bool(), + ), + # key_padding_mask already full + ( + None, + torch.tensor([[0, 1, 0, 1]]).bool(), + torch.tensor([[0, 1, 0, 1]]).bool(), + ), ] for c in cases: key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( @@ -56,6 +468,21 @@ def test_append_prev_key_padding_mask(self): else: self.assertIsNone(c[2]) + def test_pruning_heads(self): + embed_dim = 768 + num_heads = 12 + num_heads_to_keep = 8 + dummy_input = torch.randn(32, 2, embed_dim) + mha = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads) + reserve_head_index = mha._get_reserve_head_index( + num_heads_to_keep=num_heads_to_keep + ) + mha._adaptive_prune_heads(reserve_head_index=reserve_head_index) + mha._set_skip_embed_dim_check() + mha(query=dummy_input, key=dummy_input, value=dummy_input) + self.assertEqual(mha.head_dim, embed_dim / num_heads) + self.assertEqual(mha.num_heads, num_heads_to_keep) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_noising.py b/tests/test_noising.py index b3d0d123c4..1956f6ad1d 100644 --- a/tests/test_noising.py +++ b/tests/test_noising.py @@ -6,8 +6,9 @@ import unittest from typing import Dict, List -import tests.utils as test_utils import torch + +import tests.utils as test_utils from fairseq import utils from fairseq.data import ( Dictionary, @@ -138,7 +139,7 @@ def _convert_src_tokens_to_tensor( return x, torch.LongTensor(src_len) def assert_eos_at_end(self, x, x_len, eos): - """Asserts last token of every sentence in x is EOS """ + """Asserts last token of every sentence in x is EOS""" for i in range(len(x_len)): self.assertEqual( x[x_len[i] - 1][i], @@ -373,7 +374,7 @@ def test_word_shuffle_without_eos_with_bpe_end_marker(self): ) def assert_no_eos_at_end(self, x, x_len, eos): - """Asserts that the last token of each sentence in x is not EOS """ + """Asserts that the last token of each sentence in x is not EOS""" for i in range(len(x_len)): self.assertNotEqual( x[x_len[i] - 1][i], diff --git a/tests/test_online_backtranslation.py 
b/tests/test_online_backtranslation.py new file mode 100644 index 0000000000..0ae7e773da --- /dev/null +++ b/tests/test_online_backtranslation.py @@ -0,0 +1,206 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import tempfile +import unittest +from pathlib import Path +from typing import Any, Dict, Sequence + +import fairseq.data.indexed_dataset as indexed_dataset +import fairseq.options +import fairseq.tasks.online_backtranslation as obt +import torch +from tests import utils + + +def mk_sample(tokens: Sequence[int], batch_size: int = 2) -> Dict[str, Any]: + batch = torch.stack([torch.tensor(tokens, dtype=torch.long)] * batch_size) + sample = { + "net_input": { + "src_tokens": batch, + "prev_output_tokens": batch, + "src_lengths": torch.tensor([len(tokens)] * batch_size, dtype=torch.long), + }, + "target": batch[:, 1:], + } + return sample + + +def mk_dataset(num_samples: int, max_len: int, output: Path): + output.parent.mkdir(exist_ok=True) + idx = indexed_dataset.IndexedDatasetBuilder(str(output)) + data = torch.randint(5, 100, (num_samples, max_len)) + lengths = torch.randint(3, max_len, (num_samples,)) + for d, l in zip(data, lengths): + d[0] = 0 + idx.add_item(d[:l]) + idx.finalize(output.with_suffix(".idx")) + assert output.exists() + assert output.with_suffix(".idx").exists() + + +class OnlineBacktranslationTest(unittest.TestCase): + + tmp_dir = Path(tempfile.mkdtemp(suffix="OnlineBacktranslationTest")) + + @classmethod + def obt_task( + cls, languages: Sequence[str], data: Path = None, language_mapping: str = None + ): + dict_path = cls.tmp_dir / "dict.txt" + if not dict_path.exists(): + dictionary = utils.dummy_dictionary(100) + dictionary.save(str(dict_path)) + + if data is not None: + (data / "dict.txt").write_text(dict_path.read_text()) + else: + data = cls.tmp_dir + assert len(languages) >= 2 + + kwargs = { + "arch": "transformer", + # --max-sentences=1 for better predictability of batches + "max_sentences": 1, + # Use characteristics dimensions + "encoder_layers": 3, + "encoder_embed_dim": 12, + "encoder_ffn_embed_dim": 14, + "encoder_attention_heads": 4, + "decoder_layers": 3, + "decoder_embed_dim": 12, + "decoder_output_dim": 12, + "decoder_ffn_embed_dim": 14, + "decoder_attention_heads": 4, + # Disable dropout so we have comparable tests. 
+ "dropout": 0, + "attention_dropout": 0, + "activation_dropout": 0, + "encoder_layerdrop": 0, + } + + args = fairseq.options.get_args( + data, + task="online_backtranslation", + mono_langs=",".join(languages), + valid_lang_pairs=f"{languages[0]}-{languages[1]}", + tokens_per_sample=256, + language_mapping=language_mapping, + **kwargs, + ) + task = obt.OnlineBackTranslationTask.setup_task(args) + # we need to build the model to have the correct dictionary + model = task.build_model(task.args) + return task, model + + def tmp_path(self, test_case: str) -> Path: + return Path(tempfile.mkdtemp(test_case, dir=self.tmp_dir)) + + def test_lang_tokens(self): + task, model = self.obt_task(["en", "ro", "zh"]) + assert obt._lang_token("en") in task.dictionary + assert obt._lang_token("ro") in task.dictionary + assert obt._lang_token("zh") in task.dictionary + + en_bos = obt._lang_token_index(task.common_dict, "en") + assert "en" == task.common_dict[en_bos].strip("_") + zh_bos = obt._lang_token_index(task.common_dict, "zh") + assert "zh" == task.common_dict[zh_bos].strip("_") + zh_sample = mk_sample([zh_bos, 16, 14, 12, 10]) + + # we expect to receive the bos token for translation + assert task.get_bos_token_from_sample(zh_sample) == en_bos + + def test_backtranslate_sample(self): + task, model = self.obt_task(["en", "ro", "zh"]) + + en_bos = obt._lang_token_index(task.common_dict, "en") + zh_bos = obt._lang_token_index(task.common_dict, "zh") + sample = mk_sample([zh_bos, 16, 14, 12, 10]) + + task.backtranslate_sample(sample, "zh", "en") + target_zh = list(sample["target"][0]) + assert target_zh == [16, 14, 12, 10] # original zh sentence + generated_en = sample["net_input"]["src_tokens"][0] + assert generated_en[0] == en_bos + + def test_train_dataset(self): + data = self.tmp_path("test_train_dataset") + mk_dataset(20, 10, data / "en" / "train.bin") + mk_dataset(10, 10, data / "zh" / "train.bin") + task, model = self.obt_task(["en", "zh"], data) + task.load_dataset("train") + + en_bos = obt._lang_token_index(task.common_dict, "en") + zh_bos = obt._lang_token_index(task.common_dict, "zh") + + train = task.datasets["train"] + train.ordered_indices() + train.prefetch([0, 19]) + sample_0 = train[0] + sample_19 = train[19] + self.assertEqual( + set(sample_0.keys()), {"en-BT", "en-DENOISE", "zh-BT", "zh-DENOISE"} + ) + for sample in (sample_0, sample_19): + self.assertEqual(sample["en-BT"]["source"][0], en_bos) + # bt target isn't ready to look at. + self.assertEqual(sample["en-DENOISE"]["source"][0], en_bos) + # TODO What could we check on the target side ? + + for i in range(10): + # Zh dataset is shorter, and is wrapped around En dataset. 
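+            # (10 zh sentences vs 20 en sentences, so indices i and i + 10 must
+            # resolve to the same underlying zh sentence)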
+ train.prefetch([i, i + 10]) + self.assertEqual( + list(train[i]["zh-DENOISE"]["source"]), + list(train[i + 10]["zh-DENOISE"]["source"]), + ) + self.assertEqual(train[i]["zh-DENOISE"]["source"][0].item(), zh_bos) + + # Sorted by increasing len + self.assertLess( + len(sample_0["en-BT"]["source"]), len(sample_19["en-BT"]["source"]) + ) + + def test_valid_dataset(self): + data = self.tmp_path("test_valid_dataset") + mk_dataset(10, 21, data / "valid.en-zh.en.bin") + mk_dataset(10, 21, data / "valid.en-zh.zh.bin") + + task, model = self.obt_task(["en", "zh"], data) + valid = task.load_dataset("valid") + en_bos = obt._lang_token_index(task.common_dict, "en") + + assert valid is not None + valid.prefetch(range(10)) + sample_0 = valid[0] + sample_9 = valid[9] + self.assertEqual(sample_0["id"], 0) + self.assertEqual(sample_9["id"], 9) + self.assertEqual(sample_0["source"][0], en_bos) + self.assertEqual(sample_9["source"][0], en_bos) + # TODO: could we test the target side ? + + def assertFnMatch(self, fn, values): + for x, y in values.items(): + fn_x = fn(x) + self.assertEqual(fn_x, y, f"Fn has wrong value: fn({x}) = {fn_x} != {y}") + + def test_piecewise_linear_fn(self): + self.assertFnMatch( + obt.PiecewiseLinearFn.from_string("1.0"), {0: 1, 100: 1, 500: 1, 1000: 1} + ) + self.assertFnMatch( + obt.PiecewiseLinearFn.from_string("0:1,1000:0"), + {0: 1, 500: 0.5, 1000: 0, 2000: 0}, + ) + self.assertFnMatch( + obt.PiecewiseLinearFn.from_string("0:0,1000:1"), + {0: 0, 500: 0.5, 1000: 1, 2000: 1}, + ) + self.assertFnMatch( + obt.PiecewiseLinearFn.from_string("0:0,1000:1,2000:0"), + {0: 0, 500: 0.5, 1000: 1, 1500: 0.5, 2000: 0, 3000: 0}, + ) diff --git a/tests/test_plasma_utils.py b/tests/test_plasma_utils.py new file mode 100644 index 0000000000..7286c6cd3a --- /dev/null +++ b/tests/test_plasma_utils.py @@ -0,0 +1,127 @@ +import contextlib +import tempfile +import unittest +from io import StringIO + +import numpy as np + +from tests.utils import create_dummy_data, preprocess_lm_data, train_language_model + +try: + from pyarrow import plasma + + from fairseq.data.plasma_utils import PlasmaStore, PlasmaView + + PYARROW_AVAILABLE = True +except ImportError: + PYARROW_AVAILABLE = False + +dummy_path = "dummy" + + +@unittest.skipUnless(PYARROW_AVAILABLE, "") +class TestPlasmaView(unittest.TestCase): + def setUp(self) -> None: + self.tmp_file = tempfile.NamedTemporaryFile() # noqa: P201 + self.path = self.tmp_file.name + self.server = PlasmaStore.start(path=self.path, nbytes=10000) + self.client = plasma.connect(self.path, num_retries=10) + + def tearDown(self) -> None: + self.client.disconnect() + self.tmp_file.close() + self.server.kill() + + def test_two_servers_do_not_share_object_id_space(self): + data_server_1 = np.array([0, 1]) + data_server_2 = np.array([2, 3]) + server_2_path = self.path + with tempfile.NamedTemporaryFile() as server_1_path: + server = PlasmaStore.start(path=server_1_path.name, nbytes=10000) + arr1 = PlasmaView( + data_server_1, dummy_path, 1, plasma_path=server_1_path.name + ) + assert len(arr1.client.list()) == 1 + assert (arr1.array == data_server_1).all() + arr2 = PlasmaView(data_server_2, dummy_path, 1, plasma_path=server_2_path) + assert (arr2.array == data_server_2).all() + assert (arr1.array == data_server_1).all() + server.kill() + + def test_hash_collision(self): + data_server_1 = np.array([0, 1]) + data_server_2 = np.array([2, 3]) + arr1 = PlasmaView(data_server_1, dummy_path, 1, plasma_path=self.path) + assert len(arr1.client.list()) == 1 + arr2 = 
PlasmaView(data_server_2, dummy_path, 1, plasma_path=self.path)
+ assert len(arr1.client.list()) == 1
+ assert len(arr2.client.list()) == 1
+ assert (arr2.array == data_server_1).all()
+ # New hash key based on tuples
+ arr3 = PlasmaView(
+ data_server_2, dummy_path, (1, 12312312312, None), plasma_path=self.path
+ )
+ assert (
+ len(arr2.client.list()) == 2
+ ), "No new object was created by using a novel hash key"
+ assert (
+ arr3.object_id in arr2.client.list()
+ ), "No new object was created by using a novel hash key"
+ assert (
+ arr3.object_id in arr3.client.list()
+ ), "No new object was created by using a novel hash key"
+ del arr3, arr2, arr1
+
+ @staticmethod
+ def _assert_view_equal(pv1, pv2):
+ np.testing.assert_array_equal(pv1.array, pv2.array)
+
+ def test_putting_same_array_twice(self):
+ data = np.array([4, 4, 4])
+ arr1 = PlasmaView(data, dummy_path, 1, plasma_path=self.path)
+ assert len(self.client.list()) == 1
+ arr1b = PlasmaView(
+ data, dummy_path, 1, plasma_path=self.path
+ ) # should not change contents of store
+ arr1c = PlasmaView(
+ None, dummy_path, 1, plasma_path=self.path
+ ) # should not change contents of store
+
+ assert len(self.client.list()) == 1
+ self._assert_view_equal(arr1, arr1b)
+ self._assert_view_equal(arr1, arr1c)
+ PlasmaView(
+ data, dummy_path, 2, plasma_path=self.path
+ ) # new object id, adds new entry
+ assert len(self.client.list()) == 2
+
+ new_client = plasma.connect(self.path)
+ assert len(new_client.list()) == 2 # new client can access same objects
+ assert isinstance(arr1.object_id, plasma.ObjectID)
+ del arr1b
+ del arr1c
+
+ def test_plasma_store_full_raises(self):
+ with tempfile.NamedTemporaryFile() as new_path:
+ server = PlasmaStore.start(path=new_path.name, nbytes=10000)
+ with self.assertRaises(plasma.PlasmaStoreFull):
+ # 10000 float64 values need 80000 bytes, which exceeds the 10000-byte store
+ PlasmaView(
+ np.random.rand(10000, 1), dummy_path, 1, plasma_path=new_path.name
+ )
+ server.kill()
+
+ def test_object_id_overflow(self):
+ PlasmaView.get_object_id("", 2**21)
+
+ def test_training_lm_plasma(self):
+ with contextlib.redirect_stdout(StringIO()):
+ with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir:
+ create_dummy_data(data_dir)
+ preprocess_lm_data(data_dir)
+ train_language_model(
+ data_dir,
+ "transformer_lm",
+ ["--use-plasma-view", "--plasma-path", self.path],
+ run_validation=True,
+ )
diff --git a/tests/test_positional_encoding.py b/tests/test_positional_encoding.py
new file mode 100644
index 0000000000..4e38c4397d
--- /dev/null
+++ b/tests/test_positional_encoding.py
@@ -0,0 +1,63 @@
+import unittest
+
+import torch
+from fairseq.modules import RelPositionalEncoding
+import numpy as np
+
+
+class TestRelPositionalEncoding(unittest.TestCase):
+ def setUp(self) -> None:
+ self.T = 3
+ self.B = 1
+ self.C = 2
+ torch.manual_seed(0)
+ self.sample = torch.randn(self.T, self.B, self.C) # TBC
+ self.rel_pos_enc = RelPositionalEncoding(max_len=4, d_model=self.C)
+
+ def test_extend_pe(self):
+ inp = self.sample.transpose(0, 1)
+ self.rel_pos_enc.extend_pe(inp)
+ expected_pe = torch.tensor(
+ [
+ [
+ [0.1411, -0.9900],
+ [0.9093, -0.4161],
+ [0.8415, 0.5403],
+ [0.0000, 1.0000],
+ [-0.8415, 0.5403],
+ [-0.9093, -0.4161],
+ [-0.1411, -0.9900],
+ ]
+ ]
+ )
+
+ self.assertTrue(
+ np.allclose(
+ expected_pe.cpu().detach().numpy(),
+ self.rel_pos_enc.pe.cpu().detach().numpy(),
+ atol=1e-4,
+ )
+ )
+
+ def test_forward(self):
+ pos_enc = self.rel_pos_enc(self.sample)
+ expected_pos_enc = torch.tensor(
+ [
+ [[0.9093, -0.4161]],
+ [[0.8415, 
0.5403]], + [[0.0000, 1.0000]], + [[-0.8415, 0.5403]], + [[-0.9093, -0.4161]], + ] + ) + self.assertTrue( + np.allclose( + pos_enc.cpu().detach().numpy(), + expected_pos_enc.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_reproducibility.py b/tests/test_reproducibility.py index 517e23c39e..b285593272 100644 --- a/tests/test_reproducibility.py +++ b/tests/test_reproducibility.py @@ -3,12 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import contextlib import json import os import tempfile import unittest -from io import StringIO import torch @@ -26,7 +24,7 @@ def _test_reproducibility( ): def get_last_log_stats_containing_string(log_records, search_string): for log_record in logs.records[::-1]: - if search_string in log_record.msg: + if isinstance(log_record.msg, str) and search_string in log_record.msg: return json.loads(log_record.msg) if extra_flags is None: @@ -125,6 +123,18 @@ def test_reproducibility_memory_efficient_fp16(self): ], ) + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_reproducibility_amp(self): + self._test_reproducibility( + "test_reproducibility_amp", + [ + "--amp", + "--fp16-init-scale", + "4096", + ], + delta=0.011, + ) + def test_mid_epoch_reproducibility(self): self._test_reproducibility( "test_mid_epoch_reproducibility", diff --git a/tests/test_roberta.py b/tests/test_roberta.py new file mode 100644 index 0000000000..14f01f9cb7 --- /dev/null +++ b/tests/test_roberta.py @@ -0,0 +1,344 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import functools +import unittest +from typing import Any, Dict, Sequence + +import fairseq +import fairseq.options +import fairseq.tasks +import torch +from tests.utils import dummy_dictionary + +VOCAB_SIZE = 100 + + +@fairseq.tasks.register_task("fake_task") +class FakeTask(fairseq.tasks.LegacyFairseqTask): + def __init__(self, args): + super().__init__(args) + self.dictionary = dummy_dictionary(VOCAB_SIZE - 4) + assert len(self.dictionary) == VOCAB_SIZE + + @property + def source_dictionary(self): + return self.dictionary + + @property + def target_dictionary(self): + return self.dictionary + + +@functools.lru_cache() +def get_toy_model( + device: str, + architecture: str = "roberta_enc_dec", + **extra_args: Any, +): + assert device in ("gpu", "cpu") + kwargs = { + "arch": architecture, + # Use characteristics dimensions + "encoder_layers": 3, + "encoder_embed_dim": 12, + "encoder_ffn_embed_dim": 14, + "encoder_attention_heads": 4, + "decoder_layers": 3, + "decoder_embed_dim": 12, + "decoder_ffn_embed_dim": 14, + "decoder_attention_heads": 4, + # Disable dropout so we have comparable tests. 
+ "dropout": 0, + "attention_dropout": 0, + "activation_dropout": 0, + "encoder_layerdrop": 0, + # required args + "tokens_per_sample": 256, + "data": "/tmp/test_roberta", + } + kwargs.update(extra_args) + fake_task = FakeTask(kwargs) + args = fairseq.options.get_args( + task="online_backtranslation", + mono_langs="en,ro", + valid_lang_pairs="en-ro", + **kwargs, + ) + torch.manual_seed(0) + model = fake_task.build_model(args) + if device == "gpu": + model.cuda() + return fake_task, model + + +def mk_sample( + lang: str, device: str, tok: Sequence[int] = None, batch_size: int = 2 +) -> Dict[str, Any]: + assert device in ("gpu", "cpu") + if not tok: + if lang == "en": + tok = [10, 11, 12, 13, 14, 15, 2] + else: + tok = [20, 21, 22, 23, 24, 25, 26, 27, 2] + + batch = torch.stack([torch.tensor(tok, dtype=torch.long)] * batch_size) + if device == "gpu": + batch = batch.cuda() + sample = { + "net_input": { + "src_tokens": batch, + "prev_output_tokens": batch, + "src_lengths": torch.tensor( + [len(tok)] * batch_size, dtype=torch.long, device=batch.device + ), + }, + "target": batch[:, 1:], + } + return sample + + +def cpu_gpu(fn): + def helper(self): + fn(self, "cpu") + if torch.cuda.is_available(): + fn(self, "gpu") + + return helper + + +def architectures(fn): + def helper(self): + for arch in ["roberta_enc_dec", "transformer"]: + fn(self, arch) + + return helper + + +class RobertaTest(unittest.TestCase): + def assertTensorEqual(self, t1, t2, delta: float = 1e-6): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + if delta == 0.0: + self.assertEqual(t1.ne(t2).long().sum(), 0) + else: + self.assertEqual(((t2 - t1).abs() > delta).long().sum(), 0) + + def assertSharing(self, model, link_groups: Sequence[Sequence[str]]): + ids = {} + for group in link_groups: + group_ids = {name: id(params(model, name)) for name in group} + shared_id = group_ids[group[0]] + self.assertEqual(group_ids, {name: shared_id for name in group}) + self.assertNotIn(shared_id, ids) + ids[shared_id] = group + + def test_roberta_shared_params(self): + _, roberta = get_toy_model("cpu", architecture="roberta") + self.assertSharing( + roberta, + [ + [ + "encoder.sentence_encoder.embed_tokens.weight", + "encoder.lm_head.weight", + ] + ], + ) + + _, roberta = get_toy_model( + "cpu", architecture="roberta", untie_weights_roberta=True + ) + self.assertSharing( + roberta, + [ + ["encoder.sentence_encoder.embed_tokens.weight"], + ["encoder.lm_head.weight"], + ], + ) + + def test_roberta_enc_dec_shared_params(self): + # 3 distinct embeddings + _, enc_dec = get_toy_model("cpu", architecture="roberta_enc_dec") + self.assertSharing( + enc_dec, + [ + ["encoder.embed_tokens.weight"], + ["decoder.embed_tokens.weight"], + ["decoder.output_projection.weight"], + ], + ) + + # 2 distinct embeddings, one for encoder, one for decoder + _, enc_dec = get_toy_model( + "cpu", architecture="roberta_enc_dec", share_decoder_input_output_embed=True + ) + self.assertSharing( + enc_dec, + [ + ["encoder.embed_tokens.weight"], + [ + "decoder.embed_tokens.weight", + "decoder.output_projection.weight", + ], + ], + ) + + # shared embeddings + _, enc_dec = get_toy_model( + "cpu", architecture="roberta_enc_dec", share_all_embeddings=True + ) + self.assertSharing( + enc_dec, + [ + [ + "encoder.embed_tokens.weight", + "decoder.embed_tokens.weight", + "decoder.output_projection.weight", + ] + ], + ) + + def test_roberta_max_positions_is_correctly_set(self): + device = "cpu" + task, model = get_toy_model(device) + max_pos = model.max_decoder_positions() + 
self.assertEqual(max_pos, 256)
+ self.assertEqual(max_pos, model.decoder.max_positions())
+ self.assertEqual(max_pos, model.encoder.max_positions())
+ self.assertEqual(max_pos, model.encoder.embed_positions.max_positions)
+
+ sentence = [31 for _ in range(max_pos)]
+ sample = mk_sample("en", device, sentence, batch_size=1)
+ self.assertEqual(list(sample["net_input"]["src_lengths"]), [max_pos])
+ self.assertEqual(len(sample["net_input"]["src_tokens"][0]), max_pos)
+ x, _ = model.forward(**sample["net_input"])
+ self.assertEqual(x.shape, (1, max_pos, VOCAB_SIZE))
+
+ @cpu_gpu
+ def test_roberta_forward_backward(self, device: str):
+ _, model = get_toy_model(device)
+ sample = mk_sample("en", device)
+ en_tokens = sample["net_input"]["src_tokens"]
+ (bs, l) = en_tokens.shape
+ # Forward
+ logits, _ = model(**sample["net_input"])
+ self.assertEqual(logits.shape, (bs, l, VOCAB_SIZE))
+
+ # Backward
+ loss = logits.sum()
+ loss.backward()
+
+ @cpu_gpu
+ def test_roberta_forward_backward_bs1(self, device: str):
+ _, model = get_toy_model(device)
+ sample = mk_sample("en", device, batch_size=1)
+ o, _ = model.forward(**sample["net_input"])
+ loss = o.sum()
+ sample2 = mk_sample("ro", device, batch_size=1)
+ o, _ = model.forward(**sample2["net_input"])
+ loss += o.sum()
+ loss.backward()
+
+ @cpu_gpu
+ def test_roberta_batching(self, device: str):
+ """
+ Checks that a batch of size 2 (the same sentence repeated) gives the same results as a batch of size 1.
+ """
+ _, model = get_toy_model(device)
+ sample = mk_sample("en", device, batch_size=1)
+ slen = sample["net_input"]["src_lengths"][0]
+ sample2 = mk_sample("en", device, batch_size=2)
+ with torch.no_grad():
+ z = model.encoder.forward(
+ sample["net_input"]["src_tokens"], sample["net_input"]["src_lengths"]
+ )
+ z = z["encoder_out"][-1]
+ logits, _ = model.forward(**sample["net_input"])
+
+ z2 = model.encoder.forward(
+ sample2["net_input"]["src_tokens"], sample["net_input"]["src_lengths"]
+ )
+ z2 = z2["encoder_out"][-1]
+ logits2, _ = model.forward(**sample2["net_input"])
+
+ self.assertEqual(z.shape, (slen, 1, 12))
+ self.assertEqual(z2.shape, (slen, 2, 12))
+ self.assertTensorEqual(logits2[0], logits2[1])
+ self.assertTensorEqual(logits[0], logits2[0])
+
+ @cpu_gpu
+ def test_roberta_incremental_decoder(self, device: str):
+ """
+ Checks that incremental decoding yields the same result as non-incremental decoding. 
+ """ + task, model = get_toy_model(device) + + en_sample = mk_sample("en", device) + en_tokens = en_sample["net_input"]["src_tokens"] + ro_sample = mk_sample("ro", device) + ro_tokens = ro_sample["net_input"]["src_tokens"] + + en_enc = model.encoder.forward( + en_tokens, src_lengths=en_sample["net_input"]["src_lengths"] + ) + (bs, tgt_len) = ro_tokens.shape + + # Decode without incremental state + ro_dec, _ = model.decoder.forward(ro_tokens, encoder_out=en_enc) + self.assertEqual(ro_dec.shape, (bs, tgt_len, VOCAB_SIZE)) + self.assertTensorEqual(ro_dec[0], ro_dec[1]) + + # Decode with incremental state + inc_state = {} + ro_dec_inc = [] + for i in range(tgt_len): + ro, _ = model.decoder.forward( + ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state + ) + self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE)) + ro_dec_inc.append(ro) + + for i in range(tgt_len): + # Intra-batch + self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1]) + # Incremental vs non-incremental + self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i]) + + @cpu_gpu + def test_regularize_for_adaprune_in_roberta(self, device: str): + _, model = get_toy_model( + device=device, + architecture="roberta_base", + mha_reg_scale_factor=0.000375, + ffn_reg_scale_factor=0.000375, + ) + sample = mk_sample("en", device, batch_size=1) + task_loss, _ = model.forward(**sample["net_input"]) + head_loss = model._get_adaptive_head_loss() + ffn_loss = model._get_adaptive_ffn_loss() + loss = task_loss.sum() + head_loss + ffn_loss + loss.backward() + + @cpu_gpu + def test_ffn_prune_for_adaprune_in_roberta(self, device: str): + _, model = get_toy_model( + device=device, + architecture="roberta_base", + ) + sample = mk_sample("en", device, batch_size=1) + for layer in model.encoder.sentence_encoder.layers: + fc1_original_size = layer.fc1.out_features + remove_index = layer._get_fc_rank(remove_num=2) + layer._prune_fc_layer(remove_index=remove_index) + self.assertEqual(layer.fc1.out_features, fc1_original_size - 2) + + task_loss, _ = model.forward(**sample["net_input"]) + + +def params(model, name): + if "." 
not in name: + return getattr(model, name) + + prefix, name = name.split(".", 1) + return params(getattr(model, prefix), name) diff --git a/tests/test_rotary_positional_embedding.py b/tests/test_rotary_positional_embedding.py new file mode 100644 index 0000000000..7c44e86d5d --- /dev/null +++ b/tests/test_rotary_positional_embedding.py @@ -0,0 +1,85 @@ +import torch +import numpy as np +import unittest +from fairseq.modules.rotary_positional_embedding import apply_rotary_pos_emb +from fairseq.modules import RotaryPositionalEmbedding + + +class TestRotaryPositionalEmbedding(unittest.TestCase): + def setUp(self) -> None: + self.T = 3 + self.B = 1 + self.C = 2 + torch.manual_seed(0) + self.sample = torch.randn(self.T, self.B, self.C) # TBC + self.rope_pos_emd = RotaryPositionalEmbedding(dim=self.C) + + def test_forward(self): + expected_cos = torch.tensor( + [[[[1.0000, 1.0000]]], [[[0.5403, 0.5403]]], [[[-0.4161, -0.4161]]]] + ) + expected_sin = torch.tensor( + [[[[0.0000, 0.0000]]], [[[0.8415, 0.8415]]], [[[0.9093, 0.9093]]]] + ) + cos, sin = self.rope_pos_emd(self.sample, self.T) + self.assertTrue( + np.allclose( + expected_cos.cpu().detach().numpy(), + cos.cpu().detach().numpy(), + atol=1e-4, + ) + ) + self.assertTrue( + np.allclose( + expected_sin.cpu().detach().numpy(), + sin.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + def test_apply_rotary_pos_emb(self): + cos, sin = self.rope_pos_emd(self.sample, self.T) + query = self.sample.view(self.T, self.B, 1, self.C) + expected_query = torch.tensor( + [[[[1.5410, -0.2934]]], [[[-1.6555, -1.5263]]], [[[1.7231, -0.4041]]]] + ) + new_query, new_key = apply_rotary_pos_emb(query, query, cos, sin) + self.assertTrue( + np.allclose( + expected_query.cpu().detach().numpy(), + new_query.cpu().detach().numpy(), + atol=1e-4, + ) + ) + self.assertTrue( + np.allclose( + expected_query.cpu().detach().numpy(), + new_key.cpu().detach().numpy(), + atol=1e-4, + ) + ) + + def test_jit_compile_rope_module(self): + module_scripted = torch.jit.script(self.rope_pos_emd) + apply_rotary_scripted = torch.jit.script(apply_rotary_pos_emb) + # Test several different lengths + for T in [3, 5, 10]: + sample = torch.randn(T, self.B, self.C) + # Run forward pass with the original module + cos_original, sin_original = self.rope_pos_emd(sample, T) + query = sample.view(T, self.B, 1, self.C) + new_query, new_key = apply_rotary_pos_emb(query, query, cos_original, sin_original) + + # Run forward pass with the scripted module + cos_scripted, sin_scripted = module_scripted(sample, T) + new_query_scripted, new_key_scripted = apply_rotary_scripted(query, query, cos_scripted, sin_scripted) + + # Ensure the outputs are the same + self.assertTrue(torch.allclose(cos_original, cos_scripted)) + self.assertTrue(torch.allclose(sin_original, sin_scripted)) + self.assertTrue(torch.allclose(new_query, new_query_scripted)) + self.assertTrue(torch.allclose(new_key, new_key_scripted)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_sequence_generator.py b/tests/test_sequence_generator.py index c890b655ff..2e42df0e56 100644 --- a/tests/test_sequence_generator.py +++ b/tests/test_sequence_generator.py @@ -4,18 +4,21 @@ # LICENSE file in the root directory of this source tree. 
import argparse +import math import tempfile import unittest -import tests.utils as test_utils +import numpy as np import torch + +import tests.utils as test_utils from fairseq import search from fairseq.data.dictionary import Dictionary from fairseq.models.transformer import TransformerModel +from fairseq.ngram_repeat_block import NGramRepeatBlock from fairseq.sequence_generator import EnsembleModel, SequenceGenerator from fairseq.tasks.fairseq_task import LegacyFairseqTask - DEFAULT_TEST_VOCAB_SIZE = 100 @@ -41,7 +44,7 @@ def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): dummy_dict = Dictionary() # add dummy symbol to satisfy vocab size for id, _ in enumerate(range(vocab_size)): - dummy_dict.add_symbol("{}".format(id), 1000) + dummy_dict.add_symbol("{}".format(id), n=1000) return dummy_dict @@ -107,30 +110,27 @@ def _test_save_and_load(self, scripted_module): torch.jit.load(f.name) -class TestJitSequeneceGenerator(TestJitSequenceGeneratorBase): - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) +JIT_MSG = "Targeting OSS scriptability for the 1.6 release" + + +@unittest.skipIf(torch.__version__ < "1.6.0", JIT_MSG) +class TestJitSequenceGenerator(TestJitSequenceGeneratorBase): def test_export_transformer(self): model = self.transformer_model torch.jit.script(model) - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) def test_ensemble_sequence_generator(self): model = self.transformer_model generator = SequenceGenerator( - [model], self.task.tgt_dict, beam_size=2, no_repeat_ngram_size=2 + [model], + self.task.tgt_dict, + beam_size=2, + no_repeat_ngram_size=2, + max_len_b=10, ) scripted_model = torch.jit.script(generator) self._test_save_and_load(scripted_model) - -class TestJitEnsemble(TestJitSequenceGeneratorBase): - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) def test_export_ensemble_model(self): model = self.transformer_model ensemble_models = EnsembleModel([model]) @@ -185,7 +185,7 @@ def assertTensorEqual(self, t1, t2): self.assertEqual(t1.ne(t2).long().sum(), 0) -class TestSequeneceGenerator(TestSequenceGeneratorBase): +class TestSequenceGenerator(TestSequenceGeneratorBase): def setUp(self): ( self.tgt_dict, @@ -320,12 +320,109 @@ def test_generation_with_additional_input(self): sample = self.sample.copy() sample["net_input"]["fancy_other_input"] = sample["net_input"]["src_tokens"] hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 + eos, w1 = self.tgt_dict.eos(), self.w1 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 1.0]) +@unittest.skipUnless(torch.cuda.is_available(), "") +class TestRepeatNgramBlocking(TestSequenceGeneratorBase): + @classmethod + def setUpClass(cls): + ( + cls.tgt_dict, + cls.w1, + cls.w2, + src_tokens, + src_lengths, + cls.model, + ) = test_utils.sequence_generator_setup() + return cls + + def test_finds_repetitive_tokens(self): + bsz, vocab_size, beam_size, step = 2, 4, 1, 3 + generated_tok = torch.tensor( + [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda" + ) + lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda") + desired_result = lprobs.new_tensor( + [[0.0, 0.0, -math.inf, 0.0], [0.0, 0.0, 0.0, -math.inf]] + ) + + cuda_ext_result, baseline_result = self._compare_cuda_ext_to_default_implem( + bsz, beam_size, generated_tok, lprobs, step, 2 + ) + 
self.assertTensorEqual(cuda_ext_result, desired_result) + self.assertTensorEqual(baseline_result, desired_result) + + @unittest.skipIf(torch.__version__ < "1.6.0", JIT_MSG) + def test_jit_no_extension(self): + bsz, vocab_size, beam_size, step = 2, 4, 1, 3 + generated_tok = torch.tensor( + [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda" + ) + lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda") + blocker = NGramRepeatBlock(2, use_extension=False) + base_result = blocker(generated_tok, lprobs.clone(), bsz, beam_size, step) + scripted_blocker = torch.jit.script(blocker) + jit_result = scripted_blocker( + generated_tok, lprobs.clone(), bsz, beam_size, step + ) + self.assertTensorEqual(base_result, jit_result) + + def test_ngram_blocking_same_as_default_implem(self): + """Test that cuda extension returns same things as default impl in many settings.""" + vocab_size = 4 + step = 6 + for _ in range(2): + block_param = np.random.choice([1, 2, 3, 4]) + batch_size = np.random.randint(1, 8) + beam_size = np.random.choice([1, 2, 4, 8]) + lprobs = torch.zeros((beam_size * batch_size, vocab_size), device="cuda") + + generated_tok = torch.tensor( + np.random.randint( + 0, vocab_size, size=(batch_size * beam_size, step + 1) + ), + device="cuda", + dtype=torch.long, + ) + self._compare_cuda_ext_to_default_implem( + batch_size, + beam_size, + generated_tok, + lprobs, + step, + block_param, + ) + + def _compare_cuda_ext_to_default_implem( + self, bsz, beam_size, generated_tok, lprobs, step, block_param + ): + """Assert that cuda extension and default implem return the same thing.""" + blocker = NGramRepeatBlock(block_param) + assert blocker.use_extension, "Extension not compiled" + cuda_ext_result = blocker( + generated_tok, + lprobs.clone(), + bsz, + beam_size, + step, + ) + blocker.use_extension = False + baseline_result = blocker( + generated_tok, + lprobs.clone(), + bsz, + beam_size, + step, + ) + self.assertTensorEqual(cuda_ext_result, baseline_result) + blocker.use_extension = True + return cuda_ext_result, baseline_result + + class TestDiverseBeamSearch(TestSequenceGeneratorBase): def setUp(self): # construct dummy dictionary diff --git a/tests/test_token_block_dataset.py b/tests/test_token_block_dataset.py index ea315b4e67..c4d7b76dcd 100644 --- a/tests/test_token_block_dataset.py +++ b/tests/test_token_block_dataset.py @@ -74,6 +74,19 @@ def test_complete_break_mode(self): self.assertEqual(ds[1].tolist(), [5, 1, 1]) self.assertEqual(ds[2].tolist(), [6, 1]) + def test_4billion_tokens(self): + """Regression test for numpy type promotion issue https://github.com/numpy/numpy/issues/5745""" + data = [torch.tensor(list(range(10000)), dtype=torch.long)] * 430000 + ds = self._build_dataset( + data, block_size=6, pad=0, eos=1, break_mode="complete" + ) + ds[-1] # __getitem__ works + start, end = ds.slice_indices[-1] + assert end > 4294967295 # data must be sufficiently large to overflow uint32 + assert not isinstance( + end + 1, float + ) # this would also raise, since np.uint64(1) + 1 => 2.0 + if __name__ == "__main__": unittest.main() diff --git a/tests/test_train.py b/tests/test_train.py index 57daa194b2..02ef94cc5b 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -61,6 +61,7 @@ def get_mock_cfg(finetune_from_model): cfg_mock = OmegaConf.create( { "checkpoint": { + "save_dir": None, "optimizer_overrides": "{}", "reset_dataloader": False, "reset_meters": False, @@ -68,6 +69,7 @@ def get_mock_cfg(finetune_from_model): "reset_lr_scheduler": False, 
"finetune_from_model": finetune_from_model, "model_parallel_size": 1, + "restore_file": "checkpoint_last.pt", }, "common": { "model_parallel_size": 1, diff --git a/tests/test_transformer.py b/tests/test_transformer.py new file mode 100644 index 0000000000..de5c5bdbd4 --- /dev/null +++ b/tests/test_transformer.py @@ -0,0 +1,65 @@ +import argparse +import unittest +from typing import Any, Dict, Sequence + +import torch +from fairseq.models import transformer + +from tests.test_roberta import FakeTask + + +def mk_sample(tok: Sequence[int] = None, batch_size: int = 2) -> Dict[str, Any]: + if not tok: + tok = [10, 11, 12, 13, 14, 15, 2] + + batch = torch.stack([torch.tensor(tok, dtype=torch.long)] * batch_size) + sample = { + "net_input": { + "src_tokens": batch, + "prev_output_tokens": batch, + "src_lengths": torch.tensor( + [len(tok)] * batch_size, dtype=torch.long, device=batch.device + ), + }, + "target": batch[:, 1:], + } + return sample + + +def mk_transformer(**extra_args: Any): + overrides = { + # Use characteristics dimensions + "encoder_embed_dim": 12, + "encoder_ffn_embed_dim": 14, + "decoder_embed_dim": 12, + "decoder_ffn_embed_dim": 14, + # Disable dropout so we have comparable tests. + "dropout": 0, + "attention_dropout": 0, + "activation_dropout": 0, + "encoder_layerdrop": 0, + } + overrides.update(extra_args) + # Overrides the defaults from the parser + args = argparse.Namespace(**overrides) + transformer.tiny_architecture(args) + + torch.manual_seed(0) + task = FakeTask(args) + return transformer.TransformerModel.build_model(args, task) + + +class TransformerTestCase(unittest.TestCase): + def test_forward_backward(self): + model = mk_transformer(encoder_embed_dim=12, decoder_embed_dim=12) + sample = mk_sample() + o, _ = model.forward(**sample["net_input"]) + loss = o.sum() + loss.backward() + + def test_different_encoder_decoder_embed_dim(self): + model = mk_transformer(encoder_embed_dim=12, decoder_embed_dim=16) + sample = mk_sample() + o, _ = model.forward(**sample["net_input"]) + loss = o.sum() + loss.backward() diff --git a/tests/test_valid_subset_checks.py b/tests/test_valid_subset_checks.py new file mode 100644 index 0000000000..c39fb89823 --- /dev/null +++ b/tests/test_valid_subset_checks.py @@ -0,0 +1,143 @@ +import os +import shutil +import tempfile +import unittest + +from fairseq import options +from fairseq.dataclass.utils import convert_namespace_to_omegaconf +from fairseq.data.data_utils import raise_if_valid_subsets_unintentionally_ignored +from .utils import create_dummy_data, preprocess_lm_data, train_language_model + + +def make_lm_config( + data_dir=None, + extra_flags=None, + task="language_modeling", + arch="transformer_lm_gpt2_tiny", +): + task_args = [task] + if data_dir is not None: + task_args += [data_dir] + train_parser = options.get_training_parser() + train_args = options.parse_args_and_arch( + train_parser, + [ + "--task", + *task_args, + "--arch", + arch, + "--optimizer", + "adam", + "--lr", + "0.0001", + "--max-tokens", + "500", + "--tokens-per-sample", + "500", + "--save-dir", + data_dir, + "--max-epoch", + "1", + ] + + (extra_flags or []), + ) + cfg = convert_namespace_to_omegaconf(train_args) + return cfg + + +def write_empty_file(path): + with open(path, "w"): + pass + assert os.path.exists(path) + + +class TestValidSubsetsErrors(unittest.TestCase): + """Test various filesystem, clarg combinations and ensure that error raising happens as expected""" + + def _test_case(self, paths, extra_flags): + with tempfile.TemporaryDirectory() as 
data_dir: + [ + write_empty_file(os.path.join(data_dir, f"{p}.bin")) + for p in paths + ["train"] + ] + cfg = make_lm_config(data_dir, extra_flags=extra_flags) + raise_if_valid_subsets_unintentionally_ignored(cfg) + + def test_default_raises(self): + with self.assertRaises(ValueError): + self._test_case(["valid", "valid1"], []) + with self.assertRaises(ValueError): + self._test_case( + ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"] + ) + + def partially_specified_valid_subsets(self): + with self.assertRaises(ValueError): + self._test_case( + ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"] + ) + # Fix with ignore unused + self._test_case( + ["valid", "valid1", "valid2"], + ["--valid-subset", "valid,valid1", "--ignore-unused-valid-subsets"], + ) + + def test_legal_configs(self): + self._test_case(["valid"], []) + self._test_case(["valid", "valid1"], ["--ignore-unused-valid-subsets"]) + self._test_case(["valid", "valid1"], ["--combine-val"]) + self._test_case(["valid", "valid1"], ["--valid-subset", "valid,valid1"]) + self._test_case(["valid", "valid1"], ["--valid-subset", "valid1"]) + self._test_case( + ["valid", "valid1"], ["--combine-val", "--ignore-unused-valid-subsets"] + ) + self._test_case( + ["valid1"], ["--valid-subset", "valid1"] + ) # valid.bin doesn't need to be ignored. + + def test_disable_validation(self): + self._test_case([], ["--disable-validation"]) + self._test_case(["valid", "valid1"], ["--disable-validation"]) + + def test_dummy_task(self): + cfg = make_lm_config(task="dummy_lm") + raise_if_valid_subsets_unintentionally_ignored(cfg) + + def test_masked_dummy_task(self): + cfg = make_lm_config(task="dummy_masked_lm") + raise_if_valid_subsets_unintentionally_ignored(cfg) + + +class TestCombineValidSubsets(unittest.TestCase): + def _train(self, extra_flags): + with self.assertLogs() as logs: + with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir: + create_dummy_data(data_dir, num_examples=20) + preprocess_lm_data(data_dir) + + shutil.copyfile(f"{data_dir}/valid.bin", f"{data_dir}/valid1.bin") + shutil.copyfile(f"{data_dir}/valid.idx", f"{data_dir}/valid1.idx") + train_language_model( + data_dir, + "transformer_lm", + ["--max-update", "0", "--log-format", "json"] + extra_flags, + run_validation=False, + ) + return [x.message for x in logs.records] + + def test_combined(self): + flags = ["--combine-valid-subsets", "--required-batch-size-multiple", "1"] + logs = self._train(flags) + assert any(["valid1" in x for x in logs]) # loaded 100 examples from valid1 + assert not any(["valid1_ppl" in x for x in logs]) # metrics are combined + + def test_subsets(self): + flags = [ + "--valid-subset", + "valid,valid1", + "--required-batch-size-multiple", + "1", + ] + logs = self._train(flags) + assert any(["valid_ppl" in x for x in logs]) # loaded 100 examples from valid1 + assert any(["valid1_ppl" in x for x in logs]) # metrics are combined diff --git a/tests/utils.py b/tests/utils.py index a145aa587d..af3f714ed1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,16 +4,23 @@ # LICENSE file in the root directory of this source tree. 
import argparse +import json import os import random +import shutil +import string import sys +import typing as tp from io import StringIO import torch import torch.nn.functional as F + +import fairseq.distributed.utils as distributed_utils from fairseq import options, utils from fairseq.data import Dictionary from fairseq.data.language_pair_dataset import collate +from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.models import ( FairseqEncoder, FairseqEncoderDecoderModel, @@ -159,11 +166,13 @@ def sequence_generator_setup(): return tgt_dict, w1, w2, src_tokens, src_lengths, model -def create_dummy_data(data_dir, num_examples=100, maxlen=20, alignment=False): - def _create_dummy_data(filename): +def create_dummy_data( + data_dir, num_examples=100, maxlen=20, alignment=False, languages=None +): + def _create_dummy_data(dir, filename): data = torch.rand(num_examples * maxlen) data = 97 + torch.floor(26 * data).int() - with open(os.path.join(data_dir, filename), "w") as h: + with open(os.path.join(dir, filename), "w") as h: offset = 0 for _ in range(num_examples): ex_len = random.randint(1, maxlen) @@ -190,12 +199,23 @@ def _create_dummy_alignment_data(filename_src, filename_tgt, filename): ) print(ex_str, file=h) - _create_dummy_data("train.in") - _create_dummy_data("train.out") - _create_dummy_data("valid.in") - _create_dummy_data("valid.out") - _create_dummy_data("test.in") - _create_dummy_data("test.out") + files_to_write = [ + "train.in", + "train.out", + "valid.in", + "valid.out", + "test.in", + "test.out", + ] + if languages is None: # En only dummy dataset + for f in files_to_write: + _create_dummy_data(data_dir, f) + else: + for lang in languages: + lang_dir = os.path.join(data_dir, lang) + os.makedirs(lang_dir, exist_ok=True) + for f in files_to_write: + _create_dummy_data(lang_dir, f) if alignment: _create_dummy_alignment_data("train.in", "train.out", "train.align") @@ -203,22 +223,45 @@ def _create_dummy_alignment_data(filename_src, filename_tgt, filename): _create_dummy_alignment_data("test.in", "test.out", "test.align") -def preprocess_lm_data(data_dir): +def preprocess_lm_data(data_dir, languages=None): preprocess_parser = options.get_preprocessing_parser() - preprocess_args = preprocess_parser.parse_args( - [ - "--only-source", - "--trainpref", - os.path.join(data_dir, "train.out"), - "--validpref", - os.path.join(data_dir, "valid.out"), - "--testpref", - os.path.join(data_dir, "test.out"), - "--destdir", - data_dir, - ] - ) - preprocess.main(preprocess_args) + if languages is None: + preprocess_args = preprocess_parser.parse_args( + [ + "--only-source", + "--trainpref", + os.path.join(data_dir, "train.out"), + "--validpref", + os.path.join(data_dir, "valid.out"), + "--testpref", + os.path.join(data_dir, "test.out"), + "--destdir", + data_dir, + ] + ) + preprocess.main(preprocess_args) + else: + for lang in languages: + lang_dir = os.path.join(data_dir, lang) + assert os.path.exists(lang_dir) + preprocess_args = preprocess_parser.parse_args( + [ + "--only-source", + "--trainpref", + os.path.join(lang_dir, "train.out"), + "--validpref", + os.path.join(lang_dir, "valid.out"), + "--testpref", + os.path.join(lang_dir, "test.out"), + "--destdir", + lang_dir, + ] + ) + preprocess.main(preprocess_args) + shutil.copyfile( + os.path.join(data_dir, languages[0], "dict.txt"), + os.path.join(data_dir, "dict.txt"), + ) def preprocess_translation_data(data_dir, extra_flags=None): @@ -274,6 +317,43 @@ def preprocess_summarization_data(data_dir, 
extra_flags=None): preprocess.main(preprocess_args) +def create_laser_data_and_config_json(data_dir): + src_langs = ["de", "fr", "ru", "tr", "zh"] + tgt_langs = ["en", "es"] + config_json = {} + config_train_json = [] + src_vocab = None + tgt_vocab = None + + for src_lang in src_langs: + for tgt_lang in tgt_langs: + langpair_folder = f"{src_lang}-{tgt_lang}" + + langpair_path = os.path.join(data_dir, langpair_folder) + os.mkdir(langpair_path) + create_dummy_data(langpair_path) + preprocess_translation_data(langpair_path, ["--dataset-impl", "cached"]) + + src_vocab = os.path.join(langpair_path, "dict.in.txt") + tgt_vocab = os.path.join(langpair_path, "dict.out.txt") + config_train_json.append( + { + "id": 0 if tgt_lang == "en" else 1, + "src": os.path.join(langpair_path, "train.in-out.in"), + "tgt": os.path.join(langpair_path, "train.in-out.out"), + } + ) + + config_json["src_vocab"] = src_vocab + config_json["tgt_vocab"] = tgt_vocab + config_json["train"] = config_train_json + + with open(os.path.join(data_dir, "laserconfig.json"), "w") as config_file: + json.dump(config_json, config_file) + + return config_file + + def train_translation_model( data_dir, arch, @@ -282,6 +362,7 @@ def train_translation_model( run_validation=False, lang_flags=None, extra_valid_flags=None, + world_size=1, ): if lang_flags is None: lang_flags = [ @@ -311,14 +392,16 @@ def train_translation_model( "1", "--no-progress-bar", "--distributed-world-size", - "1", + str(world_size), "--num-workers", "0", ] + lang_flags + (extra_flags or []), ) - train.main(train_args) + + cfg = convert_namespace_to_omegaconf(train_args) + distributed_utils.call_main(cfg, train.main) if run_validation: # test validation @@ -345,18 +428,20 @@ def train_translation_model( validate.main(validate_args) -def generate_main(data_dir, extra_flags=None): +def generate_main(data_dir, extra_flags=None, path=None): if extra_flags is None: extra_flags = [ "--print-alignment", ] + if path is None: + path = os.path.join(data_dir, "checkpoint_last.pt") generate_parser = options.get_generation_parser() generate_args = options.parse_args_and_arch( generate_parser, [ data_dir, "--path", - os.path.join(data_dir, "checkpoint_last.pt"), + path, "--beam", "3", "--batch-size", @@ -409,7 +494,7 @@ def __init__(self, args, src_dict, tgt_dict, model): def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None): return cls(args, src_dict, tgt_dict, model) - def build_model(self, args): + def build_model(self, args, from_checkpoint=False): return TestModel.build_model(args, self) @property @@ -606,3 +691,107 @@ def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): prev_output_tokens, encoder_out=encoder_out, **kwargs ) return decoder_out + + +def train_language_model( + data_dir, + arch, + extra_flags=None, + run_validation=False, + extra_valid_flags=None, + task="language_modeling", + world_size=1, +): + train_parser = options.get_training_parser() + train_args = options.parse_args_and_arch( + train_parser, + [ + "--task", + task, + data_dir, + "--arch", + arch, + "--optimizer", + "adam", + "--lr", + "0.0001", + "--max-tokens", + "500", + "--tokens-per-sample", + "500", + "--save-dir", + data_dir, + "--max-epoch", + "1", + "--no-progress-bar", + "--distributed-world-size", + str(world_size), + "--ddp-backend", + "no_c10d", + "--num-workers", + "0", + ] + + (extra_flags or []), + ) + cfg = convert_namespace_to_omegaconf(train_args) + distributed_utils.call_main(cfg, train.main) + + if run_validation: + # test validation + 
validate_parser = options.get_validation_parser() + validate_args = options.parse_args_and_arch( + validate_parser, + [ + "--task", + task, + data_dir, + "--path", + os.path.join(data_dir, "checkpoint_last.pt"), + "--valid-subset", + "valid", + "--max-tokens", + "500", + "--no-progress-bar", + "--num-workers", + "0", + ] + + (extra_valid_flags or []), + ) + validate.main(validate_args) + + +def sizes(data): + return [len(sentence) for sentence in data] + + +POPULATION = string.ascii_letters + string.digits + + +def make_sentence() -> tp.List[str]: + length = random.randint(10, 50) + return random.choices( + population=POPULATION, k=length, weights=range(1, len(POPULATION) + 1) + ) + + +def make_data(length=1000, out_file=None) -> tp.List[tp.List[str]]: + data = ( + [make_sentence() for _ in range(0, length)] + # add all the symbols at least once + + [list(string.ascii_letters), list(string.digits)] + ) + if out_file is not None: + with open(out_file, "w", encoding="utf-8") as out: + for s in data: + print(" ".join(s), file=out) + + return data + + +def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary: + d = Dictionary() + for s in data: + for token in s: + d.add_symbol(token) + d.finalize() + return d