diff --git a/.cardboardlint.yml b/.cardboardlint.yml deleted file mode 100644 index 4a115a37cd..0000000000 --- a/.cardboardlint.yml +++ /dev/null @@ -1,5 +0,0 @@ -linters: -- pylint: - # pylintrc: pylintrc - filefilter: ['- test_*.py', '+ *.py', '- *.npy'] - # exclude: \ No newline at end of file diff --git a/.github/workflows/aux_tests.yml b/.github/workflows/aux_tests.yml deleted file mode 100644 index f4cb3ecfe1..0000000000 --- a/.github/workflows/aux_tests.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: aux-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_aux diff --git a/.github/workflows/data_tests.yml b/.github/workflows/data_tests.yml deleted file mode 100644 index 3d1e3f8c4d..0000000000 --- a/.github/workflows/data_tests.yml +++ /dev/null @@ -1,51 
+0,0 @@ -name: data-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make data_tests diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml deleted file mode 100644 index d2159027b6..0000000000 --- a/.github/workflows/inference_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: inference_tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: | - export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make inference_tests diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 2bbcf3cd70..f81f5a7493 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -10,7 +10,7 @@ jobs: build-sdist: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Verify tag matches version run: | set -ex @@ -19,7 +19,7 @@ jobs: if [[ "v$version" != "$tag" ]]; then exit 1 fi - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v5 with: python-version: 3.9 - run: | @@ -28,7 +28,7 @@ jobs: python -m build - run: | pip install dist/*.tar.gz - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: name: sdist path: dist/*.tar.gz @@ -38,8 +38,8 @@ jobs: matrix: python-version: ["3.9", 
"3.10", "3.11"] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install pip requirements @@ -50,45 +50,38 @@ jobs: run: | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 python -m pip install dist/*-manylinux*.whl - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v4 with: name: wheel-${{ matrix.python-version }} path: dist/*-manylinux*.whl publish-artifacts: runs-on: ubuntu-20.04 needs: [build-sdist, build-wheels] + environment: + name: release + url: https://pypi.org/p/coqui-tts + permissions: + id-token: write steps: - run: | mkdir dist - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "sdist" path: "dist/" - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "wheel-3.9" path: "dist/" - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "wheel-3.10" path: "dist/" - - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v4 with: name: "wheel-3.11" path: "dist/" - run: | ls -lh dist/ - - name: Setup PyPI config - run: | - cat << EOF > ~/.pypirc - [pypi] - username=__token__ - password=${{ secrets.PYPI_TOKEN }} - EOF - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - run: | - python -m pip install twine - - run: | - twine upload --repository pypi dist/* + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index b7c6393baa..e21feeb7f6 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -7,12 +7,6 @@ on: pull_request: types: [opened, synchronize, reopened] jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - test: runs-on: ubuntu-latest strategy: @@ -29,18 +23,7 @@ jobs: architecture: x64 cache: 'pip' cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Style check - run: make style + - name: Install/upgrade dev dependencies + run: python3 -m pip install -r requirements.dev.txt + - name: Lint check + run: make lint diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000..b056e3073d --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,79 @@ +name: tests + +on: + push: + branches: + - main + pull_request: + types: [opened, synchronize, reopened] +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + subset: ["data_tests", "inference_tests", "test_aux", "test_text", "test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: 'requirements*' + - name: check OS + run: cat /etc/os-release + - name: set ENV + run: echo "TRAINER_TELEMETRY=0" >> "$GITHUB_ENV"  # a plain export ends with this step's shell; GITHUB_ENV carries the value to later steps + - name: Install Espeak + if: contains(fromJSON('["inference_tests", "test_text", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + run: | + sudo apt-get update + sudo apt-get install -y espeak espeak-ng + - name: Install 
dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install/upgrade Python setup deps + run: python3 -m pip install --upgrade pip setuptools wheel + - name: Replace scarf urls + if: contains(fromJSON('["data_tests", "inference_tests", "test_aux", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + run: | + sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json + - name: Install TTS + run: | + python3 -m pip install .[all] + python3 setup.py egg_info + - name: Unit tests + run: make ${{ matrix.subset }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} + path: .coverage.* + include-hidden-files: true  # .coverage.* are dotfiles, which upload-artifact v4 skips unless this is set + if-no-files-found: ignore + coverage: + if: always() + needs: test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - uses: actions/download-artifact@v4 + with: + pattern: coverage-data-* + merge-multiple: true + - name: Combine coverage + run: | + python -Im pip install --upgrade coverage[toml] + + python -Im coverage combine + python -Im coverage html --skip-covered --skip-empty + + python -Im coverage report --format=markdown >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml deleted file mode 100644 index 78d3026d7f..0000000000 --- a/.github/workflows/text_tests.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: text-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_text diff --git a/.github/workflows/tts_tests.yml b/.github/workflows/tts_tests.yml deleted file mode 100644 index 5074cded6d..0000000000 --- a/.github/workflows/tts_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: tts-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_tts diff --git a/.github/workflows/tts_tests2.yml b/.github/workflows/tts_tests2.yml deleted file mode 100644 index f64433f8df..0000000000 --- a/.github/workflows/tts_tests2.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: tts-tests2 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_tts2 diff --git a/.github/workflows/vocoder_tests.yml b/.github/workflows/vocoder_tests.yml deleted file mode 100644 index 6519ee3fef..0000000000 --- a/.github/workflows/vocoder_tests.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: vocoder-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_vocoder diff --git a/.github/workflows/xtts_tests.yml b/.github/workflows/xtts_tests.yml deleted file mode 100644 index be367f3547..0000000000 --- a/.github/workflows/xtts_tests.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: xtts-tests - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends git make gcc - sudo apt-get install espeak - sudo apt-get install espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: make test_xtts diff --git a/.github/workflows/zoo_tests0.yml b/.github/workflows/zoo_tests0.yml deleted file mode 100644 index 13f47a938b..0000000000 --- a/.github/workflows/zoo_tests0.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: zoo-tests-0 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: | - nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3 - nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion diff --git a/.github/workflows/zoo_tests1.yml b/.github/workflows/zoo_tests1.yml deleted file mode 100644 index 00f13397fa..0000000000 --- a/.github/workflows/zoo_tests1.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: zoo-tests-1 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3 diff --git a/.github/workflows/zoo_tests2.yml b/.github/workflows/zoo_tests2.yml deleted file mode 100644 index 310a831a8b..0000000000 --- a/.github/workflows/zoo_tests2.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: zoo-tests-2 - -on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] -jobs: - check_skip: - runs-on: ubuntu-latest - if: "! 
contains(github.event.head_commit.message, '[ci skip]')" - steps: - - run: echo "${{ github.event.head_commit.message }}" - - test: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: [3.9, "3.10", "3.11"] - experimental: [false] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - architecture: x64 - cache: 'pip' - cache-dependency-path: 'requirements*' - - name: check OS - run: cat /etc/os-release - - name: set ENV - run: export TRAINER_TELEMETRY=0 - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y git make gcc - sudo apt-get install espeak espeak-ng - make system-deps - - name: Install/upgrade Python setup deps - run: python3 -m pip install --upgrade pip setuptools wheel - - name: Replace scarf urls - run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json - - name: Install TTS - run: | - python3 -m pip install .[all] - python3 setup.py egg_info - - name: Unit tests - run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 911f2a838e..eeb02fde88 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,27 +1,19 @@ repos: - - repo: 'https://github.com/pre-commit/pre-commit-hooks' - rev: v2.3.0 + - repo: "https://github.com/pre-commit/pre-commit-hooks" + rev: v4.5.0 hooks: - id: check-yaml - - id: end-of-file-fixer - - id: trailing-whitespace - - repo: 'https://github.com/psf/black' - rev: 22.3.0 + # TODO: enable these later; there are plenty of violating + # files that need to be fixed first + # - id: end-of-file-fixer + # - id: trailing-whitespace + - repo: "https://github.com/psf/black" + rev: 24.2.0 hooks: - id: black language_version: python3 - - repo: 
https://github.com/pycqa/isort - rev: 5.8.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.0 hooks: - - id: isort - name: isort (python) - - id: isort - name: isort (cython) - types: [cython] - - id: isort - name: isort (pyi) - types: [pyi] - - repo: https://github.com/pycqa/pylint - rev: v2.8.2 - hooks: - - id: pylint + - id: ruff + args: [--fix, --exit-non-zero-on-fix] diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 49a9dbdd2c..0000000000 --- a/.pylintrc +++ /dev/null @@ -1,599 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-whitelist= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. -jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. 
Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=missing-docstring, - too-many-public-methods, - too-many-lines, - bare-except, - ## for avoiding weird p3.6 CI linter error - ## TODO: see later if we can remove this - assigning-non-slot, - unsupported-assignment-operation, - ## end - line-too-long, - fixme, - wrong-import-order, - ungrouped-imports, - wrong-import-position, - import-error, - invalid-name, - too-many-instance-attributes, - arguments-differ, - arguments-renamed, - no-name-in-module, - no-member, - unsubscriptable-object, - print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - useless-object-inheritance, - too-few-public-methods, - too-many-branches, - too-many-arguments, - too-many-locals, - too-many-statements, - apply-builtin, - 
basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - duplicate-code, - not-callable, - import-outside-toplevel, - logging-fstring-interpolation, - logging-not-lazy - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[REPORTS] - -# Python expression which should return a note less than 10 (10 is the highest -# note). 
You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit - - -[LOGGING] - -# Format style used to check logging format string. `old` means using % -# formatting, while `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package.. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. 
-spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=numpy.*,torch.* - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. 
-ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
-expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=120 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=4 - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. -argument-rgx=[a-z_][a-z0-9_]{0,30}$ - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names= - -# Naming style matching correct class attribute names. 
-class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. -#class-attribute-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. -good-names=i, - j, - k, - x, - ex, - Run, - _ - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. -#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. 
Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. -variable-rgx=[a-z_][a-z0-9_]{0,30}$ - - -[STRING] - -# This flag controls whether the implicit-str-concat-in-sequence should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=no - - -[IMPORTS] - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp - -# List of member names, which should be excluded from the protected access -# warning. 
-exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement. -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=15 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". -overgeneral-exceptions=BaseException, - Exception diff --git a/CITATION.cff b/CITATION.cff index 6b0c8f19af..28eb65e23c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -10,8 +10,8 @@ authors: version: 1.4 doi: 10.5281/zenodo.6334862 license: "MPL-2.0" -url: "https://www.coqui.ai" -repository-code: "https://github.com/coqui-ai/TTS" +url: "https://github.com/eginhard/coqui-tts" +repository-code: "https://github.com/eginhard/coqui-tts" keywords: - machine learning - deep learning diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ae0ce46048..8a0fe3904a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Welcome to the 🐸TTS! -This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md). 
+This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/eginhard/coqui-tts/blob/main/CODE_OF_CONDUCT.md). ## Where to start. We welcome everyone who likes to contribute to 🐸TTS. @@ -15,13 +15,13 @@ If you like to contribute code, squash a bug but if you don't know where to star You can pick something out of our road map. We keep the progess of the project in this simple issue thread. It has new model proposals or developmental updates etc. -- [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues) +- [Github Issues Tracker](https://github.com/eginhard/coqui-tts/issues) This is a place to find feature requests, bugs. Issues with the ```good first issue``` tag are good place for beginners to take on. -- ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag. +- ✨**PR**✨ [pages](https://github.com/eginhard/coqui-tts/pulls) with the ```🚀new version``` tag. We list all the target improvements for the next version. You can pick one of them and start contributing. @@ -46,14 +46,14 @@ Let us know if you encounter a problem along the way. The following steps are tested on an Ubuntu system. -1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page. +1. Fork 🐸TTS[https://github.com/eginhard/coqui-tts] by clicking the fork button at the top right corner of the project page. 2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```. ```bash - $ git clone git@github.com:/TTS.git - $ cd TTS - $ git remote add upstream https://github.com/coqui-ai/TTS.git + $ git clone git@github.com:/coqui-tts.git + $ cd coqui-tts + $ git remote add upstream https://github.com/eginhard/coqui-tts.git ``` 3. Install 🐸TTS for development. @@ -82,13 +82,13 @@ The following steps are tested on an Ubuntu system. $ make test_all # run all the tests, report all the errors ``` -9. Format your code. 
We use ```black``` for code and ```isort``` for ```import``` formatting. +9. Format your code. We use ```black``` for code formatting. ```bash $ make style ``` -10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions. +10. Run the linter and correct the issues raised. We use ```ruff``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions. ```bash $ make lint @@ -105,7 +105,7 @@ The following steps are tested on an Ubuntu system. ```bash $ git fetch upstream - $ git rebase upstream/master + $ git rebase upstream/main # or for the development version $ git rebase upstream/dev ``` @@ -124,7 +124,7 @@ The following steps are tested on an Ubuntu system. 13. Let's discuss until it is perfect. đŸ’Ē - We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/coqui-ai/TTS/pulls]. + We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/eginhard/coqui-tts/pulls]. 14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version. @@ -132,14 +132,14 @@ The following steps are tested on an Ubuntu system. If you prefer working within a Docker container as your development environment, you can do the following: -1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page. +1. Fork 🐸TTS[https://github.com/eginhard/coqui-tts] by clicking the fork button at the top right corner of the project page. 2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. ```bash - $ git clone git@github.com:/TTS.git - $ cd TTS - $ git remote add upstream https://github.com/coqui-ai/TTS.git + $ git clone git@github.com:/coqui-tts.git + $ cd coqui-tts + $ git remote add upstream https://github.com/eginhard/coqui-tts.git ``` 3. 
Build the Docker Image as your development environment (it installs all of the dependencies for you): diff --git a/Makefile b/Makefile index 7446848f46..a24c41fc0b 100644 --- a/Makefile +++ b/Makefile @@ -11,47 +11,50 @@ test_all: ## run tests and don't stop on an error. ./run_bash_tests.sh test: ## run tests. - nose2 -F -v -B --with-coverage --coverage TTS tests + coverage run -m nose2 -F -v -B tests test_vocoder: ## run vocoder tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests + coverage run -m nose2 -F -v -B tests.vocoder_tests test_tts: ## run tts tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests + coverage run -m nose2 -F -v -B tests.tts_tests test_tts2: ## run tts tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2 + coverage run -m nose2 -F -v -B tests.tts_tests2 test_xtts: - nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests + coverage run -m nose2 -F -v -B tests.xtts_tests test_aux: ## run aux tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests + coverage run -m nose2 -F -v -B tests.aux_tests ./run_bash_tests.sh -test_zoo: ## run zoo tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests +test_zoo0: ## run zoo tests. + coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \ + tests.zoo_tests.test_models.test_voice_conversion +test_zoo1: ## run zoo tests. + coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3 +test_zoo2: ## run zoo tests. + coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3 inference_tests: ## run inference tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests + coverage run -m nose2 -F -v -B tests.inference_tests data_tests: ## run data tests. - nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests + coverage run -m nose2 -F -v -B tests.data_tests test_text: ## run text tests. 
- nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests + coverage run -m nose2 -F -v -B tests.text_tests test_failed: ## only run tests failed the last time. - nose2 -F -v -B --with-coverage --coverage TTS tests + coverage run -m nose2 -F -v -B tests style: ## update code style. black ${target_dirs} - isort ${target_dirs} -lint: ## run pylint linter. - pylint ${target_dirs} +lint: ## run linters. + ruff check ${target_dirs} black ${target_dirs} --check - isort ${target_dirs} --check-only system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/README.md b/README.md index 891118c13d..782b48ab69 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,17 @@ ## 🐸Coqui.ai News - đŸ“Ŗ ⓍTTSv2 is here with 16 languages and better performance across the board. -- đŸ“Ŗ ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech). +- đŸ“Ŗ ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/eginhard/coqui-tts/tree/dev/recipes/ljspeech). - đŸ“Ŗ ⓍTTS can now stream with <200ms latency. -- đŸ“Ŗ ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html) -- đŸ“Ŗ [đŸļBark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html) +- đŸ“Ŗ ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/dev/models/xtts.html) +- đŸ“Ŗ [đŸļBark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. 
[Docs](https://coqui-tts.readthedocs.io/en/dev/models/bark.html) - đŸ“Ŗ You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. -- đŸ“Ŗ 🐸TTS now supports đŸĸTortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html) -- đŸ“Ŗ Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice) -- đŸ“Ŗ Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin). -- đŸ“Ŗ Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin). +- đŸ“Ŗ 🐸TTS now supports đŸĸTortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/dev/models/tortoise.html)
-## +## **🐸TTS is a library for advanced Text-to-Speech generation.** @@ -29,22 +26,14 @@ ______________________________________________________________________ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) [![License]()](https://opensource.org/licenses/MPL-2.0) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) -[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md) +[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/eginhard/coqui-tts/blob/main/CODE_OF_CONDUCT.md) [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts) [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg) 
-![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg) -![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg) -[![Docs]()](https://tts.readthedocs.io/en/latest/) +![GithubActions](https://github.com/eginhard/coqui-tts/actions/workflows/tests.yml/badge.svg) +![GithubActions](https://github.com/eginhard/coqui-tts/actions/workflows/docker.yaml/badge.svg) +![GithubActions](https://github.com/eginhard/coqui-tts/actions/workflows/style_check.yml/badge.svg) +[![Docs]()](https://coqui-tts.readthedocs.io/en/latest/)
@@ -60,8 +49,8 @@ Please use our dedicated channels for questions and discussion. Help is much mor | 👩‍đŸ’ģ **Usage Questions** | [GitHub Discussions] | | đŸ—¯ **General Discussion** | [GitHub Discussions] or [Discord] | -[github issue tracker]: https://github.com/coqui-ai/tts/issues -[github discussions]: https://github.com/coqui-ai/TTS/discussions +[github issue tracker]: https://github.com/eginhard/coqui-tts/issues +[github discussions]: https://github.com/eginhard/coqui-tts/discussions [discord]: https://discord.gg/5eXr5seRrv [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials @@ -69,19 +58,13 @@ Please use our dedicated channels for questions and discussion. Help is much mor ## 🔗 Links and Resources | Type | Links | | ------------------------------- | --------------------------------------- | -| đŸ’ŧ **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) -| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)| -| 👩‍đŸ’ģ **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)| +| đŸ’ŧ **Documentation** | [ReadTheDocs](https://coqui-tts.readthedocs.io/en/latest/) +| 💾 **Installation** | [TTS/README.md](https://github.com/eginhard/coqui-tts/tree/dev#installation)| +| 👩‍đŸ’ģ **Contributing** | [CONTRIBUTING.md](https://github.com/eginhard/coqui-tts/blob/main/CONTRIBUTING.md)| | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) -| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)| +| 🚀 **Released Models** | [Standard models](https://github.com/eginhard/coqui-tts/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/eginhard/coqui-tts#example-text-to-speech-using-fairseq-models-in-1100-languages-)| | 📰 **Papers** | [TTS 
Papers](https://github.com/erogol/TTS-papers)| - -## đŸĨ‡ TTS Performance -

- -Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices. - ## Features - High-performance Deep Learning models for Text2Speech tasks. - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech). @@ -149,17 +132,17 @@ You can also help us implement more models. ## Installation 🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**. -If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. +If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option. ```bash -pip install TTS +pip install coqui-tts ``` If you plan to code or train models, clone 🐸TTS and install it locally. ```bash -git clone https://github.com/coqui-ai/TTS -pip install -e .[all,dev,notebooks] # Select the relevant extras +git clone https://github.com/eginhard/coqui-tts +pip install -e .[all,dev,notebooks,server] # Select the relevant extras ``` If you are on Ubuntu (Debian), you can also run following commands for installation. @@ -169,7 +152,9 @@ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you $ make install ``` -If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system). +If you are on Windows, 👑@GuyPaddock wrote installation instructions +[here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system) +(note that these are out of date, e.g. you need to have at least Python 3.9). 
## Docker Image @@ -183,7 +168,8 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a s ``` You can then enjoy the TTS server [here](http://[::1]:5002/) -More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html) +More details about the docker images (like GPU support) can be found +[here](https://coqui-tts.readthedocs.io/en/latest/docker_images.html) ## Synthesizing speech by 🐸TTS diff --git a/TTS/.models.json b/TTS/.models.json index b349e7397b..a77ebea1cf 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -46,7 +46,7 @@ "hf_url": [ "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt", "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt", - "https://coqui.gateway.scarf.sh/hf/text_2.pt", + "https://coqui.gateway.scarf.sh/hf/bark/text_2.pt", "https://coqui.gateway.scarf.sh/hf/bark/config.json", "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt", "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth" diff --git a/TTS/VERSION b/TTS/VERSION index 2157409059..a723ece79b 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.22.0 +0.22.1 diff --git a/TTS/api.py b/TTS/api.py index 7abc188e74..992fbe69e9 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -1,15 +1,13 @@ import tempfile import warnings from pathlib import Path -from typing import Union -import numpy as np from torch import nn +from TTS.config import load_config from TTS.utils.audio.numpy_transforms import save_wav from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer -from TTS.config import load_config class TTS(nn.Module): @@ -99,7 +97,7 @@ def is_multi_lingual(self): isinstance(self.model_name, str) and "xtts" in self.model_name or self.config - and ("xtts" in self.config.model or len(self.config.languages) > 1) + and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1) ): return True if hasattr(self.synthesizer.tts_model, 
"language_manager") and self.synthesizer.tts_model.language_manager: @@ -122,8 +120,9 @@ def languages(self): def get_models_file_path(): return Path(__file__).parent / ".models.json" - def list_models(self): - return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False) + @staticmethod + def list_models(): + return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False).list_models() def download_model_by_name(self, model_name: str): model_path, config_path, model_item = self.manager.download_model(model_name) @@ -168,9 +167,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( - model_name - ) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) # init synthesizer # None values are fetch from the model @@ -231,7 +228,7 @@ def _check_arguments( raise ValueError("Model is not multi-speaker but `speaker` is provided.") if not self.is_multi_lingual and language is not None: raise ValueError("Model is not multi-lingual but `language` is provided.") - if not emotion is None and not speed is None: + if emotion is not None and speed is not None: raise ValueError("Emotion and speed can only be used with Coqui Studio models. 
Which is discontinued.") def tts( diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py index 662fcd02ec..32aa303e6e 100644 --- a/TTS/bin/collect_env_info.py +++ b/TTS/bin/collect_env_info.py @@ -1,4 +1,6 @@ """Get detailed info about the working environment.""" + +import json import os import platform import sys @@ -6,11 +8,10 @@ import numpy import torch -sys.path += [os.path.abspath(".."), os.path.abspath(".")] -import json - import TTS +sys.path += [os.path.abspath(".."), os.path.abspath(".")] + def system_info(): return { diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 9ab520be7d..faadf6901d 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -70,7 +70,7 @@ # if the vocabulary was passed, replace the default if "characters" in C.keys(): - symbols, phonemes = make_symbols(**C.characters) + symbols, phonemes = make_symbols(**C.characters) # noqa: F811 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index c6048626b3..16ad36b8dc 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -8,6 +8,7 @@ import torch from torch.utils.data import DataLoader from tqdm import tqdm +from trainer.generic_utils import count_parameters from TTS.config import load_config from TTS.tts.datasets import TTSDataset, load_tts_samples @@ -16,7 +17,6 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import quantize -from TTS.utils.generic_utils import count_parameters use_cuda = torch.cuda.is_available() diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index ea16974839..f476ca5ddb 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -1,9 +1,10 @@ """Find all the unique characters in a dataset""" + 
import argparse from argparse import RawTextHelpFormatter from TTS.config import load_config -from TTS.tts.datasets import load_tts_samples +from TTS.tts.datasets import find_unique_chars, load_tts_samples def main(): @@ -28,17 +29,7 @@ def main(): ) items = train_items + eval_items - - texts = "".join(item["text"] for item in items) - chars = set(texts) - lower_chars = filter(lambda c: c.islower(), chars) - chars_force_lower = [c.lower() for c in chars] - chars_force_lower = set(chars_force_lower) - - print(f" > Number of unique characters: {len(chars)}") - print(f" > Unique characters: {''.join(sorted(chars))}") - print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") - print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") + find_unique_chars(items) if __name__ == "__main__": diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 4bd7a78eef..48f2e7b740 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,4 +1,5 @@ """Find all the unique characters in a dataset""" + import argparse import multiprocessing from argparse import RawTextHelpFormatter @@ -13,7 +14,7 @@ def compute_phonemes(item): text = item["text"] ph = phonemizer.phonemize(text).replace("|", "") - return set(list(ph)) + return set(ph) def main(): diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index b86252ab67..b06c93f7d1 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -224,7 +224,7 @@ def main(): const=True, default=False, ) - + # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) @@ -379,10 +379,8 @@ def main(): if model_item["model_type"] == "tts_models": tts_path = model_path tts_config_path = config_path - if "default_vocoder" in model_item: - 
args.vocoder_name = ( - model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - ) + if args.vocoder_name is None and "default_vocoder" in model_item: + args.vocoder_name = model_item["default_vocoder"] # voice conversion model if model_item["model_type"] == "voice_conversion_models": diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 448fefc712..6a8cd7b444 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -8,6 +8,7 @@ import torch from torch.utils.data import DataLoader +from trainer.generic_utils import count_parameters, remove_experiment_folder from trainer.io import copy_model_files, save_best_model, save_checkpoint from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer @@ -18,7 +19,6 @@ from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import count_parameters, remove_experiment_folder from TTS.utils.samplers import PerfectBatchSampler from TTS.utils.training import check_update @@ -125,7 +125,7 @@ def evaluation(model, criterion, data_loader, global_step): def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() - best_loss = float("inf") + best_loss = {"train_loss": None, "eval_loss": float("inf")} avg_loader_time = 0 end_time = time.time() for epoch in range(c.epochs): @@ -248,7 +248,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, ) # save the best checkpoint best_loss = save_best_model( - eval_loss, + {"train_loss": None, "eval_loss": eval_loss}, best_loss, c, model, diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index 09582cea7c..a4b10009d7 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -1,4 +1,5 @@ """Search a good noise schedule for WaveGrad for a given number of inference iterations""" + import argparse from 
itertools import product as cartesian_product diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index c5a6dd68e2..5103f200b0 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -17,9 +17,12 @@ def read_json_with_comments(json_path): with fsspec.open(json_path, "r", encoding="utf-8") as f: input_str = f.read() # handle comments but not urls with // - input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str) + input_str = re.sub( + r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str + ) return json.loads(input_str) + def register_config(model_name: str) -> Coqpit: """Find the right config for the given model name. diff --git a/TTS/demos/xtts_ft_demo/utils/formatter.py b/TTS/demos/xtts_ft_demo/utils/formatter.py index 536faa0108..40e8b8ed32 100644 --- a/TTS/demos/xtts_ft_demo/utils/formatter.py +++ b/TTS/demos/xtts_ft_demo/utils/formatter.py @@ -1,23 +1,17 @@ -import os import gc -import torchaudio +import os + import pandas +import torch +import torchaudio from faster_whisper import WhisperModel -from glob import glob - from tqdm import tqdm -import torch -import torchaudio # torch.set_num_threads(1) - from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners torch.set_num_threads(16) - -import os - audio_types = (".wav", ".mp3", ".flac") @@ -25,9 +19,10 @@ def list_audios(basePath, contains=None): # return the set of files that are valid return list_files(basePath, validExts=audio_types, contains=contains) + def list_files(basePath, validExts=None, contains=None): # loop over the directory structure - for (rootDir, dirNames, filenames) in os.walk(basePath): + for rootDir, dirNames, filenames in os.walk(basePath): # loop over the filenames in the current directory for filename in filenames: # if the contains string is not none and the filename does not contain @@ -36,7 +31,7 @@ def list_files(basePath, 
validExts=None, contains=None): continue # determine the file extension of the current file - ext = filename[filename.rfind("."):].lower() + ext = filename[filename.rfind(".") :].lower() # check to see if the file is an audio and should be processed if validExts is None or ext.endswith(validExts): @@ -44,13 +39,22 @@ def list_files(basePath, validExts=None, contains=None): audioPath = os.path.join(rootDir, filename) yield audioPath -def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None): + +def format_audio_list( + audio_files, + target_language="en", + out_path=None, + buffer=0.2, + eval_percentage=0.15, + speaker_name="coqui", + gradio_progress=None, +): audio_total_size = 0 # make sure that ooutput file exists os.makedirs(out_path, exist_ok=True) # Loading Whisper - device = "cuda" if torch.cuda.is_available() else "cpu" + device = "cuda" if torch.cuda.is_available() else "cpu" print("Loading Whisper Model!") asr_model = WhisperModel("large-v2", device=device, compute_type="float16") @@ -69,7 +73,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 wav = torch.mean(wav, dim=0, keepdim=True) wav = wav.squeeze() - audio_total_size += (wav.size(-1) / sr) + audio_total_size += wav.size(-1) / sr segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language) segments = list(segments) @@ -94,7 +98,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 # get previous sentence end previous_word_end = words_list[word_idx - 1].end # add buffer or get the silence midle between the previous sentence and the current one - sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2) + sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2) sentence = word.word first_word = False @@ -118,19 +122,16 @@ def 
format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 # Average the current word end and next word start word_end = min((word.end + next_word_start) / 2, word.end + buffer) - + absoulte_path = os.path.join(out_path, audio_file) os.makedirs(os.path.dirname(absoulte_path), exist_ok=True) i += 1 first_word = True - audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0) + audio = wav[int(sr * sentence_start) : int(sr * word_end)].unsqueeze(0) # if the audio is too short ignore it (i.e < 0.33 seconds) - if audio.size(-1) >= sr/3: - torchaudio.save(absoulte_path, - audio, - sr - ) + if audio.size(-1) >= sr / 3: + torchaudio.save(absoulte_path, audio, sr) else: continue @@ -140,21 +141,21 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0 df = pandas.DataFrame(metadata) df = df.sample(frac=1) - num_val_samples = int(len(df)*eval_percentage) + num_val_samples = int(len(df) * eval_percentage) df_eval = df[:num_val_samples] df_train = df[num_val_samples:] - df_train = df_train.sort_values('audio_file') + df_train = df_train.sort_values("audio_file") train_metadata_path = os.path.join(out_path, "metadata_train.csv") df_train.to_csv(train_metadata_path, sep="|", index=False) eval_metadata_path = os.path.join(out_path, "metadata_eval.csv") - df_eval = df_eval.sort_values('audio_file') + df_eval = df_eval.sort_values("audio_file") df_eval.to_csv(eval_metadata_path, sep="|", index=False) # deallocate VRAM and RAM del asr_model, df_train, df_eval, df, metadata gc.collect() - return train_metadata_path, eval_metadata_path, audio_total_size \ No newline at end of file + return train_metadata_path, eval_metadata_path, audio_total_size diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py index a98765c3e7..7b41966b8f 100644 --- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -1,5 +1,5 @@ -import os import gc +import os from 
trainer import Trainer, TrainerArgs @@ -25,7 +25,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, BATCH_SIZE = batch_size # set here the batch size GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps - # Define here the dataset that you want to use for the fine-tuning on. config_dataset = BaseDatasetConfig( formatter="coqui", @@ -43,7 +42,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/") os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True) - # DVAE files DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth" MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth" @@ -55,8 +53,9 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, # download DVAE files if needed if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE): print(" > Downloading DVAE files!") - ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True) - + ModelManager._download_model_files( + [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True + ) # Download XTTS v2.0 checkpoint if needed TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json" @@ -160,7 +159,7 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, # get the longest text audio file to use as speaker reference samples_len = [len(item["text"].split(" ")) for item in train_samples] - longest_text_idx = samples_len.index(max(samples_len)) + longest_text_idx = samples_len.index(max(samples_len)) speaker_ref = train_samples[longest_text_idx]["audio_file"] trainer_out_path = trainer.output_path diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index ebb11f29d1..85168c641d 100644 --- 
a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -1,19 +1,16 @@ import argparse +import logging import os import sys import tempfile +import traceback import gradio as gr -import librosa.display -import numpy as np - -import os import torch import torchaudio -import traceback + from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt - from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.models.xtts import Xtts @@ -23,7 +20,10 @@ def clear_gpu_cache(): if torch.cuda.is_available(): torch.cuda.empty_cache() + XTTS_MODEL = None + + def load_model(xtts_checkpoint, xtts_config, xtts_vocab): global XTTS_MODEL clear_gpu_cache() @@ -40,17 +40,23 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab): print("Model Loaded!") return "Model Loaded!" + def run_tts(lang, tts_text, speaker_audio_file): if XTTS_MODEL is None or not speaker_audio_file: return "You need to run the previous step to load the model !!", None, None - gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs) + gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents( + audio_path=speaker_audio_file, + gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, + max_ref_length=XTTS_MODEL.config.max_ref_len, + sound_norm_refs=XTTS_MODEL.config.sound_norm_refs, + ) out = XTTS_MODEL.inference( text=tts_text, language=lang, gpt_cond_latent=gpt_cond_latent, speaker_embedding=speaker_embedding, - temperature=XTTS_MODEL.config.temperature, # Add custom parameters here + temperature=XTTS_MODEL.config.temperature, # Add custom parameters here length_penalty=XTTS_MODEL.config.length_penalty, repetition_penalty=XTTS_MODEL.config.repetition_penalty, top_k=XTTS_MODEL.config.top_k, @@ -65,9 +71,7 @@ def 
run_tts(lang, tts_text, speaker_audio_file): return "Speech generated !", out_path, speaker_audio_file - - -# define a logger to redirect +# define a logger to redirect class Logger: def __init__(self, filename="log.out"): self.log_file = filename @@ -85,21 +89,19 @@ def flush(self): def isatty(self): return False + # redirect stdout and stderr to a file sys.stdout = Logger() sys.stderr = sys.stdout # logging.basicConfig(stream=sys.stdout, level=logging.INFO) -import logging + logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[ - logging.StreamHandler(sys.stdout) - ] + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)] ) + def read_logs(): sys.stdout.flush() with open(sys.stdout.log_file, "r") as f: @@ -107,12 +109,11 @@ def read_logs(): if __name__ == "__main__": - parser = argparse.ArgumentParser( description="""XTTS fine-tuning demo\n\n""" """ Example runs: - python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port + python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port """, formatter_class=argparse.RawTextHelpFormatter, ) @@ -190,12 +191,10 @@ def read_logs(): "zh", "hu", "ko", - "ja" + "ja", ], ) - progress_data = gr.Label( - label="Progress:" - ) + progress_data = gr.Label(label="Progress:") logs = gr.Textbox( label="Logs:", interactive=False, @@ -203,20 +202,30 @@ def read_logs(): demo.load(read_logs, None, logs, every=1) prompt_compute_btn = gr.Button(value="Step 1 - Create dataset") - + def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)): clear_gpu_cache() out_path = os.path.join(out_path, "dataset") os.makedirs(out_path, exist_ok=True) if audio_path is None: - return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", "" + return ( + "You should provide one or multiple audio files! 
If you provided it, probably the upload of the files is not finished yet!", + "", + "", + ) else: try: - train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress) + train_meta, eval_meta, audio_total_size = format_audio_list( + audio_path, target_language=language, out_path=out_path, gradio_progress=progress + ) except: traceback.print_exc() error = traceback.format_exc() - return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", "" + return ( + f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", + "", + "", + ) clear_gpu_cache() @@ -236,7 +245,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac eval_csv = gr.Textbox( label="Eval CSV:", ) - num_epochs = gr.Slider( + num_epochs = gr.Slider( label="Number of epochs:", minimum=1, maximum=100, @@ -264,9 +273,7 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac step=1, value=args.max_audio_length, ) - progress_train = gr.Label( - label="Progress:" - ) + progress_train = gr.Label(label="Progress:") logs_tts_train = gr.Textbox( label="Logs:", interactive=False, @@ -274,18 +281,41 @@ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(trac demo.load(read_logs, None, logs_tts_train, every=1) train_btn = gr.Button(value="Step 2 - Run the training") - def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length): + def train_model( + language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length + ): clear_gpu_cache() if not train_csv or not eval_csv: - return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", "" + return ( + "You 
need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", + "", + "", + "", + "", + ) try: # convert seconds to waveform frames max_audio_length = int(max_audio_length * 22050) - config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length) + config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt( + language, + num_epochs, + batch_size, + grad_acumm, + train_csv, + eval_csv, + output_path=output_path, + max_audio_length=max_audio_length, + ) except: traceback.print_exc() error = traceback.format_exc() - return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", "" + return ( + f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", + "", + "", + "", + "", + ) # copy original files to avoid parameters changes issues os.system(f"cp {config_path} {exp_path}") @@ -312,9 +342,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum label="XTTS vocab path:", value="", ) - progress_load = gr.Label( - label="Progress:" - ) + progress_load = gr.Label(label="Progress:") load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model") with gr.Column() as col2: @@ -342,7 +370,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum "hu", "ko", "ja", - ] + ], ) tts_text = gr.Textbox( label="Input Text.", @@ -351,9 +379,7 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum tts_btn = gr.Button(value="Step 4 - Inference") with gr.Column() as col3: - progress_gen = gr.Label( - label="Progress:" - ) + progress_gen = gr.Label(label="Progress:") tts_output_audio = gr.Audio(label="Generated Audio.") reference_audio = 
gr.Audio(label="Reference audio used.") @@ -371,7 +397,6 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum ], ) - train_btn.click( fn=train_model, inputs=[ @@ -386,14 +411,10 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum ], outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio], ) - + load_btn.click( fn=load_model, - inputs=[ - xtts_checkpoint, - xtts_config, - xtts_vocab - ], + inputs=[xtts_checkpoint, xtts_config, xtts_vocab], outputs=[progress_load], ) @@ -407,9 +428,4 @@ def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acum outputs=[progress_gen, tts_output_audio, reference_audio], ) - demo.launch( - share=True, - debug=False, - server_port=args.port, - server_name="0.0.0.0" - ) + demo.launch(share=True, debug=False, server_port=args.port, server_name="0.0.0.0") diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py index 5eda2671be..1d12325cf2 100644 --- a/TTS/encoder/configs/emotion_encoder_config.py +++ b/TTS/encoder/configs/emotion_encoder_config.py @@ -1,4 +1,4 @@ -from dataclasses import asdict, dataclass +from dataclasses import dataclass from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py index 6dceb00277..0588527a68 100644 --- a/TTS/encoder/configs/speaker_encoder_config.py +++ b/TTS/encoder/configs/speaker_encoder_config.py @@ -1,4 +1,4 @@ -from dataclasses import asdict, dataclass +from dataclasses import dataclass from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 236d6fe937..88ed71d3f4 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -34,7 +34,7 @@ def __init__(self, ap, 
augmentation_config): # ignore not listed directories if noise_dir not in self.additive_noise_types: continue - if not noise_dir in self.noise_list: + if noise_dir not in self.noise_list: self.noise_list[noise_dir] = [] self.noise_list[noise_dir].append(wav_file) diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py index b93baf9e60..5a68c3075a 100644 --- a/TTS/encoder/utils/prepare_voxceleb.py +++ b/TTS/encoder/utils/prepare_voxceleb.py @@ -19,13 +19,13 @@ # pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes """ voxceleb 1 & 2 """ +import csv import hashlib import os import subprocess import sys import zipfile -import pandas import soundfile as sf from absl import logging @@ -185,8 +185,11 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file): # Write to CSV file which contains four columns: # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name". csv_file_path = os.path.join(output_dir, output_file) - df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) - df.to_csv(csv_file_path, index=False, sep="\t") + with open(csv_file_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter="\t") + writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]) + for wav_file in files: + writer.writerow(wav_file) logging.info("Successfully generated csv file {}".format(csv_file_path)) diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py index ff8f271d80..7692478d6b 100644 --- a/TTS/encoder/utils/training.py +++ b/TTS/encoder/utils/training.py @@ -3,13 +3,14 @@ from coqpit import Coqpit from trainer import TrainerArgs, get_last_checkpoint +from trainer.generic_utils import get_experiment_folder_path from trainer.io import copy_model_files from trainer.logging import logger_factory from trainer.logging.console_logger import ConsoleLogger 
from TTS.config import load_config, register_config from TTS.tts.utils.text.characters import parse_symbols -from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch +from TTS.utils.generic_utils import get_git_branch @dataclass diff --git a/TTS/server/README.md b/TTS/server/README.md index 270656c4e3..f5df08011b 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -1,5 +1,8 @@ # :frog: TTS demo server -Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below. +Before you use the server, make sure you +[install](https://github.com/eginhard/coqui-tts/tree/dev#install-tts)) :frog: TTS +properly and install the additional dependencies with `pip install +coqui-tts[server]`. Then, you can follow the steps below. **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal. diff --git a/TTS/server/server.py b/TTS/server/server.py index 6b2141a9aa..01bd79a137 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -9,7 +9,10 @@ from typing import Union from urllib.parse import parse_qs -from flask import Flask, render_template, render_template_string, request, send_file +try: + from flask import Flask, render_template, render_template_string, request, send_file +except ImportError as e: + raise ImportError("Server requires requires flask, use `pip install coqui-tts[server]`.") from e from TTS.config import load_config from TTS.utils.manage import ModelManager diff --git a/TTS/server/templates/index.html b/TTS/server/templates/index.html index 6354d3919d..f5f547c7bf 100644 --- a/TTS/server/templates/index.html +++ b/TTS/server/templates/index.html @@ -30,7 +30,7 @@ - Fork me on GitHub @@ -151,4 +151,4 @@ - \ No newline at end of file + diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 192138561f..4f354fa0be 100644 --- 
a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -167,7 +167,7 @@ def _get_formatter_by_name(name): def find_unique_chars(data_samples, verbose=True): - texts = "".join(item[0] for item in data_samples) + texts = "".join(item["text"] for item in data_samples) chars = set(texts) lower_chars = filter(lambda c: c.islower(), chars) chars_force_lower = [c.lower() for c in chars] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index c673c963b6..257d1c3100 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -6,6 +6,7 @@ import numpy as np import torch +import torchaudio import tqdm from torch.utils.data import Dataset @@ -42,6 +43,17 @@ def string2filename(string): return filename +def get_audio_size(audiopath) -> int: + """Return the number of samples in the audio file.""" + extension = audiopath.rpartition(".")[-1].lower() + if extension not in {"mp3", "wav", "flac"}: + raise RuntimeError( + f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!" 
+ ) + + return torchaudio.info(audiopath).num_frames + + class TTSDataset(Dataset): def __init__( self, @@ -176,7 +188,7 @@ def lengths(self): lens = [] for item in self.samples: _, wav_file, *_ = _parse_sample(item) - audio_len = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio + audio_len = get_audio_size(wav_file) lens.append(audio_len) return lens @@ -295,7 +307,7 @@ def load_data(self, idx): def _compute_lengths(samples): new_samples = [] for item in samples: - audio_length = os.path.getsize(item["audio_file"]) / 16 * 8 # assuming 16bit audio + audio_length = get_audio_size(item["audio_file"]) text_lenght = len(item["text"]) item["audio_length"] = audio_length item["text_length"] = text_lenght @@ -445,9 +457,11 @@ def collate_fn(self, batch): # lengths adjusted by the reduction factor mel_lengths_adjusted = [ - m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step)) - if m.shape[1] % self.outputs_per_step - else m.shape[1] + ( + m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step)) + if m.shape[1] % self.outputs_per_step + else m.shape[1] + ) for m in mel ] diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 053444b0c1..09fbd094e8 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -1,3 +1,4 @@ +import csv import os import re import xml.etree.ElementTree as ET @@ -5,7 +6,6 @@ from pathlib import Path from typing import List -import pandas as pd from tqdm import tqdm ######################## @@ -25,25 +25,27 @@ def cml_tts(root_path, meta_file, ignored_speakers=None): if len(line.split("|")) != num_cols: print(f" > Missing column in line {idx + 1} -> {line.strip()}") # load metadata - metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") - assert all(x in metadata.columns for x in ["wav_filename", "transcript"]) - client_id = None if "client_id" in metadata.columns else "default" - emotion_name = None if "emotion_name" in 
metadata.columns else "neutral" + with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="|") + metadata = list(reader) + assert all(x in metadata[0] for x in ["wav_filename", "transcript"]) + client_id = None if "client_id" in metadata[0] else "default" + emotion_name = None if "emotion_name" in metadata[0] else "neutral" items = [] not_found_counter = 0 - for row in metadata.itertuples(): - if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers: + for row in metadata: + if client_id is None and ignored_speakers is not None and row["client_id"] in ignored_speakers: continue - audio_path = os.path.join(root_path, row.wav_filename) + audio_path = os.path.join(root_path, row["wav_filename"]) if not os.path.exists(audio_path): not_found_counter += 1 continue items.append( { - "text": row.transcript, + "text": row["transcript"], "audio_file": audio_path, - "speaker_name": client_id if client_id is not None else row.client_id, - "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "speaker_name": client_id if client_id is not None else row["client_id"], + "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"], "root_path": root_path, } ) @@ -63,25 +65,27 @@ def coqui(root_path, meta_file, ignored_speakers=None): if len(line.split("|")) != num_cols: print(f" > Missing column in line {idx + 1} -> {line.strip()}") # load metadata - metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") - assert all(x in metadata.columns for x in ["audio_file", "text"]) - speaker_name = None if "speaker_name" in metadata.columns else "coqui" - emotion_name = None if "emotion_name" in metadata.columns else "neutral" + with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="|") + metadata = list(reader) + assert all(x in metadata[0] for x in ["audio_file", "text"]) + 
speaker_name = None if "speaker_name" in metadata[0] else "coqui" + emotion_name = None if "emotion_name" in metadata[0] else "neutral" items = [] not_found_counter = 0 - for row in metadata.itertuples(): - if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers: + for row in metadata: + if speaker_name is None and ignored_speakers is not None and row["speaker_name"] in ignored_speakers: continue - audio_path = os.path.join(root_path, row.audio_file) + audio_path = os.path.join(root_path, row["audio_file"]) if not os.path.exists(audio_path): not_found_counter += 1 continue items.append( { - "text": row.text, + "text": row["text"], "audio_file": audio_path, - "speaker_name": speaker_name if speaker_name is not None else row.speaker_name, - "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "speaker_name": speaker_name if speaker_name is not None else row["speaker_name"], + "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"], "root_path": root_path, } ) diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index a6a3b9aeb1..9e487b1e9d 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -7,8 +7,6 @@ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py -import logging -from pathlib import Path import torch from einops import pack, unpack diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index c84022bd08..68c50dbdbd 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -2,6 +2,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT (https://github.com/karpathy/nanoGPT) """ + import math from dataclasses import dataclass diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py index 09e5f4765d..29126b41ab 100644 --- 
a/TTS/tts/layers/bark/model_fine.py +++ b/TTS/tts/layers/bark/model_fine.py @@ -2,6 +2,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT (https://github.com/karpathy/nanoGPT) """ + import math from dataclasses import dataclass diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index c906b882e5..74ec204281 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -362,7 +362,7 @@ def forward( pos_encoding = positional_encoding( self.emb_dim, - max(token_embeddings.shape[1], max(mel_lens)), + max(token_embeddings.shape[1], *mel_lens), device=token_embeddings.device, ) encoder_outputs = self.encoder( diff --git a/TTS/tts/layers/overflow/plotting_utils.py b/TTS/tts/layers/overflow/plotting_utils.py index a63aeb370a..d9d3e3d141 100644 --- a/TTS/tts/layers/overflow/plotting_utils.py +++ b/TTS/tts/layers/overflow/plotting_utils.py @@ -71,7 +71,7 @@ def plot_transition_probabilities_to_numpy(states, transition_probabilities, out ax.set_title("Transition probability of state") ax.set_xlabel("hidden state") ax.set_ylabel("probability") - ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension + ax.set_xticks(list(range(len(transition_probabilities)))) ax.set_xticklabels([int(x) for x in states], rotation=90) plt.tight_layout() if not output_fig: diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index dad1814369..c79ef31b0c 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -1,6 +1,5 @@ import functools import math -import os import fsspec import torch diff --git a/TTS/tts/layers/tortoise/clvp.py b/TTS/tts/layers/tortoise/clvp.py index 69b8c17c3f..241dfdd4f4 100644 --- a/TTS/tts/layers/tortoise/clvp.py +++ b/TTS/tts/layers/tortoise/clvp.py @@ -126,7 +126,7 @@ def forward(self, text, speech_tokens, 
return_loss=False): text_latents = self.to_text_latent(text_latents) speech_latents = self.to_speech_latent(speech_latents) - text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents)) + text_latents, speech_latents = (F.normalize(t, p=2, dim=-1) for t in (text_latents, speech_latents)) temp = self.temperature.exp() diff --git a/TTS/tts/layers/tortoise/diffusion.py b/TTS/tts/layers/tortoise/diffusion.py index 7bea02ca08..2b29091b44 100644 --- a/TTS/tts/layers/tortoise/diffusion.py +++ b/TTS/tts/layers/tortoise/diffusion.py @@ -972,7 +972,7 @@ def autoregressive_training_losses( assert False # not currently supported for this type of diffusion. elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs) - terms.update({k: o for k, o in zip(model_output_keys, model_outputs)}) + terms.update(dict(zip(model_output_keys, model_outputs))) model_output = terms[gd_out_key] if self.model_var_type in [ ModelVarType.LEARNED, diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index 70d46aa3e0..6cb1bab96a 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -37,7 +37,7 @@ def route_args(router, args, depth): for key in matched_keys: val = args[key] for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])): - new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes) + new_f_args, new_g_args = (({key: val} if route else {}) for route in routes) routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args}) return routed_args @@ -152,7 +152,7 @@ def forward(self, x, mask=None): softmax = torch.softmax qkv = self.to_qkv(x).chunk(3, dim=-1) - q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv) + q, k, v = (rearrange(t, "b n (h d) -> b h n d", h=h) for t in qkv) q = q * 
self.scale diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 1eb3f77269..9325b8c720 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -84,7 +84,7 @@ def init_zero_(layer): def pick_and_pop(keys, d): - values = list(map(lambda key: d.pop(key), keys)) + values = [d.pop(key) for key in keys] return dict(zip(keys, values)) @@ -107,7 +107,7 @@ def group_by_key_prefix(prefix, d): def groupby_prefix_and_trim(prefix, d): kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) - kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix) :], x[1]), tuple(kwargs_with_prefix.items()))) + kwargs_without_prefix = {x[0][len(prefix) :]: x[1] for x in tuple(kwargs_with_prefix.items())} return kwargs_without_prefix, kwargs @@ -428,7 +428,7 @@ def forward(self, x, **kwargs): feats_per_shift = x.shape[-1] // segments splitted = x.split(feats_per_shift, dim=-1) segments_to_shift, rest = splitted[:segments], splitted[segments:] - segments_to_shift = list(map(lambda args: shift(*args, mask=mask), zip(segments_to_shift, shifts))) + segments_to_shift = [shift(*args, mask=mask) for args in zip(segments_to_shift, shifts)] x = torch.cat((*segments_to_shift, *rest), dim=-1) return self.fn(x, **kwargs) @@ -635,7 +635,7 @@ def forward( v = self.to_v(v_input) if not collab_heads: - q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + q, k, v = (rearrange(t, "b n (h d) -> b h n d", h=h) for t in (q, k, v)) else: q = einsum("b i d, h d -> b h i d", q, self.collab_mixing) k = rearrange(k, "b n d -> b () n d") @@ -650,9 +650,9 @@ def forward( if exists(rotary_pos_emb) and not has_context: l = rotary_pos_emb.shape[-1] - (ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v)) - ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl)) - q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, 
qr), (kl, kr), (vl, vr))) + (ql, qr), (kl, kr), (vl, vr) = ((t[..., :l], t[..., l:]) for t in (q, k, v)) + ql, kl, vl = (apply_rotary_pos_emb(t, rotary_pos_emb) for t in (ql, kl, vl)) + q, k, v = (torch.cat(t, dim=-1) for t in ((ql, qr), (kl, kr), (vl, vr))) input_mask = None if any(map(exists, (mask, context_mask))): @@ -664,7 +664,7 @@ def forward( input_mask = q_mask * k_mask if self.num_mem_kv > 0: - mem_k, mem_v = map(lambda t: repeat(t, "h n d -> b h n d", b=b), (self.mem_k, self.mem_v)) + mem_k, mem_v = (repeat(t, "h n d -> b h n d", b=b) for t in (self.mem_k, self.mem_v)) k = torch.cat((mem_k, k), dim=-2) v = torch.cat((mem_v, v), dim=-2) if exists(input_mask): @@ -964,9 +964,7 @@ def forward( seq_len = x.shape[1] if past_key_values is not None: seq_len += past_key_values[0][0].shape[-2] - max_rotary_emb_length = max( - list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len] - ) + max_rotary_emb_length = max([(m.shape[1] if exists(m) else 0) + seq_len for m in mems] + [expected_seq_len]) rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device) present_key_values = [] @@ -1200,7 +1198,7 @@ def forward( res = [out] if return_attn: - attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates] res.append(attn_maps) if use_cache: res.append(intermediates.past_key_values) @@ -1249,7 +1247,7 @@ def forward(self, x, return_embeddings=False, mask=None, return_attn=False, mems res = [out] if return_attn: - attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) + attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates] res.append(attn_maps) if use_cache: res.append(intermediates.past_key_values) diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py index c27d11bef6..3449739fdc 100644 --- a/TTS/tts/layers/vits/discriminator.py +++ 
b/TTS/tts/layers/vits/discriminator.py @@ -2,7 +2,7 @@ from torch import nn from torch.nn.modules.conv import Conv1d -from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP, MultiPeriodDiscriminator +from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP class DiscriminatorS(torch.nn.Module): diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py index bdd7a9d09f..8598f0b47a 100644 --- a/TTS/tts/layers/xtts/dvae.py +++ b/TTS/tts/layers/xtts/dvae.py @@ -260,7 +260,7 @@ def __init__( dec_init_chan = codebook_dim if not has_resblocks else dec_chans[0] dec_chans = [dec_init_chan, *dec_chans] - enc_chans_io, dec_chans_io = map(lambda t: list(zip(t[:-1], t[1:])), (enc_chans, dec_chans)) + enc_chans_io, dec_chans_io = (list(zip(t[:-1], t[1:])) for t in (enc_chans, dec_chans)) pad = (kernel_size - 1) // 2 for (enc_in, enc_out), (dec_in, dec_out) in zip(enc_chans_io, dec_chans_io): @@ -306,9 +306,9 @@ def norm(self, images): if not self.normalization is not None: return images - means, stds = map(lambda t: torch.as_tensor(t).to(images), self.normalization) + means, stds = (torch.as_tensor(t).to(images) for t in self.normalization) arrange = "c -> () c () ()" if self.positional_dims == 2 else "c -> () c ()" - means, stds = map(lambda t: rearrange(t, arrange), (means, stds)) + means, stds = (rearrange(t, arrange) for t in (means, stds)) images = images.clone() images.sub_(means).div_(stds) return images diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index e7b186b858..b55b84d90e 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -1,7 +1,6 @@ # ported from: https://github.com/neonbjb/tortoise-tts import functools -import math import random import torch @@ -188,9 +187,9 @@ def __init__( def get_grad_norm_parameter_groups(self): return { "conditioning_encoder": list(self.conditioning_encoder.parameters()), - "conditioning_perceiver": list(self.conditioning_perceiver.parameters()) - if 
self.use_perceiver_resampler - else None, + "conditioning_perceiver": ( + list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None + ), "gpt": list(self.gpt.parameters()), "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()), } diff --git a/TTS/tts/layers/xtts/gpt_inference.py b/TTS/tts/layers/xtts/gpt_inference.py index d44bd3decd..4625ae1ba9 100644 --- a/TTS/tts/layers/xtts/gpt_inference.py +++ b/TTS/tts/layers/xtts/gpt_inference.py @@ -1,5 +1,3 @@ -import math - import torch from torch import nn from transformers import GPT2PreTrainedModel diff --git a/TTS/tts/layers/xtts/perceiver_encoder.py b/TTS/tts/layers/xtts/perceiver_encoder.py index 7b7ee79b50..d1aa16c456 100644 --- a/TTS/tts/layers/xtts/perceiver_encoder.py +++ b/TTS/tts/layers/xtts/perceiver_encoder.py @@ -155,10 +155,6 @@ def Sequential(*mods): return nn.Sequential(*filter(exists, mods)) -def exists(x): - return x is not None - - def default(val, d): if exists(val): return val diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index e12f8995cf..b7e07589c5 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ b/TTS/tts/layers/xtts/stream_generator.py @@ -43,7 +43,7 @@ def __init__(self, **kwargs): class NewGenerationMixin(GenerationMixin): @torch.no_grad() - def generate( + def generate( # noqa: PLR0911 self, inputs: Optional[torch.Tensor] = None, generation_config: Optional[StreamGenerationConfig] = None, @@ -885,10 +885,10 @@ def init_stream_support(): if __name__ == "__main__": - from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel + from transformers import AutoModelForCausalLM, AutoTokenizer + + init_stream_support() - PreTrainedModel.generate = NewGenerationMixin.generate - PreTrainedModel.sample_stream = NewGenerationMixin.sample_stream model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.float16) tokenizer = 
AutoTokenizer.from_pretrained("bigscience/bloom-560m") diff --git a/TTS/tts/layers/xtts/trainer/dataset.py b/TTS/tts/layers/xtts/trainer/dataset.py index 2f958cb5a5..0a19997a47 100644 --- a/TTS/tts/layers/xtts/trainer/dataset.py +++ b/TTS/tts/layers/xtts/trainer/dataset.py @@ -1,4 +1,3 @@ -import os import random import sys @@ -187,9 +186,9 @@ def __getitem__(self, index): "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long), "filenames": audiopath, "conditioning": cond.unsqueeze(1), - "cond_lens": torch.tensor(cond_len, dtype=torch.long) - if cond_len is not torch.nan - else torch.tensor([cond_len]), + "cond_lens": ( + torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len]) + ), "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]), } return res diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 9a7a1d7783..daf9fc7e4f 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -5,7 +5,6 @@ import torch.nn as nn import torchaudio from coqpit import Coqpit -from torch.nn import functional as F from torch.utils.data import DataLoader from trainer.torch import DistributedSampler from trainer.trainer_utils import get_optimizer, get_scheduler @@ -391,7 +390,7 @@ def get_data_loader( loader = DataLoader( dataset, sampler=sampler, - batch_size = config.eval_batch_size if is_eval else config.batch_size, + batch_size=config.eval_batch_size if is_eval else config.batch_size, collate_fn=dataset.collate_fn, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False, diff --git a/TTS/tts/layers/xtts/xtts_manager.py b/TTS/tts/layers/xtts/xtts_manager.py index 3e7d0f6c91..5560e87687 100644 --- a/TTS/tts/layers/xtts/xtts_manager.py +++ b/TTS/tts/layers/xtts/xtts_manager.py @@ -1,34 +1,35 @@ import torch -class SpeakerManager(): + +class 
SpeakerManager: def __init__(self, speaker_file_path=None): self.speakers = torch.load(speaker_file_path) @property def name_to_id(self): - return self.speakers.keys() - + return self.speakers + @property def num_speakers(self): return len(self.name_to_id) - + @property def speaker_names(self): return list(self.name_to_id.keys()) - -class LanguageManager(): + +class LanguageManager: def __init__(self, config): self.langs = config["languages"] @property def name_to_id(self): return self.langs - + @property def num_languages(self): return len(self.name_to_id) - + @property def language_names(self): return list(self.name_to_id) diff --git a/TTS/tts/layers/xtts/zh_num2words.py b/TTS/tts/layers/xtts/zh_num2words.py index e59ccb6630..7d8f658160 100644 --- a/TTS/tts/layers/xtts/zh_num2words.py +++ b/TTS/tts/layers/xtts/zh_num2words.py @@ -4,13 +4,11 @@ import argparse import csv -import os import re import string import sys # fmt: off - # ================================================================================ # # basic constant # ================================================================================ # @@ -491,8 +489,6 @@ class NumberSystem(object): 中文数字įŗģįģŸ """ - pass - class MathSymbol(object): """ diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index b2e51de7d6..18b9cde385 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -415,7 +415,7 @@ def _set_phase(config, global_step): """Decide AlignTTS training phase""" if isinstance(config.phase_start_steps, list): vals = [i < global_step for i in config.phase_start_steps] - if not True in vals: + if True not in vals: phase = 0 else: phase = ( diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py index e5edffd4ef..833a909384 100644 --- a/TTS/tts/models/bark.py +++ b/TTS/tts/models/bark.py @@ -225,14 +225,11 @@ def synthesize( return return_dict - def eval_step(self): - ... + def eval_step(self): ... - def forward(self): - ... + def forward(self): ... 
- def inference(self): - ... + def inference(self): ... @staticmethod def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 7871cc38c3..0aa5edc647 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -14,7 +14,7 @@ from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.data import get_length_balancer_weights from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -369,9 +369,11 @@ def _get_test_aux_input( d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { - "speaker_id": None - if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1), + "speaker_id": ( + None + if not self.config.use_speaker_embedding + else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1) + ), "d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index b1cf886bea..a4aa563f48 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -179,17 +179,19 @@ def _wav_to_spec(y, n_fft, hop_length, win_length, center=False): ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, 
+ pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) return spec @@ -274,17 +276,19 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index b6e9ac8a14..1d3a13d433 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -299,7 +299,7 @@ def init_multispeaker(self, config: Coqpit): if config.use_d_vector_file: self.embedded_speaker_dim = config.d_vector_dim if self.args.d_vector_dim != self.args.hidden_channels: - #self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1) + # self.proj_g = nn.Conv1d(self.args.d_vector_dim, self.args.hidden_channels, 1) self.proj_g = nn.Linear(in_features=self.args.d_vector_dim, out_features=self.args.hidden_channels) # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: @@ -404,13 +404,13 @@ def _forward_encoder( # [B, T, C] x_emb = self.emb(x) # encoder pass - #o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask) + # o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask) o_en = self.encoder(torch.transpose(x_emb, 1, -1), x_mask, g) # speaker conditioning # TODO: try different ways of conditioning - if g is not None: + if g is not None: if hasattr(self, "proj_g"): - g = self.proj_g(g.view(g.shape[0], -1)).unsqueeze(-1) + g = self.proj_g(g.view(g.shape[0], 
-1)).unsqueeze(-1) o_en = o_en + g return o_en, x_mask, g, x_emb diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 474ec4641d..400a86d042 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -101,12 +101,16 @@ def __init__( num_mel=self.decoder_output_dim, encoder_output_dim=self.encoder_in_features, capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim, - speaker_embedding_dim=self.embedded_speaker_dim - if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding - else None, - text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + speaker_embedding_dim=( + self.embedded_speaker_dim + if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding + else None + ), + text_summary_embedding_dim=( + self.capacitron_vae.capacitron_text_summary_embedding_dim + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None + ), ) # backward pass decoder @@ -171,9 +175,9 @@ def forward( # pylint: disable=dangerous-default-value encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding( encoder_outputs, reference_mel_info=[mel_specs, mel_lengths], - text_info=[inputs, text_lengths] - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + text_info=( + [inputs, text_lengths] if self.capacitron_vae.capacitron_use_text_summary_embeddings else None + ), speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None, ) else: @@ -237,13 +241,13 @@ def inference(self, text_input, aux_input=None): # B x capacitron_VAE_embedding_dim encoder_outputs, *_ = self.compute_capacitron_VAE_embedding( encoder_outputs, - reference_mel_info=[aux_input["style_mel"], reference_mel_length] - if aux_input["style_mel"] is not None - else None, + reference_mel_info=( + 
[aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None + ), text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None, - speaker_embedding=aux_input["d_vectors"] - if self.capacitron_vae.capacitron_use_speaker_embedding - else None, + speaker_embedding=( + aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None + ), ) if self.num_speakers > 1: if not self.use_d_vector_file: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 71ab1eac37..4b1317f440 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -113,12 +113,14 @@ def __init__( num_mel=self.decoder_output_dim, encoder_output_dim=self.encoder_in_features, capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim, - speaker_embedding_dim=self.embedded_speaker_dim - if self.capacitron_vae.capacitron_use_speaker_embedding - else None, - text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + speaker_embedding_dim=( + self.embedded_speaker_dim if self.capacitron_vae.capacitron_use_speaker_embedding else None + ), + text_summary_embedding_dim=( + self.capacitron_vae.capacitron_text_summary_embedding_dim + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None + ), ) # backward pass decoder @@ -191,9 +193,11 @@ def forward( # pylint: disable=dangerous-default-value encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding( encoder_outputs, reference_mel_info=[mel_specs, mel_lengths], - text_info=[embedded_inputs.transpose(1, 2), text_lengths] - if self.capacitron_vae.capacitron_use_text_summary_embeddings - else None, + text_info=( + [embedded_inputs.transpose(1, 2), text_lengths] + if self.capacitron_vae.capacitron_use_text_summary_embeddings + else None + ), 
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None, ) else: @@ -265,13 +269,13 @@ def inference(self, text, aux_input=None): # B x capacitron_VAE_embedding_dim encoder_outputs, *_ = self.compute_capacitron_VAE_embedding( encoder_outputs, - reference_mel_info=[aux_input["style_mel"], reference_mel_length] - if aux_input["style_mel"] is not None - else None, + reference_mel_info=( + [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None + ), text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None, - speaker_embedding=aux_input["d_vectors"] - if self.capacitron_vae.capacitron_use_speaker_embedding - else None, + speaker_embedding=( + aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None + ), ) if self.num_speakers > 1: diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 16644ff95e..99e0107fdf 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -715,8 +715,9 @@ def inference( self.autoregressive = self.autoregressive.to(self.device) if verbose: print("Generating autoregressive samples..") - with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast( - device_type="cuda", dtype=torch.float16, enabled=half + with ( + self.temporary_cuda(self.autoregressive) as autoregressive, + torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half), ): for b in tqdm(range(num_batches), disable=not verbose): codes = autoregressive.inference_speech( @@ -737,8 +738,9 @@ def inference( self.autoregressive_batch_size = orig_batch_size # in the case of single_sample clip_results = [] - with self.temporary_cuda(self.clvp) as clvp, torch.autocast( - device_type="cuda", dtype=torch.float16, enabled=half + with ( + self.temporary_cuda(self.clvp) as clvp, + torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half), ): for batch in 
tqdm(samples, disable=not verbose): for i in range(batch.shape[0]): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index d9b1f59618..9bc743b213 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -121,17 +121,19 @@ def wav_to_spec(y, n_fft, hop_length, win_length, center=False): ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -189,17 +191,19 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -1233,7 +1237,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> T Args: batch (Dict): Input tensors. criterion (nn.Module): Loss layer designed for the model. - optimizer_idx (int): Index of optimizer to use. 0 for the generator and 1 for the discriminator networks. + optimizer_idx (int): Index of optimizer to use. 0 for the discriminator and 1 for the generator networks. Returns: Tuple[Dict, Dict]: Model ouputs and computed losses. 
@@ -1651,13 +1655,16 @@ def get_data_loader( def get_optimizer(self) -> List: """Initiate and return the GAN optimizers based on the config parameters. - It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. + + It returns 2 optimizers in a list. First one is for the discriminator + and the second one is for the generator. + Returns: List: optimizers. """ - # select generator parameters optimizer0 = get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.disc) + # select generator parameters gen_parameters = chain(params for k, params in self.named_parameters() if not k.startswith("disc.")) optimizer1 = get_optimizer( self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters @@ -1880,16 +1887,18 @@ def onnx_inference(text, text_lengths, scales, sid=None, langid=None): self.forward = _forward if training: self.train() - if not disc is None: + if disc is not None: self.disc = disc def load_onnx(self, model_path: str, cuda=False): import onnxruntime as ort providers = [ - "CPUExecutionProvider" - if cuda is False - else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}) + ( + "CPUExecutionProvider" + if cuda is False + else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"}) + ) ] sess_options = ort.SessionOptions() self.onnx_sess = ort.InferenceSession( @@ -1914,9 +1923,9 @@ def inference_onnx(self, x, x_lengths=None, speaker_id=None, language_id=None): dtype=np.float32, ) input_params = {"input": x, "input_lengths": x_lengths, "scales": scales} - if not speaker_id is None: + if speaker_id is not None: input_params["sid"] = torch.tensor([speaker_id]).cpu().numpy() - if not language_id is None: + if language_id is not None: input_params["langid"] = torch.tensor([language_id]).cpu().numpy() audio = self.onnx_sess.run( @@ -1948,8 +1957,7 @@ def __init__( def _create_vocab(self): self._vocab = [self._pad] + 
list(self._punctuations) + list(self._characters) + [self._blank] self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} - # pylint: disable=unnecessary-comprehension - self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)} + self._id_to_char = dict(enumerate(self.vocab)) @staticmethod def init_from_config(config: Coqpit): @@ -1996,4 +2004,4 @@ def vocab(self, vocab_file): self.blank = self._vocab[0] self.pad = " " self._char_to_id = {s: i for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension - self._id_to_char = {i: s for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension + self._id_to_char = dict(enumerate(self._vocab)) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 83812f377f..1c73c42ce9 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -11,7 +11,7 @@ from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence -from TTS.tts.layers.xtts.xtts_manager import SpeakerManager, LanguageManager +from TTS.tts.layers.xtts.xtts_manager import LanguageManager, SpeakerManager from TTS.tts.models.base_tts import BaseTTS from TTS.utils.io import load_fsspec @@ -274,7 +274,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = for i in range(0, audio.shape[1], 22050 * chunk_length): audio_chunk = audio[:, i : i + 22050 * chunk_length] - # if the chunk is too short ignore it + # if the chunk is too short ignore it if audio_chunk.size(-1) < 22050 * 0.33: continue @@ -410,12 +410,14 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa if speaker_id is not None: gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values() return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings) - settings.update({ - 
"gpt_cond_len": config.gpt_cond_len, - "gpt_cond_chunk_len": config.gpt_cond_chunk_len, - "max_ref_len": config.max_ref_len, - "sound_norm_refs": config.sound_norm_refs, - }) + settings.update( + { + "gpt_cond_len": config.gpt_cond_len, + "gpt_cond_chunk_len": config.gpt_cond_chunk_len, + "max_ref_len": config.max_ref_len, + "sound_norm_refs": config.sound_norm_refs, + } + ) return self.full_inference(text, speaker_wav, language, **settings) @torch.inference_mode() @@ -693,12 +695,12 @@ def inference_stream( def forward(self): raise NotImplementedError( - "XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training" + "XTTS has a dedicated trainer, please check the XTTS docs: https://coqui-tts.readthedocs.io/en/dev/models/xtts.html#training" ) def eval_step(self): raise NotImplementedError( - "XTTS has a dedicated trainer, please check the XTTS docs: https://tts.readthedocs.io/en/dev/models/xtts.html#training" + "XTTS has a dedicated trainer, please check the XTTS docs: https://coqui-tts.readthedocs.io/en/dev/models/xtts.html#training" ) @staticmethod @@ -756,11 +758,13 @@ def load_checkpoint( model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth") + + if speaker_file_path is None and checkpoint_dir is not None: + speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth") self.language_manager = LanguageManager(config) self.speaker_manager = None - if os.path.exists(speaker_file_path): + if speaker_file_path is not None and os.path.exists(speaker_file_path): self.speaker_manager = SpeakerManager(speaker_file_path) if os.path.exists(vocab_path): @@ -785,5 +789,5 @@ def load_checkpoint( def train_step(self): raise NotImplementedError( - "XTTS has a dedicated trainer, please check the XTTS docs: 
https://tts.readthedocs.io/en/dev/models/xtts.html#training" + "XTTS has a dedicated trainer, please check the XTTS docs: https://coqui-tts.readthedocs.io/en/dev/models/xtts.html#training" ) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 1e1836b32c..89e5e1911e 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -59,7 +59,7 @@ def parse_language_ids_from_config(c: Coqpit) -> Dict: languages.add(dataset["language"]) else: raise ValueError(f"Dataset {dataset['name']} has no language specified.") - return {name: i for i, name in enumerate(sorted(list(languages)))} + return {name: i for i, name in enumerate(sorted(languages))} def set_language_ids_from_config(self, c: Coqpit) -> None: """Set language IDs from config samples. diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 1f94c5332d..23aa52a8a2 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -193,7 +193,7 @@ def read_embeddings_from_file(file_path: str): embeddings = load_file(file_path) speakers = sorted({x["name"] for x in embeddings.values()}) name_to_id = {name: i for i, name in enumerate(speakers)} - clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys()))) + clip_ids = list(set(clip_name for clip_name in embeddings.keys())) # cache embeddings_by_names for fast inference using a bigger speakers.json embeddings_by_names = {} for x in embeddings.values(): diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 4bc3befc5b..eddf05db3f 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -207,6 +207,7 @@ class SSIMLoss(_Loss): https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf, DOI:`10.1109/TIP.2003.819861` """ + __constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"] def __init__( diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index 8fa45ed84b..37c7a7ca23 100644 --- a/TTS/tts/utils/text/characters.py +++ 
b/TTS/tts/utils/text/characters.py @@ -87,9 +87,7 @@ def vocab(self, vocab): if vocab is not None: self._vocab = vocab self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)} - self._id_to_char = { - idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension - } + self._id_to_char = dict(enumerate(self._vocab)) @staticmethod def init_from_config(config, **kwargs): @@ -269,9 +267,7 @@ def vocab(self): def vocab(self, vocab): self._vocab = vocab self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)} - self._id_to_char = { - idx: char for idx, char in enumerate(self.vocab) # pylint: disable=unnecessary-comprehension - } + self._id_to_char = dict(enumerate(self.vocab)) @property def num_chars(self): diff --git a/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py b/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py index 4e25c3a4c9..89dd654ab1 100644 --- a/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py +++ b/TTS/tts/utils/text/chinese_mandarin/pinyinToPhonemes.py @@ -94,25 +94,25 @@ "fo": ["fo"], "fou": ["fou"], "fu": ["fu"], - "ga": ["ga"], - "gai": ["gai"], - "gan": ["gan"], - "gang": ["gɑŋ"], - "gao": ["gaʌ"], - "ge": ["gø"], - "gei": ["gei"], - "gen": ["gœn"], - "geng": ["gÉĩŋ"], - "gong": ["goŋ"], - "gou": ["gou"], - "gu": ["gu"], - "gua": ["gua"], - "guai": ["guai"], - "guan": ["guan"], - "guang": ["guɑŋ"], - "gui": ["guei"], - "gun": ["gun"], - "guo": ["guo"], + "ga": ["ÉĄa"], + "gai": ["ÉĄai"], + "gan": ["ÉĄan"], + "gang": ["ÉĄÉ‘Å‹"], + "gao": ["ÉĄaʌ"], + "ge": ["ÉĄÃ¸"], + "gei": ["ÉĄei"], + "gen": ["ÉĄÅ“n"], + "geng": ["ÉĄÉĩŋ"], + "gong": ["ÉĄoŋ"], + "gou": ["ÉĄou"], + "gu": ["ÉĄu"], + "gua": ["ÉĄua"], + "guai": ["ÉĄuai"], + "guan": ["ÉĄuan"], + "guang": ["ÉĄuɑŋ"], + "gui": ["ÉĄuei"], + "gun": ["ÉĄun"], + "guo": ["ÉĄuo"], "ha": ["xa"], "hai": ["xai"], "han": ["xan"], diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 74d3910b51..794a87c866 100644 
--- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,4 +1,5 @@ """Set of default text cleaners""" + # TODO: pick the cleaner for languages dynamically import re diff --git a/TTS/tts/utils/text/japanese/phonemizer.py b/TTS/tts/utils/text/japanese/phonemizer.py index c3111067e1..30072ae501 100644 --- a/TTS/tts/utils/text/japanese/phonemizer.py +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -350,8 +350,8 @@ def hira2kata(text: str) -> str: return text.replace("う゛", "ヴ") -_SYMBOL_TOKENS = set(list("ãƒģ、。īŧŸīŧ")) -_NO_YOMI_TOKENS = set(list("「」『』―īŧˆīŧ‰īŧģīŧŊ[] â€Ļ")) +_SYMBOL_TOKENS = set("ãƒģ、。īŧŸīŧ") +_NO_YOMI_TOKENS = set("「」『』―īŧˆīŧ‰īŧģīŧŊ[] â€Ļ") _TAGGER = MeCab.Tagger() diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index f9a0340c55..446f288302 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -10,7 +10,6 @@ from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer except ImportError: JA_JP_Phonemizer = None - pass PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)} @@ -64,7 +63,7 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: return ZH_CN_Phonemizer(**kwargs) if name == "ja_jp_phonemizer": if JA_JP_Phonemizer is None: - raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.") + raise ValueError(" ❗ You need to install JA phonemizer dependencies. 
Try `pip install coqui-tts[ja]`.") return JA_JP_Phonemizer(**kwargs) if name == "ko_kr_phonemizer": return KO_KR_Phonemizer(**kwargs) diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py index fd40ebb048..632969c51a 100644 --- a/TTS/utils/audio/torch_transforms.py +++ b/TTS/utils/audio/torch_transforms.py @@ -119,17 +119,19 @@ def __call__(self, x): padding = int((self.n_fft - self.hop_length) / 2) x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=self.normalized, - onesided=True, - return_complex=False, + o = torch.view_as_real( + torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=self.normalized, + onesided=True, + return_complex=True, + ) ) M = o[:, :, :, 0] P = o[:, :, :, 1] diff --git a/TTS/utils/download.py b/TTS/utils/download.py index 3f06b57824..37e6ed3cee 100644 --- a/TTS/utils/download.py +++ b/TTS/utils/download.py @@ -36,13 +36,16 @@ def stream_url( if start_byte: req.headers["Range"] = "bytes={}-".format(start_byte) - with urllib.request.urlopen(req) as upointer, tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=url_size, - disable=not progress_bar, - ) as pbar: + with ( + urllib.request.urlopen(req) as upointer, + tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=url_size, + disable=not progress_bar, + ) as pbar, + ): num_bytes = 0 while True: chunk = upointer.read(block_size) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 9730576239..e0cd3ad85f 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -9,26 +9,8 @@ from pathlib import Path from typing import Dict -import fsspec -import torch - - -def to_cuda(x: 
torch.Tensor) -> torch.Tensor: - if x is None: - return None - if torch.is_tensor(x): - x = x.contiguous() - if torch.cuda.is_available(): - x = x.cuda(non_blocking=True) - return x - - -def get_cuda(): - use_cuda = torch.cuda.is_available() - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - return use_cuda, device - +# TODO: This method is duplicated in Trainer but out of date there def get_git_branch(): try: out = subprocess.check_output(["git", "branch"]).decode("utf8") @@ -36,54 +18,11 @@ def get_git_branch(): current.replace("* ", "") except subprocess.CalledProcessError: current = "inside_docker" - except FileNotFoundError: - current = "unknown" - except StopIteration: + except (FileNotFoundError, StopIteration) as e: current = "unknown" return current -def get_commit_hash(): - """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script""" - # try: - # subprocess.check_output(['git', 'diff-index', '--quiet', - # 'HEAD']) # Verify client is clean - # except: - # raise RuntimeError( - # " !! 
Commit before training to get the commit hash.") - try: - commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode().strip() - # Not copying .git folder into docker container - except (subprocess.CalledProcessError, FileNotFoundError): - commit = "0000000" - return commit - - -def get_experiment_folder_path(root_path, model_name): - """Get an experiment folder path with the current date and time""" - date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - commit_hash = get_commit_hash() - output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) - return output_folder - - -def remove_experiment_folder(experiment_path): - """Check folder if there is a checkpoint, otherwise remove the folder""" - fs = fsspec.get_mapper(experiment_path).fs - checkpoint_files = fs.glob(experiment_path + "/*.pth") - if not checkpoint_files: - if fs.exists(experiment_path): - fs.rm(experiment_path, recursive=True) - print(" ! Run is removed from {}".format(experiment_path)) - else: - print(" ! 
Run is kept in {}".format(experiment_path)) - - -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - def to_camel(text): text = text.capitalize() text = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) @@ -184,44 +123,6 @@ def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: return kwargs -class KeepAverage: - def __init__(self): - self.avg_values = {} - self.iters = {} - - def __getitem__(self, key): - return self.avg_values[key] - - def items(self): - return self.avg_values.items() - - def add_value(self, name, init_val=0, init_iter=0): - self.avg_values[name] = init_val - self.iters[name] = init_iter - - def update_value(self, name, value, weighted_avg=False): - if name not in self.avg_values: - # add value if not exist before - self.add_value(name, init_val=value) - else: - # else update existing value - if weighted_avg: - self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value - self.iters[name] += 1 - else: - self.avg_values[name] = self.avg_values[name] * self.iters[name] + value - self.iters[name] += 1 - self.avg_values[name] /= self.iters[name] - - def add_values(self, name_dict): - for key, value in name_dict.items(): - self.add_value(key, init_val=value) - - def update_values(self, value_dict): - for key, value in value_dict.items(): - self.update_value(key, value) - - def get_timestamp(): return datetime.now().strftime("%y%m%d-%H%M%S") diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 3a527f4609..ca16183d37 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -5,7 +5,7 @@ import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, List, Tuple +from typing import Dict, Tuple import fsspec import requests @@ -260,8 +260,7 @@ def set_model_url(model_item: Dict): def _set_model_item(self, model_name): # fetch model info from the dict if "fairseq" in 
model_name: - model_type = "tts_models" - lang = model_name.split("/")[1] + model_type, lang, dataset, model = model_name.split("/") model_item = { "model_type": "tts_models", "license": "CC BY-NC 4.0", @@ -516,7 +515,7 @@ def _update_path(field_name, new_path, config_path): sub_conf[field_names[-1]] = new_path else: # field name points to a top-level field - if not field_name in config: + if field_name not in config: return if isinstance(config[field_name], list): config[field_name] = [new_path] diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index b98647c30c..6165fb5e8a 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -335,7 +335,7 @@ def tts( # handle multi-lingual language_id = None if self.tts_languages_file or ( - hasattr(self.tts_model, "language_manager") + hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None and not self.tts_config.model == "xtts" ): diff --git a/TTS/vc/configs/shared_configs.py b/TTS/vc/configs/shared_configs.py index 74164a7444..b2fe63d29d 100644 --- a/TTS/vc/configs/shared_configs.py +++ b/TTS/vc/configs/shared_configs.py @@ -1,7 +1,5 @@ -from dataclasses import asdict, dataclass, field -from typing import Dict, List - -from coqpit import Coqpit, check_argument +from dataclasses import dataclass, field +from typing import List from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py index 19f2761bbc..78f1556b71 100644 --- a/TTS/vc/models/base_vc.py +++ b/TTS/vc/models/base_vc.py @@ -357,9 +357,11 @@ def _get_test_aux_input( d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { - "speaker_id": None - if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1), + "speaker_id": ( + None + if not self.config.use_speaker_embedding + else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1) + ), 
"d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index 8bb9989224..8f2a35d204 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -164,7 +164,7 @@ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() self.period = period self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList( [ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), @@ -201,7 +201,7 @@ def forward(self, x): class DiscriminatorS(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm + norm_f = weight_norm if use_spectral_norm is False else spectral_norm self.convs = nn.ModuleList( [ norm_f(Conv1d(1, 16, 15, 1, padding=7)), @@ -468,7 +468,7 @@ def inference(self, c, g=None, mel=None, c_lengths=None): Returns: torch.Tensor: Output tensor. """ - if c_lengths == None: + if c_lengths is None: c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device) if not self.use_spk: g = self.enc_spk.embed_utterance(mel) @@ -544,8 +544,7 @@ def voice_conversion(self, src, tgt): audio = audio[0][0].data.cpu().float().numpy() return audio - def eval_step(): - ... + def eval_step(): ... @staticmethod def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True): @@ -558,5 +557,4 @@ def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cach if eval: self.eval() - def train_step(): - ... + def train_step(): ... 
diff --git a/TTS/vc/modules/freevc/commons.py b/TTS/vc/modules/freevc/commons.py index e799cc2a5b..e5fb13c11c 100644 --- a/TTS/vc/modules/freevc/commons.py +++ b/TTS/vc/modules/freevc/commons.py @@ -1,8 +1,6 @@ import math -import numpy as np import torch -from torch import nn from torch.nn import functional as F diff --git a/TTS/vc/modules/freevc/mel_processing.py b/TTS/vc/modules/freevc/mel_processing.py index 2dcbf21493..1955e758ac 100644 --- a/TTS/vc/modules/freevc/mel_processing.py +++ b/TTS/vc/modules/freevc/mel_processing.py @@ -54,17 +54,19 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) @@ -104,17 +106,19 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, ) y = y.squeeze(1) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) diff --git a/TTS/vc/modules/freevc/speaker_encoder/audio.py b/TTS/vc/modules/freevc/speaker_encoder/audio.py index 52f6fd0893..5b23a4dbb6 100644 --- 
a/TTS/vc/modules/freevc/speaker_encoder/audio.py +++ b/TTS/vc/modules/freevc/speaker_encoder/audio.py @@ -1,13 +1,17 @@ -import struct from pathlib import Path from typing import Optional, Union # import webrtcvad import librosa import numpy as np -from scipy.ndimage.morphology import binary_dilation -from TTS.vc.modules.freevc.speaker_encoder.hparams import * +from TTS.vc.modules.freevc.speaker_encoder.hparams import ( + audio_norm_target_dBFS, + mel_n_channels, + mel_window_length, + mel_window_step, + sampling_rate, +) int16_max = (2**15) - 1 diff --git a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py index 2e21a14fd8..7f811ac3ab 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py @@ -1,4 +1,3 @@ -from pathlib import Path from time import perf_counter as timer from typing import List, Union @@ -8,7 +7,15 @@ from TTS.utils.io import load_fsspec from TTS.vc.modules.freevc.speaker_encoder import audio -from TTS.vc.modules.freevc.speaker_encoder.hparams import * +from TTS.vc.modules.freevc.speaker_encoder.hparams import ( + mel_n_channels, + mel_window_step, + model_embedding_size, + model_hidden_size, + model_num_layers, + partials_n_frames, + sampling_rate, +) class SpeakerEncoder(nn.Module): diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/modules/freevc/wavlm/wavlm.py index fc93bd4f50..10dd09ed0c 100644 --- a/TTS/vc/modules/freevc/wavlm/wavlm.py +++ b/TTS/vc/modules/freevc/wavlm/wavlm.py @@ -155,7 +155,9 @@ def arrange(s, e, length, keep_length): class WavLMConfig: def __init__(self, cfg=None): - self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) + self.extractor_mode: str = ( + "default" # mode for feature extractor. 
default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True) + ) self.encoder_layers: int = 12 # num encoder layers in the transformer self.encoder_embed_dim: int = 768 # encoder embedding dimension @@ -164,7 +166,9 @@ def __init__(self, cfg=None): self.activation_fn: str = "gelu" # activation function to use self.layer_norm_first: bool = False # apply layernorm first in the transformer - self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] + self.conv_feature_layers: str = ( + "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...] + ) self.conv_bias: bool = False # include bias in conv encoder self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this @@ -387,7 +391,7 @@ def make_conv(): nn.init.kaiming_normal_(conv.weight) return conv - assert (is_layer_norm and is_group_norm) == False, "layer norm and group norm are exclusive" + assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive" if is_layer_norm: return nn.Sequential( diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 74cfc7262b..1f977755cc 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -298,7 +298,7 @@ def forward( adv_loss = adv_loss + self.hinge_gan_loss_weight * hinge_fake_loss # Feature Matching Loss - if self.use_feat_match_loss and not feats_fake is None: + if self.use_feat_match_loss and feats_fake is not None: feat_match_loss = self.feat_match_loss(feats_fake, feats_real) return_dict["G_feat_match_loss"] = feat_match_loss adv_loss = adv_loss + self.feat_match_loss_weight * feat_match_loss diff --git 
a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 63a0af4445..113240fd75 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -40,7 +40,7 @@ def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_ Returns: Dict: output figures keyed by the name of the figures. - """ """Plot vocoder model results""" + """ if name_prefix is None: name_prefix = "" diff --git a/docs/source/faq.md b/docs/source/faq.md index fa48c4a9fb..14be9d4c9c 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -3,7 +3,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is ## Errors with a pre-trained model. How can I resolve this? - Make sure you use the right commit version of 🐸TTS. Each pre-trained model has its corresponding version that needs to be used. It is defined on the model table. -- If it is still problematic, post your problem on [Discussions](https://github.com/coqui-ai/TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) +- If it is still problematic, post your problem on [Discussions](https://github.com/eginhard/coqui-tts/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) - If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. ## What are the requirements of a good 🐸TTS dataset? @@ -16,7 +16,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is - If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. ## How can I train my own `tts` model? -0. Check your dataset with notebooks in [dataset_analysis](https://github.com/coqui-ai/TTS/tree/master/notebooks/dataset_analysis) folder. 
Use [this notebook](https://github.com/coqui-ai/TTS/blob/master/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. +0. Check your dataset with notebooks in [dataset_analysis](https://github.com/eginhard/coqui-tts/tree/main/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/eginhard/coqui-tts/blob/main/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. 1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. A `formatter` parses the metadata file and converts a list of training samples. diff --git a/docs/source/inference.md b/docs/source/inference.md index 56bccfb5b2..0b05965f46 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -4,7 +4,7 @@ First, you need to install TTS. We recommend using PyPi. You need to call the command below: ```bash -$ pip install TTS +$ pip install coqui-tts ``` After the installation, 2 terminal commands are available. @@ -14,7 +14,7 @@ After the installation, 2 terminal commands are available. 3. In 🐍Python. - `from TTS.api import TTS` ## On the Commandline - `tts` -![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) +![cli.gif](https://github.com/eginhard/coqui-tts/raw/main/images/tts_cli.gif) After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. 
@@ -81,11 +81,13 @@ tts --model_name "voice_conversion///" ## On the Demo Server - `tts-server` - -![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) + +![server.gif](https://github.com/eginhard/coqui-tts/raw/main/images/demo_server.gif) -You can boot up a demo 🐸TTS server to run an inference with your models. Note that the server is not optimized for performance -but gives you an easy way to interact with the models. +You can boot up a demo 🐸TTS server to run an inference with your models (make +sure to install the additional dependencies with `pip install coqui-tts[server]`). +Note that the server is not optimized for performance but gives you an easy way +to interact with the models. The demo server provides pretty much the same interface as the CLI command. diff --git a/docs/source/installation.md b/docs/source/installation.md index c4d05361f4..92743a9db4 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,6 +1,6 @@ # Installation -🐸TTS supports python >=3.7 <3.11.0 and tested on Ubuntu 18.10, 19.10, 20.10. +🐸TTS supports python >=3.9 <3.12.0 and was tested on Ubuntu 20.04 and 22.04. ## Using `pip` @@ -9,13 +9,13 @@ You can install from PyPI as follows: ```bash -pip install TTS # from PyPI +pip install coqui-tts # from PyPI ``` Or install from Github: ```bash -pip install git+https://github.com/coqui-ai/TTS # from Github +pip install git+https://github.com/eginhard/coqui-tts # from Github ``` ## Installing From Source @@ -23,11 +23,13 @@ pip install git+https://github.com/coqui-ai/TTS # from Github This is recommended for development and more control over 🐸TTS. ```bash -git clone https://github.com/coqui-ai/TTS/ -cd TTS +git clone https://github.com/eginhard/coqui-tts +cd coqui-tts make system-deps # only on Linux systems. 
make install ``` ## On Windows -If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/ \ No newline at end of file +If you are on Windows, 👑@GuyPaddock wrote installation instructions +[here](https://stackoverflow.com/questions/66726331/) (note that these are out +of date, e.g. you need to have at least Python 3.9) diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index 876e09e5b6..335294aa4d 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,3 +1,3 @@ # Trainer API -We made the trainer a separate project on https://github.com/coqui-ai/Trainer +We made the trainer a separate project on https://github.com/eginhard/coqui-trainer diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index b979d04f6e..014b161669 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -3,9 +3,6 @@ ⓍTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy. There is no need for an excessive amount of training data that spans countless hours. -This is the same model that powers [Coqui Studio](https://coqui.ai/), and [Coqui API](https://docs.coqui.ai/docs), however we apply -a few tricks to make it faster and support streaming inference. - ### Features - Voice cloning. - Cross-language voice cloning. @@ -32,21 +29,20 @@ Stay tuned as we continue to add support for more languages. If you have any lan This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml). ### Contact -Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Twitter](https://twitter.com/coqui_ai). -You can also mail us at info@coqui.ai. +Come and join in our 🐸Community. We're active on [Discord](https://discord.gg/fBC58unbKE) and [Github](https://github.com/eginhard/coqui-tts/discussions). 
### Inference #### 🐸TTS Command line -You can check all supported languages with the following command: +You can check all supported languages with the following command: ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ --list_language_idx ``` -You can check all Coqui available speakers with the following command: +You can check all Coqui available speakers with the following command: ```console tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ @@ -280,7 +276,7 @@ To make the `XTTS_v2` fine-tuning more accessible for users that do not have goo The Colab Notebook is available [here](https://colab.research.google.com/drive/1GiI4_X724M8q2W-zZ-jXo7cWTV7RfaH-?usp=sharing). -To learn how to use this Colab Notebook please check the [XTTS fine-tuning video](). +To learn how to use this Colab Notebook please check the [XTTS fine-tuning video](https://www.youtube.com/watch?v=8tpDiiouGxc). If you are not able to acess the video you need to follow the steps: @@ -294,7 +290,7 @@ If you are not able to acess the video you need to follow the steps: ##### Run demo locally To run the demo locally you need to do the following steps: -1. Install 🐸 TTS following the instructions available [here](https://tts.readthedocs.io/en/dev/installation.html#installation). +1. Install 🐸 TTS following the instructions available [here](https://coqui-tts.readthedocs.io/en/latest/installation.html). 2. Install the Gradio demo requirements with the command `python3 -m pip install -r TTS/demos/xtts_ft_demo/requirements.txt` 3. Run the Gradio demo using the command `python3 TTS/demos/xtts_ft_demo/xtts_demo.py` 4. Follow the steps presented in the [tutorial video](https://www.youtube.com/watch?v=8tpDiiouGxc&feature=youtu.be) to be able to fine-tune and test the fine-tuned model. 
diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index acde3fc4c2..dda2abbc36 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -5,14 +5,14 @@ User friendly installation. Recommended only for synthesizing voice. ```bash -$ pip install TTS +$ pip install coqui-tts ``` Developer friendly installation. ```bash -$ git clone https://github.com/coqui-ai/TTS -$ cd TTS +$ git clone https://github.com/eginhard/coqui-tts +$ cd coqui-tts $ pip install -e . ``` @@ -109,14 +109,15 @@ $ tts -h # see the help $ tts --list_models # list the available models. ``` -![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) +![cli.gif](https://github.com/eginhard/coqui-tts/raw/main/images/tts_cli.gif) -You can call `tts-server` to start a local demo server that you can open it on -your favorite web browser and đŸ—Ŗī¸. +You can call `tts-server` to start a local demo server that you can open on +your favorite web browser and đŸ—Ŗī¸ (make sure to install the additional +dependencies with `pip install coqui-tts[server]`). ```bash $ tts-server -h # see the help $ tts-server --list_models # list the available models. 
``` -![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) +![server.gif](https://github.com/eginhard/coqui-tts/raw/main/images/demo_server.gif) diff --git a/images/TTS-performance.png b/images/TTS-performance.png deleted file mode 100644 index 68eebaf7e6..0000000000 Binary files a/images/TTS-performance.png and /dev/null differ diff --git a/images/tts_performance.png b/images/tts_performance.png deleted file mode 100644 index bdff06731e..0000000000 Binary files a/images/tts_performance.png and /dev/null differ diff --git a/notebooks/Tutorial_1_use-pretrained-TTS.ipynb b/notebooks/Tutorial_1_use-pretrained-TTS.ipynb index 87d04c499d..3c2e9de924 100644 --- a/notebooks/Tutorial_1_use-pretrained-TTS.ipynb +++ b/notebooks/Tutorial_1_use-pretrained-TTS.ipynb @@ -41,7 +41,7 @@ "outputs": [], "source": [ "! pip install -U pip\n", - "! pip install TTS" + "! pip install coqui-tts" ] }, { diff --git a/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb b/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb index 0f580a85b6..c4186670c9 100644 --- a/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb +++ b/notebooks/Tutorial_2_train_your_first_TTS_model.ipynb @@ -32,7 +32,7 @@ "source": [ "## Install Coqui TTS\n", "! pip install -U pip\n", - "! pip install TTS" + "! pip install coqui-tts" ] }, { @@ -44,7 +44,7 @@ "\n", "### **First things first**: we need some data.\n", "\n", - "We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specificially, we want _transcribed speech_. The speech must be divided into audio clips and each clip needs transcription. More details about data requirements such as recording characteristics, background noise and vocabulary coverage can be found in the [🐸TTS documentation](https://tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n", + "We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specificially, we want _transcribed speech_. 
The speech must be divided into audio clips and each clip needs transcription. More details about data requirements such as recording characteristics, background noise and vocabulary coverage can be found in the [🐸TTS documentation](https://coqui-tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n", "\n", "If you have a single audio file and you need to **split** it into clips. It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using **wav** file format.\n", "\n", diff --git a/pyproject.toml b/pyproject.toml index 922575305c..50d67db97d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,14 +7,64 @@ requires = [ "packaging", ] -[flake8] -max-line-length=120 +[tool.ruff] +line-length = 120 +lint.extend-select = [ + "B033", # duplicate-value + "C416", # unnecessary-comprehension + "D419", # empty-docstring + "E999", # syntax-error + "F401", # unused-import + "F704", # yield-outside-function + "F706", # return-outside-function + "F841", # unused-variable + "I", # import sorting + "PIE790", # unnecessary-pass + "PLC", + "PLE", + "PLR0124", # comparison-with-itself + "PLR0206", # property-with-parameters + "PLR0911", # too-many-return-statements + "PLR1711", # useless-return + "PLW", + "W291", # trailing-whitespace +] + +lint.ignore = [ + "E501", # line too long + "E722", # bare except (TODO: fix these) + "E731", # don't use lambdas + "E741", # ambiguous variable name + "PLR0912", # too-many-branches + "PLR0913", # too-many-arguments + "PLR0915", # too-many-statements + "UP004", # useless-object-inheritance + "F821", # TODO: enable + "F841", # TODO: enable + "PLW0602", # TODO: enable + "PLW2901", # TODO: enable + "PLW0127", # TODO: enable + "PLW0603", # TODO: enable +] + +[tool.ruff.lint.pylint] +max-args = 5 +max-public-methods = 20 +max-returns = 7 + +[tool.ruff.lint.per-file-ignores] +"**/__init__.py" = [ + "F401", # init files may have "unused" imports for now + "F403", # init files may have star 
imports for now +] +"hubconf.py" = [ + "E402", # module level import not at top of file +] [tool.black] line-length = 120 target-version = ['py39'] -[tool.isort] -line_length = 120 -profile = "black" -multi_line_output = 3 +[tool.coverage.run] +parallel = true +source = ["TTS"] diff --git a/recipes/bel-alex73/README.md b/recipes/bel-alex73/README.md index ad378dd998..6075d3102d 100644 --- a/recipes/bel-alex73/README.md +++ b/recipes/bel-alex73/README.md @@ -39,7 +39,7 @@ Docker container was created for simplify local running. You can run `docker-pre ## Training - with GPU -You need to upload Coqui-TTS(/mycomputer/TTS/) and storage/ directory(/mycomputer/storage/) to some computer with GPU. We don't need cv-corpus/ and fanetyka/ directories for training. Install gcc, then run `pip install -e .[all,dev,notebooks]` to prepare modules. GlowTTS and HifiGan models should be learned separately based on /storage/filtered_dataset only, i.e. they are not dependent from each other. below means list of GPU ids from zero("0,1,2,3" for systems with 4 GPU). See details on the https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html(multi-gpu training). +You need to upload Coqui-TTS(/mycomputer/TTS/) and storage/ directory(/mycomputer/storage/) to some computer with GPU. We don't need cv-corpus/ and fanetyka/ directories for training. Install gcc, then run `pip install -e .[all,dev,notebooks]` to prepare modules. GlowTTS and HifiGan models should be learned separately based on /storage/filtered_dataset only, i.e. they are not dependent from each other. below means list of GPU ids from zero("0,1,2,3" for systems with 4 GPU). See details on the https://coqui-tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html (multi-gpu training). Current setup created for 24GiB GPU. You need to change batch_size if you have more or less GPU memory. Also, you can try to set lr(learning rate) to lower value in the end of training GlowTTS. 
diff --git a/recipes/bel-alex73/train_hifigan.py b/recipes/bel-alex73/train_hifigan.py index 3e740b2ff4..78221a9f2b 100644 --- a/recipes/bel-alex73/train_hifigan.py +++ b/recipes/bel-alex73/train_hifigan.py @@ -1,11 +1,8 @@ -import os - -from coqpit import Coqpit from trainer import Trainer, TrainerArgs from TTS.tts.configs.shared_configs import BaseAudioConfig from TTS.utils.audio import AudioProcessor -from TTS.vocoder.configs.hifigan_config import * +from TTS.vocoder.configs.hifigan_config import HifiganConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.gan import GAN diff --git a/recipes/multilingual/cml_yourtts/train_yourtts.py b/recipes/multilingual/cml_yourtts/train_yourtts.py index 25a2fd0a4b..02f901fe73 100644 --- a/recipes/multilingual/cml_yourtts/train_yourtts.py +++ b/recipes/multilingual/cml_yourtts/train_yourtts.py @@ -4,7 +4,6 @@ from trainer import Trainer, TrainerArgs from TTS.bin.compute_embeddings import compute_embeddings -from TTS.bin.resample import resample_files from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples diff --git a/requirements.dev.txt b/requirements.dev.txt index 8c674727d3..7f76b2400a 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,5 +1,4 @@ -black -coverage -isort +black==24.2.0 +coverage[toml] nose2 -pylint==2.10.2 +ruff==0.3.0 diff --git a/requirements.notebooks.txt b/requirements.notebooks.txt index 65d3f642c9..6b7e6e8956 100644 --- a/requirements.notebooks.txt +++ b/requirements.notebooks.txt @@ -1 +1,2 @@ -bokeh==1.4.0 \ No newline at end of file +bokeh==1.4.0 +pandas>=1.4,<2.0 diff --git a/requirements.txt b/requirements.txt index 1f7a44f6d8..a01efaa648 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,25 @@ # core deps -numpy==1.22.0;python_version<="3.10" -numpy>=1.24.3;python_version>"3.10" +numpy>=1.24.3 cython>=0.29.30 scipy>=1.11.2 torch>=2.1 
torchaudio soundfile>=0.12.0 -librosa>=0.10.0 -scikit-learn>=1.3.0 -numba==0.55.1;python_version<"3.9" -numba>=0.57.0;python_version>="3.9" +librosa>=0.10.1 inflect>=5.6.0 tqdm>=4.64.1 anyascii>=0.3.0 pyyaml>=6.0 -fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail -aiohttp>=3.8.1 +fsspec[http]>=2023.6.0 # <= 2023.9.1 makes aux tests fail packaging>=23.1 -# deps for examples -flask>=2.0.1 # deps for inference pysbd>=0.3.4 # deps for notebooks umap-learn>=0.5.1 -pandas>=1.4,<2.0 # deps for training matplotlib>=3.7.0 # coqui stack -trainer>=0.0.32 +trainer>=0.0.36 # config management coqpit>=0.0.16 # chinese g2p deps @@ -39,7 +31,6 @@ hangul_romanize gruut[de,es,fr]==2.2.3 # deps for korean jamo -nltk g2pkk>=0.1.1 # deps for bangla bangla @@ -51,6 +42,5 @@ transformers>=4.33.0 #deps for bark encodec>=0.1.1 # deps for XTTS -unidecode>=1.3.2 num2words -spacy[ja]>=3 \ No newline at end of file +spacy[ja]>=3 diff --git a/setup.py b/setup.py index df14b41adc..a25b7674f6 100644 --- a/setup.py +++ b/setup.py @@ -23,12 +23,12 @@ import os import subprocess import sys -from packaging.version import Version import numpy import setuptools.command.build_py import setuptools.command.develop from Cython.Build import cythonize +from packaging.version import Version from setuptools import Extension, find_packages, setup python_version = sys.version.split()[0] @@ -66,7 +66,8 @@ def pip_install(package_name): requirements_dev = f.readlines() with open(os.path.join(cwd, "requirements.ja.txt"), "r") as f: requirements_ja = f.readlines() -requirements_all = requirements_dev + requirements_notebooks + requirements_ja +requirements_server = ["flask>=2.0.1"] +requirements_all = requirements_dev + requirements_notebooks + requirements_ja + requirements_server with open("README.md", "r", encoding="utf-8") as readme_file: README = readme_file.read() @@ -78,12 +79,14 @@ def pip_install(package_name): ) ] setup( - name="TTS", + name="coqui-tts", version=version, - 
url="https://github.com/coqui-ai/TTS", + url="https://github.com/eginhard/coqui-tts", author="Eren GÃļlge", author_email="egolge@coqui.ai", - description="Deep learning for Text to Speech by Coqui.", + maintainer="Enno Hermann", + maintainer_email="enno.hermann@gmail.com", + description="Deep learning for Text to Speech.", long_description=README, long_description_content_type="text/markdown", license="MPL-2.0", @@ -100,10 +103,10 @@ def pip_install(package_name): ] }, project_urls={ - "Documentation": "https://github.com/coqui-ai/TTS/wiki", - "Tracker": "https://github.com/coqui-ai/TTS/issues", - "Repository": "https://github.com/coqui-ai/TTS", - "Discussions": "https://github.com/coqui-ai/TTS/discussions", + "Documentation": "https://coqui-tts.readthedocs.io", + "Tracker": "https://github.com/eginhard/coqui-tts/issues", + "Repository": "https://github.com/eginhard/coqui-tts", + "Discussions": "https://github.com/eginhard/coqui-tts/discussions", }, cmdclass={ "build_py": build_py, @@ -115,6 +118,7 @@ def pip_install(package_name): "all": requirements_all, "dev": requirements_dev, "notebooks": requirements_notebooks, + "server": requirements_server, "ja": requirements_ja, }, python_requires=">=3.9.0, <3.12", diff --git a/tests/__init__.py b/tests/__init__.py index e102a2dfee..f0a8b2f118 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,7 +1,8 @@ import os +from trainer.generic_utils import get_cuda + from TTS.config import BaseDatasetConfig -from TTS.utils.generic_utils import get_cuda def get_device_id(): diff --git a/tests/data/ljspeech/metadata_flac.csv b/tests/data/ljspeech/metadata_flac.csv new file mode 100644 index 0000000000..43db05ac91 --- /dev/null +++ b/tests/data/ljspeech/metadata_flac.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only 
sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3 \ No newline at end of file diff --git a/tests/data/ljspeech/metadata_mp3.csv b/tests/data/ljspeech/metadata_mp3.csv new file mode 100644 index 0000000000..109e48b40a --- /dev/null +++ b/tests/data/ljspeech/metadata_mp3.csv @@ -0,0 +1,9 @@ +audio_file|text|transcription|speaker_name +wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present 
concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3 \ No newline at end of file diff --git a/tests/data/ljspeech/metadata_wav.csv b/tests/data/ljspeech/metadata_wav.csv new file mode 100644 index 0000000000..aff73f6d40 --- /dev/null +++ b/tests/data/ljspeech/metadata_wav.csv @@ -0,0 +1,9 @@ 
+audio_file|text|transcription|speaker_name +wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0 +wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0 +wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1 +wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1 +wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2 +wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2 +wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3 +wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3 \ No newline at end of file diff --git a/tests/data/ljspeech/wavs/LJ001-0001.flac b/tests/data/ljspeech/wavs/LJ001-0001.flac new 
file mode 100644 index 0000000000..ed3b009d4f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0001.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0001.mp3 b/tests/data/ljspeech/wavs/LJ001-0001.mp3 new file mode 100644 index 0000000000..da62c8d7f7 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0001.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0002.flac b/tests/data/ljspeech/wavs/LJ001-0002.flac new file mode 100644 index 0000000000..f6a607ea91 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0002.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0002.mp3 b/tests/data/ljspeech/wavs/LJ001-0002.mp3 new file mode 100644 index 0000000000..8eb527924f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0002.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0003.flac b/tests/data/ljspeech/wavs/LJ001-0003.flac new file mode 100644 index 0000000000..05f357a580 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0003.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0003.mp3 b/tests/data/ljspeech/wavs/LJ001-0003.mp3 new file mode 100644 index 0000000000..5bc4449880 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0003.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0004.flac b/tests/data/ljspeech/wavs/LJ001-0004.flac new file mode 100644 index 0000000000..547e7899a8 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0004.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0004.mp3 b/tests/data/ljspeech/wavs/LJ001-0004.mp3 new file mode 100644 index 0000000000..c68a1680f3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0004.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0005.flac b/tests/data/ljspeech/wavs/LJ001-0005.flac new file mode 100644 index 0000000000..94589dbba4 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0005.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0005.mp3 b/tests/data/ljspeech/wavs/LJ001-0005.mp3 
new file mode 100644 index 0000000000..99c245b0c2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0005.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0006.flac b/tests/data/ljspeech/wavs/LJ001-0006.flac new file mode 100644 index 0000000000..87d32d339f Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0006.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0006.mp3 b/tests/data/ljspeech/wavs/LJ001-0006.mp3 new file mode 100644 index 0000000000..bc6cb81fb3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0006.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0007.flac b/tests/data/ljspeech/wavs/LJ001-0007.flac new file mode 100644 index 0000000000..7e2b0f1de7 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0007.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0007.mp3 b/tests/data/ljspeech/wavs/LJ001-0007.mp3 new file mode 100644 index 0000000000..f1e34d1b87 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0007.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0008.flac b/tests/data/ljspeech/wavs/LJ001-0008.flac new file mode 100644 index 0000000000..6ca201a60b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0008.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0008.mp3 b/tests/data/ljspeech/wavs/LJ001-0008.mp3 new file mode 100644 index 0000000000..ede2f06802 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0008.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0009.flac b/tests/data/ljspeech/wavs/LJ001-0009.flac new file mode 100644 index 0000000000..cd272b5f72 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0009.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0009.mp3 b/tests/data/ljspeech/wavs/LJ001-0009.mp3 new file mode 100644 index 0000000000..1dd97c4892 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0009.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0010.flac 
b/tests/data/ljspeech/wavs/LJ001-0010.flac new file mode 100644 index 0000000000..875e01b019 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0010.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0010.mp3 b/tests/data/ljspeech/wavs/LJ001-0010.mp3 new file mode 100644 index 0000000000..a763be3cc5 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0010.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0011.flac b/tests/data/ljspeech/wavs/LJ001-0011.flac new file mode 100644 index 0000000000..3a45005a1a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0011.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0011.mp3 b/tests/data/ljspeech/wavs/LJ001-0011.mp3 new file mode 100644 index 0000000000..579854e193 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0011.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0012.flac b/tests/data/ljspeech/wavs/LJ001-0012.flac new file mode 100644 index 0000000000..2f78f762b3 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0012.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0012.mp3 b/tests/data/ljspeech/wavs/LJ001-0012.mp3 new file mode 100644 index 0000000000..51212f906e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0012.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0013.flac b/tests/data/ljspeech/wavs/LJ001-0013.flac new file mode 100644 index 0000000000..50c7707fbf Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0013.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0013.mp3 b/tests/data/ljspeech/wavs/LJ001-0013.mp3 new file mode 100644 index 0000000000..a457bf9c6e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0013.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0014.flac b/tests/data/ljspeech/wavs/LJ001-0014.flac new file mode 100644 index 0000000000..f8a5fe8823 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0014.flac differ diff --git 
a/tests/data/ljspeech/wavs/LJ001-0014.mp3 b/tests/data/ljspeech/wavs/LJ001-0014.mp3 new file mode 100644 index 0000000000..f4a3d66e69 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0014.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0015.flac b/tests/data/ljspeech/wavs/LJ001-0015.flac new file mode 100644 index 0000000000..99523288ba Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0015.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0015.mp3 b/tests/data/ljspeech/wavs/LJ001-0015.mp3 new file mode 100644 index 0000000000..f0db88e17d Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0015.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0016.flac b/tests/data/ljspeech/wavs/LJ001-0016.flac new file mode 100644 index 0000000000..66b7ca9590 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0016.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0016.mp3 b/tests/data/ljspeech/wavs/LJ001-0016.mp3 new file mode 100644 index 0000000000..cd14b20478 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0016.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0017.flac b/tests/data/ljspeech/wavs/LJ001-0017.flac new file mode 100644 index 0000000000..56725cce10 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0017.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0017.mp3 b/tests/data/ljspeech/wavs/LJ001-0017.mp3 new file mode 100644 index 0000000000..ecc9b2a3eb Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0017.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0018.flac b/tests/data/ljspeech/wavs/LJ001-0018.flac new file mode 100644 index 0000000000..ec038cac88 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0018.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0018.mp3 b/tests/data/ljspeech/wavs/LJ001-0018.mp3 new file mode 100644 index 0000000000..33aa8ba163 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0018.mp3 differ diff 
--git a/tests/data/ljspeech/wavs/LJ001-0019.flac b/tests/data/ljspeech/wavs/LJ001-0019.flac new file mode 100644 index 0000000000..6245cc5a07 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0019.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0019.mp3 b/tests/data/ljspeech/wavs/LJ001-0019.mp3 new file mode 100644 index 0000000000..e1844dce8b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0019.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0020.flac b/tests/data/ljspeech/wavs/LJ001-0020.flac new file mode 100644 index 0000000000..41598a10f1 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0020.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0020.mp3 b/tests/data/ljspeech/wavs/LJ001-0020.mp3 new file mode 100644 index 0000000000..7a61c05082 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0020.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0021.flac b/tests/data/ljspeech/wavs/LJ001-0021.flac new file mode 100644 index 0000000000..3ec0eeb340 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0021.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0021.mp3 b/tests/data/ljspeech/wavs/LJ001-0021.mp3 new file mode 100644 index 0000000000..45a6d4ce10 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0021.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0022.flac b/tests/data/ljspeech/wavs/LJ001-0022.flac new file mode 100644 index 0000000000..9db1c6cf36 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0022.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0022.mp3 b/tests/data/ljspeech/wavs/LJ001-0022.mp3 new file mode 100644 index 0000000000..a0464aa254 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0022.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0023.flac b/tests/data/ljspeech/wavs/LJ001-0023.flac new file mode 100644 index 0000000000..621ba660f2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0023.flac differ 
diff --git a/tests/data/ljspeech/wavs/LJ001-0023.mp3 b/tests/data/ljspeech/wavs/LJ001-0023.mp3 new file mode 100644 index 0000000000..a6b087f8d4 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0023.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0024.flac b/tests/data/ljspeech/wavs/LJ001-0024.flac new file mode 100644 index 0000000000..4125d10bdd Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0024.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0024.mp3 b/tests/data/ljspeech/wavs/LJ001-0024.mp3 new file mode 100644 index 0000000000..0fee298fc6 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0024.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0025.flac b/tests/data/ljspeech/wavs/LJ001-0025.flac new file mode 100644 index 0000000000..ee0c4b6e05 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0025.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0025.mp3 b/tests/data/ljspeech/wavs/LJ001-0025.mp3 new file mode 100644 index 0000000000..f8c13a10be Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0025.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0026.flac b/tests/data/ljspeech/wavs/LJ001-0026.flac new file mode 100644 index 0000000000..119f26fb5e Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0026.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0026.mp3 b/tests/data/ljspeech/wavs/LJ001-0026.mp3 new file mode 100644 index 0000000000..fed88cc961 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0026.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0027.flac b/tests/data/ljspeech/wavs/LJ001-0027.flac new file mode 100644 index 0000000000..ff685ca577 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0027.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0027.mp3 b/tests/data/ljspeech/wavs/LJ001-0027.mp3 new file mode 100644 index 0000000000..bc23ed3199 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0027.mp3 
differ diff --git a/tests/data/ljspeech/wavs/LJ001-0028.flac b/tests/data/ljspeech/wavs/LJ001-0028.flac new file mode 100644 index 0000000000..151334f660 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0028.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0028.mp3 b/tests/data/ljspeech/wavs/LJ001-0028.mp3 new file mode 100644 index 0000000000..0212403392 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0028.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0029.flac b/tests/data/ljspeech/wavs/LJ001-0029.flac new file mode 100644 index 0000000000..65586b6c0a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0029.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0029.mp3 b/tests/data/ljspeech/wavs/LJ001-0029.mp3 new file mode 100644 index 0000000000..f20eb0dfd2 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0029.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0030.flac b/tests/data/ljspeech/wavs/LJ001-0030.flac new file mode 100644 index 0000000000..411553c121 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0030.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0030.mp3 b/tests/data/ljspeech/wavs/LJ001-0030.mp3 new file mode 100644 index 0000000000..7d46fbef9a Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0030.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0031.flac b/tests/data/ljspeech/wavs/LJ001-0031.flac new file mode 100644 index 0000000000..b9f4fa683b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0031.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0031.mp3 b/tests/data/ljspeech/wavs/LJ001-0031.mp3 new file mode 100644 index 0000000000..6842943c27 Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0031.mp3 differ diff --git a/tests/data/ljspeech/wavs/LJ001-0032.flac b/tests/data/ljspeech/wavs/LJ001-0032.flac new file mode 100644 index 0000000000..9166a9d5d5 Binary files /dev/null and 
b/tests/data/ljspeech/wavs/LJ001-0032.flac differ diff --git a/tests/data/ljspeech/wavs/LJ001-0032.mp3 b/tests/data/ljspeech/wavs/LJ001-0032.mp3 new file mode 100644 index 0000000000..cf5abb648b Binary files /dev/null and b/tests/data/ljspeech/wavs/LJ001-0032.mp3 differ diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index cbd98fc0c5..252b429a16 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -8,7 +8,8 @@ from tests import get_tests_data_path, get_tests_output_path from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig -from TTS.tts.datasets import TTSDataset, load_tts_samples +from TTS.tts.datasets import load_tts_samples +from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor @@ -21,15 +22,30 @@ c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c.r = 5 c.data_path = os.path.join(get_tests_data_path(), "ljspeech/") -ok_ljspeech = os.path.exists(c.data_path) -dataset_config = BaseDatasetConfig( - formatter="ljspeech_test", # ljspeech_test to multi-speaker - meta_file_train="metadata.csv", +dataset_config_wav = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_wav.csv", meta_file_val=None, path=c.data_path, language="en", ) +dataset_config_mp3 = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_mp3.csv", + meta_file_val=None, + path=c.data_path, + language="en", +) +dataset_config_flac = BaseDatasetConfig( + formatter="coqui", # ljspeech_test to multi-speaker + meta_file_train="metadata_flac.csv", + meta_file_val=None, + path=c.data_path, + language="en", +) + +dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac] DATA_EXIST = True if not os.path.exists(c.data_path): @@ -44,11 +60,10 @@ def 
__init__(self, *args, **kwargs): self.max_loader_iter = 4 self.ap = AudioProcessor(**c.audio) - def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): + def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False): # load dataset meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2) items = meta_data_train + meta_data_eval - tokenizer, _ = TTSTokenizer.init_from_config(c) dataset = TTSDataset( outputs_per_step=r, @@ -64,6 +79,11 @@ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): max_audio_len=c.max_audio_len, start_by_longest=start_by_longest, ) + + # add preprocess to force the length computation + if preprocess_samples: + dataset.preprocess_samples() + dataloader = DataLoader( dataset, batch_size=batch_size, @@ -75,9 +95,8 @@ def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False): return dataloader, dataset def test_loader(self): - if ok_ljspeech: - dataloader, dataset = self._create_dataloader(1, 1, 0) - + for dataset_config in dataset_configs: + dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True) for i, data in enumerate(dataloader): if i == self.max_loader_iter: break @@ -104,8 +123,6 @@ def test_loader(self): # make sure that the computed mels and the waveform match and correctly computed mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy()) - # remove padding in mel-spectrogram - mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]] # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding mel_new = mel_new[:, : mel_lengths[0]] ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length) @@ -124,40 +141,38 @@ def test_loader(self): self.assertGreaterEqual(mel_input.min(), 0) def test_batch_group_shuffle(self): - if ok_ljspeech: - dataloader, dataset = self._create_dataloader(2, c.r, 16) - last_length = 
0 - frames = dataset.samples - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - avg_length = mel_lengths.numpy().mean() - dataloader.dataset.preprocess_samples() - is_items_reordered = False - for idx, item in enumerate(dataloader.dataset.samples): - if item != frames[idx]: - is_items_reordered = True - break - self.assertGreaterEqual(avg_length, last_length) - self.assertTrue(is_items_reordered) + dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav) + last_length = 0 + frames = dataset.samples + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + mel_lengths = data["mel_lengths"] + avg_length = mel_lengths.numpy().mean() + dataloader.dataset.preprocess_samples() + is_items_reordered = False + for idx, item in enumerate(dataloader.dataset.samples): + if item != frames[idx]: + is_items_reordered = True + break + self.assertGreaterEqual(avg_length, last_length) + self.assertTrue(is_items_reordered) def test_start_by_longest(self): """Test start_by_longest option. Ther first item of the fist batch must be longer than all the other items. 
""" - if ok_ljspeech: - dataloader, _ = self._create_dataloader(2, c.r, 0, True) - dataloader.dataset.preprocess_samples() - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - mel_lengths = data["mel_lengths"] - if i == 0: - max_len = mel_lengths[0] - print(mel_lengths) - self.assertTrue(all(max_len >= mel_lengths)) + dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True) + dataloader.dataset.preprocess_samples() + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + mel_lengths = data["mel_lengths"] + if i == 0: + max_len = mel_lengths[0] + print(mel_lengths) + self.assertTrue(all(max_len >= mel_lengths)) def test_padding_and_spectrograms(self): def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): @@ -172,71 +187,70 @@ def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths): self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0]) self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0]) - if ok_ljspeech: - dataloader, _ = self._create_dataloader(1, 1, 0) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # check mel_spec consistency - wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - mel = torch.FloatTensor(mel).contiguous() - mel_dl = mel_input[0] - # NOTE: Below needs to check == 0 but due to an unknown reason - # there is a slight difference between two matrices. - # TODO: Check this assert cond more in detail. 
- self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) - - # check mel-spec correctness - mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_melspectrogram(mel_spec.T) - self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") - - # check linear-spec - linear_spec = linear_input[0].cpu().numpy() - wav = self.ap.inv_spectrogram(linear_spec.T) - self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") - shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") - - # check the outputs - check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) - - # Test for batch size 2 - dataloader, _ = self._create_dataloader(2, 1, 0) - - for i, data in enumerate(dataloader): - if i == self.max_loader_iter: - break - linear_input = data["linear"] - mel_input = data["mel"] - mel_lengths = data["mel_lengths"] - stop_target = data["stop_targets"] - item_idx = data["item_idxs"] - - # set id to the longest sequence in the batch - if mel_lengths[0] > mel_lengths[1]: - idx = 0 - else: - idx = 1 - - # check the longer item in the batch - check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) - - # check the other item in the batch - self.assertEqual(linear_input[1 - idx, -1].sum(), 0) - self.assertEqual(mel_input[1 - idx, -1].sum(), 0) - self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) - self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) - self.assertEqual(len(mel_lengths.shape), 1) + dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # check mel_spec consistency + wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) + mel = 
self.ap.melspectrogram(wav).astype("float32") + mel = torch.FloatTensor(mel).contiguous() + mel_dl = mel_input[0] + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail. + self.assertLess(abs(mel.T - mel_dl).max(), 1e-5) + + # check mel-spec correctness + mel_spec = mel_input[0].cpu().numpy() + wav = self.ap.inv_melspectrogram(mel_spec.T) + self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav") + shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav") + + # check linear-spec + linear_spec = linear_input[0].cpu().numpy() + wav = self.ap.inv_spectrogram(linear_spec.T) + self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav") + shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav") + + # check the outputs + check_conditions(0, linear_input, mel_input, stop_target, mel_lengths) + + # Test for batch size 2 + dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav) + + for i, data in enumerate(dataloader): + if i == self.max_loader_iter: + break + linear_input = data["linear"] + mel_input = data["mel"] + mel_lengths = data["mel_lengths"] + stop_target = data["stop_targets"] + item_idx = data["item_idxs"] + + # set id to the longest sequence in the batch + if mel_lengths[0] > mel_lengths[1]: + idx = 0 + else: + idx = 1 + + # check the longer item in the batch + check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths) + + # check the other item in the batch + self.assertEqual(linear_input[1 - idx, -1].sum(), 0) + self.assertEqual(mel_input[1 - idx, -1].sum(), 0) + self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1) + self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1]) + self.assertEqual(len(mel_lengths.shape), 1) - # check batch zero-frame conditions (zero-frame disabled) - # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - # assert (mel_input 
* stop_target.unsqueeze(2)).sum() == 0 + # check batch zero-frame conditions (zero-frame disabled) + # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 + # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py index 8810554421..ca25b302c5 100644 --- a/tests/text_tests/test_phonemizer.py +++ b/tests/text_tests/test_phonemizer.py @@ -234,8 +234,12 @@ def test_is_available(self): class TestBN_Phonemizer(unittest.TestCase): def setUp(self): self.phonemizer = BN_Phonemizer() - self._TEST_CASES = "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡, āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ, āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨" - self._EXPECTED = "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡ āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨āĨ¤" + self._TEST_CASES = ( + "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡, āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ, āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨" + ) + self._EXPECTED = ( + "āĻ°āĻžāĻ¸ā§‚āĻ˛ā§āĻ˛ā§āĻ˛āĻžāĻš āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻ˛ā§āĻ˛āĻžāĻšā§ āĻ†āĻ˛āĻžāĻ‡āĻšāĻŋ āĻ“ā§ŸāĻž āĻ¸āĻžāĻ˛ā§āĻ˛āĻžāĻŽ āĻļāĻŋāĻ•ā§āĻˇāĻž āĻĻāĻŋā§Ÿā§‡āĻ›ā§‡āĻ¨ āĻ¯ā§‡ āĻ•ā§‡āĻ‰ āĻ¯āĻĻāĻŋ āĻ•ā§‹āĻ¨ āĻ–āĻžāĻ°āĻžāĻĒ āĻ•āĻŋāĻ›ā§āĻ° āĻ¸āĻŽā§āĻŽā§āĻ–ā§€āĻ¨ āĻšā§Ÿ āĻ¤āĻ–āĻ¨āĻ“ āĻ¯ā§‡āĻ¨āĨ¤" + ) def test_phonemize(self): self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED) diff --git 
a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index b1bdeb9fd1..72b6bcd46b 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -278,7 +278,7 @@ def test_train_step(): }, ) - batch = dict({}) + batch = {} batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device) batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device) batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0] diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 906ec3d09f..7ec3f0df1b 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -4,6 +4,7 @@ import torch from torch import nn, optim +from trainer.generic_utils import count_parameters from tests import get_tests_input_path from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig @@ -24,11 +25,6 @@ WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - class TacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): @@ -266,7 +262,7 @@ def test_train_step(): }, ) - batch = dict({}) + batch = {} batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device) batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device) batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0] diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index fca9955619..e76e29283e 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -64,7 +64,6 @@ def test_load_audio(self): def test_dataset(self): """TODO:""" - ... 
def test_init_multispeaker(self): num_speakers = 10 diff --git a/tests/tts_tests2/test_glow_tts.py b/tests/tts_tests2/test_glow_tts.py index 2a723f105f..b93e701f19 100644 --- a/tests/tts_tests2/test_glow_tts.py +++ b/tests/tts_tests2/test_glow_tts.py @@ -4,6 +4,7 @@ import torch from torch import optim +from trainer.generic_utils import count_parameters from trainer.logging.tensorboard_logger import TensorboardLogger from tests import get_tests_data_path, get_tests_input_path, get_tests_output_path @@ -26,11 +27,6 @@ BATCH_SIZE = 3 -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - class TestGlowTTS(unittest.TestCase): @staticmethod def _create_inputs(batch_size=8): diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index a4a4f72679..c90551b494 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -2,10 +2,10 @@ import unittest import torch +from trainer.generic_utils import count_parameters from tests import get_tests_input_path -from TTS.vc.configs.freevc_config import FreeVCConfig -from TTS.vc.models.freevc import FreeVC +from TTS.vc.models.freevc import FreeVC, FreeVCConfig # pylint: disable=unused-variable # pylint: disable=no-self-use @@ -20,11 +20,6 @@ BATCH_SIZE = 3 -def count_parameters(model): - r"""Count number of trainable parameters in a network""" - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - class TestFreeVC(unittest.TestCase): def _create_inputs(self, config, batch_size=2): input_dummy = torch.rand(batch_size, 30 * config.audio["hop_length"]).to(device) @@ -116,20 +111,14 @@ def test_voice_conversion(self): output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0] ), f"{output_wav.shape} != {source_wav.shape}" - def test_train_step(self): - ... + def test_train_step(self): ... - def test_train_eval_log(self): - ... 
+ def test_train_eval_log(self): ... - def test_test_run(self): - ... + def test_test_run(self): ... - def test_load_checkpoint(self): - ... + def test_load_checkpoint(self): ... - def test_get_criterion(self): - ... + def test_get_criterion(self): ... - def test_init_from_config(self): - ... + def test_init_from_config(self): ... diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py index fe56ee783f..9b10759505 100644 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ b/tests/vocoder_tests/test_wavegrad_train.py @@ -1,43 +1,54 @@ import glob import os import shutil +import unittest from tests import get_device_id, get_tests_output_path, run_cli from TTS.vocoder.configs import WavegradConfig -config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = WavegradConfig( - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - run_eval=True, - test_delay_epochs=-1, - epochs=1, - seq_len=8192, - eval_split_size=1, - print_step=1, - print_eval=True, - data_path="tests/data/ljspeech", - output_path=output_path, - test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) + +class WavegradTrainingTest(unittest.TestCase): + # TODO: Reactivate after 
improving CI run times + # This test currently takes ~2h on CI (15min/step vs 8sec/step locally) + if os.getenv("GITHUB_ACTIONS") == "true": + __test__ = False + + def test_train(self): # pylint: disable=no-self-use + config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") + output_path = os.path.join(get_tests_output_path(), "train_outputs") + + config = WavegradConfig( + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + run_eval=True, + test_delay_epochs=-1, + epochs=1, + seq_len=8192, + eval_split_size=1, + print_step=1, + print_eval=True, + data_path="tests/data/ljspeech", + output_path=output_path, + test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, + ) + config.audio.do_trim_silence = True + config.audio.trim_db = 60 + config.save_json(config_path) + + # train the model for one epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " + ) + run_cli(command_train) + + # Find latest folder + continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + + # restore the model and continue training for one more epoch + command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " + ) + run_cli(command_train) + shutil.rmtree(continue_path)