Commit

Initial commit

marwan2232004 committed Oct 11, 2024
1 parent 0f40561 commit 6493008
Showing 10 changed files with 897 additions and 0 deletions.
165 changes: 165 additions & 0 deletions .gitignore
@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Project-specific: IDE settings, model checkpoints, and test assets
.idea
ASR.pth
*.pth
Test Arabic.mp3
test_main.http

# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
58 changes: 58 additions & 0 deletions Inference.py
@@ -0,0 +1,58 @@
import math

import numpy as np
import torch

from utils.Audio_Processing import preprocess_audio
from utils.Constants import *
from utils.MMS import get_device, MMS, greedyDecoder
from utils.NLP import preprocess_vocab

############################################################################################


model_path = "./ASR_2_1_220.pth"


############################################################################################


def predict(audio_file):
    device = get_device()

    # Convert the input audio file into a mel spectrogram.
    processed_audios = []
    mel_spec, duration = preprocess_audio(audio_file)
    processed_audios.append(mel_spec)

    # Pad each spectrogram along the time axis up to N_FRAMES, keeping the
    # original number of frames so the decoder knows the true length.
    padded_audios = [
        (
            mel_spec.shape[-1],
            np.pad(
                mel_spec,
                ((0, 0), (0, N_FRAMES - mel_spec.shape[-1])),
                mode="constant",
            ),
        )
        for mel_spec in processed_audios
    ]

    char2idx, idx2char, vocab_size = preprocess_vocab()

    # Load the trained transformer model.
    model = MMS(
        vocab_size=vocab_size,
        max_encoder_seq_len=math.ceil(N_FRAMES / 2),
        max_decoder_seq_len=MAX_SEQ_LEN,
        num_encoder_layers=2,
        num_decoder_layers=1,
        d_model=512,
        nhead=8,
        dim_feedforward=2048,
    )

    model.load_state_dict(torch.load(model_path, weights_only=True))
    model.to(device)
    model.eval()

    # Decode greedily, passing the padded spectrogram and its true frame count.
    result = greedyDecoder(
        model, padded_audios[0][1], padded_audios[0][0], char2idx, idx2char, device
    )

    return result
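
A minimal usage sketch for the function above, assuming the checkpoint referenced by model_path is present; the audio file name is illustrative (it matches the clip listed in .gitignore):

# Hypothetical driver code, not part of the commit.
if __name__ == "__main__":
    transcription = predict("Test Arabic.mp3")  # any local Arabic audio clip
    print(transcription)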
97 changes: 97 additions & 0 deletions main.py
@@ -0,0 +1,97 @@
import sys
import io
import os
import tempfile

import dotenv
from fastapi import (
    FastAPI,
    File,
    UploadFile,
)
from fastapi.middleware.cors import CORSMiddleware
from openai import OpenAI
from pydantic import BaseModel

from Inference import predict
from utils.Translation import get_translate

dotenv.load_dotenv()
app = FastAPI()
client = OpenAI()

# Force UTF-8 output so Arabic text prints correctly on consoles with other encodings.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allow all headers
)


@app.get("/")
async def root():
    return {"message": "Hello World"}


@app.post("/translate-any")
async def translate_any(txt: str, language: str):
    # Ask the OpenAI chat model to translate the text into the requested language.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": f"Translate this {txt} into {language} directly without any other words",
            }
        ],
    )
    print(completion.choices[0].message.content)
    return {"response": completion.choices[0].message.content}


class TranslationRequest(BaseModel):
    text: str


@app.post("/translate/auto")
async def translate_auto(request: TranslationRequest):
    response = get_translate(request.text)
    return {"translation": response}


@app.post("/translate/en")
async def translate_en(request: TranslationRequest):
    # Placeholder endpoint: returns the Arabic phrase for "not yet".
    # response = get_translate(request.text)
    return {"translation": "ليس بعد"}


@app.post("/audio2text")
async def upload_audio(file: UploadFile = File(...)):
    # Read the uploaded audio file into memory
    contents = await file.read()

    # Get the current working directory
    current_dir = os.getcwd()
    print(current_dir, flush=True)

    # Create a temporary file in the current working directory
    with tempfile.NamedTemporaryFile(
        dir=current_dir, delete=False, suffix=".wav"
    ) as tmp_file:
        tmp_file.write(contents)
        tmp_file_path = tmp_file.name  # Get the path of the temp file

    try:
        # Pass the path of the saved file to the predict function
        print(f"Temporary file created at: {tmp_file_path}", flush=True)
        result = predict(tmp_file_path)
    finally:
        # Clean up the temporary file after prediction
        os.remove(tmp_file_path)
        print(f"Temporary file deleted: {tmp_file_path}", flush=True)

    return {"text": result}
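
For reference, a client-side sketch for exercising the /audio2text endpoint with the requests package pinned in requirements.txt; it assumes the app is served locally (for example with uvicorn main:app) and the file name is a placeholder:

import requests

# Hypothetical client call, not part of the commit.
with open("recording.wav", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:8000/audio2text",
        files={"file": ("recording.wav", f, "audio/wav")},
    )
print(resp.json())  # e.g. {"text": "..."}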
10 changes: 10 additions & 0 deletions requirements.txt
@@ -0,0 +1,10 @@
fastapi~=0.115.0
openai~=1.50.2
python-dotenv~=1.0.1
numpy~=2.0.2
torch~=2.4.1
uvicorn~=0.31.0
python-multipart~=0.0.12
pydantic~=2.8.2
librosa~=0.10.2.post1
requests~=2.32.3
38 changes: 38 additions & 0 deletions utils/Audio_Processing.py
@@ -0,0 +1,38 @@
import librosa
import numpy as np

from utils.Constants import *


def pad_or_trim(array, length=N_SAMPLES, axis=-1, padding=True):
    # Trim the audio to at most `length` samples along the given axis.
    if array.shape[axis] > length:
        array = array.take(indices=range(length), axis=axis)

    # Optionally right-pad shorter audio with zeros up to `length` samples.
    if padding and array.shape[axis] < length:
        pad_widths = [(0, 0)] * array.ndim
        pad_widths[axis] = (0, length - array.shape[axis])
        array = np.pad(array, pad_widths)

    return array


# Function to load and preprocess audio
def preprocess_audio(file_path):
    # Load the audio at the target sample rate and record its duration.
    audio_data, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    duration = librosa.get_duration(y=audio_data, sr=SAMPLE_RATE)

    # Trim clips longer than one chunk; shorter clips are left unpadded here.
    modified_audio = pad_or_trim(audio_data, padding=False)

    # Short-time Fourier transform -> magnitude spectrogram.
    sgram = librosa.stft(y=modified_audio, n_fft=N_FFT, hop_length=HOP_LENGTH)
    sgram_mag, _ = librosa.magphase(sgram)

    # Project onto the mel scale and convert amplitudes to decibels.
    mel_scale_sgram = librosa.feature.melspectrogram(
        S=sgram_mag, sr=SAMPLE_RATE, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS
    )
    mel_sgram = librosa.amplitude_to_db(mel_scale_sgram, ref=np.min)

    del audio_data, modified_audio, sgram, mel_scale_sgram

    return mel_sgram, duration
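
A small sketch of the expected output, assuming a local clip no longer than CHUNK_LENGTH seconds; the path is a placeholder:

# Hypothetical check, not part of the commit.
mel, dur = preprocess_audio("sample.wav")
print(mel.shape)  # (N_MELS, frames) -> (128, <= 469) for clips up to 15 s
print(dur)        # clip duration in seconds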
22 changes: 22 additions & 0 deletions utils/Constants.py
@@ -0,0 +1,22 @@
import math

N_ROWS = 50000 # NUMBER OF ROWS TAKEN FROM THE DATA
MAX_TEXT_LEN = 70
MAX_SEQ_LEN = 70
N_MELS = 128
SAMPLE_RATE = 16000 # NUMBER OF SAMPLES PER SECOND
HOP_LENGTH = 512  # THE STEP LENGTH OF THE SLIDING WINDOW, COMMONLY SET TO ONE-FOURTH OF N_FFT
N_FFT = 2048  # NUMBER OF FFT POINTS (WINDOW SIZE); CONTROLS THE RESOLUTION OF THE FREQUENCY-DOMAIN ANALYSIS
CHUNK_LENGTH = 15 # 15 SECOND CHUNK
N_SAMPLES = SAMPLE_RATE * CHUNK_LENGTH
N_FRAMES = math.ceil(N_SAMPLES / HOP_LENGTH)
N_SAMPLES_PER_TOKEN = 2 * HOP_LENGTH
FRAMES_PER_SECOND = SAMPLE_RATE // HOP_LENGTH # OR N_FRAMES // CHUNK_LENGTH
TOKENS_PER_SECOND = SAMPLE_RATE // N_SAMPLES_PER_TOKEN
NEG_INFTY = -1e9
special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]


############################################################################################
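
A quick worked check of the derived constants, run from the project root; the printed values follow directly from the definitions above:

from utils.Constants import N_SAMPLES, N_FRAMES, FRAMES_PER_SECOND, TOKENS_PER_SECOND

# Not part of the commit; with SAMPLE_RATE=16000, HOP_LENGTH=512, CHUNK_LENGTH=15:
print(N_SAMPLES)          # 16000 * 15 = 240000 samples per chunk
print(N_FRAMES)           # ceil(240000 / 512) = 469 spectrogram frames
print(FRAMES_PER_SECOND)  # 16000 // 512 = 31
print(TOKENS_PER_SECOND)  # 16000 // 1024 = 15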