From e9e4b5e7f474ad2899a663f7b0692d4159bcd167 Mon Sep 17 00:00:00 2001
From: Josh XT
Date: Sat, 5 Oct 2024 09:45:09 -0400
Subject: [PATCH 1/3] Add create_audiobook function

---
 Pipes.py             | 195 ++++++++++++++++++++++++++++++++++++++++++-
 ezlocalai/Helpers.py |  59 +++++++++++++
 2 files changed, 252 insertions(+), 2 deletions(-)

diff --git a/Pipes.py b/Pipes.py
index 046ee41..31164c0 100644
--- a/Pipes.py
+++ b/Pipes.py
@@ -5,12 +5,16 @@
 from ezlocalai.STT import STT
 from ezlocalai.CTTS import CTTS
 from ezlocalai.Embedding import Embedding
+from ezlocalai.Helpers import chunk_content_by_tokens
+from pydub import AudioSegment
+from Globals import getenv
 from pyngrok import ngrok
 import requests
 import base64
 import pdfplumber
-import torch
-from Globals import getenv
+import random
+import json
+import io
 
 try:
     from ezlocalai.IMG import IMG
@@ -301,3 +305,190 @@ async def get_response(self, data, completion_type="chat"):
         else:
             response["choices"][0]["message"]["content"] += f"\n\n{generated_image}"
         return response, audio_response
+
+    async def create_audiobook(
+        self,
+        book_content,
+        output_file_name,
+        narrator_voice,
+        language="en",
+        translate=False,
+        target_language=None,
+    ):
+        # Step 1: Chunk the book content into paragraphs
+        paragraphs = chunk_content_by_tokens(book_content)
+
+        # Step 2: Extract characters, their lines, genders, and maintain order
+        characters = {}
+        ordered_content = []
+
+        def find_similar_character(name):
+            # Check for exact match first
+            if name in characters:
+                return name
+            # Check for case-insensitive match
+            lower_name = name.lower()
+            for char in characters:
+                if char.lower() == lower_name:
+                    return char
+            # Check for partial matches (e.g., "Mr. Smith" vs "Smith")
+            for char in characters:
+                if name in char or char in name:
+                    return char
+            return None
+
+        for paragraph in paragraphs:
+            prompt = f"""## Paragraph
+{paragraph}
+
+## System
+Analyze the text in the paragraph and extract:
+1. All character names and their genders (male, female, or unknown)
+2. Lines spoken by each character
+3. Narrator lines (not spoken by any character)
+
+Provide the result in JSON format:
+{{
+    "characters": [
+        {{"name": "character1", "gender": "male/female/unknown"}},
+        {{"name": "character2", "gender": "male/female/unknown"}},
+        ...
+    ],
+    "content": [
+        {{"type": "narrator", "text": "narrator line"}},
+        {{"type": "character", "name": "character1", "text": "character1 line"}},
+        {{"type": "narrator", "text": "narrator line"}},
+        {{"type": "character", "name": "character2", "text": "character2 line"}},
+        ...
+    ]
+}}
+Ensure the content array preserves the original order of narration and dialogue."""
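+
+            # NOTE: local models often wrap JSON output in Markdown code
+            # fences or drift from the requested schema; the fence stripping
+            # and the json.JSONDecodeError fallback below exist for that
+            # reason.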
+ ] +}} +Ensure the content array preserves the original order of narration and dialogue.""" + + response = await self.llm.completion(prompt=prompt) + result_text = response["choices"][0]["text"] + + # Strip out code block markers if present + if "```json" in result_text: + result_text = result_text.split("```json")[1].split("```")[0] + elif "```" in result_text: + result_text = result_text.split("```")[1].split("```")[0] + + try: + result = json.loads(result_text) + for char in result.get("characters", []): + similar_char = find_similar_character(char["name"]) + if similar_char: + # Use the existing character name + char["name"] = similar_char + else: + # Add new character + characters[char["name"]] = char["gender"] + + # Update content with potentially merged character names + for item in result.get("content", []): + if item["type"] == "character": + similar_char = find_similar_character(item["name"]) + if similar_char: + item["name"] = similar_char + ordered_content.append(item) + + except json.JSONDecodeError: + logging.error(f"Failed to parse JSON from LLM response: {result_text}") + continue + + # Step 3: Translate the content if necessary + if translate and target_language: + translated_content = [] + for item in ordered_content: + translation_prompt = f"""## Original text:{item['text']}\n\n## System\nTranslate the original text to {target_language}.\nReturn only the translated text without any additional commentary.""" + translation_response = await self.llm.completion( + prompt=translation_prompt + ) + translated_text = translation_response["choices"][0]["text"].strip() + translated_item = item.copy() + translated_item["text"] = translated_text + translated_content.append(translated_item) + ordered_content = translated_content + + # Step 4: Assign voices to characters based on gender + character_voices = {} + male_voices = [f"male-{i}" for i in range(1, 101)] + female_voices = [f"female-{i}" for i in range(1, 101)] + unknown_voices = male_voices + female_voices + random.shuffle(male_voices) + random.shuffle(female_voices) + random.shuffle(unknown_voices) + + for character, gender in characters.items(): + if gender == "male" and male_voices: + character_voices[character] = male_voices.pop() + elif gender == "female" and female_voices: + character_voices[character] = female_voices.pop() + elif unknown_voices: + character_voices[character] = unknown_voices.pop() + else: + logging.warning( + f"Ran out of voices. Reusing voices for character: {character}" + ) + character_voices[character] = random.choice(male_voices + female_voices) + + # Step 5: Generate audio for each item in ordered_content + audio_segments = [] + text_output = [] + + for item in ordered_content: + if item["type"] == "narrator": + try: + audio = await self.ctts.generate( + text=item["text"], voice=narrator_voice, language=language + ) + audio_segments.append(base64.b64decode(audio)) + text_output.append(f"Narrator: {item['text']}") + except Exception as e: + logging.error( + f"Failed to generate audio for narrator text: {item['text'][:50]}... Error: {str(e)}" + ) + elif item["type"] == "character": + character_voice = character_voices.get(item["name"], narrator_voice) + try: + audio = await self.ctts.generate( + text=item["text"], voice=character_voice, language=language + ) + audio_segments.append(base64.b64decode(audio)) + text_output.append(f"{item['name']}: {item['text']}") + except Exception as e: + logging.error( + f"Failed to generate audio for character {item['name']}: {item['text'][:50]}... 
+        # Step 5: Generate audio for each item in ordered_content
+        audio_segments = []
+        text_output = []
+
+        for item in ordered_content:
+            if item["type"] == "narrator":
+                try:
+                    audio = await self.ctts.generate(
+                        text=item["text"], voice=narrator_voice, language=language
+                    )
+                    audio_segments.append(base64.b64decode(audio))
+                    text_output.append(f"Narrator: {item['text']}")
+                except Exception as e:
+                    logging.error(
+                        f"Failed to generate audio for narrator text: {item['text'][:50]}... Error: {str(e)}"
+                    )
+            elif item["type"] == "character":
+                character_voice = character_voices.get(item["name"], narrator_voice)
+                try:
+                    audio = await self.ctts.generate(
+                        text=item["text"], voice=character_voice, language=language
+                    )
+                    audio_segments.append(base64.b64decode(audio))
+                    text_output.append(f"{item['name']}: {item['text']}")
+                except Exception as e:
+                    logging.error(
+                        f"Failed to generate audio for character {item['name']}: {item['text'][:50]}... Error: {str(e)}"
+                    )
+
+        # Step 6: Combine all audio segments
+        combined_audio = AudioSegment.empty()
+        for audio_data in audio_segments:
+            try:
+                audio = AudioSegment.from_wav(io.BytesIO(audio_data))
+                combined_audio += audio
+                combined_audio += AudioSegment.silent(
+                    duration=500
+                )  # 0.5 second pause between segments
+            except Exception as e:
+                logging.error(f"Failed to process audio segment. Error: {str(e)}")
+        outputs = os.path.join(os.getcwd(), "outputs")
+        # Step 7: Export the final audiobook
+        audio_output_path = os.path.join(outputs, f"{output_file_name}.mp3")
+        combined_audio.export(audio_output_path, format="mp3")
+
+        # Step 8: Save the text output
+        text_output_path = os.path.join(outputs, f"{output_file_name}.txt")
+        with open(text_output_path, "w", encoding="utf-8") as f:
+            f.write("\n\n".join(text_output))
+
+        logging.info(f"Audiobook created successfully: {audio_output_path}")
+        logging.info(f"Text output saved: {text_output_path}")
+
+        return {
+            "audio_file": audio_output_path,
+            "text_file": text_output_path,
+            "original_text": book_content,
+            "translated_text": "\n\n".join([item["text"] for item in ordered_content]),
+            "character_voices": character_voices,
+        }
diff --git a/ezlocalai/Helpers.py b/ezlocalai/Helpers.py
index 9527ab0..8fb6783 100644
--- a/ezlocalai/Helpers.py
+++ b/ezlocalai/Helpers.py
@@ -27,3 +27,62 @@ def chunk_content(text: str) -> List[str]:
     sentences = list(doc.sents)
     content_chunks = [str(sentence).strip() for sentence in sentences]
     return content_chunks
+
+
+# Export chunks of paragraphs up to 2000 tokens
+def chunk_content_by_tokens(text: str, max_tokens: int = 2000) -> List[str]:
+    # Load spaCy model
+    try:
+        nlp = spacy.load("en_core_web_sm")
+    except OSError:
+        spacy.cli.download("en_core_web_sm")
+        nlp = spacy.load("en_core_web_sm")
+
+    encoding = tiktoken.get_encoding("cl100k_base")
+    paragraphs = text.split("\n\n")
+    chunks = []
+    current_chunk = []
+    current_chunk_tokens = 0
+
+    def add_to_chunk(content: str):
+        nonlocal current_chunk, current_chunk_tokens, chunks
+        content_tokens = encoding.encode(content)
+        if current_chunk_tokens + len(content_tokens) > max_tokens:
+            chunks.append("\n\n".join(current_chunk))
+            current_chunk = []
+            current_chunk_tokens = 0
+        current_chunk.append(content)
+        current_chunk_tokens += len(content_tokens)
+
+    for paragraph in paragraphs:
+        paragraph_tokens = encoding.encode(paragraph)
+        if len(paragraph_tokens) <= max_tokens:
+            add_to_chunk(paragraph)
+        else:
+            # Split long paragraph into sentences using spaCy
+            doc = nlp(paragraph)
+            sentences = [sent.text for sent in doc.sents]
+            current_sentence_group = []
+            current_group_tokens = 0
+
+            for sentence in sentences:
+                sentence_tokens = encoding.encode(sentence)
+                if current_group_tokens + len(sentence_tokens) <= max_tokens:
+                    current_sentence_group.append(sentence)
+                    current_group_tokens += len(sentence_tokens)
+                else:
+                    # Add the current group of sentences as a chunk
+                    if current_sentence_group:
+                        add_to_chunk(" ".join(current_sentence_group))
+                    current_sentence_group = [sentence]
+                    current_group_tokens = len(sentence_tokens)
+
+            # Add any remaining sentences
+            if current_sentence_group:
+                add_to_chunk(" ".join(current_sentence_group))
+
+    # Add the last chunk if it's not empty
+    if current_chunk:
+        chunks.append("\n\n".join(current_chunk))
+
+    return chunks

From b2d6505e5dfe7029911598e5675856bce156aa49 Mon Sep 17 00:00:00 2001
From: Josh XT
Date: Sat, 5 Oct 2024 09:51:07 -0400
Subject: [PATCH 2/3] inject characters
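
Feeding the running character map back into the extraction prompt lets the
model reconcile new mentions against names it has already seen (e.g. "Smith"
vs. "Mr. Smith") and keeps genders stable across chunks. The injected block
is just the accumulated dict, for example (names illustrative):

    {
        "Mr. Smith": "male",
        "Alice": "female"
    }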
---
 Pipes.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Pipes.py b/Pipes.py
index 31164c0..024170b 100644
--- a/Pipes.py
+++ b/Pipes.py
@@ -338,12 +338,16 @@ def find_similar_character(name):
             return None
 
         for paragraph in paragraphs:
-            prompt = f"""## Paragraph
+            # Inject a list of characters we know so far.
+            prompt = f"""## Characters we know so far:
+{json.dumps(characters, indent=4)}
+
+## Paragraph
 {paragraph}
 
 ## System
 Analyze the text in the paragraph and extract:
-1. All character names and their genders (male, female, or unknown)
+1. All character names and their genders (male, female, or unknown). Use best judgment based on historical uses of a name to determine gender. Attempt to normalize character names to match existing characters if possible.
 2. Lines spoken by each character
 3. Narrator lines (not spoken by any character)
 

From e2676a012d3321f8776cceda5e1a3768d991c0e8 Mon Sep 17 00:00:00 2001
From: Josh XT
Date: Sat, 5 Oct 2024 20:17:24 -0400
Subject: [PATCH 3/3] add endpoint
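
Adds /v1/audio/book: a multipart upload of a book file (pdf, docx, zip, csv,
xlsx, or plain text) plus optional voice/language form fields; responds with
URLs for the rendered mp3 and transcript. A sketch of a request (host, port,
and API key are illustrative):

    curl -X POST http://localhost:8091/v1/audio/book \
      -H "Authorization: Bearer $EZLOCALAI_API_KEY" \
      -F "file=@mybook.pdf" \
      -F "voice=default" \
      -F "language=en"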
---
 Pipes.py              | 88 ++++++++++++++++++++++++++++++++++---------
 app.py                | 37 +++++++++++++++++-
 cuda-requirements.txt |  4 +-
 requirements.txt      |  4 +-
 4 files changed, 112 insertions(+), 21 deletions(-)

diff --git a/Pipes.py b/Pipes.py
index 024170b..eddcb31 100644
--- a/Pipes.py
+++ b/Pipes.py
@@ -7,11 +7,15 @@
 from ezlocalai.Embedding import Embedding
 from ezlocalai.Helpers import chunk_content_by_tokens
 from pydub import AudioSegment
+from datetime import datetime
 from Globals import getenv
 from pyngrok import ngrok
 import requests
 import base64
 import pdfplumber
+import zipfile
+import docx2txt
+import pandas as pd
 import random
 import json
 import io
@@ -26,6 +30,61 @@
     from ezlocalai.VLM import VLM
 
 
+async def file_to_text(file_path: str = ""):
+    """
+    Convert a file to plain text
+
+    Args:
+        file_path (str, optional): Path to the file. Defaults to "".
+
+    Returns:
+        str: Extracted text content of the file
+    """
+    file_content = ""
+    file_name = os.path.basename(file_path)
+    logging.info(f"File path: {file_path}")
+    file_type = file_name.split(".")[-1]
+    if file_type == "pdf":
+        with pdfplumber.open(file_path) as pdf:
+            # extract_text() can return None for image-only pages
+            content = "\n".join([page.extract_text() or "" for page in pdf.pages])
+        file_content += content
+    elif file_path.endswith(".zip"):
+        extracted_zip_folder_name = f"extracted_{file_name.replace('.zip', '_zip')}"
+        new_folder = os.path.join(
+            os.path.dirname(file_path), extracted_zip_folder_name
+        )
+        file_content += f"Content from the zip file uploaded named `{file_name}`:\n"
+        with zipfile.ZipFile(file_path, "r") as zipObj:
+            zipObj.extractall(path=new_folder)
+        # Iterate over every file that was extracted including subdirectories
+        for root, dirs, files in os.walk(new_folder):
+            for name in files:
+                file_content += f"Content from file uploaded named `{name}`:\n"
+                file_content += await file_to_text(file_path=os.path.join(root, name))
+        return file_content
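+    # NOTE: docx2txt only parses .docx (zip) containers; routing legacy
+    # binary .doc files through it is optimistic and may fail on real .doc
+    # input.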
+    elif file_path.endswith(".doc") or file_path.endswith(".docx"):
+        file_content = docx2txt.process(file_path)
+    elif file_type == "csv":
+        with open(file_path, "r") as f:
+            file_content = f.read()
+    elif file_type == "xlsx" or file_type == "xls":
+        xl = pd.ExcelFile(file_path)
+        if len(xl.sheet_names) > 1:
+            # Convert each sheet to CSV and concatenate the results
+            for i, sheet_name in enumerate(xl.sheet_names, 1):
+                df = xl.parse(sheet_name)
+                csv_file_path = file_path.replace(f".{file_type}", f"_{i}.csv")
+                df.to_csv(csv_file_path, index=False)
+                with open(csv_file_path, "r") as f:
+                    file_content += f.read() + "\n"
+        else:
+            df = pd.read_excel(file_path)
+            csv_file_path = file_path.replace(f".{file_type}", ".csv")
+            df.to_csv(csv_file_path, index=False)
+            with open(csv_file_path, "r") as f:
+                file_content = f.read()
+    else:
+        with open(file_path, "r") as f:
+            file_content = f.read()
+    return file_content
+
+
 class Pipes:
     def __init__(self):
         load_dotenv()
@@ -308,15 +367,14 @@ async def get_response(self, data, completion_type="chat"):
 
     async def create_audiobook(
         self,
-        book_content,
-        output_file_name,
-        narrator_voice,
+        content,
+        voice,
         language="en",
-        translate=False,
-        target_language=None,
     ):
+        string_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+        output_file_name = f"audiobook_{string_timestamp}"
         # Step 1: Chunk the book content into paragraphs
-        paragraphs = chunk_content_by_tokens(book_content)
+        paragraphs = chunk_content_by_tokens(content)
 
         # Step 2: Extract characters, their lines, genders, and maintain order
         characters = {}
@@ -401,10 +459,10 @@ def find_similar_character(name):
                 continue
 
         # Step 3: Translate the content if necessary
-        if translate and target_language:
+        if language != "en":
             translated_content = []
             for item in ordered_content:
-                translation_prompt = f"""## Original text:{item['text']}\n\n## System\nTranslate the original text to {target_language}.\nReturn only the translated text without any additional commentary."""
+                translation_prompt = f"""## Original text:{item['text']}\n\n## System\nTranslate the original text to {language}.\nReturn only the translated text without any additional commentary."""
                 translation_response = await self.llm.completion(
                     prompt=translation_prompt
                 )
@@ -444,7 +502,7 @@ def find_similar_character(name):
             if item["type"] == "narrator":
                 try:
                     audio = await self.ctts.generate(
-                        text=item["text"], voice=narrator_voice, language=language
+                        text=item["text"], voice=voice, language=language
                     )
                     audio_segments.append(base64.b64decode(audio))
                     text_output.append(f"Narrator: {item['text']}")
@@ -453,7 +511,7 @@ def find_similar_character(name):
                         f"Failed to generate audio for narrator text: {item['text'][:50]}... Error: {str(e)}"
                     )
             elif item["type"] == "character":
-                character_voice = character_voices.get(item["name"], narrator_voice)
+                character_voice = character_voices.get(item["name"], voice)
                 try:
                     audio = await self.ctts.generate(
                         text=item["text"], voice=character_voice, language=language
@@ -485,14 +543,8 @@ def find_similar_character(name):
         text_output_path = os.path.join(outputs, f"{output_file_name}.txt")
         with open(text_output_path, "w", encoding="utf-8") as f:
             f.write("\n\n".join(text_output))
-
-        logging.info(f"Audiobook created successfully: {audio_output_path}")
-        logging.info(f"Text output saved: {text_output_path}")
-
         return {
-            "audio_file": audio_output_path,
-            "text_file": text_output_path,
-            "original_text": book_content,
-            "translated_text": "\n\n".join([item["text"] for item in ordered_content]),
+            "audio_file": f"{self.local_uri}/outputs/{output_file_name}.mp3",
+            "text_file": f"{self.local_uri}/outputs/{output_file_name}.txt",
             "character_voices": character_voices,
         }
diff --git a/app.py b/app.py
index 487ef3f..1517fd2 100644
--- a/app.py
+++ b/app.py
@@ -13,7 +13,8 @@
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import List, Dict, Union, Optional
-from Pipes import Pipes
+from Pipes import Pipes, file_to_text
+from fastapi import Form
 import base64
 import os
 import logging
@@ -376,6 +376,41 @@ async def upload_voice(
     return {"detail": f"Voice {voice_name} has been uploaded."}
 
 
+class BookToSpeechResponse(BaseModel):
+    audio_file: str
+    text_file: str
+
+
+@app.post(
+    "/v1/audio/book",
+    tags=["Audio"],
+    dependencies=[Depends(verify_api_key)],
+)
+async def book_to_speech(
+    file: UploadFile = File(...),
+    voice: str = Form("default"),
+    language: str = Form("en"),
+    user=Depends(verify_api_key),
+):
+    # voice and language arrive as form fields; a JSON body model cannot be
+    # combined with UploadFile in a multipart request.
+    if getenv("TTS_ENABLED").lower() == "false":
+        raise HTTPException(status_code=404, detail="Text to speech is disabled.")
+    file_type = file.filename.split(".")[-1]
+    file_path = os.path.join(
+        os.getcwd(), "outputs", f"{uuid.uuid4().hex}.{file_type}"
+    )
+    with open(file_path, "wb") as book_file:
+        book_file.write(await file.read())
+    file_content = await file_to_text(file_path=file_path)
+    audiobook = await pipe.create_audiobook(
+        voice=voice, language=language, content=file_content
+    )
+    return BookToSpeechResponse(
+        audio_file=audiobook["audio_file"], text_file=audiobook["text_file"]
+    )
+
+
 # Image Generation endpoint
 # https://platform.openai.com/docs/api-reference/images
 
diff --git a/cuda-requirements.txt b/cuda-requirements.txt
index 1da8798..6c0be96 100644
--- a/cuda-requirements.txt
+++ b/cuda-requirements.txt
@@ -28,4 +28,6 @@ onnx
 timm>=0.9.16
 sentencepiece
 attrdict
-einops
\ No newline at end of file
+einops
+docx2txt
+pandas
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 43160df..658103c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,4 +25,6 @@ optimum
 onnx
 diffusers[torch]
 torchaudio==2.3.1
-llama-cpp-python==0.2.90
\ No newline at end of file
+llama-cpp-python==0.2.90
+docx2txt
+pandas
\ No newline at end of file