Added grammar builder and validator and endpoints
Dicklesworthstone committed Oct 3, 2023
1 parent fb6a331 commit 50199da
Showing 5 changed files with 402 additions and 4 deletions.
5 changes: 4 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,6 @@
{
    "git.ignoreLimitWarning": true
    "git.ignoreLimitWarning": true,
    "cSpell.words": [
        "bfnrt"
    ]
}
14 changes: 14 additions & 0 deletions embeddings_data_models.py
@@ -230,3 +230,17 @@ class AudioTranscriptResponse(BaseModel):
class ShowLogsIncrementalModel(BaseModel):
    logs: str
    last_position: int

class GrammarBuilderRequest(BaseModel):
    sample_json: Optional[dict] = None
    pydantic_model_description: Optional[str] = None

class GrammarBuilderResponse(BaseModel):
    bnf_grammar: str

class AddGrammarRequest(BaseModel):
    bnf_grammar: str
    grammar_file_name: str

class AddGrammarResponse(BaseModel):
    valid_grammar_files: List[str]
223 changes: 223 additions & 0 deletions grammar_builder.py
@@ -0,0 +1,223 @@
from typing import List, Dict
import json

use_grammarbuilder_demo = 1  # set to 0 to skip the inline demo at the bottom of this file

def normalize_json(json_str):
    """Strip whitespace outside of string literals while preserving it inside them."""
    output = []
    in_string = False
    escape_char = False
    for char in json_str:
        if char == "\\" and not escape_char:
            escape_char = True
            output.append(char)
            continue
        if char == '"' and not escape_char:
            in_string = not in_string
        if in_string:
            output.append(char)
        else:
            if char.strip():
                output.append(char)
        if escape_char:
            escape_char = False
    return ''.join(output)
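
# A quick illustrative check of normalize_json (hypothetical, not part of the
# commit): whitespace outside string literals is dropped, while whitespace
# inside string literals survives:
#   normalize_json('{ "a": 1,\n  "msg": "two  spaces" }')
#   -> '{"a":1,"msg":"two  spaces"}'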

class GrammarBuilder:
    type_to_bnf: Dict[str, str] = {
        "str": "string",
        "float": "number",
        "int": "number",
        "bool": "bool",
        "datetime": "datetime",
        "List": "list",
        "Dict": "dict",
        "Optional": "optional"
    }

    def __init__(self):
        self.rules = {
            "ws": "([ \\t\\n] ws)?",
            "string": '\\" ([^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \\" ws',
            "number": '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws',
            "bool": "('true' | 'false') ws",
            "datetime": "string",
            "dict": "'{' ws dict_pair_list ws '}' ws",
            "dict_pair_list": "dict_pair (',' ws dict_pair)*",
            "dict_pair": "string ':' ws value ws",
            "list": "'[' ws list_items ws ']' ws",
            "list_items": "value (',' ws value)*"
        }


    def generate_bnf_from_fields(self, fields: List[str], parent="root") -> str:
        bnf = []
        keys = ' | '.join([f'"{field.split(":")[0].strip()}"' for field in fields])
        bnf.append(f"{parent} ::= '{{' ws {parent}_pair_list ws '}}' ws")
        bnf.append(f"{parent}_pair_list ::= {parent}_pair (',' ws {parent}_pair)*")
        bnf.append(f"{parent}_pair ::= allowed_keys_{parent} ':' ws value ws")
        bnf.append(f"allowed_keys_{parent} ::= {keys}")
        value_types = set()
        for field in fields:
            field_name, field_type = field.split(":")
            field_name, field_type = field_name.strip(), field_type.strip()
            parsed_type = self.type_to_bnf.get(field_type, field_type)
            if field_type.startswith("List"):
                parsed_type = "list"
            value_types.add(parsed_type)
        bnf.append(f"value ::= {' | '.join(value_types)}")
        return "\n".join(bnf)

    def pydantic_to_json_bnf(self, model_description: str) -> str:
        lines = model_description.strip().split('\n')[1:]
        fields = [line.strip() for line in lines if ':' in line]
        bnf_for_fields = self.generate_bnf_from_fields(fields)
        return f"{bnf_for_fields}\n{self.generate_base_rules()}"

    def generate_base_rules(self):
        return "\n".join([f"{key} ::= {value}" for key, value in self.rules.items()])

    def generate_bnf(self, data, parent="root"):
        bnf = []
        if isinstance(data, dict):
            keys = ' | '.join([f'"{key}"' for key in data.keys()])
            bnf.append(f"{parent} ::= '{{' ws {parent}_pair_list ws '}}' ws")
            bnf.append(f"{parent}_pair_list ::= {parent}_pair (',' ws {parent}_pair)*")
            bnf.append(f"{parent}_pair ::= allowed_keys_{parent} ':' ws value ws")
            bnf.append(f"allowed_keys_{parent} ::= {keys}")
            sample_key = next(iter(data.keys()))
            if isinstance(data[sample_key], dict):
                bnf.append(f"value ::= {self.generate_bnf(data[sample_key], 'nested_value')}")
        elif isinstance(data, list):
            if len(data) > 0:
                sample_item = data[0]
                rule_name = f"{parent}_item"
                bnf.append(f"{parent} ::= '[' ws {rule_name} (',' ws {rule_name})* ']' ws")
                bnf.append(f"{rule_name} ::= {self.type_to_bnf.get(type(sample_item).__name__, type(sample_item).__name__)}")
            else:
                bnf.append(f"{parent} ::= '[' ws ']' ws")
        else:
            bnf.append(f"{parent} ::= {self.type_to_bnf.get(type(data).__name__, type(data).__name__)} ws")
        return "\n".join(bnf)

    def json_to_bnf(self, json_str):
        normalized_str = normalize_json(json_str)
        try:
            parsed_data = json.loads(normalized_str)
        except json.JSONDecodeError as e:
            return f"Invalid JSON: {e}"
        bnf_grammar = self.generate_bnf(parsed_data)
        return f"{bnf_grammar}\n{self.generate_base_rules()}"
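
# Illustrative sketch (hypothetical, not from the commit) of the object-level
# rules json_to_bnf emits for a flat JSON object, before the base rules from
# generate_base_rules() are appended:
#   GrammarBuilder().json_to_bnf('{"name": "John", "age": 30}')
#   root ::= '{' ws root_pair_list ws '}' ws
#   root_pair_list ::= root_pair (',' ws root_pair)*
#   root_pair ::= allowed_keys_root ':' ws value ws
#   allowed_keys_root ::= "name" | "age"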


if use_grammarbuilder_demo:
    gb = GrammarBuilder()
    sample_json = '''
{
"Optimistic": {
"score": 70.0,
"explanation": "The statement talks about secular industry tailwinds and expectations to grow the business at a rate exceeding global GDP."
},
"Pessimistic": {
"score": -20.0,
"explanation": "The paragraph acknowledges that they've experienced equity losses year-to-date."
},
"Confident": {
"score": 60.0,
"explanation": "The text shows belief in their people, platform, and their prospect of gaining market share."
},
"Cautious": {
"score": 40.0,
"explanation": "Mentions the possibility of falling below the target margins but aims to stay within the range."
},
"Transparent": {
"score": 80.0,
"explanation": "Provides clear information on financial outlook, including specifics about Adjusted EBITDA."
},
"Vague": {
"score": -80.0,
"explanation": "The text is quite specific and does not evade details."
},
"Upbeat": {
"score": 20.0,
"explanation": "The tone is more balanced and not overtly enthusiastic."
},
"Disappointed": {
"score": -10.0,
"explanation": "Acknowledges equity losses but doesn't express dissatisfaction."
},
"Reassuring": {
"score": 50.0,
"explanation": "Tries to reassure by focusing on core business and tailwinds."
},
"Evasive": {
"score": -100.0,
"explanation": "No signs of avoiding any topics; quite straightforward."
},
"Committed": {
"score": 60.0,
"explanation": "Shows dedication to running the core business within the stated margin."
},
"Analytical": {
"score": 70.0,
"explanation": "Provides a breakdown of the financial situation and market conditions."
},
"Ambitious": {
"score": 50.0,
"explanation": "Talks about exceeding global GDP growth."
},
"Concerned": {
"score": -10.0,
"explanation": "Reflects worry about equity losses but not overly so."
},
"Focused": {
"score": 80.0,
"explanation": "Focuses on core business and previously stated margin."
},
"Uncertain": {
"score": -90.0,
"explanation": "No ambiguity in the statements; quite specific."
},
"Responsive": {
"score": 60.0,
"explanation": "Directly addresses the financial outlook and plans."
},
"Defensive": {
"score": -100.0,
"explanation": "No signs of defending or justifying decisions."
},
"Strategic": {
"score": 60.0,
"explanation": "Discusses gaining share and investment in people and platform."
},
"Realistic": {
"score": 40.0,
"explanation": "Acknowledges challenges but maintains a balanced view."
}
}
'''

    bnf_grammar = gb.json_to_bnf(sample_json)
    print(bnf_grammar)
    print('\n' + '_' * 80 + '\n')

    gb = GrammarBuilder()
    sample_pydantic_model_description = '''
class AudioTranscriptResponse(BaseModel):
audio_file_hash: str
audio_file_name: str
audio_file_size_mb: float
segments_json: List[dict]
combined_transcript_text: str
combined_transcript_text_list_of_metadata_dicts: List[dict]
info_json: dict
url_to_download_zip_file_of_embeddings: str
ip_address: str
request_time: datetime
response_time: datetime
total_time: float
'''

    bnf_grammar = gb.pydantic_to_json_bnf(sample_pydantic_model_description)
    print(bnf_grammar)

31 changes: 31 additions & 0 deletions service_functions.py
@@ -7,6 +7,7 @@
from embeddings_data_models import EmbeddingRequest, TextCompletionRequest
from embeddings_data_models import TextCompletionResponse, AudioTranscriptResponse
import os
import re
import shutil
import psutil
import glob
@@ -614,3 +615,33 @@ async def generate_completion_from_llm(request: TextCompletionRequest, req: Requ
            llm_model_usage_json = llm_model_usage_json)
        list_of_responses.append(response)
    return list_of_responses

def validate_bnf_grammar_revised(grammar):
    defined_rules, used_rules = set(), set()
    for line in grammar.strip().split('\n'):
        if '::=' not in line:
            continue
        parts = line.split('::=')
        rule = parts[0].strip()
        if rule in defined_rules:
            return False, f"Rule {rule} is defined more than once."
        defined_rules.add(rule)
        expression = parts[-1]
        # Tokenize the expression using regex
        tokens = re.findall(r'\b[\w-]+\b|\[.*?\]|\(.*?\)|".*?"', expression)
        # Additional handling for complex expressions
        complex_tokens = re.findall(r'[\w-]+\[[\w-]+\]', expression)
        tokens.extend(complex_tokens)
        for token in tokens:
            if token.startswith('[') or token.startswith('(') or token.startswith('"'):
                continue  # Skip character classes, optional constructs, and string literals
            if '[' in token and ']' in token:  # Split complex tokens into individual rules
                sub_parts = token.split('[')
                used_rules.add(sub_parts[0])
                used_rules.add(sub_parts[1][:-1])
                continue
            used_rules.add(token)
    for rule in used_rules:
        if rule not in defined_rules:
            return False, f"Used rule {rule} is not defined."
    return True, "Valid BNF Grammar"
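
# Hypothetical usage sketch (not part of the commit): the validator returns a
# (bool, message) tuple, so callers should unpack it rather than truth-testing
# the tuple, which is always truthy:
#   is_valid, message = validate_bnf_grammar_revised(my_grammar)
#   if not is_valid:
#       raise ValueError(message)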
133 changes: 130 additions & 3 deletions swiss_army_llama.py
@@ -5,11 +5,12 @@
from ramdisk_functions import clear_ramdisk
from misc_utility_functions import build_faiss_indexes, safe_path
from embeddings_data_models import DocumentEmbedding, TokenLevelEmbeddingBundle
from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse
from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest, GrammarBuilderRequest, AddGrammarRequest
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse, GrammarBuilderResponse, AddGrammarResponse
from embeddings_data_models import ShowLogsIncrementalModel
from service_functions import get_or_compute_embedding, get_or_compute_transcript, add_model_url, get_or_compute_token_level_embedding_bundle_combined_feature_vector, calculate_token_level_embeddings
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_revised
from grammar_builder import GrammarBuilder
from log_viewer_functions import show_logs_incremental_func, show_logs_func
from uvicorn_config import option
import asyncio
@@ -20,6 +21,7 @@
import tempfile
import traceback
import zipfile
from pathlib import Path
from datetime import datetime
from hashlib import sha3_256
from typing import List, Optional, Dict, Any
@@ -28,6 +30,7 @@
from decouple import config
import uvicorn
import fastapi
from fastapi.param_functions import Body
from fastapi import FastAPI, HTTPException, Request, UploadFile, File, Depends
from fastapi.responses import JSONResponse, FileResponse, HTMLResponse, Response
from sqlalchemy import select
@@ -893,6 +896,91 @@ async def get_text_completions_from_input_prompt(request: TextCompletionRequest,
        raise HTTPException(status_code=500, detail="Internal Server Error")



@app.post("/turn_sample_json_into_bnf_grammar_for_llm/",
          response_model=GrammarBuilderResponse,
          summary="Generate BNF Grammar from Sample JSON",
          description="""Generate BNF grammar from a sample JSON file or text.
### Parameters:
- `sample_json`: The sample JSON data as a dictionary (optional if file is uploaded).
- `file`: The sample JSON file to upload (optional if JSON data is provided in `sample_json`).
### Example Request with JSON Data:
```json
{
"sample_json": {"name": "John", "age": 30, "is_alive": true}
}
```
### Example Request with File Upload:
Use `multipart/form-data` to upload a JSON file.
### Response:
The response will include the generated BNF grammar based on the sample JSON provided.
### Example Response:
```json
{
"bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ..."
}
```""",
          response_description="A JSON object containing the generated BNF grammar.")
async def turn_sample_json_into_bnf_grammar_for_llm(
    sample_json: GrammarBuilderRequest = Body(None, embed=True),
    file: UploadFile = File(None)
) -> GrammarBuilderResponse:
    if sample_json is None and file is None:
        raise HTTPException(status_code=400, detail="Either sample_json or file must be provided")
    gb = GrammarBuilder()
    if sample_json:
        bnf_grammar = gb.json_to_bnf(json.dumps(sample_json.sample_json))
    else:
        file_content = await file.read()
        try:
            json_content = json.loads(file_content.decode('utf-8'))
        except json.JSONDecodeError as e:
            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
        bnf_grammar = gb.json_to_bnf(json.dumps(json_content))
    return {"bnf_grammar": bnf_grammar}
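
# Hypothetical invocation sketch (endpoint path is from this commit; the host,
# port, and the embed=True body nesting are assumptions):
#   curl -X POST http://localhost:8089/turn_sample_json_into_bnf_grammar_for_llm/ \
#        -H "Content-Type: application/json" \
#        -d '{"sample_json": {"sample_json": {"name": "John", "age": 30, "is_alive": true}}}'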



@app.post("/turn_pydantic_model_description_into_bnf_grammar_for_llm/",
          response_model=GrammarBuilderResponse,
          summary="Generate BNF Grammar from Pydantic Model Description",
          description="""Generate BNF grammar based on a Pydantic model description string.
### Parameters:
- `pydantic_model_description`: The Pydantic model description as a string. Must include the fields and their types.
### Example Request:
```json
{
"pydantic_model_description": "class Model(BaseModel):\\n name: str\\n age: int\\n is_alive: bool"
}
```
### Response:
The response will include the generated BNF grammar based on the Pydantic model description provided.
### Example Response:
```json
{
"bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ..."
}
```""",
          response_description="A JSON object containing the generated BNF grammar.")
async def turn_pydantic_model_into_bnf_grammar_for_llm(
    request: GrammarBuilderRequest = Body(...)
) -> GrammarBuilderResponse:
    if not request.pydantic_model_description:
        raise HTTPException(status_code=400, detail="Pydantic model description must be provided")

    gb = GrammarBuilder()
    bnf_grammar = gb.pydantic_to_json_bnf(request.pydantic_model_description)
    return {"bnf_grammar": bnf_grammar}
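
# Hypothetical invocation sketch (host and port are assumptions):
#   import requests
#   r = requests.post(
#       "http://localhost:8089/turn_pydantic_model_description_into_bnf_grammar_for_llm/",
#       json={"pydantic_model_description": "class Model(BaseModel):\n    name: str\n    age: int"})
#   print(r.json()["bnf_grammar"])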



@app.post("/compute_transcript_with_whisper_from_audio/",
          summary="Transcribe and Embed Audio using Whisper and LLM",
          description="""Transcribe an audio file and optionally compute document embeddings. This endpoint uses the Whisper model for transcription and a specified or default language model for embeddings. The transcription and embeddings are then stored, and a ZIP file containing the embeddings can be downloaded.
@@ -926,6 +1014,45 @@ async def compute_transcript_with_whisper_from_audio(
        raise HTTPException(status_code=500, detail="Internal Server Error")



@app.post("/add_new_grammar_definition_file/",
          response_model=AddGrammarResponse,
          summary="Add a New Grammar Definition File",
          description="""Add a new BNF grammar definition file.
### Parameters:
- `bnf_grammar`: The BNF grammar string.
- `grammar_file_name`: The name for the new grammar file.
### Example Request:
```json
{
"bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ...",
"grammar_file_name": "new_grammar"
}
```
### Response:
The response will include a list of all valid grammar files in the `grammar_files` directory.
### Example Response:
```json
{
"valid_grammar_files": ["new_grammar.gbnf", "another_grammar.gbnf"]
}
```""",
          response_description="A JSON object containing a list of all valid grammar files.")
async def add_new_grammar_definition_file(request: AddGrammarRequest) -> AddGrammarResponse:
    # validate_bnf_grammar_revised returns a (bool, message) tuple, so unpack it
    # rather than truth-testing the tuple (which is always truthy):
    is_valid, validation_message = validate_bnf_grammar_revised(request.bnf_grammar)
    if not is_valid:
        raise HTTPException(status_code=400, detail=f"Invalid BNF grammar: {validation_message}")

    grammar_files_dir = Path("grammar_files")
    grammar_files_dir.mkdir(exist_ok=True)  # make sure the target directory exists
    grammar_file_path = grammar_files_dir / f"{request.grammar_file_name}.gbnf"
    with open(grammar_file_path, "w") as f:
        f.write(request.bnf_grammar)

    valid_grammar_files = [f.name for f in grammar_files_dir.glob("*.gbnf")]

    return {"valid_grammar_files": valid_grammar_files}
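
# Hypothetical invocation sketch (host and port are assumptions):
#   curl -X POST http://localhost:8089/add_new_grammar_definition_file/ \
#        -H "Content-Type: application/json" \
#        -d '{"bnf_grammar": "root ::= string ws\nstring ::= ...", "grammar_file_name": "my_grammar"}'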

@app.post("/clear_ramdisk/")
async def clear_ramdisk_endpoint(token: str = None):
    if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
