Added grammar builder and validator and endpoints
Dicklesworthstone committed Oct 3, 2023
1 parent fb6a331 commit 50199da
Showing 5 changed files with 402 additions and 4 deletions.
5 changes: 4 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,6 @@
{
    "git.ignoreLimitWarning": true
    "git.ignoreLimitWarning": true,
    "cSpell.words": [
        "bfnrt"
    ]
}
14 changes: 14 additions & 0 deletions embeddings_data_models.py
@@ -230,3 +230,17 @@ class AudioTranscriptResponse(BaseModel):
class ShowLogsIncrementalModel(BaseModel):
    logs: str
    last_position: int

class GrammarBuilderRequest(BaseModel):
    sample_json: Optional[dict] = None
    pydantic_model_description: Optional[str] = None

class GrammarBuilderResponse(BaseModel):
    bnf_grammar: str

class AddGrammarRequest(BaseModel):
    bnf_grammar: str
    grammar_file_name: str

class AddGrammarResponse(BaseModel):
    valid_grammar_files: List[str]
223 changes: 223 additions & 0 deletions grammar_builder.py
@@ -0,0 +1,223 @@
from typing import List, Dict
import json

use_grammarbuilder_demo = 1  # set to 0 to skip the inline demo at the bottom of this file

def normalize_json(json_str):
    """Strip whitespace outside of string literals while preserving it inside them."""
    output = []
    in_string = False
    escape_char = False
    for char in json_str:
        if char == "\\" and not escape_char:
            escape_char = True
            output.append(char)
            continue
        if char == '"' and not escape_char:
            in_string = not in_string
        if in_string:
            output.append(char)
        else:
            if char.strip():
                output.append(char)
        if escape_char:
            escape_char = False
    return ''.join(output)
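
# A quick illustrative check of normalize_json (hypothetical, not part of the
# commit): whitespace outside string literals is dropped, while whitespace
# inside string literals survives:
#   normalize_json('{ "a": 1,\n  "msg": "two  spaces" }')
#   -> '{"a":1,"msg":"two  spaces"}'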

class GrammarBuilder:
    type_to_bnf: Dict[str, str] = {
        "str": "string",
        "float": "number",
        "int": "number",
        "bool": "bool",
        "datetime": "datetime",
        "List": "list",
        "Dict": "dict",
        "Optional": "optional"
    }

    def __init__(self):
        self.rules = {
            "ws": "([ \\t\\n] ws)?",
            "string": '\\" ([^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \\" ws',
            "number": '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws',
            "bool": "('true' | 'false') ws",
            "datetime": "string",
            "dict": "'{' ws dict_pair_list ws '}' ws",
            "dict_pair_list": "dict_pair (',' ws dict_pair)*",
            "dict_pair": "string ':' ws value ws",
            "list": "'[' ws list_items ws ']' ws",
            "list_items": "value (',' ws value)*"
        }


    def generate_bnf_from_fields(self, fields: List[str], parent="root") -> str:
        bnf = []
        keys = ' | '.join([f'"{field.split(":")[0].strip()}"' for field in fields])
        bnf.append(f"{parent} ::= '{{' ws {parent}_pair_list ws '}}' ws")
        bnf.append(f"{parent}_pair_list ::= {parent}_pair (',' ws {parent}_pair)*")
        bnf.append(f"{parent}_pair ::= allowed_keys_{parent} ':' ws value ws")
        bnf.append(f"allowed_keys_{parent} ::= {keys}")
        value_types = set()
        for field in fields:
            field_name, field_type = field.split(":")
            field_name, field_type = field_name.strip(), field_type.strip()
            parsed_type = self.type_to_bnf.get(field_type, field_type)
            if field_type.startswith("List"):
                parsed_type = "list"
            value_types.add(parsed_type)
        bnf.append(f"value ::= {' | '.join(value_types)}")
        return "\n".join(bnf)

    def pydantic_to_json_bnf(self, model_description: str) -> str:
        lines = model_description.strip().split('\n')[1:]
        fields = [line.strip() for line in lines if ':' in line]
        bnf_for_fields = self.generate_bnf_from_fields(fields)
        return f"{bnf_for_fields}\n{self.generate_base_rules()}"

    def generate_base_rules(self):
        return "\n".join([f"{key} ::= {value}" for key, value in self.rules.items()])

    def generate_bnf(self, data, parent="root"):
        bnf = []
        if isinstance(data, dict):
            keys = ' | '.join([f'"{key}"' for key in data.keys()])
            bnf.append(f"{parent} ::= '{{' ws {parent}_pair_list ws '}}' ws")
            bnf.append(f"{parent}_pair_list ::= {parent}_pair (',' ws {parent}_pair)*")
            bnf.append(f"{parent}_pair ::= allowed_keys_{parent} ':' ws value ws")
            bnf.append(f"allowed_keys_{parent} ::= {keys}")
            sample_key = next(iter(data.keys()))
            if isinstance(data[sample_key], dict):
                bnf.append(f"value ::= {self.generate_bnf(data[sample_key], 'nested_value')}")
        elif isinstance(data, list):
            if len(data) > 0:
                sample_item = data[0]
                rule_name = f"{parent}_item"
                bnf.append(f"{parent} ::= '[' ws {rule_name} (',' ws {rule_name})* ']' ws")
                bnf.append(f"{rule_name} ::= {self.type_to_bnf.get(type(sample_item).__name__, type(sample_item).__name__)}")
            else:
                bnf.append(f"{parent} ::= '[' ws ']' ws")
        else:
            bnf.append(f"{parent} ::= {self.type_to_bnf.get(type(data).__name__, type(data).__name__)} ws")
        return "\n".join(bnf)

    def json_to_bnf(self, json_str):
        normalized_str = normalize_json(json_str)
        try:
            parsed_data = json.loads(normalized_str)
        except json.JSONDecodeError as e:
            return f"Invalid JSON: {e}"
        bnf_grammar = self.generate_bnf(parsed_data)
        return f"{bnf_grammar}\n{self.generate_base_rules()}"
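
# Illustrative sketch (hypothetical, not from the commit) of the object-level
# rules json_to_bnf emits for a flat JSON object, before the base rules from
# generate_base_rules() are appended:
#   GrammarBuilder().json_to_bnf('{"name": "John", "age": 30}')
#   root ::= '{' ws root_pair_list ws '}' ws
#   root_pair_list ::= root_pair (',' ws root_pair)*
#   root_pair ::= allowed_keys_root ':' ws value ws
#   allowed_keys_root ::= "name" | "age"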


if use_grammarbuilder_demo:
    gb = GrammarBuilder()
    sample_json = '''
{
"Optimistic": {
"score": 70.0,
"explanation": "The statement talks about secular industry tailwinds and expectations to grow the business at a rate exceeding global GDP."
},
"Pessimistic": {
"score": -20.0,
"explanation": "The paragraph acknowledges that they've experienced equity losses year-to-date."
},
"Confident": {
"score": 60.0,
"explanation": "The text shows belief in their people, platform, and their prospect of gaining market share."
},
"Cautious": {
"score": 40.0,
"explanation": "Mentions the possibility of falling below the target margins but aims to stay within the range."
},
"Transparent": {
"score": 80.0,
"explanation": "Provides clear information on financial outlook, including specifics about Adjusted EBITDA."
},
"Vague": {
"score": -80.0,
"explanation": "The text is quite specific and does not evade details."
},
"Upbeat": {
"score": 20.0,
"explanation": "The tone is more balanced and not overtly enthusiastic."
},
"Disappointed": {
"score": -10.0,
"explanation": "Acknowledges equity losses but doesn't express dissatisfaction."
},
"Reassuring": {
"score": 50.0,
"explanation": "Tries to reassure by focusing on core business and tailwinds."
},
"Evasive": {
"score": -100.0,
"explanation": "No signs of avoiding any topics; quite straightforward."
},
"Committed": {
"score": 60.0,
"explanation": "Shows dedication to running the core business within the stated margin."
},
"Analytical": {
"score": 70.0,
"explanation": "Provides a breakdown of the financial situation and market conditions."
},
"Ambitious": {
"score": 50.0,
"explanation": "Talks about exceeding global GDP growth."
},
"Concerned": {
"score": -10.0,
"explanation": "Reflects worry about equity losses but not overly so."
},
"Focused": {
"score": 80.0,
"explanation": "Focuses on core business and previously stated margin."
},
"Uncertain": {
"score": -90.0,
"explanation": "No ambiguity in the statements; quite specific."
},
"Responsive": {
"score": 60.0,
"explanation": "Directly addresses the financial outlook and plans."
},
"Defensive": {
"score": -100.0,
"explanation": "No signs of defending or justifying decisions."
},
"Strategic": {
"score": 60.0,
"explanation": "Discusses gaining share and investment in people and platform."
},
"Realistic": {
"score": 40.0,
"explanation": "Acknowledges challenges but maintains a balanced view."
}
}
'''

    bnf_grammar = gb.json_to_bnf(sample_json)
    print(bnf_grammar)
    print('\n' + '_' * 80 + '\n')

    gb = GrammarBuilder()
    sample_pydantic_model_description = '''
class AudioTranscriptResponse(BaseModel):
audio_file_hash: str
audio_file_name: str
audio_file_size_mb: float
segments_json: List[dict]
combined_transcript_text: str
combined_transcript_text_list_of_metadata_dicts: List[dict]
info_json: dict
url_to_download_zip_file_of_embeddings: str
ip_address: str
request_time: datetime
response_time: datetime
total_time: float
'''

    bnf_grammar = gb.pydantic_to_json_bnf(sample_pydantic_model_description)
    print(bnf_grammar)

31 changes: 31 additions & 0 deletions service_functions.py
@@ -7,6 +7,7 @@
from embeddings_data_models import EmbeddingRequest, TextCompletionRequest
from embeddings_data_models import TextCompletionResponse, AudioTranscriptResponse
import os
import re
import shutil
import psutil
import glob
@@ -614,3 +615,33 @@ async def generate_completion_from_llm(request: TextCompletionRequest, req: Requ
            llm_model_usage_json = llm_model_usage_json)
        list_of_responses.append(response)
    return list_of_responses

def validate_bnf_grammar_revised(grammar):
    defined_rules, used_rules = set(), set()
    for line in grammar.strip().split('\n'):
        if '::=' not in line:
            continue
        parts = line.split('::=')
        rule = parts[0].strip()
        if rule in defined_rules:
            return False, f"Rule {rule} is defined more than once."
        defined_rules.add(rule)
        expression = parts[-1]
        # Tokenize the expression using regex
        tokens = re.findall(r'\b[\w-]+\b|\[.*?\]|\(.*?\)|".*?"', expression)
        # Additional handling for complex expressions
        complex_tokens = re.findall(r'[\w-]+\[[\w-]+\]', expression)
        tokens.extend(complex_tokens)
        for token in tokens:
            if token.startswith('[') or token.startswith('(') or token.startswith('"'):
                continue  # Skip character classes, optional constructs, and string literals
            if '[' in token and ']' in token:  # Split complex tokens into individual rules
                sub_parts = token.split('[')
                used_rules.add(sub_parts[0])
                used_rules.add(sub_parts[1][:-1])
                continue
            used_rules.add(token)
    for rule in used_rules:
        if rule not in defined_rules:
            return False, f"Used rule {rule} is not defined."
    return True, "Valid BNF Grammar"
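
# Hypothetical usage sketch (not part of the commit): the validator returns a
# (bool, message) tuple, so callers should unpack it rather than truth-testing
# the tuple, which is always truthy:
#   is_valid, message = validate_bnf_grammar_revised(my_grammar)
#   if not is_valid:
#       raise ValueError(message)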
133 changes: 130 additions & 3 deletions swiss_army_llama.py
@@ -5,11 +5,12 @@
from ramdisk_functions import clear_ramdisk
from misc_utility_functions import build_faiss_indexes, safe_path
from embeddings_data_models import DocumentEmbedding, TokenLevelEmbeddingBundle
from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse
from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest, GrammarBuilderRequest, AddGrammarRequest
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse, GrammarBuilderResponse, AddGrammarResponse
from embeddings_data_models import ShowLogsIncrementalModel
from service_functions import get_or_compute_embedding, get_or_compute_transcript, add_model_url, get_or_compute_token_level_embedding_bundle_combined_feature_vector, calculate_token_level_embeddings
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm
from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_revised
from grammar_builder import GrammarBuilder
from log_viewer_functions import show_logs_incremental_func, show_logs_func
from uvicorn_config import option
import asyncio
@@ -20,6 +21,7 @@
import tempfile
import traceback
import zipfile
from pathlib import Path
from datetime import datetime
from hashlib import sha3_256
from typing import List, Optional, Dict, Any
@@ -28,6 +30,7 @@
from decouple import config
import uvicorn
import fastapi
from fastapi.param_functions import Body
from fastapi import FastAPI, HTTPException, Request, UploadFile, File, Depends
from fastapi.responses import JSONResponse, FileResponse, HTMLResponse, Response
from sqlalchemy import select
@@ -893,6 +896,91 @@ async def get_text_completions_from_input_prompt(request: TextCompletionRequest,
        raise HTTPException(status_code=500, detail="Internal Server Error")



@app.post("/turn_sample_json_into_bnf_grammar_for_llm/",
          response_model=GrammarBuilderResponse,
          summary="Generate BNF Grammar from Sample JSON",
          description="""Generate BNF grammar from a sample JSON file or text.
### Parameters:
- `sample_json`: The sample JSON data as a dictionary (optional if file is uploaded).
- `file`: The sample JSON file to upload (optional if JSON data is provided in `sample_json`).
### Example Request with JSON Data:
```json
{
"sample_json": {"name": "John", "age": 30, "is_alive": true}
}
```
### Example Request with File Upload:
Use `multipart/form-data` to upload a JSON file.
### Response:
The response will include the generated BNF grammar based on the sample JSON provided.
### Example Response:
```json
{
"bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ..."
}
```""",
          response_description="A JSON object containing the generated BNF grammar.")
async def turn_sample_json_into_bnf_grammar_for_llm(
    sample_json: GrammarBuilderRequest = Body(None, embed=True),
    file: UploadFile = File(None)
) -> GrammarBuilderResponse:
    if sample_json is None and file is None:
        raise HTTPException(status_code=400, detail="Either sample_json or file must be provided")
    gb = GrammarBuilder()
    if sample_json:
        bnf_grammar = gb.json_to_bnf(json.dumps(sample_json.sample_json))
    else:
        file_content = await file.read()
        try:
            json_content = json.loads(file_content.decode('utf-8'))
        except json.JSONDecodeError as e:
            raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
        bnf_grammar = gb.json_to_bnf(json.dumps(json_content))
    return {"bnf_grammar": bnf_grammar}
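
# Hypothetical invocation sketch (endpoint path is from this commit; the host,
# port, and the embed=True body nesting are assumptions):
#   curl -X POST http://localhost:8089/turn_sample_json_into_bnf_grammar_for_llm/ \
#        -H "Content-Type: application/json" \
#        -d '{"sample_json": {"sample_json": {"name": "John", "age": 30, "is_alive": true}}}'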



@app.post("/turn_pydantic_model_description_into_bnf_grammar_for_llm/",
          response_model=GrammarBuilderResponse,
          summary="Generate BNF Grammar from Pydantic Model Description",
          description="""Generate BNF grammar based on a Pydantic model description string.
### Parameters:
- `pydantic_model_description`: The Pydantic model description as a string. Must include the fields and their types.
### Example Request:
```json
{
"pydantic_model_description": "class Model(BaseModel):\\n name: str\\n age: int\\n is_alive: bool"
}
```
### Response:
The response will include the generated BNF grammar based on the Pydantic model description provided.
### Example Response:
```json
{
"bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ..."
}
```""",
          response_description="A JSON object containing the generated BNF grammar.")
async def turn_pydantic_model_into_bnf_grammar_for_llm(
    request: GrammarBuilderRequest = Body(...)
) -> GrammarBuilderResponse:
    if not request.pydantic_model_description:
        raise HTTPException(status_code=400, detail="Pydantic model description must be provided")

    gb = GrammarBuilder()
    bnf_grammar = gb.pydantic_to_json_bnf(request.pydantic_model_description)
    return {"bnf_grammar": bnf_grammar}
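
# Hypothetical invocation sketch (host and port are assumptions):
#   import requests
#   r = requests.post(
#       "http://localhost:8089/turn_pydantic_model_description_into_bnf_grammar_for_llm/",
#       json={"pydantic_model_description": "class Model(BaseModel):\n    name: str\n    age: int"})
#   print(r.json()["bnf_grammar"])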



@app.post("/compute_transcript_with_whisper_from_audio/",
          summary="Transcribe and Embed Audio using Whisper and LLM",
          description="""Transcribe an audio file and optionally compute document embeddings. This endpoint uses the Whisper model for transcription and a specified or default language model for embeddings. The transcription and embeddings are then stored, and a ZIP file containing the embeddings can be downloaded.
@@ -926,6 +1014,45 @@ async def compute_transcript_with_whisper_from_audio(
        raise HTTPException(status_code=500, detail="Internal Server Error")



@app.post("/add_new_grammar_definition_file/",
          response_model=AddGrammarResponse,
          summary="Add a New Grammar Definition File",
          description="""Add a new BNF grammar definition file.
### Parameters:
- `bnf_grammar`: The BNF grammar string.
- `grammar_file_name`: The name for the new grammar file.
### Example Request:
```json
{
"bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ...",
"grammar_file_name": "new_grammar"
}
```
### Response:
The response will include a list of all valid grammar files in the `grammar_files` directory.
### Example Response:
```json
{
"valid_grammar_files": ["new_grammar.gbnf", "another_grammar.gbnf"]
}
```""",
          response_description="A JSON object containing a list of all valid grammar files.")
async def add_new_grammar_definition_file(request: AddGrammarRequest) -> AddGrammarResponse:
    # validate_bnf_grammar_revised returns a (bool, message) tuple, so unpack it
    # rather than truth-testing the tuple (which is always truthy):
    is_valid, validation_message = validate_bnf_grammar_revised(request.bnf_grammar)
    if not is_valid:
        raise HTTPException(status_code=400, detail=f"Invalid BNF grammar: {validation_message}")

    grammar_files_dir = Path("grammar_files")
    grammar_files_dir.mkdir(exist_ok=True)  # make sure the target directory exists
    grammar_file_path = grammar_files_dir / f"{request.grammar_file_name}.gbnf"
    with open(grammar_file_path, "w") as f:
        f.write(request.bnf_grammar)

    valid_grammar_files = [f.name for f in grammar_files_dir.glob("*.gbnf")]

    return {"valid_grammar_files": valid_grammar_files}
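
# Hypothetical invocation sketch (host and port are assumptions):
#   curl -X POST http://localhost:8089/add_new_grammar_definition_file/ \
#        -H "Content-Type: application/json" \
#        -d '{"bnf_grammar": "root ::= string ws\nstring ::= ...", "grammar_file_name": "my_grammar"}'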

@app.post("/clear_ramdisk/")
async def clear_ramdisk_endpoint(token: str = None):
    if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
