From 50199dac15c40549443f8a6912d6acd94a645f2e Mon Sep 17 00:00:00 2001 From: JE Date: Tue, 3 Oct 2023 00:35:46 -0500 Subject: [PATCH] Added grammar builder and validator and endpoints --- .vscode/settings.json | 5 +- embeddings_data_models.py | 14 +++ grammar_builder.py | 223 ++++++++++++++++++++++++++++++++++++++ service_functions.py | 31 ++++++ swiss_army_llama.py | 133 ++++++++++++++++++++++- 5 files changed, 402 insertions(+), 4 deletions(-) create mode 100644 grammar_builder.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 3b66410..9823561 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,6 @@ { - "git.ignoreLimitWarning": true + "git.ignoreLimitWarning": true, + "cSpell.words": [ + "bfnrt" + ] } \ No newline at end of file diff --git a/embeddings_data_models.py b/embeddings_data_models.py index 12c7a33..4e572eb 100644 --- a/embeddings_data_models.py +++ b/embeddings_data_models.py @@ -230,3 +230,17 @@ class AudioTranscriptResponse(BaseModel): class ShowLogsIncrementalModel(BaseModel): logs: str last_position: int + +class GrammarBuilderRequest(BaseModel): + sample_json: Optional[dict] + pydantic_model_description: Optional[str] + +class GrammarBuilderResponse(BaseModel): + bnf_grammar: str + +class AddGrammarRequest(BaseModel): + bnf_grammar: str + grammar_file_name: str + +class AddGrammarResponse(BaseModel): + valid_grammar_files: List[str] diff --git a/grammar_builder.py b/grammar_builder.py new file mode 100644 index 0000000..5be8e1a --- /dev/null +++ b/grammar_builder.py @@ -0,0 +1,223 @@ +from typing import List, Dict +import json + +use_grammarbuilder_demo = 1 + +def normalize_json(json_str): + output = [] + in_string = False + escape_char = False + for char in json_str: + if char == "\\" and not escape_char: + escape_char = True + output.append(char) + continue + if char == '"' and not escape_char: + in_string = not in_string + if in_string: + output.append(char) + else: + if char.strip(): + 
output.append(char) + if escape_char: + escape_char = False + return ''.join(output) + +class GrammarBuilder: + type_to_bnf: Dict[str, str] = { + "str": "string", + "float": "number", + "int": "number", + "bool": "bool", + "datetime": "datetime", + "List": "list", + "Dict": "dict", + "Optional": "optional" + } + + def __init__(self): + self.rules = { + "ws": "([ \\t\\n] ws)?", + "string": '\\" ([^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]))* \\" ws', + "number": '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws', + "bool": "('true' | 'false') ws", + "datetime": "string", + "dict": "'{' ws dict_pair_list ws '}' ws", + "dict_pair_list": "dict_pair (',' ws dict_pair)*", + "dict_pair": "string ':' ws value ws", + "list": "'[' ws list_items ws ']' ws", + "list_items": "value (',' ws value)*" + } + + + def generate_bnf_from_fields(self, fields: List[str], parent="root") -> str: + bnf = [] + keys = ' | '.join([f'"{field.split(":")[0].strip()}"' for field in fields]) + bnf.append(f"{parent} ::= '{{' ws {parent}_pair_list ws '}}' ws") + bnf.append(f"{parent}_pair_list ::= {parent}_pair (',' ws {parent}_pair)*") + bnf.append(f"{parent}_pair ::= allowed_keys_{parent} ':' ws value ws") + bnf.append(f"allowed_keys_{parent} ::= {keys}") + value_types = set() + for field in fields: + field_name, field_type = field.split(":") + field_name, field_type = field_name.strip(), field_type.strip() + parsed_type = self.type_to_bnf.get(field_type, field_type) + if field_type.startswith("List"): + parsed_type = "list" + value_types.add(parsed_type) + bnf.append(f"value ::= {' | '.join(value_types)}") + return "\n".join(bnf) + + def pydantic_to_json_bnf(self, model_description: str) -> str: + lines = model_description.strip().split('\n')[1:] + fields = [line.strip() for line in lines if ':' in line] + bnf_for_fields = self.generate_bnf_from_fields(fields) + return f"{bnf_for_fields}\n{self.generate_base_rules()}" + + def 
generate_base_rules(self): + return "\n".join([f"{key} ::= {value}" for key, value in self.rules.items()]) + + def generate_bnf(self, data, parent="root"): + bnf = [] + if isinstance(data, dict): + keys = ' | '.join([f'\"{key}\"' for key in data.keys()]) + bnf.append(f"{parent} ::= '{{' ws {parent}_pair_list ws '}}' ws") + bnf.append(f"{parent}_pair_list ::= {parent}_pair (',' ws {parent}_pair)*") + bnf.append(f"{parent}_pair ::= allowed_keys_{parent} ':' ws value ws") + bnf.append(f"allowed_keys_{parent} ::= {keys}") + sample_key = next(iter(data.keys())) + if isinstance(data[sample_key], dict): + bnf.append(f"value ::= {self.generate_bnf(data[sample_key], 'nested_value')}") + elif isinstance(data, list): + if len(data) > 0: + sample_item = data[0] + rule_name = f"{parent}_item" + bnf.append(f"{parent} ::= '[' ws {rule_name} (',' ws {rule_name})* ']' ws") + bnf.append(f"{rule_name} ::= {self.type_to_bnf.get(type(sample_item).__name__, type(sample_item).__name__)}") + else: + bnf.append(f"{parent} ::= '[' ws ']' ws") + else: + bnf.append(f"{parent} ::= {self.type_to_bnf.get(type(data).__name__, type(data).__name__)} ws") + return "\n".join(bnf) + + def json_to_bnf(self, json_str): + normalized_str = normalize_json(json_str) + try: + parsed_data = json.loads(normalized_str) + except json.JSONDecodeError as e: + return f"Invalid JSON: {e}" + bnf_grammar = self.generate_bnf(parsed_data) + return f"{bnf_grammar}\n{self.generate_base_rules()}" + + +if use_grammarbuilder_demo: + gb = GrammarBuilder() + sample_json = ''' + { + "Optimistic": { + "score": 70.0, + "explanation": "The statement talks about secular industry tailwinds and expectations to grow the business at a rate exceeding global GDP." + }, + "Pessimistic": { + "score": -20.0, + "explanation": "The paragraph acknowledges that they've experienced equity losses year-to-date." 
+ }, + "Confident": { + "score": 60.0, + "explanation": "The text shows belief in their people, platform, and their prospect of gaining market share." + }, + "Cautious": { + "score": 40.0, + "explanation": "Mentions the possibility of falling below the target margins but aims to stay within the range." + }, + "Transparent": { + "score": 80.0, + "explanation": "Provides clear information on financial outlook, including specifics about Adjusted EBITDA." + }, + "Vague": { + "score": -80.0, + "explanation": "The text is quite specific and does not evade details." + }, + "Upbeat": { + "score": 20.0, + "explanation": "The tone is more balanced and not overtly enthusiastic." + }, + "Disappointed": { + "score": -10.0, + "explanation": "Acknowledges equity losses but doesn't express dissatisfaction." + }, + "Reassuring": { + "score": 50.0, + "explanation": "Tries to reassure by focusing on core business and tailwinds." + }, + "Evasive": { + "score": -100.0, + "explanation": "No signs of avoiding any topics; quite straightforward." + }, + "Committed": { + "score": 60.0, + "explanation": "Shows dedication to running the core business within the stated margin." + }, + "Analytical": { + "score": 70.0, + "explanation": "Provides a breakdown of the financial situation and market conditions." + }, + "Ambitious": { + "score": 50.0, + "explanation": "Talks about exceeding global GDP growth." + }, + "Concerned": { + "score": -10.0, + "explanation": "Reflects worry about equity losses but not overly so." + }, + "Focused": { + "score": 80.0, + "explanation": "Focuses on core business and previously stated margin." + }, + "Uncertain": { + "score": -90.0, + "explanation": "No ambiguity in the statements; quite specific." + }, + "Responsive": { + "score": 60.0, + "explanation": "Directly addresses the financial outlook and plans." + }, + "Defensive": { + "score": -100.0, + "explanation": "No signs of defending or justifying decisions." 
+ }, + "Strategic": { + "score": 60.0, + "explanation": "Discusses gaining share and investment in people and platform." + }, + "Realistic": { + "score": 40.0, + "explanation": "Acknowledges challenges but maintains a balanced view." + } + } + ''' + + bnf_grammar = gb.json_to_bnf(sample_json) + print(bnf_grammar) + print('\n' + '_' * 80 + '\n') + + gb = GrammarBuilder() + sample_pydantic_model_description = ''' + class AudioTranscriptResponse(BaseModel): + audio_file_hash: str + audio_file_name: str + audio_file_size_mb: float + segments_json: List[dict] + combined_transcript_text: str + combined_transcript_text_list_of_metadata_dicts: List[dict] + info_json: dict + url_to_download_zip_file_of_embeddings: str + ip_address: str + request_time: datetime + response_time: datetime + total_time: float + ''' + + bnf_grammar = gb.pydantic_to_json_bnf(sample_pydantic_model_description) + print(bnf_grammar) + diff --git a/service_functions.py b/service_functions.py index aa607ef..70be8b8 100644 --- a/service_functions.py +++ b/service_functions.py @@ -7,6 +7,7 @@ from embeddings_data_models import EmbeddingRequest, TextCompletionRequest from embeddings_data_models import TextCompletionResponse, AudioTranscriptResponse import os +import re import shutil import psutil import glob @@ -614,3 +615,33 @@ async def generate_completion_from_llm(request: TextCompletionRequest, req: Requ llm_model_usage_json = llm_model_usage_json) list_of_responses.append(response) return list_of_responses + +def validate_bnf_grammar_revised(grammar): + defined_rules, used_rules = set(), set() + for line in grammar.strip().split('\n'): + if '::=' not in line: + continue + parts = line.split('::=') + rule = parts[0].strip() + if rule in defined_rules: + return False, f"Rule {rule} is defined more than once." 
+ defined_rules.add(rule) + expression = parts[-1] + # Tokenize the expression using regex + tokens = re.findall(r'\b[\w-]+\b|\[.*?\]|\(.*?\)|".*?"', expression) + # Additional handling for complex expressions + complex_tokens = re.findall(r'[\w-]+\[[\w-]+\]', expression) + tokens.extend(complex_tokens) + for token in tokens: + if token.startswith('[') or token.startswith('(') or token.startswith('"'): + continue # Skip character classes, optional constructs, and string literals + if '[' in token and ']' in token: # Split complex tokens into individual rules + sub_parts = token.split('[') + used_rules.add(sub_parts[0]) + used_rules.add(sub_parts[1][:-1]) + continue + used_rules.add(token) + for rule in used_rules: + if rule not in defined_rules: + return False, f"Used rule {rule} is not defined." + return True, "Valid BNF Grammar" diff --git a/swiss_army_llama.py b/swiss_army_llama.py index dbdbb10..625899c 100644 --- a/swiss_army_llama.py +++ b/swiss_army_llama.py @@ -5,11 +5,12 @@ from ramdisk_functions import clear_ramdisk from misc_utility_functions import build_faiss_indexes, safe_path from embeddings_data_models import DocumentEmbedding, TokenLevelEmbeddingBundle -from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest -from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse +from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest, GrammarBuilderRequest, AddGrammarRequest +from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse, GrammarBuilderResponse, AddGrammarResponse from embeddings_data_models import 
ShowLogsIncrementalModel
 from service_functions import get_or_compute_embedding, get_or_compute_transcript, add_model_url, get_or_compute_token_level_embedding_bundle_combined_feature_vector, calculate_token_level_embeddings
-from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm
+from service_functions import parse_submitted_document_file_into_sentence_strings_func, compute_embeddings_for_document, store_document_embeddings_in_db, generate_completion_from_llm, validate_bnf_grammar_revised as validate_bnf_grammar
+from grammar_builder import GrammarBuilder
 from log_viewer_functions import show_logs_incremental_func, show_logs_func
 from uvicorn_config import option
 import asyncio
@@ -20,6 +21,7 @@
 import tempfile
 import traceback
 import zipfile
+from pathlib import Path
 from datetime import datetime
 from hashlib import sha3_256
 from typing import List, Optional, Dict, Any
@@ -28,6 +30,7 @@
 from decouple import config
 import uvicorn
 import fastapi
+from fastapi.param_functions import Body
 from fastapi import FastAPI, HTTPException, Request, UploadFile, File, Depends
 from fastapi.responses import JSONResponse, FileResponse, HTMLResponse, Response
 from sqlalchemy import select
@@ -893,6 +896,91 @@ async def get_text_completions_from_input_prompt(request: TextCompletionRequest,
         raise HTTPException(status_code=500, detail="Internal Server Error")
 
 
+
+@app.post("/turn_sample_json_into_bnf_grammar_for_llm/",
+          response_model=GrammarBuilderResponse,
+          summary="Generate BNF Grammar from Sample JSON",
+          description="""Generate BNF grammar from a sample JSON file or text.
+### Parameters:
+- `sample_json`: The sample JSON data as a dictionary (optional if file is uploaded).
+- `file`: The sample JSON file to upload (optional if JSON data is provided in `sample_json`).
+ +### Example Request with JSON Data: +```json +{ + "sample_json": {"name": "John", "age": 30, "is_alive": true} +} +``` + +### Example Request with File Upload: +Use `multipart/form-data` to upload a JSON file. + +### Response: +The response will include the generated BNF grammar based on the sample JSON provided. + +### Example Response: +```json +{ + "bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ..." +} +```""", + response_description="A JSON object containing the generated BNF grammar.") +async def turn_sample_json_into_bnf_grammar_for_llm( + sample_json: GrammarBuilderRequest = Body(None, embed=True), + file: UploadFile = File(None) +) -> GrammarBuilderResponse: + if sample_json is None and file is None: + raise HTTPException(status_code=400, detail="Either sample_json or file must be provided") + gb = GrammarBuilder() + if sample_json: + bnf_grammar = gb.json_to_bnf(json.dumps(sample_json.sample_json)) + else: + file_content = await file.read() + try: + json_content = json.loads(file_content.decode('utf-8')) + except json.JSONDecodeError as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}") + bnf_grammar = gb.json_to_bnf(json.dumps(json_content)) + return {"bnf_grammar": bnf_grammar} + + + +@app.post("/turn_pydantic_model_description_into_bnf_grammar_for_llm/", + response_model=GrammarBuilderResponse, + summary="Generate BNF Grammar from Pydantic Model Description", + description="""Generate BNF grammar based on a Pydantic model description string. +### Parameters: +- `pydantic_model_description`: The Pydantic model description as a string. Must include the fields and their types. + +### Example Request: +```json +{ + "pydantic_model_description": "class Model(BaseModel):\\n name: str\\n age: int\\n is_alive: bool" +} +``` + +### Response: +The response will include the generated BNF grammar based on the Pydantic model description provided. 
+ +### Example Response: +```json +{ + "bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ..." +} +```""", + response_description="A JSON object containing the generated BNF grammar.") +async def turn_pydantic_model_into_bnf_grammar_for_llm( + request: GrammarBuilderRequest = Body(...) +) -> GrammarBuilderResponse: + if not request.pydantic_model_description: + raise HTTPException(status_code=400, detail="Pydantic model description must be provided") + + gb = GrammarBuilder() + bnf_grammar = gb.pydantic_to_json_bnf(request.pydantic_model_description) + return {"bnf_grammar": bnf_grammar} + + + @app.post("/compute_transcript_with_whisper_from_audio/", summary="Transcribe and Embed Audio using Whisper and LLM", description="""Transcribe an audio file and optionally compute document embeddings. This endpoint uses the Whisper model for transcription and a specified or default language model for embeddings. The transcription and embeddings are then stored, and a ZIP file containing the embeddings can be downloaded. @@ -926,6 +1014,45 @@ async def compute_transcript_with_whisper_from_audio( raise HTTPException(status_code=500, detail="Internal Server Error") + +@app.post("/add_new_grammar_definition_file/", + response_model=AddGrammarResponse, + summary="Add a New Grammar Definition File", + description="""Add a new BNF grammar definition file. +### Parameters: +- `bnf_grammar`: The BNF grammar string. +- `grammar_file_name`: The name for the new grammar file. + +### Example Request: +```json +{ + "bnf_grammar": "root ::= '{' ws root_pair_list ws '}' ws ...", + "grammar_file_name": "new_grammar" +} +``` + +### Response: +The response will include a list of all valid grammar files in the `grammar_files` directory. 
+
+### Example Response:
+```json
+{
+    "valid_grammar_files": ["new_grammar.gbnf", "another_grammar.gbnf"]
+}
+```""",
+          response_description="A JSON object containing a list of all valid grammar files.")
+async def add_new_grammar_definition_file(request: AddGrammarRequest) -> AddGrammarResponse:
+    if not validate_bnf_grammar(request.bnf_grammar)[0]:  # validator returns (is_valid, message); index 0 is the bool
+        raise HTTPException(status_code=400, detail="Invalid BNF grammar")
+
+    grammar_file_path = Path("grammar_files") / f"{request.grammar_file_name}.gbnf"
+    with open(grammar_file_path, "w") as f:
+        f.write(request.bnf_grammar)
+
+    valid_grammar_files = [f.name for f in Path("grammar_files").glob("*.gbnf")]
+
+    return {"valid_grammar_files": valid_grammar_files}
+
 @app.post("/clear_ramdisk/")
 async def clear_ramdisk_endpoint(token: str = None):
     if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):