From 81ef8614f91f4d93590a93e539bdb9ba6b29ee14 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Tue, 25 Jun 2024 11:25:33 -0400 Subject: [PATCH 01/20] Added promptflow standard build --- docker-compose.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 9adbef07..84ed1ee9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -140,7 +140,17 @@ services: - ./utils:/app/utils - ./templates:/app/templates - ./db/recipedb:/app/db - + promptflow: + image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest + container_name: data-recipes-promptflow + env_file: + - .env + volumes: + - ./flows:/app + - ./utils:/app/utils + - ./templates:/app/templates + - shared-data:/app/recipes/public + - ./management/skills.py:/app/recipes/skills.py volumes: pgdata2: shared-data: \ No newline at end of file From d43373feba916c3d6f42079a464e7a4233cef2ce Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Wed, 26 Jun 2024 18:11:07 -0400 Subject: [PATCH 02/20] Fixed container name --- docker-compose.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 84ed1ee9..0b51038b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -142,15 +142,16 @@ services: - ./db/recipedb:/app/db promptflow: image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest - container_name: data-recipes-promptflow + container_name: recipes-ai-promptflow env_file: - .env volumes: - ./flows:/app - - ./utils:/app/utils - - ./templates:/app/templates - - shared-data:/app/recipes/public - - ./management/skills.py:/app/recipes/skills.py + - ./utils:/app/chainlit-ui-evaluation/utils + - ./templates:/app/chainlit-ui-evaluation/templates + - shared-data:/app/chainlit-ui-evaluation/recipes/public + - ./management/skills.py:/app/chainlit-ui-evaluation/recipes/skills.py + - ./ui/chat-chainlit-assistant/app.py:/app/chainlit-ui-evaluation/app.py volumes: pgdata2: shared-data: \ No newline at end of file From 66fa6888d98e7bb28b57bb115a0b1429d83926ba Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Wed, 26 Jun 2024 20:52:07 -0400 Subject: [PATCH 03/20] Interim checkin to make main loop simpler, in prep for self-tests --- ui/chat-chainlit-assistant/app.py | 172 ++++++++++++++++-------------- 1 file changed, 91 insertions(+), 81 deletions(-) diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py index 73880a13..454ea5b6 100644 --- a/ui/chat-chainlit-assistant/app.py +++ b/ui/chat-chainlit-assistant/app.py @@ -535,87 +535,6 @@ def check_memories_recipes(user_input: str, history=[]) -> str: async_check_memories_recipes = make_async(check_memories_recipes) -@cl.on_message -async def main(message: cl.Message): - """ - Process the user's message and interact with the assistant. - - Args: - message (cl.Message): The user's message. 
- - Returns: - None - """ - thread_id = cl.user_session.get("thread_id") - - # Azure doesn't yet support attachments - if os.getenv("ASSISTANTS_API_TYPE") == "openai": - - attachments = await process_files(message.elements) - - # Add a Message to the Thread - await async_openai_client.beta.threads.messages.create( - thread_id=thread_id, - role="user", - content=message.content, - attachments=attachments, - ) - else: - - # Add a Message to the Thread - await async_openai_client.beta.threads.messages.create( - thread_id=thread_id, - role="user", - content=message.content, - ) - - # Append to chat history - chat_history = cl.user_session.get("chat_history") - chat_history.append({"role": "user", "content": message.content}) - cl.user_session.set("chat_history", chat_history) - - # Check recipes - msg = await cl.Message("").send() - memory_found, memory_content, memory_response, meta_data_msg = ( - await async_check_memories_recipes(message.content, chat_history) - ) - - # memory_foundy=False - - # Message to the thread. If a memory add it as the assistant - if memory_found is True: - print("Adding memory to thread") - await async_openai_client.beta.threads.messages.create( - thread_id=thread_id, - role="assistant", - content=memory_content, - # attachments=attachments, - ) - - msg.content = memory_response["content"] - msg.elements = memory_response["elements"] - await msg.update() - - # TODO really should be part of message above so feedback can apply - await cl.Message(meta_data_msg).send() - - # No need to send anything - return - - # msg.content = "Can't find anything in my memories, let me do some analysis ..." - msg.content = "" - await msg.update() - - # Create and Stream a Run - print(f"Creating and streaming a run {assistant.id}") - with sync_openai_client.beta.threads.runs.stream( - thread_id=thread_id, - assistant_id=assistant.id, - event_handler=EventHandler(assistant_name=assistant.name), - ) as stream: - stream.until_done() - - @cl.on_audio_chunk async def on_audio_chunk(chunk: cl.AudioChunk): """ @@ -693,3 +612,94 @@ def auth_callback(username: str, password: str): ) else: return None + + +async def add_message_to_thread(thread_id, role, content, message=None): + """ + Add a message to a thread. + + Args: + thread_id (str): The ID of the thread. + role (str): The role of the message author. + content (str): The content of the message. + message (cl.Message): The message object. + + Returns: + None + """ + # Azure doesn't yet support attachments + if os.getenv("ASSISTANTS_API_TYPE") == "openai": + + if message is not None: + attachments = await process_files(message.elements) + + # Add a Message to the Thread + await async_openai_client.beta.threads.messages.create( + thread_id=thread_id, + role=role, + content=content, + attachments=attachments, + ) + else: + + # Add a Message to the Thread + await async_openai_client.beta.threads.messages.create( + thread_id=thread_id, + role=role, + content=content, + ) + + +@cl.on_message +async def main(message: cl.Message): + """ + Process the user's message and interact with the assistant. + + Args: + message (cl.Message): The user's message. 
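+            For example (hypothetical values), a message whose content is
+            "What is the total population of Mali?" with an empty elements list.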
+ + Returns: + None + """ + thread_id = cl.user_session.get("thread_id") + chat_history = cl.user_session.get("chat_history") + msg = await cl.Message("").send() + + # Record user's message + add_message_to_thread(thread_id, "user", message.content, message) + chat_history.append({"role": "user", "content": message.content}) + + # Check recipes + memory_found, memory_content, memory_response, meta_data_msg = ( + await async_check_memories_recipes(message.content, chat_history) + ) + + # Message to the thread. If a memory add it as the assistant + if memory_found is True: + print("Adding memory to thread") + add_message_to_thread(thread_id, "assistant", memory_content) + + # Send memory output + msg.content = memory_response["content"] + msg.elements = memory_response["elements"] + await msg.update() + + # TODO really should be part of message above so feedback can apply + await cl.Message(meta_data_msg).send() + + else: + + # msg.content = "Can't find anything in my memories, let me do some analysis ..." + msg.content = "" + await msg.update() + + # Create and Stream a Run + print(f"Creating and streaming a run {assistant.id}") + with sync_openai_client.beta.threads.runs.stream( + thread_id=thread_id, + assistant_id=assistant.id, + event_handler=EventHandler(assistant_name=assistant.name), + ) as stream: + stream.until_done() + + cl.user_session.set("chat_history", chat_history) From f040277ea773a110a62c9aac66845d95fa74e38d Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Wed, 26 Jun 2024 21:01:38 -0400 Subject: [PATCH 04/20] Interim checkin to make main loop simpler, in prep for self-tests --- ui/chat-chainlit-assistant/app.py | 283 ++++++++++++++++-------------- 1 file changed, 153 insertions(+), 130 deletions(-) diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py index 454ea5b6..64fbb81d 100644 --- a/ui/chat-chainlit-assistant/app.py +++ b/ui/chat-chainlit-assistant/app.py @@ -72,158 +72,176 @@ config.ui.name = bot_name -class EventHandler(AssistantEventHandler): - - def __init__(self, assistant_name: str) -> None: - """ - Initializes a new instance of the ChatChainlitAssistant class. - - Args: - assistant_name (str): The name of the assistant. +def get_event_handler(cl, assistant_name): # noqa: C901 + """ + Returns an instance of the EventHandler class, which is responsible for handling events in the ChatChainlitAssistant. - Returns: - None - """ - super().__init__() - self.current_message: cl.Message = None - self.current_step: cl.Step = None - self.current_tool_call = None - self.current_message_text = "" - self.assistant_name = assistant_name - - @override - def on_event(self, event): - """ - Handles the incoming event and performs the necessary actions based on the event type. + Args: + cl: The ChatClient instance used for communication with the chat service. + assistant_name (str): The name of the assistant. - Args: - event: The event object containing information about the event. + Returns: + EventHandler: An instance of the EventHandler class. 
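+
+    Example (illustrative sketch, mirroring how the handler is wired up in
+    start_chat() and main() below):
+
+        event_handler = get_event_handler(cl, assistant.name)
+        with sync_openai_client.beta.threads.runs.stream(
+            thread_id=thread_id,
+            assistant_id=assistant.id,
+            event_handler=event_handler,
+        ) as stream:
+            stream.until_done()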
+ """ - Returns: - None - """ - print(event.event) - run_id = event.data.id - if event.event == "thread.message.created": - self.current_message = run_sync(cl.Message(content="").send()) - self.current_message_text = "" - print("Run started") - if event.event == "thread.message.completed": - self.handle_message_completed(event.data, run_id) - elif event.event == "thread.run.requires_action": - self.handle_requires_action(event.data, run_id) - elif event.event == "thread.message.delta": - self.handle_message_delta(event.data) - else: - print(json.dumps(str(event.data), indent=4)) - print(f"Unhandled event: {event.event}") + class EventHandler(AssistantEventHandler): - def handle_message_delta(self, data): - """ - Handles the message delta data. + def __init__(self, assistant_name: str) -> None: + """ + Initializes a new instance of the ChatChainlitAssistant class. - Args: - data: The message delta data. + Args: + assistant_name (str): The name of the assistant. - Returns: - None - """ - for content in data.delta.content: - if content.type == "text": - content = content.text.value - if content is not None: - self.current_message_text += content - run_sync(self.current_message.stream_token(content)) - elif content.type == "image_file": - file_id = content.image_file.file_id - image_data = sync_openai_client.files.content(file_id) - image_data_bytes = image_data.read() - png_file = f"{images_loc}{file_id}.png" - print(f"Writing image to {png_file}") - with open(png_file, "wb") as file: - file.write(image_data_bytes) - image = cl.Image(path=png_file, display="inline", size="large") - print(f"Image: {png_file}") - if not self.current_message.elements: - self.current_message.elements = [] - self.current_message.elements.append(image) - run_sync(self.current_message.update()) + Returns: + None + """ + super().__init__() + self.current_message: cl.Message = None + self.current_step: cl.Step = None + self.current_tool_call = None + self.current_message_text = "" + self.assistant_name = assistant_name + + @override + def on_event(self, event): + """ + Handles the incoming event and performs the necessary actions based on the event type. + + Args: + event: The event object containing information about the event. + + Returns: + None + """ + print(event.event) + run_id = event.data.id + if event.event == "thread.message.created": + self.current_message = run_sync(cl.Message(content="").send()) + self.current_message_text = "" + print("Run started") + if event.event == "thread.message.completed": + self.handle_message_completed(event.data, run_id) + elif event.event == "thread.run.requires_action": + self.handle_requires_action(event.data, run_id) + elif event.event == "thread.message.delta": + self.handle_message_delta(event.data) else: - print(f"Unhandled delta type: {content.type}") + print(json.dumps(str(event.data), indent=4)) + print(f"Unhandled event: {event.event}") + + def handle_message_delta(self, data): + """ + Handles the message delta data. + + Args: + data: The message delta data. 
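+                    Per the handling below, expected to expose delta.content,
+                    a list of parts whose type is either "text" or "image_file".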
+ + Returns: + None + """ + for content in data.delta.content: + if content.type == "text": + content = content.text.value + if content is not None: + self.current_message_text += content + run_sync(self.current_message.stream_token(content)) + elif content.type == "image_file": + file_id = content.image_file.file_id + image_data = sync_openai_client.files.content(file_id) + image_data_bytes = image_data.read() + png_file = f"{images_loc}{file_id}.png" + print(f"Writing image to {png_file}") + with open(png_file, "wb") as file: + file.write(image_data_bytes) + image = cl.Image(path=png_file, display="inline", size="large") + print(f"Image: {png_file}") + if not self.current_message.elements: + self.current_message.elements = [] + self.current_message.elements.append(image) + run_sync(self.current_message.update()) + else: + print(f"Unhandled delta type: {content.type}") + + def handle_message_completed(self, data, run_id): + """ + Handles the completion of a message. + + Args: + data: The data associated with the completed message. + run_id: The ID of the message run. + + Returns: + None + """ + # Add footer to self message. We have to start a new message so it's in right order + # TODO combine streaming with image and footer + run_sync(self.current_message.update()) + self.current_message = run_sync( + cl.Message(content="", disable_feedback=True).send() + ) - def handle_message_completed(self, data, run_id): - """ - Handles the completion of a message. + word_count = len(self.current_message_text.split()) + if word_count > 10: + run_sync(self.current_message.stream_token(llm_footer)) + run_sync(self.current_message.update()) - Args: - data: The data associated with the completed message. - run_id: The ID of the message run. + def handle_requires_action(self, data, run_id): + """ + Handles the required action by executing the specified tools and submitting the tool outputs. - Returns: - None - """ - # Add footer to self message. We have to start a new message so it's in right order - # TODO combine streaming with image and footer - run_sync(self.current_message.update()) - self.current_message = run_sync( - cl.Message(content="", disable_feedback=True).send() - ) - - word_count = len(self.current_message_text.split()) - if word_count > 10: - run_sync(self.current_message.stream_token(llm_footer)) - run_sync(self.current_message.update()) + Args: + data: The data containing the required action information. + run_id: The ID of the current run. - def handle_requires_action(self, data, run_id): - """ - Handles the required action by executing the specified tools and submitting the tool outputs. + Returns: + None + """ + tool_outputs = [] - Args: - data: The data containing the required action information. - run_id: The ID of the current run. 
+ for tool in data.required_action.submit_tool_outputs.tool_calls: + print(tool) - Returns: - None - """ - tool_outputs = [] + function_name = tool.function.name + function_args = tool.function.arguments - for tool in data.required_action.submit_tool_outputs.tool_calls: - print(tool) + function_output = run_function(function_name, function_args) - function_name = tool.function.name - function_args = tool.function.arguments + tool_outputs.append( + {"tool_call_id": tool.id, "output": function_output} + ) - function_output = run_function(function_name, function_args) + print("TOOL OUTPUTS: ") - tool_outputs.append({"tool_call_id": tool.id, "output": function_output}) + print(tool_outputs) - print("TOOL OUTPUTS: ") + # Submit all tool_outputs at the same time + self.submit_tool_outputs(tool_outputs, run_id) - print(tool_outputs) + def submit_tool_outputs(self, tool_outputs, run_id): + """ + Submits the tool outputs to the current run. - # Submit all tool_outputs at the same time - self.submit_tool_outputs(tool_outputs, run_id) + Args: + tool_outputs (list): A list of tool outputs to be submitted. + run_id (str): The ID of the current run. - def submit_tool_outputs(self, tool_outputs, run_id): - """ - Submits the tool outputs to the current run. + Returns: + None + """ + with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( + thread_id=self.current_run.thread_id, + run_id=self.current_run.id, + tool_outputs=tool_outputs, + event_handler=EventHandler(assistant_name=self.assistant_name), + ) as stream: + # Needs this line, or it doesn't work! :) + for text in stream.text_deltas: + print(text) - Args: - tool_outputs (list): A list of tool outputs to be submitted. - run_id (str): The ID of the current run. + event_handler = EventHandler(assistant_name) - Returns: - None - """ - with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( - thread_id=self.current_run.thread_id, - run_id=self.current_run.id, - tool_outputs=tool_outputs, - event_handler=EventHandler(assistant_name=self.assistant_name), - ) as stream: - # Needs this line, or it doesn't work! :) - for text in stream.text_deltas: - print(text) + return event_handler def run_function(function_name, function_args): @@ -375,6 +393,9 @@ async def start_chat(): content="Hi. I'm your humanitarian AI assistant.", disable_feedback=True ).send() + event_handler = get_event_handler(cl, assistant.name) + cl.user_session.set("event_handler", event_handler) + cl.user_session.set("chat_history", []) @@ -693,12 +714,14 @@ async def main(message: cl.Message): msg.content = "" await msg.update() + event_handler = cl.user_session.get("event_handler") + # Create and Stream a Run print(f"Creating and streaming a run {assistant.id}") with sync_openai_client.beta.threads.runs.stream( thread_id=thread_id, assistant_id=assistant.id, - event_handler=EventHandler(assistant_name=assistant.name), + event_handler=event_handler, ) as stream: stream.until_done() From e47a0e88e8b3baed058c902e3c25a1a3e667d9d6 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Thu, 27 Jun 2024 15:03:57 -0400 Subject: [PATCH 05/20] Mock test harness, with Mock chainlit so we can use UI code for promptflow tests. 
Also initial Promtpflow --- .../aggregate_variant_results.py | 38 +++ .../chainlit-ui-evaluation/call_assistant.py | 260 ++++++++++++++++++ .../check_evaluation_results.py | 37 +++ flows/chainlit-ui-evaluation/concat_scores.py | 39 +++ flows/chainlit-ui-evaluation/data.jsonl | 2 + .../groundedness_score.jinja2 | 35 +++ 6 files changed, 411 insertions(+) create mode 100644 flows/chainlit-ui-evaluation/aggregate_variant_results.py create mode 100644 flows/chainlit-ui-evaluation/call_assistant.py create mode 100644 flows/chainlit-ui-evaluation/check_evaluation_results.py create mode 100644 flows/chainlit-ui-evaluation/concat_scores.py create mode 100644 flows/chainlit-ui-evaluation/data.jsonl create mode 100644 flows/chainlit-ui-evaluation/groundedness_score.jinja2 diff --git a/flows/chainlit-ui-evaluation/aggregate_variant_results.py b/flows/chainlit-ui-evaluation/aggregate_variant_results.py new file mode 100644 index 00000000..bffbefc3 --- /dev/null +++ b/flows/chainlit-ui-evaluation/aggregate_variant_results.py @@ -0,0 +1,38 @@ +from typing import List + +import numpy as np +from promptflow import log_metric, tool + + +@tool +def aggregate_variants_results(results: List[dict]): + """ + Aggregate the results of multiple variants. + + Args: + results (List[dict]): A list of dictionaries containing the results for each variant. + + Returns: + dict: A dictionary containing the aggregated results, with the metric names as keys and the aggregated values as values. + """ + aggregate_results = {} + for result in results: + for name, value in result.items(): + if name not in aggregate_results.keys(): + aggregate_results[name] = [] + try: + float_val = float(value) + except Exception: + float_val = np.nan + aggregate_results[name].append(float_val) + + for name, value in aggregate_results.items(): + metric_name = name + aggregate_results[name] = np.nanmean(value) + if "pass_rate" in metric_name: + metric_name = metric_name + "(%)" + aggregate_results[name] = aggregate_results[name] * 100.0 + aggregate_results[name] = round(aggregate_results[name], 2) + log_metric(metric_name, aggregate_results[name]) + + return aggregate_results diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py new file mode 100644 index 00000000..7f8cec47 --- /dev/null +++ b/flows/chainlit-ui-evaluation/call_assistant.py @@ -0,0 +1,260 @@ +import asyncio +import inspect +import sys +import threading +from contextlib import asynccontextmanager +from contextvars import ContextVar + +import chainlit as cl +from promptflow.core import tool + + +@tool +async def call_assistant(chat_history: list) -> dict: + """ + Calls the assistant API with the given input and retrieves the response. + + Args: + chat_history (list): A list containing the chat history, of the format ... + + [ + { + "author": "user", + "content": "Hi", + "elements": [] + }, + { + "author": "assistant", + "content": "Hello! How can I help you today?", + "elements": [] + } + ] + + Returns: + dict: A dictionary containing the response from the assistant, function name, function arguments, + function output, and the number of tokens in the function output. + """ + # print("Initializing chainlit thread ...") + # thread = await start_chat() + # print(thread) + + +def setup_mock_class(): + """ + Creates and returns a mock class for testing purposes. + + Returns: + cl_mock (MockChainlit): The mock class instance. + """ + + class MockMessage: + """ + A class representing a mock message. 
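+        It mimics just the parts of the chainlit.Message interface that the
+        app code exercises (send, stream_token and update).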
+ + Attributes: + author (str): The author of the message. + content (str): The content of the message. + elements (list): The elements of the message. + disable_feedback (bool): Flag indicating whether feedback is disabled. + + Methods: + send(): Sends the message. + stream_token(content): Streams a token. + update(): Updates the message. + """ + + def __init__( + self, author=None, content=None, elements=None, disable_feedback=False + ): + if content is None: + content = "" + self.author = author + self.content = content + self.disable_feedback = disable_feedback + self.elements = elements if elements is not None else [] + + async def send(self): + """ + Sends the message. + + Returns: + MockMessage: The sent message. + """ + print( + f"Sending message: Author: {self.author}, Content: {self.content}, Elements: {self.elements}" + ) + return self + + async def stream_token(self, content): + """ + Streams a token. + + Args: + content (str): The content of the token. + + Returns: + MockMessage: The updated message. + """ + print(f"Streaming token: Author: {self.author}, Content: {content}") + return self + + async def update(self): + """ + Updates the message. + + Returns: + MockMessage: The updated message. + """ + print( + f"Updating message: Author: {self.author}, Content: {self.content}, Elements: {self.elements}" + ) + return self + + class MockUserSession: + """ + A class representing a mock user session. + + Attributes: + session_data (dict): A dictionary to store session data. + + Methods: + get(key): Retrieves the value associated with the given key from the session data. + set(key, value): Sets the value associated with the given key in the session data. + """ + + def __init__(self): + self.session_data = {} + + def get(self, key): + """ + Retrieves the value associated with the given key from the session data. + + Args: + key (str): The key to retrieve the value for. + + Returns: + The value associated with the given key, or None if the key is not found. + """ + return self.session_data.get(key, None) + + def set(self, key, value): + """ + Sets the value associated with the given key in the session data. + + Args: + key (str): The key to set the value for. + value: The value to be associated with the key. + """ + self.session_data[key] = value + + class MockChainlit: + """ + A mock implementation of the Chainlit class. + """ + + def __init__(self): + self.Message = MockMessage + self.user_session = MockUserSession() + self.__name__ = "chainlit" + self.step = None + + cl_mock = MockChainlit() + + return cl_mock + + +# Method to run a supplied function to override chainlit's run_sync method +def run_async_coroutine(coroutine): + """ + Runs an asynchronous coroutine in a separate event loop and returns the result. + + Args: + coroutine: The coroutine to be executed asynchronously. + + Returns: + The result of the coroutine execution. + + Raises: + asyncio.TimeoutError: If the coroutine execution times out. + + """ + + def start_loop(loop): + asyncio.set_event_loop(loop) + loop.run_forever() + + new_loop = asyncio.new_event_loop() + t = threading.Thread(target=start_loop, args=(new_loop,)) + t.start() + future = asyncio.run_coroutine_threadsafe(coroutine, new_loop) + try: + return future.result(timeout=10) + except asyncio.TimeoutError: + print("Coroutine execution timed out.") + return None + + +def run_sync(func, *args, **kwargs): + """ + Run a function synchronously or asynchronously depending on its type. + + Args: + func: The function to be executed. 
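+            May be a plain callable, a coroutine function, or an
+            already-created coroutine object; each case is dispatched to the
+            matching branch below.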
+ *args: Positional arguments to be passed to the function. + **kwargs: Keyword arguments to be passed to the function. + + Returns: + The result of the function execution. + + Raises: + None. + + """ + if inspect.iscoroutinefunction(func): + # Use the alternative approach for coroutine functions + coroutine = func(*args, **kwargs) + return run_async_coroutine(coroutine) + elif asyncio.iscoroutine(func): + # Directly pass the coroutine object + return run_async_coroutine(func) + else: + # Handle synchronous function + return func(*args, **kwargs) + + +def test(): + """ + This function is used to test the functionality of the app module. + It sets up a mock chainlit class, imports the app module, and overrides certain methods and event handlers. + Then it calls the app start and main functions to simulate a chat interaction. + """ + + # Create a Mock chainlit class + cl_mock = setup_mock_class() + + # Import chainlit app + import app as app + + # Override run_sync method to use mock cl + app.run_sync = run_sync + + # Override OpenAI event handler to use mock cl + event_handler = app.get_event_handler(cl_mock, "assistant") + cl_mock.user_session.set("event_handler", event_handler) + + # Patch 'cl' with our Mock class + app.cl = cl_mock + + # Call app start to set up variables + asyncio.run(app.start_chat()) + + # Here insert history to thread + + # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[]) + msg = cl_mock.Message(author="You", content="Hi", elements=[]) + thread = asyncio.run(app.main(msg)) + print(thread) + + +if __name__ == "__main__": + test() diff --git a/flows/chainlit-ui-evaluation/check_evaluation_results.py b/flows/chainlit-ui-evaluation/check_evaluation_results.py new file mode 100644 index 00000000..352ed21e --- /dev/null +++ b/flows/chainlit-ui-evaluation/check_evaluation_results.py @@ -0,0 +1,37 @@ +# This script runs after GitHub action to check promptflow evaluation +import json +import os +import subprocess +import sys + + +def check_result(run_name="base_run", cutoff=100.0): + """ + Check the evaluation result for a given run. + + Args: + run_name (str, optional): The name of the run to check. Defaults to "base_run". + cutoff (float, optional): The cutoff value for passing the evaluation. Defaults to 100.0. + + Returns: + dict: The evaluation result. + + Raises: + SystemExit: If the evaluation result is below the cutoff value. + """ + cmd = f"pf run show-metrics -n {run_name}" + print(cmd) + # Run cmd and capture output + result = subprocess.check_output(cmd, shell=True, text=True) + print(result) + result = json.loads(result) + print(result["gpt_groundedness_pass_rate(%)"]) + if result["gpt_groundedness_pass_rate(%)"] < cutoff: + sys.exit(f"FAILED!!!! Run {run_name} failed with score {result}.") + else: + print(f"Run {run_name} passed with score {result}.") + return result + + +if __name__ == "__main__": + check_result() diff --git a/flows/chainlit-ui-evaluation/concat_scores.py b/flows/chainlit-ui-evaluation/concat_scores.py new file mode 100644 index 00000000..112bb9a1 --- /dev/null +++ b/flows/chainlit-ui-evaluation/concat_scores.py @@ -0,0 +1,39 @@ +import re + +import numpy as np +from promptflow import tool + + +@tool +def concat_results(groundesness_score: str): + """ + Concatenates the results of groundedness scores. + + Args: + groundesness_score (str): The groundedness score. + + Returns: + dict: A dictionary containing the concatenated results of groundedness scores. 
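+
+        Example of the returned shape (illustrative values):
+            {"gpt_groundedness": 5.0, "gpt_groundedness_pass_rate": 1}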
+ """ + + load_list = [{"name": "gpt_groundedness", "score": groundesness_score}] + score_list = [] + errors = [] + for item in load_list: + try: + score = item["score"] + match = re.search(r"\d", score) + if match: + score = match.group() + score = float(score) + except Exception as e: + score = np.nan + errors.append({"name": item["name"], "msg": str(e), "data": item["score"]}) + score_list.append({"name": item["name"], "score": score}) + + variant_level_result = {} + for item in score_list: + item_name = str(item["name"]) + variant_level_result[item_name] = item["score"] + variant_level_result[item_name + "_pass_rate"] = 1 if item["score"] > 3 else 0 + return variant_level_result diff --git a/flows/chainlit-ui-evaluation/data.jsonl b/flows/chainlit-ui-evaluation/data.jsonl new file mode 100644 index 00000000..1602a71b --- /dev/null +++ b/flows/chainlit-ui-evaluation/data.jsonl @@ -0,0 +1,2 @@ +{"query": "What is the total population of Mali", "context": "The total population of Mali is 17839995"} +{"query": "Can you provide the total population for Ethiopia and its breakdown by gender?", "context": "The total population for Ethiopia is 102,536,656, with a gender breakdown as follows:\n\nMale population: 51,389,381\nFemale population: 51,147,275"} diff --git a/flows/chainlit-ui-evaluation/groundedness_score.jinja2 b/flows/chainlit-ui-evaluation/groundedness_score.jinja2 new file mode 100644 index 00000000..182a6329 --- /dev/null +++ b/flows/chainlit-ui-evaluation/groundedness_score.jinja2 @@ -0,0 +1,35 @@ +System: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +User: +You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. + +Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. + +Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. +Independent Examples: +## Example Task #1 Input: +{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is presented every other two years"} +## Example Task #1 Output: +1 +## Example Task #2 Input: +{"CONTEXT": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. 
They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is very important awards in the entertainment industry in the United States. And it's also significant worldwide"} +## Example Task #2 Output: +5 +## Example Task #3 Input: +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +## Example Task #3 Output: +5 +## Example Task #4 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #4 Output: +1 + +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context. + +## Actual Task Input: +{"CONTEXT": {{context}}, "ANSWER": {{answer}}} + +Actual Task Output: \ No newline at end of file From 8302bc161fc1a11670dfb7241d21447d03925574 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Thu, 27 Jun 2024 15:04:30 -0400 Subject: [PATCH 06/20] Mock test harness, with Mock chainlit so we can use UI code for promptflow tests. Also initial Promtpflow --- README.md | 13 ++ ui/chat-chainlit-assistant/app.py | 307 ++++++++++++++---------------- 2 files changed, 157 insertions(+), 163 deletions(-) diff --git a/README.md b/README.md index 18f7d2e9..9b708359 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,19 @@ To activate: 6. Go to playground and start a new session, select the 'Recipes data Analysis' workflow 7. Ask 'What is the total population of Mali?' +# Evaluation with Prompt Flow + +1. Install the DevContainers VSCode extension +2. Build data recipes using the `docker compose` command mentioned above +3. Open the command palette in VSCode (CMD + Shift + P on Mac; CTRL + Shift + P on Windows) and select + + `Dev Containers: Attach to remote container`. + + Select the promptflow container. This opens a new VSCode window - use it for the next steps. +4. Open folder `/app` +5. Click on `flow.dag.yaml` +6. Top left of main pane, click on 'Visual editor' + # Deployment We will add more details here soon, for now, here are some notes on Azure ... diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py index 64fbb81d..4dbfffb5 100644 --- a/ui/chat-chainlit-assistant/app.py +++ b/ui/chat-chainlit-assistant/app.py @@ -72,176 +72,158 @@ config.ui.name = bot_name -def get_event_handler(cl, assistant_name): # noqa: C901 - """ - Returns an instance of the EventHandler class, which is responsible for handling events in the ChatChainlitAssistant. - - Args: - cl: The ChatClient instance used for communication with the chat service. - assistant_name (str): The name of the assistant. +class EventHandler(AssistantEventHandler): - Returns: - EventHandler: An instance of the EventHandler class. - """ + def __init__(self, assistant_name: str) -> None: + """ + Initializes a new instance of the ChatChainlitAssistant class. - class EventHandler(AssistantEventHandler): + Args: + assistant_name (str): The name of the assistant. 
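+                Stored on the handler and reused when nested handlers are
+                created in submit_tool_outputs below.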
- def __init__(self, assistant_name: str) -> None: - """ - Initializes a new instance of the ChatChainlitAssistant class. + Returns: + None + """ + super().__init__() + self.current_message: cl.Message = None + self.current_step: cl.Step = None + self.current_tool_call = None + self.current_message_text = "" + self.assistant_name = assistant_name + + @override + def on_event(self, event): + """ + Handles the incoming event and performs the necessary actions based on the event type. - Args: - assistant_name (str): The name of the assistant. + Args: + event: The event object containing information about the event. - Returns: - None - """ - super().__init__() - self.current_message: cl.Message = None - self.current_step: cl.Step = None - self.current_tool_call = None + Returns: + None + """ + print(event.event) + run_id = event.data.id + if event.event == "thread.message.created": + self.current_message = run_sync(cl.Message(content="").send()) self.current_message_text = "" - self.assistant_name = assistant_name - - @override - def on_event(self, event): - """ - Handles the incoming event and performs the necessary actions based on the event type. - - Args: - event: The event object containing information about the event. - - Returns: - None - """ - print(event.event) - run_id = event.data.id - if event.event == "thread.message.created": - self.current_message = run_sync(cl.Message(content="").send()) - self.current_message_text = "" - print("Run started") - if event.event == "thread.message.completed": - self.handle_message_completed(event.data, run_id) - elif event.event == "thread.run.requires_action": - self.handle_requires_action(event.data, run_id) - elif event.event == "thread.message.delta": - self.handle_message_delta(event.data) + print("Run started") + if event.event == "thread.message.completed": + self.handle_message_completed(event.data, run_id) + elif event.event == "thread.run.requires_action": + self.handle_requires_action(event.data, run_id) + elif event.event == "thread.message.delta": + self.handle_message_delta(event.data) + else: + print(json.dumps(str(event.data), indent=4)) + print(f"Unhandled event: {event.event}") + + def handle_message_delta(self, data): + """ + Handles the message delta data. + + Args: + data: The message delta data. + + Returns: + None + """ + for content in data.delta.content: + if content.type == "text": + content = content.text.value + if content is not None: + self.current_message_text += content + run_sync(self.current_message.stream_token(content)) + elif content.type == "image_file": + file_id = content.image_file.file_id + image_data = sync_openai_client.files.content(file_id) + image_data_bytes = image_data.read() + png_file = f"{images_loc}{file_id}.png" + print(f"Writing image to {png_file}") + with open(png_file, "wb") as file: + file.write(image_data_bytes) + image = cl.Image(path=png_file, display="inline", size="large") + print(f"Image: {png_file}") + if not self.current_message.elements: + self.current_message.elements = [] + self.current_message.elements.append(image) + run_sync(self.current_message.update()) else: - print(json.dumps(str(event.data), indent=4)) - print(f"Unhandled event: {event.event}") - - def handle_message_delta(self, data): - """ - Handles the message delta data. - - Args: - data: The message delta data. 
- - Returns: - None - """ - for content in data.delta.content: - if content.type == "text": - content = content.text.value - if content is not None: - self.current_message_text += content - run_sync(self.current_message.stream_token(content)) - elif content.type == "image_file": - file_id = content.image_file.file_id - image_data = sync_openai_client.files.content(file_id) - image_data_bytes = image_data.read() - png_file = f"{images_loc}{file_id}.png" - print(f"Writing image to {png_file}") - with open(png_file, "wb") as file: - file.write(image_data_bytes) - image = cl.Image(path=png_file, display="inline", size="large") - print(f"Image: {png_file}") - if not self.current_message.elements: - self.current_message.elements = [] - self.current_message.elements.append(image) - run_sync(self.current_message.update()) - else: - print(f"Unhandled delta type: {content.type}") - - def handle_message_completed(self, data, run_id): - """ - Handles the completion of a message. - - Args: - data: The data associated with the completed message. - run_id: The ID of the message run. - - Returns: - None - """ - # Add footer to self message. We have to start a new message so it's in right order - # TODO combine streaming with image and footer - run_sync(self.current_message.update()) - self.current_message = run_sync( - cl.Message(content="", disable_feedback=True).send() - ) + print(f"Unhandled delta type: {content.type}") - word_count = len(self.current_message_text.split()) - if word_count > 10: - run_sync(self.current_message.stream_token(llm_footer)) - run_sync(self.current_message.update()) + def handle_message_completed(self, data, run_id): + """ + Handles the completion of a message. - def handle_requires_action(self, data, run_id): - """ - Handles the required action by executing the specified tools and submitting the tool outputs. + Args: + data: The data associated with the completed message. + run_id: The ID of the message run. - Args: - data: The data containing the required action information. - run_id: The ID of the current run. + Returns: + None + """ + # Add footer to self message. We have to start a new message so it's in right order + # TODO combine streaming with image and footer + run_sync(self.current_message.update()) + self.current_message = run_sync( + cl.Message(content="", disable_feedback=True).send() + ) - Returns: - None - """ - tool_outputs = [] + word_count = len(self.current_message_text.split()) + if word_count > 10: + run_sync(self.current_message.stream_token(llm_footer)) + run_sync(self.current_message.update()) - for tool in data.required_action.submit_tool_outputs.tool_calls: - print(tool) + def handle_requires_action(self, data, run_id): + """ + Handles the required action by executing the specified tools and submitting the tool outputs. - function_name = tool.function.name - function_args = tool.function.arguments + Args: + data: The data containing the required action information. + run_id: The ID of the current run. 
- function_output = run_function(function_name, function_args) + Returns: + None + """ + tool_outputs = [] - tool_outputs.append( - {"tool_call_id": tool.id, "output": function_output} - ) + for tool in data.required_action.submit_tool_outputs.tool_calls: + print(tool) - print("TOOL OUTPUTS: ") + function_name = tool.function.name + function_args = tool.function.arguments - print(tool_outputs) + function_output = run_function(function_name, function_args) - # Submit all tool_outputs at the same time - self.submit_tool_outputs(tool_outputs, run_id) + tool_outputs.append({"tool_call_id": tool.id, "output": function_output}) - def submit_tool_outputs(self, tool_outputs, run_id): - """ - Submits the tool outputs to the current run. + print("TOOL OUTPUTS: ") - Args: - tool_outputs (list): A list of tool outputs to be submitted. - run_id (str): The ID of the current run. + print(tool_outputs) - Returns: - None - """ - with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( - thread_id=self.current_run.thread_id, - run_id=self.current_run.id, - tool_outputs=tool_outputs, - event_handler=EventHandler(assistant_name=self.assistant_name), - ) as stream: - # Needs this line, or it doesn't work! :) - for text in stream.text_deltas: - print(text) + # Submit all tool_outputs at the same time + self.submit_tool_outputs(tool_outputs, run_id) - event_handler = EventHandler(assistant_name) + def submit_tool_outputs(self, tool_outputs, run_id): + """ + Submits the tool outputs to the current run. - return event_handler + Args: + tool_outputs (list): A list of tool outputs to be submitted. + run_id (str): The ID of the current run. + + Returns: + None + """ + with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( + thread_id=self.current_run.thread_id, + run_id=self.current_run.id, + tool_outputs=tool_outputs, + event_handler=EventHandler(assistant_name=self.assistant_name), + ) as stream: + # Needs this line, or it doesn't work! :) + for text in stream.text_deltas: + print(text) def run_function(function_name, function_args): @@ -393,9 +375,6 @@ async def start_chat(): content="Hi. I'm your humanitarian AI assistant.", disable_feedback=True ).send() - event_handler = get_event_handler(cl, assistant.name) - cl.user_session.set("event_handler", event_handler) - cl.user_session.set("chat_history", []) @@ -648,6 +627,9 @@ async def add_message_to_thread(thread_id, role, content, message=None): Returns: None """ + + print(f"Content: {content}") + # Azure doesn't yet support attachments if os.getenv("ASSISTANTS_API_TYPE") == "openai": @@ -682,25 +664,26 @@ async def main(message: cl.Message): Returns: None """ + thread_id = cl.user_session.get("thread_id") chat_history = cl.user_session.get("chat_history") - msg = await cl.Message("").send() - - # Record user's message - add_message_to_thread(thread_id, "user", message.content, message) chat_history.append({"role": "user", "content": message.content}) - # Check recipes + # Add user message to thread + print(f"Adding user message {message.content} to thread {thread_id}") + await add_message_to_thread(thread_id, "user", message.content, message) + + # Check memories/recipes + msg = await cl.Message("").send() memory_found, memory_content, memory_response, meta_data_msg = ( await async_check_memories_recipes(message.content, chat_history) ) - # Message to the thread. 
If a memory add it as the assistant if memory_found is True: - print("Adding memory to thread") - add_message_to_thread(thread_id, "assistant", memory_content) + print("Adding memory to thread as assistant") + await add_message_to_thread(thread_id, "assistant", memory_content) - # Send memory output + # output memory artifacts msg.content = memory_response["content"] msg.elements = memory_response["elements"] await msg.update() @@ -714,14 +697,12 @@ async def main(message: cl.Message): msg.content = "" await msg.update() - event_handler = cl.user_session.get("event_handler") - - # Create and Stream a Run + # Create and Stream a Run to assistant print(f"Creating and streaming a run {assistant.id}") with sync_openai_client.beta.threads.runs.stream( thread_id=thread_id, assistant_id=assistant.id, - event_handler=event_handler, + event_handler=EventHandler(assistant_name=assistant.name), ) as stream: stream.until_done() From 72600740715446161b5ad8204a30c57ca30ecf35 Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Thu, 27 Jun 2024 15:24:07 -0400 Subject: [PATCH 07/20] Mock test harness, with Mock chainlit so we can use UI code for promptflow tests. Also initial Promtpflow --- .../chainlit-ui-evaluation/azure_openai.yaml | 6 + .../chainlit-ui-evaluation/call_assistant.py | 5 +- flows/chainlit-ui-evaluation/flow.dag.yaml | 64 ++++ flows/chainlit-ui-evaluation/openai.yaml | 4 + ui/chat-chainlit-assistant/app.py | 287 ++++++++++-------- 5 files changed, 233 insertions(+), 133 deletions(-) create mode 100644 flows/chainlit-ui-evaluation/azure_openai.yaml create mode 100644 flows/chainlit-ui-evaluation/flow.dag.yaml create mode 100644 flows/chainlit-ui-evaluation/openai.yaml diff --git a/flows/chainlit-ui-evaluation/azure_openai.yaml b/flows/chainlit-ui-evaluation/azure_openai.yaml new file mode 100644 index 00000000..5b916e77 --- /dev/null +++ b/flows/chainlit-ui-evaluation/azure_openai.yaml @@ -0,0 +1,6 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/AzureOpenAIConnection.schema.json +name: open_ai_connection +type: azure_open_ai +api_key: "" +api_base: "" +api_type: "azure" \ No newline at end of file diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py index 7f8cec47..30befdd0 100644 --- a/flows/chainlit-ui-evaluation/call_assistant.py +++ b/flows/chainlit-ui-evaluation/call_assistant.py @@ -249,11 +249,12 @@ def test(): asyncio.run(app.start_chat()) # Here insert history to thread + # thread_id = app.cl.user_session.get("thread_id") + # wait app.add_message_to_thread(thread_id, "user", message.content, message) # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[]) msg = cl_mock.Message(author="You", content="Hi", elements=[]) - thread = asyncio.run(app.main(msg)) - print(thread) + asyncio.run(app.main(msg)) if __name__ == "__main__": diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml new file mode 100644 index 00000000..c9709b72 --- /dev/null +++ b/flows/chainlit-ui-evaluation/flow.dag.yaml @@ -0,0 +1,64 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +environment: + python_requirements_txt: requirements.txt +inputs: + query: + type: string + default: what's the total population of mali? 
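+  # context is the ground truth the groundedness evaluator compares the
+  # assistant's answer against (see the groundedness_score node below)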
+ context: + type: string + default: The total population of Mali is 17,839,995 +outputs: + agent_putput: + type: string + reference: ${call_assistant.output.response} + groundedness_score: + type: string + reference: ${groundedness_score.output} + context: + type: string + reference: ${inputs.context} + query: + type: string + reference: ${inputs.query} + function_name: + type: string + reference: ${call_assistant.output.function_name} + function_args: + type: string + reference: ${call_assistant.output.function_args} +nodes: +- name: call_assistant + type: python + source: + type: code + path: call_assistant.py + inputs: {} +- name: groundedness_score + type: llm + source: + type: code + path: groundedness_score.jinja2 + inputs: + deployment_name: gpt-4-turbo + answer: ${call_assistant.output.response} + context: ${inputs.context} + temperature: 1 + model: gpt-4-turbo-preview + connection: azure_openai + api: chat +- name: concat_scores + type: python + source: + type: code + path: concat_scores.py + inputs: + groundesness_score: ${groundedness_score.output} +- name: aggregate_variant_results + type: python + source: + type: code + path: aggregate_variant_results.py + inputs: + results: ${concat_scores.output} + aggregation: true diff --git a/flows/chainlit-ui-evaluation/openai.yaml b/flows/chainlit-ui-evaluation/openai.yaml new file mode 100644 index 00000000..68d25b7a --- /dev/null +++ b/flows/chainlit-ui-evaluation/openai.yaml @@ -0,0 +1,4 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/OpenAIConnection.schema.json +name: open_ai_connection +type: open_ai +api_key: "" \ No newline at end of file diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py index 4dbfffb5..d407a110 100644 --- a/ui/chat-chainlit-assistant/app.py +++ b/ui/chat-chainlit-assistant/app.py @@ -72,158 +72,179 @@ config.ui.name = bot_name -class EventHandler(AssistantEventHandler): - - def __init__(self, assistant_name: str) -> None: - """ - Initializes a new instance of the ChatChainlitAssistant class. - - Args: - assistant_name (str): The name of the assistant. +def get_event_handler(cl, assistant_name): # noqa: C901 + """ + Returns an instance of the EventHandler class, which is responsible for handling events in the ChatChainlitAssistant. - Returns: - None - """ - super().__init__() - self.current_message: cl.Message = None - self.current_step: cl.Step = None - self.current_tool_call = None - self.current_message_text = "" - self.assistant_name = assistant_name - - @override - def on_event(self, event): - """ - Handles the incoming event and performs the necessary actions based on the event type. + Args: + cl: The ChatClient instance used for communication with the chat service. + assistant_name (str): The name of the assistant. - Args: - event: The event object containing information about the event. + Returns: + EventHandler: An instance of the EventHandler class. 
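+
+    Note: the handler keeps a reference to the cl module it is given, so the
+    same code can drive either real chainlit or the promptflow test mock.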
+ """ - Returns: - None - """ - print(event.event) - run_id = event.data.id - if event.event == "thread.message.created": - self.current_message = run_sync(cl.Message(content="").send()) - self.current_message_text = "" - print("Run started") - if event.event == "thread.message.completed": - self.handle_message_completed(event.data, run_id) - elif event.event == "thread.run.requires_action": - self.handle_requires_action(event.data, run_id) - elif event.event == "thread.message.delta": - self.handle_message_delta(event.data) - else: - print(json.dumps(str(event.data), indent=4)) - print(f"Unhandled event: {event.event}") + class EventHandler(AssistantEventHandler): - def handle_message_delta(self, data): - """ - Handles the message delta data. + def __init__(self, assistant_name: str) -> None: + """ + Initializes a new instance of the ChatChainlitAssistant class. - Args: - data: The message delta data. + Args: + assistant_name (str): The name of the assistant. - Returns: - None - """ - for content in data.delta.content: - if content.type == "text": - content = content.text.value - if content is not None: - self.current_message_text += content - run_sync(self.current_message.stream_token(content)) - elif content.type == "image_file": - file_id = content.image_file.file_id - image_data = sync_openai_client.files.content(file_id) - image_data_bytes = image_data.read() - png_file = f"{images_loc}{file_id}.png" - print(f"Writing image to {png_file}") - with open(png_file, "wb") as file: - file.write(image_data_bytes) - image = cl.Image(path=png_file, display="inline", size="large") - print(f"Image: {png_file}") - if not self.current_message.elements: - self.current_message.elements = [] - self.current_message.elements.append(image) - run_sync(self.current_message.update()) + Returns: + None + """ + super().__init__() + self.current_message: cl.Message = None + self.current_step: cl.Step = None + self.current_tool_call = None + self.current_message_text = "" + self.assistant_name = assistant_name + self.cl = cl + + @override + def on_event(self, event): + """ + Handles the incoming event and performs the necessary actions based on the event type. + + Args: + event: The event object containing information about the event. + + Returns: + None + """ + print(event.event) + run_id = event.data.id + if event.event == "thread.message.created": + self.current_message = self.cl.Message(content="") + self.current_message = run_sync(self.current_message.send()) + self.current_message_text = "" + print("Run started") + if event.event == "thread.message.completed": + self.handle_message_completed(event.data, run_id) + elif event.event == "thread.run.requires_action": + self.handle_requires_action(event.data, run_id) + elif event.event == "thread.message.delta": + self.handle_message_delta(event.data) else: - print(f"Unhandled delta type: {content.type}") + print(json.dumps(str(event.data), indent=4)) + print(f"Unhandled event: {event.event}") + + def handle_message_delta(self, data): + """ + Handles the message delta data. + + Args: + data: The message delta data. 
+ + Returns: + None + """ + for content in data.delta.content: + if content.type == "text": + content = content.text.value + if content is not None: + self.current_message_text += content + run_sync(self.current_message.stream_token(content)) + elif content.type == "image_file": + file_id = content.image_file.file_id + image_data = sync_openai_client.files.content(file_id) + image_data_bytes = image_data.read() + png_file = f"{images_loc}{file_id}.png" + print(f"Writing image to {png_file}") + with open(png_file, "wb") as file: + file.write(image_data_bytes) + image = cl.Image(path=png_file, display="inline", size="large") + print(f"Image: {png_file}") + if not self.current_message.elements: + self.current_message.elements = [] + self.current_message.elements.append(image) + run_sync(self.current_message.update()) + else: + print(f"Unhandled delta type: {content.type}") + + def handle_message_completed(self, data, run_id): + """ + Handles the completion of a message. + + Args: + data: The data associated with the completed message. + run_id: The ID of the message run. + + Returns: + None + """ + # Add footer to self message. We have to start a new message so it's in right order + # TODO combine streaming with image and footer + run_sync(self.current_message.update()) + self.current_message = run_sync( + cl.Message(content="", disable_feedback=True).send() + ) - def handle_message_completed(self, data, run_id): - """ - Handles the completion of a message. + word_count = len(self.current_message_text.split()) + if word_count > 10: + run_sync(self.current_message.stream_token(llm_footer)) + run_sync(self.current_message.update()) - Args: - data: The data associated with the completed message. - run_id: The ID of the message run. + def handle_requires_action(self, data, run_id): + """ + Handles the required action by executing the specified tools and submitting the tool outputs. - Returns: - None - """ - # Add footer to self message. We have to start a new message so it's in right order - # TODO combine streaming with image and footer - run_sync(self.current_message.update()) - self.current_message = run_sync( - cl.Message(content="", disable_feedback=True).send() - ) + Args: + data: The data containing the required action information. + run_id: The ID of the current run. - word_count = len(self.current_message_text.split()) - if word_count > 10: - run_sync(self.current_message.stream_token(llm_footer)) - run_sync(self.current_message.update()) + Returns: + None + """ + tool_outputs = [] - def handle_requires_action(self, data, run_id): - """ - Handles the required action by executing the specified tools and submitting the tool outputs. - - Args: - data: The data containing the required action information. - run_id: The ID of the current run. 
+ for tool in data.required_action.submit_tool_outputs.tool_calls: + print(tool) - Returns: - None - """ - tool_outputs = [] + function_name = tool.function.name + function_args = tool.function.arguments - for tool in data.required_action.submit_tool_outputs.tool_calls: - print(tool) + function_output = run_function(function_name, function_args) - function_name = tool.function.name - function_args = tool.function.arguments + tool_outputs.append( + {"tool_call_id": tool.id, "output": function_output} + ) - function_output = run_function(function_name, function_args) + print("TOOL OUTPUTS: ") - tool_outputs.append({"tool_call_id": tool.id, "output": function_output}) + print(tool_outputs) - print("TOOL OUTPUTS: ") + # Submit all tool_outputs at the same time + self.submit_tool_outputs(tool_outputs, run_id) - print(tool_outputs) + def submit_tool_outputs(self, tool_outputs, run_id): + """ + Submits the tool outputs to the current run. - # Submit all tool_outputs at the same time - self.submit_tool_outputs(tool_outputs, run_id) + Args: + tool_outputs (list): A list of tool outputs to be submitted. + run_id (str): The ID of the current run. - def submit_tool_outputs(self, tool_outputs, run_id): - """ - Submits the tool outputs to the current run. + Returns: + None + """ + event_handler = get_event_handler(cl, assistant.name) + with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( + thread_id=self.current_run.thread_id, + run_id=self.current_run.id, + tool_outputs=tool_outputs, + event_handler=event_handler, + ) as stream: + # Needs this line, or it doesn't work! :) + for text in stream.text_deltas: + print(text) - Args: - tool_outputs (list): A list of tool outputs to be submitted. - run_id (str): The ID of the current run. + event_handler = EventHandler(assistant_name) - Returns: - None - """ - with sync_openai_client.beta.threads.runs.submit_tool_outputs_stream( - thread_id=self.current_run.thread_id, - run_id=self.current_run.id, - tool_outputs=tool_outputs, - event_handler=EventHandler(assistant_name=self.assistant_name), - ) as stream: - # Needs this line, or it doesn't work! :) - for text in stream.text_deltas: - print(text) + return event_handler def run_function(function_name, function_args): @@ -365,18 +386,21 @@ async def start_chat(): It also sends an avatar and a welcome message to the chat. Returns: - None + dict: The thread object returned by the OpenAI API. """ # Create a Thread thread = await async_openai_client.beta.threads.create() # Store thread ID in user session for later use cl.user_session.set("thread_id", thread.id) + await cl.Message( content="Hi. 
     ).send()
     cl.user_session.set("chat_history", [])
+    return thread
+
 
 def get_metadata_footer(metadata):
     """
@@ -699,10 +723,11 @@ async def main(message: cl.Message):
 
     # Create and Stream a Run to assistant
     print(f"Creating and streaming a run {assistant.id}")
+    event_handler = get_event_handler(cl, assistant.name)
     with sync_openai_client.beta.threads.runs.stream(
         thread_id=thread_id,
         assistant_id=assistant.id,
-        event_handler=EventHandler(assistant_name=assistant.name),
+        event_handler=event_handler,
     ) as stream:
         stream.until_done()

From 9b97c4805714903d8487a0111cec01a3b03229b5 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Thu, 27 Jun 2024 17:24:17 -0400
Subject: [PATCH 08/20] Interim commit, still having thread management issues
 due to async chainlit

---
 .../chainlit-ui-evaluation/call_assistant.py  | 80 +++++++++++++++++--
 1 file changed, 72 insertions(+), 8 deletions(-)

diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py
index 30befdd0..4929c8d5 100644
--- a/flows/chainlit-ui-evaluation/call_assistant.py
+++ b/flows/chainlit-ui-evaluation/call_assistant.py
@@ -96,6 +96,7 @@ async def stream_token(self, content):
         MockMessage: The updated message.
         """
         print(f"Streaming token: Author: {self.author}, Content: {content}")
+        self.content += content
         return self
 
     async def update(self):
@@ -222,7 +223,7 @@ def run_sync(func, *args, **kwargs):
     return func(*args, **kwargs)
 
 
-def test():
+async def test_using_app_code():
     """
     This function is used to test the functionality of the app module.
     It sets up a mock chainlit class, imports the app module, and overrides certain methods and event handlers.
@@ -238,15 +239,11 @@ def test():
     # Override run_sync method to use mock cl
     app.run_sync = run_sync
 
-    # Override OpenAI event handler to use mock cl
-    event_handler = app.get_event_handler(cl_mock, "assistant")
-    cl_mock.user_session.set("event_handler", event_handler)
-
     # Patch 'cl' with our Mock class
     app.cl = cl_mock
 
     # Call app start to set up variables
-    asyncio.run(app.start_chat())
+    await app.start_chat()
 
     # Here insert history to thread
     # thread_id = app.cl.user_session.get("thread_id")
     # wait app.add_message_to_thread(thread_id, "user", message.content, message)
 
     # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[])
     msg = cl_mock.Message(author="You", content="Hi", elements=[])
-    asyncio.run(app.main(msg))
+    await app.process_message(msg)
+
+    # Get last message
+    # messages = await app.async_openai_client.beta.threads.messages.list(thread_id)
+    # print(messages)
+    # print(messages.data[0].content[0].text.value)
+
+    print("Closing OpenAI thread")
+    # app.sync_openai_client.beta.threads.delete(thread_id)
+
+
+def test_using_local_code():
+    """
+    This function is used to test the functionality of the app module.
+    It sets up a mock chainlit class, imports the app module, and overrides certain methods and event handlers.
+    Then it calls the app start and main functions to simulate a chat interaction.
+    """
+
+    # Create a Mock chainlit class
+    cl_mock = setup_mock_class()
+
+    # Import chainlit app
+    import app as app
+
+    # Override run_sync method to use mock cl
+    app.run_sync = run_sync
+
+    # Patch 'cl' with our Mock class
+    app.cl = cl_mock
+
+    # Call app start to set up variables
+    thread = asyncio.run(app.async_openai_client.beta.threads.create())
+
+    # Here insert history to thread
+    # thread_id = app.cl.user_session.get("thread_id")
+    # wait app.add_message_to_thread(thread_id, "user", message.content, message)
+
+    # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[])
+    message = cl_mock.Message(author="You", content="Hi", elements=[])
+
+    print(f"Adding user message {message.content} to thread {thread.id}")
+    app.sync_openai_client.beta.threads.messages.create(
+        thread_id=thread.id,
+        role="user",
+        content=message.content,
+    )
+
+    event_handler = app.get_event_handler(cl_mock, "Assistant")
+    with app.sync_openai_client.beta.threads.runs.stream(
+        thread_id=thread.id,
+        assistant_id=app.assistant.id,
+        event_handler=event_handler,
+    ) as stream:
+        stream.until_done()
+
+    # Get last thread message
+    messages = app.sync_openai_client.beta.threads.messages.list(thread.id)
+    print(messages.data[0].content[0].text.value)
+
+    print("Closing OpenAI thread")
+    app.sync_openai_client.beta.threads.delete(thread.id)
+
+    return
+
+    # First check memories
 
 
 if __name__ == "__main__":
-    test()
+    # This works, but doesn't exit due to async stuff, likely in chainlit.
+    # asyncio.run(test_using_app_code())
+
+    test_using_local_code()
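
A note on the pattern this patch relies on: the self-test drives the real chainlit handlers by swapping the `chainlit` module for a mock before the app code runs. Below is a minimal, self-contained sketch of that substitution idea; `MockMessage`, `handler`, and `mock_cl` are illustrative stand-ins, not classes or functions from this repository.

    # Sketch: exercise an async, UI-coupled handler from a plain script by
    # injecting a lightweight stand-in for the UI module.
    import asyncio
    import types


    class MockMessage:
        """Collects streamed tokens instead of rendering them in a UI."""

        def __init__(self, content=""):
            self.content = content

        async def send(self):
            return self  # a real chainlit Message would render here

        async def stream_token(self, token):
            self.content += token
            return self


    async def handler(cl):
        # Stands in for app code that would normally talk to the chainlit UI
        msg = cl.Message(content="")
        await msg.send()
        await msg.stream_token("Hello from the mocked UI")
        return msg.content


    mock_cl = types.SimpleNamespace(Message=MockMessage)
    print(asyncio.run(handler(mock_cl)))  # -> Hello from the mocked UI
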
+ """ + + # Create a Mock chainlit class + cl_mock = setup_mock_class() + + # Import chainlit app + import app as app + + # Override run_sync method to use mock cl + app.run_sync = run_sync + + # Patch 'cl' with our Mock class + app.cl = cl_mock + + # Call app start to set up variables + thread = asyncio.run(app.async_openai_client.beta.threads.create()) + + # Here insert history to thread + # thread_id = app.cl.user_session.get("thread_id") + # wait app.add_message_to_thread(thread_id, "user", message.content, message) + + # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[]) + message = cl_mock.Message(author="You", content="Hi", elements=[]) + + print(f"Adding user message {message.content} to thread {thread.id}") + app.sync_openai_client.beta.threads.messages.create( + thread_id=thread.id, + role="user", + content=message.content, + ) + + event_handler = app.get_event_handler(cl_mock, "Assistant") + with app.sync_openai_client.beta.threads.runs.stream( + thread_id=thread.id, + assistant_id=app.assistant.id, + event_handler=event_handler, + ) as stream: + stream.until_done() + + # Get last thread message + messages = app.sync_openai_client.beta.threads.messages.list(thread.id) + print(messages.data[0].content[0].text.value) + + print("Closing OpenAI thread") + app.sync_openai_client.beta.threads.delete(thread.id) + + return + + # First check memories if __name__ == "__main__": - test() + # This works, but doesn't exist due to async stuff, likely in chainlit. + # asyncio.run(test_using_app_code()) + + test_using_local_code() From 5f82705a6d6b014fe292bc9265cd230717dad1ca Mon Sep 17 00:00:00 2001 From: Matthew Harris Date: Thu, 27 Jun 2024 18:17:32 -0400 Subject: [PATCH 09/20] Promptflow works partially --- .../chainlit-ui-evaluation/call_assistant.py | 89 +++++-------------- flows/chainlit-ui-evaluation/flow.dag.yaml | 5 +- 2 files changed, 23 insertions(+), 71 deletions(-) diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py index 4929c8d5..d9323bf2 100644 --- a/flows/chainlit-ui-evaluation/call_assistant.py +++ b/flows/chainlit-ui-evaluation/call_assistant.py @@ -10,7 +10,8 @@ @tool -async def call_assistant(chat_history: list) -> dict: +# async def call_assistant(chat_history: list) -> dict: +def call_assistant(query: str) -> dict: """ Calls the assistant API with the given input and retrieves the response. @@ -34,9 +35,13 @@ async def call_assistant(chat_history: list) -> dict: dict: A dictionary containing the response from the assistant, function name, function arguments, function output, and the number of tokens in the function output. """ - # print("Initializing chainlit thread ...") - # thread = await start_chat() - # print(thread) + message = "Hi" + + result = test_using_app_code(message) + + response = {"response": result} + + return response def setup_mock_class(): @@ -223,7 +228,7 @@ def run_sync(func, *args, **kwargs): return func(*args, **kwargs) -async def test_using_app_code(): +def test_using_app_code(msg): """ This function is used to test the functionality of the app module. It sets up a mock chainlit class, imports the app module, and overrides certain methods and event handlers. 
@@ -243,83 +248,29 @@ async def test_using_app_code(): app.cl = cl_mock # Call app start to set up variables - await app.start_chat() + asyncio.run(app.start_chat()) # Here insert history to thread - # thread_id = app.cl.user_session.get("thread_id") + thread_id = app.cl.user_session.get("thread_id") # wait app.add_message_to_thread(thread_id, "user", message.content, message) # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[]) - msg = cl_mock.Message(author="You", content="Hi", elements=[]) - await app.process_message(msg) + msg = cl_mock.Message(author="You", content=msg, elements=[]) + asyncio.run(app.process_message(msg)) # Get last message - # messages = await app.async_openai_client.beta.threads.messages.list(thread_id) - # print(messages) - # print(messages.data[0].content[0].text.value) - - print("Closing OpenAI thread") - # app.sync_openai_client.beta.threads.delete(thread_id) - - -def test_using_local_code(): - """ - This function is used to test the functionality of the app module. - It sets up a mock chainlit class, imports the app module, and overrides certain methods and event handlers. - Then it calls the app start and main functions to simulate a chat interaction. - """ - - # Create a Mock chainlit class - cl_mock = setup_mock_class() - - # Import chainlit app - import app as app - - # Override run_sync method to use mock cl - app.run_sync = run_sync - - # Patch 'cl' with our Mock class - app.cl = cl_mock - - # Call app start to set up variables - thread = asyncio.run(app.async_openai_client.beta.threads.create()) - - # Here insert history to thread - # thread_id = app.cl.user_session.get("thread_id") - # wait app.add_message_to_thread(thread_id, "user", message.content, message) - - # msg = cl_mock.Message(author="You", content="What is the total population of Mali", elements=[]) - message = cl_mock.Message(author="You", content="Hi", elements=[]) - - print(f"Adding user message {message.content} to thread {thread.id}") - app.sync_openai_client.beta.threads.messages.create( - thread_id=thread.id, - role="user", - content=message.content, - ) - - event_handler = app.get_event_handler(cl_mock, "Assistant") - with app.sync_openai_client.beta.threads.runs.stream( - thread_id=thread.id, - assistant_id=app.assistant.id, - event_handler=event_handler, - ) as stream: - stream.until_done() - - # Get last thread message - messages = app.sync_openai_client.beta.threads.messages.list(thread.id) + messages = app.sync_openai_client.beta.threads.messages.list(thread_id) + result = messages.data[0].content[0].text.value print(messages.data[0].content[0].text.value) print("Closing OpenAI thread") - app.sync_openai_client.beta.threads.delete(thread.id) + app.sync_openai_client.beta.threads.delete(thread_id) - return + print("All done") - # First check memories + return result if __name__ == "__main__": # This works, but doesn't exist due to async stuff, likely in chainlit. - # asyncio.run(test_using_app_code()) - - test_using_local_code() + test_using_app_code("Hi") diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml index c9709b72..1c2ee2e1 100644 --- a/flows/chainlit-ui-evaluation/flow.dag.yaml +++ b/flows/chainlit-ui-evaluation/flow.dag.yaml @@ -4,7 +4,7 @@ environment: inputs: query: type: string - default: what's the total population of mali? 
+    default: Hi
   context:
     type: string
     default: The total population of Mali is 17,839,995
@@ -33,7 +33,8 @@ nodes:
     source:
       type: code
       path: call_assistant.py
-    inputs: {}
+    inputs:
+      query: ${inputs.query}
 - name: groundedness_score
   type: llm
   source:
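
For context on the change above: a Promptflow Python node is just a decorated function, and the `inputs:` mapping in `flow.dag.yaml` wires flow inputs onto its parameters. A minimal sketch of a tool with the same shape as `call_assistant` follows; the body is a placeholder, not the repository's logic.

    from promptflow.core import tool


    @tool
    def call_assistant_sketch(query: str) -> dict:
        # ${inputs.query} in flow.dag.yaml is passed in as `query`; keys of the
        # returned dict are addressable in flow outputs, e.g.
        # ${call_assistant_sketch.output.response}
        return {"response": f"echo: {query}"}
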
From 177d3504ac21fb399685db56180810072798366f Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 11:19:30 -0400
Subject: [PATCH 10/20] Implemented workaround for async hanging thread, to
 call script and kill after execution. Obviously, this is a very hacky
 workaround to be able to use the exact chainlit code for e2e tests, and we
 may not use this (we're implementing unit tests for recipes server and
 assistant independently), but will finish implementation as it works and is
 nearly done

---
 .../chainlit-ui-evaluation/call_assistant.py  | 182 ++++++++++++++----
 flows/chainlit-ui-evaluation/flow.dag.yaml    |   8 +-
 2 files changed, 149 insertions(+), 41 deletions(-)

diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py
index d9323bf2..dffdba28 100644
--- a/flows/chainlit-ui-evaluation/call_assistant.py
+++ b/flows/chainlit-ui-evaluation/call_assistant.py
@@ -1,13 +1,18 @@
+import argparse
 import asyncio
 import inspect
-import sys
+import json
+import os
+import signal
+import subprocess
 import threading
-from contextlib import asynccontextmanager
-from contextvars import ContextVar
+import time
 
-import chainlit as cl
 from promptflow.core import tool
 
+FINISH_PHRASE = "all done"
+OUTPUT_TAG = "ASSISTANT_OUTPUT"
+
 
 @tool
 # async def call_assistant(chat_history: list) -> dict:
 def call_assistant(query: str) -> dict:
     """
     Calls the assistant API with the given input and retrieves the response.
 
@@ -21,13 +26,15 @@ def call_assistant(query: str) -> dict:
     [
         {
             "author": "user",
-            "content": "Hi",
-            "elements": []
+            "content": "Hi"
         },
         {
             "author": "assistant",
             "content": "Hello! How can I help you today?",
-            "elements": []
+        },
+        {
+            "author": "assistant",
+            "content": "What's the total population of Mali?",
         }
     ]
 
@@ -35,9 +42,23 @@ def call_assistant(query: str) -> dict:
         dict: A dictionary containing the response from the assistant, function name,
         function arguments, function output, and the number of tokens in the function output.
     """
 
-    message = "Hi"
+    chat_history = [
+        {"author": "user", "content": "Hi"},
+        {
+            "author": "assistant",
+            "content": "Hello! How can I help you today?",
+        },
+        {
+            "author": "assistant",
+            "content": "Hi again!",
+        },
+    ]
 
-    result = test_using_app_code(message)
+    chat_history = json.dumps(chat_history)
+    chat_history = chat_history.replace('"', '\\"')
+    chat_history = chat_history.replace("'", "\\'")
+
+    result = run_chainlit_mock(chat_history)
 
     response = {"response": result}
 
     return response
 
 
 def setup_mock_class():
         def __init__(self):
             self.__name__ = "chainlit"
             self.step = None
 
+        def Text(self, name, content, display):
+            """
+            Creates a text element.
+
+            Args:
+                name (str): The element name.
+                content (str): The text content.
+                display: How the element should be displayed.
+
+            Returns:
+                dict: A dictionary containing the text element.
+            """
+            return {"type": "Text", "text": content}
+
     cl_mock = MockChainlit()
     return cl_mock
 
 
 def start_loop(loop):
     return None
 
 
+def run_chainlit_mock(chat_history: str) -> str:
+    """
+    This function is used to run the chainlit script and monitor its output.
+    TODO: it is a temporary workaround because running the exact chainlit code
+    does not exit all asynchronous threads and hangs. It should be replaced by
+    breaking e2e testing into the data recipes API and the assistant; testing
+    both independently is far less complicated.
+
+    Args:
+        chat_history (str): A string containing the chat history.
+
+    Returns:
+        result (str): The result of the chainlit script running with the input history
+
+    """
+
+    all_output = ""
+    result = ""
+    print("Monitoring chainlit output")
+    process = subprocess.Popen(
+        ["python3", "call_assistant.py", "--chat_history", chat_history],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    print(process)
+    while True:
+        output = process.stdout.readline()
+        print(output)
+        if output == b"" and process.poll() is not None:
+            break
+        if output:
+            all_output += output.decode("utf-8")
+            print(output.strip())
+        if FINISH_PHRASE in str(output).lower():
+            print(FINISH_PHRASE)
+            print("Killing process")
+            os.kill(process.pid, signal.SIGKILL)
+            print(OUTPUT_TAG)
+            if OUTPUT_TAG in all_output:
+                result = all_output.split(OUTPUT_TAG)[1].strip()
+                print("Result:", result)
+            else:
+                result = "Unparsable output"
+            break
+        time.sleep(0.1)
+    return result
+
 
 def run_sync(func, *args, **kwargs):
     """
     Run a function synchronously or asynchronously depending on its type.
 
     return func(*args, **kwargs)
 
 
-def test_using_app_code(msg):
-    """
-    This function is used to test the functionality of the app module.
-    It sets up a mock chainlit class, imports the app module, and overrides certain methods and event handlers.
-    Then it calls the app start and main functions to simulate a chat interaction.
-    """
+async def test_using_app_code_async(msg, timeout=5):
 
-    # Create a Mock chainlit class
     cl_mock = setup_mock_class()
-
-    # Import chainlit app
     import app as app
 
-    # Override run_sync method to use mock cl
     app.run_sync = run_sync
-
-    # Patch 'cl' with our Mock class
     app.cl = cl_mock
 
-    # Call app start to set up variables
-    asyncio.run(app.start_chat())
+    await app.start_chat()
 
-    # Here insert history to thread
     thread_id = app.cl.user_session.get("thread_id")
 
-    msg = cl_mock.Message(author="You", content=msg, elements=[])
-    asyncio.run(app.process_message(msg))
+    msg = cl_mock.Message(author="You", content=msg, elements=[])
+    await app.process_message(msg)
 
     messages = app.sync_openai_client.beta.threads.messages.list(thread_id)
     result = messages.data[0].content[0].text.value
-    print(messages.data[0].content[0].text.value)
 
-    print("Closing OpenAI thread")
-    app.sync_openai_client.beta.threads.delete(thread_id)
-
-    print("All done")
-
     return result
 
 
+def test_using_app_code(msg):
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    result = loop.run_until_complete(test_using_app_code_async(msg))
+    loop.close()
+    return result
+
+
+def main_direct_function():
+    """
+    TODO
+    Tests the direct function call, which hangs after finishing because of
+    some issue with async. Left here for future reference for somebody to fix so
+    the script execution and kill hack can be retired.
+
+    """
+    # chat_history = '[{\"author\": \"user\",\"content\": \"Hi\"},{\"author\":\"assistant\content\": \"Hello! How can I help you today?\"},{\"author\": \"assistant\",\"content\": \"What is the total population of Mali?\"}]'
+    chat_history = '[{"author": "user","content": "Hi"}'
+
+    result = test_using_app_code(chat_history)
+    print("OUTPUT")
+    print(result)
+    print("OUTPUT")
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description="Call the assistant with a supplied chat history (used for end-to-end evaluation)."
+    )
+
+    parser.add_argument(
+        "--chat_history",
+        type=str,
+        required=True,
+        help="""
+            A list containing the chat history, of the format (but in one line) ...
+
+            '[{\"author\": \"user\",\"content\": \"Hi\"},{\"author\":\"assistant\",\"content\": \"Hello! How can I help you today?\"},{\"author\": \"assistant\",\"content\": \"What is the total population of Mali?\"}]'
+        """,
+    )
+
+    args = parser.parse_args()
+    chat_history = args.chat_history
+
+    if chat_history:
+        result = test_using_app_code(chat_history)
+        print(OUTPUT_TAG)
+        print(result)
+        print(OUTPUT_TAG)
+
+    # Do not remove this line
+    print(FINISH_PHRASE)
+
 
 if __name__ == "__main__":
-    # This works, but doesn't exit due to async stuff, likely in chainlit.
-    test_using_app_code("Hi")
+    # main_direct_function()
+    main()
diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml
index 1c2ee2e1..a256ae0c 100644
--- a/flows/chainlit-ui-evaluation/flow.dag.yaml
+++ b/flows/chainlit-ui-evaluation/flow.dag.yaml
@@ -9,7 +9,7 @@ inputs:
     type: string
     default: The total population of Mali is 17,839,995
 outputs:
-  agent_putput:
+  agent_output:
     type: string
     reference: ${call_assistant.output.response}
   groundedness_score:
@@ -21,12 +21,6 @@ outputs:
   query:
     type: string
     reference: ${inputs.query}
-  function_name:
-    type: string
-    reference: ${call_assistant.output.function_name}
-  function_args:
-    type: string
-    reference: ${call_assistant.output.function_args}
 nodes:
   - name: call_assistant
     type: python
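
The commit message above is candid that this is a hack: because the chainlit event loop never exits, the evaluation runs the assistant in a child process, scans its stdout for a sentinel phrase, and then kills the process. A stripped-down sketch of that monitor-and-kill loop, with the command and sentinel as placeholders rather than the repository's values:

    import os
    import signal
    import subprocess
    import time

    SENTINEL = "all done"


    def run_and_reap(cmd):
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        captured = ""
        while True:
            line = proc.stdout.readline()
            if line == b"" and proc.poll() is not None:
                break  # the child exited on its own
            captured += line.decode("utf-8")
            if SENTINEL in line.decode("utf-8").lower():
                # the child would otherwise hang forever on its event loop
                os.kill(proc.pid, signal.SIGKILL)
                break
            time.sleep(0.1)
        return captured
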
From c423f1444323af0358253c4bef9a0b325c71e475 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 11:21:42 -0400
Subject: [PATCH 11/20] Implemented workaround for async hanging thread, to
 call script and kill after execution. Obviously, this is a very hacky
 workaround to be able to use the exact chainlit code for e2e tests, and we
 may not use this (we're implementing unit tests for recipes server and
 assistant independently), but will finish implementation as it works and is
 nearly done

---
 ui/chat-chainlit-assistant/app.py | 32 +++++++++++--------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py
index d407a110..8a217d5c 100644
--- a/ui/chat-chainlit-assistant/app.py
+++ b/ui/chat-chainlit-assistant/app.py
@@ -128,6 +128,8 @@ def on_event(self, event):
         self.handle_requires_action(event.data, run_id)
     elif event.event == "thread.message.delta":
         self.handle_message_delta(event.data)
+    elif event.event == "thread.run.completed":
+        print("Run completed")
     else:
         print(json.dumps(str(event.data), indent=4))
         print(f"Unhandled event: {event.event}")
@@ -613,10 +615,6 @@ async def on_audio_end(elements: list[Element]):
         # elements=[input_audio_el, *elements],
     ).send()
 
-    msg = cl.Message(author="You", content=transcription, elements=elements)
-
-    await main(message=msg)
-
 
 @cl.password_auth_callback
 def auth_callback(username: str, password: str):
@@ -654,31 +652,23 @@ async def add_message_to_thread(thread_id, role, content, message=None):
 
     print(f"Content: {content}")
 
+    attachments = []
+
     # Azure doesn't yet support attachments
     if os.getenv("ASSISTANTS_API_TYPE") == "openai":
-
         if message is not None:
             attachments = await process_files(message.elements)
 
-        # Add a Message to the Thread
-        await async_openai_client.beta.threads.messages.create(
-            thread_id=thread_id,
-            role=role,
-            content=content,
-            attachments=attachments,
-        )
-    else:
-
-        # Add a Message to the Thread
-        await async_openai_client.beta.threads.messages.create(
-            thread_id=thread_id,
-            role=role,
-            content=content,
-        )
+    await async_openai_client.beta.threads.messages.create(
+        thread_id=thread_id,
+        role=role,
+        content=content,
+        attachments=attachments,
+    )
 
 
 @cl.on_message
-async def main(message: cl.Message):
+async def process_message(message: cl.Message):
     """
     Process the user's message and interact with the assistant.

From 4c2a457bc6aa1ccd5d01a5d32b9ae18338d1fd1d Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 11:33:51 -0400
Subject: [PATCH 12/20] Implemented workaround for async hanging thread, to
 call script and kill after execution. Obviously, this is a very hacky
 workaround to be able to use the exact chainlit code for e2e tests, and we
 may not use this (we're implementing unit tests for recipes server and
 assistant independently), but will finish implementation as it works and is
 nearly done

---
 .../chainlit-ui-evaluation/call_assistant.py  | 33 ++++++++++++-------
 flows/chainlit-ui-evaluation/flow.dag.yaml    |  8 +++++
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py
index dffdba28..4664e5ae 100644
--- a/flows/chainlit-ui-evaluation/call_assistant.py
+++ b/flows/chainlit-ui-evaluation/call_assistant.py
@@ -16,11 +16,12 @@
 
 @tool
 # async def call_assistant(chat_history: list) -> dict:
-def call_assistant(query: str) -> dict:
+def call_assistant(query: str, chat_history: str) -> dict:
     """
     Calls the assistant API with the given input and retrieves the response.
 
     Args:
+        query: What the user asked
         chat_history (list): A list containing the chat history, of the format ...
 
     [
@@ -43,17 +44,25 @@ def call_assistant(query: str) -> dict:
         function output, and the number of tokens in the function output.
     """
 
-    chat_history = [
-        {"author": "user", "content": "Hi"},
-        {
-            "author": "assistant",
-            "content": "Hello! How can I help you today?",
-        },
-        {
-            "author": "assistant",
-            "content": "Hi again!",
-        },
-    ]
+    print(chat_history)
+
+    chat_history = json.loads(chat_history)
+
+    # Add user query to chat history
+    chat_history.append({"author": "user", "content": query})
+
+    # chat_history = [
+    #     {"author": "user", "content": "Hi"},
+    #     {
+    #         "author": "assistant",
+    #         "content": "Hello! How can I help you today?",
+    #     },
+    #     {
+    #         "author": "assistant",
+    #         "content": "Hi again!",
+    #     },
+    # ]
+
     chat_history = json.dumps(chat_history)
     chat_history = chat_history.replace('"', '\\"')
     chat_history = chat_history.replace("'", "\\'")
diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml
index a256ae0c..127b541a 100644
--- a/flows/chainlit-ui-evaluation/flow.dag.yaml
+++ b/flows/chainlit-ui-evaluation/flow.dag.yaml
@@ -8,6 +8,13 @@ inputs:
   context:
     type: string
     default: The total population of Mali is 17,839,995
+  chat_history:
+    type: string
+    default: "[{\"author\": \"user\",\"content\":
+      \"Hi\"},{\"author\":\"assistant\",\"content\": \"Hello!
+      How can I help you today?\"},{\"author\":
+      \"assistant\",\"content\": \"What is the total population of
+      Mali?\"}]"
 outputs:
   agent_output:
     type: string
     reference: ${call_assistant.output.response}
   groundedness_score:
@@ -29,6 +36,7 @@ nodes:
     source:
       type: code
       path: call_assistant.py
     inputs:
       query: ${inputs.query}
+      chat_history: ${inputs.chat_history}
 - name: groundedness_score
   type: llm
   source:

From ac9981570335315b54fa33b7ce790a6239ae4617 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 11:37:09 -0400
Subject: [PATCH 13/20] Implemented workaround for async hanging thread, to
 call script and kill after execution. Obviously, this is a very hacky
 workaround to be able to use the exact chainlit code for e2e tests, and we
 may not use this (we're implementing unit tests for recipes server and
 assistant independently), but will finish implementation as it works and is
 nearly done

---
 flows/chainlit-ui-evaluation/flow.dag.yaml | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml
index 127b541a..8a02b7b8 100644
--- a/flows/chainlit-ui-evaluation/flow.dag.yaml
+++ b/flows/chainlit-ui-evaluation/flow.dag.yaml
@@ -7,14 +7,20 @@ inputs:
     default: Hi
   context:
     type: string
-    default: The total population of Mali is 17,839,995
+    default: '"The answer is:\n\n \n The answer is:
+      **17,907,114.0**\n\n Metadata for the
+      answer:\n {\"params\": {\"country_code\": \"MLI\"},
+      \"attribution\":
+      \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c\",
+      \"data_url\":
+      \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c/resource/6f243ba2-4d4a-4663-a7c4-e917dbbde73a/download/mli_pop_adm0_v2.csv\",
+      \"time_period\": {\"start\": \"2018-01-01\", \"end\":
+      \"2018-12-31T23:59:59\"}}"'
   chat_history:
     type: string
-    default: "[{\"author\": \"user\",\"content\":
-      \"Hi\"},{\"author\":\"assistant\",\"content\": \"Hello!
-      How can I help you today?\"},{\"author\":
-      \"assistant\",\"content\": \"What is the total population of
-      Mali?\"}]"
+    default: '[{"author": "user","content": "Hi"},{"author":"assistant","content":
+      "Hello! How can I help you today?"},{"author": "assistant","content":
+      "What is the total population of Mali?"}]'
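
Worth noting about the defaults above: `chat_history` travels through the flow as a JSON *string*, which `call_assistant` parses, extends with the current query, and re-escapes before handing it to the child process on the command line. A minimal round trip of that format, with field names following the flow defaults:

    import json

    chat_history = '[{"author": "user","content": "Hi"}]'
    history = json.loads(chat_history)  # -> list of {author, content} turns
    history.append({"author": "user", "content": "What is the total population of Mali?"})
    escaped = json.dumps(history).replace('"', '\\"')  # command-line-argument form
    print(escaped)
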
From 6fdb85f728d6b3a96896b4d9bb2f4ce393b4dbd7 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 11:45:55 -0400
Subject: [PATCH 14/20] Added Promptflow to docker build as a dev option,
 i.e., not part of prod

---
 README.md              |  7 +++++++
 docker-compose-dev.yml | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 docker-compose-dev.yml

diff --git a/README.md b/README.md
index 9b708359..485f2d86 100644
--- a/README.md
+++ b/README.md
@@ -201,6 +201,12 @@ To activate:
 
 # Evaluation with Prompt Flow
 
+First, you will need to build the environment to include Prompt Flow ...
+
+`docker compose -f docker-compose.yml -f docker-compose-dev.yml up -d --build`
+
+Then ...
+
 1. Install the DevContainers VSCode extension
 2. Build data recipes using the `docker compose` command mentioned above
 3. Open the command palette in VSCode (CMD + Shift + P on Mac; CTRL + Shift + P on Windows) and select
    `Dev Containers: Attach to remote container`.
 
    Select the promptflow container. This opens a new VSCode window - use it for the next steps.
 4. Open folder `/app`
 5. Click on `flow.dag.yaml`
 6. Top left of main pane, click on 'Visual editor'
+7. You can now run by clicking the play icon. See Promptflow documentation for more details
 
 # Deployment
 
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
new file mode 100644
index 00000000..804eb784
--- /dev/null
+++ b/docker-compose-dev.yml
@@ -0,0 +1,18 @@
+#version: "3.4"
+
+services:
+  promptflow:
+    image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest
+    container_name: recipes-ai-promptflow
+    env_file:
+      - .env
+    volumes:
+      - ./flows:/app
+      - ./utils:/app/chainlit-ui-evaluation/utils
+      - ./templates:/app/chainlit-ui-evaluation/templates
+      - shared-data:/app/chainlit-ui-evaluation/recipes/public
+      - ./management/skills.py:/app/chainlit-ui-evaluation/recipes/skills.py
+      - ./ui/chat-chainlit-assistant/app.py:/app/chainlit-ui-evaluation/app.py
+volumes:
+  pgdata2:
+  shared-data:
\ No newline at end of file
From 248ec0f436e14d918ae8c2d1f8d12339b692e25b Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 13:36:21 -0400
Subject: [PATCH 15/20] Adjusted AI judge prompt as part of creating unit
 tests. We will refine systematically as testing infra gets added

---
 ...generate_intent_from_history_prompt.jinja2 | 71 ++++++++++++++++++-
 1 file changed, 69 insertions(+), 2 deletions(-)

diff --git a/templates/generate_intent_from_history_prompt.jinja2 b/templates/generate_intent_from_history_prompt.jinja2
index dd88e2f4..39fdc0fd 100644
--- a/templates/generate_intent_from_history_prompt.jinja2
+++ b/templates/generate_intent_from_history_prompt.jinja2
@@ -36,8 +36,75 @@ user_preferences: Any user-specific preferences for the analysis or output (e.g.
 
 Here are some examples of what your output should look like:
 
-plot a bar chart of population by state in Nigeria for 2023 using HDX(HAPI) data, highlighting top 5 states as an image
-generate a line chart of average commodity prices by year for 2022-2023 for Haiti
+======== EXAMPLE 1
+
+The user asked this question:
+
+What is the total population of Mali?
+
+Here is the chat history:
+
+[{"author": "user","content": "Hi"},{"author":"assistant","content": "Hello! How can I help you today?"},{"author": "user","content": "What is the total population of Mali"}]
+
+Output:
+
+{
+    "intent": "provide the total population of Mali",
+    "reason": "The user's last input clearly asked for the total population of Mali. There is no need for further clarification."
+}
+
+======== EXAMPLE 2
+
+The user asked this question:
+
+Can you give me a line chart of average commodity prices by year for 2022-2023 for Haiti?
+
+Here is the chat history:
+
+[{"author": "user","content": "Can you give me a line chart of average commodity prices by year for 2022-2023 for Haiti?"}]
+
+Output:
+
+{
+    "intent": "generate a line chart of average commodity prices by year for 2022-2023 for Haiti",
+    "reason": "The user's last request specifies the action (generate), visualization type (line chart), data type (average commodity prices), time range (2022-2023), and location (Haiti). There is no need for further clarification."
+}
+
+======== EXAMPLE 3
+
+The user asked this question:
+
+Plot a bar chart for the population of Nigeria by admin 1, and highlight the top 5 states please
+
+Here is the chat history:
+
+[{"author": "user","content": "Do you have data on conflicts in Ethiopia"}, {"author": "user","content": "Plot a bar chart for the population of Nigeria by admin 1, and highlight the top 5 states please"}]
+
+Output:
+
+{
+    "intent": "plot a bar chart of population by state in Nigeria for 2023 using HDX(HAPI) data, highlighting top 5 states as an image",
+    "reason": "The user's last request specifies the action (plot), visualization type (bar chart), disaggregation (by state), data source (HDX(HAPI) data), location (Nigeria), and output format (image). The user also requested to highlight the top 5 states. There is no need for further clarification."
+}
+
+======== EXAMPLE 4
+
+The user asked this question:
+
+Plot that by state on a map
+
+Here is the chat history:
+
+[{"author": "user","content": "What's the total population of Nigeria?"}, {"author": "user","content": "Plot that by state on a map"}]
+
+Output:
+
+{
+    "intent": "plot a map of the total population of Nigeria by state as an image",
+    "reason": "The user's last request was a follow-on from a previous request and specifies the action (plot), visualization type (map), disaggregation (by state), data type (total population), location (Nigeria), and output format (image). There is no need for further clarification."
+}
+
+=======================================================================
+
 
 Task:
 
 Return the user's intent
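
The examples added above all train the judge to reply with a two-key JSON object. Downstream code can rely on that contract with a small validation step; a sketch follows, where the `raw` string is a made-up model response rather than actual output.

    import json

    raw = '{"intent": "provide the total population of Mali", "reason": "explicit request"}'
    parsed = json.loads(raw)  # raises an error if the model strayed from JSON
    assert set(parsed) == {"intent", "reason"}, "unexpected keys in intent output"
    print(parsed["intent"])
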
From 009de7866ac81fffa322c78edd8aa6d1156087fa Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 17:10:19 -0400
Subject: [PATCH 16/20] Fixed bug to back-populate assistant history when in
 test mode

---
 .../chainlit-ui-evaluation/call_assistant.py  | 28 ++++++++++++++++---
 flows/chainlit-ui-evaluation/flow.dag.yaml    |  4 +--
 ...generate_intent_from_history_prompt.jinja2 | 15 +++++-----
 ui/chat-chainlit-assistant/app.py             |  3 ++
 4 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py
index 4664e5ae..8a5d0d89 100644
--- a/flows/chainlit-ui-evaluation/call_assistant.py
+++ b/flows/chainlit-ui-evaluation/call_assistant.py
@@ -67,6 +67,8 @@ def call_assistant(query: str, chat_history: str) -> dict:
     chat_history = chat_history.replace('"', '\\"')
     chat_history = chat_history.replace("'", "\\'")
 
+    print("History:", chat_history)
+
     result = run_chainlit_mock(chat_history)
 
     response = {"response": result}
@@ -271,6 +273,9 @@ def run_chainlit_mock(chat_history: str) -> str:
         output = process.stdout.readline()
         print(output)
         if output == b"" and process.poll() is not None:
+            print(
+                "Process finished with no output, try running call_assistant by hand to debug."
+            )
             break
         if output:
             all_output += output.decode("utf-8")
@@ -318,7 +323,7 @@ def run_sync(func, *args, **kwargs):
     return func(*args, **kwargs)
 
 
-async def test_using_app_code_async(msg, timeout=5):
+async def test_using_app_code_async(chat_history, timeout=5):
 
     cl_mock = setup_mock_class()
     import app as app
@@ -330,7 +335,22 @@ async def test_using_app_code_async(msg, timeout=5):
 
     thread_id = app.cl.user_session.get("thread_id")
 
-    msg = cl_mock.Message(author="You", content=msg, elements=[])
+    # Here build history
+    chat_history = chat_history.replace("\\", "")
+    print(">>>>>>>> Chat history:", chat_history)
+    history = json.loads(chat_history)
+    last_message = history[-1]
+    app_chat_history = app.cl.user_session.get("chat_history")
+    for message in history:
+        role = message["author"]
+        msg = message["content"]
+        await app.add_message_to_thread(thread_id, role, msg)
+        app_chat_history.append({"role": role, "content": msg})
+    app.cl.user_session.set("chat_history", history)
+
+    print("<<<<<<<< Last message:", last_message)
+
+    msg = cl_mock.Message(author="user", content=last_message["content"], elements=[])
     await app.process_message(msg)
 
     messages = app.sync_openai_client.beta.threads.messages.list(thread_id)
diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml
index 8a02b7b8..03f6bd08 100644
--- a/flows/chainlit-ui-evaluation/flow.dag.yaml
+++ b/flows/chainlit-ui-evaluation/flow.dag.yaml
@@ -18,9 +18,7 @@ inputs:
   chat_history:
     type: string
-    default: '[{"author": "user","content": "Hi"},{"author":"assistant","content":
-      "Hello! How can I help you today?"},{"author": "assistant","content":
-      "What is the total population of Mali?"}]'
+    default: '[{"author": "user","content": "What is the total population of Mali?"}]'
diff --git a/templates/generate_intent_from_history_prompt.jinja2 b/templates/generate_intent_from_history_prompt.jinja2
index 39fdc0fd..1d669f63 100644
--- a/templates/generate_intent_from_history_prompt.jinja2
+++ b/templates/generate_intent_from_history_prompt.jinja2
@@ -10,12 +10,6 @@ Here is the chat history:
 
 {{ chat_history }}
 
-Important:
-
-- Be careful to note the chat history, they may have just asked a follow-up question
-- Put more emphasis on their last input, it has a stronger influence on the intent than earlier inputs in chat_history
-- include all entities such as places
-
 Intent format:
 
 The intent should cature any of these fields that have been specified in the user's request and history:
@@ -44,7 +38,7 @@ What is the total population of Mali?
 
 Here is the chat history:
 
-[{"author": "user","content": "Hi"},{"author":"assistant","content": "Hello! How can I help you today?"},{"author": "user","content": "What is the total population of Mali"}]
+[{"author": "user","content": "Hi"},{"author": "user","content": "What is the total population of Mali"}]
 
 Output:
 
@@ -84,7 +78,7 @@ Output:
 
 {
     "intent": "plot a bar chart of population by state in Nigeria for 2023 using HDX(HAPI) data, highlighting top 5 states as an image",
-    "reason": "The user's last request specifies the action (plot), visualization type (bar chart), disaggregation (by state), data source (HDX(HAPI) data), location (Nigeria), and output format (image). The user also requested to highlight the top 5 states. There is no need for further clarification."
+    "reason": "The user changed context, their last request specifies the action (plot), visualization type (bar chart), disaggregation (by state), data source (HDX(HAPI) data), location (Nigeria), and output format (image). The user also requested to highlight the top 5 states. There is no need for further clarification."
 }
 
 ======== EXAMPLE 4
@@ -106,6 +100,11 @@ Output:
 
 =======================================================================
 
+Important:
+
+- The user's last input is the most important, but pay attention to the chat history to see if they changed context and are asking something entirely new, or are asking a follow-up question
+- include all entities such as places
+
 Task:
 
 Return the user's intent
diff --git a/ui/chat-chainlit-assistant/app.py b/ui/chat-chainlit-assistant/app.py
index 8a217d5c..209b2c40 100644
--- a/ui/chat-chainlit-assistant/app.py
+++ b/ui/chat-chainlit-assistant/app.py
@@ -470,6 +470,9 @@ def check_memories_recipes(user_input: str, history=[]) -> str:
     memory_response = None
     meta_data_msg = ""
 
+    print("History:")
+    print(history)
+
     memory = call_get_memory_recipe_api(
         user_input, history=str(history), generate_intent="true"
     )
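
The core of the fix above is replaying prior turns into the OpenAI thread before the final turn is processed, so the assistant sees the full conversation. The same idea in isolation — here `client` and `thread_id` are assumed to exist, and the message-creation call mirrors the one used in the patch:

    import json


    def replay_history(client, thread_id, chat_history_json):
        history = json.loads(chat_history_json)
        for turn in history[:-1]:
            # authors map directly onto OpenAI thread roles in the patch
            client.beta.threads.messages.create(
                thread_id=thread_id, role=turn["author"], content=turn["content"]
            )
        return history[-1]  # the turn the app will actually answer
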
From f07f853f0a377dd38bdfc469be23e73d8a369f1f Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 17:21:12 -0400
Subject: [PATCH 17/20] Fixed bug to back-populate assistant history when in
 test mode

---
 flows/chainlit-ui-evaluation/call_assistant.py | 2 +-
 flows/chainlit-ui-evaluation/flow.dag.yaml     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/flows/chainlit-ui-evaluation/call_assistant.py b/flows/chainlit-ui-evaluation/call_assistant.py
index 8a5d0d89..156ae81e 100644
--- a/flows/chainlit-ui-evaluation/call_assistant.py
+++ b/flows/chainlit-ui-evaluation/call_assistant.py
@@ -132,7 +132,7 @@ async def stream_token(self, content):
         Returns:
             MockMessage: The updated message.
         """
-        print(f"Streaming token: Author: {self.author}, Content: {content}")
+        # print(f"Streaming token: Author: {self.author}, Content: {content}")
         self.content += content
         return self
diff --git a/flows/chainlit-ui-evaluation/flow.dag.yaml b/flows/chainlit-ui-evaluation/flow.dag.yaml
index 03f6bd08..92107c0b 100644
--- a/flows/chainlit-ui-evaluation/flow.dag.yaml
+++ b/flows/chainlit-ui-evaluation/flow.dag.yaml
@@ -4,7 +4,7 @@ environment:
 inputs:
   query:
     type: string
-    default: Hi
+    default: What's the total population of Mali?
   context:
     type: string
     default: '"The answer is:\n\n \n The answer is:
      **17,907,114.0**\n\n Metadata for the
      answer:\n {\"params\": {\"country_code\": \"MLI\"},
      \"attribution\":
      \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c\",
      \"data_url\":
      \"https://data.humdata.org/dataset/ce21c7db-d8f0-40f8-adc2-452d2d2d105c/resource/6f243ba2-4d4a-4663-a7c4-e917dbbde73a/download/mli_pop_adm0_v2.csv\",
      \"time_period\": {\"start\": \"2018-01-01\", \"end\":
      \"2018-12-31T23:59:59\"}}"'
   chat_history:
     type: string
-    default: '[{"author": "user","content": "What is the total population of Mali?"}]'
+    default: '[{"author": "user","content": "Hi!"}]'

From 5218ed8d74f61b0d75f71065f1e63ae5fca1be54 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 17:35:42 -0400
Subject: [PATCH 18/20] Had to add a Docker build to be able to install and
 mock chainlit

---
 docker-compose-dev.yml                  |  5 ++++-
 docker-compose.yml                      | 12 ------------
 flows/chainlit-ui-evaluation/Dockerfile |  6 ++++++
 3 files changed, 10 insertions(+), 13 deletions(-)
 create mode 100644 flows/chainlit-ui-evaluation/Dockerfile

diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index 804eb784..fabd2495 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -2,7 +2,10 @@
 
 services:
   promptflow:
-    image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest
+    #image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest
+    build:
+      context: .
+      dockerfile: ./flows/chainlit-ui-evaluation/Dockerfile
     container_name: recipes-ai-promptflow
     env_file:
       - .env
diff --git a/docker-compose.yml b/docker-compose.yml
index 0b51038b..f6cb9e43 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -140,18 +140,6 @@ services:
       - ./utils:/app/utils
       - ./templates:/app/templates
       - ./db/recipedb:/app/db
-  promptflow:
-    image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest
-    container_name: recipes-ai-promptflow
-    env_file:
-      - .env
-    volumes:
-      - ./flows:/app
-      - ./utils:/app/chainlit-ui-evaluation/utils
-      - ./templates:/app/chainlit-ui-evaluation/templates
-      - shared-data:/app/chainlit-ui-evaluation/recipes/public
-      - ./management/skills.py:/app/chainlit-ui-evaluation/recipes/skills.py
-      - ./ui/chat-chainlit-assistant/app.py:/app/chainlit-ui-evaluation/app.py
 volumes:
   pgdata2:
   shared-data:
\ No newline at end of file
diff --git a/flows/chainlit-ui-evaluation/Dockerfile b/flows/chainlit-ui-evaluation/Dockerfile
new file mode 100644
index 00000000..42b6b36f
--- /dev/null
+++ b/flows/chainlit-ui-evaluation/Dockerfile
@@ -0,0 +1,6 @@
+FROM mcr.microsoft.com/azureml/promptflow/promptflow-runtime-stable:latest
+
+# No need to copy the app code, we mount via docker-compose-dev.yml
+
+RUN pip3 install --upgrade pip
+RUN pip3 install chainlit==1.1.305
\ No newline at end of file

From 56b3de380492be9d3e6698e783e9244baf0ec4e5 Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 17:48:54 -0400
Subject: [PATCH 19/20] Fixed bug to back-populate assistant history when in
 test mode

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 485f2d86..32e24977 100644
--- a/README.md
+++ b/README.md
@@ -214,10 +214,11 @@ Then ...
    `Dev Containers: Attach to remote container`.
 
    Select the promptflow container. This opens a new VSCode window - use it for the next steps.
-4. Open folder `/app`
-5. Click on `flow.dag.yaml`
-6. Top left of main pane, click on 'Visual editor'
-7. You can now run by clicking the play icon. See Promptflow documentation for more details
+4. Install Promptflow add-in
+5. Open folder `/app`
+6. Click on `flow.dag.yaml`
+7. Top left of main pane, click on 'Visual editor'
+8. You can now run by clicking the play icon. See Promptflow documentation for more details
 
 # Deployment
From 4405eb091971978400e079351c9d29a416775d7d Mon Sep 17 00:00:00 2001
From: Matthew Harris
Date: Fri, 28 Jun 2024 17:51:23 -0400
Subject: [PATCH 20/20] Fixed bug to back-populate assistant history when in
 test mode

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 32e24977..3eb7a4bc 100644
--- a/README.md
+++ b/README.md
@@ -218,7 +218,9 @@ Then ...
 5. Open folder `/app`
 6. Click on `flow.dag.yaml`
 7. Top left of main pane, click on 'Visual editor'
-8. You can now run by clicking the play icon. See Promptflow documentation for more details
+8. On bottom left under connections, configure an Azure OpenAI connection called 'azure_openai'
+9. On the Groundedness node, select your new connection
+10. You can now run by clicking the play icon. See Promptflow documentation for more details
 
 # Deployment