Commit
Merge branch 'main' into feat/model_evaluation
dividor committed Jul 17, 2024
2 parents 6cf4d45 + 6a82f22 commit 755df4c
Showing 8 changed files with 58 additions and 42 deletions.
67 changes: 34 additions & 33 deletions .github/workflows/e2e_tests.yml
@@ -1,12 +1,13 @@
name: End-to-End tests

#on: [push, pull_request]

on: [push]

#on:
# pull_request_target:
# types: [labeled]
on:
push:
branches:
- develop
pull_request:
branches:
- master
- main

jobs:
test:
@@ -134,37 +135,37 @@ jobs:
#- name: DEBUG - Setup upterm session
# uses: lhotari/action-upterm@v1

#- name: DEBUG - Run Selenium outside of promptflow
# run: |
# docker exec promptflow python call_assistant.py

- name: Run tests
- name: DEBUG - Run Selenium outside of promptflow
run: |
env > .env
docker exec promptflow pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run
docker exec promptflow python call_assistant.py
- name: Check logs post-tests
run: |
docker ps
# - name: Run tests
# run: |
# env > .env
# docker exec promptflow pf run create --flow . --data ./data.jsonl --stream --column-mapping query='${data.query}' context='${data.context}' chat_history='${data.chat_history}' --name base_run

echo "logs datadb ..."
docker compose logs datadb
# - name: Check logs post-tests
# run: |
# docker ps

echo "logs promptflow ..."
docker logs promptflow
# echo "logs datadb ..."
# docker compose logs datadb

echo "logs chat ..."
docker compose logs chat
# echo "logs promptflow ..."
# docker logs promptflow

echo "logs server ..."
docker compose logs server
# echo "logs chat ..."
# docker compose logs chat

- name: Show results
run: |
docker exec promptflow pf run show-details -n base_run
echo "Getting metrics ..."
docker exec promptflow pf run show-metrics -n base_run
##docker exec promptflow pf run visualize -n base_run
echo "Checking results ..."
docker exec promptflow python3 check_evaluation_results.py
# echo "logs server ..."
# docker compose logs server

# - name: Show results
# run: |
# docker exec promptflow pf run show-details -n base_run
# echo "Getting metrics ..."
# docker exec promptflow pf run show-metrics -n base_run
# ##docker exec promptflow pf run visualize -n base_run
# echo "Checking results ..."
# docker exec promptflow python3 check_evaluation_results.py
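The disabled "Run tests" step drives a promptflow batch run through `docker exec`. A minimal Python sketch of how that invocation could be assembled programmatically (the container name `promptflow`, run name `base_run`, and column mappings come from the workflow above; the helper itself is an assumption, not part of the repo):

```python
import shlex


def build_pf_run_cmd(container="promptflow", flow=".", data="./data.jsonl",
                     run_name="base_run"):
    """Assemble the docker exec command for a promptflow batch run.

    Mirrors the (currently commented-out) 'Run tests' step: each column in
    data.jsonl is mapped onto a flow input via --column-mapping.
    """
    mapping = {
        "query": "${data.query}",
        "context": "${data.context}",
        "chat_history": "${data.chat_history}",
    }
    cmd = ["docker", "exec", container, "pf", "run", "create",
           "--flow", flow, "--data", data, "--stream", "--column-mapping"]
    cmd += [f"{key}={value}" for key, value in mapping.items()]
    cmd += ["--name", run_name]
    return cmd


# Print a shell-safe version of the command for inspection.
print(shlex.join(build_pf_run_cmd()))
```

Building the argument list in one place keeps the CI step and any local reproduction script from drifting apart.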

10 changes: 10 additions & 0 deletions .github/workflows/get_memory_test.yml
@@ -101,6 +101,16 @@ jobs:
sleep 10
# Debugging GitHub actions interactively, by connecting to the runner ...
# Get ssh connection details for runner.
# See here https://github.com/marketplace/actions/debugging-with-ssh
# Basically, uncomment this, then get connection string in actions output, then connect with
#
# ssh -i <YOUR GITHUB SSH KEY> <CONN STRING ON ACTIONS>
#
#- name: DEBUG - Setup upterm session
# uses: lhotari/action-upterm@v1

- name: Run tests
run: |
echo "exec into container ..."
2 changes: 1 addition & 1 deletion CONTRIBUTION.md
@@ -93,7 +93,7 @@ Then ...

#### Changing between Azure OpenAI <> OpenAI

As noted in the README, the repo supports assistants on OpenAI or Azure OpenAI. The README has instructions on how to change in the `.env` file, but you will also have to change the connection in the promptflow groundedness node accordingly.
As noted in the README, the repo supports assistants on OpenAI or Azure OpenAI. The README has instructions on how to switch between them in the `.env` file; remember to change ASSISTANT_ID as well as the API settings, and you will **also have to change the connection in the promptflow groundedness node accordingly.**
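Because switching providers touches several settings at once, it can help to resolve them in one place. A hedged Python sketch of env-based connection selection (the env var name `ASSISTANTS_API_TYPE` is an illustrative assumption — check the repo's `.env` template for the real key; the two connection names come from `flow.dag.yaml`):

```python
import os


def resolve_connection(env=os.environ):
    """Pick the promptflow connection matching the configured API type.

    NOTE: the env var name here is hypothetical; the connection names
    match those used in flows/chainlit-ui-evaluation/flow.dag.yaml.
    """
    api_type = env.get("ASSISTANTS_API_TYPE", "openai").lower()
    if api_type == "azure":
        return "azure_open_ai_connection"
    return "open_ai_connection"
```

Centralising the choice like this avoids editing `.env` and the groundedness node inconsistently.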

## GitHub Workflow

6 changes: 4 additions & 2 deletions flows/chainlit-ui-evaluation/call_assistant.py
@@ -34,7 +34,9 @@
TMP_IMAGE = "temp.png"
CHAT_URL = os.getenv("CHAT_URL")

IMAGE_SUMMARIZATION_PROMPT = "Summarize the image"
IMAGE_SUMMARIZATION_PROMPT = (
"Summarize the image. Where possible list maximum or minimum values and any trends."
)


def set_chrome_options() -> Options:
@@ -319,7 +321,7 @@ def call_assistant(query, chat_history):
# user_input = chat_history[4]
# print(user_input)

# user_input="Is your data updated in real time?"
# user_input="How many rows does the population table have for Nigeria"
# call_assistant(user_input, "[]")
# sys.exit()

8 changes: 4 additions & 4 deletions flows/chainlit-ui-evaluation/data.jsonl
@@ -1,7 +1,7 @@
{"test_scenario": "Assistant smalltalk, salutation", "query": "Hi", "chat_history": "[]", "context": "['Hello! How can I assist you today?']"}
{"test_scenario": "Text answer from memory", "query": "What is the total population of Mali", "chat_history": "[]", "context": "['The answer is: 17,907,114.0', '\u2705 A human approved this data recipe; Source; Raw data; 2018-01-01 to 2018-12-31']"}
{"test_scenario": "Image answer from memory", "query": "plot a line chart of fatalities by month for Chad using HDX data as an image", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a line graph titled \"Fatalities by Month for Chad,\" depicting the total number of fatalities per month from January 2008 to January 2024. The x-axis represents the months, while the y-axis represents the total fatalities. \\n\\nKey observations:\\n- There are several peaks indicating months with high fatalities.\\n- Notable spikes occur around mid-2008, early 2009, mid-2015, early 2021, and mid-2021.\\n- The highest peak appears to be in early 2021, with fatalities exceeding 500.\\n- There are periods of relatively low fatalities, particularly between 2010 and 2014.\\n- Overall, there is an increasing trend in fatalities over the years, with more frequent spikes in recent years.', '\u2705 A human approved this data recipe; Source; Raw data']"}
{"test_scenario": "Image answer from recipe", "query": "Plot population pyramids for Nigeria", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a population pyramid for Nigeria (NGA). It displays the population distribution by age and gender. The horizontal axis represents the population in millions, with males on the left (in blue) and females on the right (in pink). The vertical axis represents different age ranges, from 0-4 at the bottom to 80+ at the top.\\n\\nKey observations:\\n- The pyramid has a broad base, indicating a large number of young people (0-14 years).\\n- The population decreases steadily with age, showing a typical pyramid shape.\\n- There are more males than females in the younger age groups, but the difference is not very pronounced.\\n- The population significantly tapers off in the older age groups (60+ years).', '\u2705 A human approved this data recipe; Source; Raw data; 2020-01-01 to 2020-12-31']"}
{"test_scenario": "Image answer from memory", "query": "plot a line chart of fatalities by month for Chad using HDX data as an image", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a line graph titled \"Fatalities by Month for Chad,\" depicting the total number of fatalities per month from January 2008 to January 2024. The x-axis represents the months, while the y-axis represents the total fatalities.\\n\\nKey observations:\\n- There are several peaks indicating months with high fatalities.\\n- Notable spikes occur around mid-2008, early 2009, mid-2015, early 2021, and mid-2021.\\n- The highest peak appears to be in early 2021, with fatalities exceeding 500.\\n- There are periods of relatively low fatalities, particularly between 2010 and 2014.\\n- The overall trend shows fluctuations with occasional sharp increases in fatalities.', '\u2705 A human approved this data recipe; Source; Raw data']"}
{"test_scenario": "Image answer from recipe", "query": "Plot population pyramids for Nigeria", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a population pyramid for Nigeria (NGA). It displays the population distribution by age and gender. The horizontal axis represents the population in millions, with males on the left (in blue) and females on the right (in pink). The vertical axis represents different age ranges, starting from 0-4 at the bottom to 80+ at the top.\\n\\nKey observations:\\n- The pyramid has a broad base, indicating a large number of young people (0-14 age range).\\n- The population decreases steadily with age, showing a typical pyramid shape.\\n- There are more males than females in the younger age groups, but the difference is not very pronounced.\\n- The population in older age groups (60 and above) is significantly smaller compared to younger age groups.', '\u2705 A human approved this data recipe; Source; Raw data; 2020-01-01 to 2020-12-31']"}
{"test_scenario": "Assistant on-the-fly SQL, text answer", "query": "How many rows does the population table have for Nigeria", "chat_history": "[]", "context": "['The population table has 43,794 rows for Nigeria.']"}
{"test_scenario": "Assistant created image (simple)", "query": "Plot f{x}=10", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a plot of the function \\\\( f(x) = 10 \\\\). The graph shows a horizontal line at \\\\( f(x) = 10 \\\\) across the range of \\\\( x \\\\) values from -10 to 10. The x-axis is labeled \"x\" and the y-axis is labeled \"f(x)\". There is a legend on the right side of the plot indicating that the orange line represents \\\\( f(x) = 10 \\\\). The plot title is \"Plot of f(x) = 10\".']"}
{"test_scenario": "Assistant answers using vector store", "query": "Is your data updated in real time?", "chat_history": "[]", "context": "['The data is updated in real-time for data sources configured as API data sources. The system calls these APIs on-demand to pull in the latest data from the remote system. However, for data sources where data is ingested, such as HAPI, the update frequency depends on how often the ingestion process is run, which is controlled by the user of the humanitarian AI assistant[0].', '\ud83e\udd16 Caution: LLM Analysis; Sources: [0] HDIP FAQs (External) .pdf']"}
{"test_scenario": "Assistant created image (simple)", "query": "Plot f{x}=10", "chat_history": "[]", "context": "['*AN IMAGE WAS OUTPUT, HERE IS ITS LLM-GENERATED DESCRIPTION* ... The image is a plot of the function \\\\( f(x) = 10 \\\\). The graph is a horizontal line at \\\\( y = 10 \\\\) that extends from \\\\( x = -10 \\\\) to \\\\( x = 10 \\\\). The x-axis ranges from -10 to 10, and the y-axis ranges from 0 to 10. The plot includes a legend indicating that the line represents \\\\( f(x) = 10 \\\\).']"}
{"test_scenario": "Assistant answers using vector store", "query": "Is your data updated in real time?", "chat_history": "[]", "context": "['The data is not updated in real-time. For data sources configured as API data sources, the system will call them on-demand to pull in the latest data from the remote system. However, for data sources where data is ingested, like HAPI, the frequency of updates depends on how often the ingestion process is run, which is controlled by the user of the humanitarian AI assistant[0].', '\ud83e\udd16 Caution: LLM Analysis; Sources: [0] HDIP FAQs (External) .pdf']"}
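Each line of `data.jsonl` above is an independent JSON test case. A small stdlib-only sketch of how the file can be loaded and sanity-checked (the field names are taken from the rows above; the helper itself is illustrative, not part of the repo):

```python
import json

REQUIRED_FIELDS = {"test_scenario", "query", "chat_history", "context"}


def load_test_cases(lines):
    """Parse one JSON test case per line, verifying expected fields exist."""
    cases = []
    for lineno, line in enumerate(lines, 1):
        if not line.strip():  # tolerate blank lines
            continue
        case = json.loads(line)
        missing = REQUIRED_FIELDS - case.keys()
        if missing:
            raise ValueError(f"line {lineno} is missing fields: {sorted(missing)}")
        cases.append(case)
    return cases


# Usage:
#   with open("data.jsonl", encoding="utf-8") as f:
#       cases = load_test_cases(f)
```

Validating the fields up front gives a clearer failure than a KeyError deep inside the evaluation flow.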
2 changes: 1 addition & 1 deletion flows/chainlit-ui-evaluation/flow.dag.yaml
@@ -55,7 +55,7 @@ nodes:
context: ${inputs.context}
temperature: 0
model: gpt-4o
connection: azure_open_ai_connection
connection: open_ai_connection
api: chat
- name: concat_scores
type: python
1 change: 1 addition & 0 deletions templates/groundedness_score.jinja2
@@ -32,6 +32,7 @@ Key points:
- if the CONTEXT is a basic greeting, like "Hello" and the ANSWER is a basic greeting also, groundedness is 5
- Focus on facts, not on the language used to present the facts
- If the ANSWER presents a fact, and the CONTEXT presents the same fact, groundedness is 5
- Your ANSWER must **ONLY** be a number from 1-5

Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context.
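Pinning the reply to a bare 1-5 integer makes the score easy to consume downstream. A hedged sketch of a tolerant parser (illustrative only — not the repo's actual `concat_scores` node):

```python
import re


def parse_groundedness(raw):
    """Extract a 1-5 groundedness score from an LLM reply.

    Accepts a bare integer ("4") and falls back to the first 1-5 digit
    found, so minor deviations like "Score: 4" still parse; a reply with
    no valid score raises.
    """
    match = re.search(r"[1-5]", raw.strip())
    if match is None:
        raise ValueError(f"no 1-5 score in reply: {raw!r}")
    return int(match.group())
```

A strict-format prompt plus a forgiving parser is a common belt-and-braces combination for LLM-graded metrics.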

4 changes: 3 additions & 1 deletion ui/chat-chainlit-assistant/app.py
@@ -201,6 +201,8 @@ def handle_message_completed(self, message):
None
"""

citations = None

# Check for citations
if hasattr(message.content[0], "text"):
message_content = message.content[0].text
@@ -232,7 +234,7 @@ def handle_message_completed(self, message):

word_count = len(self.current_message_text.split())
if word_count > 10:
if citations is not None:
if citations is not None and len(citations) > 0:
citations = "; Sources: " + "; ".join(citations)
else:
citations = ""
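The corrected guard treats an empty citation list the same as `None`. The resulting formatting logic, isolated as a sketch (the helper name is an assumption; the behavior mirrors the diff above):

```python
def format_citations(citations):
    """Return a '; Sources: ...' suffix, or '' when there is nothing to cite.

    Mirrors the corrected guard in handle_message_completed: both None and
    an empty list yield an empty suffix instead of a dangling label.
    """
    if citations:  # falsy covers both None and []
        return "; Sources: " + "; ".join(citations)
    return ""
```

In Python, the truthiness check `if citations:` is equivalent to the diff's explicit `citations is not None and len(citations) > 0` for list inputs.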
