diff --git a/examples/prompt_to_extract_from_image_to_json_customer.ipynb b/examples/prompt_to_extract_from_image_to_json_customer.ipynb new file mode 100644 index 0000000..e0ee53b --- /dev/null +++ b/examples/prompt_to_extract_from_image_to_json_customer.ipynb @@ -0,0 +1,680 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt to Extract Key-values into JSON from CBC Reports (Image) using advanced mode\n", + "\n", + "Below is an example of using OpenParser to extract key-values from a medical CBC report into JSON format. (Note: the model is still in beta and is NOT yet robust enough to generate the same output on every run. Please bear with it!)\n", + "\n", + "### 1. Load the libraries\n", + "\n", + "If you have not installed `open_parser` yet, uncomment the lines below." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install python-dotenv\n", + "# !pip3 install --upgrade open_parser\n", + "# !pip3 install pandas\n", + "# !pip3 install jinja2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "from dotenv import load_dotenv\n", + "from open_parser import OpenParser\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Set up your OpenParser API key\n", + "\n", + "To set up your `CAMBIO_API_KEY` API key, you will:\n", + "\n", + "1. create a `.env` file in your root folder;\n", + "2. add the following line to your `.env` file:\n", + " ```\n", + " CAMBIO_API_KEY=17b************************\n", + " ```\n", + "\n", + "Then run the cell below to load your API key." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Load sample data and Run OpenParser\n", + "\n", + "OpenParser supports both images and PDFs. First, let's load some sample data to test OpenParser's capabilities." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "report_folder = './report_data/'\n", + "report_files = [\n", + " 'CBC_ReportSample_1_Redacted.png',\n", + " 'CBC_ReportSample_12_Redacted.jpeg',\n", + " 'CBC_ReportSample_14_Redacted.jpeg',\n", + " 'CBC_ReportSample_20_Redacted.jpeg',\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll set our `input_keys`, which are the keys that we want to extract from the report. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "input_keys = ['NAME', 'HAEMOGLOBIN', 'RBC (RED CELLS COUNT)', 'HAEMATOCRIT(PCV)','MCV','MCH','MCHC','TOTAL LEUCOCYTE COUNT','DIFFERENTIAL LEUCOCYTE COUNT', 'NEUTROPHILS','LYMPHOCYTES','MONOCYTES', 'EOSINOPHILS', 'BASOPHIL', 'PROMYELOCYTES', 'MYELOCYTES', 'METAMYELOCYTES', 'BLASTS', 'PLATELETS', 'RETICULOCYTE COUNT']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll set up our `expected_result`, which is the expected output of the extracted key-values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "expected_result = [{'NAME': 'Test Patient A',\n", + " 'HAEMOGLOBIN': '12.6',\n", + " 'RBC (RED CELLS COUNT)': '4.54',\n", + " 'HAEMATOCRIT(PCV)': '38.1',\n", + " 'MCV': '83.9',\n", + " 'MCH': '27.8',\n", + " 'MCHC': '33.1',\n", + " 'TOTAL LEUCOCYTE COUNT': '4.62',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '66',\n", + " 'LYMPHOCYTES': '28',\n", + " 'MONOCYTES': '06',\n", + " 'EOSINOPHILS': '00',\n", + " 'BASOPHIL': '00',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '195',\n", + " 'RETICULOCYTE COUNT': 'NA'},\n", + " {'NAME': 'TEST PATIENT 12',\n", + " 'HAEMOGLOBIN': '12.8',\n", + " 'RBC (RED CELLS COUNT)': '4.5',\n", + " 'HAEMATOCRIT(PCV)': '42',\n", + " 'MCV': '93',\n", + " 'MCH': '28',\n", + " 'MCHC': '31',\n", + " 'TOTAL LEUCOCYTE COUNT': '12.6',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '67',\n", + " 'LYMPHOCYTES': '23',\n", + " 'MONOCYTES': '07',\n", + " 'EOSINOPHILS': '03',\n", + " 'BASOPHIL': '00',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '210',\n", + " 'RETICULOCYTE COUNT': 'NA'},\n", + " {'NAME': 'Test Patient 14',\n", + " 'HAEMOGLOBIN': '14.8',\n", + " 'RBC (RED CELLS COUNT)': '5.22',\n", + " 'HAEMATOCRIT(PCV)': '45.2',\n", + " 'MCV': '86.6',\n", + " 'MCH': '28.4',\n", + " 'MCHC': '32.7',\n", + " 'TOTAL LEUCOCYTE COUNT': '5.45',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '64.3',\n", + " 'LYMPHOCYTES': '25.1',\n", + " 'MONOCYTES': '7.2',\n", + " 'EOSINOPHILS': '2.8',\n", + " 'BASOPHIL': '0.6',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '287',\n", + " 'RETICULOCYTE COUNT': 'NA'},\n", + " 
{'NAME': 'Test Patient 20',\n", + " 'HAEMOGLOBIN': '11.3',\n", + " 'RBC (RED CELLS COUNT)': '4.2',\n", + " 'HAEMATOCRIT(PCV)': '35',\n", + " 'MCV': '82',\n", + " 'MCH': '27',\n", + " 'MCHC': '33',\n", + " 'TOTAL LEUCOCYTE COUNT': '9,600',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '58',\n", + " 'LYMPHOCYTES': '40',\n", + " 'MONOCYTES': '01',\n", + " 'EOSINOPHILS': '01',\n", + " 'BASOPHIL': 'NA',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '238,000',\n", + " 'RETICULOCYTE COUNT': 'NA'}\n", + "]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we set up our prompt and run OpenParser on the data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload response: 204\n", + "Extraction success.\n", + "Upload response: 204\n", + "Extraction success.\n", + "Upload response: 204\n", + "Extraction success.\n", + "Upload response: 204\n", + "Extraction success.\n" + ] + } + ], + "source": [ + "prompt = f\"\"\"Return table in a JSON format with all of the following keys and their corresponding values:\n", + "{input_keys}\n", + "\n", + "If a key and/or value is not found in the text, please still include the key with a value of `NA`.\n", + "\n", + "Return all values as strings and always use `.` to separate decimals.\n", + "\"\"\"\n", + "op = OpenParser(example_apikey)\n", + "qa_results = []\n", + "for file in report_files:\n", + " qa_result = op.parse(report_folder + file, prompt, mode=\"advanced\")\n", + " qa_results.append(qa_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the raw output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'result': [{'NAME': 'Test Patient A',\n", + " 'HAEMOGLOBIN': '12.6',\n", + " 'RBC (RED CELLS COUNT)': '4.54',\n", + " 'HAEMATOCRIT(PCV)': '38.1',\n", + " 'MCV': '83.9',\n", + " 'MCH': '27.8',\n", + " 'MCHC': '33.1',\n", + " 'TOTAL LEUCOCYTE COUNT': '4.62',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '66',\n", + " 'LYMPHOCYTES': '28',\n", + " 'MONOCYTES': '06',\n", + " 'EOSINOPHILS': '00',\n", + " 'BASOPHIL': '00',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '195',\n", + " 'RETICULOCYTE COUNT': 'NA'}]},\n", + " {'result': [{'NAME': 'TEST PATIENT 12',\n", + " 'HAEMOGLOBIN': '12.8',\n", + " 'RBC (RED CELLS COUNT)': '4.5',\n", + " 'HAEMATOCRIT(PCV)': '42.0',\n", + " 'MCV': '93.0',\n", + " 'MCH': '28.0',\n", + " 'MCHC': '31.0',\n", + " 'TOTAL LEUCOCYTE COUNT': '12.6',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '67.0',\n", + " 'LYMPHOCYTES': '23.0',\n", + " 'MONOCYTES': '7.0',\n", + " 'EOSINOPHILS': '3.0',\n", + " 'BASOPHIL': '0.0',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '210.0',\n", + " 'RETICULOCYTE COUNT': 'NA'}]},\n", + " {'result': [{'NAME': 'Test Patient 14',\n", + " 'HAEMOGLOBIN': '14.8',\n", + " 'RBC (RED CELLS COUNT)': '5.22',\n", + " 'HAEMATOCRIT(PCV)': '45.2',\n", + " 'MCV': '86.6',\n", + " 'MCH': '28.4',\n", + " 'MCHC': '32.7',\n", + " 'TOTAL LEUCOCYTE COUNT': '5.45',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '64.3',\n", + " 'LYMPHOCYTES': '25.1',\n", + " 'MONOCYTES': '7.2',\n", + " 'EOSINOPHILS': '2.8',\n", + " 'BASOPHIL': '0.6',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': 
'287',\n", + " 'RETICULOCYTE COUNT': 'NA'}]},\n", + " {'result': [{'NAME': 'Test Patient 20',\n", + " 'HAEMOGLOBIN': '11.3',\n", + " 'RBC (RED CELLS COUNT)': '4.2',\n", + " 'HAEMATOCRIT(PCV)': '35',\n", + " 'MCV': '82',\n", + " 'MCH': '27',\n", + " 'MCHC': '33',\n", + " 'TOTAL LEUCOCYTE COUNT': '9,600',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '58',\n", + " 'LYMPHOCYTES': '40',\n", + " 'MONOCYTES': '01',\n", + " 'EOSINOPHILS': '01',\n", + " 'BASOPHIL': 'NA',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '238,000',\n", + " 'RETICULOCYTE COUNT': 'NA'}]}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Analysis\n", + "Now, we will analyze the output and compare it with the expected result. We'll take note of any missing keys, any additional keys added by OpenParser, and any incorrect values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------\n", + "REPORT 1\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", + "---------------------------------------------------\n", + "REPORT 2\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", + "---------------------------------------------------\n", + "REPORT 3\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", + "---------------------------------------------------\n", + "REPORT 4\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n" + ] + } + ], + "source": [ + "values_list = []\n", + "\n", + "def compare_vals(str1, str2):\n", + " try:\n", + " num1 = float(str1)\n", + " num2 = float(str2)\n", + " return num1 == num2\n", + " except ValueError:\n", + " return str1 == str2\n", + "\n", + "for idx, qa_result in enumerate(qa_results):\n", + " print('---------------------------------------------------')\n", + " print(f\"REPORT {idx+1}\\n\")\n", + " data = qa_result['result']\n", + " keys = [list(item.keys()) for item in data][0]\n", + " values = [list(item.values()) for item in data][0]\n", + "\n", + " this_expected_result = expected_result[idx]\n", + " missing_keys = []\n", + " incorrect_keys = []\n", + " expected_keys = list(this_expected_result.keys())\n", + "\n", + " for key_idx, key in enumerate(expected_keys):\n", + " if key_idx < 0 or key_idx >= len(keys) or key != keys[key_idx]:\n", + " print(f\"Key {key} not found in report {idx+1}\")\n", + " missing_keys.append(key)\n", + " keys.insert(key_idx, key)\n", + " values.insert(key_idx, 'Missing')\n", + " elif not compare_vals(this_expected_result[key], values[keys.index(key)]):\n", + " print(f\"Incorrect {key}. 
Expected: {this_expected_result[key]}, Found: {values[keys.index(key)]}\")\n", + " incorrect_keys.append((key, this_expected_result[key], values[keys.index(key)]))\n", + " values[keys.index(key)] = f'{values[keys.index(key)]} * ({this_expected_result[key]})'\n", + "\n", + " different_keys = {'missing_keys': missing_keys, 'additional_keys': [], 'incorrect_keys': incorrect_keys}\n", + "\n", + " # Check for differences in keys\n", + " for key in keys:\n", + " if key not in input_keys:\n", + " different_keys['additional_keys'].append(key)\n", + " print(f\"Summary: {different_keys}\")\n", + "\n", + " # Create a DataFrame\n", + " values_list.append(values)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Last, we print out the table with all the values extracted from the report. If a value is missing, it's highlighted yellow. If it's an incorrect value, it's highlighted red with the expected value in parentheses." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/39/ddf_qmj154l0k4jvqgngmxjm0000gn/T/ipykernel_166/2497480378.py:16: FutureWarning: Styler.applymap has been deprecated. Use Styler.map instead.\n", + " styled_df = df.style.applymap(highlight_incorrect_keys)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " | CBC_ReportSample_1_Redacted.png | \n", + "CBC_ReportSample_12_Redacted.jpeg | \n", + "CBC_ReportSample_14_Redacted.jpeg | \n", + "CBC_ReportSample_20_Redacted.jpeg | \n", + "
---|---|---|---|---|
NAME | \n", + "Test Patient A | \n", + "TEST PATIENT 12 | \n", + "Test Patient 14 | \n", + "Test Patient 20 | \n", + "
HAEMOGLOBIN | \n", + "12.6 | \n", + "12.8 | \n", + "14.8 | \n", + "11.3 | \n", + "
RBC (RED CELLS COUNT) | \n", + "4.54 | \n", + "4.5 | \n", + "5.22 | \n", + "4.2 | \n", + "
HAEMATOCRIT(PCV) | \n", + "38.1 | \n", + "42.0 | \n", + "45.2 | \n", + "35 | \n", + "
MCV | \n", + "83.9 | \n", + "93.0 | \n", + "86.6 | \n", + "82 | \n", + "
MCH | \n", + "27.8 | \n", + "28.0 | \n", + "28.4 | \n", + "27 | \n", + "
MCHC | \n", + "33.1 | \n", + "31.0 | \n", + "32.7 | \n", + "33 | \n", + "
TOTAL LEUCOCYTE COUNT | \n", + "4.62 | \n", + "12.6 | \n", + "5.45 | \n", + "9,600 | \n", + "
DIFFERENTIAL LEUCOCYTE COUNT | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "
NEUTROPHILS | \n", + "66 | \n", + "67.0 | \n", + "64.3 | \n", + "58 | \n", + "
LYMPHOCYTES | \n", + "28 | \n", + "23.0 | \n", + "25.1 | \n", + "40 | \n", + "
MONOCYTES | \n", + "06 | \n", + "7.0 | \n", + "7.2 | \n", + "01 | \n", + "
EOSINOPHILS | \n", + "00 | \n", + "3.0 | \n", + "2.8 | \n", + "01 | \n", + "
BASOPHIL | \n", + "00 | \n", + "0.0 | \n", + "0.6 | \n", + "NA | \n", + "
PROMYELOCYTES | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "
MYELOCYTES | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "
METAMYELOCYTES | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "
BLASTS | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "
PLATELETS | \n", + "195 | \n", + "210.0 | \n", + "287 | \n", + "238,000 | \n", + "
RETICULOCYTE COUNT | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "