diff --git a/examples/prompt_to_extract_from_image_to_json_customer.ipynb b/examples/prompt_to_extract_from_image_to_json_customer.ipynb new file mode 100644 index 0000000..e0ee53b --- /dev/null +++ b/examples/prompt_to_extract_from_image_to_json_customer.ipynb @@ -0,0 +1,680 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prompt to Extract Key-values into JSON from CBC Reports (Image) using advanced mode\n", + "\n", + "Below is an example of using OpenParser to extract key-values from a medical CBC report into JSON format. (Note: the model is still in beta and is NOT robust enough to always generate the same output. Please bear with it!)\n", + "\n", + "### 1. Load the libraries\n", + "\n", + "If you have not installed `open_parser` yet, uncomment the lines below." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install python-dotenv\n", + "# !pip3 install --upgrade open_parser\n", + "# !pip3 install pandas\n", + "# !pip3 install jinja2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "from dotenv import load_dotenv\n", + "from open_parser import OpenParser\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Set up your OpenParser API key\n", + "\n", + "To set up your `CAMBIO_API_KEY`, do the following:\n", + "\n", + "1. create a `.env` file in your root folder;\n", + "2. add the following line to your `.env` file:\n", + " ```\n", + " CAMBIO_API_KEY=17b************************\n", + " ```\n", + "\n", + "Then run the cell below to load your API key." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "load_dotenv(override=True)\n", + "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Load sample data and Run OpenParser\n", + "\n", + "OpenParser supports both images and PDFs. First, let's load some sample data to test OpenParser's capabilities." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "report_folder = './report_data/'\n", + "report_files = [\n", + " 'CBC_ReportSample_1_Redacted.png',\n", + " 'CBC_ReportSample_12_Redacted.jpeg',\n", + " 'CBC_ReportSample_14_Redacted.jpeg',\n", + " 'CBC_ReportSample_20_Redacted.jpeg',\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll set our `input_keys`, which are the keys that we want to extract from the report. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "input_keys = ['NAME', 'HAEMOGLOBIN', 'RBC (RED CELLS COUNT)', 'HAEMATOCRIT(PCV)','MCV','MCH','MCHC','TOTAL LEUCOCYTE COUNT','DIFFERENTIAL LEUCOCYTE COUNT', 'NEUTROPHILS','LYMPHOCYTES','MONOCYTES', 'EOSINOPHILS', 'BASOPHIL', 'PROMYELOCYTES', 'MYELOCYTES', 'METAMYELOCYTES', 'BLASTS', 'PLATELETS', 'RETICULOCYTE COUNT']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll set up our `expected_result`, which is the expected output of the extracted key-values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "expected_result = [{'NAME': 'Test Patient A',\n", + " 'HAEMOGLOBIN': '12.6',\n", + " 'RBC (RED CELLS COUNT)': '4.54',\n", + " 'HAEMATOCRIT(PCV)': '38.1',\n", + " 'MCV': '83.9',\n", + " 'MCH': '27.8',\n", + " 'MCHC': '33.1',\n", + " 'TOTAL LEUCOCYTE COUNT': '4.62',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '66',\n", + " 'LYMPHOCYTES': '28',\n", + " 'MONOCYTES': '06',\n", + " 'EOSINOPHILS': '00',\n", + " 'BASOPHIL': '00',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '195',\n", + " 'RETICULOCYTE COUNT': 'NA'},\n", + " {'NAME': 'TEST PATIENT 12',\n", + " 'HAEMOGLOBIN': '12.8',\n", + " 'RBC (RED CELLS COUNT)': '4.5',\n", + " 'HAEMATOCRIT(PCV)': '42',\n", + " 'MCV': '93',\n", + " 'MCH': '28',\n", + " 'MCHC': '31',\n", + " 'TOTAL LEUCOCYTE COUNT': '12.6',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '67',\n", + " 'LYMPHOCYTES': '23',\n", + " 'MONOCYTES': '07',\n", + " 'EOSINOPHILS': '03',\n", + " 'BASOPHIL': '00',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '210',\n", + " 'RETICULOCYTE COUNT': 'NA'},\n", + " {'NAME': 'Test Patient 14',\n", + " 'HAEMOGLOBIN': '14.8',\n", + " 'RBC (RED CELLS COUNT)': '5.22',\n", + " 'HAEMATOCRIT(PCV)': '45.2',\n", + " 'MCV': '86.6',\n", + " 'MCH': '28.4',\n", + " 'MCHC': '32.7',\n", + " 'TOTAL LEUCOCYTE COUNT': '5.45',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '64.3',\n", + " 'LYMPHOCYTES': '25.1',\n", + " 'MONOCYTES': '7.2',\n", + " 'EOSINOPHILS': '2.8',\n", + " 'BASOPHIL': '0.6',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '287',\n", + " 'RETICULOCYTE COUNT': 'NA'},\n", + " 
{'NAME': 'Test Patient 20',\n", + " 'HAEMOGLOBIN': '11.3',\n", + " 'RBC (RED CELLS COUNT)': '4.2',\n", + " 'HAEMATOCRIT(PCV)': '35',\n", + " 'MCV': '82',\n", + " 'MCH': '27',\n", + " 'MCHC': '33',\n", + " 'TOTAL LEUCOCYTE COUNT': '9,600',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '58',\n", + " 'LYMPHOCYTES': '40',\n", + " 'MONOCYTES': '01',\n", + " 'EOSINOPHILS': '01',\n", + " 'BASOPHIL': 'NA',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '238,000',\n", + " 'RETICULOCYTE COUNT': 'NA'}\n", + "]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we set up our prompt and run OpenParser on the data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload response: 204\n", + "Extraction success.\n", + "Upload response: 204\n", + "Extraction success.\n", + "Upload response: 204\n", + "Extraction success.\n", + "Upload response: 204\n", + "Extraction success.\n" + ] + } + ], + "source": [ + "prompt = f\"\"\"Return table in a JSON format with all of the following keys and their corresponding values:\n", + "{input_keys}\n", + "\n", + "If a key and/or value is not found in the text, please still include the key with a value of `NA`.\n", + "\n", + "Return all values as strings and always use `.` to separate decimals.\n", + "\"\"\"\n", + "op = OpenParser(example_apikey)\n", + "qa_results = []\n", + "for file in report_files:\n", + " qa_result = op.parse(report_folder + file, prompt, mode=\"advanced\")\n", + " qa_results.append(qa_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the raw output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'result': [{'NAME': 'Test Patient A',\n", + " 'HAEMOGLOBIN': '12.6',\n", + " 'RBC (RED CELLS COUNT)': '4.54',\n", + " 'HAEMATOCRIT(PCV)': '38.1',\n", + " 'MCV': '83.9',\n", + " 'MCH': '27.8',\n", + " 'MCHC': '33.1',\n", + " 'TOTAL LEUCOCYTE COUNT': '4.62',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '66',\n", + " 'LYMPHOCYTES': '28',\n", + " 'MONOCYTES': '06',\n", + " 'EOSINOPHILS': '00',\n", + " 'BASOPHIL': '00',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '195',\n", + " 'RETICULOCYTE COUNT': 'NA'}]},\n", + " {'result': [{'NAME': 'TEST PATIENT 12',\n", + " 'HAEMOGLOBIN': '12.8',\n", + " 'RBC (RED CELLS COUNT)': '4.5',\n", + " 'HAEMATOCRIT(PCV)': '42.0',\n", + " 'MCV': '93.0',\n", + " 'MCH': '28.0',\n", + " 'MCHC': '31.0',\n", + " 'TOTAL LEUCOCYTE COUNT': '12.6',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '67.0',\n", + " 'LYMPHOCYTES': '23.0',\n", + " 'MONOCYTES': '7.0',\n", + " 'EOSINOPHILS': '3.0',\n", + " 'BASOPHIL': '0.0',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '210.0',\n", + " 'RETICULOCYTE COUNT': 'NA'}]},\n", + " {'result': [{'NAME': 'Test Patient 14',\n", + " 'HAEMOGLOBIN': '14.8',\n", + " 'RBC (RED CELLS COUNT)': '5.22',\n", + " 'HAEMATOCRIT(PCV)': '45.2',\n", + " 'MCV': '86.6',\n", + " 'MCH': '28.4',\n", + " 'MCHC': '32.7',\n", + " 'TOTAL LEUCOCYTE COUNT': '5.45',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '64.3',\n", + " 'LYMPHOCYTES': '25.1',\n", + " 'MONOCYTES': '7.2',\n", + " 'EOSINOPHILS': '2.8',\n", + " 'BASOPHIL': '0.6',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': 
'287',\n", + " 'RETICULOCYTE COUNT': 'NA'}]},\n", + " {'result': [{'NAME': 'Test Patient 20',\n", + " 'HAEMOGLOBIN': '11.3',\n", + " 'RBC (RED CELLS COUNT)': '4.2',\n", + " 'HAEMATOCRIT(PCV)': '35',\n", + " 'MCV': '82',\n", + " 'MCH': '27',\n", + " 'MCHC': '33',\n", + " 'TOTAL LEUCOCYTE COUNT': '9,600',\n", + " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", + " 'NEUTROPHILS': '58',\n", + " 'LYMPHOCYTES': '40',\n", + " 'MONOCYTES': '01',\n", + " 'EOSINOPHILS': '01',\n", + " 'BASOPHIL': 'NA',\n", + " 'PROMYELOCYTES': 'NA',\n", + " 'MYELOCYTES': 'NA',\n", + " 'METAMYELOCYTES': 'NA',\n", + " 'BLASTS': 'NA',\n", + " 'PLATELETS': '238,000',\n", + " 'RETICULOCYTE COUNT': 'NA'}]}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output Analysis\n", + "Now, we will analyze the output and compare it with the expected result. We'll take note of any missing keys, any additional keys added by OpenParser, and any incorrect values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------\n", + "REPORT 1\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", + "---------------------------------------------------\n", + "REPORT 2\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", + "---------------------------------------------------\n", + "REPORT 3\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", + "---------------------------------------------------\n", + "REPORT 4\n", + "\n", + "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n" + ] + } + ], + "source": [ + "values_list = []\n", + "\n", + "def compare_vals(str1, str2):\n", + " try:\n", + " num1 = float(str1)\n", + " num2 = float(str2)\n", + " return num1 == num2\n", + " except ValueError:\n", + " return str1 == str2\n", + "\n", + "for idx, qa_result in enumerate(qa_results):\n", + " print('---------------------------------------------------')\n", + " print(f\"REPORT {idx+1}\\n\")\n", + " data = qa_result['result']\n", + " keys = [list(item.keys()) for item in data][0]\n", + " values = [list(item.values()) for item in data][0]\n", + "\n", + " this_expected_result = expected_result[idx]\n", + " missing_keys = []\n", + " incorrect_keys = []\n", + " expected_keys = list(this_expected_result.keys())\n", + "\n", + " for key_idx, key in enumerate(expected_keys):\n", + " if key_idx < 0 or key_idx >= len(keys) or key != keys[key_idx]:\n", + " print(f\"Key {key} not found in report {idx+1}\")\n", + " missing_keys.append(key)\n", + " keys.insert(key_idx, key)\n", + " values.insert(key_idx, 'Missing')\n", + " elif not compare_vals(this_expected_result[key], values[keys.index(key)]):\n", + " print(f\"Incorrect {key}. 
Expected: {this_expected_result[key]}, Found: {values[keys.index(key)]}\")\n", + " incorrect_keys.append((key, this_expected_result[key], values[keys.index(key)]))\n", + " values[keys.index(key)] = f'{values[keys.index(key)]} * ({this_expected_result[key]})'\n", + "\n", + " different_keys = {'missing_keys': missing_keys, 'additional_keys': [], 'incorrect_keys': incorrect_keys}\n", + "\n", + " # Check for differences in keys\n", + " for key in keys:\n", + " if key not in input_keys:\n", + " different_keys['additional_keys'].append(key)\n", + " print(f\"Summary: {different_keys}\")\n", + "\n", + " # Create a DataFrame\n", + " values_list.append(values)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Last, we print out the table with all the values extracted from the report. If a value is missing, it's highlighted yellow. If it's an incorrect value, it's highlighted red with the expected value in parentheses." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/39/ddf_qmj154l0k4jvqgngmxjm0000gn/T/ipykernel_166/2497480378.py:16: FutureWarning: Styler.applymap has been deprecated. 
Use Styler.map instead.\n", + " styled_df = df.style.applymap(highlight_incorrect_keys)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 CBC_ReportSample_1_Redacted.pngCBC_ReportSample_12_Redacted.jpegCBC_ReportSample_14_Redacted.jpegCBC_ReportSample_20_Redacted.jpeg
NAMETest Patient ATEST PATIENT 12Test Patient 14Test Patient 20
HAEMOGLOBIN12.612.814.811.3
RBC (RED CELLS COUNT)4.544.55.224.2
HAEMATOCRIT(PCV)38.142.045.235
MCV83.993.086.682
MCH27.828.028.427
MCHC33.131.032.733
TOTAL LEUCOCYTE COUNT4.6212.65.459,600
DIFFERENTIAL LEUCOCYTE COUNTNANANANA
NEUTROPHILS6667.064.358
LYMPHOCYTES2823.025.140
MONOCYTES067.07.201
EOSINOPHILS003.02.801
BASOPHIL000.00.6NA
PROMYELOCYTESNANANANA
MYELOCYTESNANANANA
METAMYELOCYTESNANANANA
BLASTSNANANANA
PLATELETS195210.0287238,000
RETICULOCYTE COUNTNANANANA
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\n", + " report_files[0]: values_list[0],\n", + " report_files[1]: values_list[1],\n", + " report_files[2]: values_list[2],\n", + " report_files[3]: values_list[3],\n", + "}, index=input_keys)\n", + "\n", + "def highlight_incorrect_keys(val):\n", + " if val == 'Missing':\n", + " return 'background-color: yellow; color: black'\n", + " elif '*' in val:\n", + " return 'background-color: red'\n", + " return ''\n", + "\n", + "# Apply the style to the DataFrame\n", + "styled_df = df.style.applymap(highlight_incorrect_keys)\n", + "\n", + "# Display the styled DataFrame\n", + "styled_df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "open-parser", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/report_data/CBC_ReportSample_12_Redacted.jpeg b/examples/report_data/CBC_ReportSample_12_Redacted.jpeg new file mode 100644 index 0000000..033f801 Binary files /dev/null and b/examples/report_data/CBC_ReportSample_12_Redacted.jpeg differ diff --git a/examples/report_data/CBC_ReportSample_14_Redacted.jpeg b/examples/report_data/CBC_ReportSample_14_Redacted.jpeg new file mode 100644 index 0000000..cec66ba Binary files /dev/null and b/examples/report_data/CBC_ReportSample_14_Redacted.jpeg differ diff --git a/examples/report_data/CBC_ReportSample_1_Redacted.png 
b/examples/report_data/CBC_ReportSample_1_Redacted.png new file mode 100644 index 0000000..054b14d Binary files /dev/null and b/examples/report_data/CBC_ReportSample_1_Redacted.png differ diff --git a/examples/report_data/CBC_ReportSample_20_Redacted.jpeg b/examples/report_data/CBC_ReportSample_20_Redacted.jpeg new file mode 100644 index 0000000..93bdd1c Binary files /dev/null and b/examples/report_data/CBC_ReportSample_20_Redacted.jpeg differ