diff --git a/.gitattributes b/.gitattributes index 513c7ecbf0..3adb203207 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,10 +33,8 @@ *.tsx text *.xml text *.xhtml text diff=html - # Docker Dockerfile text eol=lf - # Documentation *.ipynb text *.markdown text diff=markdown eol=lf @@ -62,7 +60,6 @@ NEWS text eol=lf readme text eol=lf *README* text eol=lf TODO text - # Configs *.cnf text eol=lf *.conf text eol=lf @@ -84,8 +81,10 @@ yarn.lock text -diff browserslist text Makefile text eol=lf makefile text eol=lf - # Images *.png filter=lfs diff=lfs merge=lfs -text *.jpg filter=lfs diff=lfs merge=lfs -text *.jpeg filter=lfs diff=lfs merge=lfs -text +notebook/agentchat_pdf_rag/parsed_elements.json filter=lfs diff=lfs merge=lfs -text +notebook/agentchat_pdf_rag/input_files/nvidia_10k_2024.pdf filter=lfs diff=lfs merge=lfs -text +notebook/agentchat_pdf_rag/processed_elements.json filter=lfs diff=lfs merge=lfs -text diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c3ea21ee42..1fceab925a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,7 @@ repos: - id: check-yaml - id: check-toml - id: check-json + exclude: ^notebook/agentchat_pdf_rag/(parsed_elements|processed_elements)\.json$ - id: check-byte-order-marker exclude: .gitignore - id: check-merge-conflict diff --git a/notebook/agentchat_pdf_rag/input_files/nvidia_10k_2024.pdf b/notebook/agentchat_pdf_rag/input_files/nvidia_10k_2024.pdf new file mode 100644 index 0000000000..464ab05680 Binary files /dev/null and b/notebook/agentchat_pdf_rag/input_files/nvidia_10k_2024.pdf differ diff --git a/notebook/agentchat_pdf_rag/parsed_elements.json b/notebook/agentchat_pdf_rag/parsed_elements.json new file mode 100644 index 0000000000..5ed0102616 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_elements.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78469c67d9e702ea6283f89f77f5d9be964782bf6c0242d2d8c6a99879ecf43c +size 2185964 diff --git 
a/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-1-1.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-1-1.jpg new file mode 100644 index 0000000000..5bfd432354 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-1-1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b60a18030bb7a01079ad8dd3ae662c431f6ce686db7fbf1380031acebc93d0a +size 2145 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-33-2.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-33-2.jpg new file mode 100644 index 0000000000..6d89302884 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-33-2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442defe14cb733e85cf7a821cbec2d20f559b3c603cc3c8bec329c9fe4d8f6d9 +size 69750 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-92-3.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-92-3.jpg new file mode 100644 index 0000000000..9c8646db5a --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-92-3.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d137771b5d03ba4715a8e3c0d128988e0ad0a5cef5dcbe4d940b5b3c3a32a8d +size 5566 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-93-4.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-93-4.jpg new file mode 100644 index 0000000000..aa5e0f897a --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-93-4.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:529984bfdfd9836b0142291207909d4cd01f7c97f201a6a3dfc88257e1c311db +size 5397 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-94-5.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-94-5.jpg new file mode 100644 index 0000000000..eacde13f01 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-94-5.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cf16c57b061b039c8e9930efa11fdeb565110ce91fa1e9cb55e5b2e1996638ca +size 5200 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-95-6.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-95-6.jpg new file mode 100644 index 0000000000..1921a22507 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/figure-95-6.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c93fe1144bc0d163f8dcea0551892f114e2ff68ad2538ed6aa1cee8cce3a60 +size 5364 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-12-2.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-12-2.jpg new file mode 100644 index 0000000000..6eed50cf0b --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-12-2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74cd46f89df486b07553ca7eb3bef9a87fe431c96b1b11e0977fa815270735f0 +size 42660 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-2-1.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-2-1.jpg new file mode 100644 index 0000000000..05f34f1e52 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-2-1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8520ec58bc6c472aa6f910e8ad0a72de01baedadaa43dfa4652bb059dcec9f +size 189286 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-32-3.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-32-3.jpg new file mode 100644 index 0000000000..948d68f0a8 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-32-3.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b634d9b4b4921f85e62f0473237192e65a241dd4df4305caf417da3b80a1e861 +size 62089 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-33-4.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-33-4.jpg new file mode 100644 index 0000000000..bb2d8eec9d --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-33-4.jpg @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:dc27fe4b5af14fd610c6ec93156993f0f5330e19624fb1f81ecab99309518ce6 +size 32682 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-36-5.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-36-5.jpg new file mode 100644 index 0000000000..3697eee0ff --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-36-5.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ced809bb969f7605e49ccdbdb3a79901bea5a9a201035251a1c39adf7cd4df8 +size 54461 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-6.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-6.jpg new file mode 100644 index 0000000000..f2797e1276 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-6.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a13d2574a49df5d346e80b5066fecdb0c6378888a691204ef976f9d56397d0c +size 83482 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-7.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-7.jpg new file mode 100644 index 0000000000..08e35caaac --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-7.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd739a1c862e65db4e5c375519184e3634f3fc12094649f296e3be0ac0079ec5 +size 40082 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-8.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-8.jpg new file mode 100644 index 0000000000..6a6e4020a2 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-39-8.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42845bdd91bac5198e80b84697a284d7dc7f507427b197bf47390e40731783a0 +size 46386 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-40-9.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-40-9.jpg new file mode 100644 index 0000000000..4c269157c8 --- /dev/null +++ 
b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-40-9.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9cc8f53b64555ca5eb51701e3fb3b6a60d6db589a463ed0a52ae5d9bf98e371 +size 68682 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-41-10.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-41-10.jpg new file mode 100644 index 0000000000..d8ae96f21d --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-41-10.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30992b3f47a4305e23ba46c7992a8c7620006a312ea724458284427150d2dae3 +size 39630 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-42-11.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-42-11.jpg new file mode 100644 index 0000000000..3345dceb56 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-42-11.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b59d455a2329f125ae170731b6847fe2b7a88f29e9032493ce0535c04cd85ca +size 28007 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-42-12.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-42-12.jpg new file mode 100644 index 0000000000..3b35ff1ff6 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-42-12.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c507d4e0df2605769f297c9e2fdd91ec2aafb9a8385297cedff48d3f4d45349a +size 35733 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-43-13.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-43-13.jpg new file mode 100644 index 0000000000..932a160da7 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-43-13.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c45ccc4af87c41dc9572729c2b5995d6540f651415f37d3bd62a0643cb32b0f +size 44445 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-47-14.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-47-14.jpg new file mode 
100644 index 0000000000..94fb72d0ef --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-47-14.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:563a30606b8dd01ee22e0ea9ecd8d4bdf22913b7585320f339acbe290af4f7b9 +size 142237 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-50-15.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-50-15.jpg new file mode 100644 index 0000000000..62dff895a7 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-50-15.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e498696df863256c4c65783422d5476282375a7594e78675c8dc836b05677448 +size 139375 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-51-16.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-51-16.jpg new file mode 100644 index 0000000000..c2ea65f2a2 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-51-16.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c21dbe5bb978e846e0ecffc1dc9d76cbd805bb8da6b6525d49dce9868bf614a +size 102190 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-52-17.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-52-17.jpg new file mode 100644 index 0000000000..245cf166d8 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-52-17.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f39216a5c51643583d9a4f027ee7cd7b01829372aaec539e29441ab677994a55 +size 138826 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-52-18.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-52-18.jpg new file mode 100644 index 0000000000..2940359e02 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-52-18.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e4c906a1a925e1fdb14c06e0ac7ecb8246fa2a0bc981a47e3105cae2767385 +size 63739 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-53-19.jpg 
b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-53-19.jpg new file mode 100644 index 0000000000..36d3862996 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-53-19.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56e1d862f3e84238df2ad0b4d45c0924128149eb88ce470ad53ed555259cd75 +size 183427 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-54-20.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-54-20.jpg new file mode 100644 index 0000000000..36fe781073 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-54-20.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddde44818844e984ebd200e7c6fe09d045b2baa3819726010c19eb14cbdf2a5f +size 303686 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-60-21.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-60-21.jpg new file mode 100644 index 0000000000..084d6cd46d --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-60-21.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f4bdfb7e9626f95019ec3ddd1f46450ae54d123c50d661d93e36f61c9c3c10 +size 46261 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-22.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-22.jpg new file mode 100644 index 0000000000..732d85f483 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-22.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7c426680e5fa4dd56d90eaf5d0b0545dc6036dd49b3391293cdb84cf8034e70 +size 38499 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-23.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-23.jpg new file mode 100644 index 0000000000..31b2294589 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-23.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6e18c0496cf3948b13ae5d910c49d30b5af1bd0987760cb3b9feedce8d8e713 +size 35416 diff --git 
a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-24.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-24.jpg new file mode 100644 index 0000000000..c2e661176c --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-61-24.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642ac19c86c63b9c31ffb04f8e416dcebce5b1ba79b628ae32d35e48b826f1ed +size 64583 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-62-25.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-62-25.jpg new file mode 100644 index 0000000000..3af3a5cabf --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-62-25.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e62ea4ba74a3e85135baefb2eced2b8b7e23dfd22c62ab156ee8c8423dfbe63 +size 41601 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-63-26.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-63-26.jpg new file mode 100644 index 0000000000..bc34c7a277 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-63-26.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f68b51a527740cecc7dfd4fbf9e9ba82405f7df361425aed7bee9f7f045cc00 +size 55318 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-63-27.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-63-27.jpg new file mode 100644 index 0000000000..952e53a326 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-63-27.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d45b21c0594d8f463e0e44aef25af7e744e95718991fb11f96506f029ff2dfe6 +size 78562 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-64-28.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-64-28.jpg new file mode 100644 index 0000000000..76d24798f7 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-64-28.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:906a491e9032a523892afae9e9f5fc69bff604f2fa801a97007c863c8ff5aae5 +size 64014 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-64-29.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-64-29.jpg new file mode 100644 index 0000000000..6fe15ffe7e --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-64-29.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a549eaf6b28d04e866c72ee053eda033978c26665f4ecf4f190e3665d3a7a0de +size 29749 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-65-30.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-65-30.jpg new file mode 100644 index 0000000000..fddc072f74 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-65-30.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883ab7f4e489106c38b32c094fdf4ca31175fe2f918261d0ff6cec49bc947d29 +size 85531 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-65-31.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-65-31.jpg new file mode 100644 index 0000000000..6ffa0d0887 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-65-31.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2f6ab861fc3a1995d513dbc13d98644f2c3406c36ab9a7ff336960a1551be4 +size 77384 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-32.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-32.jpg new file mode 100644 index 0000000000..ffdf160e64 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-32.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:833d8b3b852d2b2d145916ebbbee5fa1e791eaff99ba52c9b90b9d69789a30f5 +size 74378 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-33.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-33.jpg new file mode 100644 index 0000000000..cbe4fcc428 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-33.jpg @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc04f5a0d4aae0f711a0b530d92af7d89adc69f517b3cd27fd73624f3720fca7 +size 73124 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-34.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-34.jpg new file mode 100644 index 0000000000..c1ff302f47 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-66-34.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be81bf660c87ee3cf6736797b82e475231dfd577bf405b490b8c618eb1bfe88d +size 43613 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-67-35.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-67-35.jpg new file mode 100644 index 0000000000..7e28565273 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-67-35.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbe703c4a52c8d717ffc5f49a10f221b9aba46ec53a82f06c20c1aabdc8de8aa +size 131663 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-36.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-36.jpg new file mode 100644 index 0000000000..b6aced9a5d --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-36.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81f4561d6f7da14a58df8ea7ec81af66c1d24a3c4b26d602af5a221f15664b82 +size 40822 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-37.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-37.jpg new file mode 100644 index 0000000000..0865736d75 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-37.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2192eaa49a0b9c9aeac180598a6137b723e06a9a87c890ae6af33d9c4cf0022 +size 18702 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-38.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-38.jpg new file mode 100644 index 0000000000..cd36ebb15b --- /dev/null +++ 
b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-68-38.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3e7f97120b89ecf399e433a67dc2928706c89b05e0c1450381fbf81d4e5f96 +size 30398 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-69-39.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-69-39.jpg new file mode 100644 index 0000000000..3cc9f2f391 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-69-39.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c69d7edf6614b28f5335a9156f63d4e4420edf536874039cf788426d33cbe0 +size 61561 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-69-40.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-69-40.jpg new file mode 100644 index 0000000000..1a9cf1fddd --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-69-40.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a257651a15d3d7aa1dee120dbb3461210f49b0e2b5ea40b1b404223c5ec06f +size 35857 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-41.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-41.jpg new file mode 100644 index 0000000000..2de7c908f7 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-41.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfab13b9565d292b821f35cd62a7dd0df1fcdae681c48d6aafaa265931f64338 +size 74040 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-42.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-42.jpg new file mode 100644 index 0000000000..1b23e53bab --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-42.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f676741f3c619861a8c7b37c6448c66ea9e3adcd61c0cd2125cc004ec2faae70 +size 38337 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-43.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-43.jpg new file 
mode 100644 index 0000000000..eb1a3ffb24 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-70-43.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d36bda4731a9927506fde1f5e1cff3d09bef4b5353b0b71e264705d2d64ee61f +size 35349 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-71-44.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-71-44.jpg new file mode 100644 index 0000000000..b25fdc8524 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-71-44.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4014e10cbec3bf345cd3a62198e07b35dc88bcac9a2808779ab13128f5d23c23 +size 20683 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-72-45.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-72-45.jpg new file mode 100644 index 0000000000..b459853e13 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-72-45.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642be8df0f925dc484d8b3356720635230afaedaba7d07ae46170de27014d2c7 +size 94505 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-73-46.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-73-46.jpg new file mode 100644 index 0000000000..fe40d57c53 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-73-46.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c40584884d7b3b72f0104279d2e06d5ba5198356daba85ed5ad8d2dc8c2409 +size 28198 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-73-47.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-73-47.jpg new file mode 100644 index 0000000000..df1f009df1 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-73-47.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05caa76fd824ff956d5749dacfa635bbfc01758c47ac95477a1f9d1cffede277 +size 38362 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-48.jpg 
b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-48.jpg new file mode 100644 index 0000000000..24148cd78d --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-48.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc9e7b6f97fb9f05a670e73b2b69cb1785a7cc7beee008de3ff5cce43a46be6 +size 62731 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-49.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-49.jpg new file mode 100644 index 0000000000..c995ca766a --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-49.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30d066ca7d6a67b3bed4f8a140db099d3f716d865293c96ad8daf0e0e0ba277 +size 28709 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-50.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-50.jpg new file mode 100644 index 0000000000..50e54ec7c6 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-75-50.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a4f14977f23284199170a7b3d3188bcd42110e1aa402b2df616985d76baf949 +size 107963 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-76-51.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-76-51.jpg new file mode 100644 index 0000000000..1a68cbd88c --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-76-51.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a27d0ad965c5564a283428340135a28393ee68cf986c1757aee566117982548 +size 118556 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-77-52.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-77-52.jpg new file mode 100644 index 0000000000..fbfc7ae9b4 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-77-52.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735be3e47f6430963cc3098cbfe5bc6525def440b549ac49fe461f9570dbe0ac +size 54658 diff --git 
a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-78-53.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-78-53.jpg new file mode 100644 index 0000000000..f4a252d042 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-78-53.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96348a72c7c14c5937cf43235554ae8efd98a3b6b0409e4ab851d8435c68ee07 +size 70330 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-54.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-54.jpg new file mode 100644 index 0000000000..5510a9056e --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-54.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:884ec5ee6effbb173e98921b1a23205a8f7b9d6808211e9f483fb1c363e95282 +size 70884 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-55.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-55.jpg new file mode 100644 index 0000000000..f5f62e7189 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-55.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a050023989f88960ba98441333decd3c91a18450597daaaae4cfb27d52a407 +size 46317 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-56.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-56.jpg new file mode 100644 index 0000000000..a8a21cbb02 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-79-56.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1daca4ca5ffd3bddfb5a50ed4e1b822ed7f9369e18b3a4c9cdf391e80c6c6249 +size 47247 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-80-57.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-80-57.jpg new file mode 100644 index 0000000000..c77f552e4e --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-80-57.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c1d69beefaf1c0117413fa53b7b9b15feb4efc12486d46f40776ac9975d2757f +size 31572 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-81-58.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-81-58.jpg new file mode 100644 index 0000000000..47dbb244bb --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-81-58.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9de4eee046ea5afca8d9cb5585c19e919e10b3e3e7ea2d5a53dc94b3b22057f5 +size 90702 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-82-59.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-82-59.jpg new file mode 100644 index 0000000000..725a1c145c --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-82-59.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:072d31d5dd81bb5f15a7e49b582da4f2a6b841869d6666da7781e09390a4b420 +size 354183 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-83-60.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-83-60.jpg new file mode 100644 index 0000000000..f2b6984458 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-83-60.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54a96220a68e08e4a61d6b8b15d85092c61bb95499ed963c7db3445508fd1e0d +size 102751 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-85-61.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-85-61.jpg new file mode 100644 index 0000000000..513648aae9 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-85-61.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb71467ed58ab3ba9605d1445242ede96ba5f555d41cc35840bdf5323564116 +size 172564 diff --git a/notebook/agentchat_pdf_rag/parsed_pdf_info/table-95-62.jpg b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-95-62.jpg new file mode 100644 index 0000000000..01a7128677 --- /dev/null +++ b/notebook/agentchat_pdf_rag/parsed_pdf_info/table-95-62.jpg @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c6b8d938863a47dad6c9fcb8d62465ab99644d6432f5a49221c459064e3894 +size 433728 diff --git a/notebook/agentchat_pdf_rag/processed_elements.json b/notebook/agentchat_pdf_rag/processed_elements.json new file mode 100644 index 0000000000..edb45e86c6 --- /dev/null +++ b/notebook/agentchat_pdf_rag/processed_elements.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc4c0ff84e7320e2ad36b0b7492befb86c171bf2067df6dc9a061809c8bacc71 +size 671130 diff --git a/notebook/agentchat_pdf_rag/sample_elements.json b/notebook/agentchat_pdf_rag/sample_elements.json new file mode 100644 index 0000000000..2ab4755567 --- /dev/null +++ b/notebook/agentchat_pdf_rag/sample_elements.json @@ -0,0 +1,17 @@ +[ + { + "element_id": "518e6f32a8c371f69e6ac8868519f570", + "text": "NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets (In millions, except value)", + "type": "Title", + "page_number": 52, + "parent_id": "d972706e5fe99bae469dd5dc42202fa2" + }, + { + "element_id": "7193a45c9b844e570053b3c0cc752c06", + "text": "NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets (In millions, except value): Assets Current assets: Cash and cash equivalents Marketable securities Accounts receivable, net Inventories Prepaid expenses and other current assets Total current assets Property and equipment, net Operating lease assets Goodwill Intangible assets, net Deferred income tax assets Other assets Total assets Liabilities and Shareholders\u2019 Equity Current liabilities: Accounts payable Accrued and other current liabilities Short-term debt Total current liabilities Long-term debt Long-term operating lease liabilities Other long-term liabilities Total liabilities Commitments and contingencies - see Note 13 Jan 28, 2024 Jan 29, 2023 7,280 $ 3,389 18,704 9,907 9,999 3,827 5,282 5,159 3,080 791 44,345 23,073 3,014 3,807 1,346 1,038 4,430 4,372 1,112 1,676 6,081 3,396 4,500 3,820 65,728 $ 41,182 
2,699 $ 1,193 6,682 4,120 1,250 1,250 10,631 6,563 8,459 9,703 1,119 902 2,541 1,913 22,750 19,081", + "type": "Table", + "page_number": 52, + "parent_id": "19874ad91c0234155cb1c5168500a767", + "image_path": "./parsed_pdf_info/table-52-17.jpg" + } +] diff --git a/notebook/agentchat_tabular_data_rag_workflow.ipynb b/notebook/agentchat_tabular_data_rag_workflow.ipynb new file mode 100644 index 0000000000..e34f03950c --- /dev/null +++ b/notebook/agentchat_tabular_data_rag_workflow.ipynb @@ -0,0 +1,589 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Agentic RAG workflow on tabular data from a PDF file\n", + "\n", + "In this notebook, we're building a workflow to extract accurate tabular data information from a PDF file.\n", + "\n", + "The following bullets summarize the notebook, with highlights being:\n", + "\n", + "- Parse the PDF file and extract tables into images (optional).\n", + "- A single RAG agent fails to get the accurate information from tabular data.\n", + "- An agentic workflow using a groupchat is able to extract information accurately:\n", + " - the agentic workflow uses a RAG agent to extract document metadata (e.g. the image of a data table using just the table name)\n", + " - the table image is converted to Markdown through a multi-modal agent\n", + " - finally, an assistant agent answers the original question with an LLM" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "````{=mdx}\n", + ":::info Requirements\n", + "Unstructured-IO is a dependency for this notebook to parse the PDF. 
Please install AG2 (with the neo4j extra) and the dependencies:\n", + "\n", + "- Install Poppler https://pdf2image.readthedocs.io/en/latest/installation.html\n", + "- Install Tesseract https://tesseract-ocr.github.io/tessdoc/Installation.html\n", + "- pip install ag2[neo4j], unstructured==0.16.11, pi-heif==0.21.0, unstructured_inference==0.8.1, unstructured.pytesseract==0.3.13, pytesseract==0.3.13\n", + ":::\n", + "````\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set Configuration and OpenAI API Key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import autogen\n", + "\n", + "config_list = autogen.config_list_from_json(\n", + " \"OAI_CONFIG_LIST\",\n", + " filter_dict={\n", + " \"model\": [\"gpt-4o\"],\n", + " },\n", + ")\n", + "os.environ[\"OPENAI_API_KEY\"] = config_list[0][\"api_key\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parse PDF file\n", + "\n", + "**Skip and use parsed files to run the rest.**\n", + "This step is expensive and time-consuming; please skip it if you don't need to generate the full data set. The **estimated cost is from $10 to $15 to parse the pdf file and build the knowledge graph with the entire parsed output**.\n", + "\n", + "For the notebook, we use a common financial document, [Nvidia 2024 10-K](https://investor.nvidia.com/financial-info/sec-filings/sec-filings-details/default.aspx?FilingId=17293267) as an example ([file download link](https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf)).\n", + "\n", + "We use Unstructured-IO to parse the PDF; the tables and images from the PDF are extracted as .jpg files.\n", + "\n", + "All parsed output is saved in a JSON file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from unstructured.partition.pdf import partition_pdf\n", + "from unstructured.staging.base import elements_to_json\n", + "\n", + "file_elements = partition_pdf(\n", + " filename=\"./input_files/nvidia_10k_2024.pdf\",\n", + " strategy=\"hi_res\",\n", + " languages=[\"eng\"],\n", + " infer_table_structure=True,\n", + " extract_images_in_pdf=True,\n", + " extract_image_block_output_dir=\"./parsed_pdf_info\",\n", + " extract_image_block_types=[\"Image\", \"Table\"],\n", + " extract_forms=False,\n", + " form_extraction_skip_tables=False,\n", + ")\n", + "\n", + "elements_to_json(elements=file_elements, filename=\"parsed_elements.json\", encoding=\"utf-8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create sample dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "output_elements = []\n", + "keys_to_extract = [\"element_id\", \"text\", \"type\"]\n", + "metadata_keys = [\"page_number\", \"parent_id\", \"image_path\"]\n", + "text_types = set([\"Text\", \"UncategorizedText\", \"NarrativeText\"])\n", + "element_length = len(file_elements)\n", + "for idx in range(element_length):\n", + " data = file_elements[idx].to_dict()\n", + " new_data = {key: data[key] for key in keys_to_extract}\n", + " metadata = data[\"metadata\"]\n", + " for key in metadata_keys:\n", + " if key in metadata:\n", + " new_data[key] = metadata[key]\n", + " if data[\"type\"] == \"Table\":\n", + " if idx > 0:\n", + " pre_data = file_elements[idx - 1].to_dict()\n", + " if pre_data[\"type\"] in text_types:\n", + " new_data[\"text\"] = pre_data[\"text\"] + new_data[\"text\"]\n", + " if idx < element_length - 1:\n", + " post_data = file_elements[idx + 1].to_dict()\n", + " if post_data[\"type\"] in text_types:\n", + " new_data[\"text\"] = new_data[\"text\"] + 
post_data[\"text\"]\n", + " output_elements.append(new_data)\n", + "\n", + "with open(\"proessed_elements.json\", \"w\", encoding=\"utf-8\") as file:\n", + " json.dump(output_elements, file, indent=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports\n", + "\n", + "**If you want to skip the parsing of the PDF file, you can start here.**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# This is needed to allow nested asyncio calls for Neo4j in Jupyter\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "from llama_index.embeddings.openai import OpenAIEmbedding\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "from autogen import AssistantAgent, ConversableAgent, UserProxyAgent\n", + "\n", + "# load documents\n", + "from autogen.agentchat.contrib.graph_rag.document import Document, DocumentType\n", + "from autogen.agentchat.contrib.graph_rag.neo4j_graph_query_engine import Neo4jGraphQueryEngine\n", + "from autogen.agentchat.contrib.graph_rag.neo4j_graph_rag_capability import Neo4jGraphCapability\n", + "from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a knowledge graph with sample data\n", + "\n", + "To save time and cost, we use a small subset of the data for the notebook.\n", + "\n", + "**This does not change the fact that the native RAG agent solution failed to provide the correct answer.**" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "22c02a975b784c5db13ea02163bd140a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Parsing nodes: 0%| | 0/1 [00:00.\".\n", + " For example, when you got message \"The image path for the table titled XYZ is 
\"./parsed_pdf_info/abcde\".\",\n", + " you will reply \"Please extract table from the following image and convert it to Markdown.\n", + " .\"\n", + " \"\"\",\n", + " llm_config=llm_config,\n", + " human_input_mode=\"NEVER\",\n", + ")\n", + "\n", + "image2table_convertor = MultimodalConversableAgent(\n", + " name=\"image2table_convertor\",\n", + " system_message=\"\"\"\n", + " You are an image to table convertor. You will process an image of one or multiple consecutive tables.\n", + " You need to follow the following steps in sequence,\n", + " 1. extract the complete table contents and structure.\n", + " 2. Make sure the structure is complete and no information is left out. Otherwise, start from step 1 again.\n", + " 3. Correct typos in the text fields.\n", + " 4. In the end, output the table(s) in Markdown.\n", + " \"\"\",\n", + " llm_config={\"config_list\": config_list, \"max_tokens\": 300},\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=1,\n", + ")\n", + "\n", + "conclusion = AssistantAgent(\n", + " name=\"conclusion\",\n", + " system_message=\"\"\"You are a helpful assistant.\n", + " Base on the history of the groupchat, answer the original question from User_proxy.\n", + " \"\"\",\n", + " llm_config=llm_config,\n", + " human_input_mode=\"NEVER\", # Never ask for human input.\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\n", + "What is goodwill asset (in millions) for 2024 in table NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: table_assistant\n", + "\u001b[0m\n", + "\u001b[33mtable_assistant\u001b[0m (to chat_manager):\n", + "\n", + "Find image_path for Table: NVIDIA Corporation and Subsidiaries Consolidated 
Balance Sheets\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: nvidia_rag\n", + "\u001b[0m\n", + "\u001b[33mnvidia_rag\u001b[0m (to chat_manager):\n", + "\n", + "The image path for the table titled \"NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets\" is \"./parsed_pdf_info/table-52-17.jpg\".\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: img_request_format\n", + "\u001b[0m\n", + "\u001b[33mimg_request_format\u001b[0m (to chat_manager):\n", + "\n", + "Please extract table from the following image and convert it to Markdown.\n", + ".\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: image2table_convertor\n", + "\u001b[0m\n", + "\u001b[33mimage2table_convertor\u001b[0m (to chat_manager):\n", + "\n", + "Here is the extracted table from the image in Markdown format:\n", + "\n", + "```markdown\n", + "| | Jan 28, 2024 | Jan 29, 2023 |\n", + "|------------------------------------------|--------------|--------------|\n", + "| **Assets** | | |\n", + "| Current assets: | | |\n", + "|     Cash and cash equivalents | $7,280 | $3,389 |\n", + "|     Marketable securities | $18,704 | $9,907 |\n", + "|     Accounts receivable, net | $9,999 | $3,827 |\n", + "|     Inventories | $5,282 | $5,159 |\n", + "|     Prepaid expenses and other current assets | $3,080 | $791 |\n", + "| Total current assets | $44,345 | $23,073 |\n", + "| Property and equipment, net | $3,914 | $3,807 |\n", + "| Operating lease assets | $1,346 | $1,038 |\n", + "| Goodwill | $4,430 | $4,372 |\n", + "| Intangible assets, net | $1,112 | $1,676 |\n", + "| Deferred income tax assets | $6,081 | $3,396 |\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: 
conclusion\n", + "\u001b[0m\n", + "\u001b[33mconclusion\u001b[0m (to chat_manager):\n", + "\n", + "The goodwill asset for NVIDIA Corporation as of January 28, 2024, is $4,430 million.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: User_proxy\n", + "\u001b[0m\n", + "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n", + "\n", + "What is the total current assets from the table NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: table_assistant\n", + "\u001b[0m\n", + "\u001b[33mtable_assistant\u001b[0m (to chat_manager):\n", + "\n", + "The total current assets from the table \"NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets\" are $44,345 million as of January 28, 2024.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: nvidia_rag\n", + "\u001b[0m\n", + "\u001b[33mnvidia_rag\u001b[0m (to chat_manager):\n", + "\n", + "Yes, that's correct! 
The total current assets for NVIDIA Corporation and Subsidiaries as of January 28, 2024, are $44,345 million, according to the table \"NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets.\" If you have any more questions or need further details, feel free to ask!\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: img_request_format\n", + "\u001b[0m\n", + "\u001b[33mimg_request_format\u001b[0m (to chat_manager):\n", + "\n", + "Great, if you have any more questions or need further clarification, feel free to ask!\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "groupchat = autogen.GroupChat(\n", + " agents=[\n", + " user_proxy,\n", + " table_assistant,\n", + " rag_agent,\n", + " img_request_format,\n", + " image2table_convertor,\n", + " conclusion,\n", + " ],\n", + " messages=[],\n", + " speaker_selection_method=\"round_robin\",\n", + ")\n", + "manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)\n", + "chat_result = user_proxy.initiate_chat(\n", + " manager,\n", + " message=\"What is goodwill asset (in millions) for 2024 in table NVIDIA Corporation and Subsidiaries Consolidated Balance Sheets?\",\n", + ")" + ] + } + ], + "metadata": { + "front_matter": { + "description": "Agentic RAG workflow on tabular data from a PDF file", + "tags": [ + "RAG", + "groupchat" + ] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index ccf26716f8..560fe7cb85 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -118,6 +118,7 @@ neo4j = [ "llama-index==0.12.5", "llama-index-graph-stores-neo4j==0.4.2", "llama-index-core==0.12.5", + "llama-index-readers-web==0.3.3", ] # used for agentchat_realtime_swarm notebook and realtime agent twilio demo diff --git a/website/mint.json b/website/mint.json index c54861d607..d81d773f2f 100644 --- a/website/mint.json +++ b/website/mint.json @@ -639,7 +639,8 @@ "notebooks/config_loader_utility_functions", "notebooks/gpt_assistant_agent_function_call", "notebooks/lats_search", - "notebooks/tools_interoperability" + "notebooks/tools_interoperability", + "notebooks/agentchat_tabular_data_rag_workflow" ] }, "notebooks/Gallery" diff --git a/website/snippets/data/NotebooksMetadata.mdx b/website/snippets/data/NotebooksMetadata.mdx index 7279592e2f..52c880a798 100644 --- a/website/snippets/data/NotebooksMetadata.mdx +++ b/website/snippets/data/NotebooksMetadata.mdx @@ -991,5 +991,16 @@ export const notebooksMetadata = [ "pydanticai" ], "source": "/notebook/tools_interoperability.ipynb" + }, + { + "title": "Agentic RAG workflow on tabular data from a PDF file", + "link": "/notebooks/agentchat_tabular_data_rag_workflow", + "description": "Agentic RAG workflow on tabular data from a PDF file", + "image": null, + "tags": [ + "RAG", + "groupchat" + ], + "source": "/notebook/agentchat_tabular_data_rag_workflow.ipynb" } ];