From c873983300cefa8b728174be66d71b0eae9d1f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20Fabr=C3=A9gat?= Date: Thu, 19 Dec 2024 17:46:14 -0800 Subject: [PATCH] replaces `pdfplumber` by `pymupdf` --- .idea/copyright/Code_Inc_.xml | 2 +- Dockerfile | 7 ++-- README.md | 2 +- __pycache__/app.cpython-311.pyc | Bin 0 -> 2370 bytes __pycache__/models.cpython-311.pyc | Bin 0 -> 843 bytes app.py | 52 ++++++++++++++--------------- models.py | 15 +++++++++ requirements.txt | 17 +++++++--- 8 files changed, 57 insertions(+), 38 deletions(-) create mode 100644 __pycache__/app.cpython-311.pyc create mode 100644 __pycache__/models.cpython-311.pyc create mode 100644 models.py diff --git a/.idea/copyright/Code_Inc_.xml b/.idea/copyright/Code_Inc_.xml index 0c466f8..739feeb 100644 --- a/.idea/copyright/Code_Inc_.xml +++ b/.idea/copyright/Code_Inc_.xml @@ -1,6 +1,6 @@ - \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 9681607..599f224 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,8 @@ -FROM python:3.9-slim +FROM python:3.11-slim ARG PORT=5000 ENV debian_frontend=noninteractive ENV PYTHONUNBUFFERED=1 -ENV NLTK_DATA=/usr/share/nltk_data WORKDIR /app @@ -13,9 +12,7 @@ RUN apt-get update && apt-get install -y \ COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt && \ - mkdir -p /usr/share/nltk_data && \ - python -m nltk.downloader -d $NLTK_DATA punkt_tab +RUN pip install --no-cache-dir -r requirements.txt COPY app.py . COPY entrypoint.sh . 
diff --git a/README.md b/README.md index fa375b8..099071a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Docker Image CI](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml/badge.svg)](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml) [![Docker Image Version](https://img.shields.io/docker/v/codeinchq/pdf2txt?sort=semver&label=Docker%20Hub&color=red)](https://hub.docker.com/r/codeinchq/pdf2txt/tags) -This repository contains a simple containerized API to convert PDF documents to text using Python [pdfplumber](https://pypi.org/project/pdfplumber/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/). +This repository contains a simple containerized API to convert PDF documents to text using the Python [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/). The image is available on [Docker Hub](https://hub.docker.com/r/codeinchq/pdf2txt) under the name `codeinchq/pdf2txt`. 
diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..329e0d174bd05faaed30423c40df1d55b3505067 GIT binary patch literal 2370 zcmaJCU1$_XcxGpB_x{a~npBfEX%sod97-%LNdp=UwJoU_6?3f|>+K|NyuDp^=g3bl zB~_^@_+U%Hwg|mKv6kk+LJMuN54A6azHH>cV?YQLp}w7HORz8fW-ph^MduDWxQnq=+e}WGwcjY`|1f zijR}5YN{#K$K`A=6@>FZHe`lVVT>e1@ePpnZA}MCMQEfLU0be0bmayHN_eY7S1+Lr zyfNU7*Ur~}^&kx|hHgl}4{vpVZb=U}${fdfgpzSdk6sFFkiAKlM^Pg7P%tNCt`2K1 zJ9z9Umxqlk<>8Zd*3xtzkG*!{#IYmO8EP}b$|W$5zCQZa$Z_i0R?elc4(b}C%rNN) zEH)jPW{#F&wewjud5&sXb}qAOr~iRipo{Q6b^yGAF!-OXc>+Ns81d{qjv`8Ag0{50jxRkdO5?5TNvAoMe!hXO_8}6WHtQ?~`mXIA41Uo?hoaC%2 z9x_~O+;U8feS;ngoljp#o^+|>CMPW|mmIQmnzZ%t{cM^gHQVmBXSh3rR>x$m-cSMSfkmZJ!<@t-HA;jEQe(t0 z>>PkBnLu+ySLft{bQqoKoeLC5QK@s)v4UI(6xZplpETNQV?C&cu0NygN>UFOh=DT_ zR5v`0wqi7jdeA=P%2U!b8ADSTqcN~f^aOFjn(F#?Ag5QGR??x0_@c~aHwYTpkQS)Hhm}(i6g;gYZEk`%bRDZ!(|2;$OB@_HJ$)X zBFv+!<&qYBjtB^sL72`}d&bOzd@fry%>kEVlTe+l zLP!W$(`RQGbve2b^=yWM@kS0eFN+bL$fU zU|9oUw58+Icf8i!mA0Cqy2?jF`I9vZ>IWOJm6eqZI~$fOe{Vz4Efs_V19jj#?T4Ws z@pr@D4}0uN7hq}p~!RLoM$Zs71z!Obe9f#kNW*d%ZRF3E%j`(#{ldO65u(#F;-Fmxr zb^sYy{MP6?A+aEkcbq`d`K=g|$AVlF`QeDR>^E3Hd7cIbfGwY;gU-uvAnL%a0Dx;@ zTxmi0RSymP|16{3UgIpIPOov6QGDKi%IHO}aV{&d`HA<9OU67|#`5L13vCO}E#cM@ zZmmcdzl|Xd03Q(~ZLaSHkrcSxexZF~cQHNRzJw2y@PYEqMDfJUo-fC~8mpi-3;_i2 VX}FI2m=N?5P*AxL~Io7K%tfMU=DHW>_<0SlM6BOd#nZSSVOr zcmIJR{wX%OE=+4>w@0oz8{h0En3$V=`|UUH&3p6aoBh^mH2}w-pL{SP0Ke>HIlKj# zzeaKd95{&~Aps$lJF%0vflI)FC&1m0z^Nc#U4S3>RwJNg%)?l1Bj)i7ub^B1ZI^b5 z*?Jf%@hsya#tD5IDcxUGDL>1a^P4z50s#Tx5I6@QTtPY6N3ZaZIhJu3D^o*Xgacuk zIU-Ak$r~XJ)ncTdcWb83SQ;jRG1Fiy$@nm~{32uThhe;kxKXN2hp{kCGac$EOBJqh zm6qROZo4|$^AD7e%72%Iss9L@_jCUCt{!P0ONvCUXtn)D{U5BN@c^3pQzd>w9M_p>PQyMH|SCDO_K*8SGBit)LER f7qAJR?wwfg>Az5&kgEl3!eMk`y{G4d$|L>-5wp2W literal 0 HcmV?d00001 diff --git a/app.py b/app.py index 84624ae..a5a4cff 100644 --- a/app.py +++ b/app.py @@ -1,17 +1,16 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 Code Inc. 
- All Rights Reserved -# Unauthorized copying of this file, via any medium is strictly prohibited -# Proprietary and confidential -# Visit for more information - -import io -import pdfplumber -from fastapi import FastAPI, File, UploadFile, HTTPException, Query +# Copyright 2024 Code Inc. +# +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +from fastapi import FastAPI, File, UploadFile, HTTPException from fastapi.responses import JSONResponse -from typing import Optional -import nltk from datetime import datetime +import pymupdf +from models import ExtractResponse app = FastAPI() @@ -19,30 +18,29 @@ async def health(): return JSONResponse(content={"status": "up", "timestamp": datetime.now().isoformat()}) -@app.post("/extract") -async def extract( - file: UploadFile = File(...), - tokenize: Optional[bool] = Query(False, description="If true, return sentence tokenization.") -): + +@app.post("/extract", response_model=ExtractResponse) +async def extract(file: UploadFile = File(...)): if file.content_type != "application/pdf": raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.") pdf_bytes = await file.read() try: - with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: + # Open the PDF with PyMuPDF + doc = pymupdf.open(stream=pdf_bytes, filetype="pdf") + print(doc) + with doc: pages_output = [] - for page_num, page in enumerate(pdf.pages, start=1): - text = page.extract_text() or "" - page_dict = { - "page_number": page_num, - "text": text - } - if tokenize: - # Tokenize text into sentences - sentences = nltk.sent_tokenize(text) - page_dict["sentences"] = sentences - pages_output.append(page_dict) + + # Iterate through the pages and extract text + i = 1 + for page in doc: + pages_output.append({ + "page_number": i, + "text": (page.get_text() or "") + }) return {"pages": pages_output} + except Exception as e: raise HTTPException(status_code=500, 
detail=str(e)) diff --git a/models.py b/models.py new file mode 100644 index 0000000..633af72 --- /dev/null +++ b/models.py @@ -0,0 +1,15 @@ +# Copyright 2024 Code Inc. +# +# Use of this source code is governed by an MIT-style +# license that can be found in the LICENSE file or at +# https://opensource.org/licenses/MIT. + +from pydantic import BaseModel +from typing import List + +class Page(BaseModel): + page_number: int + text: str + +class ExtractResponse(BaseModel): + pages: List[Page] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 616f1e0..c3f06ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,14 @@ -fastapi==0.109.1 -uvicorn==0.22.0 -pdfplumber==0.9.0 +annotated-types==0.7.0 +anyio==4.7.0 +click==8.1.7 +fastapi==0.115.6 +h11==0.14.0 +idna==3.10 +pydantic==2.10.4 +pydantic_core==2.27.2 +PyMuPDF==1.25.1 python-multipart==0.0.20 -nltk~=3.9.1 \ No newline at end of file +sniffio==1.3.1 +starlette==0.42.0 +typing_extensions==4.12.2 +uvicorn==0.34.0