replaces pdfplumber by pymupdf

codeinchq · Dec 20, 2024 · c873983 · c873983
1 parent 9b830a6
commit c873983
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 38 deletions.
diff --git a/.idea/copyright/Code_Inc_.xml b/.idea/copyright/Code_Inc_.xml
diff --git a/Dockerfile b/Dockerfile
@@ -1,9 +1,8 @@
-FROM python:3.9-slim
+FROM python:3.11-slim
 
 ARG PORT=5000
 ENV debian_frontend=noninteractive
 ENV PYTHONUNBUFFERED=1
-ENV NLTK_DATA=/usr/share/nltk_data
 
 WORKDIR /app
 
@@ -13,9 +12,7 @@ RUN apt-get update && apt-get install -y \
 
 COPY requirements.txt .
 
-RUN pip install --no-cache-dir -r requirements.txt && \
-    mkdir -p /usr/share/nltk_data && \
-    python -m nltk.downloader -d $NLTK_DATA punkt_tab
+RUN pip install --no-cache-dir -r requirements.txt
 
 COPY app.py .
 COPY entrypoint.sh .

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 [![Docker Image CI](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml/badge.svg)](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml)
 [![Docker Image Version](https://img.shields.io/docker/v/codeinchq/pdf2txt?sort=semver&label=Docker%20Hub&color=red)](https://hub.docker.com/r/codeinchq/pdf2txt/tags)
 
-This repository contains a simple containerized API to convert PDF documents to text using Python [pdfplumber](https://pypi.org/project/pdfplumber/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/).
+This repository contains a simple containerized API to convert PDF documents to text using the Python [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/).
 
 The image is available on [Docker Hub](https://hub.docker.com/r/codeinchq/pdf2txt) under the name `codeinchq/pdf2txt`.
 

diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc
diff --git a/__pycache__/models.cpython-311.pyc b/__pycache__/models.cpython-311.pyc
diff --git a/app.py b/app.py
@@ -1,48 +1,46 @@
 #!/usr/bin/env python3
 
-#  Copyright (c) 2024 Code Inc. - All Rights Reserved
-#  Unauthorized copying of this file, via any medium is strictly prohibited
-#  Proprietary and confidential
-#  Visit <https://www.codeinc.co> for more information
-
-import io
-import pdfplumber
-from fastapi import FastAPI, File, UploadFile, HTTPException, Query
+#  Copyright 2024 Code Inc. <https://www.codeinc.co>
+#
+#  Use of this source code is governed by an MIT-style
+#  license that can be found in the LICENSE file or at
+#  https://opensource.org/licenses/MIT.
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
-from typing import Optional
-import nltk
 from datetime import datetime
+import pymupdf
+from models import ExtractResponse
 
 app = FastAPI()
 
 @app.get("/health")
 async def health():
     return JSONResponse(content={"status": "up", "timestamp": datetime.now().isoformat()})
 
-@app.post("/extract")
-async def extract(
-    file: UploadFile = File(...),
-    tokenize: Optional[bool] = Query(False, description="If true, return sentence tokenization.")
-):
+
+@app.post("/extract", response_model=ExtractResponse)
+async def extract(file: UploadFile = File(...)):
+    """Return the text of each page of the uploaded PDF, numbered from 1."""
     if file.content_type != "application/pdf":
         raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.")
 
     pdf_bytes = await file.read()
     try:
-        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+        # Open the PDF with PyMuPDF
+        doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
+        with doc:
             pages_output = []
-            for page_num, page in enumerate(pdf.pages, start=1):
-                text = page.extract_text() or ""
-                page_dict = {
-                    "page_number": page_num,
-                    "text": text
-                }
-                if tokenize:
-                    # Tokenize text into sentences
-                    sentences = nltk.sent_tokenize(text)
-                    page_dict["sentences"] = sentences
-                pages_output.append(page_dict)
+
+            # Iterate through the pages and extract text; enumerate()
+            # supplies the 1-based page number reported for each page
+            for page_number, page in enumerate(doc, start=1):
+                pages_output.append({
+                    "page_number": page_number,
+                    "text": (page.get_text() or "")
+                })
 
             return {"pages": pages_output}
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
diff --git a/models.py b/models.py
@@ -0,0 +1,15 @@
+#  Copyright 2024 Code Inc. <https://www.codeinc.co>
+#
+#  Use of this source code is governed by an MIT-style
+#  license that can be found in the LICENSE file or at
+#  https://opensource.org/licenses/MIT.
+
+from pydantic import BaseModel
+from typing import List
+
+class Page(BaseModel):
+    page_number: int
+    text: str
+
+class ExtractResponse(BaseModel):
+    pages: List[Page]
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,14 @@
-fastapi==0.109.1
-uvicorn==0.22.0
-pdfplumber==0.9.0
+annotated-types==0.7.0
+anyio==4.7.0
+click==8.1.7
+fastapi==0.115.6
+h11==0.14.0
+idna==3.10
+pydantic==2.10.4
+pydantic_core==2.27.2
+PyMuPDF==1.25.1
 python-multipart==0.0.20
-nltk~=3.9.1
+sniffio==1.3.1
+starlette==0.42.0
+typing_extensions==4.12.2
+uvicorn==0.34.0