diff --git a/.idea/copyright/Code_Inc_.xml b/.idea/copyright/Code_Inc_.xml
index 0c466f8..739feeb 100644
--- a/.idea/copyright/Code_Inc_.xml
+++ b/.idea/copyright/Code_Inc_.xml
@@ -1,6 +1,6 @@
-
+
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 9681607..599f224 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,8 @@
-FROM python:3.9-slim
+FROM python:3.11-slim
ARG PORT=5000
ENV debian_frontend=noninteractive
ENV PYTHONUNBUFFERED=1
-ENV NLTK_DATA=/usr/share/nltk_data
WORKDIR /app
@@ -13,9 +12,7 @@ RUN apt-get update && apt-get install -y \
COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt && \
- mkdir -p /usr/share/nltk_data && \
- python -m nltk.downloader -d $NLTK_DATA punkt_tab
+RUN pip install --no-cache-dir -r requirements.txt
COPY app.py .
COPY entrypoint.sh .
diff --git a/README.md b/README.md
index fa375b8..099071a 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
[![Docker Image CI](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml/badge.svg)](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml)
[![Docker Image Version](https://img.shields.io/docker/v/codeinchq/pdf2txt?sort=semver&label=Docker%20Hub&color=red)](https://hub.docker.com/r/codeinchq/pdf2txt/tags)
-This repository contains a simple containerized API to convert PDF documents to text using Python [pdfplumber](https://pypi.org/project/pdfplumber/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/).
+This repository contains a simple containerized API to convert PDF documents to text using the Python [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/).
The image is available on [Docker Hub](https://hub.docker.com/r/codeinchq/pdf2txt) under the name `codeinchq/pdf2txt`.
diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc
new file mode 100644
index 0000000..329e0d1
Binary files /dev/null and b/__pycache__/app.cpython-311.pyc differ
diff --git a/__pycache__/models.cpython-311.pyc b/__pycache__/models.cpython-311.pyc
new file mode 100644
index 0000000..650cc56
Binary files /dev/null and b/__pycache__/models.cpython-311.pyc differ
diff --git a/app.py b/app.py
index 84624ae..a5a4cff 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,16 @@
#!/usr/bin/env python3
-# Copyright (c) 2024 Code Inc. - All Rights Reserved
-# Unauthorized copying of this file, via any medium is strictly prohibited
-# Proprietary and confidential
-# Visit for more information
-
-import io
-import pdfplumber
-from fastapi import FastAPI, File, UploadFile, HTTPException, Query
+# Copyright 2024 Code Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
-from typing import Optional
-import nltk
from datetime import datetime
+import pymupdf
+from models import ExtractResponse
app = FastAPI()
@@ -19,30 +18,29 @@
async def health():
return JSONResponse(content={"status": "up", "timestamp": datetime.now().isoformat()})
-@app.post("/extract")
-async def extract(
- file: UploadFile = File(...),
- tokenize: Optional[bool] = Query(False, description="If true, return sentence tokenization.")
-):
+
+@app.post("/extract", response_model=ExtractResponse)
+async def extract(file: UploadFile = File(...)):
+    """Extract the text of every page of an uploaded PDF.
+
+    Returns a dict of the form
+    {"pages": [{"page_number": int, "text": str}, ...]} where page numbers
+    start at 1 and increase by one per page.
+
+    Raises HTTPException 400 when the upload is not declared as
+    application/pdf, and HTTPException 500 when the document cannot be
+    opened or read.
+    """
     if file.content_type != "application/pdf":
         raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.")
     pdf_bytes = await file.read()
     try:
-        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+        # Open the PDF with PyMuPDF; the context manager closes it on exit.
+        with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
             pages_output = []
-            for page_num, page in enumerate(pdf.pages, start=1):
-                text = page.extract_text() or ""
-                page_dict = {
-                    "page_number": page_num,
-                    "text": text
-                }
-                if tokenize:
-                    # Tokenize text into sentences
-                    sentences = nltk.sent_tokenize(text)
-                    page_dict["sentences"] = sentences
-                pages_output.append(page_dict)
+
+            # Iterate through the pages and extract text. enumerate() keeps
+            # the 1-based page number in step with the page being read.
+            for page_number, page in enumerate(doc, start=1):
+                pages_output.append({
+                    "page_number": page_number,
+                    "text": (page.get_text() or "")
+                })
             return {"pages": pages_output}
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..633af72
--- /dev/null
+++ b/models.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Code Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+from pydantic import BaseModel
+from typing import List
+
+class Page(BaseModel):
+ """Text extracted from a single page of a PDF document."""
+ # Page number reported for this page in the extraction result.
+ page_number: int
+ # Text content extracted from the page.
+ text: str
+
+class ExtractResponse(BaseModel):
+ """Response body of the /extract endpoint, which declares it as its response_model."""
+ # One Page entry for each page the endpoint collected from the document.
+ pages: List[Page]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 616f1e0..c3f06ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,14 @@
-fastapi==0.109.1
-uvicorn==0.22.0
-pdfplumber==0.9.0
+annotated-types==0.7.0
+anyio==4.7.0
+click==8.1.7
+fastapi==0.115.6
+h11==0.14.0
+idna==3.10
+pydantic==2.10.4
+pydantic_core==2.27.2
+PyMuPDF==1.25.1
python-multipart==0.0.20
-nltk~=3.9.1
\ No newline at end of file
+sniffio==1.3.1
+starlette==0.42.0
+typing_extensions==4.12.2
+uvicorn==0.34.0