diff --git a/.idea/copyright/Code_Inc_.xml b/.idea/copyright/Code_Inc_.xml index 0c466f8..739feeb 100644 --- a/.idea/copyright/Code_Inc_.xml +++ b/.idea/copyright/Code_Inc_.xml @@ -1,6 +1,6 @@ - \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 9681607..599f224 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,8 @@ -FROM python:3.9-slim +FROM python:3.11-slim ARG PORT=5000 ENV debian_frontend=noninteractive ENV PYTHONUNBUFFERED=1 -ENV NLTK_DATA=/usr/share/nltk_data WORKDIR /app @@ -13,9 +12,7 @@ RUN apt-get update && apt-get install -y \ COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt && \ - mkdir -p /usr/share/nltk_data && \ - python -m nltk.downloader -d $NLTK_DATA punkt_tab +RUN pip install --no-cache-dir -r requirements.txt COPY app.py . COPY entrypoint.sh . diff --git a/README.md b/README.md index fa375b8..099071a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Docker Image CI](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml/badge.svg)](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml) [![Docker Image Version](https://img.shields.io/docker/v/codeinchq/pdf2txt?sort=semver&label=Docker%20Hub&color=red)](https://hub.docker.com/r/codeinchq/pdf2txt/tags) -This repository contains a simple containerized API to convert PDF documents to text using Python [pdfplumber](https://pypi.org/project/pdfplumber/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/). +This repository contains a simple containerized API to convert PDF documents to text using the Python [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/). The image is available on [Docker Hub](https://hub.docker.com/r/codeinchq/pdf2txt) under the name `codeinchq/pdf2txt`. 
diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc
new file mode 100644
index 0000000..329e0d1
Binary files /dev/null and b/__pycache__/app.cpython-311.pyc differ
diff --git a/__pycache__/models.cpython-311.pyc b/__pycache__/models.cpython-311.pyc
new file mode 100644
index 0000000..650cc56
Binary files /dev/null and b/__pycache__/models.cpython-311.pyc differ
diff --git a/app.py b/app.py
index 84624ae..a5a4cff 100644
--- a/app.py
+++ b/app.py
@@ -1,17 +1,16 @@
 #!/usr/bin/env python3
-# Copyright (c) 2024 Code Inc. - All Rights Reserved
-# Unauthorized copying of this file, via any medium is strictly prohibited
-# Proprietary and confidential
-# Visit for more information
-
-import io
-import pdfplumber
-from fastapi import FastAPI, File, UploadFile, HTTPException, Query
+# Copyright 2024 Code Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
-from typing import Optional
-import nltk
 from datetime import datetime
+import pymupdf
+from models import ExtractResponse
 
 app = FastAPI()
 
@@ -19,30 +18,34 @@ async def health():
     return JSONResponse(content={"status": "up", "timestamp": datetime.now().isoformat()})
 
 
-@app.post("/extract")
-async def extract(
-        file: UploadFile = File(...),
-        tokenize: Optional[bool] = Query(False, description="If true, return sentence tokenization.")
-):
+
+@app.post("/extract", response_model=ExtractResponse)
+async def extract(file: UploadFile = File(...)):
+    """Extract the text of every page of an uploaded PDF.
+
+    Returns ``{"pages": [{"page_number": int, "text": str}, ...]}`` with
+    1-based page numbers. Responds 400 when the upload's content type is not
+    ``application/pdf`` and 500 when opening or reading the document raises.
+    """
     if file.content_type != "application/pdf":
         raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.")
 
     pdf_bytes = await file.read()
 
     try:
-        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
+        # Open the PDF with PyMuPDF; the `with` block below closes it on exit
+        doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
+        with doc:
             pages_output = []
-            for page_num, page in enumerate(pdf.pages, start=1):
-                text = page.extract_text() or ""
-                page_dict = {
-                    "page_number": page_num,
-                    "text": text
-                }
-                if tokenize:
-                    # Tokenize text into sentences
-                    sentences = nltk.sent_tokenize(text)
-                    page_dict["sentences"] = sentences
-                pages_output.append(page_dict)
+
+            # Iterate through the pages and extract text; enumerate() supplies
+            # the 1-based page number so it advances on every iteration
+            for i, page in enumerate(doc, start=1):
+                pages_output.append({
+                    "page_number": i,
+                    "text": (page.get_text() or "")
+                })
             return {"pages": pages_output}
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..633af72
--- /dev/null
+++ b/models.py
@@ -0,0 +1,15 @@
+# Copyright 2024 Code Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+
+from pydantic import BaseModel
+from typing import List
+
+class Page(BaseModel):
+    page_number: int
+    text: str
+
+class ExtractResponse(BaseModel):
+    pages: List[Page]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 616f1e0..c3f06ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,14 @@
-fastapi==0.109.1
-uvicorn==0.22.0
-pdfplumber==0.9.0
+annotated-types==0.7.0
+anyio==4.7.0
+click==8.1.7
+fastapi==0.115.6
+h11==0.14.0
+idna==3.10
+pydantic==2.10.4
+pydantic_core==2.27.2
+PyMuPDF==1.25.1
 python-multipart==0.0.20
-nltk~=3.9.1
\ No newline at end of file
+sniffio==1.3.1
+starlette==0.41.3
+typing_extensions==4.12.2
+uvicorn==0.34.0