From f151c360d2a6b2894026f06dc11d829033976fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joan=20Fabr=C3=A9gat?= Date: Fri, 20 Dec 2024 19:18:41 -0800 Subject: [PATCH] dev --- .github/workflows/docker-image.yml | 7 ++ Dockerfile | 7 +- README.md | 25 +++--- app.py | 124 +++++++++++++++++++++++++---- requirements.txt | 1 + 5 files changed, 140 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 0f3ed6f..b2f2fa8 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -33,6 +33,10 @@ jobs: with: images: codeinchq/pdf2txt + - name: Extract tag as version + id: extract_version + run: echo "VERSION=${GITHUB_REF##*/}" >> $GITHUB_ENV + - name: Build and push Docker image uses: docker/build-push-action@v5 with: @@ -41,3 +45,6 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + build-args: | + BUILD_ID=${{ github.run_id }} + VERSION=${{ env.VERSION }} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index a3a7117..4cde93c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,12 @@ FROM python:3.11-slim ARG PORT=5000 -ENV debian_frontend=noninteractive +ARG BUILD_ID=unknown +ARG VERSION=unknown +ENV PORT=$PORT +ENV BUILD_ID=$BUILD_ID +ENV VERSION=$VERSION +ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED=1 # Set the working directory in the container diff --git a/README.md b/README.md index cf48c4e..e16631c 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Docker Image CI](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml/badge.svg)](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml) [![Docker Image Version](https://img.shields.io/docker/v/codeinchq/pdf2txt?sort=semver&label=Docker%20Hub&color=red)](https://hub.docker.com/r/codeinchq/pdf2txt/tags) -This repository contains a simple containerized API to convert PDF documents to text using Python 
[MyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/). +This repository contains a simple containerized API to convert PDF documents to text using Python [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) and [pdfplumber](https://pypi.org/project/pdfplumber/) libraries. The API is built using [FastAPI](https://fastapi.tiangolo.com/). The image is available on [Docker Hub](https://hub.docker.com/r/codeinchq/pdf2txt) under the name `codeinchq/pdf2txt`. @@ -17,14 +17,16 @@ By default, the container listens on port 3000. The port is configurable using t > [!IMPORTANT] > The v2 parameters are slightly different from those of v1. For more information about the v1 parameters, [see here](https://github.com/codeinchq/pdf2txt/blob/v1.8/README.md#usage). -All requests must by send in POST to the `/extract` endpoint with a `multipart/form-data` content type. The request must contain a PDF file with the key `file`. +All requests must be sent as POST requests to either the `/extract/fast` or `/extract/advanced` endpoint, using a `multipart/form-data` content type. Each request must include a PDF file with the key `file`. -Additional parameters can be sent to customize the conversion process: -* `first_page`: The first page to extract. Default is `1`. +The first endpoint uses the [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library for superfast text extraction, while the second endpoint uses the [pdfplumber](https://pypi.org/project/pdfplumber/) library for more advanced text and table extraction. + +Both endpoints accept the following parameters: +* `first_page`: The first page to extract. Default is 1. * `last_page`: The last page to extract. Default is the last page of the document. * `password`: The password to unlock the PDF. Default is none. -The server returns `200` if the conversion was successful and the images are available in the response body. 
In case of error, the server returns a `400` status code with a JSON object containing the error message (format: `{error: string}`). +The server returns a `200` status code if the conversion is successful, with the extracted data available in the response body. In case of an error, the server returns a `400` status code along with a JSON object containing the error message in the format: `{error: string}`. ### Example @@ -36,19 +38,24 @@ docker run -p "3000:3000" codeinchq/pdf2txt #### Step 2: convert a PDF file to text Convert a PDF file to text with a JSON response: ```bash -curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3000/extract -o example.json +curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3000/extract/fast -o example.json ``` Extract a password-protected PDF file's text content as JSON and save it to a file: ```bash -curl -X POST -F "file=@/path/to/file.pdf" -F "password=XXX" http://localhost:3000/extract -o example.json +curl -X POST -F "file=@/path/to/file.pdf" -F "password=XXX" http://localhost:3000/extract/advanced -o example.json ``` ### Health check -A health check is available at the `/health` endpoint. The server returns a status code of `200` if the service is healthy, along with a JSON object: +A health check is available at the `/health` endpoint. The server returns a `200` status code if the service is healthy, along with a JSON object in the following format: ```json -{ "status": "up" } +{ + "status": "ok", + "uptime": "0:00:00.000000", + "version": "1.0.0", + "build_id": "00000000" +} ``` ## Client diff --git a/app.py b/app.py index 94796db..9f1ae1d 100644 --- a/app.py +++ b/app.py @@ -6,20 +6,49 @@ # license that can be found in the LICENSE file or at # https://opensource.org/licenses/MIT. 
-from fastapi import FastAPI, File, UploadFile, HTTPException -from fastapi.responses import JSONResponse +from fastapi import FastAPI, File, UploadFile, HTTPException, Query from datetime import datetime import pymupdf from models import ExtractResponse +import pdfplumber +import io +import os app = FastAPI() +start_time = datetime.now() @app.get("/health") async def health(): - return JSONResponse(content={"status": "up", "timestamp": datetime.now().isoformat()}) + """ + Health check endpoint to verify the service is up and running. -@app.post("/extract", response_model=ExtractResponse) -async def extract(file: UploadFile = File(...)): + :return: JSON response with service status, uptime, version, and build ID + """ + return { + "status": "ok", + "uptime": str(datetime.now() - start_time), + "version": os.getenv("VERSION", "unknown"), + "build_id": os.getenv("BUILD_ID", "unknown") + } + + +@app.post("/extract/fast", response_model=ExtractResponse) +async def extract( + file: UploadFile = File(...), + first_page: int = Query(1, ge=1, description="The first page to extract."), + last_page: int = Query(None, ge=1, description="The last page to extract."), + password: str = Query(None, description="Password to unlock the PDF.") +): + """ + Extract text from PDF pages using PyMuPDF. 
+ + :param file: PDF file to extract text from + :param first_page: First page to extract (default 1) + :param last_page: Last page to extract (default last page) + :param password: Password to unlock the PDF (default none) + + :return: Extracted text from each page + """ if file.content_type != "application/pdf": raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.") @@ -27,19 +56,86 @@ async def extract(file: UploadFile = File(...)): try: # Open the PDF with PyMuPDF doc = pymupdf.open(stream=pdf_bytes, filetype="pdf") - print(doc) - with doc: - pages_output = [] - # Iterate through the pages and extract text - i = 1 - for page in doc: + # Handle password-protected PDFs + if password: + if not doc.authenticate(password): + raise HTTPException(status_code=401, detail="Incorrect password for the PDF.") + + # Default last page to the number of pages in the document + last_page = last_page or doc.page_count + + if first_page < 1 or last_page > doc.page_count or first_page > last_page: + raise HTTPException(status_code=400, detail="Invalid page range.") + + pages_output = [] + with doc: + for i in range(first_page - 1, last_page): + page = doc[i] pages_output.append({ - "page_number": i, - "text": (page.get_text() or "") + "page_number": i + 1, + "text": page.get_text() or "" }) - return {"pages": pages_output} + return {"pages": pages_output} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/extract/advanced", response_model=ExtractResponse) +async def extract_advanced( + file: UploadFile = File(...), + first_page: int = Query(1, ge=1, description="The first page to extract."), + last_page: int = Query(None, ge=1, description="The last page to extract."), + password: str = Query(None, description="Password to unlock the PDF.") +): + """ + Extract text and tables from PDF pages using pdfplumber for better table extraction. 
+ + :param file: PDF file to extract text and tables from + :param first_page: First page to extract (default 1) + :param last_page: Last page to extract (default last page) + :param password: Password to unlock the PDF (default none) + + :return: Extracted text and tables from each page + """ + if file.content_type != "application/pdf": + raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.") + + pdf_bytes = await file.read() + try: + pages_output = [] + + with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: + # Handle password-protected PDFs + if pdf.is_encrypted: + if not password: + raise HTTPException(status_code=401, detail="PDF is password-protected.") + pdf.decrypt(password) + + # Default last page to the number of pages in the document + last_page = last_page or len(pdf.pages) + + if first_page < 1 or last_page > len(pdf.pages) or first_page > last_page: + raise HTTPException(status_code=400, detail="Invalid page range.") + + for i in range(first_page - 1, last_page): + page = pdf.pages[i] + + # Extract text + text = page.extract_text() or "" + + # Extract tables + tables = page.extract_tables() + + pages_output.append({ + "page_number": i + 1, + "text": text, + "tables": tables + }) + + return {"pages": pages_output} + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e0fabd1..e35dfea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ python-multipart==0.0.20 sniffio==1.3.1 typing_extensions==4.12.2 uvicorn==0.34.0 +pdfplumber~=0.11.4 \ No newline at end of file