From f151c360d2a6b2894026f06dc11d829033976fca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joan=20Fabr=C3=A9gat?= <joan@codeinc.co>
Date: Fri, 20 Dec 2024 19:18:41 -0800
Subject: [PATCH] dev

---
 .github/workflows/docker-image.yml |   7 ++
 Dockerfile                         |   7 +-
 README.md                          |  25 +++---
 app.py                             | 124 +++++++++++++++++++++++++----
 requirements.txt                   |   1 +
 5 files changed, 140 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 0f3ed6f..b2f2fa8 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -33,6 +33,10 @@ jobs:
         with:
           images: codeinchq/pdf2txt
 
+      - name: Extract tag as version
+        id: extract_version
+        run: echo "VERSION=${GITHUB_REF##*/}" >> $GITHUB_ENV
+
       - name: Build and push Docker image
         uses: docker/build-push-action@v5
         with:
@@ -41,3 +45,6 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
+          build-args: |
+            BUILD_ID=${{ github.run_id }}
+            VERSION=${{ env.VERSION }}
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index a3a7117..4cde93c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,12 @@
 FROM python:3.11-slim
 
 ARG PORT=5000
-ENV debian_frontend=noninteractive
+ARG BUILD_ID=unknown
+ARG VERSION=unknown
+ENV PORT=$PORT
+ENV BUILD_ID=$BUILD_ID
+ENV VERSION=$VERSION
+ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
 
 # Set the working directory in the container
diff --git a/README.md b/README.md
index cf48c4e..e16631c 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![Docker Image CI](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml/badge.svg)](https://github.com/codeinchq/pdf2txt/actions/workflows/docker-image.yml)
 [![Docker Image Version](https://img.shields.io/docker/v/codeinchq/pdf2txt?sort=semver&label=Docker%20Hub&color=red)](https://hub.docker.com/r/codeinchq/pdf2txt/tags)
 
-This repository contains a simple containerized API to convert PDF documents to text using Python [MyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library. The API is built using [FastAPI](https://fastapi.tiangolo.com/).
+This repository contains a simple containerized API to convert PDF documents to text using the Python [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) and [pdfplumber](https://pypi.org/project/pdfplumber/) libraries. The API is built using [FastAPI](https://fastapi.tiangolo.com/).
 
 The image is available on [Docker Hub](https://hub.docker.com/r/codeinchq/pdf2txt) under the name `codeinchq/pdf2txt`.
 
@@ -17,14 +17,16 @@ By default, the container listens on port 3000. The port is configurable using t
 > [!IMPORTANT]  
 > The v2 parameters are slightly different from those of v1. For more information about the v1 parameters, [see here](https://github.com/codeinchq/pdf2txt/blob/v1.8/README.md#usage).
 
-All requests must by send in POST to the `/extract` endpoint with a `multipart/form-data` content type. The request must contain a PDF file with the key `file`. 
+All requests must be sent as POST requests to either the `/extract/fast` or `/extract/advanced` endpoint, using a `multipart/form-data` content type. Each request must include a PDF file with the key `file`.
 
-Additional parameters can be sent to customize the conversion process:
-* `first_page`: The first page to extract. Default is `1`.
+The first endpoint uses the [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) library for superfast text extraction, while the second endpoint uses the [pdfplumber](https://pypi.org/project/pdfplumber/) library for more advanced text and table extraction.
+
+Both endpoints accept the following parameters:
+* `first_page`: The first page to extract. Default is 1.
 * `last_page`: The last page to extract. Default is the last page of the document.
 * `password`: The password to unlock the PDF. Default is none.
 
-The server returns `200` if the conversion was successful and the images are available in the response body. In case of error, the server returns a `400` status code with a JSON object containing the error message (format: `{error: string}`).
+The server returns a `200` status code if the conversion is successful, with the extracted data available in the response body. In case of an error, the server returns a `400` status code along with a JSON object containing the error message in the format: `{error: string}`.
 
 ### Example
 
@@ -36,19 +38,24 @@ docker run -p "3000:3000" codeinchq/pdf2txt
 #### Step 2: convert a PDF file to text
 Convert a PDF file to text with a JSON response:
 ```bash
-curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3000/extract -o example.json
+curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3000/extract/fast -o example.json
 ```
 
 Extract a password-protected PDF file's text content as JSON and save it to a file:
 ```bash
-curl -X POST -F "file=@/path/to/file.pdf" -F "password=XXX" http://localhost:3000/extract -o example.json
+curl -X POST -F "file=@/path/to/file.pdf" "http://localhost:3000/extract/advanced?password=XXX" -o example.json
 ```
 
 ### Health check
 
-A health check is available at the `/health` endpoint. The server returns a status code of `200` if the service is healthy, along with a JSON object:
+A health check is available at the `/health` endpoint. The server returns a `200` status code if the service is healthy, along with a JSON object in the following format:
 ```json
-{ "status": "up" }
+{
+  "status": "ok",
+  "uptime": "0:00:00.000000",
+  "version": "1.0.0",
+  "build_id": "00000000"
+}
 ```
 
 ## Client
diff --git a/app.py b/app.py
index 94796db..9f1ae1d 100644
--- a/app.py
+++ b/app.py
@@ -6,20 +6,49 @@
 #  license that can be found in the LICENSE file or at
 #  https://opensource.org/licenses/MIT.
 
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.responses import JSONResponse
+from fastapi import FastAPI, File, UploadFile, HTTPException, Query
 from datetime import datetime
 import pymupdf
 from models import ExtractResponse
+import pdfplumber
+import io
+import os
 
 app = FastAPI()
+start_time = datetime.now()
 
 @app.get("/health")
 async def health():
-    return JSONResponse(content={"status": "up", "timestamp": datetime.now().isoformat()})
+    """
+    Health check endpoint to verify the service is up and running.
 
-@app.post("/extract", response_model=ExtractResponse)
-async def extract(file: UploadFile = File(...)):
+    :return: JSON response with service status, uptime, version, and build ID
+    """
+    return {
+        "status": "ok",
+        "uptime": str(datetime.now() - start_time),
+        "version": os.getenv("VERSION", "unknown"),
+        "build_id": os.getenv("BUILD_ID", "unknown")
+    }
+
+
+@app.post("/extract/fast", response_model=ExtractResponse)
+async def extract(
+    file: UploadFile = File(...),
+    first_page: int = Query(1, ge=1, description="The first page to extract."),
+    last_page: int = Query(None, ge=1, description="The last page to extract."),
+    password: str = Query(None, description="Password to unlock the PDF.")
+):
+    """
+    Extract text from PDF pages using PyMuPDF.
+
+    :param file: PDF file to extract text from
+    :param first_page: First page to extract (default 1)
+    :param last_page: Last page to extract (default last page)
+    :param password: Password to unlock the PDF (default none)
+
+    :return: Extracted text from each page
+    """
     if file.content_type != "application/pdf":
         raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.")
 
@@ -27,19 +56,86 @@ async def extract(file: UploadFile = File(...)):
     try:
         # Open the PDF with PyMuPDF
         doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
-        print(doc)
-        with doc:
-            pages_output = []
 
-            # Iterate through the pages and extract text
-            i = 1
-            for page in doc:
+        # Handle password-protected PDFs
+        if password:
+            if not doc.authenticate(password):
+                raise HTTPException(status_code=401, detail="Incorrect password for the PDF.")
+
+        # Default last page to the number of pages in the document
+        last_page = last_page or doc.page_count
+
+        if first_page < 1 or last_page > doc.page_count or first_page > last_page:
+            raise HTTPException(status_code=400, detail="Invalid page range.")
+
+        pages_output = []
+        with doc:
+            for i in range(first_page - 1, last_page):
+                page = doc[i]
                 pages_output.append({
-                    "page_number": i,
-                    "text": (page.get_text() or "")
+                    "page_number": i + 1,
+                    "text": page.get_text() or ""
                 })
 
-            return {"pages": pages_output}
+        return {"pages": pages_output}
 
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/extract/advanced", response_model=ExtractResponse)
+async def extract_advanced(
+    file: UploadFile = File(...),
+    first_page: int = Query(1, ge=1, description="The first page to extract."),
+    last_page: int = Query(None, ge=1, description="The last page to extract."),
+    password: str = Query(None, description="Password to unlock the PDF.")
+):
+    """
+    Extract text and tables from PDF pages using pdfplumber for better table extraction.
+
+    :param file: PDF file to extract text and tables from
+    :param first_page: First page to extract, 1-based (default 1)
+    :param last_page: Last page to extract, inclusive (default last page)
+    :param password: Password to unlock the PDF (default none)
+
+    :return: Extracted text and tables from each page
+    :raises HTTPException: 400 (not a PDF / invalid page range), 401 (missing or
+        incorrect password), 500 (any other extraction failure)
+    """
+    if file.content_type != "application/pdf":
+        raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.")
+
+    pdf_bytes = await file.read()
+    try:
+        # pdfplumber's PDF object has no is_encrypted/decrypt(); the password must be
+        # given to open(), which passes it on to pdfminer when loading the document.
+        with pdfplumber.open(io.BytesIO(pdf_bytes), password=password) as pdf:
+            page_count = len(pdf.pages)
+
+            # Default last page to the number of pages in the document
+            last_page = last_page or page_count
+
+            if first_page < 1 or last_page > page_count or first_page > last_page:
+                raise HTTPException(status_code=400, detail="Invalid page range.")
+
+            # Pages are 1-based in the API and 0-based in pdf.pages
+            return {
+                "pages": [
+                    {
+                        "page_number": number,
+                        "text": page.extract_text() or "",
+                        "tables": page.extract_tables()
+                    }
+                    for number, page in enumerate(pdf.pages[first_page - 1:last_page], start=first_page)
+                ]
+            }
+
+    except HTTPException:
+        # Keep the intended 4xx status instead of re-wrapping it as a 500 below
+        raise
+    except Exception as e:
+        # pdfminer reports a missing/wrong password as PDFPasswordIncorrect, which newer
+        # pdfplumber releases may wrap; match on the repr to cover both (TODO confirm)
+        if "PDFPasswordIncorrect" in repr(e):
+            raise HTTPException(status_code=401, detail="PDF password is missing or incorrect.")
+        raise HTTPException(status_code=500, detail=str(e))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e0fabd1..e35dfea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ python-multipart==0.0.20
 sniffio==1.3.1
 typing_extensions==4.12.2
 uvicorn==0.34.0
+pdfplumber~=0.11.4
\ No newline at end of file