-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9b830a6
commit c873983
Showing
8 changed files
with
57 additions
and
38 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,46 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Copyright (c) 2024 Code Inc. - All Rights Reserved | ||
# Unauthorized copying of this file, via any medium is strictly prohibited | ||
# Proprietary and confidential | ||
# Visit <https://www.codeinc.co> for more information | ||
|
||
import io | ||
import pdfplumber | ||
from fastapi import FastAPI, File, UploadFile, HTTPException, Query | ||
# Copyright 2024 Code Inc. <https://www.codeinc.co> | ||
# | ||
# Use of this source code is governed by an MIT-style | ||
# license that can be found in the LICENSE file or at | ||
# https://opensource.org/licenses/MIT. | ||
|
||
from fastapi import FastAPI, File, UploadFile, HTTPException | ||
from fastapi.responses import JSONResponse | ||
from typing import Optional | ||
import nltk | ||
from datetime import datetime | ||
import pymupdf | ||
from models import ExtractResponse | ||
|
||
app = FastAPI() | ||
|
||
@app.get("/health")
async def health():
    """Report that the service is up, with the current local timestamp."""
    payload = {
        "status": "up",
        "timestamp": datetime.now().isoformat(),
    }
    return JSONResponse(content=payload)
|
||
@app.post("/extract") | ||
async def extract( | ||
file: UploadFile = File(...), | ||
tokenize: Optional[bool] = Query(False, description="If true, return sentence tokenization.") | ||
): | ||
|
||
@app.post("/extract", response_model=ExtractResponse)
async def extract(file: UploadFile = File(...)):
    """Extract the text of every page of an uploaded PDF.

    Args:
        file: The uploaded document; its declared content type must be
            ``application/pdf``.

    Returns:
        A dict of the form ``{"pages": [{"page_number": int, "text": str}]}``
        with pages numbered from 1 in document order.

    Raises:
        HTTPException: 400 when the upload is not declared as a PDF, 500 when
            the document cannot be opened or its text cannot be read.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Uploaded file must be a PDF.")

    pdf_bytes = await file.read()
    try:
        # Open the PDF from memory with PyMuPDF; the context manager closes
        # the document even if text extraction fails part-way through.
        with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
            # enumerate() supplies a 1-based page number that advances with
            # each page, so no manually maintained counter can go stale.
            pages_output = [
                {"page_number": page_number, "text": page.get_text() or ""}
                for page_number, page in enumerate(doc, start=1)
            ]
    except Exception as e:
        # Keep the original error as the cause for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"pages": pages_output}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Copyright 2024 Code Inc. <https://www.codeinc.co> | ||
# | ||
# Use of this source code is governed by an MIT-style | ||
# license that can be found in the LICENSE file or at | ||
# https://opensource.org/licenses/MIT. | ||
|
||
from pydantic import BaseModel | ||
from typing import List | ||
|
||
# A single page of an extracted document.
# (A `#` comment is used instead of a docstring so the generated schema
# description is left unchanged.)
class Page(BaseModel):
    # Number identifying the page (presumably 1-based; confirm with producer).
    page_number: int
    # Text content of the page.
    text: str
|
||
# Response model wrapping the list of pages of a document.
class ExtractResponse(BaseModel):
    # The document's pages, each described by a `Page`.
    pages: List[Page]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,14 @@ | ||
fastapi==0.109.1 | ||
uvicorn==0.22.0 | ||
pdfplumber==0.9.0 | ||
annotated-types==0.7.0 | ||
anyio==4.7.0 | ||
click==8.1.7 | ||
fastapi==0.115.6 | ||
h11==0.14.0 | ||
idna==3.10 | ||
pydantic==2.10.4 | ||
pydantic_core==2.27.2 | ||
PyMuPDF==1.25.1 | ||
python-multipart==0.0.20 | ||
nltk~=3.9.1 | ||
sniffio==1.3.1 | ||
starlette==0.42.0 | ||
typing_extensions==4.12.2 | ||
uvicorn==0.34.0 |