Skip to content

Commit

Permalink
Merge pull request #75 from datalogics-jacksonm/ocr-pdf
Browse files Browse the repository at this point in the history
PDFCLOUD-3933 | Create new examples for OCR -> Extract Text workflow
  • Loading branch information
datalogics-cgreen authored Jul 29, 2024
2 parents bed956c + 27a85ea commit 931d87b
Show file tree
Hide file tree
Showing 6 changed files with 413 additions and 0 deletions.
62 changes: 62 additions & 0 deletions DotNET/Complex Flow Examples/ocr-with-extract-text.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
using Newtonsoft.Json.Linq;
using System;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

/// <summary>
/// Sample showing a two-step pdfRest workflow: a PDF is uploaded to the
/// pdf-with-ocr-text route, and the output ID returned by that call is then sent
/// to the extracted-text route so the recognized text can be printed.
/// </summary>
class OcrWithExtractText
{
    private static readonly string apiKey = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // Your API key here

    /// <summary>
    /// Runs the OCR call followed by the text extraction call and writes each
    /// response, and finally the extracted text, to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static async Task Main(string[] args)
    {
        using (var httpClient = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") })
        {
            // Upload PDF for OCR
            var pdfByteArray = await File.ReadAllBytesAsync("/path/to/file.pdf");
            var pdfByteArrayContent = new ByteArrayContent(pdfByteArray);
            pdfByteArrayContent.Headers.TryAddWithoutValidation("Content-Type", "application/pdf");

            var ocrMultipartContent = new MultipartFormDataContent();
            ocrMultipartContent.Add(pdfByteArrayContent, "file", "file.pdf");

            var (ocrSucceeded, ocrResult) = await PostFormAsync(httpClient, "pdf-with-ocr-text", ocrMultipartContent);
            Console.WriteLine("OCR response received.");
            Console.WriteLine(ocrResult);

            if (!ocrSucceeded)
            {
                Console.Error.WriteLine("OCR request failed; skipping text extraction.");
                return;
            }

            string ocrPDFID = JObject.Parse(ocrResult).Value<string>("outputId");
            if (string.IsNullOrEmpty(ocrPDFID))
            {
                // Without this guard a missing ID would surface later as an
                // ArgumentNullException from Encoding.UTF8.GetBytes.
                Console.Error.WriteLine("OCR response did not contain an outputId; skipping text extraction.");
                return;
            }

            // Extract text from OCR'd PDF
            var extractTextMultipartContent = new MultipartFormDataContent();
            var byteArrayOption = new ByteArrayContent(Encoding.UTF8.GetBytes(ocrPDFID));
            extractTextMultipartContent.Add(byteArrayOption, "id");

            var (extractSucceeded, extractTextResult) = await PostFormAsync(httpClient, "extracted-text", extractTextMultipartContent);
            Console.WriteLine("Extract text response received.");
            Console.WriteLine(extractTextResult);

            if (!extractSucceeded)
            {
                Console.Error.WriteLine("Extract text request failed.");
                return;
            }

            string fullText = JObject.Parse(extractTextResult).Value<string>("fullText");

            Console.WriteLine("Extracted text:");
            Console.WriteLine(fullText);
        }
    }

    /// <summary>
    /// Posts a multipart form to <paramref name="route"/> (relative to the client's
    /// base address) with the Api-Key and JSON Accept headers set.
    /// </summary>
    /// <param name="httpClient">Client used to send the request.</param>
    /// <param name="route">Route to post to, e.g. "pdf-with-ocr-text".</param>
    /// <param name="form">Multipart body; it is disposed together with the request.</param>
    /// <returns>
    /// Whether the response carried a success status code, and the response body as text.
    /// </returns>
    private static async Task<(bool Succeeded, string Body)> PostFormAsync(
        HttpClient httpClient, string route, MultipartFormDataContent form)
    {
        // Disposing the request message also disposes the content assigned to it.
        using var request = new HttpRequestMessage(HttpMethod.Post, route);
        request.Headers.TryAddWithoutValidation("Api-Key", apiKey);
        request.Headers.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("application/json"));
        request.Content = form;

        using var response = await httpClient.SendAsync(request);
        var body = await response.Content.ReadAsStringAsync();
        return (response.IsSuccessStatusCode, body);
    }
}
106 changes: 106 additions & 0 deletions Java/Complex Flow Examples/OcrWithExtractText.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import io.github.cdimascio.dotenv.Dotenv;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import okhttp3.*;
import org.json.JSONObject;

/* In this sample, we will show how to convert a scanned document into a PDF with
* searchable and extractable text using Optical Character Recognition (OCR), and then
* extract that text from the newly created document.
*
* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
* output ID. Then, we will send the output ID to the /extracted-text route, which will
* return the newly added text.
*/

public class OcrWithExtractText {

// Specify the path to your PDF file here, or as the first argument when running the program.
private static final String DEFAULT_PDF_FILE_PATH = "/path/to/file.pdf";

// Specify your API key here, or in the environment variable PDFREST_API_KEY.
// You can also put the environment variable in a .env file.
private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";

public static void main(String[] args) {
File pdfFile;
if (args.length > 0) {
pdfFile = new File(args[0]);
} else {
pdfFile = new File(DEFAULT_PDF_FILE_PATH);
}

final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load();

final RequestBody pdfFileRequestBody =
RequestBody.create(pdfFile, MediaType.parse("application/pdf"));
RequestBody ocrRequestBody =
new MultipartBody.Builder()
.setType(MultipartBody.FORM)
.addFormDataPart("file", pdfFile.getName(), pdfFileRequestBody)
.addFormDataPart("output", "example_pdf-with-ocr-text_out")
.build();
Request ocrRequest =
new Request.Builder()
.header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
.url("https://api.pdfrest.com/pdf-with-ocr-text")
.post(ocrRequestBody)
.build();
try {
OkHttpClient ocrClient =
new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build();

Response ocrResponse = ocrClient.newCall(ocrRequest).execute();

System.out.println("Response status code: " + ocrResponse.code());
if (ocrResponse.body() != null) {
String ocrResponseString = ocrResponse.body().string();

JSONObject ocrJSON = new JSONObject(ocrResponseString);
if (ocrJSON.has("error")) {
System.out.println("Error during OCR call: " + ocrResponseString);
return;
}

String ocrPDFID = ocrJSON.get("outputId").toString();
System.out.println("Got the output ID: " + ocrPDFID);

RequestBody extractRequestBody =
new MultipartBody.Builder()
.setType(MultipartBody.FORM)
.addFormDataPart("id", ocrPDFID)
.build();
Request extractRequest =
new Request.Builder()
.header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
.url("https://api.pdfrest.com/extracted-text")
.post(extractRequestBody)
.build();
try {
OkHttpClient extractClient =
new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build();

Response extractResponse = extractClient.newCall(extractRequest).execute();

System.out.println("Response status code: " + extractResponse.code());
if (extractResponse.body() != null) {
String extractResponseString = extractResponse.body().string();

JSONObject extractJSON = new JSONObject(extractResponseString);
if (extractJSON.has("error")) {
System.out.println("Error during text extraction call: " + extractResponseString);
return;
}

System.out.println(extractJSON.getString("fullText"));
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
74 changes: 74 additions & 0 deletions JavaScript/Complex Flow Examples/ocr-with-extract-text.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
var axios = require("axios");
var FormData = require("form-data");
var fs = require("fs");

/* In this sample, we will show how to convert a scanned document into a PDF with
* searchable and extractable text using Optical Character Recognition (OCR), and then
* extract that text from the newly created document.
*
* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
* output ID. Then, we will send the output ID to the /extracted-text route, which will
* return the newly added text.
*/

var apiKey = "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // Replace with your API key

// Multipart form for the OCR call: the PDF to process plus the output name.
const ocrForm = new FormData();
ocrForm.append("file", fs.createReadStream("/path/to/file.pdf"), "file_name.pdf");
ocrForm.append("output", "example_pdf-with-ocr-text_out");

const ocrRequest = {
  method: "post",
  maxBodyLength: Infinity,
  url: "https://api.pdfrest.com/pdf-with-ocr-text",
  headers: {
    "Api-Key": apiKey,
    ...ocrForm.getHeaders(),
  },
  data: ocrForm,
};

// Logs the error body returned by the server when there is one, otherwise the error message.
function reportFailure(error) {
  console.log(error.response ? error.response.data : error.message);
}

// Sends the OCR'd document's ID to the extracted-text route and prints the text.
async function extractText(documentId) {
  const extractForm = new FormData();
  extractForm.append("id", documentId);

  const extractRequest = {
    method: "post",
    maxBodyLength: Infinity,
    url: "https://api.pdfrest.com/extracted-text",
    headers: {
      "Api-Key": apiKey,
      ...extractForm.getHeaders(),
    },
    data: extractForm,
  };

  console.log("Sending POST request to extract text endpoint...");
  try {
    const extractResult = await axios(extractRequest);
    console.log("Response status code: " + extractResult.status);

    if (extractResult.status !== 200) {
      console.log(extractResult.data);
      return;
    }
    console.log(extractResult.data.fullText);
  } catch (error) {
    reportFailure(error);
  }
}

// Runs OCR on the uploaded PDF, then hands the resulting output ID to extractText.
async function runOcrThenExtract() {
  console.log("Sending POST request to OCR endpoint...");
  try {
    const ocrResult = await axios(ocrRequest);
    console.log("Response status code: " + ocrResult.status);

    if (ocrResult.status !== 200) {
      console.log(ocrResult.data);
      return;
    }

    const ocrPDFID = ocrResult.data.outputId;
    console.log("Got the output ID: " + ocrPDFID);

    await extractText(ocrPDFID);
  } catch (error) {
    reportFailure(error);
  }
}

runOcrThenExtract();
72 changes: 72 additions & 0 deletions PHP/Complex Flow Examples/ocr-with-extract-text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<?php

require 'vendor/autoload.php';

use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Utils;

/* In this sample, we will show how to convert a scanned document into a PDF with
* searchable and extractable text using Optical Character Recognition (OCR), and then
* extract that text from the newly created document.
*
* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
* output ID. Then, we will send the output ID to the /extracted-text route, which will
* return the newly added text.
*/

// http_errors is disabled so that 4xx/5xx responses are returned rather than thrown;
// this lets the script print the status code and the error body sent by the API.
$client = new Client(['http_errors' => false]);

$headers = [
    'Api-Key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' // Replace with your API key
];

// Upload PDF for OCR
$pdfToOCROptions = [
    'multipart' => [
        [
            'name' => 'file',
            'contents' => Utils::tryFopen('/path/to/file.pdf', 'r'),
            'filename' => 'file.pdf',
            'headers' => [
                'Content-Type' => 'application/pdf'
            ]
        ],
        [
            'name' => 'output',
            'contents' => 'example_pdf-with-ocr-text_out'
        ]
    ]
];

$pdfToOCRRequest = new Request('POST', 'https://api.pdfrest.com/pdf-with-ocr-text', $headers);

echo "Sending POST request to OCR endpoint...\n";
$pdfToOCRResponse = $client->sendAsync($pdfToOCRRequest, $pdfToOCROptions)->wait();

echo "Response status code: " . $pdfToOCRResponse->getStatusCode() . "\n";

// Cast the body stream to a string explicitly before decoding it.
$pdfToOCRBody = (string) $pdfToOCRResponse->getBody();
$pdfToOCRResult = json_decode($pdfToOCRBody);

// Stop here if the call failed or the response carries no output ID to pass on.
if ($pdfToOCRResponse->getStatusCode() !== 200 || !isset($pdfToOCRResult->outputId)) {
    echo "Error during OCR call: " . $pdfToOCRBody . "\n";
    exit(1);
}

$ocrPDFID = $pdfToOCRResult->outputId;
echo "Got the output ID: " . $ocrPDFID . "\n";

// Extract text from OCR'd PDF
$extractTextOptions = [
    'multipart' => [
        [
            'name' => 'id',
            'contents' => $ocrPDFID
        ]
    ]
];

$extractTextRequest = new Request('POST', 'https://api.pdfrest.com/extracted-text', $headers);

echo "Sending POST request to extract text endpoint...\n";
$extractTextResponse = $client->sendAsync($extractTextRequest, $extractTextOptions)->wait();

echo "Response status code: " . $extractTextResponse->getStatusCode() . "\n";

$extractTextBody = (string) $extractTextResponse->getBody();
$extractTextResult = json_decode($extractTextBody);

if ($extractTextResponse->getStatusCode() !== 200 || !isset($extractTextResult->fullText)) {
    echo "Error during text extraction call: " . $extractTextBody . "\n";
    exit(1);
}

$fullText = $extractTextResult->fullText;
echo $fullText . "\n";

?>
67 changes: 67 additions & 0 deletions Python/Complex Flow Examples/ocr-with-extract-text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from requests_toolbelt import MultipartEncoder
import requests


# In this sample, we will show how to convert a scanned document into a PDF with
# searchable and extractable text using Optical Character Recognition (OCR), and then
# extract that text from the newly created document.
#
# First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
# output ID. Then, we will send the output ID to the /extracted-text route, which will
# return the newly added text.

api_key = 'xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # place your api key here

ocr_endpoint_url = 'https://api.pdfrest.com/pdf-with-ocr-text'

# The file is opened in a `with` block so the handle is always closed. The encoder
# reads from the handle while the request is being sent, so the POST must happen
# before the block exits.
with open('/path/to/file.pdf', 'rb') as pdf_file:
    mp_encoder_pdf = MultipartEncoder(
        fields={
            'file': ('file_name.pdf', pdf_file, 'application/pdf'),
            'output': 'example_pdf-with-ocr-text_out',
        }
    )

    ocr_headers = {
        'Accept': 'application/json',
        'Content-Type': mp_encoder_pdf.content_type,
        'Api-Key': api_key
    }

    print("Sending POST request to OCR endpoint...")
    response = requests.post(ocr_endpoint_url, data=mp_encoder_pdf, headers=ocr_headers)

print("Response status code: " + str(response.status_code))

if response.ok:
    response_json = response.json()
    ocr_pdf_id = response_json["outputId"]
    print("Got the output ID: " + ocr_pdf_id)

    # Second step: send the OCR output ID to the text extraction route.
    extract_endpoint_url = 'https://api.pdfrest.com/extracted-text'

    mp_encoder_extract_text = MultipartEncoder(
        fields={
            'id': ocr_pdf_id
        }
    )

    extract_text_headers = {
        'Accept': 'application/json',
        'Content-Type': mp_encoder_extract_text.content_type,
        'Api-Key': api_key
    }

    print("Sending POST request to extract text endpoint...")
    extract_response = requests.post(extract_endpoint_url, data=mp_encoder_extract_text, headers=extract_text_headers)

    print("Response status code: " + str(extract_response.status_code))

    if extract_response.ok:
        extract_json = extract_response.json()
        print(extract_json["fullText"])

    else:
        # Print the raw error body returned by the extraction call.
        print(extract_response.text)


else:
    # Print the raw error body returned by the OCR call.
    print(response.text)
Loading

0 comments on commit 931d87b

Please sign in to comment.