-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #75 from datalogics-jacksonm/ocr-pdf
PDFCLOUD-3933 | Create new examples for OCR -> Extract Text workflow
- Loading branch information
Showing
6 changed files
with
413 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
using Newtonsoft.Json.Linq; | ||
using System; | ||
using System.IO; | ||
using System.Net.Http; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
|
||
/// <summary>
/// Sample: uploads a PDF to the /pdf-with-ocr-text route, captures the returned output ID,
/// then sends that ID to the /extracted-text route and prints the extracted text.
/// </summary>
class OcrWithExtractText
{
    private static readonly string apiKey = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // Your API key here

    /// <summary>
    /// Runs the OCR request followed by the extract-text request, printing each raw response.
    /// Stops early (after printing a message) if either request fails or the OCR response
    /// carries no output ID, instead of crashing on a missing field.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static async Task Main(string[] args)
    {
        using (var httpClient = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") })
        {
            // Upload PDF for OCR
            using var ocrRequest = new HttpRequestMessage(HttpMethod.Post, "pdf-with-ocr-text");

            ocrRequest.Headers.TryAddWithoutValidation("Api-Key", apiKey);
            ocrRequest.Headers.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("application/json"));
            var ocrMultipartContent = new MultipartFormDataContent();

            var pdfByteArray = File.ReadAllBytes("/path/to/file.pdf");
            var pdfByteArrayContent = new ByteArrayContent(pdfByteArray);
            ocrMultipartContent.Add(pdfByteArrayContent, "file", "file.pdf");
            pdfByteArrayContent.Headers.TryAddWithoutValidation("Content-Type", "application/pdf");

            // Disposing the request also disposes the content assigned to it.
            ocrRequest.Content = ocrMultipartContent;
            using var ocrResponse = await httpClient.SendAsync(ocrRequest);

            var ocrResult = await ocrResponse.Content.ReadAsStringAsync();
            Console.WriteLine("OCR response received.");
            Console.WriteLine(ocrResult);

            // Check the status before parsing: an error body may not be JSON at all.
            if (!ocrResponse.IsSuccessStatusCode)
            {
                Console.WriteLine($"OCR request failed with status code {(int)ocrResponse.StatusCode}.");
                return;
            }

            JObject ocrResponseData = JObject.Parse(ocrResult);
            string ocrPDFID = (string)ocrResponseData["outputId"];

            // Without an output ID there is nothing to send to the second route;
            // Encoding.UTF8.GetBytes(null) would throw below.
            if (string.IsNullOrEmpty(ocrPDFID))
            {
                Console.WriteLine("OCR response did not contain an outputId.");
                return;
            }

            // Extract text from OCR'd PDF
            using var extractTextRequest = new HttpRequestMessage(HttpMethod.Post, "extracted-text");

            extractTextRequest.Headers.TryAddWithoutValidation("Api-Key", apiKey);
            extractTextRequest.Headers.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("application/json"));
            var extractTextMultipartContent = new MultipartFormDataContent();

            var byteArrayOption = new ByteArrayContent(Encoding.UTF8.GetBytes(ocrPDFID));
            extractTextMultipartContent.Add(byteArrayOption, "id");

            extractTextRequest.Content = extractTextMultipartContent;
            using var extractTextResponse = await httpClient.SendAsync(extractTextRequest);

            var extractTextResult = await extractTextResponse.Content.ReadAsStringAsync();
            Console.WriteLine("Extract text response received.");
            Console.WriteLine(extractTextResult);

            if (!extractTextResponse.IsSuccessStatusCode)
            {
                Console.WriteLine($"Extract text request failed with status code {(int)extractTextResponse.StatusCode}.");
                return;
            }

            JObject extractTextResponseData = JObject.Parse(extractTextResult);
            string fullText = (string)extractTextResponseData["fullText"];

            Console.WriteLine("Extracted text:");
            Console.WriteLine(fullText);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import io.github.cdimascio.dotenv.Dotenv; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.concurrent.TimeUnit; | ||
import okhttp3.*; | ||
import org.json.JSONObject; | ||
|
||
/* In this sample, we will show how to convert a scanned document into a PDF with | ||
* searchable and extractable text using Optical Character Recognition (OCR), and then | ||
* extract that text from the newly created document. | ||
* | ||
* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the | ||
* output ID. Then, we will send the output ID to the /extracted-text route, which will | ||
* return the newly added text. | ||
*/ | ||
|
||
public class OcrWithExtractText { | ||
|
||
// Specify the path to your PDF file here, or as the first argument when running the program. | ||
private static final String DEFAULT_PDF_FILE_PATH = "/path/to/file.pdf"; | ||
|
||
// Specify your API key here, or in the environment variable PDFREST_API_KEY. | ||
// You can also put the environment variable in a .env file. | ||
private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; | ||
|
||
public static void main(String[] args) { | ||
File pdfFile; | ||
if (args.length > 0) { | ||
pdfFile = new File(args[0]); | ||
} else { | ||
pdfFile = new File(DEFAULT_PDF_FILE_PATH); | ||
} | ||
|
||
final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load(); | ||
|
||
final RequestBody pdfFileRequestBody = | ||
RequestBody.create(pdfFile, MediaType.parse("application/pdf")); | ||
RequestBody ocrRequestBody = | ||
new MultipartBody.Builder() | ||
.setType(MultipartBody.FORM) | ||
.addFormDataPart("file", pdfFile.getName(), pdfFileRequestBody) | ||
.addFormDataPart("output", "example_pdf-with-ocr-text_out") | ||
.build(); | ||
Request ocrRequest = | ||
new Request.Builder() | ||
.header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY)) | ||
.url("https://api.pdfrest.com/pdf-with-ocr-text") | ||
.post(ocrRequestBody) | ||
.build(); | ||
try { | ||
OkHttpClient ocrClient = | ||
new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build(); | ||
|
||
Response ocrResponse = ocrClient.newCall(ocrRequest).execute(); | ||
|
||
System.out.println("Response status code: " + ocrResponse.code()); | ||
if (ocrResponse.body() != null) { | ||
String ocrResponseString = ocrResponse.body().string(); | ||
|
||
JSONObject ocrJSON = new JSONObject(ocrResponseString); | ||
if (ocrJSON.has("error")) { | ||
System.out.println("Error during OCR call: " + ocrResponseString); | ||
return; | ||
} | ||
|
||
String ocrPDFID = ocrJSON.get("outputId").toString(); | ||
System.out.println("Got the output ID: " + ocrPDFID); | ||
|
||
RequestBody extractRequestBody = | ||
new MultipartBody.Builder() | ||
.setType(MultipartBody.FORM) | ||
.addFormDataPart("id", ocrPDFID) | ||
.build(); | ||
Request extractRequest = | ||
new Request.Builder() | ||
.header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY)) | ||
.url("https://api.pdfrest.com/extracted-text") | ||
.post(extractRequestBody) | ||
.build(); | ||
try { | ||
OkHttpClient extractClient = | ||
new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build(); | ||
|
||
Response extractResponse = extractClient.newCall(extractRequest).execute(); | ||
|
||
System.out.println("Response status code: " + extractResponse.code()); | ||
if (extractResponse.body() != null) { | ||
String extractResponseString = extractResponse.body().string(); | ||
|
||
JSONObject extractJSON = new JSONObject(extractResponseString); | ||
if (extractJSON.has("error")) { | ||
System.out.println("Error during text extraction call: " + extractResponseString); | ||
return; | ||
} | ||
|
||
System.out.println(extractJSON.getString("fullText")); | ||
} | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
var axios = require("axios"); | ||
var FormData = require("form-data"); | ||
var fs = require("fs"); | ||
|
||
/* In this sample, we will show how to convert a scanned document into a PDF with | ||
* searchable and extractable text using Optical Character Recognition (OCR), and then | ||
* extract that text from the newly created document. | ||
* | ||
* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the | ||
* output ID. Then, we will send the output ID to the /extracted-text route, which will | ||
* return the newly added text. | ||
*/ | ||
|
||
const apiKey = "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // Replace with your API key

// Multipart body for the OCR call: the PDF itself plus the desired output name.
const ocrData = new FormData();
ocrData.append("file", fs.createReadStream("/path/to/file.pdf"), "file_name.pdf");
ocrData.append("output", "example_pdf-with-ocr-text_out");

const ocrConfig = {
  method: "post",
  maxBodyLength: Infinity,
  url: "https://api.pdfrest.com/pdf-with-ocr-text",
  headers: {
    "Api-Key": apiKey,
    ...ocrData.getHeaders(),
  },
  data: ocrData,
};

// Prints the server's error payload when there is one, otherwise the error message.
function reportFailure(error) {
  console.log(error.response ? error.response.data : error.message);
}

// Sends the OCR request, then feeds the returned output ID to the extract-text request.
async function run() {
  try {
    const response = await axios(ocrConfig);
    console.log(`Response status code: ${response.status}`);

    if (response.status !== 200) {
      console.log(response.data);
      return;
    }

    const ocrPDFID = response.data.outputId;
    console.log(`Got the output ID: ${ocrPDFID}`);

    const extractData = new FormData();
    extractData.append("id", ocrPDFID);

    const extractConfig = {
      method: "post",
      maxBodyLength: Infinity,
      url: "https://api.pdfrest.com/extracted-text",
      headers: {
        "Api-Key": apiKey,
        ...extractData.getHeaders(),
      },
      data: extractData,
    };

    console.log("Sending POST request to extract text endpoint...");
    try {
      const extractResponse = await axios(extractConfig);
      console.log(`Response status code: ${extractResponse.status}`);

      const payload =
        extractResponse.status === 200 ? extractResponse.data.fullText : extractResponse.data;
      console.log(payload);
    } catch (error) {
      reportFailure(error);
    }
  } catch (error) {
    reportFailure(error);
  }
}

console.log("Sending POST request to OCR endpoint...");
run();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
<?php | ||
|
||
require 'vendor/autoload.php'; | ||
|
||
use GuzzleHttp\Client; | ||
use GuzzleHttp\Psr7\Request; | ||
use GuzzleHttp\Psr7\Utils; | ||
|
||
/* In this sample, we will show how to convert a scanned document into a PDF with | ||
* searchable and extractable text using Optical Character Recognition (OCR), and then | ||
* extract that text from the newly created document. | ||
* | ||
* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the | ||
* output ID. Then, we will send the output ID to the /extracted-text route, which will | ||
* return the newly added text. | ||
*/ | ||
|
||
$client = new Client();

$headers = [
    'Api-Key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' // Replace with your API key
];

// Announces the call, POSTs the multipart parts to $url, prints the status code,
// and returns the decoded JSON body.
$postMultipart = function ($url, $label, $parts) use ($client, $headers) {
    $request = new Request('POST', $url, $headers);

    echo "Sending POST request to {$label} endpoint...\n";
    $response = $client->sendAsync($request, ['multipart' => $parts])->wait();

    echo "Response status code: " . $response->getStatusCode() . "\n";

    return json_decode($response->getBody());
};

// Upload PDF for OCR
$ocrParts = [
    [
        'name' => 'file',
        'contents' => Utils::tryFopen('/path/to/file.pdf', 'r'),
        'filename' => 'file.pdf',
        'headers' => [
            'Content-Type' => 'application/pdf'
        ]
    ],
    [
        'name' => 'output',
        'contents' => 'example_pdf-with-ocr-text_out'
    ]
];

$ocrPDFID = $postMultipart('https://api.pdfrest.com/pdf-with-ocr-text', 'OCR', $ocrParts)->outputId;
echo "Got the output ID: " . $ocrPDFID . "\n";

// Extract text from OCR'd PDF
$extractParts = [
    [
        'name' => 'id',
        'contents' => $ocrPDFID
    ]
];

$fullText = $postMultipart('https://api.pdfrest.com/extracted-text', 'extract text', $extractParts)->fullText;
echo $fullText . "\n";
|
||
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from requests_toolbelt import MultipartEncoder | ||
import requests | ||
|
||
|
||
# In this sample, we will show how to convert a scanned document into a PDF with | ||
# searchable and extractable text using Optical Character Recognition (OCR), and then | ||
# extract that text from the newly created document. | ||
# | ||
# First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the | ||
# output ID. Then, we will send the output ID to the /extracted-text route, which will | ||
# return the newly added text. | ||
|
||
api_key = 'xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # place your api key here

ocr_endpoint_url = 'https://api.pdfrest.com/pdf-with-ocr-text'

# Keep the PDF open only for the duration of the upload. The encoder is handed the
# open file object, so the request is sent inside the `with` block and the handle is
# closed as soon as the block exits.
with open('/path/to/file.pdf', 'rb') as pdf_file:
    mp_encoder_pdf = MultipartEncoder(
        fields={
            'file': ('file_name.pdf', pdf_file, 'application/pdf'),
            'output': 'example_pdf-with-ocr-text_out',
        }
    )

    ocr_headers = {
        'Accept': 'application/json',
        'Content-Type': mp_encoder_pdf.content_type,
        'Api-Key': api_key
    }

    print("Sending POST request to OCR endpoint...")
    response = requests.post(ocr_endpoint_url, data=mp_encoder_pdf, headers=ocr_headers)

print("Response status code: " + str(response.status_code))

if response.ok:
    response_json = response.json()
    ocr_pdf_id = response_json["outputId"]
    print("Got the output ID: " + ocr_pdf_id)

    extract_endpoint_url = 'https://api.pdfrest.com/extracted-text'

    # The second call only needs the ID of the OCR output produced above.
    mp_encoder_extract_text = MultipartEncoder(
        fields={
            'id': ocr_pdf_id
        }
    )

    extract_text_headers = {
        'Accept': 'application/json',
        'Content-Type': mp_encoder_extract_text.content_type,
        'Api-Key': api_key
    }

    print("Sending POST request to extract text endpoint...")
    extract_response = requests.post(extract_endpoint_url, data=mp_encoder_extract_text, headers=extract_text_headers)

    print("Response status code: " + str(extract_response.status_code))

    if extract_response.ok:
        extract_json = extract_response.json()
        print(extract_json["fullText"])
    else:
        # Show the raw error body returned by the extract-text route.
        print(extract_response.text)
else:
    # Show the raw error body returned by the OCR route.
    print(response.text)
Oops, something went wrong.