-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from datalogics-tsmith/add-extract-text
PDFCLOUD-2694 Add extract text samples
- Loading branch information
Showing
7 changed files
with
185 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
using System.Text; | ||
|
||
// Uploads a PDF to the extract-text endpoint as multipart/form-data and prints the
// response body returned by the API.
using (var httpClient = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") })
{
    using (var request = new HttpRequestMessage(HttpMethod.Post, "extract-text"))
    {
        // Replace the placeholder with your API key.
        request.Headers.TryAddWithoutValidation("Api-Key", "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx");
        request.Headers.Accept.Add(new("application/json"));
        var multipartContent = new MultipartFormDataContent();

        // Read the input file into memory and attach it as the "file" form field.
        var byteArray = File.ReadAllBytes("/path/to/file");
        var byteAryContent = new ByteArrayContent(byteArray);
        multipartContent.Add(byteAryContent, "file", "file_name");
        byteAryContent.Headers.TryAddWithoutValidation("Content-Type", "application/pdf");

        request.Content = multipartContent;

        // Dispose the response as well, so its content is released as soon as the
        // body has been read (the original left the response undisposed).
        using (var response = await httpClient.SendAsync(request))
        {
            var apiResult = await response.Content.ReadAsStringAsync();

            Console.WriteLine("API response received.");
            Console.WriteLine(apiResult);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import io.github.cdimascio.dotenv.Dotenv; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import okhttp3.MediaType; | ||
import okhttp3.MultipartBody; | ||
import okhttp3.OkHttpClient; | ||
import okhttp3.Request; | ||
import okhttp3.RequestBody; | ||
import okhttp3.Response; | ||
import org.json.JSONObject; | ||
|
||
public class ExtractText { | ||
|
||
// Specify the path to your file here, or as the first argument when running the program. | ||
private static final String DEFAULT_FILE_PATH = "/path/to/file.pdf"; | ||
|
||
// Specify your API key here, or in the environment variable PDFREST_API_KEY. | ||
// You can also put the environment variable in a .env file. | ||
private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; | ||
|
||
public static void main(String[] args) { | ||
File inputFile; | ||
if (args.length > 0) { | ||
inputFile = new File(args[0]); | ||
} else { | ||
inputFile = new File(DEFAULT_FILE_PATH); | ||
} | ||
|
||
final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load(); | ||
|
||
final RequestBody inputFileRequestBody = | ||
RequestBody.create(inputFile, MediaType.parse("application/pdf")); | ||
RequestBody requestBody = | ||
new MultipartBody.Builder() | ||
.setType(MultipartBody.FORM) | ||
.addFormDataPart("file", inputFile.getName(), inputFileRequestBody) | ||
.build(); | ||
Request request = | ||
new Request.Builder() | ||
.header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY)) | ||
.url("https://api.pdfrest.com/extract-text") | ||
.post(requestBody) | ||
.build(); | ||
try { | ||
OkHttpClient client = new OkHttpClient().newBuilder().build(); | ||
Response response = client.newCall(request).execute(); | ||
System.out.println("Result code " + response.code()); | ||
if (response.body() != null) { | ||
System.out.println(prettyJson(response.body().string())); | ||
} | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
private static String prettyJson(String json) { | ||
// https://stackoverflow.com/a/9583835/11996393 | ||
return new JSONObject(json).toString(4); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// This request demonstrates how to extract text from a PDF document.
const axios = require("axios");
const FormData = require("form-data");
const fs = require("fs");

// Build the multipart form body that carries the PDF file to process.
const form = new FormData();
form.append("file", fs.createReadStream("/path/to/file"));

// Request headers: the API key followed by the multipart headers generated by form-data.
const headers = {
  "Api-Key": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", // Replace with your API key
  ...form.getHeaders(),
};

// POST the form to the extract-text endpoint and log the JSON response;
// any failure (request or logging) is logged instead.
async function extractText() {
  try {
    const response = await axios({
      method: "post",
      maxBodyLength: Infinity, // do not cap the size of the request body
      url: "https://api.pdfrest.com/extract-text",
      headers: headers,
      data: form,
    });
    console.log(JSON.stringify(response.data));
  } catch (error) {
    console.log(error);
  }
}

extractText();

// If you would like to download the file instead of getting the JSON response, please see the 'get-resource-id-endpoint.js' sample.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
<?php
// Uploads a PDF to the extract-text endpoint as a multipart form and prints the response body.
require 'vendor/autoload.php'; // Require the autoload file to load Guzzle HTTP client.

use GuzzleHttp\Client; // Import the Guzzle HTTP client namespace.
use GuzzleHttp\Psr7\Request; // Import the PSR-7 Request class.
use GuzzleHttp\Psr7\Utils; // Import the PSR-7 Utils class for working with streams.

$client = new Client(); // Create a new instance of the Guzzle HTTP client.

$headers = [
  'Api-Key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' // Set the API key in the headers for authentication.
];

$options = [
  'multipart' => [
    [
      'name' => 'file', // Specify the field name for the file.
      'contents' => Utils::tryFopen('/path/to/file', 'r'), // Open the file specified by '/path/to/file' for reading.
      'filename' => '/path/to/file', // Set the filename for the file to be processed, in this case, '/path/to/file'.
      'headers' => [
        // The uploaded part is a PDF; the original sent the literal placeholder
        // '<Content-type header>', which is not a valid media type.
        'Content-Type' => 'application/pdf'
      ]
    ]
  ]
];

$request = new Request('POST', 'https://api.pdfrest.com/extract-text', $headers); // Create a new HTTP POST request with the API endpoint and headers.

$res = $client->sendAsync($request, $options)->wait(); // Send the asynchronous request and wait for the response.

echo $res->getBody(); // Output the response body returned by the extract-text endpoint.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from requests_toolbelt import MultipartEncoder | ||
import requests | ||
import json | ||
|
||
extract_text_endpoint_url = 'https://api.pdfrest.com/extract-text'

# The /extract-text endpoint can take a single PDF file or id as input.
# This sample uploads a PDF file and prints the JSON response returned by the endpoint.
# The file is opened in a `with` block that spans the upload, so the handle is closed
# once the request has been sent (the original never closed it).
with open('/path/to/file', 'rb') as pdf_file:
    mp_encoder_extractText = MultipartEncoder(
        fields={
            'file': ('file_name', pdf_file, 'application/pdf'),
        }
    )

    # Let's set the headers that the extract-text endpoint expects.
    # Since MultipartEncoder is used, the 'Content-Type' header gets set to 'multipart/form-data' via the content_type attribute below.
    headers = {
        'Accept': 'application/json',
        'Content-Type': mp_encoder_extractText.content_type,
        'Api-Key': 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # place your api key here
    }

    print("Sending POST request to extract-text endpoint...")
    response = requests.post(extract_text_endpoint_url, data=mp_encoder_extractText, headers=headers)

print("Response status code: " + str(response.status_code))

# Pretty-print the JSON on success; otherwise show the raw response text.
if response.ok:
    response_json = response.json()
    print(json.dumps(response_json, indent = 2))
else:
    print(response.text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Upload a PDF to the extract-text endpoint as multipart/form-data.
# Replace the Api-Key placeholder with your API key and /path/to/file with your PDF.
curl -X POST "https://api.pdfrest.com/extract-text" \
  -H "Accept: application/json" \
  -H "Content-Type: multipart/form-data" \
  -H "Api-Key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" \
  -F "file=@/path/to/file"