Skip to content

Commit

Permalink
Merge pull request #43 from datalogics-tsmith/add-extract-text
Browse files Browse the repository at this point in the history
PDFCLOUD-2694 Add extract text samples
  • Loading branch information
datalogics-dliang authored Oct 2, 2023
2 parents aa6a007 + 26b94a5 commit c623bf6
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 5 deletions.
25 changes: 25 additions & 0 deletions DotNET/Single Calls/extract-text.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
using System.Text;

// Uploads a PDF to the pdfRest extract-text endpoint as multipart form data
// and prints the raw response body to the console.
using (var httpClient = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") })
{
    using (var request = new HttpRequestMessage(HttpMethod.Post, "extract-text"))
    {
        // Replace the placeholder with your API key.
        request.Headers.TryAddWithoutValidation("Api-Key", "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx");
        request.Headers.Accept.Add(new("application/json"));
        var multipartContent = new MultipartFormDataContent();

        // Read the input file into memory and attach it as the "file" form part.
        var byteArray = File.ReadAllBytes("/path/to/file");
        var byteAryContent = new ByteArrayContent(byteArray);
        multipartContent.Add(byteAryContent, "file", "file_name");
        byteAryContent.Headers.TryAddWithoutValidation("Content-Type", "application/pdf");

        // The request takes ownership of the content; disposing the request disposes it too.
        request.Content = multipartContent;

        // HttpResponseMessage is IDisposable: release it once the body has been read.
        using (var response = await httpClient.SendAsync(request))
        {
            var apiResult = await response.Content.ReadAsStringAsync();

            Console.WriteLine("API response received.");
            Console.WriteLine(apiResult);
        }
    }
}
60 changes: 60 additions & 0 deletions Java/Single Calls/ExtractText.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import io.github.cdimascio.dotenv.Dotenv;
import java.io.File;
import java.io.IOException;
import okhttp3.MediaType;
import okhttp3.MultipartBody;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import okhttp3.ResponseBody;
import org.json.JSONObject;

/**
 * Sample program that uploads a PDF to the pdfRest {@code /extract-text} endpoint and prints the
 * HTTP status code followed by the pretty-printed JSON response.
 */
public class ExtractText {

  // Specify the path to your file here, or as the first argument when running the program.
  private static final String DEFAULT_FILE_PATH = "/path/to/file.pdf";

  // Specify your API key here, or in the environment variable PDFREST_API_KEY.
  // You can also put the environment variable in a .env file.
  private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";

  /**
   * Sends the input file to the extract-text endpoint and prints the result.
   *
   * @param args optional; {@code args[0]} is the path of the file to upload. When absent,
   *     {@link #DEFAULT_FILE_PATH} is used.
   * @throws RuntimeException wrapping the {@link IOException} if the HTTP call or reading the
   *     response body fails
   */
  public static void main(String[] args) {
    File inputFile;
    if (args.length > 0) {
      inputFile = new File(args[0]);
    } else {
      inputFile = new File(DEFAULT_FILE_PATH);
    }

    // A missing or malformed .env file is tolerated; the lookup below then falls back to
    // DEFAULT_API_KEY when PDFREST_API_KEY is not set.
    final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load();

    final RequestBody inputFileRequestBody =
        RequestBody.create(inputFile, MediaType.parse("application/pdf"));
    RequestBody requestBody =
        new MultipartBody.Builder()
            .setType(MultipartBody.FORM)
            .addFormDataPart("file", inputFile.getName(), inputFileRequestBody)
            .build();
    Request request =
        new Request.Builder()
            .header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
            .url("https://api.pdfrest.com/extract-text")
            .post(requestBody)
            .build();

    OkHttpClient client = new OkHttpClient().newBuilder().build();
    // try-with-resources closes the Response (and its body) even if printing or JSON
    // parsing throws, so the underlying connection is not leaked.
    try (Response response = client.newCall(request).execute()) {
      System.out.println("Result code " + response.code());
      ResponseBody body = response.body();
      if (body != null) {
        System.out.println(prettyJson(body.string()));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Re-formats a JSON object string with a 4-space indent.
   *
   * @param json text of a JSON object
   * @return the same object, pretty-printed
   */
  private static String prettyJson(String json) {
    // https://stackoverflow.com/a/9583835/11996393
    return new JSONObject(json).toString(4);
  }
}
31 changes: 31 additions & 0 deletions JavaScript/Single Calls/extract-text.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// This request demonstrates how to extract text from a PDF document.
var axios = require("axios");
var FormData = require("form-data");
var fs = require("fs");

// Build the multipart form body; the only field sent is the input file, streamed from disk.
const form = new FormData();
form.append("file", fs.createReadStream("/path/to/file"));

// Request headers: the API key plus the multipart headers generated by form-data.
const headers = {
  "Api-Key": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", // Replace with your API key
  ...form.getHeaders(),
};

// POST the form to the extract-text endpoint and print the JSON response.
// Any failure (request error or error while printing) is logged instead of thrown.
async function extractText() {
  try {
    const response = await axios({
      method: "post",
      maxBodyLength: Infinity, // do not cap the size of the uploaded body
      url: "https://api.pdfrest.com/extract-text",
      headers,
      data: form,
    });
    console.log(JSON.stringify(response.data));
  } catch (error) {
    console.log(error);
  }
}

extractText();

// If you would like to download the file instead of getting the JSON response, please see the 'get-resource-id-endpoint.js' sample.
6 changes: 1 addition & 5 deletions PHP/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@ In this directory you will find sample calls to single endpoints, as well as mor

2. Navigate to the directory containing the `php` file.

3. Run the following command to install the required dependencies (Guzzle HTTP client):

```bash
composer install
```
3. Install the required dependencies (Guzzle HTTP client) by following the instructions at https://docs.guzzlephp.org/en/stable/overview.html

### Usage

Expand Down
31 changes: 31 additions & 0 deletions PHP/Single Calls/extract-text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php
require 'vendor/autoload.php'; // Require the autoload file to load Guzzle HTTP client.

use GuzzleHttp\Client; // Import the Guzzle HTTP client namespace.
use GuzzleHttp\Psr7\Request; // Import the PSR-7 Request class.
use GuzzleHttp\Psr7\Utils; // Import the PSR-7 Utils class for working with streams.

$client = new Client(); // Create a new instance of the Guzzle HTTP client.

$headers = [
    'Api-Key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' // Set the API key in the headers for authentication.
];

// Multipart form body: a single "file" part carrying the document to process.
$options = [
    'multipart' => [
        [
            'name' => 'file', // Specify the field name for the file.
            'contents' => Utils::tryFopen('/path/to/file', 'r'), // Open the file specified by '/path/to/file' for reading.
            'filename' => '/path/to/file', // Set the filename for the file to be processed, in this case, '/path/to/file'.
            'headers' => [
                // The input is a PDF, so declare it as such (a placeholder value is not a valid media type).
                'Content-Type' => 'application/pdf'
            ]
        ]
    ]
];

$request = new Request('POST', 'https://api.pdfrest.com/extract-text', $headers); // Create a new HTTP POST request with the API endpoint and headers.

$res = $client->sendAsync($request, $options)->wait(); // Send the asynchronous request and wait for the response.

echo $res->getBody(); // Output the response body, which contains the text extracted from the document.
32 changes: 32 additions & 0 deletions Python/Single Calls/extract-text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from requests_toolbelt import MultipartEncoder
import requests
import json

extract_text_endpoint_url = 'https://api.pdfrest.com/extract-text'

# The /extract-text endpoint can take a single PDF file or id as input.
# This sample uploads a PDF file and prints the JSON response returned by the endpoint.
# The file is opened in a `with` block so its handle is closed once the request has completed.
with open('/path/to/file', 'rb') as input_file:
    mp_encoder_extractText = MultipartEncoder(
        fields={
            'file': ('file_name', input_file, 'application/pdf'),
        }
    )

    # Let's set the headers that the extract-text endpoint expects.
    # Since MultipartEncoder is used, the 'Content-Type' header gets set to 'multipart/form-data' via the content_type attribute below.
    headers = {
        'Accept': 'application/json',
        'Content-Type': mp_encoder_extractText.content_type,
        'Api-Key': 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # place your api key here
    }

    print("Sending POST request to extract-text endpoint...")
    response = requests.post(extract_text_endpoint_url, data=mp_encoder_extractText, headers=headers)

print("Response status code: " + str(response.status_code))

# On success pretty-print the JSON payload; otherwise show the raw error body.
if response.ok:
    response_json = response.json()
    print(json.dumps(response_json, indent = 2))
else:
    print(response.text)
5 changes: 5 additions & 0 deletions cURL/Single Calls/extract-text.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# POST a file to the pdfRest extract-text endpoint as multipart form data.
# Replace the Api-Key value and /path/to/file before running.
curl --request POST \
  --header "Accept: application/json" \
  --header "Content-Type: multipart/form-data" \
  --header "Api-Key: xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" \
  --form "file=@/path/to/file" \
  "https://api.pdfrest.com/extract-text"

0 comments on commit c623bf6

Please sign in to comment.