diff --git a/TextExtract/.idea/runConfigurations/TextExtract.xml b/TextExtract/.idea/runConfigurations/TextExtract.xml
new file mode 100644
index 0000000..06c938d
--- /dev/null
+++ b/TextExtract/.idea/runConfigurations/TextExtract.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/TextExtract/pom.xml b/TextExtract/pom.xml
new file mode 100644
index 0000000..f23b97d
--- /dev/null
+++ b/TextExtract/pom.xml
@@ -0,0 +1,209 @@
+
+
+ 4.0.0
+ com.datalogics.pdfl.samples
+ TextExtract
+ 1.0-SNAPSHOT
+
+
+
+ mavenCentral
+ https://repo1.maven.org/maven2/
+
+
+
+
+ UTF-8
+ official
+ 1.8
+ 1.8
+ 1.8
+
+
+
+
+ Windows64
+
+
+ windows
+ amd64
+
+
+
+ win-x86-64-jni
+
+
+
+ MacArm
+
+
+ mac
+ aarch64
+
+
+
+ mac-arm-64-jni
+
+
+
+ Linux64
+
+
+
+ Linux
+ amd64
+
+
+
+ linux-x86-64-jni
+
+
+
+
+
+
+ org.jetbrains.kotlin
+ kotlin-stdlib-jdk8
+ 1.9.21
+
+
+ com.datalogics.pdfl
+ pdfl
+ 18.30.0
+ pom
+
+
+ com.datalogics.pdfl
+ pdfl
+ 18.30.0
+
+
+ com.datalogics.pdfl
+ pdfl
+ 18.30.0
+ zip
+ ${jni.classifier}
+
+
+ com.datalogics.pdfl
+ pdfl
+ 18.30.0
+ zip
+ resources
+
+
+ com.datalogics.pdfl
+ pdfl
+ 18.30.0
+ javadoc
+
+
+
+
+ src/main/kotlin
+
+
+ org.jetbrains.kotlin
+ kotlin-maven-plugin
+ 1.9.21
+
+
+ compile
+ compile
+
+ compile
+
+
+
+
+
+ maven-surefire-plugin
+ 2.22.2
+
+
+ maven-failsafe-plugin
+ 2.22.2
+
+
+ org.codehaus.mojo
+ exec-maven-plugin
+ 1.6.0
+
+ TextExtract
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ unpack-resources
+ generate-resources
+
+ unpack
+
+
+
+
+ com.datalogics.pdfl
+ pdfl
+ resources
+ zip
+ ${project.build.directory}/lib/Resources
+
+
+
+
+
+ unpack-jni
+ generate-resources
+
+ unpack
+
+
+
+
+ com.datalogics.pdfl
+ pdfl
+ ${jni.classifier}
+ zip
+ ${project.build.directory}/lib
+
+
+
+
+
+ unpack-license
+ generate-resources
+
+ unpack
+
+
+
+
+ com.datalogics.pdfl
+ pdfl
+ license
+ zip
+ ${project.build.directory}/lib
+
+
+
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.0.2
+
+
+
+
+
+
\ No newline at end of file
diff --git a/TextExtract/src/main/kotlin/com/datalogics/pdfl/samples/TextExtract.kt b/TextExtract/src/main/kotlin/com/datalogics/pdfl/samples/TextExtract.kt
new file mode 100644
index 0000000..98823c8
--- /dev/null
+++ b/TextExtract/src/main/kotlin/com/datalogics/pdfl/samples/TextExtract.kt
@@ -0,0 +1,216 @@
+package com.datalogics.pdfl.samples
+
+import com.datalogics.PDFL.*
+import java.io.FileOutputStream
+import java.io.OutputStreamWriter
+
+/*
+*
+* This program pulls text from a PDF file and exports it to a text file (TXT).
+* It will open a PDF file called Constitution.PDF and create an output file called
+* TextExtract-untagged-out.txt. The export file includes page number references, and
+* the text is produced using standard Roman encoding. The program is also written
+* to include a provision for working with tagged documents, and determines if the original
+* PDF file is tagged or untagged. Tagging is used to make PDF files accessible
+* to the blind or to people with vision problems.
+*
+* Copyright (c) 2007-2024, Datalogics, Inc. All rights reserved.
+*
+*/
+
+fun main(args: Array) {
+ println("TextExtract sample:")
+ val lib = Library()
+
+ try {
+ // This is an untagged PDF.
+ var sInput: String = Library.getResourceDirectory() + "Sample_Input/constitution.pdf"
+
+ // This is a tagged PDF.
+ // var sInput: String = Library.getResourceDirectory() + "Sample_Input/pdf_intro.pdf"
+ if (args.isNotEmpty()) {
+ sInput = args[0]
+ }
+ println("Reading $sInput")
+
+ val doc = Document(sInput)
+
+ println("Opened document $sInput")
+
+ // Determine if the PDF is tagged. We'll use a slightly different set of rules
+ // for parsing tagged and untagged PDFs.
+ //
+ // We'll determine if the PDF is tagged by examining the MarkInfo
+ // dictionary of the document. First, check for the existence of the MarkInfo dict.
+ var docIsTagged = false
+ val markInfoDict: PDFDict? = doc.root.get("MarkInfo") as? PDFDict
+ val markedEntry: PDFBoolean? = markInfoDict?.get("Marked") as? PDFBoolean
+
+ if (markInfoDict != null) {
+ if (markedEntry != null) {
+ if (markedEntry.value) {
+ docIsTagged = true
+ }
+ }
+ }
+
+ val wordConfig = WordFinderConfig()
+ wordConfig.ignoreCharGaps = false
+ wordConfig.ignoreLineGaps = false
+ wordConfig.noAnnots = false
+ wordConfig.noEncodingGuess = false
+
+ // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
+ wordConfig.unknownToStdEnc = false
+
+ wordConfig.disableTaggedPDF = false // legacy mode WordFinder creation
+ wordConfig.noXYSort = true
+ wordConfig.preserveSpaces = false
+ wordConfig.noLigatureExp = false
+ wordConfig.noHyphenDetection = false
+ wordConfig.trustNBSpace = false
+ wordConfig.noExtCharOffset = false // text extraction efficiency
+ wordConfig.noStyleInfo = false // text extraction efficiency
+
+ val wordFinder = WordFinder(doc, WordFinderVersion.LATEST, wordConfig)
+
+ if (docIsTagged)
+ extractTextTagged(doc, wordFinder)
+ else
+ extractTextUntagged(doc, wordFinder)
+
+ doc.close()
+ } finally {
+ lib.delete()
+ }
+}
+
+fun extractTextUntagged(doc: Document, wordFinder: WordFinder) {
+ val nPages = doc.numPages
+ var pageWords: List?
+
+ val logFile = FileOutputStream("TextExtract-untagged-out.txt")
+ println("Writing TextExtract-untagged-out.txt")
+ val logWriter = OutputStreamWriter(logFile, "UTF-8")
+
+ for (pageIndex in 0 until nPages) {
+ pageWords = wordFinder.getWordList(pageIndex)
+ var textToExtract = ""
+
+ for (wordNum in pageWords.indices) {
+ val wInfo = pageWords[wordNum]
+ val s = wInfo.text
+
+ // Check for hyphenated words that break across a line.
+ if (wInfo.attributes.contains(WordAttributeFlags.HAS_SOFT_HYPHEN) && wInfo.attributes.contains(
+ WordAttributeFlags.LAST_WORD_ON_LINE
+ )
+ ) {
+ // Remove the hyphen and combine the two parts of the word before adding to the extracted text.
+ // Note that we pass in the Unicode character for soft hyphen as well as the regular hyphen.
+ //
+ // In untagged PDF, it's not uncommon to find a mixture of hard and soft hyphens that may
+ // not be used for their intended purposes.
+ // (Soft hyphens are intended only for words that break across lines.)
+ //
+ // For the purposes of this sample, we'll remove all hyphens. In practice, you may need to check
+ // words against a dictionary to determine if the hyphenated word is actually one word or two.
+ // Note we remove ascii hyphen, Unicode soft hyphen(\u00ad) and Unicode hyphen(0x2010)
+ val splitstrs = s.split("-|\u00ad|0x2010".toRegex())
+
+ for (index in splitstrs.indices) {
+ textToExtract += splitstrs[index]
+ }
+ } else {
+ textToExtract += s
+ }
+
+ // Check for space adjacency or last word in region and add a space if necessary.
+ // LastWordInRegion is true if the WordFinder determined that this is the last word in a region.
+ // This may be set for words that are visually separated when viewing the PDF,
+ // but are not separated by a space. Here, it's used in conjunction with
+ // WordAttributes.AdjacentToSpace to determine where to insert spaces when
+ // post-processing WordFinder results.
+ if (wInfo.attributes.contains(WordAttributeFlags.ADJACENT_TO_SPACE) || wInfo.isLastWordInRegion) {
+ textToExtract += " "
+ }
+ // Check for a line break and add one if necessary
+ if (wInfo.attributes.contains(WordAttributeFlags.LAST_WORD_ON_LINE))
+ textToExtract += "\n"
+ }
+ val pageNum = "\n"
+ logWriter.write(pageNum, 0, pageNum.length)
+ logWriter.write(textToExtract, 0, textToExtract.length)
+ logWriter.write("\n")
+
+ // Release requested WordList
+ for (wordnum in pageWords.indices) {
+ pageWords[wordnum].delete()
+ }
+ }
+ println("Extracted $nPages pages.")
+ logWriter.close()
+}
+
+fun extractTextTagged(doc: Document, wordFinder: WordFinder) {
+ val nPages = doc.numPages
+ var pageWords: List
+
+ val logFile = FileOutputStream("TextExtract-tagged-out.txt")
+ println("Writing TextExtract-tagged-out.txt")
+ val logWriter = OutputStreamWriter(logFile, "UTF-8")
+
+ for (pageIndex in 0 until nPages) {
+ pageWords = wordFinder.getWordList(pageIndex)
+
+ var textToExtract = ""
+
+ for (wordNum in pageWords.indices) {
+ val wInfo = pageWords[wordNum]
+ val s = wInfo.text
+
+ // In most tagged PDFs, soft hyphens are used only to break words across lines, so we'll
+ // check for any soft hyphens and remove them from our text output.
+ //
+ // Note that we're not checking for the LAST_WORD_ON_LINE flag, unlike untagged PDF. For Tagged PDF,
+ // words are not flagged as being the last on the line if they are not at the end of a sentence.
+ if (wInfo.attributes.contains(WordAttributeFlags.HAS_SOFT_HYPHEN)) {
+ // Remove the hyphen and combine the two parts of the word before adding to the extracted text.
+ // Note that we pass in the Unicode character for soft hyphen(\u00ad) and Unicode hyphen(0x2010).
+ val splitstrs = s.split("\u00ad|0x2010".toRegex())
+
+ for (index in splitstrs.indices) {
+ textToExtract += splitstrs[index]
+ }
+ } else {
+ textToExtract += s
+ }
+
+ // Check for space adjacency or last word in region and add a space if necessary.
+ // LastWordInRegion is true if the WordFinder determined that this is the last word in a region.
+ // This may be set for words that are visually separated when viewing the PDF,
+ // but are not separated by a space. Here, it's used in conjunction with
+ // WordAttributes.AdjacentToSpace to determine where to insert spaces when
+ // post-processing WordFinder results.
+ if (wInfo.attributes.contains(WordAttributeFlags.ADJACENT_TO_SPACE) || wInfo.isLastWordInRegion) {
+ textToExtract += " "
+ }
+
+ // Check for a line break and add one if necessary
+ if (wInfo.attributes.contains(WordAttributeFlags.LAST_WORD_ON_LINE))
+ textToExtract += "\n"
+ }
+
+ val pageNum = "\n"
+ logWriter.write(pageNum, 0, pageNum.length)
+ logWriter.write(textToExtract, 0, textToExtract.length)
+ logWriter.write("\n")
+
+ // Release requested WordList
+ for (wordnum in pageWords.indices) {
+ pageWords[wordnum].delete()
+ }
+ }
+ println("Extracted $nPages pages.")
+ logWriter.close()
+}