tabulapdf 1.0.5-2

ropensci · Apr 29, 2024 · 4ea1bfb · 4ea1bfb
1 parent 52ae571
commit 4ea1bfb
Show file tree

Hide file tree

Showing 15 changed files with 79 additions and 118 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -12,3 +12,5 @@ ignore/*
 ^docs/
 ^\.github$
 ^codecov\.yml$
+^dev$
+^README\.html$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tabulapdf
 Type: Package
 Title: Extract tables from PDF documents
-Version: 1.0.5
+Version: 1.0.5-2
 Authors@R: c(person("Thomas J.", "Leeper",
                     role = "aut", 
                     email = "[email protected]",
@@ -30,6 +30,7 @@ URL: https://docs.ropensci.org/tabulapdf (website)
 BugReports: https://github.com/ropensci/tabulapdf/issues
 Imports:
     png,
+    readr,
     rJava,
     tools,
     utils

diff --git a/NAMESPACE b/NAMESPACE
@@ -22,6 +22,6 @@ importFrom(rJava,.jcall)
 importFrom(rJava,.jfloat)
 importFrom(rJava,J)
 importFrom(rJava,new)
+importFrom(readr,read_delim)
 importFrom(tools,file_path_sans_ext)
 importFrom(utils,download.file)
-importFrom(utils,read.delim)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# CHANGES TO tabulapdf 1.0.5-2
+
+* Uses readr for much faster parsing of extracted tables.
+* The default output format is now a list of tibbles.
+* All tests pass.
+
 # CHANGES TO tabulapdf 1.0.5
 
 * Package renamed to `tabulapdf`

diff --git a/R/extract_tables.R b/R/extract_tables.R
@@ -23,8 +23,8 @@
 #' @param \dots These are additional arguments passed to the internal functions dispatched by \code{method}.
 #' @details This function mimics the behavior of the Tabula command line utility. It returns a list of tibbles containing tables extracted from a file by default. This response behavior can be changed by using the following options.
 #' \itemize{
+#'   \item \code{output = "tibble"} attempts to coerce the structure returned by \code{output = "character"} into a list of tibbles and returns character strings where this fails.
 #'   \item \code{output = "character"} returns a list of single-element character vectors, where each vector is a tab-delimited, line-separated string of concatenated table cells.
-#'   \item \code{output = "data.frame"} attempts to coerce the structure returned by \code{method = "character"} into a list of data.frames and returns character strings where this fails.
 #'   \item \code{output = "csv"} writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the same directory as the original PDF. \code{output = "tsv"} does the same but with tab-separated (TSV) files using Tabula's TSVWriter and \code{output = "json"} does the same using Tabula's JSONWriter method. Each of these three options returns the path to the directory containing the extracted table files.
 #'   \item \code{output = "asis"} returns the Java object reference, which can be useful for debugging or for writing a custom parser.
 #' }
@@ -49,11 +49,12 @@
 #' ## part of the table
 #' extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)))
 #'
-#' # return data.frames
-#' extract_tables(f, pages = 2, output = "data.frame")
+#' # return tibbles
+#' extract_tables(f, pages = 2, output = "tibble")
 #' }
 #' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
-#' @importFrom utils read.delim download.file
+#' @importFrom utils download.file
+#' @importFrom readr read_delim
 #' @importFrom tools file_path_sans_ext
 #' @importFrom rJava J new .jfloat .jcall
 #' @export
@@ -64,7 +65,7 @@ extract_tables <- function(file,
                            guess = TRUE,
                            method = c("decide", "lattice", "stream"),
                            output = c(
-                             "matrix", "data.frame", "character",
+                             "tibble", "matrix", "character",
                              "asis", "csv", "tsv", "json"
                            ),
                            outdir = NULL,
@@ -160,7 +161,7 @@ extract_tables <- function(file,
     "json" = write_jsons(tables, file = file, outdir = outdir, ...),
     "character" = list_characters(tables, encoding = encoding, ...),
     "matrix" = list_matrices(tables, encoding = encoding, ...),
-    "data.frame" = list_data_frames(tables, encoding = encoding, ...),
+    "tibble" = list_data_frames(tables, encoding = encoding, ...),
     "asis" = tables,
     tables
   )

diff --git a/R/output.R b/R/output.R
@@ -89,17 +89,17 @@ list_matrices <- function(tables, encoding = NULL, ...) {
     out
 }
 
-list_characters <- function(tables, sep = "\t", encoding = NULL, ...) {
+list_characters <- function(tables, delim = "\t", encoding = NULL, ...) {
     m <- list_matrices(tables, encoding = encoding, ...)
     lapply(m, function(x) {
-        paste0(apply(x, 1, paste, collapse = sep), collapse = "\n")
+        paste0(apply(x, 1, paste, collapse = delim), collapse = "\n")
     })
 }
 
-list_data_frames <- function(tables, sep = "\t", stringsAsFactors = FALSE, encoding = NULL, ...) {
-    char <- list_characters(tables = tables, sep = sep, encoding = encoding)
+list_data_frames <- function(tables, delim = "\t", encoding = NULL, ...) {
+    char <- list_characters(tables = tables, delim = delim, encoding = encoding)
     lapply(char, function(x) {
-        o <- try(read.delim(text = x, stringsAsFactors = stringsAsFactors, ...))
+        o <- try(read_delim(file = x, delim = delim))
         if (inherits(o, "try-error")) {
             return(x)
         } else {

diff --git a/dev/errors.R b/dev/errors.R
@@ -2,6 +2,12 @@
 
 library(tabulapdf)
 
-out <- extract_tables("inst/examples/data.pdf", pages = 1, output = "data.frame")
+out <- extract_tables("inst/examples/data.pdf", pages = 1, output = "tibble")
+
+class(out)
+
+class(out[[1]])
+
+library(tibble)
 
 out
diff --git a/inst/examples/text.md b/inst/examples/text.md
diff --git a/inst/examples/text.pdf b/inst/examples/text.pdf
diff --git a/inst/examples/text.qmd b/inst/examples/text.qmd
@@ -0,0 +1,11 @@
+---
+format: pdf
+---
+
+\pagenumbering{gobble}
+
+42 is the number from which the meaning of life, the universe, and everything can be derived.
+
+\newpage
+
+42 is the number from which the meaning of life, the universe, and everything can be derived.
diff --git a/man/extract_tables.Rd b/man/extract_tables.Rd
diff --git a/tests/testthat/test_extract_tables.R b/tests/testthat/test_extract_tables.R
@@ -5,7 +5,7 @@ sf <- system.file("examples", "data.pdf", package = "tabulapdf")
 test_that("It basically works", {
     tab1 <- extract_tables(sf)
     expect_true(is.list(tab1))
-    expect_true(is.matrix(tab1[[1]]))
+    expect_true(is.data.frame(tab1[[1]]))
 })
 
 test_that("Warning for ignored arguments", {
@@ -30,26 +30,26 @@ test_that("Import from remote file works", {
     tab2 <- extract_tables(f2)
     expect_true(is.list(tab2))
     expect_true(length(tab2) == 2)
-    expect_true(is.matrix(tab2[[1]]))
+    expect_true(is.data.frame(tab2[[1]]))
 })
 
 test_that("Import from remote non-Western file", {
     f3 <- "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/arabic.pdf"
     tab3 <- extract_tables(f3)
     expect_true(is.list(tab3))
     expect_true(length(tab3) == 1)
-    expect_true(is.matrix(tab3[[1]]))
+    expect_true(is.data.frame(tab3[[1]]))
 })
 
 test_that("Test 'area' argument", {
     a4a <- list(c(122, 149, 536, 576))
-    tab4a <- extract_tables(sf, pages = 1, area = a4a, guess = FALSE, output = "data.frame")
+    tab4a <- extract_tables(sf, pages = 1, area = a4a, guess = FALSE, output = "tibble")
     expect_true(is.list(tab4a))
     expect_true(is.data.frame(tab4a[[1]]))
     expect_true(nrow(tab4a[[1]]) == 32)
     expect_true(ncol(tab4a[[1]]) == 12)
     a4b <- list(c(122, 149, 251, 464))
-    tab4b <- extract_tables(sf, pages = 1, area = a4b, guess = FALSE, output = "data.frame")
+    tab4b <- extract_tables(sf, pages = 1, area = a4b, guess = FALSE, output = "tibble")
     expect_true(is.list(tab4b))
     expect_true(is.data.frame(tab4b[[1]]))
     expect_true(nrow(tab4b[[1]]) == 9)
@@ -61,15 +61,15 @@ test_that("Test 'columns' argument", {
     expect_true(is.list(tab5))
     expect_true(length(tab5) == 1)
     expect_true(ncol(tab5[[1]]) == 2)
-    expect_true(nrow(tab5[[1]]) == 34)
+    expect_true(nrow(tab5[[1]]) == 33)
 })
 
 test_that("Extract from encrypted PDF", {
     f6 <- "https://github.com/tabulapdf/tabula-java/raw/98957221950af4b90620b51a29e0bbe502eea9ad/src/test/resources/technology/tabula/encrypted.pdf"
     expect_error(extract_tables(f6, password = "wrongpassword"))
     tab6 <- extract_tables(f6, password = "userpassword")
     expect_true(is.list(tab6))
-    expect_true(is.matrix(tab6[[1]]))
+    expect_true(is.data.frame(tab6[[1]]))
 })
 
 test_that("Test 'copy' argument", {

diff --git a/tests/testthat/test_extract_text.R b/tests/testthat/test_extract_text.R
@@ -4,53 +4,39 @@ sf <- system.file("examples", "text.pdf", package = "tabulapdf")
 
 test_that("Text can be extracted from the whole document", {
   txt <- extract_text(sf, encoding = "UTF-8")
-  cite <- paste(format(citation(), style = "citation"), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripcite <- gsub("[[:space:]+]", "", cite)
-  expect_identical(nchar(striptxt), 2L * nchar(stripcite))
+  expect_identical(txt, "42 is the number from which the meaning of life, the universe, and everything can be derived.\n42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("'page' argument in extract_text works", {
   txt <- extract_text(sf, pages = 1, encoding = "UTF-8")
-  cite <- paste(format(citation(), style = "citation"), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripcite <- gsub("[[:space:]+]", "", cite)
-  expect_identical(nchar(striptxt), nchar(stripcite))
+  expect_identical(txt, "42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("'area' argument in extract_text works", {
-  txt <- extract_text(sf, area = list(c(209.4, 140.5, 304.2, 500.8)), encoding = "UTF-8")
-  txt <- paste(txt, collapse = "")
-  bibtex <- paste(as.character(toBibtex(citation())), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripbib <- gsub("[[:space:]+]", "", bibtex)
-  expect_identical(nchar(striptxt), 2L * nchar(stripbib))
+  txt <- extract_text(sf, area = list(c(10, 15, 100, 550)), encoding = "UTF-8")
+  expect_identical(txt[1], "42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("'area' and 'page' arguments in extract_text work together", {
-  txt <- extract_text(sf, pages = 1, area = list(c(209.4, 140.5, 304.2, 500.8)), encoding = "UTF-8")
-  bibtex <- paste(as.character(toBibtex(citation())), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripbib <- gsub("[[:space:]+]", "", bibtex)
-  expect_identical(nchar(striptxt), nchar(stripbib))
+  txt <- extract_text(sf, pages = 1, area = list(c(10, 15, 100, 550)), encoding = "UTF-8")
+  expect_identical(txt, "42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("Multiple pages with different areas can be extracted", {
   txt <- extract_text(sf,
     pages = c(1, 2),
     area = list(
-      c(124, 131, 341.6, 504.3),
-      c(209.4, 140.5, 304.2, 500.8)
+      c(10, 15, 100, 550),
+      c(10, 15, 100, 500)
     ), encoding = "UTF-8"
   )
-  txt <- paste(txt, collapse = "")
-  cite <- paste(format(citation(), style = "citation"), collapse = "")
-  bibtex <- paste(as.character(toBibtex(citation())), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripcite <- gsub("[[:space:]+]", "", cite)
-  stripbib <- gsub("[[:space:]+]", "", bibtex)
-  bothpages <- paste0(stripcite, stripbib)
-  expect_identical(nchar(striptxt), nchar(bothpages))
+  expect_identical(
+    txt,
+    c(
+      "42 is the number from which the meaning of life, the universe, and everything can be derived.\n",
+      "42 is the number from which the meaning of life, the universe, and everything can be deriv\n"
+    )
+  )
 })
 
 test_that("Test 'copy' argument", {

diff --git a/tests/testthat/test_non-latin.R b/tests/testthat/test_non-latin.R
@@ -2,18 +2,17 @@ context("Non-latin character tests")
 
 test_that("Read Spanish language PDF", {
     f1 <- "https://github.com/tabulapdf/tabula-java/raw/98957221950af4b90620b51a29e0bbe502eea9ad/src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf"
-    expect_true(is.matrix(extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE)[[1]]))
-    t1a <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "data.frame", encoding = "latin1")
-    #expect_true(names(t1a[[1]])[2] == "Frente.CÃ.vico.por.Santiago", label = "latin1 encoding worked")
-    t1b <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "data.frame", encoding = "UTF-8")
-    #expect_true(names(t1b[[1]])[2] == "Frente.Cívico.por.Santiago", label = "UTF-8 encoding worked")
-
+    t1 <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE)
+    t1a <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "tibble", encoding = "latin1")
+    t1b <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "tibble", encoding = "UTF-8")
+    expect_true(is.data.frame(t1[[1]]))
+    expect_true(is.data.frame(t1a[[1]]))
+    expect_true(is.data.frame(t1b[[1]]))
 })
 
 test_that("Read French language PDF w/correct encoding", {
     f2 <- "http://www.europarl.europa.eu/oeil/popups/printfichetechnical.pdf?id=673511&lang=fr"
     t2a <- extract_text(f2, page = 1, encoding = "latin1")
     t2b <- extract_text(f2, page = 1, encoding = "UTF-8")
-    #expect_true(nchar(strsplit(t2a, "\n")[[1]][1]) == 50, label = "latin1 encoding worked")
-    #expect_true(nchar(strsplit(t2b, "\n")[[1]][1]) == 47, label = "UTF-8 encoding worked")
+    expect_false(t2a == t2b)
 })
diff --git a/vignettes/tabulapdf.Rmd b/vignettes/tabulapdf.Rmd
@@ -63,7 +63,7 @@ By default, `extract_tables()` returns a list of character matrices. This is bec
 
 ```{r}
 # attempt to coerce tables to tibbles
-extract_tables(f, pages = 2, output = "data.frame")
+extract_tables(f, pages = 2, output = "tibble")
 ```
 
 Tabula itself implements three "writer" methods that write extracted tables to disk as CSV, TSV, or JSON files. These can be specified by `output = "csv"`, `output = "tsv"`, and `output = "json"`, respectively. For CSV and TSV, one file is written to disk for each table, and the R session's temporary directory `tempdir()` is used by default (alternatively, the directory can be specified through the `outdir` argument). For JSON, one file is written containing information about all tables. For these methods, `extract_tables()` returns a path to the directory containing the output files.
@@ -80,16 +80,16 @@ If none of the standard methods works well, you can specify `output = "asis"` to
 By default, tabulapdf uses Tabula's table detection algorithm to automatically identify tables within each page of a PDF. This automatic detection can be toggled off by setting `guess = FALSE` and specifying an "area" within each PDF page to extract the table from. Here is a comparison of the default settings, versus extracting from two alternative areas within a page:
 
 ```{r}
-str(extract_tables(f, pages = 2, guess = TRUE, output = "data.frame"))
-str(extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)), guess = FALSE, output = "data.frame"))
-str(extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)), guess = FALSE, output = "data.frame"))
+str(extract_tables(f, pages = 2, guess = TRUE, output = "tibble"))
+str(extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)), guess = FALSE, output = "tibble"))
+str(extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)), guess = FALSE, output = "tibble"))
 ```
 
 The `area` argument should be a list either of length 1 (to use the same area for each specified page) or of length equal to the number of pages specified. This also means that you can extract multiple areas from one page by specifying the page twice and indicating the two areas separately:
 
 ```{r}
 a2 <- list(c(126, 149, 212, 462), c(126, 284, 174, 417))
-str(extract_tables(f, pages = c(2, 2), area = a2, guess = FALSE, output = "data.frame"))
+str(extract_tables(f, pages = c(2, 2), area = a2, guess = FALSE, output = "tibble"))
 ```
 
 ## Interactive Table Extraction ##