diff --git a/DESCRIPTION b/DESCRIPTION index d424d0b..14d541f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,9 @@ Authors@R: c(person("Thomas J.", "Leeper", comment = "rOpenSci reviewer"), person("Lincoln", "Mullen", role = "ctb", - comment = "rOpenSci reviewer")) + comment = "rOpenSci reviewer"), + person("Thore", "Engel", + role = "ctb")) Maintainer: Tom Paskhalis <tpaskhalis@gmail.com> Description: Bindings for the 'Tabula' <http://tabula.technology/> 'Java' library, which can extract tables from PDF documents. The 'tabulizerjars' @@ -41,4 +43,4 @@ Suggests: testthat SystemRequirements: Java (>= 7.0) VignetteBuilder: knitr -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index deb9c46..4796d19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(extract_text) export(get_n_pages) export(get_page_dims) export(locate_areas) +export(locate_columns) export(make_thumbnails) export(merge_pdfs) export(split_pdf) diff --git a/NEWS.md b/NEWS.md index caae7e8..bf731ff 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,11 @@ + + # CHANGES TO tabulizer 0.2.2 * `extract_tables()` gets `outdir` argument for writing out CSV, TSV and JSON files. * Fixes in vignette. +* addition of `locate_columns()` function. # CHANGES TO tabulizer 0.2.1
diff --git a/R/locate_columns.R b/R/locate_columns.R
new file mode 100644
index 0000000..cddaeda
--- /dev/null
+++ b/R/locate_columns.R
@@ -0,0 +1,93 @@
+#' Locate separators between columns
+#'
+#' This function allows the user to manually locate the separators between columns of a table in a pdf. The output can be used as the \code{columns} argument in \code{extract_tables()}.
+#'
+#' Manually selecting the separators can ensure that values stay in their respective columns. This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_areas()} function and its helpers.
+#' @param file A character string specifying the path or URL to a PDF file.
+#' @param pages An optional integer vector specifying pages to extract from.
+#' @param resolution An integer specifying the resolution of the PNG images conversions. A low resolution is used by default to speed image loading.
+#' @param copy Specifies whether the original local file(s) should be copied to tempdir() before processing. FALSE by default. The argument is ignored if file is URL.
+#' @return A list with one element per page thumbnail. Each element is a numeric vector holding the x-coordinates of the separators clicked on that page, or \code{NULL} for a page without a thumbnail.
+#' @author Thore Engel <thore.engel@posteo.de>
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' f <- system.file("examples", "data.pdf", package = "tabulizer")
+#' separators <- locate_columns(f, pages = 1)
+#' extract_tables(f, pages = 1, columns = separators[1])
+#' }
+#'
+locate_columns <- function(file,
+                           pages = NULL,
+                           resolution = 60L,
+                           copy = FALSE) {
+  if (!interactive()) {
+    stop("locate_columns() is only available in an interactive session")
+  }
+
+  file <- localize_file(file, copy = copy)
+  # `file` is deliberately not unlinked on exit: with copy = FALSE it is
+  # presumably still the caller's own PDF (TODO confirm in localize_file()).
+  dims <- get_page_dims(file, pages = pages)
+  paths <- make_thumbnails(file,
+                           outdir = tempdir(),
+                           pages = pages,
+                           format = "png",
+                           resolution = resolution)
+  on.exit(unlink(paths), add = TRUE)
+
+  # One slot per thumbnail; a slot stays NULL when its thumbnail path is NA.
+  separators <- rep(list(NULL), length(paths))
+  show_hint <- TRUE
+  for (i in seq_along(paths)) {
+    if (is.na(paths[i])) {
+      next
+    }
+    page <- try_columns_reduced(file = paths[i],
+                                dims = dims[[i]],
+                                warn = show_hint)
+    # The usage hint is only shown for the first page that is displayed.
+    show_hint <- FALSE
+    if (!is.null(page[["separators"]])) {
+      separators[[i]] <- page[["separators"]]
+    }
+  }
+  separators
+}
+
+#' Helper function to locate_columns()
+#'
+#' @param file A character string specifying the path to the PNG thumbnail of a single page.
+#' @param dims A vector whose first two elements are the width and height of the page; they set the extent of the plotting region.
+#' @param warn Logical. If \code{TRUE}, a message with usage instructions is shown before the page is displayed.
+try_columns_reduced <- function(file, dims, warn = FALSE) {
+  if (warn) {
+    message("Click at the locations of separators between columns.")
+  }
+  if (grDevices::dev.capabilities()[["rasterImage"]] == "no") {
+    stop("Graphics device does not support rasterImage() plotting")
+  }
+  thispng <- png::readPNG(file, native = TRUE)
+
+  # Fill the whole device with the page image: no margins, no axis padding.
+  pre_par <- graphics::par(mar = c(0, 0, 0, 0), xaxs = "i", yaxs = "i",
+                           bty = "n")
+  on.exit(graphics::par(pre_par), add = TRUE)
+  # Registered before plotting so the device is also closed if drawing fails.
+  on.exit(grDevices::dev.off(), add = TRUE)
+  graphics::plot(c(0, dims[1]), c(0, dims[2]), type = "n",
+                 xlab = "", ylab = "", asp = 1)
+  graphics::rasterImage(thispng, 0, 0, dims[1], dims[2])
+
+  # Only the x-coordinates of the clicks matter for column separators.
+  clicks <- graphics::locator()
+  xs <- as.numeric(clicks[["x"]])
+  if (length(xs) > 0) {
+    graphics::abline(v = xs)
+    # Keep the drawn separators visible briefly before the device is closed.
+    Sys.sleep(4)
+  }
+  # `key` is always "right"; this helper does not read keyboard input.
+  list(key = "right", separators = xs)
+}
diff --git a/man/extract_areas.Rd b/man/extract_areas.Rd index 41a7fea..950dd90 100644 --- a/man/extract_areas.Rd +++ b/man/extract_areas.Rd @@ -5,8 +5,8 @@ \alias{extract_areas} \title{extract_areas} \usage{ -locate_areas(file, pages = NULL, resolution = 60L, widget = c("shiny", - "native", "reduced"), copy = FALSE) +locate_areas(file, pages = NULL, resolution = 60L, + widget = c("shiny", "native", "reduced"), copy = FALSE) extract_areas(file, pages = NULL, guess = FALSE, copy = FALSE, ...)
} diff --git a/man/locate_columns.Rd b/man/locate_columns.Rd new file mode 100644 index 0000000..0e1f06a --- /dev/null +++ b/man/locate_columns.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locate_columns.R +\name{locate_columns} +\alias{locate_columns} +\title{Locate separators between columns} +\usage{ +locate_columns(file, pages = NULL, resolution = 60L, copy = FALSE) +} +\arguments{ +\item{file}{A character string specifying the path or URL to a PDF file.} + +\item{pages}{An optional integer vector specifying pages to extract from.} + +\item{resolution}{An integer specifying the resolution of the PNG images conversions. A low resolution is used by default to speed image loading.} + +\item{copy}{Specifies whether the original local file(s) should be copied to tempdir() before processing. FALSE by default. The argument is ignored if file is URL.} +} +\value{ +A list with one element per page thumbnail. Each element is a numeric vector holding the x-coordinates of the separators clicked on that page, or \code{NULL} for a page without a thumbnail. +} +\description{ +This function allows the user to manually locate the separators between columns of a table in a pdf. The output can be used as the \code{columns} argument in \code{extract_tables()}. +} +\details{ +Manually selecting the separators can ensure that values stay in their respective columns. This is useful when some rows of a table have no or only little white space between columns. The code is an adaptation of the \code{locate_areas()} function and its helpers. 
+} +\examples{ +\dontrun{ +f <- system.file("examples", "data.pdf", package = "tabulizer") +separators <- locate_columns(f, pages = 1) +extract_tables(f, pages = 1, columns = separators[1]) +} + +} +\author{ +Thore Engel <thore.engel@posteo.de> +} diff --git a/man/make_thumbnails.Rd b/man/make_thumbnails.Rd index 2f00dd1..beee5a4 100644 --- a/man/make_thumbnails.Rd +++ b/man/make_thumbnails.Rd @@ -5,7 +5,8 @@ \title{make_thumbnails} \usage{ make_thumbnails(file, outdir = NULL, pages = NULL, format = c("png", - "jpeg", "bmp", "gif"), resolution = 72, password = NULL, copy = FALSE) + "jpeg", "bmp", "gif"), resolution = 72, password = NULL, + copy = FALSE) } \arguments{ \item{file}{A character string specifying the path or URL to a PDF file.} diff --git a/man/try_columns_reduced.Rd b/man/try_columns_reduced.Rd new file mode 100644 index 0000000..4665e66 --- /dev/null +++ b/man/try_columns_reduced.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/locate_columns.R +\name{try_columns_reduced} +\alias{try_columns_reduced} +\title{Helper function to locate_columns()} +\usage{ +try_columns_reduced(file, dims, warn = FALSE) +} +\arguments{ +\item{file}{A character string specifying the path to the PNG thumbnail of a single page.} + +\item{dims}{A vector whose first two elements are the width and height of the page; they set the extent of the plotting region.} + +\item{warn}{Logical. If \code{TRUE}, a message with usage instructions is shown before the page is displayed.} +} +\description{ +Helper function to locate_columns() +}