Merge pull request #39 from RobertMyles/v2.0.1

V2.0.1
RobertMyles · Mar 7, 2020 · 62164d0 · 62164d0
2 parents 50aa3d5 + 95a648d
commit 62164d0
Show file tree

Hide file tree

Showing 29 changed files with 1,067 additions and 90 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tidyRSS
 Type: Package
 Title: Tidy RSS for R
-Version: 2.0.0
+Version: 2.0.1
 Author: Robert Myles McDonnell
 Maintainer: Robert Myles McDonnell <[email protected]>
 Description: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -28,6 +28,7 @@ importFrom(tibble,tibble)
 importFrom(xml2,as_list)
 importFrom(xml2,read_xml)
 importFrom(xml2,xml_attr)
+importFrom(xml2,xml_contents)
 importFrom(xml2,xml_find_all)
 importFrom(xml2,xml_find_first)
 importFrom(xml2,xml_text)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+tidyRSS v2.0.1 (Release date: 07/03/2020)
+============
+Changes:
+- More complete testing
+- Better checking of 'geo' RSS feeds
+- New `parse_dates` argument that allows users to skip parsing of dates.
+- Bugfix for missing import of dplyr's `case_when()`.
+
 tidyRSS v2.0.0 (Release date: 24/02/2020)
 ============
 Changes:

diff --git a/R/atom_parse.R b/R/atom_parse.R
@@ -1,4 +1,4 @@
-atom_parse <- function(response, list, clean_tags) {
+atom_parse <- function(response, list, clean_tags, parse_dates) {
   # https://tools.ietf.org/html/rfc4287
   # https://validator.w3.org/feed/docs/atom.html
   res <- read_xml(response)
@@ -64,8 +64,8 @@ atom_parse <- function(response, list, clean_tags) {
   }
 
   # clean up
-  meta <- clean_up(meta, "atom", clean_tags)
-  entries <- clean_up(entries, "atom", clean_tags)
+  meta <- clean_up(meta, "atom", clean_tags, parse_dates)
+  entries <- clean_up(entries, "atom", clean_tags, parse_dates)
 
   if (isTRUE(list)) {
     out <- list(meta = meta, entries = entries)

diff --git a/R/clean_up.R b/R/clean_up.R
@@ -5,7 +5,7 @@
 # - dates are parsed into datetime columns
 # - HTML tags are removed
 # - list-columns of length 1 are unlisted
-clean_up <- function(df, type, clean_tags) {
+clean_up <- function(df, type, clean_tags, parse_dates) {
   # unlist list-cols of length 1
   df <- df %>% mutate_if(is.list, delist)
   # remove empty and NA cols
@@ -16,26 +16,32 @@ clean_up <- function(df, type, clean_tags) {
     })
   # parse dates & clean HTML
   if (type == "json") {
-    df <- date_parser(df, item_date_published)
-    df <- date_parser(df, item_date_modified)
+    if (isTRUE(parse_dates)) {
+      df <- date_parser(df, item_date_published)
+      df <- date_parser(df, item_date_modified)
+    }
     if (isTRUE(clean_tags)) {
       if (has_name(df, "item_content_html")) {
         df <- df %>%
           mutate(item_content_html = cleanFun(item_content_html))
       }
     }
   } else if (type == "rss") {
-    df <- date_parser(df, feed_pub_date)
-    df <- date_parser(df, feed_last_build_date)
-    df <- date_parser(df, item_pub_date)
+    if (isTRUE(parse_dates)) {
+      df <- date_parser(df, feed_pub_date)
+      df <- date_parser(df, feed_last_build_date)
+      df <- date_parser(df, item_pub_date)
+    }
     if (isTRUE(clean_tags)) {
       if (has_name(df, "item_description")) {
         df$item_description <- cleanFun(df$item_description)
       }
     }
   } else if (type == "atom") {
-    df <- date_parser(df, feed_last_updated)
-    df <- date_parser(df, entry_published)
+    if (isTRUE(parse_dates)) {
+      df <- date_parser(df, feed_last_updated)
+      df <- date_parser(df, entry_published)
+    }
     if (isTRUE(clean_tags)) {
       if (has_name(df, "entry_summary")) {
         df$entry_summary <- cleanFun(df$entry_summary)

diff --git a/R/json_parse.R b/R/json_parse.R
@@ -1,4 +1,4 @@
-json_parse <- function(response, list, clean_tags) {
+json_parse <- function(response, list, clean_tags, parse_dates) {
   # spec here: https://jsonfeed.org/version/1
   res <- parse_json(response)
   items <- res$items
@@ -50,8 +50,8 @@ json_parse <- function(response, list, clean_tags) {
   entries$item_author <- NA
 
   # clean up
-  meta <- clean_up(meta, "json", clean_tags)
-  entries <- clean_up(entries, "json", clean_tags)
+  meta <- clean_up(meta, "json", clean_tags, parse_dates)
+  entries <- clean_up(entries, "json", clean_tags, parse_dates)
 
   if (isTRUE(list)) {
     out <- list(meta = meta, entries = entries)

diff --git a/R/rss_parse.R b/R/rss_parse.R
@@ -1,4 +1,4 @@
-rss_parse <- function(response, list, clean_tags) {
+rss_parse <- function(response, list, clean_tags, parse_dates) {
   # spec here: https://validator.w3.org/feed/docs/rss2.html
   res <- response %>% read_xml()
   geocheck(res)
@@ -49,8 +49,8 @@ rss_parse <- function(response, list, clean_tags) {
   )
 
   # clean up
-  meta <- clean_up(meta, "rss", clean_tags)
-  entries <- clean_up(entries, "rss", clean_tags)
+  meta <- clean_up(meta, "rss", clean_tags, parse_dates)
+  entries <- clean_up(entries, "rss", clean_tags, parse_dates)
 
   if (isTRUE(list)) {
     out <- list(meta = meta, entries = entries)

diff --git a/R/safe_get.R b/R/safe_get.R
@@ -13,6 +13,6 @@ safe_get <- function(feed, user = NULL, config = list()) {
   } else {
     message("GET request successful. Parsing...\n")
   }
-  result <- req$result
-  return(result)
+  result <- req$result #nocov
+  return(result) # nocov
 }
diff --git a/R/safe_run.R b/R/safe_run.R
@@ -15,8 +15,8 @@ safe_run <- function(response, type = c("first", "all"), ...) {
     ret <- result$result %>% xml_text()
     if (length(ret) == 0) ret <- def
   } else {
-    ret <- read_xml("<span></span>") %>%
-      xml_text()
+    ret <- read_xml("<span></span>") %>% #nocov
+      xml_text() #nocov
   }
   return(ret)
 }
diff --git a/R/tidyfeed.R b/R/tidyfeed.R
@@ -4,7 +4,7 @@
 #' @importFrom httr GET user_agent
 #' @importFrom anytime anytime
 #' @importFrom xml2 read_xml as_list xml_text xml_find_all xml_find_first
-#' @importFrom xml2 xml_attr
+#' @importFrom xml2 xml_attr xml_contents
 #' @importFrom dplyr select full_join mutate_if mutate select_if bind_cols
 #' @importFrom dplyr case_when
 #' @importFrom purrr map map_chr safely flatten compact keep map_df
@@ -23,6 +23,14 @@
 #' Cleans columns of HTML tags.
 #' @param list \code{logical}, default \code{FALSE}.
 #' Return metadata and content as separate dataframes in a named list.
+#' @param parse_dates \code{logical}, default \code{TRUE}.
+#' If \code{TRUE}, tidyRSS will attempt to parse columns that contain
+#' datetime values, although this may fail (see Note).
+#' @note \code{tidyfeed()} attempts to parse columns that should contain
+#' dates. This can fail, as can be seen
+#' \href{https://github.com/RobertMyles/tidyRSS/issues/37}{here}. If you need
+#' lower-level control over the parsing of dates, it's better to set
+#' \code{parse_dates} to \code{FALSE} and then parse these columns yourself.
 #' @seealso \link[httr:GET]{GET()}
 #' @examples
 #' \dontrun{
@@ -34,13 +42,17 @@
 #' tidyfeed("https://daringfireball.net/feeds/json")
 #' }
 #' @export
-tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE) {
+tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE,
+                     parse_dates = TRUE) {
   # checks
   if (!identical(length(feed), 1L)) stop("Please supply only one feed at a time.")
   if (!is.logical(list)) stop("`list` may be FALSE or TRUE only.")
   if (!is.logical(clean_tags)) stop("`clean_tags` may be FALSE or TRUE only.")
   if (!is.list(config)) stop("`config` should be a list only.")
+  if (!is.logical(parse_dates)) stop("`parse_dates` may be FALSE or TRUE only.")
 
+  # nocov start
+  # (functions are tested at lower level)
   # send user agent
   ua <- set_user(config)
   # try to get response
@@ -49,13 +61,14 @@ tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE) {
   typ <- type_check(response)
   # send to parsers
   if (typ == "rss") {
-    parsed <- rss_parse(response, list, clean_tags)
+    parsed <- rss_parse(response, list, clean_tags, parse_dates)
   } else if (typ == "atom") {
-    parsed <- atom_parse(response, list, clean_tags)
+    parsed <- atom_parse(response, list, clean_tags, parse_dates)
   } else if (typ == "json") {
-    parsed <- json_parse(response, list, clean_tags)
+    parsed <- json_parse(response, list, clean_tags, parse_dates)
   } else {
     stop(error_msg)
   }
   return(parsed)
+  # nocov end
 }
diff --git a/R/utils.R b/R/utils.R
@@ -18,21 +18,34 @@ set_user <- function(config) {
 # simply reads 'content-type' of response to check type.
 # if contains both atom & rss, prefers rss
 type_check <- function(response) {
+  if (class(response) != "response") stop("`type_check` cannot evaluate this response.")
   content_type <- response$headers$`content-type`
+  xmlns <- xml_attr(read_xml(response), "xmlns")
   typ <- case_when(
     grepl(x = content_type, pattern = "atom") ~ "atom",
     grepl(x = content_type, pattern = "xml") ~ "rss",
     grepl(x = content_type, pattern = "rss") ~ "rss",
     grepl(x = content_type, pattern = "json") ~ "json",
     TRUE ~ "unknown"
   )
+  # overwrite for cases like https://github.com/RobertMyles/tidyRSS/issues/38
+  if (grepl("Atom", xmlns)) typ <- "atom"
   return(typ)
 }
 
 # geocheck - warning about geo feeds
 geocheck <- function(x) {
-  gcheck <- grepl("http://www.georss.org/georss", xml_attr(x, "xmlns:georss"))
-  if (isTRUE(geocheck)) {
+
+  point <- xml_find_all(x, "//*[name()='georss:point']") %>% length()
+  line <- xml_find_all(x, "//*[name()='georss:line']") %>% length()
+  polygon <- xml_find_all(x, "//*[name()='georss:polygon']") %>% length()
+  box <- xml_find_all(x, "//*[name()='georss:box']") %>% length()
+  f_type <- xml_find_all(x, "//*[name()='georss:featuretypetag']") %>% length()
+  r_tag <- xml_find_all(x, "//*[name()='georss:relationshiptag']") %>% length()
+  f_name <- xml_find_all(x, "//*[name()='georss:featurename']") %>% length()
+  geo_elements <- c(point, line, polygon, box, f_type, r_tag, f_name)
+
+  if (any(geo_elements > 1)) {
     message("Parsing feeds with geographic information (geoRSS, geoJSON etc.) is
 deprecated in tidyRSS as of version 2.0.0. The geo-fields in this feed will be ignored.
 If you would like to fetch this information, try the tidygeoRSS package:

diff --git a/README.Rmd b/README.Rmd
@@ -20,14 +20,15 @@ knitr::opts_chunk$set(
 
 tidyRSS is a package for extracting data from [RSS feeds](https://en.wikipedia.org/wiki/RSS), including Atom feeds and JSON feeds. For geo-type feeds, see the section on changes in version 2 below, or jump directly to [tidygeoRSS](https://github.com/RobertMyles/tidygeoRSS), which is designed for that purpose.  
 
-It is easy to use as it only has one function, `tidyfeed()`, which takes four arguments:  
+It is easy to use as it only has one function, `tidyfeed()`, which takes five arguments:  
 
 - the url of the feed; 
 - a logical flag for whether you want the feed returned as a tibble or a list containing two tibbles; 
 - a logical flag for whether you want HTML tags removed from columns in the dataframe; 
-- and a config list that is passed off to [`httr::GET()`](https://httr.r-lib.org/reference/config.html).   
-
+- a config list that is passed off to [`httr::GET()`](https://httr.r-lib.org/reference/config.html);
+- and a logical flag, `parse_dates`, for whether you want `tidyfeed()` to attempt to parse dates (see below).  
 
+If `parse_dates` is `TRUE`, `tidyfeed()` will attempt to parse dates using the [anytime](https://github.com/eddelbuettel/anytime) package. Note that this removes some lower-level control that you may wish to retain over how dates are parsed. See [this issue](https://github.com/RobertMyles/tidyRSS/issues/37) for an example. 
 
 ## Installation
 

diff --git a/README.md b/README.md
@@ -16,15 +16,23 @@ below, or jump directly to
 designed for that purpose.
 
 It is easy to use as it only has one function, `tidyfeed()`, which takes
-four arguments:
+five arguments:
 
   - the url of the feed;
   - a logical flag for whether you want the feed returned as a tibble or
     a list containing two tibbles;
   - a logical flag for whether you want HTML tags removed from columns
     in the dataframe;
-  - and a config list that is passed off to
-    [`httr::GET()`](https://httr.r-lib.org/reference/config.html).
+  - a config list that is passed off to
+    [`httr::GET()`](https://httr.r-lib.org/reference/config.html);
+  - and a logical flag, `parse_dates`, for whether you want `tidyfeed()`
+    to attempt to parse dates (see below).
+
+If `parse_dates` is `TRUE`, `tidyfeed()` will attempt to parse dates
+using the [anytime](https://github.com/eddelbuettel/anytime) package.
+Note that this removes some lower-level control that you may wish to
+retain over how dates are parsed. See [this
+issue](https://github.com/RobertMyles/tidyRSS/issues/37) for an example.
 
 ## Installation
 

diff --git a/cran-comments.md b/cran-comments.md
@@ -1,3 +1,7 @@
+# v2.0.1
+
+This fixes a bug caused by a missing function import and adds a new input argument that allows users to leave dates unparsed, which fixes another bug with NA in date columns. It also improves code coverage and testing.
+
 # v2.0.0
 
 This version is a rewrite of the package, removing the functionality of parsing feeds into tibbles with geographic simple features columns into a sister package. I've adopted a more stringent testing strategy along with much more streamlined code.

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
diff --git a/docs/authors.html b/docs/authors.html