diff --git a/DESCRIPTION b/DESCRIPTION
index 87b41ab..4748e1d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
Package: tidyRSS
Type: Package
Title: Tidy RSS for R
-Version: 2.0.0
+Version: 2.0.1
Author: Robert Myles McDonnell
Maintainer: Robert Myles McDonnell
Description:
diff --git a/NAMESPACE b/NAMESPACE
index a451894..5362d37 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -28,6 +28,7 @@ importFrom(tibble,tibble)
importFrom(xml2,as_list)
importFrom(xml2,read_xml)
importFrom(xml2,xml_attr)
+importFrom(xml2,xml_contents)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_find_first)
importFrom(xml2,xml_text)
diff --git a/NEWS.md b/NEWS.md
index 1da5b2a..ecd98f9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,11 @@
+tidyRSS v2.0.1 (Release date: 07/03/2020)
+============
+Changes:
+- More complete testing
+- Better checking of 'geo' RSS feeds
+- New `parse_dates` argument that allows users to skip the parsing of dates
+- Bugfix for a missing import of dplyr's `case_when()`
+
tidyRSS v2.0.0 (Release date: 24/02/2020)
============
Changes:
diff --git a/R/atom_parse.R b/R/atom_parse.R
index 3679912..b7b3e10 100644
--- a/R/atom_parse.R
+++ b/R/atom_parse.R
@@ -1,4 +1,4 @@
-atom_parse <- function(response, list, clean_tags) {
+atom_parse <- function(response, list, clean_tags, parse_dates) {
# https://tools.ietf.org/html/rfc4287
# https://validator.w3.org/feed/docs/atom.html
res <- read_xml(response)
@@ -64,8 +64,8 @@ atom_parse <- function(response, list, clean_tags) {
}
# clean up
- meta <- clean_up(meta, "atom", clean_tags)
- entries <- clean_up(entries, "atom", clean_tags)
+ meta <- clean_up(meta, "atom", clean_tags, parse_dates)
+ entries <- clean_up(entries, "atom", clean_tags, parse_dates)
if (isTRUE(list)) {
out <- list(meta = meta, entries = entries)
diff --git a/R/clean_up.R b/R/clean_up.R
index 02c7026..8978fe4 100644
--- a/R/clean_up.R
+++ b/R/clean_up.R
@@ -5,7 +5,7 @@
# - dates are parsed into datetime columns
# - HTML tags are removed
# - list-columns of length 1 are unlisted
-clean_up <- function(df, type, clean_tags) {
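+# - if parse_dates is FALSE, date columns are left as character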
+clean_up <- function(df, type, clean_tags, parse_dates) {
# unlist list-cols of length 1
df <- df %>% mutate_if(is.list, delist)
# remove empty and NA cols
@@ -16,8 +16,10 @@ clean_up <- function(df, type, clean_tags) {
})
# parse dates & clean HTML
if (type == "json") {
- df <- date_parser(df, item_date_published)
- df <- date_parser(df, item_date_modified)
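+    # parse dates only if the user has not opted out via parse_dates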
+ if (isTRUE(parse_dates)) {
+ df <- date_parser(df, item_date_published)
+ df <- date_parser(df, item_date_modified)
+ }
if (isTRUE(clean_tags)) {
if (has_name(df, "item_content_html")) {
df <- df %>%
@@ -25,17 +27,21 @@ clean_up <- function(df, type, clean_tags) {
}
}
} else if (type == "rss") {
- df <- date_parser(df, feed_pub_date)
- df <- date_parser(df, feed_last_build_date)
- df <- date_parser(df, item_pub_date)
+ if (isTRUE(parse_dates)) {
+ df <- date_parser(df, feed_pub_date)
+ df <- date_parser(df, feed_last_build_date)
+ df <- date_parser(df, item_pub_date)
+ }
if (isTRUE(clean_tags)) {
if (has_name(df, "item_description")) {
df$item_description <- cleanFun(df$item_description)
}
}
} else if (type == "atom") {
- df <- date_parser(df, feed_last_updated)
- df <- date_parser(df, entry_published)
+ if (isTRUE(parse_dates)) {
+ df <- date_parser(df, feed_last_updated)
+ df <- date_parser(df, entry_published)
+ }
if (isTRUE(clean_tags)) {
if (has_name(df, "entry_summary")) {
df$entry_summary <- cleanFun(df$entry_summary)
diff --git a/R/json_parse.R b/R/json_parse.R
index 583d5b2..b0f4d4f 100644
--- a/R/json_parse.R
+++ b/R/json_parse.R
@@ -1,4 +1,4 @@
-json_parse <- function(response, list, clean_tags) {
+json_parse <- function(response, list, clean_tags, parse_dates) {
# spec here: https://jsonfeed.org/version/1
res <- parse_json(response)
items <- res$items
@@ -50,8 +50,8 @@ json_parse <- function(response, list, clean_tags) {
entries$item_author <- NA
# clean up
- meta <- clean_up(meta, "json", clean_tags)
- entries <- clean_up(entries, "json", clean_tags)
+ meta <- clean_up(meta, "json", clean_tags, parse_dates)
+ entries <- clean_up(entries, "json", clean_tags, parse_dates)
if (isTRUE(list)) {
out <- list(meta = meta, entries = entries)
diff --git a/R/rss_parse.R b/R/rss_parse.R
index dc46e5e..0766885 100644
--- a/R/rss_parse.R
+++ b/R/rss_parse.R
@@ -1,4 +1,4 @@
-rss_parse <- function(response, list, clean_tags) {
+rss_parse <- function(response, list, clean_tags, parse_dates) {
# spec here: https://validator.w3.org/feed/docs/rss2.html
res <- response %>% read_xml()
geocheck(res)
@@ -49,8 +49,8 @@ rss_parse <- function(response, list, clean_tags) {
)
# clean up
- meta <- clean_up(meta, "rss", clean_tags)
- entries <- clean_up(entries, "rss", clean_tags)
+ meta <- clean_up(meta, "rss", clean_tags, parse_dates)
+ entries <- clean_up(entries, "rss", clean_tags, parse_dates)
if (isTRUE(list)) {
out <- list(meta = meta, entries = entries)
diff --git a/R/safe_get.R b/R/safe_get.R
index 40832af..8b2bd18 100644
--- a/R/safe_get.R
+++ b/R/safe_get.R
@@ -13,6 +13,6 @@ safe_get <- function(feed, user = NULL, config = list()) {
} else {
message("GET request successful. Parsing...\n")
}
- result <- req$result
- return(result)
+  result <- req$result # nocov
+  return(result) # nocov
}
diff --git a/R/safe_run.R b/R/safe_run.R
index c61bb75..6eb755e 100644
--- a/R/safe_run.R
+++ b/R/safe_run.R
@@ -15,8 +15,8 @@ safe_run <- function(response, type = c("first", "all"), ...) {
ret <- result$result %>% xml_text()
if (length(ret) == 0) ret <- def
} else {
- ret <- read_xml("") %>%
- xml_text()
+    ret <- read_xml("") %>% # nocov
+      xml_text() # nocov
}
return(ret)
}
diff --git a/R/tidyfeed.R b/R/tidyfeed.R
index 0ca88b3..530d96c 100644
--- a/R/tidyfeed.R
+++ b/R/tidyfeed.R
@@ -4,7 +4,7 @@
#' @importFrom httr GET user_agent
#' @importFrom anytime anytime
#' @importFrom xml2 read_xml as_list xml_text xml_find_all xml_find_first
-#' @importFrom xml2 xml_attr
+#' @importFrom xml2 xml_attr xml_contents
#' @importFrom dplyr select full_join mutate_if mutate select_if bind_cols
#' @importFrom dplyr case_when
#' @importFrom purrr map map_chr safely flatten compact keep map_df
@@ -23,6 +23,14 @@
#' Cleans columns of HTML tags.
#' @param list \code{logical}, default \code{FALSE}.
#' Return metadata and content as separate dataframes in a named list.
+#' @param parse_dates \code{logical}, default \code{TRUE}.
+#' If \code{TRUE}, tidyRSS will attempt to parse columns that contain
+#' datetime values, although this may fail; see the note below.
+#' @note \code{tidyfeed()} attempts to parse columns that should contain
+#' dates. This can fail, as can be seen
+#' \href{https://github.com/RobertMyles/tidyRSS/issues/37}{here}. If you need
+#' lower-level control over the parsing of dates, set
+#' \code{parse_dates} to \code{FALSE} and then parse the dates yourself.
#' @seealso \link[httr:GET]{GET()}
#' @examples
#' \dontrun{
@@ -34,13 +42,17 @@
#' tidyfeed("https://daringfireball.net/feeds/json")
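+#' # skip date parsing and handle the dates yourself:
+#' tidyfeed("https://daringfireball.net/feeds/json", parse_dates = FALSE)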
#' }
#' @export
-tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE) {
+tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE,
+ parse_dates = TRUE) {
# checks
if (!identical(length(feed), 1L)) stop("Please supply only one feed at a time.")
if (!is.logical(list)) stop("`list` may be FALSE or TRUE only.")
if (!is.logical(clean_tags)) stop("`clean_tags` may be FALSE or TRUE only.")
if (!is.list(config)) stop("`config` should be a list only.")
+ if (!is.logical(parse_dates)) stop("`parse_dates` may be FALSE or TRUE only.")
+ # nocov start
+  # (these functions are tested at a lower level)
# send user agent
ua <- set_user(config)
# try to get response
@@ -49,13 +61,14 @@ tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE) {
typ <- type_check(response)
# send to parsers
if (typ == "rss") {
- parsed <- rss_parse(response, list, clean_tags)
+ parsed <- rss_parse(response, list, clean_tags, parse_dates)
} else if (typ == "atom") {
- parsed <- atom_parse(response, list, clean_tags)
+ parsed <- atom_parse(response, list, clean_tags, parse_dates)
} else if (typ == "json") {
- parsed <- json_parse(response, list, clean_tags)
+ parsed <- json_parse(response, list, clean_tags, parse_dates)
} else {
stop(error_msg)
}
return(parsed)
+ # nocov end
}
diff --git a/R/utils.R b/R/utils.R
index 4b55cb8..dc3794d 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -18,7 +18,9 @@ set_user <- function(config) {
# simply reads 'content-type' of response to check type.
# if contains both atom & rss, prefers rss
type_check <- function(response) {
+  if (!inherits(response, "response")) stop("`type_check` cannot evaluate this response.")
content_type <- response$headers$`content-type`
+  # read the xmlns attribute only for XML content; read_xml() would fail on JSON
+  xmlns <- if (isTRUE(grepl("xml", content_type))) xml_attr(read_xml(response), "xmlns") else NA_character_
typ <- case_when(
grepl(x = content_type, pattern = "atom") ~ "atom",
grepl(x = content_type, pattern = "xml") ~ "rss",
@@ -26,13 +28,24 @@ type_check <- function(response) {
grepl(x = content_type, pattern = "json") ~ "json",
TRUE ~ "unknown"
)
+ # overwrite for cases like https://github.com/RobertMyles/tidyRSS/issues/38
+ if (grepl("Atom", xmlns)) typ <- "atom"
return(typ)
}
# geocheck - warning about geo feeds
geocheck <- function(x) {
- gcheck <- grepl("http://www.georss.org/georss", xml_attr(x, "xmlns:georss"))
- if (isTRUE(geocheck)) {
+  # count geoRSS elements anywhere in the document; any hit means a geo feed
+  geo_tags <- c("point", "line", "polygon", "box", "featuretypetag",
+                "relationshiptag", "featurename")
+  geo_elements <- vapply(geo_tags, function(tag) {
+    length(xml_find_all(x, sprintf("//*[name()='georss:%s']", tag)))
+  }, integer(1))
+
+  if (any(geo_elements > 0)) {
message("Parsing feeds with geographic information (geoRSS, geoJSON etc.) is
deprecated in tidyRSS as of version 2.0.0. The geo-fields in this feed will be ignored.
If you would like to fetch this information, try the tidygeoRSS package:
diff --git a/README.Rmd b/README.Rmd
index c5c9c39..cda49cf 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -20,14 +20,15 @@ knitr::opts_chunk$set(
tidyRSS is a package for extracting data from [RSS feeds](https://en.wikipedia.org/wiki/RSS), including Atom feeds and JSON feeds. For geo-type feeds, see the section on changes in version 2 below, or jump directly to [tidygeoRSS](https://github.com/RobertMyles/tidygeoRSS), which is designed for that purpose.
-It is easy to use as it only has one function, `tidyfeed()`, which takes four arguments:
+It is easy to use as it only has one function, `tidyfeed()`, which takes five arguments:
- the url of the feed;
- a logical flag for whether you want the feed returned as a tibble or a list containing two tibbles;
- a logical flag for whether you want HTML tags removed from columns in the dataframe;
-- and a config list that is passed off to [`httr::GET()`](https://httr.r-lib.org/reference/config.html).
-
+- a config list that is passed off to [`httr::GET()`](https://httr.r-lib.org/reference/config.html);
+- and a logical flag, `parse_dates`, which tells `tidyfeed()` whether to attempt to parse date columns (see below).
+
+If `parse_dates` is `TRUE`, `tidyfeed()` will attempt to parse dates using the [anytime](https://github.com/eddelbuettel/anytime) package. Note that this removes some lower-level control that you may wish to retain over how dates are parsed. See [this issue](https://github.com/RobertMyles/tidyRSS/issues/37) for an example.
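+
+For example, a minimal sketch that skips the automatic parsing and handles dates manually (column names depend on the feed type; JSON feeds use `item_date_published` and `item_date_modified`):
+
+```r
+library(tidyRSS)
+library(anytime)
+
+# keep the date columns as character strings
+df <- tidyfeed("https://daringfireball.net/feeds/json", parse_dates = FALSE)
+# then parse them yourself, with anytime() or any other tool you prefer
+df$item_date_published <- anytime(df$item_date_published)
+```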
## Installation
diff --git a/README.md b/README.md
index f1677cc..0bbbde1 100644
--- a/README.md
+++ b/README.md
@@ -16,15 +16,23 @@ below, or jump directly to
designed for that purpose.
It is easy to use as it only has one function, `tidyfeed()`, which takes
-four arguments:
+five arguments:
- the url of the feed;
- a logical flag for whether you want the feed returned as a tibble or
a list containing two tibbles;
- a logical flag for whether you want HTML tags removed from columns
in the dataframe;
- - and a config list that is passed off to
- [`httr::GET()`](https://httr.r-lib.org/reference/config.html).
+ - a config list that is passed off to
+ [`httr::GET()`](https://httr.r-lib.org/reference/config.html);
+  - and a logical flag, `parse_dates`, which tells `tidyfeed()`
+    whether to attempt to parse date columns (see below).
+
+If `parse_dates` is `TRUE`, `tidyfeed()` will attempt to parse dates
+using the [anytime](https://github.com/eddelbuettel/anytime) package.
+Note that this removes some lower-level control that you may wish to
+retain over how dates are parsed. See [this
+issue](https://github.com/RobertMyles/tidyRSS/issues/37) for an example.
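+
+For example, a minimal sketch that skips the automatic parsing and
+handles dates manually (column names depend on the feed type; JSON
+feeds use `item_date_published` and `item_date_modified`):
+
+``` r
+library(tidyRSS)
+library(anytime)
+
+# keep the date columns as character strings
+df <- tidyfeed("https://daringfireball.net/feeds/json", parse_dates = FALSE)
+# then parse them yourself, with anytime() or any other tool you prefer
+df$item_date_published <- anytime(df$item_date_published)
+```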
## Installation
diff --git a/cran-comments.md b/cran-comments.md
index 638e7de..5b2af96 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,3 +1,7 @@
+# v2.0.1
+
+This release fixes a bug caused by a missing function import; adds a new input argument that allows users to leave dates unparsed, fixing another bug with NA values in date columns; and improves code coverage and testing.
+
# v2.0.0
This version is a rewrite of the package, removing the functionality of parsing feeds into tibbles with geographic simple features columns into a sister package. I've adopted a more stringent testing strategy along with much more streamlined code.