diff --git a/DESCRIPTION b/DESCRIPTION index 87b41ab..4748e1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tidyRSS Type: Package Title: Tidy RSS for R -Version: 2.0.0 +Version: 2.0.1 Author: Robert Myles McDonnell Maintainer: Robert Myles McDonnell Description: diff --git a/NAMESPACE b/NAMESPACE index a451894..5362d37 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,6 +28,7 @@ importFrom(tibble,tibble) importFrom(xml2,as_list) importFrom(xml2,read_xml) importFrom(xml2,xml_attr) +importFrom(xml2,xml_contents) importFrom(xml2,xml_find_all) importFrom(xml2,xml_find_first) importFrom(xml2,xml_text) diff --git a/NEWS.md b/NEWS.md index 1da5b2a..ecd98f9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +tidyRSS v2.0.1 (Release date: 07/03/2020) +============ +Changes: +- More complete testing +- Better checking of 'geo' RSS feeds +- new `parse_dates` argument that allows users to skip parsing of dates. +- bugfix for missing import of dplyr's case_when + tidyRSS v2.0.0 (Release date: 24/02/2020) ============ Changes: diff --git a/R/atom_parse.R b/R/atom_parse.R index 3679912..b7b3e10 100644 --- a/R/atom_parse.R +++ b/R/atom_parse.R @@ -1,4 +1,4 @@ -atom_parse <- function(response, list, clean_tags) { +atom_parse <- function(response, list, clean_tags, parse_dates) { # https://tools.ietf.org/html/rfc4287 # https://validator.w3.org/feed/docs/atom.html res <- read_xml(response) @@ -64,8 +64,8 @@ atom_parse <- function(response, list, clean_tags) { } # clean up - meta <- clean_up(meta, "atom", clean_tags) - entries <- clean_up(entries, "atom", clean_tags) + meta <- clean_up(meta, "atom", clean_tags, parse_dates) + entries <- clean_up(entries, "atom", clean_tags, parse_dates) if (isTRUE(list)) { out <- list(meta = meta, entries = entries) diff --git a/R/clean_up.R b/R/clean_up.R index 02c7026..8978fe4 100644 --- a/R/clean_up.R +++ b/R/clean_up.R @@ -5,7 +5,7 @@ # - dates are parsed into datetime columns # - HTML tags are removed # - list-columns of length 
1 are unlisted -clean_up <- function(df, type, clean_tags) { +clean_up <- function(df, type, clean_tags, parse_dates) { # unlist list-cols of length 1 df <- df %>% mutate_if(is.list, delist) # remove empty and NA cols @@ -16,8 +16,10 @@ clean_up <- function(df, type, clean_tags) { }) # parse dates & clean HTML if (type == "json") { - df <- date_parser(df, item_date_published) - df <- date_parser(df, item_date_modified) + if (isTRUE(parse_dates)) { + df <- date_parser(df, item_date_published) + df <- date_parser(df, item_date_modified) + } if (isTRUE(clean_tags)) { if (has_name(df, "item_content_html")) { df <- df %>% @@ -25,17 +27,21 @@ clean_up <- function(df, type, clean_tags) { } } } else if (type == "rss") { - df <- date_parser(df, feed_pub_date) - df <- date_parser(df, feed_last_build_date) - df <- date_parser(df, item_pub_date) + if (isTRUE(parse_dates)) { + df <- date_parser(df, feed_pub_date) + df <- date_parser(df, feed_last_build_date) + df <- date_parser(df, item_pub_date) + } if (isTRUE(clean_tags)) { if (has_name(df, "item_description")) { df$item_description <- cleanFun(df$item_description) } } } else if (type == "atom") { - df <- date_parser(df, feed_last_updated) - df <- date_parser(df, entry_published) + if (isTRUE(parse_dates)) { + df <- date_parser(df, feed_last_updated) + df <- date_parser(df, entry_published) + } if (isTRUE(clean_tags)) { if (has_name(df, "entry_summary")) { df$entry_summary <- cleanFun(df$entry_summary) diff --git a/R/json_parse.R b/R/json_parse.R index 583d5b2..b0f4d4f 100644 --- a/R/json_parse.R +++ b/R/json_parse.R @@ -1,4 +1,4 @@ -json_parse <- function(response, list, clean_tags) { +json_parse <- function(response, list, clean_tags, parse_dates) { # spec here: https://jsonfeed.org/version/1 res <- parse_json(response) items <- res$items @@ -50,8 +50,8 @@ json_parse <- function(response, list, clean_tags) { entries$item_author <- NA # clean up - meta <- clean_up(meta, "json", clean_tags) - entries <- clean_up(entries, 
"json", clean_tags) + meta <- clean_up(meta, "json", clean_tags, parse_dates) + entries <- clean_up(entries, "json", clean_tags, parse_dates) if (isTRUE(list)) { out <- list(meta = meta, entries = entries) diff --git a/R/rss_parse.R b/R/rss_parse.R index dc46e5e..0766885 100644 --- a/R/rss_parse.R +++ b/R/rss_parse.R @@ -1,4 +1,4 @@ -rss_parse <- function(response, list, clean_tags) { +rss_parse <- function(response, list, clean_tags, parse_dates) { # spec here: https://validator.w3.org/feed/docs/rss2.html res <- response %>% read_xml() geocheck(res) @@ -49,8 +49,8 @@ rss_parse <- function(response, list, clean_tags) { ) # clean up - meta <- clean_up(meta, "rss", clean_tags) - entries <- clean_up(entries, "rss", clean_tags) + meta <- clean_up(meta, "rss", clean_tags, parse_dates) + entries <- clean_up(entries, "rss", clean_tags, parse_dates) if (isTRUE(list)) { out <- list(meta = meta, entries = entries) diff --git a/R/safe_get.R b/R/safe_get.R index 40832af..8b2bd18 100644 --- a/R/safe_get.R +++ b/R/safe_get.R @@ -13,6 +13,6 @@ safe_get <- function(feed, user = NULL, config = list()) { } else { message("GET request successful. Parsing...\n") } - result <- req$result - return(result) + result <- req$result #nocov + return(result) # nocov } diff --git a/R/safe_run.R b/R/safe_run.R index c61bb75..6eb755e 100644 --- a/R/safe_run.R +++ b/R/safe_run.R @@ -15,8 +15,8 @@ safe_run <- function(response, type = c("first", "all"), ...) 
{ ret <- result$result %>% xml_text() if (length(ret) == 0) ret <- def } else { - ret <- read_xml("") %>% - xml_text() + ret <- read_xml("") %>% #nocov + xml_text() #nocov } return(ret) } diff --git a/R/tidyfeed.R b/R/tidyfeed.R index 0ca88b3..530d96c 100644 --- a/R/tidyfeed.R +++ b/R/tidyfeed.R @@ -4,7 +4,7 @@ #' @importFrom httr GET user_agent #' @importFrom anytime anytime #' @importFrom xml2 read_xml as_list xml_text xml_find_all xml_find_first -#' @importFrom xml2 xml_attr +#' @importFrom xml2 xml_attr xml_contents #' @importFrom dplyr select full_join mutate_if mutate select_if bind_cols #' @importFrom dplyr case_when #' @importFrom purrr map map_chr safely flatten compact keep map_df @@ -23,6 +23,14 @@ #' Cleans columns of HTML tags. #' @param list \code{logical}, default \code{FALSE}. #' Return metadata and content as separate dataframes in a named list. +#' @param parse_dates \code{logical}, default \code{TRUE}. +#' If \code{TRUE}, tidyRSS will attempt to parse columns that contain +#' datetime values, although this may fail, see note. +#' @note \code{tidyfeed()} attempts to parse columns that should contain +#' dates. This can fail, as can be seen +#' \href{https://github.com/RobertMyles/tidyRSS/issues/37}{here}. If you need +#' lower-level control over the parsing of dates, it's better to set +#' \code{parse_dates} to \code{FALSE} and then parse these yourself. 
#' @seealso \link[httr:GET]{GET()} #' @examples #' \dontrun{ @@ -34,13 +42,17 @@ #' tidyfeed("https://daringfireball.net/feeds/json") #' } #' @export -tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE) { +tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE, + parse_dates = TRUE) { # checks if (!identical(length(feed), 1L)) stop("Please supply only one feed at a time.") if (!is.logical(list)) stop("`list` may be FALSE or TRUE only.") if (!is.logical(clean_tags)) stop("`clean_tags` may be FALSE or TRUE only.") if (!is.list(config)) stop("`config` should be a list only.") + if (!is.logical(parse_dates)) stop("`parse_dates` may be FALSE or TRUE only.") + # nocov start + # (functions are tested at lower level) # send user agent ua <- set_user(config) # try to get response @@ -49,13 +61,14 @@ tidyfeed <- function(feed, config = list(), clean_tags = TRUE, list = FALSE) { typ <- type_check(response) # send to parsers if (typ == "rss") { - parsed <- rss_parse(response, list, clean_tags) + parsed <- rss_parse(response, list, clean_tags, parse_dates) } else if (typ == "atom") { - parsed <- atom_parse(response, list, clean_tags) + parsed <- atom_parse(response, list, clean_tags, parse_dates) } else if (typ == "json") { - parsed <- json_parse(response, list, clean_tags) + parsed <- json_parse(response, list, clean_tags, parse_dates) } else { stop(error_msg) } return(parsed) + # nocov end } diff --git a/R/utils.R b/R/utils.R index 4b55cb8..dc3794d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -18,7 +18,9 @@ set_user <- function(config) { # simply reads 'content-type' of response to check type. 
# if contains both atom & rss, prefers rss type_check <- function(response) { + if (class(response) != "response") stop("`type_check` cannot evaluate this response.") content_type <- response$headers$`content-type` + xmlns <- xml_attr(read_xml(response), "xmlns") typ <- case_when( grepl(x = content_type, pattern = "atom") ~ "atom", grepl(x = content_type, pattern = "xml") ~ "rss", @@ -26,13 +28,24 @@ type_check <- function(response) { grepl(x = content_type, pattern = "json") ~ "json", TRUE ~ "unknown" ) + # overwrite for cases like https://github.com/RobertMyles/tidyRSS/issues/38 + if (grepl("Atom", xmlns)) typ <- "atom" return(typ) } # geocheck - warning about geo feeds geocheck <- function(x) { - gcheck <- grepl("http://www.georss.org/georss", xml_attr(x, "xmlns:georss")) - if (isTRUE(geocheck)) { + + point <- xml_find_all(x, "//*[name()='georss:point']") %>% length() + line <- xml_find_all(x, "//*[name()='georss:line']") %>% length() + polygon <- xml_find_all(x, "//*[name()='georss:polygon']") %>% length() + box <- xml_find_all(x, "//*[name()='georss:box']") %>% length() + f_type <- xml_find_all(x, "//*[name()='georss:featuretypetag']") %>% length() + r_tag <- xml_find_all(x, "//*[name()='georss:relationshiptag']") %>% length() + f_name <- xml_find_all(x, "//*[name()='georss:featurename']") %>% length() + geo_elements <- c(point, line, polygon, box, f_type, r_tag, f_name) + + if (any(geo_elements > 1)) { message("Parsing feeds with geographic information (geoRSS, geoJSON etc.) is deprecated in tidyRSS as of version 2.0.0. The geo-fields in this feed will be ignored. If you would like to fetch this information, try the tidygeoRSS package: diff --git a/README.Rmd b/README.Rmd index c5c9c39..cda49cf 100644 --- a/README.Rmd +++ b/README.Rmd @@ -20,14 +20,15 @@ knitr::opts_chunk$set( tidyRSS is a package for extracting data from [RSS feeds](https://en.wikipedia.org/wiki/RSS), including Atom feeds and JSON feeds. 
For geo-type feeds, see the section on changes in version 2 below, or jump directly to [tidygeoRSS](https://github.com/RobertMyles/tidygeoRSS), which is designed for that purpose. -It is easy to use as it only has one function, `tidyfeed()`, which takes four arguments: +It is easy to use as it only has one function, `tidyfeed()`, which takes five arguments: - the url of the feed; - a logical flag for whether you want the feed returned as a tibble or a list containing two tibbles; - a logical flag for whether you want HTML tags removed from columns in the dataframe; -- and a config list that is passed off to [`httr::GET()`](https://httr.r-lib.org/reference/config.html). - +- a config list that is passed off to [`httr::GET()`](https://httr.r-lib.org/reference/config.html); +- and a `parse_dates` argument, a logical flag, which will attempt to parse dates if `TRUE` (see below). +If `parse_dates` is `TRUE`, `tidyfeed()` will attempt to parse dates using the [anytime](https://github.com/eddelbuettel/anytime) package. Note that this removes some lower-level control that you may wish to retain over how dates are parsed. See [this issue](https://github.com/RobertMyles/tidyRSS/issues/37) for an example. ## Installation diff --git a/README.md b/README.md index f1677cc..0bbbde1 100644 --- a/README.md +++ b/README.md @@ -16,15 +16,23 @@ below, or jump directly to designed for that purpose. It is easy to use as it only has one function, `tidyfeed()`, which takes -four arguments: +five arguments: - the url of the feed; - a logical flag for whether you want the feed returned as a tibble or a list containing two tibbles; - a logical flag for whether you want HTML tags removed from columns in the dataframe; - - and a config list that is passed off to - [`httr::GET()`](https://httr.r-lib.org/reference/config.html). 
+ - a config list that is passed off to + [`httr::GET()`](https://httr.r-lib.org/reference/config.html); + - and a `parse_dates` argument, a logical flag, which will attempt to + parse dates if `TRUE` (see below). + +If `parse_dates` is `TRUE`, `tidyfeed()` will attempt to parse dates +using the [anytime](https://github.com/eddelbuettel/anytime) package. +Note that this removes some lower-level control that you may wish to +retain over how dates are parsed. See [this +issue](https://github.com/RobertMyles/tidyRSS/issues/37) for an example. ## Installation diff --git a/cran-comments.md b/cran-comments.md index 638e7de..5b2af96 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,3 +1,7 @@ +# v2.0.1 + +This fixes a bug caused by a missing function import; creates a new input argument that allows users to leave dates unparsed, fixing another bug with NA in date columns. It also improves code coverage and testing. + # v2.0.0 This version is a rewrite of the package, removing the functionality of parsing feeds into tibbles with geographic simple features columns into a sister package. I've adopted a more stringent testing strategy along with much more streamlined code. diff --git a/docs/404.html b/docs/404.html index 27b8c65..3ca7601 100644 --- a/docs/404.html +++ b/docs/404.html @@ -18,8 +18,8 @@ - - + + @@ -67,7 +67,7 @@ tidyRSS - 2.0.0 + 2.0.1 @@ -123,7 +123,7 @@

Page not found (404)

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.4.1.9000.

diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 608a333..8267eb8 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -18,8 +18,8 @@ - - + + @@ -67,7 +67,7 @@ tidyRSS - 2.0.0 + 2.0.1 @@ -125,7 +125,7 @@

License

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.4.1.9000.

diff --git a/docs/authors.html b/docs/authors.html index 0402400..d3eefa4 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -18,8 +18,8 @@ - - + + @@ -67,7 +67,7 @@ tidyRSS - 2.0.0 + 2.0.1 @@ -128,7 +128,7 @@

Authors

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.4.1.9000.

diff --git a/docs/index.html b/docs/index.html index 357e2bc..0a4500f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -7,8 +7,8 @@ Tidy RSS for R • tidyRSS - - + + tidyRSS - 2.0.0 + 2.0.1 @@ -74,13 +74,15 @@

CRAN_Status_Badge CRAN_Download_Badge CRAN_Download_Badge R-CMD-checkCodecov test coverage

tidyRSS is a package for extracting data from RSS feeds, including Atom feeds and JSON feeds. For geo-type feeds, see the section on changes in version 2 below, or jump directly to tidygeoRSS, which is designed for that purpose.

-

It is easy to use as it only has one function, tidyfeed(), which takes four arguments:

+

It is easy to use as it only has one function, tidyfeed(), which takes five arguments:

+

If parse_dates is TRUE, tidyfeed() will attempt to parse dates using the anytime package. Note that this removes some lower-level control that you may wish to retain over how dates are parsed. See this issue for an example.

Installation

@@ -129,11 +131,24 @@

  • JSON: https://jsonfeed.org/version/1
  • +

    I’ve implemented most of the items in the schemas above. The following are not yet implemented:

    +

    Atom meta info:

    +
      +
    • contributor, generator, logo, subtitle
    • +
    +

    Rss meta info:

    +
      +
    • cloud
    • +
    • image
    • +
    • textInput
    • +
    • skipHours
    • +
    • skipDays
    • +

    - - @@ -122,7 +122,7 @@

    Pipe operator

    - @@ -121,7 +121,13 @@

    Extract a tidy data frame from RSS, Atom and JSON feeds

    manipulation and analysis.

    -
    tidyfeed(feed, config = list(), clean_tags = TRUE, list = FALSE)
    +
    tidyfeed(
    +  feed,
    +  config = list(),
    +  clean_tags = TRUE,
    +  list = FALSE,
    +  parse_dates = TRUE
    +)

    Arguments

    @@ -143,10 +149,23 @@

    Arg

    +Return metadata and content as separate dataframes in a named list.

    + + + +
    list

    logical, default FALSE. -Return metadata and content as seperate dataframes in a named list.

    parse_dates

    logical, default TRUE. +If TRUE, tidyRSS will attempt to parse columns that contain +datetime values, although this may fail, see note.

    +

    Note

    + +

    tidyfeed() attempts to parse columns that should contain +dates. This can fail, as can be seen +here. If you need +lower-level control over the parsing of dates, it's better to set +parse_dates to FALSE and then parse these yourself.

    References

    https://en.wikipedia.org/wiki/RSS

    @@ -164,10 +183,11 @@

    Examp tidyfeed("https://daringfireball.net/feeds/json") } -