diff --git a/DESCRIPTION b/DESCRIPTION index 7b0f985..49f7ab2 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pkgsimil Title: Similarity Metrics between R Packages -Version: 0.1.2.116 +Version: 0.1.2.119 Authors@R: c( person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2172-5265"))) @@ -12,6 +12,7 @@ BugReports: https://github.com/ropensci-review-tools/pkgsimil/issues Imports: brio, cli, + curl, dplyr, fs, httr2, diff --git a/R/cache.R b/R/cache.R index c673557..176abdd 100644 --- a/R/cache.R +++ b/R/cache.R @@ -2,6 +2,7 @@ #' function, either for all rOpenSci packages or, if `fns = TRUE`, all #' individual functions within those packages. #' +#' @inheritParams pkgsimil_similar_pkgs #' @param fns If `FALSE` (default), load embeddings for all rOpenSci packages; #' otherwise load (considerably larger dataset of) embeddings for all #' individual functions. @@ -17,39 +18,62 @@ #' idfs <- pkgsimil_load_data ("idfs") #' idfs_fns <- pkgsimil_load_data ("idfs", fns = TRUE) #' } -pkgsimil_load_data <- function (what = "embeddings", fns = FALSE) { +pkgsimil_load_data <- function (what = "embeddings", corpus = "ropensci", fns = FALSE) { + corpus <- match.arg (corpus, c ("ropensci", "cran")) what <- match.arg (what, c ("embeddings", "idfs")) - if (what == "embeddings") { - fname <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds") - } else { - fname <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds") + if (corpus == "ropensci") { + + if (what == "embeddings") { + fname <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds") + } else { + fname <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds") + } + + } else if (corpus == "cran") { + + if (what == "embeddings") { + fname <- "embeddings-cran.Rds" + } else { + fname <- "bm25-cran.Rds" + } } + fname <- fs::path (pkgsimil_cache_path (), fname) if (!fs::file_exists (fname)) { - fname <- pkgsimil_dl_data (what = what, fns = fns) + fname <- pkgsimil_dl_data (what = what, corpus = corpus, fns = fns) } readRDS (fname) } -pkgsimil_dl_data <- function (what = "embeddings", fns = FALSE) { +pkgsimil_dl_data <- function (what = "embeddings", corpus = "ropensci", fns = FALSE) { what <- match.arg (what, c ("embeddings", "idfs")) + corpus <- match.arg (corpus, c ("ropensci", "cran")) url_base <- "https://github.com/ropensci-review-tools/pkgsimil/releases/download/" version <- "v0.1.2" - if (what == "embeddings") { - file <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds") - } else { - file <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds") + if (corpus == "ropensci") { + if (what == "embeddings") { + file <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds") + } else { + file <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds") + } + } else if (corpus == "cran") { + if (what == "embeddings") { + file <- "embeddings-cran.Rds" + } else { + file <- "bm25-cran.Rds" + } } - url <- paste0 (url_base, version, "/", file) + + dl_url <- paste0 (url_base, version, "/", file) destfile <- fs::path (pkgsimil_cache_path (), file) - utils::download.file (url, destfile = destfile) + curl::curl_download (url = dl_url, destfile = destfile, quiet = opt_is_quiet ()) return (destfile) } diff --git a/R/similar-pkgs.R b/R/similar-pkgs.R index cfcf164..fcde0e7 100644 --- a/R/similar-pkgs.R +++ b/R/similar-pkgs.R @@ -3,6 +3,10 @@ #' #' @param input Either a path to local source code of an R package, or a text #' string. +#' @param corpus If `embeddings` or `idfs` parameters are not specified, they +#' are automatically downloaded for the corpus specified by this parameter. +#' Must be one of "ropensci" or "cran". The function will then return the most +#' similar package from the specified corpus. #' @param embeddings Large Language Model embeddings for all rOpenSci packages, #' generated from \link{pkgsimil_embeddings_from_pkgs}. If not provided, #' pre-generated embeddings will be downloaded and stored in a local cache @@ -21,6 +25,11 @@ #' If `input` is a single text string, a single character vector is returned #' naming the `n` most similar packages. #' +#' @note The first time this function is run without passing either +#' `embeddings` or `idfs`, required values will be automatically downloaded and +#' stored in a locally persistent cache directory. Especially for the "cran" +#' corpus, this downloading may take quite some time. +#' #' @seealso input_is_code #' @export #' @@ -30,16 +39,19 @@ #' pkgsimil_similar_pkgs (input) #' } pkgsimil_similar_pkgs <- function (input, + corpus = "ropensci", embeddings = NULL, idfs = NULL, input_is_code = text_is_code (input), n = 5L) { + corpus <- match.arg (corpus, c ("ropensci", "cran")) + if (is.null (embeddings)) { - embeddings <- pkgsimil_load_data ("embeddings") + embeddings <- pkgsimil_load_data (what = "embeddings", corpus = corpus) } if (is.null (idfs)) { - idfs <- pkgsimil_load_data ("idfs") + idfs <- pkgsimil_load_data (what = "idfs", corpus = corpus) } nms_expected <- c ("text_with_fns", "text_wo_fns", "code") diff --git a/codemeta.json b/codemeta.json index c139b00..033696f 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,7 @@ "codeRepository": "https://github.com/ropensci-review-tools/pkgsimil", "issueTracker": "https://github.com/ropensci-review-tools/pkgsimil/issues", "license": "https://spdx.org/licenses/MIT", - "version": "0.1.2.116", + "version": "0.1.2.119", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", @@ -122,6 +122,18 @@ "sameAs": "https://CRAN.R-project.org/package=cli" }, "3": { + "@type": "SoftwareApplication", + "identifier": "curl", + "name": "curl", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=curl" + }, + "4": { "@type": "SoftwareApplication", "identifier": "dplyr", "name": "dplyr", @@ -133,7 +145,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=dplyr" }, - "4": { + "5": { "@type": "SoftwareApplication", "identifier": "fs", "name": "fs", @@ -145,7 +157,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fs" }, - "5": { + "6": { "@type": "SoftwareApplication", "identifier": "httr2", "name": "httr2", @@ -157,7 +169,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=httr2" }, - "6": { + "7": { "@type": "SoftwareApplication", "identifier": "memoise", "name": "memoise", @@ -169,7 +181,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=memoise" }, - "7": { + "8": { "@type": "SoftwareApplication", "identifier": "pbapply", "name": "pbapply", @@ -181,7 +193,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=pbapply" }, - "8": { + "9": { + "@type": "SoftwareApplication", + "identifier": "Rcpp", + "name": "Rcpp", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=Rcpp" + }, + "10": { "@type": "SoftwareApplication", "identifier": "tokenizers", "name": "tokenizers", @@ -193,19 +217,16 @@ }, "sameAs": "https://CRAN.R-project.org/package=tokenizers" }, - "9": { + "11": { "@type": "SoftwareApplication", "identifier": "R", "name": "R", "version": ">= 3.5.0" }, - "SystemRequirements": {} + "SystemRequirements": null }, - "fileSize": "32745.224KB", + "fileSize": "294217.436KB", "readme": "https://github.com/ropensci-review-tools/pkgsimil/blob/main/README.md", - "contIntegration": [ - "https://github.com/ropensci-review-tools/pkgsimil/actions?query=workflow%3AR-CMD-check", - "https://app.codecov.io/gh/ropensci-review-tools/pkgsimil" - ], + "contIntegration": ["https://github.com/ropensci-review-tools/pkgsimil/actions?query=workflow%3AR-CMD-check", "https://app.codecov.io/gh/ropensci-review-tools/pkgsimil"], "developmentStatus": "https://www.repostatus.org/#wip" } diff --git a/man/pkgsimil_load_data.Rd b/man/pkgsimil_load_data.Rd index 8d5429a..0756a0b 100644 --- a/man/pkgsimil_load_data.Rd +++ b/man/pkgsimil_load_data.Rd @@ -6,12 +6,17 @@ function, either for all rOpenSci packages or, if \code{fns = TRUE}, all individual functions within those packages.} \usage{ -pkgsimil_load_data(what = "embeddings", fns = FALSE) +pkgsimil_load_data(what = "embeddings", corpus = "ropensci", fns = FALSE) } \arguments{ \item{what}{Either "embeddings" to load pre-generated embeddings, or "idfs" to load pre-generated Inverse Document Frequency weightings.} +\item{corpus}{If \code{embeddings} or \code{idfs} parameters are not specified, they +are automatically downloaded for the corpus specified by this parameter. +Must be one of "ropensci" or "cran". The function will then return the most +similar package from the specified corpus.} + \item{fns}{If \code{FALSE} (default), load embeddings for all rOpenSci packages; otherwise load (considerably larger dataset of) embeddings for all individual functions.} diff --git a/man/pkgsimil_similar_pkgs.Rd b/man/pkgsimil_similar_pkgs.Rd index efdc99d..91bad7c 100644 --- a/man/pkgsimil_similar_pkgs.Rd +++ b/man/pkgsimil_similar_pkgs.Rd @@ -7,6 +7,7 @@ most similar packages to a given local repository.} \usage{ pkgsimil_similar_pkgs( input, + corpus = "ropensci", embeddings = NULL, idfs = NULL, input_is_code = text_is_code(input), @@ -17,6 +18,11 @@ pkgsimil_similar_pkgs( \item{input}{Either a path to local source code of an R package, or a text string.} +\item{corpus}{If \code{embeddings} or \code{idfs} parameters are not specified, they +are automatically downloaded for the corpus specified by this parameter. +Must be one of "ropensci" or "cran". The function will then return the most +similar package from the specified corpus.} + \item{embeddings}{Large Language Model embeddings for all rOpenSci packages, generated from \link{pkgsimil_embeddings_from_pkgs}. If not provided, pre-generated embeddings will be downloaded and stored in a local cache @@ -44,6 +50,12 @@ naming the \code{n} most similar packages. Use the embeddings from \link{pkgsimil_embeddings_from_pkgs} to identify most similar packages to a given local repository. } +\note{ +The first time this function is run without passing either +\code{embeddings} or \code{idfs}, required values will be automatically downloaded and +stored in a locally persistent cache directory. Especially for the "cran" +corpus, this downloading may take quite some time. +} \examples{ \dontrun{ input <- "Download open spatial data from NASA"