Skip to content

Commit

Permalink
Merge pull request #12 from ropensci-review-tools/cran
Browse files Browse the repository at this point in the history
add cran data and corpus
  • Loading branch information
mpadge authored Sep 27, 2024
2 parents 6869774 + 5e6dd8e commit 7f69742
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 30 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: pkgsimil
Title: Similarity Metrics between R Packages
Version: 0.1.2.116
Version: 0.1.2.119
Authors@R: c(
person("Mark", "Padgham", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2172-5265")))
Expand All @@ -12,6 +12,7 @@ BugReports: https://github.com/ropensci-review-tools/pkgsimil/issues
Imports:
brio,
cli,
curl,
dplyr,
fs,
httr2,
Expand Down
50 changes: 37 additions & 13 deletions R/cache.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#' function, either for all rOpenSci packages or, if `fns = TRUE`, all
#' individual functions within those packages.
#'
#' @inheritParams pkgsimil_similar_pkgs
#' @param fns If `FALSE` (default), load data for all rOpenSci packages;
#' otherwise load the considerably larger dataset for all individual
#' functions. Only used for the "ropensci" corpus; ignored for "cran".
Expand All @@ -17,39 +18,62 @@
#' idfs <- pkgsimil_load_data ("idfs")
#' idfs_fns <- pkgsimil_load_data ("idfs", fns = TRUE)
#' }
pkgsimil_load_data <- function (what = "embeddings", fns = FALSE) {
pkgsimil_load_data <- function (what = "embeddings", corpus = "ropensci", fns = FALSE) {

corpus <- match.arg (corpus, c ("ropensci", "cran"))
what <- match.arg (what, c ("embeddings", "idfs"))

if (what == "embeddings") {
fname <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds")
} else {
fname <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds")
if (corpus == "ropensci") {

if (what == "embeddings") {
fname <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds")
} else {
fname <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds")
}

} else if (corpus == "cran") {

if (what == "embeddings") {
fname <- "embeddings-cran.Rds"
} else {
fname <- "bm25-cran.Rds"
}
}

fname <- fs::path (pkgsimil_cache_path (), fname)
if (!fs::file_exists (fname)) {
fname <- pkgsimil_dl_data (what = what, fns = fns)
fname <- pkgsimil_dl_data (what = what, corpus = corpus, fns = fns)
}
readRDS (fname)
}

pkgsimil_dl_data <- function (what = "embeddings", fns = FALSE) {
pkgsimil_dl_data <- function (what = "embeddings", corpus = "ropensci", fns = FALSE) {

what <- match.arg (what, c ("embeddings", "idfs"))
corpus <- match.arg (corpus, c ("ropensci", "cran"))

url_base <-
"https://github.com/ropensci-review-tools/pkgsimil/releases/download/"
version <- "v0.1.2"

if (what == "embeddings") {
file <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds")
} else {
file <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds")
if (corpus == "ropensci") {
if (what == "embeddings") {
file <- ifelse (fns, "embeddings-fns.Rds", "embeddings.Rds")
} else {
file <- ifelse (fns, "bm25-ropensci-fns.Rds", "bm25-ropensci.Rds")
}
} else if (corpus == "cran") {
if (what == "embeddings") {
file <- "embeddings-cran.Rds"
} else {
file <- "bm25-cran.Rds"
}
}
url <- paste0 (url_base, version, "/", file)

dl_url <- paste0 (url_base, version, "/", file)

destfile <- fs::path (pkgsimil_cache_path (), file)
utils::download.file (url, destfile = destfile)
curl::curl_download (url = dl_url, destfile = destfile, quiet = opt_is_quiet ())
return (destfile)
}

Expand Down
16 changes: 14 additions & 2 deletions R/similar-pkgs.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
#'
#' @param input Either a path to local source code of an R package, or a text
#' string.
#' @param corpus If the `embeddings` or `idfs` parameters are not specified,
#' they are automatically downloaded for the corpus specified by this
#' parameter. Must be one of "ropensci" or "cran". The function will then
#' return the most similar packages from the specified corpus.
#' @param embeddings Large Language Model embeddings for all rOpenSci packages,
#' generated from \link{pkgsimil_embeddings_from_pkgs}. If not provided,
#' pre-generated embeddings will be downloaded and stored in a local cache
Expand All @@ -21,6 +25,11 @@
#' If `input` is a single text string, a single character vector is returned
#' naming the `n` most similar packages.
#'
#' @note The first time this function is run without passing both
#' `embeddings` and `idfs`, any values not provided will be automatically
#' downloaded and stored in a locally persistent cache directory. Especially
#' for the "cran" corpus, this download may take quite some time.
#'
#' @seealso input_is_code
#' @export
#'
Expand All @@ -30,16 +39,19 @@
#' pkgsimil_similar_pkgs (input)
#' }
pkgsimil_similar_pkgs <- function (input,
corpus = "ropensci",
embeddings = NULL,
idfs = NULL,
input_is_code = text_is_code (input),
n = 5L) {

corpus <- match.arg (corpus, c ("ropensci", "cran"))

if (is.null (embeddings)) {
embeddings <- pkgsimil_load_data ("embeddings")
embeddings <- pkgsimil_load_data (what = "embeddings", corpus = corpus)
}
if (is.null (idfs)) {
idfs <- pkgsimil_load_data ("idfs")
idfs <- pkgsimil_load_data (what = "idfs", corpus = corpus)
}

nms_expected <- c ("text_with_fns", "text_wo_fns", "code")
Expand Down
47 changes: 34 additions & 13 deletions codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"codeRepository": "https://github.com/ropensci-review-tools/pkgsimil",
"issueTracker": "https://github.com/ropensci-review-tools/pkgsimil/issues",
"license": "https://spdx.org/licenses/MIT",
"version": "0.1.2.116",
"version": "0.1.2.119",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "R",
Expand Down Expand Up @@ -122,6 +122,18 @@
"sameAs": "https://CRAN.R-project.org/package=cli"
},
"3": {
"@type": "SoftwareApplication",
"identifier": "curl",
"name": "curl",
"provider": {
"@id": "https://cran.r-project.org",
"@type": "Organization",
"name": "Comprehensive R Archive Network (CRAN)",
"url": "https://cran.r-project.org"
},
"sameAs": "https://CRAN.R-project.org/package=curl"
},
"4": {
"@type": "SoftwareApplication",
"identifier": "dplyr",
"name": "dplyr",
Expand All @@ -133,7 +145,7 @@
},
"sameAs": "https://CRAN.R-project.org/package=dplyr"
},
"4": {
"5": {
"@type": "SoftwareApplication",
"identifier": "fs",
"name": "fs",
Expand All @@ -145,7 +157,7 @@
},
"sameAs": "https://CRAN.R-project.org/package=fs"
},
"5": {
"6": {
"@type": "SoftwareApplication",
"identifier": "httr2",
"name": "httr2",
Expand All @@ -157,7 +169,7 @@
},
"sameAs": "https://CRAN.R-project.org/package=httr2"
},
"6": {
"7": {
"@type": "SoftwareApplication",
"identifier": "memoise",
"name": "memoise",
Expand All @@ -169,7 +181,7 @@
},
"sameAs": "https://CRAN.R-project.org/package=memoise"
},
"7": {
"8": {
"@type": "SoftwareApplication",
"identifier": "pbapply",
"name": "pbapply",
Expand All @@ -181,7 +193,19 @@
},
"sameAs": "https://CRAN.R-project.org/package=pbapply"
},
"8": {
"9": {
"@type": "SoftwareApplication",
"identifier": "Rcpp",
"name": "Rcpp",
"provider": {
"@id": "https://cran.r-project.org",
"@type": "Organization",
"name": "Comprehensive R Archive Network (CRAN)",
"url": "https://cran.r-project.org"
},
"sameAs": "https://CRAN.R-project.org/package=Rcpp"
},
"10": {
"@type": "SoftwareApplication",
"identifier": "tokenizers",
"name": "tokenizers",
Expand All @@ -193,19 +217,16 @@
},
"sameAs": "https://CRAN.R-project.org/package=tokenizers"
},
"9": {
"11": {
"@type": "SoftwareApplication",
"identifier": "R",
"name": "R",
"version": ">= 3.5.0"
},
"SystemRequirements": {}
"SystemRequirements": null
},
"fileSize": "32745.224KB",
"fileSize": "294217.436KB",
"readme": "https://github.com/ropensci-review-tools/pkgsimil/blob/main/README.md",
"contIntegration": [
"https://github.com/ropensci-review-tools/pkgsimil/actions?query=workflow%3AR-CMD-check",
"https://app.codecov.io/gh/ropensci-review-tools/pkgsimil"
],
"contIntegration": ["https://github.com/ropensci-review-tools/pkgsimil/actions?query=workflow%3AR-CMD-check", "https://app.codecov.io/gh/ropensci-review-tools/pkgsimil"],
"developmentStatus": "https://www.repostatus.org/#wip"
}
7 changes: 6 additions & 1 deletion man/pkgsimil_load_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/pkgsimil_similar_pkgs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7f69742

Please sign in to comment.