Skip to content

Commit

Permalink
Merge pull request #69 from ropensci-review-tools/src
Browse files Browse the repository at this point in the history
separate two sub-fns in src/bm25.cpp
  • Loading branch information
mpadge authored Nov 6, 2024
2 parents 4af3f1d + 558e240 commit 79c971f
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 14 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: pkgmatch
Title: Find R Packages Matching Either Descriptions or Other R Packages
Version: 0.4.1.043
Version: 0.4.1.044
Authors@R: c(
person("Mark", "Padgham", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2172-5265")),
Expand Down
2 changes: 1 addition & 1 deletion codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"codeRepository": "https://github.com/ropensci-review-tools/pkgmatch",
"issueTracker": "https://github.com/ropensci-review-tools/pkgmatch/issues",
"license": "https://spdx.org/licenses/MIT",
"version": "0.4.1.043",
"version": "0.4.1.044",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "R",
Expand Down
36 changes: 24 additions & 12 deletions src/bm25.cpp
Original file line number Diff line number Diff line change
@@ -1,34 +1,46 @@
#include "bm25.h"

// [[Rcpp::export]]
Rcpp::NumericVector rcpp_bm25 (const Rcpp::DataFrame &idfs, const Rcpp::List &tokensList, Rcpp::DataFrame &these_tokens, const double ntoks_avg) {

// Fixed parameters used in the BM25 function. See wikipedia reference for
// these values.
const double k = 1.2;
const double b = 0.75;
// Populate 'idf_map' with one entry per row of 'idfs', keyed by token.
//
// @param idfs    Data frame read via its "token" (character) and "idf"
//                (numeric) columns.
// @param idf_map Output map, filled in place. Entries are added with
//                'emplace', so a token that is already present keeps its
//                existing value (the first occurrence wins).
void make_idf_map (
    const Rcpp::DataFrame &idfs,
    std::unordered_map <std::string, double> &idf_map) {

    // Set up primary 'idf_map' to map all tokens to their IDFs over whole corpus.
    // The map is the caller's out-parameter: it must NOT be re-declared locally,
    // otherwise the caller's map would never be filled.
    const Rcpp::CharacterVector idf_tokens = idfs ["token"];
    const Rcpp::NumericVector idf_idf = idfs ["idf"];
    for (int i = 0; i < idfs.nrow (); i++) {
        std::string this_tok = static_cast<std::string> (idf_tokens [i]);
        idf_map.emplace (this_tok, idf_idf [i]);
    }
}

const int ndocs = tokensList.size();
// Populate 'these_tokens_map' with one entry per row of 'these_tokens',
// mapping each token to its integer count.
//
// @param these_tokens     Data frame read via its "token" (character) and
//                         "np" (integer) columns.
// @param these_tokens_map Output map, filled in place. Entries are added with
//                         'emplace', so a token that is already present keeps
//                         its existing count (the first occurrence wins).
void make_these_tokens_map (
    const Rcpp::DataFrame &these_tokens,
    std::unordered_map <std::string, int> &these_tokens_map) {

    // Then make a map of the input tokens and counts. The map is the caller's
    // out-parameter: it must NOT be re-declared locally.
    const Rcpp::CharacterVector these_tokens_str = these_tokens ["token"];
    const Rcpp::IntegerVector these_tokens_n = these_tokens ["np"];

    for (int i = 0; i < these_tokens.nrow (); i++) {
        const std::string this_string = static_cast <std::string> (these_tokens_str [i]);
        these_tokens_map.emplace (this_string, these_tokens_n [i]);
    }
}

// [[Rcpp::export]]
Rcpp::NumericVector rcpp_bm25 (const Rcpp::DataFrame &idfs, const Rcpp::List &tokensList, Rcpp::DataFrame &these_tokens, const double ntoks_avg) {

// Fixed parameters used in the BM25 function. See wikipedia reference for
// these values.
const double k = 1.2;
const double b = 0.75;

// Set up primary 'idf_map' to map all tokens to their IDFs over whole corpus:
std::unordered_map <std::string, double> idf_map;
make_idf_map (idfs, idf_map);

std::unordered_map <std::string, int> these_tokens_map;
make_these_tokens_map (these_tokens, these_tokens_map);

const int ndocs = tokensList.size();
Rcpp::NumericVector bm25 (ndocs, 0.0);

for (int i = 0; i < ndocs; i++) {
Expand Down
7 changes: 7 additions & 0 deletions src/bm25.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,11 @@

#include <Rcpp.h>

// Fill 'idf_map' (output, modified in place) from the "token" and "idf"
// columns of 'idfs'.
void make_idf_map (
const Rcpp::DataFrame &idfs,
std::unordered_map <std::string, double> &idf_map);
// Fill 'these_tokens_map' (output, modified in place) from the "token" and
// "np" columns of 'these_tokens'. The mapped type is 'int' to match the
// definition in bm25.cpp, which stores integer token counts.
void make_these_tokens_map (
    const Rcpp::DataFrame &these_tokens,
    std::unordered_map <std::string, int> &these_tokens_map);

// BM25 entry point exported to R (see the [[Rcpp::export]] tag in bm25.cpp).
// 'idfs' and 'these_tokens' are converted to lookup maps via make_idf_map and
// make_these_tokens_map.
// NOTE(review): the result vector is sized to tokensList.size(), presumably
// one score per entry of 'tokensList' -- confirm against the full definition.
Rcpp::NumericVector rcpp_bm25 (const Rcpp::DataFrame &idfs, const Rcpp::List &tokensList, Rcpp::DataFrame &these_tokens, const double ntoks_avg);

0 comments on commit 79c971f

Please sign in to comment.