Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug chinese#20 #21

Merged
merged 7 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 0 additions & 37 deletions .circleci/config.yml

This file was deleted.

4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@
.RData
.Ruserdata

*.swp

~*
~*
.~lock*
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: striprtf
Type: Package
Title: Extract Text from RTF File
Version: 0.5.4
Version: 0.6.0
Authors@R: c(
person("Kota", "Mori", email = "[email protected]", role = c("aut", "cre"))
)
Expand All @@ -16,7 +16,8 @@ Imports:
utils
Suggests:
testthat
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
LinkingTo: Rcpp
URL: https://github.com/kota7/striprtf
BugReports: https://github.com/kota7/striprtf/issues
Encoding: UTF-8
8 changes: 7 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
striprtf v0.5.3 (Dev version as of 2021-09-07)
striprtf v0.6.0 (Dev version as of 2023-08-09)
==============

* Fix error on some code pages (e.g. CP936)


striprtf v0.5.3 (Release date: 2021-09-07)
==============

* `read_rtf`, `strip_rtf` now allow mismatched curly braces
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ hex_to_int <- function(h, sep = 'x') {
.Call('_striprtf_hex_to_int', PACKAGE = 'striprtf', h, sep)
}

strip_helper <- function(match_mat, dest_names, special_keys, special_hex, verbose) {
.Call('_striprtf_strip_helper', PACKAGE = 'striprtf', match_mat, dest_names, special_keys, special_hex, verbose)
strip_helper <- function(match_mat, dest_names, special_keys, special_hex, code_before, code_after, verbose) {
.Call('_striprtf_strip_helper', PACKAGE = 'striprtf', match_mat, dest_names, special_keys, special_hex, code_before, code_after, verbose)
}

32 changes: 19 additions & 13 deletions R/striprtf.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,12 @@ strip_rtf <- function(text, verbose = FALSE,
cp <- stringr::str_match(text, "\\\\ansicpg([0-9]+)")[,2]
if (is.na(cp)) {
cpname <- NA_character_
code_before <- integer(0)
code_after <- integer(0)
} else {
cpname <- paste("CP", cp, sep = "")
code_before <- .cptable[[cpname]]$before
code_after <- .cptable[[cpname]]$after
}

pattern <- stringr::regex(
Expand Down Expand Up @@ -117,6 +121,8 @@ strip_rtf <- function(text, verbose = FALSE,
dest_names = .destinations,
special_keys = keys,
special_hex = hexstr,
code_before = code_before,
code_after = code_after,
verbose = verbose)
#print(parsed)

Expand All @@ -127,19 +133,19 @@ strip_rtf <- function(text, verbose = FALSE,
#print(out)
out <- lapply(parsed$intcode, intToUtf8) %>% unlist()

# code page translation
if (!is.na(cpname)) {
if (cpname %in% names(.cptable)) {
out[parsed$toconv] <- chartr(.cptable[[cpname]]$before,
.cptable[[cpname]]$after,
out[parsed$toconv])
#out[parsed$toconv] <- lapply(out[parsed$toconv], function(a) {
# chartr(.cptable[[cpname]]$before, .cptable[[cpname]]$after, a)
#})
} else {
warning("conversion table for ", cpname, " is missing")
}
}
# code page translation ... will be done in the strip_helper function
# if (!is.na(cpname)) {
# if (cpname %in% names(.cptable)) {
# out[parsed$toconv] <- chartr(.cptable[[cpname]]$before,
# .cptable[[cpname]]$after,
# out[parsed$toconv])
# #out[parsed$toconv] <- lapply(out[parsed$toconv], function(a) {
# # chartr(.cptable[[cpname]]$before, .cptable[[cpname]]$after, a)
# #})
# } else {
# warning("conversion table for ", cpname, " is missing")
# }
# }

# if there is no table or ignore table option is specified,
# remove tmp_rep characters,
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@

[![CRAN
status](https://www.r-pkg.org/badges/version/striprtf)](https://cran.r-project.org/package=striprtf)
[![Build
Status](https://travis-ci.org/kota7/striprtf.svg?branch=master)](https://travis-ci.org/kota7/striprtf)
[![AppVeyor Build
Status](https://ci.appveyor.com/api/projects/status/github/kota7/striprtf?branch=master&svg=true)](https://ci.appveyor.com/project/kota7/striprtf)
[![](http://cranlogs.r-pkg.org/badges/striprtf)](https://cran.r-project.org/package=striprtf)
[![R-CMD-check](https://github.com/kota7/striprtf/workflows/R-CMD-check/badge.svg)](https://github.com/kota7/striprtf/actions)
[![CircleCI build
status](https://circleci.com/gh/kota7/striprtf.svg?style=svg)](https://circleci.com/gh/kota7/striprtf)

# striprtf: Extract Text from RTF (Rich Text Format) File

Expand Down
13 changes: 6 additions & 7 deletions internal-script/make-cp-table.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,20 +72,19 @@ for (file in file_list)
# we only need the cases where codes are different
x <- x[x$before != x$after,]

# create before and after strings for conversion
bef <- intToUtf8(x$before) %>% paste0(collapse = "")
aft <- intToUtf8(x$after) %>% paste0(collapse = "")
stopifnot(nchar(bef) == nchar(aft))
tmp <- list(before = bef, after = aft)
# create before and after integer values
#bef <- intToUtf8(x$before) %>% paste0(collapse = "")
#aft <- intToUtf8(x$after) %>% paste0(collapse = "")
#stopifnot(nchar(bef) == nchar(aft))

#table_list <- c(table_list, table_name)
out <- c(out, list(tmp) %>% setNames(table_name))
out <- c(out, list(x) %>% setNames(table_name))
}



.cptable <- out
devtools::use_data(.cptable, internal = TRUE, overwrite = TRUE)
usethis::use_data(.cptable, internal = TRUE, overwrite = TRUE)


#cat(paste0(table_list, collapse = ", "))
Expand Down
10 changes: 6 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,27 @@ BEGIN_RCPP
END_RCPP
}
// strip_helper
List strip_helper(CharacterMatrix match_mat, CharacterVector dest_names, CharacterVector special_keys, CharacterVector special_hex, bool verbose);
RcppExport SEXP _striprtf_strip_helper(SEXP match_matSEXP, SEXP dest_namesSEXP, SEXP special_keysSEXP, SEXP special_hexSEXP, SEXP verboseSEXP) {
List strip_helper(CharacterMatrix match_mat, CharacterVector dest_names, CharacterVector special_keys, CharacterVector special_hex, IntegerVector code_before, IntegerVector code_after, bool verbose);
RcppExport SEXP _striprtf_strip_helper(SEXP match_matSEXP, SEXP dest_namesSEXP, SEXP special_keysSEXP, SEXP special_hexSEXP, SEXP code_beforeSEXP, SEXP code_afterSEXP, SEXP verboseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterMatrix >::type match_mat(match_matSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type dest_names(dest_namesSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type special_keys(special_keysSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type special_hex(special_hexSEXP);
Rcpp::traits::input_parameter< IntegerVector >::type code_before(code_beforeSEXP);
Rcpp::traits::input_parameter< IntegerVector >::type code_after(code_afterSEXP);
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
rcpp_result_gen = Rcpp::wrap(strip_helper(match_mat, dest_names, special_keys, special_hex, verbose));
rcpp_result_gen = Rcpp::wrap(strip_helper(match_mat, dest_names, special_keys, special_hex, code_before, code_after, verbose));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_striprtf_to_hexstr", (DL_FUNC) &_striprtf_to_hexstr, 2},
{"_striprtf_hex_to_int", (DL_FUNC) &_striprtf_hex_to_int, 2},
{"_striprtf_strip_helper", (DL_FUNC) &_striprtf_strip_helper, 5},
{"_striprtf_strip_helper", (DL_FUNC) &_striprtf_strip_helper, 7},
{NULL, NULL, 0}
};

Expand Down
2 changes: 1 addition & 1 deletion src/dechex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ IntegerVector hex_to_int(std::string h, char sep)
IntegerVector out;
bool started = false;
int start = 0;
for (int i = 0; i < h.size(); i++)
for (unsigned int i = 0; i < h.size(); i++)
{
//Rcout << i+1 << "/" << h.size() << ":" << h[i] << "\n";
if (h[i] == sep) {
Expand Down
22 changes: 10 additions & 12 deletions src/dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,28 @@ using namespace Rcpp;
// not usable for other occasions


template <class T>
template <class S, class T>
struct SimpleOrderedDict
{
std::vector<std::string> keys;
std::vector<S> keys;
std::vector<T> values;

SimpleOrderedDict()
{
std::vector<std::string> keys(0);
std::vector<S> keys(0);
std::vector<T> values(0);
}
unsigned int size()
{
return keys.size();
}

void insert(const std::pair<std::string, T> &p)
void insert(S key, T value)
{
std::string key = p.first;
T v = p.second;
// if empty, just add them
if (size() == 0) {
keys.push_back(key);
values.push_back(v);
values.push_back(value);
return;
}
// make sure the key is ordered
Expand All @@ -44,10 +42,10 @@ template <class T>
if (haskey(key)) stop(key + " already exists");

keys.push_back(key);
values.push_back(v);
values.push_back(value);
}

int locate(const std::string &key)
int locate(const S &key)
{
// returns the location of key
// if not exists, returns negative integer
Expand All @@ -68,17 +66,17 @@ template <class T>
return -1;
}

bool haskey(const std::string &key)
bool haskey(const S &key)
{
// returns true if and only if the key exists
return (locate(key) >= 0);
}

T getvalue(const std::string & key)
T getvalue(const S & key)
{
int i = locate(key);
if (i >= 0) return values[i];
return NULL;
Rcpp::stop("key does not exist");
}
};

Expand Down
34 changes: 26 additions & 8 deletions src/strip_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,25 @@ struct Section

void set_parameters(
std::set<std::string> &destinations,
SimpleOrderedDict<std::string> &specialchars,
SimpleOrderedDict<std::string, std::string> &specialchars,
SimpleOrderedDict<int, int> &codemap,
CharacterVector dest_names,
CharacterVector special_keys,
CharacterVector special_hex)
CharacterVector special_hex,
IntegerVector int_before,
IntegerVector int_after)
{
// destinations
for (int i = 0; i < dest_names.size(); i++)
destinations.insert(as<std::string>(dest_names[i]));

// special chars
for (int i = 0; i < special_keys.size(); i++)
specialchars.insert(std::pair<std::string, std::string>(
as<std::string>(special_keys[i]), as<std::string>(special_hex[i])));
specialchars.insert(as<std::string>(special_keys[i]), as<std::string>(special_hex[i]));

// special chars
for (int i = 0; i < int_before.size(); i++)
codemap.insert(int_before[i], int_after[i]);

}

Expand Down Expand Up @@ -108,6 +114,8 @@ List strip_helper(CharacterMatrix match_mat,
CharacterVector dest_names,
CharacterVector special_keys,
CharacterVector special_hex,
IntegerVector code_before,
IntegerVector code_after,
bool verbose) {
// helps rtf2text function by handling loop part
//
Expand All @@ -128,6 +136,11 @@ List strip_helper(CharacterMatrix match_mat,
// characte vVector of same size, which match the
// special words to the hex string to replace
//
// code_before, code_after:
// Mapping of integer codes based on the code page specified
// Codes appearing in the texts are mapped from "before" to "after" values
// before returning
//
// returns a list of four vectors of the same length
// - strcode : character vector of hex codes, in the form
// e.g, x0010x3010...
Expand All @@ -144,9 +157,10 @@ List strip_helper(CharacterMatrix match_mat,
stop("special keys and values have different length");

std::set<std::string> destinations;
SimpleOrderedDict<std::string> specialchars;
set_parameters(destinations, specialchars,
dest_names, special_keys, special_hex);
SimpleOrderedDict<std::string, std::string> specialchars;
SimpleOrderedDict<int, int> codemap;
set_parameters(destinations, specialchars, codemap,
dest_names, special_keys, special_hex, code_before, code_after);


// debug
Expand Down Expand Up @@ -314,7 +328,11 @@ List strip_helper(CharacterMatrix match_mat,
toconv_vec.push_back(doc[i].toconv);
table_vec.push_back(doc[i].intable);

int_vec_list.push_back(hex_to_int(doc[i].strcode));
IntegerVector tmp = hex_to_int(doc[i].strcode);
for (int i=0; i<tmp.size(); i++)
if (codemap.haskey(tmp[i])) tmp[i] = codemap.getvalue(tmp[i]);

int_vec_list.push_back(tmp);
}
List out = List::create(Named("strcode") = str_vec,
Named("intcode") = int_vec_list,
Expand Down
1 change: 1 addition & 0 deletions tests/testthat/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
testthat-problems.rds
Loading