Skip to content

Commit

Permalink
Merge pull request #21 from kota7/bug-chinese#20
Browse files Browse the repository at this point in the history
Bug chinese#20
  • Loading branch information
kota7 authored Aug 9, 2023
2 parents 00dbfad + d3e42a3 commit 246e0de
Show file tree
Hide file tree
Showing 18 changed files with 427 additions and 93 deletions.
37 changes: 0 additions & 37 deletions .circleci/config.yml

This file was deleted.

4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@
.RData
.Ruserdata

*.swp

~*
~*
.~lock*
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: striprtf
Type: Package
Title: Extract Text from RTF File
Version: 0.5.4
Version: 0.6.0
Authors@R: c(
person("Kota", "Mori", email = "[email protected]", role = c("aut", "cre"))
)
Expand All @@ -16,7 +16,8 @@ Imports:
utils
Suggests:
testthat
RoxygenNote: 7.1.1
RoxygenNote: 7.2.3
LinkingTo: Rcpp
URL: https://github.com/kota7/striprtf
BugReports: https://github.com/kota7/striprtf/issues
Encoding: UTF-8
8 changes: 7 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
striprtf v0.5.3 (Dev version as of 2021-09-07)
striprtf v0.6.0 (Dev version as of 2023-08-09)
==============

* Fix error on some code pages (e.g. CP936)


striprtf v0.5.3 (Release date: 2021-09-07)
==============

* `read_rtf`, `strip_rtf` now allow mismatched curly braces
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ hex_to_int <- function(h, sep = 'x') {
.Call('_striprtf_hex_to_int', PACKAGE = 'striprtf', h, sep)
}

strip_helper <- function(match_mat, dest_names, special_keys, special_hex, verbose) {
.Call('_striprtf_strip_helper', PACKAGE = 'striprtf', match_mat, dest_names, special_keys, special_hex, verbose)
strip_helper <- function(match_mat, dest_names, special_keys, special_hex, code_before, code_after, verbose) {
.Call('_striprtf_strip_helper', PACKAGE = 'striprtf', match_mat, dest_names, special_keys, special_hex, code_before, code_after, verbose)
}

32 changes: 19 additions & 13 deletions R/striprtf.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,12 @@ strip_rtf <- function(text, verbose = FALSE,
cp <- stringr::str_match(text, "\\\\ansicpg([0-9]+)")[,2]
# Resolve the code page declared by \ansicpg (if any) into the integer
# "before" -> "after" code mapping that strip_helper applies.
# Start from an empty mapping so that both vectors are always integer
# vectors, even when no code page is declared or no table is available.
cpname <- NA_character_
code_before <- integer(0)
code_after <- integer(0)
if (!is.na(cp)) {
  cpname <- paste0("CP", cp)
  if (cpname %in% names(.cptable)) {
    code_before <- .cptable[[cpname]]$before
    code_after <- .cptable[[cpname]]$after
  } else {
    # Without this guard, .cptable[[cpname]] is NULL for an unknown code
    # page and NULL would be handed to strip_helper's integer arguments.
    # Keep the text unconverted and tell the user instead.
    warning("conversion table for ", cpname, " is missing")
  }
}

pattern <- stringr::regex(
Expand Down Expand Up @@ -117,6 +121,8 @@ strip_rtf <- function(text, verbose = FALSE,
dest_names = .destinations,
special_keys = keys,
special_hex = hexstr,
code_before = code_before,
code_after = code_after,
verbose = verbose)
#print(parsed)

Expand All @@ -127,19 +133,19 @@ strip_rtf <- function(text, verbose = FALSE,
#print(out)
out <- lapply(parsed$intcode, intToUtf8) %>% unlist()

# code page translation
if (!is.na(cpname)) {
if (cpname %in% names(.cptable)) {
out[parsed$toconv] <- chartr(.cptable[[cpname]]$before,
.cptable[[cpname]]$after,
out[parsed$toconv])
#out[parsed$toconv] <- lapply(out[parsed$toconv], function(a) {
# chartr(.cptable[[cpname]]$before, .cptable[[cpname]]$after, a)
#})
} else {
warning("conversion table for ", cpname, " is missing")
}
}
# code page translation ... will be done in the strip_helper function
# if (!is.na(cpname)) {
# if (cpname %in% names(.cptable)) {
# out[parsed$toconv] <- chartr(.cptable[[cpname]]$before,
# .cptable[[cpname]]$after,
# out[parsed$toconv])
# #out[parsed$toconv] <- lapply(out[parsed$toconv], function(a) {
# # chartr(.cptable[[cpname]]$before, .cptable[[cpname]]$after, a)
# #})
# } else {
# warning("conversion table for ", cpname, " is missing")
# }
# }

# if there is no table or ignore table option is specified,
# remove tmp_rep characters,
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@

[![CRAN
status](https://www.r-pkg.org/badges/version/striprtf)](https://cran.r-project.org/package=striprtf)
[![Build
Status](https://travis-ci.org/kota7/striprtf.svg?branch=master)](https://travis-ci.org/kota7/striprtf)
[![AppVeyor Build
Status](https://ci.appveyor.com/api/projects/status/github/kota7/striprtf?branch=master&svg=true)](https://ci.appveyor.com/project/kota7/striprtf)
[![](http://cranlogs.r-pkg.org/badges/striprtf)](https://cran.r-project.org/package=striprtf)
[![R-CMD-check](https://github.com/kota7/striprtf/workflows/R-CMD-check/badge.svg)](https://github.com/kota7/striprtf/actions)
[![CircleCI build
status](https://circleci.com/gh/kota7/striprtf.svg?style=svg)](https://circleci.com/gh/kota7/striprtf)

# striprtf: Extract Text from RTF (Rich Text Format) File

Expand Down
13 changes: 6 additions & 7 deletions internal-script/make-cp-table.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,20 +72,19 @@ for (file in file_list)
# we only need the cases where codes are different
x <- x[x$before != x$after,]

# create before and after strings for conversion
bef <- intToUtf8(x$before) %>% paste0(collapse = "")
aft <- intToUtf8(x$after) %>% paste0(collapse = "")
stopifnot(nchar(bef) == nchar(aft))
tmp <- list(before = bef, after = aft)
# create before and after integer values
#bef <- intToUtf8(x$before) %>% paste0(collapse = "")
#aft <- intToUtf8(x$after) %>% paste0(collapse = "")
#stopifnot(nchar(bef) == nchar(aft))

#table_list <- c(table_list, table_name)
out <- c(out, list(tmp) %>% setNames(table_name))
out <- c(out, list(x) %>% setNames(table_name))
}



.cptable <- out
devtools::use_data(.cptable, internal = TRUE, overwrite = TRUE)
usethis::use_data(.cptable, internal = TRUE, overwrite = TRUE)


#cat(paste0(table_list, collapse = ", "))
Expand Down
10 changes: 6 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,27 @@ BEGIN_RCPP
END_RCPP
}
// strip_helper
List strip_helper(CharacterMatrix match_mat, CharacterVector dest_names, CharacterVector special_keys, CharacterVector special_hex, bool verbose);
RcppExport SEXP _striprtf_strip_helper(SEXP match_matSEXP, SEXP dest_namesSEXP, SEXP special_keysSEXP, SEXP special_hexSEXP, SEXP verboseSEXP) {
List strip_helper(CharacterMatrix match_mat, CharacterVector dest_names, CharacterVector special_keys, CharacterVector special_hex, IntegerVector code_before, IntegerVector code_after, bool verbose);
RcppExport SEXP _striprtf_strip_helper(SEXP match_matSEXP, SEXP dest_namesSEXP, SEXP special_keysSEXP, SEXP special_hexSEXP, SEXP code_beforeSEXP, SEXP code_afterSEXP, SEXP verboseSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterMatrix >::type match_mat(match_matSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type dest_names(dest_namesSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type special_keys(special_keysSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type special_hex(special_hexSEXP);
Rcpp::traits::input_parameter< IntegerVector >::type code_before(code_beforeSEXP);
Rcpp::traits::input_parameter< IntegerVector >::type code_after(code_afterSEXP);
Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP);
rcpp_result_gen = Rcpp::wrap(strip_helper(match_mat, dest_names, special_keys, special_hex, verbose));
rcpp_result_gen = Rcpp::wrap(strip_helper(match_mat, dest_names, special_keys, special_hex, code_before, code_after, verbose));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_striprtf_to_hexstr", (DL_FUNC) &_striprtf_to_hexstr, 2},
{"_striprtf_hex_to_int", (DL_FUNC) &_striprtf_hex_to_int, 2},
{"_striprtf_strip_helper", (DL_FUNC) &_striprtf_strip_helper, 5},
{"_striprtf_strip_helper", (DL_FUNC) &_striprtf_strip_helper, 7},
{NULL, NULL, 0}
};

Expand Down
2 changes: 1 addition & 1 deletion src/dechex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ IntegerVector hex_to_int(std::string h, char sep)
IntegerVector out;
bool started = false;
int start = 0;
for (int i = 0; i < h.size(); i++)
for (unsigned int i = 0; i < h.size(); i++)
{
//Rcout << i+1 << "/" << h.size() << ":" << h[i] << "\n";
if (h[i] == sep) {
Expand Down
22 changes: 10 additions & 12 deletions src/dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,28 @@ using namespace Rcpp;
// not usable for other occasions


template <class T>
template <class S, class T>
struct SimpleOrderedDict
{
std::vector<std::string> keys;
std::vector<S> keys;
std::vector<T> values;

SimpleOrderedDict()
{
std::vector<std::string> keys(0);
std::vector<S> keys(0);
std::vector<T> values(0);
}
unsigned int size()
{
return keys.size();
}

void insert(const std::pair<std::string, T> &p)
void insert(S key, T value)
{
std::string key = p.first;
T v = p.second;
// if empty, just add them
if (size() == 0) {
keys.push_back(key);
values.push_back(v);
values.push_back(value);
return;
}
// make sure the key is ordered
Expand All @@ -44,10 +42,10 @@ template <class T>
if (haskey(key)) stop(key + " already exists");

keys.push_back(key);
values.push_back(v);
values.push_back(value);
}

int locate(const std::string &key)
int locate(const S &key)
{
// returns the location of key
// if not exists, returns negative integer
Expand All @@ -68,17 +66,17 @@ template <class T>
return -1;
}

bool haskey(const std::string &key)
bool haskey(const S &key)
{
// returns true if and only if the key exists
return (locate(key) >= 0);
}

T getvalue(const std::string & key)
T getvalue(const S & key)
{
int i = locate(key);
if (i >= 0) return values[i];
return NULL;
Rcpp::stop("key does not exist");
}
};

Expand Down
34 changes: 26 additions & 8 deletions src/strip_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,25 @@ struct Section

void set_parameters(
std::set<std::string> &destinations,
SimpleOrderedDict<std::string> &specialchars,
SimpleOrderedDict<std::string, std::string> &specialchars,
SimpleOrderedDict<int, int> &codemap,
CharacterVector dest_names,
CharacterVector special_keys,
CharacterVector special_hex)
CharacterVector special_hex,
IntegerVector int_before,
IntegerVector int_after)
{
// destinations
for (int i = 0; i < dest_names.size(); i++)
destinations.insert(as<std::string>(dest_names[i]));

// special chars
for (int i = 0; i < special_keys.size(); i++)
specialchars.insert(std::pair<std::string, std::string>(
as<std::string>(special_keys[i]), as<std::string>(special_hex[i])));
specialchars.insert(as<std::string>(special_keys[i]), as<std::string>(special_hex[i]));

// code page mapping (before -> after)
for (int i = 0; i < int_before.size(); i++)
codemap.insert(int_before[i], int_after[i]);

}

Expand Down Expand Up @@ -108,6 +114,8 @@ List strip_helper(CharacterMatrix match_mat,
CharacterVector dest_names,
CharacterVector special_keys,
CharacterVector special_hex,
IntegerVector code_before,
IntegerVector code_after,
bool verbose) {
// helps rtf2text function by handling loop part
//
Expand All @@ -128,6 +136,11 @@ List strip_helper(CharacterMatrix match_mat,
// character vector of the same size, which matches the
// special words to the hex string to replace
//
// code_before, code_after:
// Mapping of integer codes based on the code page specified
// Codes appearing in the texts are mapped from "before" to "after" values
// before returning
//
// returns a list of four vectors of the same length
// - strcode : character vector of hex codes, in the form
// e.g, x0010x3010...
Expand All @@ -144,9 +157,10 @@ List strip_helper(CharacterMatrix match_mat,
stop("special keys and values have different length");

std::set<std::string> destinations;
SimpleOrderedDict<std::string> specialchars;
set_parameters(destinations, specialchars,
dest_names, special_keys, special_hex);
SimpleOrderedDict<std::string, std::string> specialchars;
SimpleOrderedDict<int, int> codemap;
set_parameters(destinations, specialchars, codemap,
dest_names, special_keys, special_hex, code_before, code_after);


// debug
Expand Down Expand Up @@ -314,7 +328,11 @@ List strip_helper(CharacterMatrix match_mat,
toconv_vec.push_back(doc[i].toconv);
table_vec.push_back(doc[i].intable);

int_vec_list.push_back(hex_to_int(doc[i].strcode));
IntegerVector tmp = hex_to_int(doc[i].strcode);
for (int i=0; i<tmp.size(); i++)
if (codemap.haskey(tmp[i])) tmp[i] = codemap.getvalue(tmp[i]);

int_vec_list.push_back(tmp);
}
List out = List::create(Named("strcode") = str_vec,
Named("intcode") = int_vec_list,
Expand Down
1 change: 1 addition & 0 deletions tests/testthat/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
testthat-problems.rds
Loading

0 comments on commit 246e0de

Please sign in to comment.