From 2df640d8b758a539a1003c45082deb2a936ba619 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 22 May 2017 00:17:04 +0200 Subject: [PATCH 01/76] possible fix for valgrind warnings --- src/rcpp_pre13_savestata.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/rcpp_pre13_savestata.cpp b/src/rcpp_pre13_savestata.cpp index 573264fc..29a5337c 100644 --- a/src/rcpp_pre13_savestata.cpp +++ b/src/rcpp_pre13_savestata.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -15,8 +15,7 @@ * with this program. If not, see . */ -#include "readstata.h" - +#include using namespace Rcpp; using namespace std; @@ -445,7 +444,7 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat) writebin(nlen, dta, swapit); writestr(labname, nvarnameslen, dta); - dta.write((char*)&padding,3); + writestr((char*)&padding, 3, dta); writebin(N, dta, swapit); writebin(txtlen, dta, swapit); From c65f8079b3448e51f746564517d0b15bc38c5f5b Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Wed, 24 May 2017 18:52:46 +0200 Subject: [PATCH 02/76] Fseeko64 (#50) * improve selectrows option using fseeko64 idea and patch by @Kevin-Jin --- DESCRIPTION | 3 +- NEWS | 3 + R/RcppExports.R | 6 +- README.md | 1 + inst/include/readstata.h | 52 ++++++++-- src/read_dta.cpp | 202 ++++++++++++++++++--------------------- src/read_pre13_dta.cpp | 95 ++++++++---------- 7 files changed, 185 insertions(+), 177 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fea0e9bf..a33bba89 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,8 @@ Authors@R: c( person("R Core Team", role="cph"), person("Magnus Thor", "Torfason", role="ctb"), person("Luke M.", "Olson", role="ctb"), - person("Giovanni", "Righi", role="ctb") + person("Giovanni", "Righi", role="ctb"), + person("Kevin Jin", role="ctb") ) Description: Function to read and write the 'Stata' file format. URL: https://github.com/sjewo/readstata13 diff --git a/NEWS b/NEWS index f151b3b7..8e4a8e31 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,6 @@ +[testing] +- improve partial reading + [0.9.0] - generate unique factor labels to prevent errors in factor definition - check interrupt for long read diff --git a/R/RcppExports.R b/R/RcppExports.R index 42ed113c..d6bef198 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -2,14 +2,14 @@ # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 stata_pre13_save <- function(filePath, dat) { - .Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat) + .Call(readstata13_stata_pre13_save, filePath, dat) } stata_read <- function(filePath, missing, selectrows) { - .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows) + .Call(readstata13_stata_read, filePath, missing, selectrows) } stata_save <- function(filePath, dat) { - .Call('readstata13_stata_save', PACKAGE = 'readstata13', filePath, dat) + .Call(readstata13_stata_save, filePath, dat) } diff --git a/README.md b/README.md index f4eea0a8..1db2b9ae 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ devtools::install_github("sjewo/readstata13", ref="testing") [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13) ### Working features +* [testing] Improvements to partial reading. Idea by Kevin Jin * [0.9.0] Generate unique factor labels to prevent errors in factor definition * [0.9.0] check interrupt for long read. Patch by Giovanni Righi * [0.9.0] updates to notes, roxygen and register diff --git a/inst/include/readstata.h b/inst/include/readstata.h index 548c6d25..22d70ed6 100644 --- a/inst/include/readstata.h +++ b/inst/include/readstata.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -30,12 +30,12 @@ /* Test for GCC < 4.9.0 */ #if GCC_VERSION < 40900 & !__clang__ - typedef signed char int8_t; - typedef unsigned char uint8_t; - typedef signed short int16_t; - typedef unsigned short uint16_t; - typedef signed int int32_t; - typedef unsigned int uint32_t; +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef signed short int16_t; +typedef unsigned short uint16_t; +typedef signed int int32_t; +typedef unsigned int uint32_t; #else #include #endif @@ -125,4 +125,42 @@ static void writestr(std::string val_s, T len, std::fstream& dta) } +inline uint64_t calc_rowlength(Rcpp::IntegerVector vartype) { + + uint16_t k = vartype.size(); + + Rcpp::NumericVector rlen(k); + // calculate row length in byte + for (uint16_t i=0; i nmax)) { - import = 0; - } else { - import = 1; + // skip into the data part + fseeko64(file, rlength * nmin, SEEK_CUR); - // temoprary index values to be reset at the end of the loop - tmp_val = j; - j = tmp_j; - tmp_j++; - } + for(uint32_t j=0; jSTATA_DOUBLE_NA_MAX)) ) - REAL(VECTOR_ELT(df,i))[j] = NA_REAL; - else - REAL(VECTOR_ELT(df,i))[j] = val_d; - } + if ((missing == 0) && !(val_d == R_NegInf) && ((val_dSTATA_DOUBLE_NA_MAX)) ) + REAL(VECTOR_ELT(df,i))[j] = NA_REAL; + else + REAL(VECTOR_ELT(df,i))[j] = val_d; + break; } // float - case 65527: + case STATA_FLOAT: { float val_f = 0; val_f = readbin(val_f, file, swapit); - if (import == 1) { - if ((missing == 0) && ((val_fSTATA_FLOAT_NA_MAX)) ) - REAL(VECTOR_ELT(df,i))[j] = NA_REAL; - else - REAL(VECTOR_ELT(df,i))[j] = val_f; - } + if ((missing == 0) && ((val_fSTATA_FLOAT_NA_MAX)) ) + REAL(VECTOR_ELT(df,i))[j] = NA_REAL; + else + REAL(VECTOR_ELT(df,i))[j] = val_f; + break; } - //long - case 65528: + // long + case STATA_INT: { int32_t val_l = 0; val_l = readbin(val_l, file, swapit); - if (import == 1) { - if ((missing == 0) && ((val_lSTATA_INT_NA_MAX)) ) - INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; - else - INTEGER(VECTOR_ELT(df,i))[j] = val_l; - } + if ((missing == 0) && ((val_lSTATA_INT_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,i))[j] = val_l; + break; } // int - case 65529: + case STATA_SHORTINT: { int16_t val_i = 0; val_i = readbin(val_i, file, swapit); - if (import == 1) { - if ((missing == 0) && ((val_iSTATA_SHORTINT_NA_MAX)) ) - INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; - else - INTEGER(VECTOR_ELT(df,i))[j] = val_i; - } + if ((missing == 0) && ((val_iSTATA_SHORTINT_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,i))[j] = val_i; + break; } // byte - case 65530: + case STATA_BYTE: { int8_t val_b = 0; val_b = readbin(val_b, file, swapit); - if (import == 1) { - if (missing == 0 && ( (val_bSTATA_BYTE_NA_MAX)) ) - INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; - else - INTEGER(VECTOR_ELT(df,i))[j] = val_b; - } + if (missing == 0 && ( (val_bSTATA_BYTE_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,i))[j] = val_b; + break; } // strings with 2045 or fewer characters @@ -530,75 +516,69 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { std::string val_s (len, '\0'); readstring(val_s, file, val_s.size()); - if (import == 1) { - as(df[i])[j] = val_s; - } + as(df[i])[j] = val_s; break; } // string of any length - case 32768: + case STATA_STRL: {// strL 2*4bit or 2 + 6 bit - //char val_strl[22]; // FixMe: Strl in 118 switch (release) - { - - case 117: - { - uint32_t v = 0, o = 0; - - v = readbin(v, file, swapit); - o = readbin(o, file, swapit); - - stringstream val_stream; - val_stream << v << '_' << o; - string val_strl = val_stream.str(); - //sprintf(val_strl, "%010d%010d", v, o); - if (import == 1) { - as(df[i])[j] = val_strl; - } - break; - } - case 118: - { - int16_t v = 0; - int64_t o = 0, z = 0; - - z = readbin(z, file, swapit); - - // works for LSF on little- and big-endian - if(byteorder.compare("LSF")==0) { - v = (int16_t)z; - o = (z >> 16); - } - - // works if we read a big-endian file on little-endian - if(byteorder.compare("MSF")==0) { - v = (z >> 48) & ((1 << 16) - 1); - o = z & ((1 << 16) - 1); - } - - stringstream val_stream; - val_stream << v << '_' << o; - string val_strl = val_stream.str(); - - if (import == 1) { - as(df[i])[j] = val_strl; - } - break; - } + { + + case 117: + { + uint32_t v = 0, o = 0; + + v = readbin(v, file, swapit); + o = readbin(o, file, swapit); + + stringstream val_stream; + val_stream << v << '_' << o; + string val_strl = val_stream.str(); + + as(df[i])[j] = val_strl; + + break; + } + case 118: + { + int16_t v = 0; + int64_t o = 0, z = 0; + + z = readbin(z, file, swapit); + + // works for LSF on little- and big-endian + if(byteorder.compare("LSF")==0) { + v = (int16_t)z; + o = (z >> 16); + } + + // works if we read a big-endian file on little-endian + if(byteorder.compare("MSF")==0) { + v = (z >> 48) & ((1 << 16) - 1); + o = z & ((1 << 16) - 1); } + + stringstream val_stream; + val_stream << v << '_' << o; + string val_strl = val_stream.str(); + + as(df[i])[j] = val_strl; + + break; + } + } } } Rcpp::checkUserInterrupt(); } - - // reset temporary index values to their original values - if (import == 1) - j = tmp_val; } + // skip to end of data part + fseeko64(file, rlength * (n - nmax -1), SEEK_CUR); + // 3. Create a data.frame df.attr("row.names") = rvec; df.attr("names") = varnames; diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp index 8ef8818f..6077bd43 100644 --- a/src/read_pre13_dta.cpp +++ b/src/read_pre13_dta.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -394,8 +394,8 @@ List read_pre13_dta(FILE * file, const bool missing, int const type = vartype[i]; switch(type) { - case STATA_FLOAT: case STATA_DOUBLE: + case STATA_FLOAT: SET_VECTOR_ELT(df, i, NumericVector(no_init(nn))); break; @@ -411,25 +411,15 @@ List read_pre13_dta(FILE * file, const bool missing, } } - uint32_t tmp_j = 0, tmp_val = 0; - bool import = 1; + uint64_t rlength = calc_rowlength(vartype); // 2. fill it with data - for(uint32_t j=0; j nmax)) { - import = 0; - } else { - import = 1; + // skip into the data part + fseeko64(file, rlength * nmin, SEEK_CUR); - // temoprary index values to be reset at the end of the loop - tmp_val = j; - j = tmp_j; - tmp_j++; - } + for(uint32_t j=0; jSTATA_DOUBLE_NA_MAX)) ) - REAL(VECTOR_ELT(df,i))[j] = NA_REAL; - else - REAL(VECTOR_ELT(df,i))[j] = val_d; - } + if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_dSTATA_DOUBLE_NA_MAX)) ) + REAL(VECTOR_ELT(df,i))[j] = NA_REAL; + else + REAL(VECTOR_ELT(df,i))[j] = val_d; + break; } // float @@ -456,26 +445,25 @@ List read_pre13_dta(FILE * file, const bool missing, float val_f = 0; val_f = readbin(val_f, file, swapit); - if (import == 1) { - if ((missing == FALSE) & ((val_fSTATA_FLOAT_NA_MAX)) ) - REAL(VECTOR_ELT(df,i))[j] = NA_REAL; - else - REAL(VECTOR_ELT(df,i))[j] = val_f; - } + if ((missing == FALSE) & ((val_fSTATA_FLOAT_NA_MAX)) ) + REAL(VECTOR_ELT(df,i))[j] = NA_REAL; + else + REAL(VECTOR_ELT(df,i))[j] = val_f; + break; } - //long + // long case STATA_INT: { int32_t val_l = 0; val_l = readbin(val_l, file, swapit); - if (import == 1) { - if ((missing == FALSE) & ((val_lSTATA_INT_NA_MAX)) ) - INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; - else - INTEGER(VECTOR_ELT(df,i))[j] = val_l; - } + + if ((missing == FALSE) & ((val_lSTATA_INT_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,i))[j] = val_l; + break; } // int @@ -484,12 +472,11 @@ List read_pre13_dta(FILE * file, const bool missing, int16_t val_i = 0; val_i = readbin(val_i, file, swapit); - if (import == 1) { - if ((missing == FALSE) & ((val_iSTATA_SHORTINT_NA_MAX)) ) - INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; - else - INTEGER(VECTOR_ELT(df,i))[j] = val_i; - } + if ((missing == FALSE) & ((val_iSTATA_SHORTINT_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,i))[j] = val_i; + break; } // byte @@ -498,12 +485,11 @@ List read_pre13_dta(FILE * file, const bool missing, int8_t val_b = 0; val_b = readbin(val_b, file, swapit); - if (import == 1) { - if ((missing == FALSE) & ( (val_bSTATA_BYTE_NA_MAX)) ) - INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; - else - INTEGER(VECTOR_ELT(df,i))[j] = val_b; - } + if ((missing == FALSE) & ( (val_bSTATA_BYTE_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,i))[j] = val_b; + break; } // strings with 244 or fewer characters @@ -514,20 +500,19 @@ List read_pre13_dta(FILE * file, const bool missing, std::string val_s (len, '\0'); readstring(val_s, file, val_s.size()); - if (import == 1) { - as(df[i])[j] = val_s; - } + + as(df[i])[j] = val_s; + break; } } Rcpp::checkUserInterrupt(); } - - // reset temporary index values to their original values - if (import == 1) - j = tmp_val; } + // skip to end of data part + fseeko64(file, rlength * (n - nmax -1), SEEK_CUR); + // 3. Create a data.frame df.attr("row.names") = rvec; df.attr("names") = varnames; From 4b5cdb614c04dabb952cad4f44004e97ee14cb2f Mon Sep 17 00:00:00 2001 From: Marvin Date: Wed, 24 May 2017 18:54:15 +0200 Subject: [PATCH 03/76] Update DESCRIPTION --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a33bba89..a6f869f8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,7 @@ Authors@R: c( person("Magnus Thor", "Torfason", role="ctb"), person("Luke M.", "Olson", role="ctb"), person("Giovanni", "Righi", role="ctb"), - person("Kevin Jin", role="ctb") + person("Jin", "Kevin", role="ctb") ) Description: Function to read and write the 'Stata' file format. URL: https://github.com/sjewo/readstata13 From 0df8b715d11a33d9fcd1e257e3edcf887ef3a9dc Mon Sep 17 00:00:00 2001 From: Marvin Date: Wed, 24 May 2017 18:59:33 +0200 Subject: [PATCH 04/76] Update DESCRIPTION again (first name, last name, role) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a6f869f8..a0bcccd9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,7 +11,7 @@ Authors@R: c( person("Magnus Thor", "Torfason", role="ctb"), person("Luke M.", "Olson", role="ctb"), person("Giovanni", "Righi", role="ctb"), - person("Jin", "Kevin", role="ctb") + person("Kevin", "Jin", role="ctb") ) Description: Function to read and write the 'Stata' file format. URL: https://github.com/sjewo/readstata13 From 29ac62c48ac07136327ce8faa8ab6598bedcaec1 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sun, 11 Jun 2017 19:13:42 +0200 Subject: [PATCH 05/76] f119: k is uint32_t --- src/rcpp_savestata.cpp | 7 +++++-- src/read_dta.cpp | 9 +++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index b5b8b11f..be6a5bbb 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -28,7 +28,7 @@ using namespace std; // [[Rcpp::export]] int stata_save(const char * filePath, Rcpp::DataFrame dat) { - uint16_t k = dat.size(); + uint32_t k = dat.size(); uint64_t n = dat.nrows(); const string timestamp = dat.attr("timestamp"); @@ -141,7 +141,10 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) writestr(byteord, byteord.size(), dta); writestr(sbyteorder, 3, dta); // LSF writestr(K, K.size(), dta); - writebin(k, dta, swapit); + if (release < 119) + writebin((int16_t)k, dta, swapit); + if (release == 119) + writebin(k, dta, swapit); writestr(num, num.size(), dta); if (release==117) writebin((int32_t)n, dta, swapit); diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 0a582640..4536eca1 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -96,8 +96,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { * Number of Variables */ - uint16_t k = 0; - k = readbin(k, file, swapit); + uint32_t k = 0; + if(release < 119){ + k = readbin((uint16_t)k, file, swapit); + } + if(release == 199){ + k = readbin(k, file, swapit); + } // test("", file); From d861ab6a9bd64f0b9ec42c9893c376392b9fced0 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sun, 11 Jun 2017 19:24:40 +0200 Subject: [PATCH 06/76] f119: typo --- src/read_dta.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_dta.cpp b/src/read_dta.cpp index d0213d08..d02257b2 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -100,7 +100,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { if(release < 119){ k = readbin((uint16_t)k, file, swapit); } - if(release == 199){ + if(release == 119){ k = readbin(k, file, swapit); } From 90720e4491f9342d375a50fa5dfed5e4a4d1ff88 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 13:30:58 +0200 Subject: [PATCH 07/76] f119: more work towards a working implementation k = uint32 at more places --- R/save.R | 6 ++++-- src/rcpp_savestata.cpp | 11 ++++++----- src/read_dta.cpp | 17 +++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/save.R b/R/save.R index 0d2f0d43..048b6d16 100644 --- a/R/save.R +++ b/R/save.R @@ -75,6 +75,8 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, stop("Path is invalid. Possibly a non existend directory.") # Allow writing version as Stata version not Stata format + if (version==15L) + version <- 119 if (version==14L) version <- 118 if (version==13L) @@ -90,7 +92,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, if (version==6) version <- 108 - if (version<102 | version == 109 | version == 116 | version>118) + if (version<102 | version == 109 | version == 116 | version>119) stop("Version missmatch abort execution. No Data was saved.") sstr <- 2045 @@ -278,7 +280,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, vartypen[empty] <- sbyte } - # recode character variables. 118 wants utf-8, so encoding may be required + # recode character variables. >118 wants utf-8, so encoding may be required if(doRecode) { #TODO: use seq_len ? for(v in (1:ncol(data))[vartypen == "character"]) { diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index be6a5bbb..3e0e2b2e 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -65,6 +65,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) lbllen = 33; break; case 118: + case 119: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; @@ -148,7 +149,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) writestr(num, num.size(), dta); if (release==117) writebin((int32_t)n, dta, swapit); - if (release==118) + if (release==118 | release==119) writebin(n, dta, swapit); writestr(lab, lab.size(), dta); @@ -167,7 +168,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) if (release==117) writebin((uint8_t)ndlabel, dta, swapit); - if (release==118) + if (release==118 | release==119) writebin(ndlabel, dta, swapit); writestr(datalabel,datalabel.size(), dta); @@ -178,7 +179,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) if (release == 117) { writebin(zero, dta, swapit); } - if (release == 118) { + if (release == 118 | release == 119) { writebin(zero, dta, swapit); writebin(zero, dta, swapit); } @@ -348,7 +349,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) for(uint64_t j = 0; j < n; ++j) { - for (uint16_t i = 0; i < k; ++i) + for (uint32_t i = 0; i < k; ++i) { int const type = vartypes[i]; switch(type < 2046 ? 2045 : type) @@ -523,7 +524,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) writebin(v, dta, swapit); if (release==117) writebin((uint32_t)o, dta, swapit); - if (release==118) + if (release==118 | release==119) writebin(o, dta, swapit); writebin(t, dta, swapit); writebin(len, dta, swapit); diff --git a/src/read_dta.cpp b/src/read_dta.cpp index d02257b2..3bbd6f7c 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -30,7 +30,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { */ int8_t fversion = 117L; //f = first - int8_t lversion = 118L; //l = last + int8_t lversion = 119L; //l = last std::string version(3, '\0'); readstring(version, file, version.size()); @@ -65,6 +65,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { lbllen = 33; break; case 118: + case 119: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; @@ -97,12 +98,10 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { */ uint32_t k = 0; - if(release < 119){ + if(release < 119) k = readbin((uint16_t)k, file, swapit); - } - if(release == 119){ + if(release==119) k = readbin(k, file, swapit); - } // test("", file); @@ -114,12 +113,10 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { uint64_t n = 0; - if(release==117) { + if(release==117) n = readbin((uint32_t)n, file, swapit); - } - if (release ==118) { + if (release ==118 | release==119) n = readbin(n, file, swapit); - } // test("", file); @@ -134,7 +131,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { uint16_t ndlabel = 0; - if (release==118) + if (release==118 | release==119) ndlabel = readbin(ndlabel, file, swapit); if (release==117) ndlabel = readbin((int8_t)ndlabel, file, swapit); From e9c05f7ad13248b18ba5689394aed2323eaa4e36 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 13:31:18 +0200 Subject: [PATCH 08/76] f119: more uint32_t --- src/rcpp_savestata.cpp | 10 +++++----- src/read_dta.cpp | 23 ++++++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index 3e0e2b2e..87dcbb3f 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -211,7 +211,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) map(2) = dta.tellg(); writestr(startvart, startvart.size(), dta); uint16_t nvartype; - for (uint16_t i = 0; i < k; ++i) + for (uint32_t i = 0; i < k; ++i) { nvartype = as(vartypes[i]); @@ -223,7 +223,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) /* ... */ map(3) = dta.tellg(); writestr(startvarn, startvarn.size(), dta); - for (uint16_t i = 0; i < k; ++i ) + for (uint32_t i = 0; i < k; ++i ) { string nvarname = as(nvarnames[i]); nvarname[nvarname.size()] = '\0'; @@ -254,7 +254,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) /* ... */ map(5) = dta.tellg(); writestr(startform, startform.size(), dta); - for (uint16_t i = 0; i < k; ++i ) + for (uint32_t i = 0; i < k; ++i ) { string nformats = as(formats[i]); @@ -270,7 +270,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) /* ... */ map(6) = dta.tellg(); writestr(startvalLabel, startvalLabel.size(), dta); - for (uint16_t i = 0; i < k; ++i) + for (uint32_t i = 0; i < k; ++i) { string nvalLabels = as(valLabels[i]); nvalLabels[nvalLabels.size()] = '\0'; @@ -287,7 +287,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) /* ... */ map(7) = dta.tellg(); writestr(startvarlabel, startvarlabel.size(), dta); - for (uint16_t i = 0; i < k; ++i) + for (uint32_t i = 0; i < k; ++i) { if (!Rf_isNull(varLabels) && Rf_length(varLabels) > 1) { string nvarLabels = as(varLabels[i]); diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 3bbd6f7c..12948200 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -219,8 +219,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { */ IntegerVector vartype(k); - for (uint16_t i=0; i Date: Mon, 12 Jun 2017 14:29:53 +0200 Subject: [PATCH 09/76] f119: close FixMes. we should be able to read and write a f119 file w/o strls --- src/read_dta.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 12948200..9058d084 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -221,9 +221,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { IntegerVector vartype(k); for (uint32_t i=0; i Date: Mon, 12 Jun 2017 15:53:01 +0200 Subject: [PATCH 10/76] thinko --- src/read_dta.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 9058d084..866952ed 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -221,12 +221,8 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { IntegerVector vartype(k); for (uint32_t i=0; i Date: Mon, 12 Jun 2017 15:53:23 +0200 Subject: [PATCH 11/76] big_k musst be bigger than k --- src/rcpp_savestata.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index 87dcbb3f..7590509d 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -241,9 +241,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) map(4) = dta.tellg(); writestr(startsor, startsor.size(), dta); - uint32_t big_k = k+1; + uint64_t big_k = k+1; - for (uint32_t i = 0; i < big_k; ++i) + for (uint64_t i = 0; i < big_k; ++i) { uint16_t nsortlist = 0; writebin(nsortlist, dta, swapit); From e7ffcab4c472181fa939e17f5d99d3f246e2b943 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 17:39:55 +0200 Subject: [PATCH 12/76] f119: experimental support for writing strls --- src/rcpp_savestata.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index 7590509d..4c72532d 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -491,6 +491,29 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) dta.write((char*)&z, sizeof(z)); // writestr((char*)&z, sizeof(z), dta); + break; + } + case 119: + { + int32_t v = i+1; + int64_t o = j+1; + char z[8]; + + // push back every v, o and val_strl + V.push_back(v); + O.push_back(o); + + // z is 'vv-- ----' + memcpy(&z[0], &v, sizeof(v)); + if (SBYTEORDER == 1) { + o <<= 24; + } + memcpy(&z[3], &o, 5); + // z is 'vvvo oooo' + + dta.write((char*)&z, sizeof(z)); + // writestr((char*)&z, sizeof(z), dta); + break; } } From 21cc28880d58666c9e6131657ecbbf1197e25679 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 17:46:22 +0200 Subject: [PATCH 13/76] f119: experimental support for reading strls --- src/read_dta.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 866952ed..1c952755 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -570,6 +570,33 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { break; } + case 119: + { + int32_t v = 0; + int64_t o = 0, z = 0; + + z = readbin(z, file, swapit); + + // works for LSF on little- and big-endian + if(byteorder.compare("LSF")==0) { + v = (int32_t)z; + o = (z >> 24); + } + + // works if we read a big-endian file on little-endian + if(byteorder.compare("MSF")==0) { + v = (z >> 48) & ((1 << 24) - 1); + o = z & ((1 << 24) - 1); + } + + stringstream val_stream; + val_stream << v << '_' << o; + string val_strl = val_stream.str(); + + as(df[i])[j] = val_strl; + + break; + } } } } From 81289fd8ed8c4ef9da51787a905e24be34ae4fd3 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 18:12:25 +0200 Subject: [PATCH 14/76] f119: read strl part --- src/read_dta.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 1c952755..216b8958 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -656,6 +656,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { break; } case 118: + case 119: { uint32_t v = 0; uint64_t o = 0; From 947a0dfc79d96259e1d0e02f780eedb10f6577b5 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 18:29:36 +0200 Subject: [PATCH 15/76] f119: experimental support reading strl --- src/read_dta.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 216b8958..1976ac00 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -579,11 +579,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { // works for LSF on little- and big-endian if(byteorder.compare("LSF")==0) { - v = (int32_t)z; + v = (int32_t)z & ((1 << 24) - 1); o = (z >> 24); } - // works if we read a big-endian file on little-endian + // FixMe: works if we read a big-endian file on little-endian if(byteorder.compare("MSF")==0) { v = (z >> 48) & ((1 << 24) - 1); o = z & ((1 << 24) - 1); From 03846b833bfb085f8de01ecc9529de983d68c624 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 18:39:26 +0200 Subject: [PATCH 16/76] f119: update tests --- tests/testthat/test_save.R | 48 +++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R index fcfd34e6..990a8649 100644 --- a/tests/testthat/test_save.R +++ b/tests/testthat/test_save.R @@ -40,6 +40,7 @@ dir.create("data") dd <- mtcars +save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) @@ -56,6 +57,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -76,6 +78,7 @@ dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("version", { + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -103,6 +106,7 @@ dir.create("data") dd <- mtcars +save.dta13(dd, "data/dta_119.dta", version = 119, compress = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, compress = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, compress = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, compress = TRUE) @@ -119,6 +123,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, compress = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, compress = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, compress = TRUE) +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -139,6 +144,7 @@ dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("compress", { + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -167,7 +173,7 @@ dir.create("data") dd <- mtcars dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man")) - +save.dta13(dd, "data/dta_119.dta", version = 119, convert.factors = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = TRUE) @@ -184,7 +190,7 @@ save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = TRUE) # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = TRUE) # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = TRUE) - +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -205,6 +211,7 @@ dd107 <- read.dta13("data/dta_107.dta") unlink("data", recursive = TRUE) test_that("convert.factors TRUE", { + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -233,7 +240,7 @@ dir.create("data") dd <- mtcars dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man")) - +save.dta13(dd, "data/dta_119.dta", version = 119, convert.factors = FALSE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = FALSE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = FALSE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = FALSE) @@ -250,7 +257,7 @@ save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = FALSE) # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = FALSE) # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = FALSE) - +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -275,6 +282,7 @@ dd$am <- dd$am + 1 unlink("data", recursive = TRUE) test_that("convert.factors TRUE", { + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -301,6 +309,7 @@ dir.create("data") dd <- mtcars +save.dta13(dd, "data/dta_119.dta", version = 119, add.rownames = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, add.rownames = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, add.rownames = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, add.rownames = TRUE) @@ -317,7 +326,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, add.rownames = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, add.rownames = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, add.rownames = TRUE) - +dd119 <- read.dta13("data/dta_119.dta", add.rownames = TRUE) dd118 <- read.dta13("data/dta_118.dta", add.rownames = TRUE) dd117 <- read.dta13("data/dta_117.dta", add.rownames = TRUE) dd115 <- read.dta13("data/dta_115.dta", add.rownames = TRUE) @@ -339,6 +348,7 @@ unlink("data", recursive = TRUE) test_that("add.rownames TRUE", { # Check that rownames are identical + expect_true(identical(rownames(dd), rownames(dd119))) expect_true(identical(rownames(dd), rownames(dd118))) expect_true(identical(rownames(dd), rownames(dd117))) expect_true(identical(rownames(dd), rownames(dd115))) @@ -356,6 +366,7 @@ test_that("add.rownames TRUE", { expect_true(identical(rownames(dd), rownames(dd102))) # Check that data is identical + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -385,6 +396,7 @@ dir.create("data") dd <- mtcars +save.dta13(dd, "data/dta_119.dta", version = 119, data.label = dl) save.dta13(dd, "data/dta_118.dta", version = 118, data.label = dl) save.dta13(dd, "data/dta_117.dta", version = 117, data.label = dl) save.dta13(dd, "data/dta_115.dta", version = 115, data.label = dl) @@ -401,7 +413,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, data.label = dl) save.dta13(dd, "data/dta_103.dta", version = 103, data.label = dl) # save.dta13(dd, "data/dta_102.dta", version = 102, data.label = dl) # no data label - +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -422,6 +434,7 @@ unlink("data", recursive = TRUE) test_that("data label", { # Check that rownames are identical + expect_equal(dl, attr(dd119, "datalabel")) expect_equal(dl, attr(dd118, "datalabel")) expect_equal(dl, attr(dd117, "datalabel")) expect_equal(dl, attr(dd115, "datalabel")) @@ -450,6 +463,7 @@ dir.create("data") dd <- data.frame( dat = Sys.Date() ) +save.dta13(dd, "data/dta_119.dta", version = 119, convert.dates = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.dates = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.dates = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.dates = TRUE) @@ -466,7 +480,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, convert.dates = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, convert.dates = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, convert.dates = TRUE) - +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -487,6 +501,7 @@ unlink("data", recursive = TRUE) test_that("convert.dates TRUE", { # Check that rownames are identical + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -517,6 +532,7 @@ dd <- data.frame( dat = c(paste(replicate(2046, "a"), collapse = ""), paste(replicate(2046, "b"), collapse = "")), stringsAsFactors = FALSE) +save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) # save.dta13(dd, "data/dta_115.dta", version = 115) # no strl @@ -533,7 +549,7 @@ save.dta13(dd, "data/dta_117.dta", version = 117) # save.dta13(dd, "data/dta_103.dta", version = 103) # save.dta13(dd, "data/dta_102.dta", version = 102) - +dd119 <- read.dta13("data/dta_119.dta", replace.strl = TRUE) dd118 <- read.dta13("data/dta_118.dta", replace.strl = TRUE) dd117 <- read.dta13("data/dta_117.dta", replace.strl = TRUE) # dd115 <- read.dta13("data/dta_115.dta") @@ -554,6 +570,7 @@ unlink("data", recursive = TRUE) test_that("replace.strl TRUE", { # Check that rownames are identical + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) # expect_true(datacompare(dd, dd115)) @@ -578,9 +595,9 @@ if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") - dd <- data.frame(x.1 = 1) +save.dta13(dd, "data/dta_119.dta", version = 119, convert.underscore = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.underscore = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.underscore = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.underscore = TRUE) @@ -597,7 +614,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, convert.underscore = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, convert.underscore = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, convert.underscore = TRUE) - +dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") @@ -620,6 +637,7 @@ names(dd) <- "x_1" test_that("convert.underscore TRUE", { # check numerics + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -636,6 +654,7 @@ test_that("convert.underscore TRUE", { expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) # check names + expect_true(namescompare(dd, dd119)) expect_true(namescompare(dd, dd118)) expect_true(namescompare(dd, dd117)) expect_true(namescompare(dd, dd115)) @@ -660,9 +679,9 @@ if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") - dd <- mtcars +save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) @@ -679,7 +698,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) - +dd119 <- read.dta13("data/dta_119.dta", select.rows = 5) dd118 <- read.dta13("data/dta_118.dta", select.rows = 5) dd117 <- read.dta13("data/dta_117.dta", select.rows = 5) dd115 <- read.dta13("data/dta_115.dta", select.rows = 5) @@ -702,6 +721,7 @@ dd <- dd[1:5,] test_that("select.rows = 5", { # check numerics + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) @@ -725,6 +745,7 @@ dir.create("data") dd <- mtcars +save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) @@ -741,7 +762,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) - +dd119 <- read.dta13("data/dta_119.dta", select.rows = c(5,10)) dd118 <- read.dta13("data/dta_118.dta", select.rows = c(5,10)) dd117 <- read.dta13("data/dta_117.dta", select.rows = c(5,10)) dd115 <- read.dta13("data/dta_115.dta", select.rows = c(5,10)) @@ -764,6 +785,7 @@ dd <- dd[5:10,] test_that("select.rows = c(5,10)", { # check numerics + expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) From 2da02712bd1611b1168a94dc561696cc8517a1e8 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 18:45:08 +0200 Subject: [PATCH 17/76] f119: finish implementation of experimental support for LSF --- NEWS | 1 + R/read.R | 12 ++++++------ R/save.R | 2 +- README.md | 5 +++-- src/rcpp_savestata.cpp | 2 +- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 8e4a8e31..e234a362 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,5 @@ [testing] +- experimental support for format 119 - improve partial reading [0.9.0] diff --git a/R/read.R b/R/read.R index 8df7e895..113943bb 100644 --- a/R/read.R +++ b/R/read.R @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki +# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki # Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley # # This program is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata #' value labels are created. #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is -#' TRUE, missing factor labels are created from integers. If duplicated labels are found, +#' TRUE, missing factor labels are created from integers. If duplicated labels are found, #' unique labels will be generated according the following scheme: "label_(integer code)". #' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8 #' to system encoding. Options are "latin1" or "UTF-8" to specify target @@ -357,7 +357,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, varunique <- na.omit(unique(data[, i])) # assign label if label set is complete if (all(varunique %in% labtable)) { - + #check for duplicated labels labcount <- table(names(labtable)) if(any(labcount > 1)) { @@ -366,17 +366,17 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, # generate unique labels from assigned label and code number names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") } - + data[, i] <- factor(data[, i], levels=labtable, labels=names(labtable)) # else generate labels from codes } else if (generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) - + data[, i] <- factor(data[, i], levels=gen.lab, labels=names(gen.lab)) - + } else { warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no labels assigned.\n Set option generate.factors=T to generate labels.")) } diff --git a/R/save.R b/R/save.R index 048b6d16..b23af667 100644 --- a/R/save.R +++ b/R/save.R @@ -1,5 +1,5 @@ # -# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki +# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the diff --git a/README.md b/README.md index 1db2b9ae..0e21aee4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # readstata13 -Package to read and write all Stata file formats (version 14 and older) into a -R data.frame. The dta file format versions 102 to 118 are supported. +Package to read and write all Stata file formats (version 15 and older) into a +R data.frame. The dta file format versions 102 to 119 are supported. The function ```read.dta``` from the foreign package imports only dta files from Stata versions <= 12. Due to the different structure and features of dta 117 @@ -56,6 +56,7 @@ devtools::install_github("sjewo/readstata13", ref="testing") [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13) ### Working features +* [testing] Experimental support for format 119 * [testing] Improvements to partial reading. Idea by Kevin Jin * [0.9.0] Generate unique factor labels to prevent errors in factor definition * [0.9.0] check interrupt for long read. Patch by Giovanni Righi diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index 4c72532d..4a90e46e 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the From 2d5ca181d9f6287e524cccdb962bbcd7537acf52 Mon Sep 17 00:00:00 2001 From: Marvin Date: Mon, 12 Jun 2017 19:08:46 +0200 Subject: [PATCH 18/76] f119: update documentation --- R/save.R | 2 +- man/read.dta13.Rd | 2 +- man/save.dta13.Rd | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/save.R b/R/save.R index b23af667..95f12d8f 100644 --- a/R/save.R +++ b/R/save.R @@ -39,7 +39,7 @@ #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will #' use all of Statas numeric-vartypes. #' @param version \emph{numeric.} Stata format for the resulting dta-file either -#' the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14. +#' the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 15. #' @return The function writes a dta-file to disk. The following features of the #' dta file format are supported: #' \describe{ diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd index a4b2ab6b..56018ea0 100644 --- a/man/read.dta13.Rd +++ b/man/read.dta13.Rd @@ -16,7 +16,7 @@ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE, value labels are created.} \item{generate.factors}{\emph{logical.} If \code{TRUE} and convert.factors is -TRUE, missing factor labels are created from integers. If duplicated labels are found, +TRUE, missing factor labels are created from integers. If duplicated labels are found, unique labels will be generated according the following scheme: "label_(integer code)".} \item{encoding}{\emph{character.} Strings can be converted from Windows-1252 or UTF-8 diff --git a/man/save.dta13.Rd b/man/save.dta13.Rd index e4c9808b..32831383 100644 --- a/man/save.dta13.Rd +++ b/man/save.dta13.Rd @@ -37,7 +37,7 @@ will be added to the dta-file.} use all of Statas numeric-vartypes.} \item{version}{\emph{numeric.} Stata format for the resulting dta-file either -the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.} +the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 15.} \item{convert.underscore}{\emph{logical.} If \code{TRUE}, all non numerics or non alphabet characters will be converted to underscores.} From 942376d1fb7f6f74ba93d59b470a9d7560b1af55 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Mon, 12 Jun 2017 23:13:00 +0200 Subject: [PATCH 19/76] f119: codestyle pedantic --- src/rcpp_savestata.cpp | 14 +++++++------- src/read_dta.cpp | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index 4a90e46e..37d8aea7 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -147,9 +147,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) if (release == 119) writebin(k, dta, swapit); writestr(num, num.size(), dta); - if (release==117) + if (release == 117) writebin((int32_t)n, dta, swapit); - if (release==118 | release==119) + if ((release == 118) | (release == 119)) writebin(n, dta, swapit); writestr(lab, lab.size(), dta); @@ -166,9 +166,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) } ndlabel = datalabel.size(); - if (release==117) + if (release == 117) writebin((uint8_t)ndlabel, dta, swapit); - if (release==118 | release==119) + if ((release == 118) | (release == 119)) writebin(ndlabel, dta, swapit); writestr(datalabel,datalabel.size(), dta); @@ -179,7 +179,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) if (release == 117) { writebin(zero, dta, swapit); } - if (release == 118 | release == 119) { + if ((release == 118) | (release == 119)) { writebin(zero, dta, swapit); writebin(zero, dta, swapit); } @@ -545,9 +545,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) writestr(gso, gso.size(), dta); writebin(v, dta, swapit); - if (release==117) + if (release == 117) writebin((uint32_t)o, dta, swapit); - if (release==118 | release==119) + if ((release == 118) | (release == 119)) writebin(o, dta, swapit); writebin(t, dta, swapit); writebin(len, dta, swapit); diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 1976ac00..27aaa97b 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -98,9 +98,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { */ uint32_t k = 0; - if(release < 119) + if (release < 119) k = readbin((uint16_t)k, file, swapit); - if(release==119) + if (release == 119) k = readbin(k, file, swapit); // @@ -113,9 +113,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { uint64_t n = 0; - if(release==117) + if(release == 117) n = readbin((uint32_t)n, file, swapit); - if (release ==118 | release==119) + if ((release == 118) | (release == 119)) n = readbin(n, file, swapit); // @@ -131,9 +131,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { uint16_t ndlabel = 0; - if (release==118 | release==119) + if ((release == 118) | (release == 119)) ndlabel = readbin(ndlabel, file, swapit); - if (release==117) + if (release == 117) ndlabel = readbin((int8_t)ndlabel, file, swapit); std::string datalabel(ndlabel, '\0'); From a436ce6c101e005c42ea4fda60cb388d9e079a47 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Tue, 13 Jun 2017 00:01:22 +0200 Subject: [PATCH 20/76] cleaning --- src/rcpp_savestata.cpp | 4 ++-- src/read_dta.cpp | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp index 37d8aea7..35a60ef6 100644 --- a/src/rcpp_savestata.cpp +++ b/src/rcpp_savestata.cpp @@ -155,7 +155,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) /* write a datalabel */ - if(!datalabel.empty()) + if (!datalabel.empty()) { if (datalabel.size() > maxdatalabelsize) { @@ -440,7 +440,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat) string val_s = as(as(dat[i])[j]); - if(val_s == "NA") + if (val_s == "NA") val_s.clear(); writestr(val_s, len, dta); diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 27aaa97b..52c0cf08 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -113,7 +113,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { uint64_t n = 0; - if(release == 117) + if (release == 117) n = readbin((uint32_t)n, file, swapit); if ((release == 118) | (release == 119)) n = readbin(n, file, swapit); @@ -551,13 +551,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { z = readbin(z, file, swapit); // works for LSF on little- and big-endian - if(byteorder.compare("LSF")==0) { + if (byteorder.compare("LSF")==0) { v = (int16_t)z; o = (z >> 16); } // works if we read a big-endian file on little-endian - if(byteorder.compare("MSF")==0) { + if (byteorder.compare("MSF")==0) { v = (z >> 48) & ((1 << 16) - 1); o = z & ((1 << 16) - 1); } @@ -578,13 +578,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { z = readbin(z, file, swapit); // works for LSF on little- and big-endian - if(byteorder.compare("LSF")==0) { + if (byteorder.compare("LSF")==0) { v = (int32_t)z & ((1 << 24) - 1); o = (z >> 24); } // FixMe: works if we read a big-endian file on little-endian - if(byteorder.compare("MSF")==0) { + if (byteorder.compare("MSF")==0) { v = (z >> 48) & ((1 << 24) - 1); o = z & ((1 << 24) - 1); } @@ -652,7 +652,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { stringstream val_stream; val_stream << v << '_' << o; ref.assign(val_stream.str()); - //sprintf(ref, "%010d%010d", v, o); + break; } case 118: @@ -660,15 +660,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { { uint32_t v = 0; uint64_t o = 0; - // uint64_t z = 0; + v = readbin(v, file, swapit); o = readbin(o, file, swapit); - // z = readbin(z, file, swapit); stringstream val_stream; val_stream << v << '_' << o; ref.assign(val_stream.str()); - //sprintf(ref, "%010d%010ld", v, o); break; } From 781d0a24c1916c01f1e10973e527cb24016bc555 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Tue, 13 Jun 2017 01:07:34 +0200 Subject: [PATCH 21/76] export the dimensions of the original dta-file. --- R/read.R | 2 ++ man/read.dta13.Rd | 2 ++ src/read_dta.cpp | 6 ++++++ src/read_pre13_dta.cpp | 7 +++++++ 4 files changed, 17 insertions(+) diff --git a/R/read.R b/R/read.R index 113943bb..7f875964 100644 --- a/R/read.R +++ b/R/read.R @@ -101,6 +101,8 @@ #' and the contents of Stata characteristic field.} #' \item{missing:}{List of numeric vectors with Stata missing type for each #' variable.} +#' \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.} +#' \item{orig.dim:}{Dimension recorded inside the dta-file.} #' } #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members #' from foreign::read.dta(). diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd index 56018ea0..6fc3dfd6 100644 --- a/man/read.dta13.Rd +++ b/man/read.dta13.Rd @@ -74,6 +74,8 @@ The function returns a data.frame with attributes. The attributes and the contents of Stata characteristic field.} \item{missing:}{List of numeric vectors with Stata missing type for each variable.} + \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.} + \item{orig.dim:}{Dimension recorded inside the dta-file.} } } \description{ diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 52c0cf08..65f126bd 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -121,6 +121,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { // test("", file); test("