From 2df640d8b758a539a1003c45082deb2a936ba619 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Mon, 22 May 2017 00:17:04 +0200
Subject: [PATCH 01/76] possible fix for valgrind warnings

---
 src/rcpp_pre13_savestata.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/rcpp_pre13_savestata.cpp b/src/rcpp_pre13_savestata.cpp
index 573264fc..29a5337c 100644
--- a/src/rcpp_pre13_savestata.cpp
+++ b/src/rcpp_pre13_savestata.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -15,8 +15,7 @@
  * with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "readstata.h"
-
+#include <readstata.h>
 
 using namespace Rcpp;
 using namespace std;
@@ -445,7 +444,7 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
         writebin(nlen, dta, swapit);
 
         writestr(labname, nvarnameslen, dta);
-        dta.write((char*)&padding,3);
+        writestr((char*)&padding, 3, dta);
         writebin(N, dta, swapit);
         writebin(txtlen, dta, swapit);
 

From c65f8079b3448e51f746564517d0b15bc38c5f5b Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@rub.de>
Date: Wed, 24 May 2017 18:52:46 +0200
Subject: [PATCH 02/76] Fseeko64 (#50)

* improve selectrows option using fseeko64 idea and patch by @Kevin-Jin
---
 DESCRIPTION              |   3 +-
 NEWS                     |   3 +
 R/RcppExports.R          |   6 +-
 README.md                |   1 +
 inst/include/readstata.h |  52 ++++++++--
 src/read_dta.cpp         | 202 ++++++++++++++++++---------------------
 src/read_pre13_dta.cpp   |  95 ++++++++----------
 7 files changed, 185 insertions(+), 177 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fea0e9bf..a33bba89 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,7 +10,8 @@ Authors@R: c(
     person("R Core Team", role="cph"),
     person("Magnus Thor", "Torfason", role="ctb"),
     person("Luke M.", "Olson", role="ctb"),
-    person("Giovanni", "Righi", role="ctb")
+    person("Giovanni", "Righi", role="ctb"),
+    person("Kevin Jin", role="ctb")
     )
 Description: Function to read and write the 'Stata' file format.
 URL: https://github.com/sjewo/readstata13
diff --git a/NEWS b/NEWS
index f151b3b7..8e4a8e31 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,6 @@
+[testing]
+- improve partial reading
+
 [0.9.0]
 - generate unique factor labels to prevent errors in factor definition
 - check interrupt for long read
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 42ed113c..d6bef198 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -2,14 +2,14 @@
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
 stata_pre13_save <- function(filePath, dat) {
-    .Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat)
+    .Call(readstata13_stata_pre13_save, filePath, dat)
 }
 
 stata_read <- function(filePath, missing, selectrows) {
-    .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows)
+    .Call(readstata13_stata_read, filePath, missing, selectrows)
 }
 
 stata_save <- function(filePath, dat) {
-    .Call('readstata13_stata_save', PACKAGE = 'readstata13', filePath, dat)
+    .Call(readstata13_stata_save, filePath, dat)
 }
 
diff --git a/README.md b/README.md
index f4eea0a8..1db2b9ae 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13)
 
 ### Working features
+* [testing] Improvements to partial reading. Idea by Kevin Jin
 * [0.9.0] Generate unique factor labels to prevent errors in factor definition
 * [0.9.0] check interrupt for long read. Patch by Giovanni Righi
 * [0.9.0] updates to notes, roxygen and register
diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 548c6d25..22d70ed6 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -30,12 +30,12 @@
 
 /* Test for GCC < 4.9.0 */
 #if GCC_VERSION < 40900 & !__clang__
-    typedef signed char int8_t;
-    typedef unsigned char uint8_t;
-    typedef signed short int16_t;
-    typedef unsigned short uint16_t;
-    typedef signed int int32_t;
-    typedef unsigned int uint32_t;
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef signed short int16_t;
+typedef unsigned short uint16_t;
+typedef signed int int32_t;
+typedef unsigned int uint32_t;
 #else
 #include <stdint.h>
 #endif
@@ -125,4 +125,42 @@ static void writestr(std::string val_s, T len, std::fstream& dta)
 
 }
 
+// Size in bytes of one observation (row) of the data part. vartype holds
+// one Stata type code per variable; any code without a case of its own is
+// taken as the byte width itself (this is how fixed-width strings are coded).
+inline uint64_t calc_rowlength(Rcpp::IntegerVector vartype) {
+
+  // 32 bit: a 16 bit counter would truncate files with more than 65535 variables
+  uint32_t const k = vartype.size();
+
+  // sum up directly as integer, no temporary R vector is required
+  uint64_t rlength = 0;
+  for (uint32_t i=0; i<k; ++i)
+  {
+    int const type = vartype[i];
+    switch(type)
+    {
+    case STATA_DOUBLE:
+    case STATA_STRL: // inside the row a strL takes 8 byte
+      rlength += 8;
+      break;
+    case STATA_FLOAT:
+    case STATA_INT:
+      rlength += 4;
+      break;
+    case STATA_SHORTINT:
+      rlength += 2;
+      break;
+    case STATA_BYTE:
+      rlength += 1;
+      break;
+    default:
+      rlength += static_cast<uint64_t>(type);
+      break;
+    }
+  }
+
+  return rlength;
+}
+
 #endif
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 0a582640..96bd8967 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -411,14 +411,14 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
     int const type = vartype[i];
     switch(type)
     {
-    case 65526:
-    case 65527:
+    case STATA_DOUBLE:
+    case STATA_FLOAT:
       SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
       break;
 
-    case 65528:
-    case 65529:
-    case 65530:
+    case STATA_INT:
+    case STATA_SHORTINT:
+    case STATA_BYTE:
       SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
       break;
 
@@ -428,24 +428,15 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
     }
   }
 
-  uint64_t tmp_j = 0, tmp_val = 0;
-  bool import = 1;
+  uint64_t rlength = calc_rowlength(vartype);
 
   // 2. fill it with data
-  for(uint64_t j=0; j<n; ++j)
-  {
 
-    // import is a bool if data is handed over to R
-    if ((j < nmin) || (j > nmax)) {
-      import = 0;
-    } else {
-      import = 1;
+  // skip into the data part
+  fseeko64(file, rlength * nmin, SEEK_CUR);
 
-      // temoprary index values to be reset at the end of the loop
-      tmp_val = j;
-      j = tmp_j;
-      tmp_j++;
-    }
+  for(uint32_t j=0; j<nn; ++j)
+  {
 
     for (uint16_t i=0; i<k; ++i)
     {
@@ -453,73 +444,68 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
       switch(type < 2046 ? 2045 : type)
       {
         // double
-      case 65526:
+      case STATA_DOUBLE:
       {
         double val_d = 0;
         val_d = readbin(val_d, file, swapit);
 
-        if (import == 1) {
-          if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
-            REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
-          else
-            REAL(VECTOR_ELT(df,i))[j] = val_d;
-        }
+        if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
+          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+        else
+          REAL(VECTOR_ELT(df,i))[j] = val_d;
+
         break;
       }
         // float
-      case 65527:
+      case STATA_FLOAT:
       {
         float val_f = 0;
         val_f = readbin(val_f, file, swapit);
 
-        if (import == 1) {
-          if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
-            REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
-          else
-            REAL(VECTOR_ELT(df,i))[j] = val_f;
-        }
+        if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
+          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+        else
+          REAL(VECTOR_ELT(df,i))[j] = val_f;
+
         break;
       }
-        //long
-      case 65528:
+        // long
+      case STATA_INT:
       {
         int32_t val_l = 0;
         val_l = readbin(val_l, file, swapit);
 
-        if (import == 1) {
-          if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
-            INTEGER(VECTOR_ELT(df,i))[j]  = NA_INTEGER;
-          else
-            INTEGER(VECTOR_ELT(df,i))[j] = val_l;
-        }
+        if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,i))[j]  = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,i))[j] = val_l;
+
         break;
       }
         // int
-      case 65529:
+      case STATA_SHORTINT:
       {
         int16_t val_i = 0;
         val_i = readbin(val_i, file, swapit);
 
-        if (import == 1) {
-          if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
-            INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
-          else
-            INTEGER(VECTOR_ELT(df,i))[j] = val_i;
-        }
+        if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,i))[j] = val_i;
+
         break;
       }
         // byte
-      case 65530:
+      case STATA_BYTE:
       {
         int8_t val_b = 0;
         val_b = readbin(val_b, file, swapit);
 
-        if (import == 1) {
-          if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
-            INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
-          else
-            INTEGER(VECTOR_ELT(df,i))[j] = val_b;
-        }
+        if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,i))[j] = val_b;
+
         break;
       }
         // strings with 2045 or fewer characters
@@ -530,75 +516,69 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         std::string val_s (len, '\0');
 
         readstring(val_s, file, val_s.size());
-        if (import == 1) {
-          as<CharacterVector>(df[i])[j] = val_s;
-        }
+        as<CharacterVector>(df[i])[j] = val_s;
         break;
       }
         // string of any length
-      case 32768:
+      case STATA_STRL:
       {// strL 2*4bit or 2 + 6 bit
-        //char val_strl[22];
 
         // FixMe: Strl in 118
         switch (release)
-        {
-
-        case 117:
-          {
-            uint32_t v = 0, o = 0;
-
-            v = readbin(v, file, swapit);
-            o = readbin(o, file, swapit);
-
-            stringstream val_stream;
-            val_stream << v << '_' << o;
-            string val_strl = val_stream.str();
-            //sprintf(val_strl, "%010d%010d", v, o);
-            if (import == 1) {
-              as<CharacterVector>(df[i])[j] = val_strl;
-            }
-            break;
-          }
-        case 118:
-          {
-            int16_t v = 0;
-            int64_t o = 0, z = 0;
-
-            z = readbin(z, file, swapit);
-
-            // works for LSF on little- and big-endian
-            if(byteorder.compare("LSF")==0) {
-              v = (int16_t)z;
-              o = (z >> 16);
-            }
-
-            // works if we read a big-endian file on little-endian
-            if(byteorder.compare("MSF")==0) {
-              v = (z >> 48) & ((1 << 16) - 1);
-              o = z & ((1 << 16) - 1);
-            }
-
-            stringstream val_stream;
-            val_stream << v << '_' << o;
-            string val_strl = val_stream.str();
-
-            if (import == 1) {
-              as<CharacterVector>(df[i])[j] = val_strl;
-            }
-            break;
-          }
+      {
+
+      case 117:
+      {
+        uint32_t v = 0, o = 0;
+
+        v = readbin(v, file, swapit);
+        o = readbin(o, file, swapit);
+
+        stringstream val_stream;
+        val_stream << v << '_' << o;
+        string val_strl = val_stream.str();
+
+        as<CharacterVector>(df[i])[j] = val_strl;
+
+        break;
+      }
+      case 118:
+      {
+        int16_t v = 0;
+        int64_t o = 0, z = 0;
+
+        z = readbin(z, file, swapit);
+
+        // works for LSF on little- and big-endian
+        if(byteorder.compare("LSF")==0) {
+          v = (int16_t)z;
+          o = (z >> 16);
+        }
+
+        // works if we read a big-endian file on little-endian
+        if(byteorder.compare("MSF")==0) {
+          v = (z >> 48) & ((1 << 16) - 1);
+          o = z & ((1 << 16) - 1);
         }
+
+        stringstream val_stream;
+        val_stream << v << '_' << o;
+        string val_strl = val_stream.str();
+
+        as<CharacterVector>(df[i])[j] = val_strl;
+
+        break;
+      }
+      }
       }
       }
       Rcpp::checkUserInterrupt();
     }
-
-    // reset temporary index values to their original values
-    if (import == 1)
-      j = tmp_val;
   }
 
+  // skip to end of data part
+  fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
+
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
   df.attr("names") = varnames;
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 8ef8818f..6077bd43 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -394,8 +394,8 @@ List read_pre13_dta(FILE * file, const bool missing,
     int const type = vartype[i];
     switch(type)
     {
-    case STATA_FLOAT:
     case STATA_DOUBLE:
+    case STATA_FLOAT:
       SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
       break;
 
@@ -411,25 +411,15 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
   }
 
-  uint32_t tmp_j = 0, tmp_val = 0;
-  bool import = 1;
+  uint64_t rlength = calc_rowlength(vartype);
 
   // 2. fill it with data
 
-  for(uint32_t j=0; j<n; ++j)
-  {
-
-    // import is a bool if data is handed over to R
-    if ((j < nmin) || (j > nmax)) {
-      import = 0;
-    } else {
-      import = 1;
+  // skip into the data part
+  fseeko64(file, rlength * nmin, SEEK_CUR);
 
-      // temoprary index values to be reset at the end of the loop
-      tmp_val = j;
-      j = tmp_j;
-      tmp_j++;
-    }
+  for(uint32_t j=0; j<nn; ++j)
+  {
 
     for (uint16_t i=0; i<k; ++i)
     {
@@ -442,12 +432,11 @@ List read_pre13_dta(FILE * file, const bool missing,
         double val_d = 0;
         val_d = readbin(val_d, file, swapit);
 
-        if (import == 1) {
-          if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_d<STATA_DOUBLE_NA_MIN) | (val_d>STATA_DOUBLE_NA_MAX)) )
-            REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
-          else
-            REAL(VECTOR_ELT(df,i))[j] = val_d;
-        }
+        if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_d<STATA_DOUBLE_NA_MIN) | (val_d>STATA_DOUBLE_NA_MAX)) )
+          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+        else
+          REAL(VECTOR_ELT(df,i))[j] = val_d;
+
         break;
       }
         // float
@@ -456,26 +445,25 @@ List read_pre13_dta(FILE * file, const bool missing,
         float val_f = 0;
         val_f = readbin(val_f, file, swapit);
 
-        if (import == 1) {
-          if ((missing == FALSE) & ((val_f<STATA_FLOAT_NA_MIN) | (val_f>STATA_FLOAT_NA_MAX)) )
-            REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
-          else
-            REAL(VECTOR_ELT(df,i))[j] = val_f;
-        }
+        if ((missing == FALSE) & ((val_f<STATA_FLOAT_NA_MIN) | (val_f>STATA_FLOAT_NA_MAX)) )
+          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+        else
+          REAL(VECTOR_ELT(df,i))[j] = val_f;
+
         break;
       }
-        //long
+        // long
       case STATA_INT:
       {
         int32_t val_l = 0;
         val_l = readbin(val_l, file, swapit);
 
-        if (import == 1) {
-          if ((missing == FALSE) & ((val_l<STATA_INT_NA_MIN) | (val_l>STATA_INT_NA_MAX)) )
-            INTEGER(VECTOR_ELT(df,i))[j]  = NA_INTEGER;
-          else
-            INTEGER(VECTOR_ELT(df,i))[j] = val_l;
-        }
+
+        if ((missing == FALSE) & ((val_l<STATA_INT_NA_MIN) | (val_l>STATA_INT_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,i))[j]  = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,i))[j] = val_l;
+
         break;
       }
         // int
@@ -484,12 +472,11 @@ List read_pre13_dta(FILE * file, const bool missing,
         int16_t val_i = 0;
         val_i = readbin(val_i, file, swapit);
 
-        if (import == 1) {
-          if ((missing == FALSE) & ((val_i<STATA_SHORTINT_NA_MIN) | (val_i>STATA_SHORTINT_NA_MAX)) )
-            INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
-          else
-            INTEGER(VECTOR_ELT(df,i))[j] = val_i;
-        }
+        if ((missing == FALSE) & ((val_i<STATA_SHORTINT_NA_MIN) | (val_i>STATA_SHORTINT_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,i))[j] = val_i;
+
         break;
       }
         // byte
@@ -498,12 +485,11 @@ List read_pre13_dta(FILE * file, const bool missing,
         int8_t val_b = 0;
         val_b = readbin(val_b, file, swapit);
 
-        if (import == 1) {
-          if ((missing == FALSE) & ( (val_b<STATA_BYTE_NA_MIN) | (val_b>STATA_BYTE_NA_MAX)) )
-            INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
-          else
-            INTEGER(VECTOR_ELT(df,i))[j] = val_b;
-        }
+        if ((missing == FALSE) & ( (val_b<STATA_BYTE_NA_MIN) | (val_b>STATA_BYTE_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,i))[j] = val_b;
+
         break;
       }
         // strings with 244 or fewer characters
@@ -514,20 +500,19 @@ List read_pre13_dta(FILE * file, const bool missing,
         std::string val_s (len, '\0');
 
         readstring(val_s, file, val_s.size());
-        if (import == 1) {
-          as<CharacterVector>(df[i])[j] = val_s;
-        }
+
+        as<CharacterVector>(df[i])[j] = val_s;
+
         break;
       }
       }
       Rcpp::checkUserInterrupt();
     }
-
-    // reset temporary index values to their original values
-    if (import == 1)
-      j = tmp_val;
   }
 
+  // skip to end of data part
+  fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
+
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
   df.attr("names") = varnames;

From 4b5cdb614c04dabb952cad4f44004e97ee14cb2f Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Wed, 24 May 2017 18:54:15 +0200
Subject: [PATCH 03/76] Update DESCRIPTION

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a33bba89..a6f869f8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,7 +11,7 @@ Authors@R: c(
     person("Magnus Thor", "Torfason", role="ctb"),
     person("Luke M.", "Olson", role="ctb"),
     person("Giovanni", "Righi", role="ctb"),
-    person("Kevin Jin", role="ctb")
+    person("Jin", "Kevin", role="ctb")
     )
 Description: Function to read and write the 'Stata' file format.
 URL: https://github.com/sjewo/readstata13

From 0df8b715d11a33d9fcd1e257e3edcf887ef3a9dc Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Wed, 24 May 2017 18:59:33 +0200
Subject: [PATCH 04/76] Update DESCRIPTION again

(first name, last name, role)
---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a6f869f8..a0bcccd9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,7 +11,7 @@ Authors@R: c(
     person("Magnus Thor", "Torfason", role="ctb"),
     person("Luke M.", "Olson", role="ctb"),
     person("Giovanni", "Righi", role="ctb"),
-    person("Jin", "Kevin", role="ctb")
+    person("Kevin", "Jin", role="ctb")
     )
 Description: Function to read and write the 'Stata' file format.
 URL: https://github.com/sjewo/readstata13

From 29ac62c48ac07136327ce8faa8ab6598bedcaec1 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sun, 11 Jun 2017 19:13:42 +0200
Subject: [PATCH 05/76] f119: k is uint32_t

---
 src/rcpp_savestata.cpp | 7 +++++--
 src/read_dta.cpp       | 9 +++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index b5b8b11f..be6a5bbb 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -28,7 +28,7 @@ using namespace std;
 // [[Rcpp::export]]
 int stata_save(const char * filePath, Rcpp::DataFrame dat)
 {
-  uint16_t k = dat.size();
+  uint32_t k = dat.size();
   uint64_t n = dat.nrows();
 
   const string timestamp = dat.attr("timestamp");
@@ -141,7 +141,10 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     writestr(byteord, byteord.size(), dta);
     writestr(sbyteorder, 3, dta); // LSF
     writestr(K, K.size(), dta);
-    writebin(k, dta, swapit);
+    if (release < 119)
+      writebin((int16_t)k, dta, swapit);
+    if (release == 119)
+      writebin(k, dta, swapit);
     writestr(num, num.size(), dta);
     if (release==117)
       writebin((int32_t)n, dta, swapit);
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 0a582640..4536eca1 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -96,8 +96,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   * Number of Variables
   */
 
-  uint16_t k = 0;
-  k = readbin(k, file, swapit);
+  uint32_t k = 0;
+  if(release < 119){
+    k = readbin((uint16_t)k, file, swapit);
+  }
+  if(release == 199){
+    k = readbin(k, file, swapit);
+  }
 
   //</K>
   test("</K>", file);

From d861ab6a9bd64f0b9ec42c9893c376392b9fced0 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sun, 11 Jun 2017 19:24:40 +0200
Subject: [PATCH 06/76] f119: typo

---
 src/read_dta.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index d0213d08..d02257b2 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -100,7 +100,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   if(release < 119){
     k = readbin((uint16_t)k, file, swapit);
   }
-  if(release == 199){
+  if(release == 119){
     k = readbin(k, file, swapit);
   }
 

From 90720e4491f9342d375a50fa5dfed5e4a4d1ff88 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 13:30:58 +0200
Subject: [PATCH 07/76] f119: more work towards a working implementation

k = uint32 at more places
---
 R/save.R               |  6 ++++--
 src/rcpp_savestata.cpp | 11 ++++++-----
 src/read_dta.cpp       | 17 +++++++----------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/R/save.R b/R/save.R
index 0d2f0d43..048b6d16 100644
--- a/R/save.R
+++ b/R/save.R
@@ -75,6 +75,8 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
     stop("Path is invalid. Possibly a non existend directory.")
 
   # Allow writing version as Stata version not Stata format
+  if (version==15L)
+    version <- 119
   if (version==14L)
     version <- 118
   if (version==13L)
@@ -90,7 +92,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   if (version==6)
     version <- 108
 
-  if (version<102 | version == 109 | version == 116 | version>118)
+  if (version<102 | version == 109 | version == 116 | version>119)
     stop("Version missmatch abort execution. No Data was saved.")
 
   sstr     <- 2045
@@ -278,7 +280,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
     vartypen[empty] <- sbyte
   }
 
-  # recode character variables. 118 wants utf-8, so encoding may be required
+  # recode character variables. >118 wants utf-8, so encoding may be required
   if(doRecode) {
     #TODO: use seq_len ?
     for(v in (1:ncol(data))[vartypen == "character"]) {
diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index be6a5bbb..3e0e2b2e 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -65,6 +65,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     lbllen = 33;
     break;
   case 118:
+  case 119:
     nvarnameslen = 129;
     nformatslen = 57;
     nvalLabelslen = 129;
@@ -148,7 +149,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     writestr(num, num.size(), dta);
     if (release==117)
       writebin((int32_t)n, dta, swapit);
-    if (release==118)
+    if (release==118 | release==119)
       writebin(n, dta, swapit);
     writestr(lab, lab.size(), dta);
 
@@ -167,7 +168,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
 
       if (release==117)
         writebin((uint8_t)ndlabel, dta, swapit);
-      if (release==118)
+      if (release==118 | release==119)
         writebin(ndlabel, dta, swapit);
 
       writestr(datalabel,datalabel.size(), dta);
@@ -178,7 +179,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
       if (release == 117) {
         writebin(zero, dta, swapit);
       }
-      if (release == 118) {
+      if (release == 118 | release == 119) {
         writebin(zero, dta, swapit);
         writebin(zero, dta, swapit);
       }
@@ -348,7 +349,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
 
     for(uint64_t j = 0; j < n; ++j)
     {
-      for (uint16_t i = 0; i < k; ++i)
+      for (uint32_t i = 0; i < k; ++i)
       {
         int const type = vartypes[i];
         switch(type < 2046 ? 2045 : type)
@@ -523,7 +524,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
       writebin(v, dta, swapit);
       if (release==117)
         writebin((uint32_t)o, dta, swapit);
-      if (release==118)
+      if (release==118 | release==119)
         writebin(o, dta, swapit);
       writebin(t, dta, swapit);
       writebin(len, dta, swapit);
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index d02257b2..3bbd6f7c 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -30,7 +30,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   */
 
   int8_t fversion = 117L; //f = first
-  int8_t lversion = 118L; //l = last
+  int8_t lversion = 119L; //l = last
 
   std::string version(3, '\0');
   readstring(version, file, version.size());
@@ -65,6 +65,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
     lbllen = 33;
     break;
   case 118:
+  case 119:
     nvarnameslen = 129;
     nformatslen = 57;
     nvalLabelslen = 129;
@@ -97,12 +98,10 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   */
 
   uint32_t k = 0;
-  if(release < 119){
+  if(release < 119)
     k = readbin((uint16_t)k, file, swapit);
-  }
-  if(release == 119){
+  if(release==119)
     k = readbin(k, file, swapit);
-  }
 
   //</K>
   test("</K>", file);
@@ -114,12 +113,10 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   uint64_t n = 0;
 
-  if(release==117) {
+  if(release==117)
     n = readbin((uint32_t)n, file, swapit);
-  }
-  if (release ==118) {
+  if (release ==118 | release==119)
     n = readbin(n, file, swapit);
-  }
 
   //</N>
   test("</N>", file);
@@ -134,7 +131,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   uint16_t ndlabel = 0;
 
-  if (release==118)
+  if (release==118 | release==119)
     ndlabel = readbin(ndlabel, file, swapit);
   if (release==117)
     ndlabel = readbin((int8_t)ndlabel, file, swapit);

From e9c05f7ad13248b18ba5689394aed2323eaa4e36 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 13:31:18 +0200
Subject: [PATCH 08/76] f119: more uint32_t

---
 src/rcpp_savestata.cpp | 10 +++++-----
 src/read_dta.cpp       | 23 ++++++++++++-----------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index 3e0e2b2e..87dcbb3f 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -211,7 +211,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     map(2) = dta.tellg();
     writestr(startvart, startvart.size(), dta);
     uint16_t nvartype;
-    for (uint16_t i = 0; i < k; ++i)
+    for (uint32_t i = 0; i < k; ++i)
     {
       nvartype = as<uint16_t>(vartypes[i]);
 
@@ -223,7 +223,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     /* <varnames> ... </varnames> */
     map(3) = dta.tellg();
     writestr(startvarn, startvarn.size(), dta);
-    for (uint16_t i = 0; i < k; ++i )
+    for (uint32_t i = 0; i < k; ++i )
     {
       string nvarname = as<string>(nvarnames[i]);
       nvarname[nvarname.size()] = '\0';
@@ -254,7 +254,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     /* <formats> ... </formats> */
     map(5) = dta.tellg();
     writestr(startform, startform.size(), dta);
-    for (uint16_t i = 0; i < k; ++i )
+    for (uint32_t i = 0; i < k; ++i )
     {
       string nformats = as<string>(formats[i]);
 
@@ -270,7 +270,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     /* <value_label_names> ... </value_label_names> */
     map(6) = dta.tellg();
     writestr(startvalLabel, startvalLabel.size(), dta);
-    for (uint16_t i = 0; i < k; ++i)
+    for (uint32_t i = 0; i < k; ++i)
     {
       string nvalLabels = as<string>(valLabels[i]);
       nvalLabels[nvalLabels.size()] = '\0';
@@ -287,7 +287,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     /* <variable_labels> ... </variable_labels> */
     map(7) = dta.tellg();
     writestr(startvarlabel, startvarlabel.size(), dta);
-    for (uint16_t i = 0; i < k; ++i)
+    for (uint32_t i = 0; i < k; ++i)
     {
       if (!Rf_isNull(varLabels) && Rf_length(varLabels) > 1) {
         string nvarLabels = as<string>(varLabels[i]);
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 3bbd6f7c..12948200 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -219,8 +219,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   */
 
   IntegerVector vartype(k);
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
+    // FixMe: uint32_t nvartype if release == 119?
     uint16_t nvartype = 0;
     nvartype = readbin(nvartype, file, swapit);
     vartype[i] = nvartype;
@@ -237,7 +238,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   std::string nvarnames(nvarnameslen, '\0');
 
   CharacterVector varnames(k);
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvarnames, file, nvarnames.size());
     varnames[i] = nvarnames;
@@ -254,11 +255,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   * Vector size is k+1.
   */
 
-  uint32_t big_k = k+1;
+  uint64_t big_k = k+1;
 
   IntegerVector sortlist(big_k);
-  for (uint32_t i=0; i<big_k; ++i)
+  for (uint64_t i=0; i<big_k; ++i)
   {
+    // FixMe: uint32_t nsortlist if release==119?
     uint16_t nsortlist = 0;
     nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
@@ -276,7 +278,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   std::string nformats(nformatslen, '\0');
 
   CharacterVector formats(k);
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
     readstring(nformats, file, nformats.size());
     formats[i] = nformats;
@@ -295,7 +297,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   std::string nvalLabels(nvalLabelslen, '\0');
 
   CharacterVector valLabels(k);
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvalLabels, file, nvalLabels.size());
     valLabels[i] = nvalLabels;
@@ -312,7 +314,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   std::string nvarLabels (nvarLabelslen, '\0');
 
   CharacterVector varLabels(k);
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvarLabels, file, nvarLabels.size());
     varLabels[i] = nvarLabels;
@@ -408,7 +410,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   // 1. create the list
   List df(k);
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
     int const type = vartype[i];
     switch(type)
@@ -437,10 +439,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
 
-  for(uint32_t j=0; j<nn; ++j)
+  for(uint64_t j=0; j<nn; ++j)
   {
-
-    for (uint16_t i=0; i<k; ++i)
+    for (uint32_t i=0; i<k; ++i)
     {
       int const type = vartype[i];
       switch(type < 2046 ? 2045 : type)

From b33674bbdf4b9c07246fc427f6785be6b6e59e5c Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 14:29:53 +0200
Subject: [PATCH 09/76] f119: close FixMes. we should be able to read and write
 a f119 file w/o strls

---
 src/read_dta.cpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 12948200..9058d084 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -221,9 +221,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   IntegerVector vartype(k);
   for (uint32_t i=0; i<k; ++i)
   {
-    // FixMe: uint32_t nvartype if release == 119?
-    uint16_t nvartype = 0;
-    nvartype = readbin(nvartype, file, swapit);
+    uint32_t nvartype = 0;
+
+    if (release < 119)
+      nvartype = readbin((uint16_t)nvartype, file, swapit);
+    if (release==119)
+      nvartype = readbin(nvartype, file, swapit);
     vartype[i] = nvartype;
   }
 
@@ -260,9 +263,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   IntegerVector sortlist(big_k);
   for (uint64_t i=0; i<big_k; ++i)
   {
-    // FixMe: uint32_t nsortlist if release==119?
-    uint16_t nsortlist = 0;
-    nsortlist = readbin(nsortlist, file, swapit);
+    uint32_t nsortlist = 0;
+
+    if (release < 119)
+      nsortlist = readbin((uint16_t)nsortlist, file, swapit);
+    if (release==119)
+      nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
   }
 

From 9b80d1854e11de4ad4a5a7f6bde00589dbd4d86d Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 15:53:01 +0200
Subject: [PATCH 10/76] thinko

---
 src/read_dta.cpp | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 9058d084..866952ed 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -221,12 +221,8 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   IntegerVector vartype(k);
   for (uint32_t i=0; i<k; ++i)
   {
-    uint32_t nvartype = 0;
-
-    if (release < 119)
-      nvartype = readbin((uint16_t)nvartype, file, swapit);
-    if (release==119)
-      nvartype = readbin(nvartype, file, swapit);
+    uint16_t nvartype = 0;
+    nvartype = readbin(nvartype, file, swapit);
     vartype[i] = nvartype;
   }
 
@@ -263,12 +259,8 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   IntegerVector sortlist(big_k);
   for (uint64_t i=0; i<big_k; ++i)
   {
-    uint32_t nsortlist = 0;
-
-    if (release < 119)
-      nsortlist = readbin((uint16_t)nsortlist, file, swapit);
-    if (release==119)
-      nsortlist = readbin(nsortlist, file, swapit);
+    uint16_t nsortlist = 0;
+    nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
   }
 

From d74bb013ce45d13a8c0f8e8b319190ebaabc6f75 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 15:53:23 +0200
Subject: [PATCH 11/76] big_k must be bigger than k

---
 src/rcpp_savestata.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index 87dcbb3f..7590509d 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -241,9 +241,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     map(4) = dta.tellg();
     writestr(startsor, startsor.size(), dta);
 
-    uint32_t big_k = k+1;
+    uint64_t big_k = k+1;
 
-    for (uint32_t i = 0; i < big_k; ++i)
+    for (uint64_t i = 0; i < big_k; ++i)
     {
       uint16_t nsortlist = 0;
       writebin(nsortlist, dta, swapit);

From e7ffcab4c472181fa939e17f5d99d3f246e2b943 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 17:39:55 +0200
Subject: [PATCH 12/76] f119: experimental support for writing strls

---
 src/rcpp_savestata.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index 7590509d..4c72532d 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -491,6 +491,29 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
             dta.write((char*)&z, sizeof(z));
             // writestr((char*)&z, sizeof(z), dta);
 
+            break;
+          }
+            case 119:
+          {
+            int32_t v = i+1;
+            int64_t o = j+1;
+            char    z[8];
+
+            // push back every v, o and val_strl
+            V.push_back(v);
+            O.push_back(o);
+
+            // z is 'vv-- ----'
+            memcpy(&z[0], &v, sizeof(v));
+            if (SBYTEORDER == 1) {
+              o <<= 24;
+            }
+            memcpy(&z[3], &o, 5);
+            // z is 'vvvo oooo'
+
+            dta.write((char*)&z, sizeof(z));
+            // writestr((char*)&z, sizeof(z), dta);
+
             break;
           }
             }

From 21cc28880d58666c9e6131657ecbbf1197e25679 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 17:46:22 +0200
Subject: [PATCH 13/76] f119: experimental support for reading strls

---
 src/read_dta.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 866952ed..1c952755 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -570,6 +570,33 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
         break;
       }
+      case 119:
+      {
+        int32_t v = 0;
+        int64_t o = 0, z = 0;
+
+        z = readbin(z, file, swapit);
+
+        // works for LSF on little- and big-endian
+        if(byteorder.compare("LSF")==0) {
+          v = (int32_t)z;
+          o = (z >> 24);
+        }
+
+        // works if we read a big-endian file on little-endian
+        if(byteorder.compare("MSF")==0) {
+          v = (z >> 48) & ((1 << 24) - 1);
+          o = z & ((1 << 24) - 1);
+        }
+
+        stringstream val_stream;
+        val_stream << v << '_' << o;
+        string val_strl = val_stream.str();
+
+        as<CharacterVector>(df[i])[j] = val_strl;
+
+        break;
+      }
       }
       }
       }

From 81289fd8ed8c4ef9da51787a905e24be34ae4fd3 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 18:12:25 +0200
Subject: [PATCH 14/76] f119: read strl part

---
 src/read_dta.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 1c952755..216b8958 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -656,6 +656,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
       break;
     }
     case 118:
+    case 119:
     {
       uint32_t v = 0;
       uint64_t o = 0;

From 947a0dfc79d96259e1d0e02f780eedb10f6577b5 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 18:29:36 +0200
Subject: [PATCH 15/76] f119: experimental support reading strl

---
 src/read_dta.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 216b8958..1976ac00 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -579,11 +579,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
         // works for LSF on little- and big-endian
         if(byteorder.compare("LSF")==0) {
-          v = (int32_t)z;
+          v = (int32_t)z & ((1 << 24) - 1);
           o = (z >> 24);
         }
 
-        // works if we read a big-endian file on little-endian
+        // FixMe: works if we read a big-endian file on little-endian
         if(byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 24) - 1);
           o = z & ((1 << 24) - 1);

From 03846b833bfb085f8de01ecc9529de983d68c624 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 18:39:26 +0200
Subject: [PATCH 16/76] f119: update tests

---
 tests/testthat/test_save.R | 48 +++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R
index fcfd34e6..990a8649 100644
--- a/tests/testthat/test_save.R
+++ b/tests/testthat/test_save.R
@@ -40,6 +40,7 @@ dir.create("data")
 
 dd <- mtcars
 
+save.dta13(dd, "data/dta_119.dta", version = 119)
 save.dta13(dd, "data/dta_118.dta", version = 118)
 save.dta13(dd, "data/dta_117.dta", version = 117)
 save.dta13(dd, "data/dta_115.dta", version = 115)
@@ -56,6 +57,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104)
 save.dta13(dd, "data/dta_103.dta", version = 103)
 save.dta13(dd, "data/dta_102.dta", version = 102)
 
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -76,6 +78,7 @@ dd102 <- read.dta13("data/dta_102.dta")
 unlink("data", recursive = TRUE)
 
 test_that("version", {
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -103,6 +106,7 @@ dir.create("data")
 
 dd <- mtcars
 
+save.dta13(dd, "data/dta_119.dta", version = 119, compress = TRUE)
 save.dta13(dd, "data/dta_118.dta", version = 118, compress = TRUE)
 save.dta13(dd, "data/dta_117.dta", version = 117, compress = TRUE)
 save.dta13(dd, "data/dta_115.dta", version = 115, compress = TRUE)
@@ -119,6 +123,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, compress = TRUE)
 save.dta13(dd, "data/dta_103.dta", version = 103, compress = TRUE)
 save.dta13(dd, "data/dta_102.dta", version = 102, compress = TRUE)
 
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -139,6 +144,7 @@ dd102 <- read.dta13("data/dta_102.dta")
 unlink("data", recursive = TRUE)
 
 test_that("compress", {
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -167,7 +173,7 @@ dir.create("data")
 dd <- mtcars
 dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man"))
 
-
+save.dta13(dd, "data/dta_119.dta", version = 119, convert.factors = TRUE)
 save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = TRUE)
 save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = TRUE)
 save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = TRUE)
@@ -184,7 +190,7 @@ save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = TRUE)
 # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = TRUE)
 # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = TRUE)
 
-
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -205,6 +211,7 @@ dd107 <- read.dta13("data/dta_107.dta")
 unlink("data", recursive = TRUE)
 
 test_that("convert.factors TRUE", {
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -233,7 +240,7 @@ dir.create("data")
 dd <- mtcars
 dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man"))
 
-
+save.dta13(dd, "data/dta_119.dta", version = 119, convert.factors = FALSE)
 save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = FALSE)
 save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = FALSE)
 save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = FALSE)
@@ -250,7 +257,7 @@ save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = FALSE)
 # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = FALSE)
 # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = FALSE)
 
-
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -275,6 +282,7 @@ dd$am <- dd$am + 1
 unlink("data", recursive = TRUE)
 
 test_that("convert.factors TRUE", {
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -301,6 +309,7 @@ dir.create("data")
 
 dd <- mtcars
 
+save.dta13(dd, "data/dta_119.dta", version = 119, add.rownames = TRUE)
 save.dta13(dd, "data/dta_118.dta", version = 118, add.rownames = TRUE)
 save.dta13(dd, "data/dta_117.dta", version = 117, add.rownames = TRUE)
 save.dta13(dd, "data/dta_115.dta", version = 115, add.rownames = TRUE)
@@ -317,7 +326,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, add.rownames = TRUE)
 save.dta13(dd, "data/dta_103.dta", version = 103, add.rownames = TRUE)
 save.dta13(dd, "data/dta_102.dta", version = 102, add.rownames = TRUE)
 
-
+dd119 <- read.dta13("data/dta_119.dta", add.rownames = TRUE)
 dd118 <- read.dta13("data/dta_118.dta", add.rownames = TRUE)
 dd117 <- read.dta13("data/dta_117.dta", add.rownames = TRUE)
 dd115 <- read.dta13("data/dta_115.dta", add.rownames = TRUE)
@@ -339,6 +348,7 @@ unlink("data", recursive = TRUE)
 
 test_that("add.rownames TRUE", {
   # Check that rownames are identical
+  expect_true(identical(rownames(dd), rownames(dd119)))
   expect_true(identical(rownames(dd), rownames(dd118)))
   expect_true(identical(rownames(dd), rownames(dd117)))
   expect_true(identical(rownames(dd), rownames(dd115)))
@@ -356,6 +366,7 @@ test_that("add.rownames TRUE", {
   expect_true(identical(rownames(dd), rownames(dd102)))
 
   # Check that data is identical
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -385,6 +396,7 @@ dir.create("data")
 
 dd <- mtcars
 
+save.dta13(dd, "data/dta_119.dta", version = 119, data.label = dl)
 save.dta13(dd, "data/dta_118.dta", version = 118, data.label = dl)
 save.dta13(dd, "data/dta_117.dta", version = 117, data.label = dl)
 save.dta13(dd, "data/dta_115.dta", version = 115, data.label = dl)
@@ -401,7 +413,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, data.label = dl)
 save.dta13(dd, "data/dta_103.dta", version = 103, data.label = dl)
 # save.dta13(dd, "data/dta_102.dta", version = 102, data.label = dl) # no data label
 
-
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -422,6 +434,7 @@ unlink("data", recursive = TRUE)
 
 test_that("data label", {
   # Check that rownames are identical
+  expect_equal(dl, attr(dd119, "datalabel"))
   expect_equal(dl, attr(dd118, "datalabel"))
   expect_equal(dl, attr(dd117, "datalabel"))
   expect_equal(dl, attr(dd115, "datalabel"))
@@ -450,6 +463,7 @@ dir.create("data")
 
 dd <- data.frame( dat = Sys.Date() )
 
+save.dta13(dd, "data/dta_119.dta", version = 119, convert.dates = TRUE)
 save.dta13(dd, "data/dta_118.dta", version = 118, convert.dates = TRUE)
 save.dta13(dd, "data/dta_117.dta", version = 117, convert.dates = TRUE)
 save.dta13(dd, "data/dta_115.dta", version = 115, convert.dates = TRUE)
@@ -466,7 +480,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, convert.dates = TRUE)
 save.dta13(dd, "data/dta_103.dta", version = 103, convert.dates = TRUE)
 save.dta13(dd, "data/dta_102.dta", version = 102, convert.dates = TRUE)
 
-
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -487,6 +501,7 @@ unlink("data", recursive = TRUE)
 
 test_that("convert.dates TRUE", {
   # Check that rownames are identical
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -517,6 +532,7 @@ dd <- data.frame( dat = c(paste(replicate(2046, "a"), collapse = ""),
                           paste(replicate(2046, "b"), collapse = "")),
                   stringsAsFactors = FALSE)
 
+save.dta13(dd, "data/dta_119.dta", version = 119)
 save.dta13(dd, "data/dta_118.dta", version = 118)
 save.dta13(dd, "data/dta_117.dta", version = 117)
 # save.dta13(dd, "data/dta_115.dta", version = 115) # no strl
@@ -533,7 +549,7 @@ save.dta13(dd, "data/dta_117.dta", version = 117)
 # save.dta13(dd, "data/dta_103.dta", version = 103)
 # save.dta13(dd, "data/dta_102.dta", version = 102)
 
-
+dd119 <- read.dta13("data/dta_119.dta", replace.strl = TRUE)
 dd118 <- read.dta13("data/dta_118.dta", replace.strl = TRUE)
 dd117 <- read.dta13("data/dta_117.dta", replace.strl = TRUE)
 # dd115 <- read.dta13("data/dta_115.dta")
@@ -554,6 +570,7 @@ unlink("data", recursive = TRUE)
 
 test_that("replace.strl TRUE", {
   # Check that rownames are identical
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   # expect_true(datacompare(dd, dd115))
@@ -578,9 +595,9 @@ if (readstata13:::dir.exists13("data"))
   unlink("data", recursive = TRUE)
 dir.create("data")
 
-
 dd <- data.frame(x.1 = 1)
 
+save.dta13(dd, "data/dta_119.dta", version = 119, convert.underscore = TRUE)
 save.dta13(dd, "data/dta_118.dta", version = 118, convert.underscore = TRUE)
 save.dta13(dd, "data/dta_117.dta", version = 117, convert.underscore = TRUE)
 save.dta13(dd, "data/dta_115.dta", version = 115, convert.underscore = TRUE)
@@ -597,7 +614,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104, convert.underscore = TRUE)
 save.dta13(dd, "data/dta_103.dta", version = 103, convert.underscore = TRUE)
 save.dta13(dd, "data/dta_102.dta", version = 102, convert.underscore = TRUE)
 
-
+dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
 dd115 <- read.dta13("data/dta_115.dta")
@@ -620,6 +637,7 @@ names(dd) <- "x_1"
 
 test_that("convert.underscore TRUE", {
   # check numerics
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -636,6 +654,7 @@ test_that("convert.underscore TRUE", {
   expect_true(datacompare(dd, dd103))
   expect_true(datacompare(dd, dd102))
   # check names
+  expect_true(namescompare(dd, dd119))
   expect_true(namescompare(dd, dd118))
   expect_true(namescompare(dd, dd117))
   expect_true(namescompare(dd, dd115))
@@ -660,9 +679,9 @@ if (readstata13:::dir.exists13("data"))
   unlink("data", recursive = TRUE)
 dir.create("data")
 
-
 dd <- mtcars
 
+save.dta13(dd, "data/dta_119.dta", version = 119)
 save.dta13(dd, "data/dta_118.dta", version = 118)
 save.dta13(dd, "data/dta_117.dta", version = 117)
 save.dta13(dd, "data/dta_115.dta", version = 115)
@@ -679,7 +698,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104)
 save.dta13(dd, "data/dta_103.dta", version = 103)
 save.dta13(dd, "data/dta_102.dta", version = 102)
 
-
+dd119 <- read.dta13("data/dta_119.dta", select.rows = 5)
 dd118 <- read.dta13("data/dta_118.dta", select.rows = 5)
 dd117 <- read.dta13("data/dta_117.dta", select.rows = 5)
 dd115 <- read.dta13("data/dta_115.dta", select.rows = 5)
@@ -702,6 +721,7 @@ dd <- dd[1:5,]
 
 test_that("select.rows = 5", {
   # check numerics
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))
@@ -725,6 +745,7 @@ dir.create("data")
 
 dd <- mtcars
 
+save.dta13(dd, "data/dta_119.dta", version = 119)
 save.dta13(dd, "data/dta_118.dta", version = 118)
 save.dta13(dd, "data/dta_117.dta", version = 117)
 save.dta13(dd, "data/dta_115.dta", version = 115)
@@ -741,7 +762,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104)
 save.dta13(dd, "data/dta_103.dta", version = 103)
 save.dta13(dd, "data/dta_102.dta", version = 102)
 
-
+dd119 <- read.dta13("data/dta_119.dta", select.rows = c(5,10))
 dd118 <- read.dta13("data/dta_118.dta", select.rows = c(5,10))
 dd117 <- read.dta13("data/dta_117.dta", select.rows = c(5,10))
 dd115 <- read.dta13("data/dta_115.dta", select.rows = c(5,10))
@@ -764,6 +785,7 @@ dd <- dd[5:10,]
 
 test_that("select.rows = c(5,10)", {
   # check numerics
+  expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))
   expect_true(datacompare(dd, dd115))

From 2da02712bd1611b1168a94dc561696cc8517a1e8 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 18:45:08 +0200
Subject: [PATCH 17/76] f119: finish implementation of experimental support for
 LSF

---
 NEWS                   |  1 +
 R/read.R               | 12 ++++++------
 R/save.R               |  2 +-
 README.md              |  5 +++--
 src/rcpp_savestata.cpp |  2 +-
 5 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index 8e4a8e31..e234a362 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,5 @@
 [testing]
+- experimental support for format 119
 - improve partial reading
 
 [0.9.0]
diff --git a/R/read.R b/R/read.R
index 8df7e895..113943bb 100644
--- a/R/read.R
+++ b/R/read.R
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
 # Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley
 #
 # This program is free software; you can redistribute it and/or modify it
@@ -23,7 +23,7 @@
 #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata
 #'  value labels are created.
 #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is
-#'  TRUE, missing factor labels are created from integers. If duplicated labels are found, 
+#'  TRUE, missing factor labels are created from integers. If duplicated labels are found,
 #'  unique labels will be generated according the following scheme: "label_(integer code)".
 #' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8
 #'  to system encoding. Options are "latin1" or "UTF-8" to specify target
@@ -357,7 +357,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         varunique <- na.omit(unique(data[, i]))
         # assign label if label set is complete
         if (all(varunique %in% labtable)) {
-          
+
           #check for duplicated labels
           labcount <- table(names(labtable))
           if(any(labcount > 1)) {
@@ -366,17 +366,17 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
             # generate unique labels from assigned label and code number
             names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
           }
-          
+
           data[, i] <- factor(data[, i], levels=labtable,
                               labels=names(labtable))
           # else generate labels from codes
         } else if (generate.factors) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
-          
+
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
-          
+
         } else {
           warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
         }
diff --git a/R/save.R b/R/save.R
index 048b6d16..b23af667 100644
--- a/R/save.R
+++ b/R/save.R
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
diff --git a/README.md b/README.md
index 1db2b9ae..0e21aee4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # readstata13
 
-Package to read and write all Stata file formats (version 14 and older) into a
-R data.frame. The dta file format versions 102 to 118 are supported.
+Package to read and write all Stata file formats (version 15 and older) into a
+R data.frame. The dta file format versions 102 to 119 are supported.
 
 The function ```read.dta``` from the foreign package imports only dta files from
 Stata versions <= 12. Due to the different structure and features of dta 117
@@ -56,6 +56,7 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13)
 
 ### Working features
+* [testing] Experimental support for format 119
 * [testing] Improvements to partial reading. Idea by Kevin Jin
 * [0.9.0] Generate unique factor labels to prevent errors in factor definition
 * [0.9.0] check interrupt for long read. Patch by Giovanni Righi
diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index 4c72532d..4a90e46e 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the

From 2d5ca181d9f6287e524cccdb962bbcd7537acf52 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 12 Jun 2017 19:08:46 +0200
Subject: [PATCH 18/76] f119: update documentation

---
 R/save.R          | 2 +-
 man/read.dta13.Rd | 2 +-
 man/save.dta13.Rd | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/save.R b/R/save.R
index b23af667..95f12d8f 100644
--- a/R/save.R
+++ b/R/save.R
@@ -39,7 +39,7 @@
 #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will
 #'  use all of Statas numeric-vartypes.
 #' @param version \emph{numeric.} Stata format for the resulting dta-file either
-#'  the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.
+#'  the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 15.
 #' @return The function writes a dta-file to disk. The following features of the
 #'  dta file format are supported:
 #' \describe{
diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
index a4b2ab6b..56018ea0 100644
--- a/man/read.dta13.Rd
+++ b/man/read.dta13.Rd
@@ -16,7 +16,7 @@ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE,
 value labels are created.}
 
 \item{generate.factors}{\emph{logical.} If \code{TRUE} and convert.factors is
-TRUE, missing factor labels are created from integers. If duplicated labels are found, 
+TRUE, missing factor labels are created from integers. If duplicated labels are found,
 unique labels will be generated according the following scheme: "label_(integer code)".}
 
 \item{encoding}{\emph{character.} Strings can be converted from Windows-1252 or UTF-8
diff --git a/man/save.dta13.Rd b/man/save.dta13.Rd
index e4c9808b..32831383 100644
--- a/man/save.dta13.Rd
+++ b/man/save.dta13.Rd
@@ -37,7 +37,7 @@ will be added to the dta-file.}
 use all of Statas numeric-vartypes.}
 
 \item{version}{\emph{numeric.} Stata format for the resulting dta-file either
-the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.}
+the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 15.}
 
 \item{convert.underscore}{\emph{logical.} If \code{TRUE}, all non numerics or
 non alphabet characters will be converted to underscores.}

From 942376d1fb7f6f74ba93d59b470a9d7560b1af55 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Mon, 12 Jun 2017 23:13:00 +0200
Subject: [PATCH 19/76] f119: codestyle pedantic

---
 src/rcpp_savestata.cpp | 14 +++++++-------
 src/read_dta.cpp       | 12 ++++++------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index 4a90e46e..37d8aea7 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -147,9 +147,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
     if (release == 119)
       writebin(k, dta, swapit);
     writestr(num, num.size(), dta);
-    if (release==117)
+    if (release == 117)
       writebin((int32_t)n, dta, swapit);
-    if (release==118 | release==119)
+    if ((release == 118) | (release == 119))
       writebin(n, dta, swapit);
     writestr(lab, lab.size(), dta);
 
@@ -166,9 +166,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
       }
       ndlabel = datalabel.size();
 
-      if (release==117)
+      if (release == 117)
         writebin((uint8_t)ndlabel, dta, swapit);
-      if (release==118 | release==119)
+      if ((release == 118) | (release == 119))
         writebin(ndlabel, dta, swapit);
 
       writestr(datalabel,datalabel.size(), dta);
@@ -179,7 +179,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
       if (release == 117) {
         writebin(zero, dta, swapit);
       }
-      if (release == 118 | release == 119) {
+      if ((release == 118) | (release == 119)) {
         writebin(zero, dta, swapit);
         writebin(zero, dta, swapit);
       }
@@ -545,9 +545,9 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
 
       writestr(gso, gso.size(), dta);
       writebin(v, dta, swapit);
-      if (release==117)
+      if (release == 117)
         writebin((uint32_t)o, dta, swapit);
-      if (release==118 | release==119)
+      if ((release == 118) | (release == 119))
         writebin(o, dta, swapit);
       writebin(t, dta, swapit);
       writebin(len, dta, swapit);
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 1976ac00..27aaa97b 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -98,9 +98,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   */
 
   uint32_t k = 0;
-  if(release < 119)
+  if (release < 119)
     k = readbin((uint16_t)k, file, swapit);
-  if(release==119)
+  if (release == 119)
     k = readbin(k, file, swapit);
 
   //</K>
@@ -113,9 +113,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   uint64_t n = 0;
 
-  if(release==117)
+  if(release == 117)
     n = readbin((uint32_t)n, file, swapit);
-  if (release ==118 | release==119)
+  if ((release == 118) | (release == 119))
     n = readbin(n, file, swapit);
 
   //</N>
@@ -131,9 +131,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   uint16_t ndlabel = 0;
 
-  if (release==118 | release==119)
+  if ((release == 118) | (release == 119))
     ndlabel = readbin(ndlabel, file, swapit);
-  if (release==117)
+  if (release == 117)
     ndlabel = readbin((int8_t)ndlabel, file, swapit);
 
   std::string datalabel(ndlabel, '\0');

From a436ce6c101e005c42ea4fda60cb388d9e079a47 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Tue, 13 Jun 2017 00:01:22 +0200
Subject: [PATCH 20/76] cleaning: space after if, drop commented-out code

---
 src/rcpp_savestata.cpp |  4 ++--
 src/read_dta.cpp       | 16 +++++++---------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/rcpp_savestata.cpp b/src/rcpp_savestata.cpp
index 37d8aea7..35a60ef6 100644
--- a/src/rcpp_savestata.cpp
+++ b/src/rcpp_savestata.cpp
@@ -155,7 +155,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
 
 
     /* write a datalabel */
-    if(!datalabel.empty())
+    if (!datalabel.empty())
     {
       if (datalabel.size() > maxdatalabelsize)
       {
@@ -440,7 +440,7 @@ int stata_save(const char * filePath, Rcpp::DataFrame dat)
 
           string val_s = as<string>(as<CharacterVector>(dat[i])[j]);
 
-          if(val_s == "NA")
+          if (val_s == "NA")
             val_s.clear();
 
           writestr(val_s, len, dta);
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 27aaa97b..52c0cf08 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -113,7 +113,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   uint64_t n = 0;
 
-  if(release == 117)
+  if (release == 117)
     n = readbin((uint32_t)n, file, swapit);
   if ((release == 118) | (release == 119))
     n = readbin(n, file, swapit);
@@ -551,13 +551,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         z = readbin(z, file, swapit);
 
         // works for LSF on little- and big-endian
-        if(byteorder.compare("LSF")==0) {
+        if (byteorder.compare("LSF")==0) {
           v = (int16_t)z;
           o = (z >> 16);
         }
 
         // works if we read a big-endian file on little-endian
-        if(byteorder.compare("MSF")==0) {
+        if (byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 16) - 1);
           o = z & ((1 << 16) - 1);
         }
@@ -578,13 +578,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         z = readbin(z, file, swapit);
 
         // works for LSF on little- and big-endian
-        if(byteorder.compare("LSF")==0) {
+        if (byteorder.compare("LSF")==0) {
           v = (int32_t)z & ((1 << 24) - 1);
           o = (z >> 24);
         }
 
         // FixMe: works if we read a big-endian file on little-endian
-        if(byteorder.compare("MSF")==0) {
+        if (byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 24) - 1);
           o = z & ((1 << 24) - 1);
         }
@@ -652,7 +652,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-      //sprintf(ref, "%010d%010d", v, o);
+      
       break;
     }
     case 118:
@@ -660,15 +660,13 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
     {
       uint32_t v = 0;
       uint64_t o = 0;
-      // uint64_t z = 0;
+      
       v = readbin(v, file, swapit);
       o = readbin(o, file, swapit);
-      // z = readbin(z, file, swapit);
 
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-      //sprintf(ref, "%010d%010ld", v, o);
 
       break;
     }

From 781d0a24c1916c01f1e10973e527cb24016bc555 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Tue, 13 Jun 2017 01:07:34 +0200
Subject: [PATCH 21/76] export the dimensions of the original dta-file.

---
 R/read.R               | 2 ++
 man/read.dta13.Rd      | 2 ++
 src/read_dta.cpp       | 6 ++++++
 src/read_pre13_dta.cpp | 7 +++++++
 4 files changed, 17 insertions(+)

diff --git a/R/read.R b/R/read.R
index 113943bb..7f875964 100644
--- a/R/read.R
+++ b/R/read.R
@@ -101,6 +101,8 @@
 #'    and the contents of Stata characteristic field.}
 #'   \item{missing:}{List of numeric vectors with Stata missing type for each
 #'    variable.}
+#'   \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.}
+#'   \item{orig.dim:}{Dimension recorded inside the dta-file.}
 #' }
 #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members
 #'  from foreign::read.dta().
diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
index 56018ea0..6fc3dfd6 100644
--- a/man/read.dta13.Rd
+++ b/man/read.dta13.Rd
@@ -74,6 +74,8 @@ The function returns a data.frame with attributes. The attributes
    and the contents of Stata characteristic field.}
   \item{missing:}{List of numeric vectors with Stata missing type for each
    variable.}
+  \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.}
+  \item{orig.dim:}{Dimension recorded inside the dta-file.}
 }
 }
 \description{
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 52c0cf08..65f126bd 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -121,6 +121,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   //</N>
   test("</N>", file);
   test("<label>", file);
+  
+  // dim to return original dim for partial read files
+  IntegerVector dim(2);
+  dim(0) = n;
+  dim(1) = k;
 
   /*
   * A dataset may have a label e.g. "Written by R".
@@ -823,6 +828,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   df.attr("expansion.fields") = ch;
   df.attr("strl") = strlvalues;
   df.attr("byteorder") = wrap(byteorder);
+  df.attr("orig.dim") = dim;
 
   return df;
 }
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 6077bd43..fb059ffc 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -119,6 +119,11 @@ List read_pre13_dta(FILE * file, const bool missing,
 
   uint32_t n = 0;
   n = readbin(n, file, swapit);
+  
+  // dim to return original dim for partial read files
+  IntegerVector dim(2);
+  dim(0) = n;
+  dim(1) = k;
 
   /*
   * A dataset may have a label e.g. "Written by R".
@@ -639,5 +644,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   df.attr("label.table") = labelList;
   df.attr("expansion.fields") = ch;
   df.attr("byteorder") = byteorderI;
+  df.attr("orig.dim") = dim;
+  
   return df;
 }

From f40ea80dbe7104d3e84fa4d93e4082abbb39128c Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Tue, 13 Jun 2017 04:32:22 +0200
Subject: [PATCH 22/76] f119: use uint32_t for variable count in calc_rowlength

---
 inst/include/readstata.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 22d70ed6..6372f2ef 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -127,11 +127,11 @@ static void writestr(std::string val_s, T len, std::fstream& dta)
 
 inline uint64_t calc_rowlength(Rcpp::IntegerVector vartype) {
 
-  uint16_t k = vartype.size();
+  uint32_t k = vartype.size();
 
   Rcpp::NumericVector rlen(k);
   // calculate row length in byte
-  for (uint16_t i=0; i<k; ++i)
+  for (uint32_t i=0; i<k; ++i)
   {
     int const type = vartype[i];
     switch(type)

From b59b66194a6a0ef0c192b651abda23db003c973c Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@rub.de>
Date: Tue, 13 Jun 2017 19:09:44 +0200
Subject: [PATCH 23/76] WIP: Selectcols (#55)

Implement reading of selected variables
---
 NEWS                          |   1 +
 R/RcppExports.R               |   4 +-
 R/read.R                      |   9 ++-
 README.md                     |   1 +
 inst/include/read_dta.h       |   4 +-
 inst/include/read_pre13_dta.h |   4 +-
 inst/include/readstata.h      |  42 +++++++++++--
 man/read.dta13.Rd             |   5 +-
 src/RcppExports.cpp           |   7 ++-
 src/rcpp_readstata.cpp        |   7 ++-
 src/read_dta.cpp              | 109 +++++++++++++++++++++++++---------
 src/read_pre13_dta.cpp        | 109 +++++++++++++++++++++++++---------
 src/register.c                |   2 +-
 tests/testthat/test_read.R    |   1 -
 tests/testthat/test_save.R    |  68 +++++++++++++++++++++
 15 files changed, 297 insertions(+), 76 deletions(-)

diff --git a/NEWS b/NEWS
index e234a362..7e28c8c7 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,5 @@
 [testing]
+- allow reading only pre-selected variables
 - experimental support for format 119
 - improve partial reading
 
diff --git a/R/RcppExports.R b/R/RcppExports.R
index d6bef198..87a2fd10 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -5,8 +5,8 @@ stata_pre13_save <- function(filePath, dat) {
     .Call(readstata13_stata_pre13_save, filePath, dat)
 }
 
-stata_read <- function(filePath, missing, selectrows) {
-    .Call(readstata13_stata_read, filePath, missing, selectrows)
+stata_read <- function(filePath, missing, selectrows, selectcols) {
+    .Call(readstata13_stata_read, filePath, missing, selectrows, selectcols)
 }
 
 stata_save <- function(filePath, dat) {
diff --git a/R/read.R b/R/read.R
index 7f875964..6a153b33 100644
--- a/R/read.R
+++ b/R/read.R
@@ -51,6 +51,7 @@
 #' @param select.rows \emph{integer.} Vector of one or two numbers. If single
 #'  value rows from 1:val are selected. If two values of a range are selected
 #'  the rows in range will be selected.
+#' @param select.cols \emph{character:} Vector of variables to select.
 #'
 #' @details If the filename is a url, the file will be downloaded as a temporary
 #'  file and read afterwards.
@@ -121,7 +122,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                        convert.underscore = FALSE, missing.type = FALSE,
                        convert.dates = TRUE, replace.strl = TRUE,
                        add.rownames = FALSE, nonint.factors=FALSE,
-                       select.rows = NULL) {
+                       select.rows = NULL, select.cols = NULL) {
   # Check if path is a url
   if (length(grep("^(http|ftp|https)://", file))) {
     tmp <- tempfile()
@@ -167,7 +168,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     select.rows <- c(0,0)
   }
 
-  data <- stata_read(filepath, missing.type, select.rows)
+  if (is.null(select.cols)){
+    select.cols <- ""
+  }
+
+  data <- stata_read(filepath, missing.type, select.rows, select.cols)
 
   version <- attr(data, "version")
 
diff --git a/README.md b/README.md
index 0e21aee4..ce5684a6 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13)
 
 ### Working features
+* [testing] Allow reading only pre-selected variables
 * [testing] Experimental support for format 119
 * [testing] Improvements to partial reading. Idea by Kevin Jin
 * [0.9.0] Generate unique factor labels to prevent errors in factor definition
diff --git a/inst/include/read_dta.h b/inst/include/read_dta.h
index 18ee0856..7369903f 100644
--- a/inst/include/read_dta.h
+++ b/inst/include/read_dta.h
@@ -18,6 +18,8 @@
 #ifndef READDTA_H
 #define READDTA_H
 
-Rcpp::List read_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows);
+Rcpp::List read_dta(FILE * file, const bool missing,
+                    const Rcpp::IntegerVector selectrows,
+                    const Rcpp::CharacterVector selectcols);
 
 #endif
diff --git a/inst/include/read_pre13_dta.h b/inst/include/read_pre13_dta.h
index 624b725e..108dd492 100644
--- a/inst/include/read_pre13_dta.h
+++ b/inst/include/read_pre13_dta.h
@@ -18,6 +18,8 @@
 #ifndef READPRE13DTA_H
 #define READPRE13DTA_H
 
-Rcpp::List read_pre13_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows);
+Rcpp::List read_pre13_dta(FILE * file, const bool missing,
+                          const Rcpp::IntegerVector selectrows,
+                          const Rcpp::CharacterVector selectcols);
 
 #endif
diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 6372f2ef..b7d136fb 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -125,11 +125,11 @@ static void writestr(std::string val_s, T len, std::fstream& dta)
 
 }
 
-inline uint64_t calc_rowlength(Rcpp::IntegerVector vartype) {
+inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
 
   uint32_t k = vartype.size();
 
-  Rcpp::NumericVector rlen(k);
+  Rcpp::IntegerVector rlen(k);
   // calculate row length in byte
   for (uint32_t i=0; i<k; ++i)
   {
@@ -158,9 +158,43 @@ inline uint64_t calc_rowlength(Rcpp::IntegerVector vartype) {
     }
   }
 
-  uint64_t rlength = sum(rlen);
+  return(rlen);
+}
+
+inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
+                                  Rcpp::CharacterVector y)
+{
+  Rcpp::IntegerVector mm = Rcpp::match(x, y);
+
+  if (Rcpp::any(Rcpp::is_na(mm))) {
+    Rcpp::LogicalVector ll = !Rcpp::is_na(mm);
+
+    Rcpp::CharacterVector ms = x[ll==0];
+
+    Rcpp::Rcout << "Variable " <<  ms <<
+      " was not found in dta-file." << std::endl;
+
+    mm = mm[ll==1];
+  }
 
-  return(rlength);
+  return(mm);
 }
 
+
+inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
+                                     Rcpp::IntegerVector select)
+{
+  // integer position of not selected variables
+  std::vector<int> vec = Rcpp::as< std::vector<int> >(cvec);
+  for (uint32_t i=0; i<select.size(); ++i) {
+    vec.erase(std::remove(vec.begin(), vec.end(), select(i)), vec.end());
+  }
+
+  Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
+  nselect = nselect -1;
+
+  return(nselect);
+}
+
+
 #endif
diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
index 6fc3dfd6..6867fb8a 100644
--- a/man/read.dta13.Rd
+++ b/man/read.dta13.Rd
@@ -7,7 +7,8 @@
 read.dta13(file, convert.factors = TRUE, generate.factors = FALSE,
   encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE,
   missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE,
-  add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL)
+  add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL,
+  select.cols = NULL)
 }
 \arguments{
 \item{file}{\emph{character.} Path to the dta file you want to import.}
@@ -53,6 +54,8 @@ will be assigned to variables of type float and double.}
 \item{select.rows}{\emph{integer.} Vector of one or two numbers. If single
 value rows from 1:val are selected. If two values of a range are selected
 the rows in range will be selected.}
+
+\item{select.cols}{\emph{character:} Vector of variables to select.}
 }
 \value{
 The function returns a data.frame with attributes. The attributes
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 33e3c458..2461b76c 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -18,15 +18,16 @@ BEGIN_RCPP
 END_RCPP
 }
 // stata_read
-List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows);
-RcppExport SEXP readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP) {
+List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols);
+RcppExport SEXP readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP);
     Rcpp::traits::input_parameter< const bool >::type missing(missingSEXP);
     Rcpp::traits::input_parameter< const IntegerVector >::type selectrows(selectrowsSEXP);
-    rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows));
+    Rcpp::traits::input_parameter< const CharacterVector >::type selectcols(selectcolsSEXP);
+    rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows, selectcols));
     return rcpp_result_gen;
 END_RCPP
 }
diff --git a/src/rcpp_readstata.cpp b/src/rcpp_readstata.cpp
index d00a619e..9ec251ff 100644
--- a/src/rcpp_readstata.cpp
+++ b/src/rcpp_readstata.cpp
@@ -27,7 +27,8 @@ using namespace Rcpp;
 // @export
 // [[Rcpp::export]]
 List stata_read(const char * filePath, const bool missing,
-                const IntegerVector selectrows)
+                const IntegerVector selectrows,
+                const CharacterVector selectcols)
 {
   FILE *file = NULL;    // File pointer
 
@@ -53,9 +54,9 @@ List stata_read(const char * filePath, const bool missing,
   List df(0);
 
   if (fbit.compare(expfbit) == 0)
-    df = read_dta(file, missing, selectrows);
+    df = read_dta(file, missing, selectrows, selectcols);
   else
-    df = read_pre13_dta(file, missing, selectrows);
+    df = read_pre13_dta(file, missing, selectrows, selectcols);
 
   fclose(file);
 
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 65f126bd..023ca3bf 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -20,7 +20,8 @@
 using namespace Rcpp;
 using namespace std;
 
-List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
+List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
+              const CharacterVector selectcols) {
   // stata_dta><header>
   test("stata_dta><header>", file);
   test("<release>", file);
@@ -121,7 +122,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   //</N>
   test("</N>", file);
   test("<label>", file);
-  
+
   // dim to return original dim for partial read files
   IntegerVector dim(2);
   dim(0) = n;
@@ -404,6 +405,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   if (n < nmin)
     nmin = n;
 
+  Rcpp::IntegerVector cvec = seq(1, k);
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
 
@@ -411,11 +413,45 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   nmin = nmin -1;
   nmax = nmax -1;
 
+  // calculate length of variables and of row
+  IntegerVector rlen = calc_rowlength(vartype);
+  uint64_t rlength = sum(rlen);
+
+  // check if vars are selected
+  std::string selcols = as<std::string>(selectcols(0));
+  bool noselectvars = selcols == "";
+
+  // select vars: either select every var or only matched cases
+  IntegerVector select;
+  if (noselectvars) {
+    select = cvec;
+  } else {
+    select = choose(selectcols, varnames);
+  }
+
+  // match returns r index
+  IntegerVector select_c = select -1;
+
+  uint32_t kk = select.size();
+
+  // shrink variables
+  CharacterVector varnames_kk = varnames[select_c];
+  IntegerVector vartype_kk = vartype[select_c];
+  IntegerVector vartype3 = vartype;
+
+  IntegerVector nselect = which_pos(cvec, select);
+
+  IntegerVector rlen2 = rlen[nselect];
+  rlen2 = -rlen2;
+
+  vartype3[nselect] = rlen2;
+
   // 1. create the list
-  List df(k);
-  for (uint32_t i=0; i<k; ++i)
+  List df(kk);
+  for (uint32_t i=0; i<kk; ++i)
   {
-    int const type = vartype[i];
+    int const type = vartype_kk[i];
+
     switch(type)
     {
     case STATA_DOUBLE:
@@ -435,19 +471,21 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
     }
   }
 
-  uint64_t rlength = calc_rowlength(vartype);
-
   // 2. fill it with data
 
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
 
+  uint32_t ii = 0;
   for(uint64_t j=0; j<nn; ++j)
   {
+    // reset partial index
+    ii = 0;
     for (uint32_t i=0; i<k; ++i)
     {
-      int const type = vartype[i];
-      switch(type < 2046 ? 2045 : type)
+      int const type = vartype3[i];
+
+      switch(((type >0) & (type < 2046)) ? 2045 : type)
       {
         // double
       case STATA_DOUBLE:
@@ -456,9 +494,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_d = readbin(val_d, file, swapit);
 
         if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
-          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
-          REAL(VECTOR_ELT(df,i))[j] = val_d;
+          REAL(VECTOR_ELT(df,ii))[j] = val_d;
 
         break;
       }
@@ -469,9 +507,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_f = readbin(val_f, file, swapit);
 
         if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
-          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
-          REAL(VECTOR_ELT(df,i))[j] = val_f;
+          REAL(VECTOR_ELT(df,ii))[j] = val_f;
 
         break;
       }
@@ -482,9 +520,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_l = readbin(val_l, file, swapit);
 
         if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,i))[j]  = NA_INTEGER;
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
-          INTEGER(VECTOR_ELT(df,i))[j] = val_l;
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
 
         break;
       }
@@ -495,9 +533,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_i = readbin(val_i, file, swapit);
 
         if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
-          INTEGER(VECTOR_ELT(df,i))[j] = val_i;
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
 
         break;
       }
@@ -508,9 +546,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_b = readbin(val_b, file, swapit);
 
         if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
-          INTEGER(VECTOR_ELT(df,i))[j] = val_b;
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
 
         break;
       }
@@ -522,7 +560,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         std::string val_s (len, '\0');
 
         readstring(val_s, file, val_s.size());
-        as<CharacterVector>(df[i])[j] = val_s;
+        as<CharacterVector>(df[ii])[j] = val_s;
         break;
       }
         // string of any length
@@ -544,7 +582,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
 
-        as<CharacterVector>(df[i])[j] = val_strl;
+        as<CharacterVector>(df[ii])[j] = val_strl;
 
         break;
       }
@@ -571,7 +609,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
 
-        as<CharacterVector>(df[i])[j] = val_strl;
+        as<CharacterVector>(df[ii])[j] = val_strl;
 
         break;
       }
@@ -598,13 +636,24 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
 
-        as<CharacterVector>(df[i])[j] = val_strl;
+        as<CharacterVector>(df[ii])[j] = val_strl;
 
         break;
       }
       }
+        break;
+      }
+        // case < 0:
+      default:
+      {
+        // skip to the next valid case
+        fseeko64(file, abs(type), SEEK_CUR);
+        break;
       }
       }
+
+      if (type >= 0) ii += 1;
+
       Rcpp::checkUserInterrupt();
     }
   }
@@ -614,7 +663,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
 
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
-  df.attr("names") = varnames;
+  df.attr("names") = varnames_kk;
   df.attr("class") = "data.frame";
 
   //</data>
@@ -657,7 +706,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-      
+
       break;
     }
     case 118:
@@ -665,7 +714,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
     {
       uint32_t v = 0;
       uint64_t o = 0;
-      
+
       v = readbin(v, file, swapit);
       o = readbin(o, file, swapit);
 
@@ -811,16 +860,18 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) {
   test("ue_labels>", file);
   test("</stata_dta>", file);
 
-
-
   /*
    * assign attributes to the resulting data.frame
    */
 
+  formats = formats[select_c];
+  valLabels = valLabels[select_c];
+  varLabels = varLabels[select_c];
+
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
   df.attr("formats") = formats;
-  df.attr("types") = vartype;
+  df.attr("types") = vartype_kk;
   df.attr("val.labels") = valLabels;
   df.attr("var.labels") = varLabels;
   df.attr("version") = versionIV;
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index fb059ffc..d8b72f98 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -21,7 +21,8 @@ using namespace Rcpp;
 using namespace std;
 
 List read_pre13_dta(FILE * file, const bool missing,
-                    const IntegerVector selectrows)
+                    const IntegerVector selectrows,
+                    const CharacterVector selectcols)
 {
   int8_t release = 0;
 
@@ -119,7 +120,7 @@ List read_pre13_dta(FILE * file, const bool missing,
 
   uint32_t n = 0;
   n = readbin(n, file, swapit);
-  
+
   // dim to return original dim for partial read files
   IntegerVector dim(2);
   dim(0) = n;
@@ -367,9 +368,9 @@ List read_pre13_dta(FILE * file, const bool missing,
   std::replace (vartype.begin(), vartype.end(), 255, STATA_DOUBLE);
 
 
-  uint32_t nmin = selectrows(0);
-  uint32_t nmax = selectrows(1);
-  uint32_t nn   = 0;
+  uint64_t nmin = selectrows(0);
+  uint64_t nmax = selectrows(1);
+  uint64_t nn   = 0;
 
   // if  selectrows is c(0,0) use full data
   if ((nmin == 0) && (nmax == 0)){
@@ -385,6 +386,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   if (n < nmin)
     nmin = n;
 
+  Rcpp::IntegerVector cvec = seq(1, k);
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
 
@@ -392,11 +394,47 @@ List read_pre13_dta(FILE * file, const bool missing,
   nmin = nmin -1;
   nmax = nmax -1;
 
+  // calculate length of variables and of row
+  IntegerVector rlen = calc_rowlength(vartype);
+  uint64_t rlength = sum(rlen);
+
+  // check if vars are selected
+  std::string selcols = as<std::string>(selectcols(0));
+  bool noselectvars = selcols == "";
+
+  // select vars: either select every var or only matched cases
+  IntegerVector select;
+  if (noselectvars) {
+    select = cvec;
+  } else {
+    select = choose(selectcols, varnames);
+  }
+
+  // match returns r index
+  IntegerVector select_c = select -1;
+
+  uint32_t kk = select.size();
+
+  // shrink variables
+  CharacterVector varnames_kk = varnames[select_c];
+  IntegerVector vartype_kk = vartype[select_c];
+  IntegerVector types_kk = types[select_c];
+  IntegerVector vartype3 = vartype;
+
+
+  IntegerVector nselect = which_pos(cvec, select);
+
+  IntegerVector rlen2 = rlen[nselect];
+  rlen2 = -rlen2;
+
+  vartype3[nselect] = rlen2;
+
   // 1. create the list
-  List df(k);
-  for (uint16_t i=0; i<k; ++i)
+  List df(kk);
+  for (uint32_t i=0; i<kk; ++i)
   {
-    int const type = vartype[i];
+    int const type = vartype_kk[i];
+
     switch(type)
     {
     case STATA_DOUBLE:
@@ -416,20 +454,22 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
   }
 
-  uint64_t rlength = calc_rowlength(vartype);
-
   // 2. fill it with data
 
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
 
+  uint32_t ii = 0;
   for(uint32_t j=0; j<nn; ++j)
   {
-
+    // reset partial index
+    ii = 0;
     for (uint16_t i=0; i<k; ++i)
     {
-      int32_t const type = vartype[i];
-      switch(type)
+      int const type = vartype3[i];
+
+
+      switch(((type >0) & (type < 244)) ? 244 : type)
       {
         // double
       case STATA_DOUBLE:
@@ -438,9 +478,9 @@ List read_pre13_dta(FILE * file, const bool missing,
         val_d = readbin(val_d, file, swapit);
 
         if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_d<STATA_DOUBLE_NA_MIN) | (val_d>STATA_DOUBLE_NA_MAX)) )
-          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
-          REAL(VECTOR_ELT(df,i))[j] = val_d;
+          REAL(VECTOR_ELT(df,ii))[j] = val_d;
 
         break;
       }
@@ -451,9 +491,9 @@ List read_pre13_dta(FILE * file, const bool missing,
         val_f = readbin(val_f, file, swapit);
 
         if ((missing == FALSE) & ((val_f<STATA_FLOAT_NA_MIN) | (val_f>STATA_FLOAT_NA_MAX)) )
-          REAL(VECTOR_ELT(df,i))[j] = NA_REAL;
+          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
-          REAL(VECTOR_ELT(df,i))[j] = val_f;
+          REAL(VECTOR_ELT(df,ii))[j] = val_f;
 
         break;
       }
@@ -465,9 +505,9 @@ List read_pre13_dta(FILE * file, const bool missing,
 
 
         if ((missing == FALSE) & ((val_l<STATA_INT_NA_MIN) | (val_l>STATA_INT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,i))[j]  = NA_INTEGER;
+          INTEGER(VECTOR_ELT(df,ii))[j]  = NA_INTEGER;
         else
-          INTEGER(VECTOR_ELT(df,i))[j] = val_l;
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
 
         break;
       }
@@ -478,9 +518,9 @@ List read_pre13_dta(FILE * file, const bool missing,
         val_i = readbin(val_i, file, swapit);
 
         if ((missing == FALSE) & ((val_i<STATA_SHORTINT_NA_MIN) | (val_i>STATA_SHORTINT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
-          INTEGER(VECTOR_ELT(df,i))[j] = val_i;
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
 
         break;
       }
@@ -491,14 +531,14 @@ List read_pre13_dta(FILE * file, const bool missing,
         val_b = readbin(val_b, file, swapit);
 
         if ((missing == FALSE) & ( (val_b<STATA_BYTE_NA_MIN) | (val_b>STATA_BYTE_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,i))[j] = NA_INTEGER;
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
-          INTEGER(VECTOR_ELT(df,i))[j] = val_b;
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
 
         break;
       }
         // strings with 244 or fewer characters
-      default:
+      case 244:
       {
         int32_t len = 0;
         len = vartype[i];
@@ -506,11 +546,20 @@ List read_pre13_dta(FILE * file, const bool missing,
 
         readstring(val_s, file, val_s.size());
 
-        as<CharacterVector>(df[i])[j] = val_s;
+        as<CharacterVector>(df[ii])[j] = val_s;
 
         break;
       }
+        // case < 0:
+      default:
+      {
+        // skip to the next valid case
+        fseeko64(file, abs(type), SEEK_CUR);
+        break;
+      }
       }
+
+      if (type >= 0) ii += 1;
       Rcpp::checkUserInterrupt();
     }
   }
@@ -520,7 +569,7 @@ List read_pre13_dta(FILE * file, const bool missing,
 
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
-  df.attr("names") = varnames;
+  df.attr("names") = varnames_kk;
   df.attr("class") = "data.frame";
 
   /*
@@ -634,10 +683,14 @@ List read_pre13_dta(FILE * file, const bool missing,
    * assign attributes to the resulting data.frame
    */
 
+  formats = formats[select_c];
+  valLabels = valLabels[select_c];
+  varLabels = varLabels[select_c];
+
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
   df.attr("formats") = formats;
-  df.attr("types") = types;
+  df.attr("types") = types_kk;
   df.attr("val.labels") = valLabels;
   df.attr("var.labels") = varLabels;
   df.attr("version") = versionIV;
@@ -645,6 +698,6 @@ List read_pre13_dta(FILE * file, const bool missing,
   df.attr("expansion.fields") = ch;
   df.attr("byteorder") = byteorderI;
   df.attr("orig.dim") = dim;
-  
+
   return df;
 }
diff --git a/src/register.c b/src/register.c
index 19e50457..5a20b782 100644
--- a/src/register.c
+++ b/src/register.c
@@ -15,7 +15,7 @@ extern SEXP readstata13_stata_save(SEXP, SEXP);
 
 static const R_CallMethodDef CallEntries[] = {
     {"readstata13_stata_pre13_save", (DL_FUNC) &readstata13_stata_pre13_save, 2},
-    {"readstata13_stata_read",       (DL_FUNC) &readstata13_stata_read,       3},
+    {"readstata13_stata_read",       (DL_FUNC) &readstata13_stata_read,       4},
     {"readstata13_stata_save",       (DL_FUNC) &readstata13_stata_save,       2},
     {NULL, NULL, 0}
 };
diff --git a/tests/testthat/test_read.R b/tests/testthat/test_read.R
index e5c3283d..143d34f6 100644
--- a/tests/testthat/test_read.R
+++ b/tests/testthat/test_read.R
@@ -185,4 +185,3 @@ test_that("Reading of strls", {
   ddstrl <- read.dta13(strl, replace.strl = T)
   expect_equal(ddstrl$model, ddstrl$modelStrL)
 })
-
diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R
index 990a8649..2f68b42c 100644
--- a/tests/testthat/test_save.R
+++ b/tests/testthat/test_save.R
@@ -804,3 +804,71 @@ test_that("select.rows = c(5,10)", {
 })
 
 # rm(list = files)
+
+#### select.cols ####
+
+if (readstata13:::dir.exists13("data"))
+  unlink("data", recursive = TRUE)
+dir.create("data")
+
+dd <- mtcars
+
+save.dta13(dd, "data/dta_119.dta", version = 119)
+save.dta13(dd, "data/dta_118.dta", version = 118)
+save.dta13(dd, "data/dta_117.dta", version = 117)
+save.dta13(dd, "data/dta_115.dta", version = 115)
+save.dta13(dd, "data/dta_114.dta", version = 114)
+save.dta13(dd, "data/dta_113.dta", version = 113)
+save.dta13(dd, "data/dta_112.dta", version = 112)
+save.dta13(dd, "data/dta_111.dta", version = 111)
+save.dta13(dd, "data/dta_110.dta", version = 110)
+save.dta13(dd, "data/dta_108.dta", version = 108)
+save.dta13(dd, "data/dta_107.dta", version = 107)
+save.dta13(dd, "data/dta_106.dta", version = 106)
+save.dta13(dd, "data/dta_105.dta", version = 105)
+save.dta13(dd, "data/dta_104.dta", version = 104)
+save.dta13(dd, "data/dta_103.dta", version = 103)
+save.dta13(dd, "data/dta_102.dta", version = 102)
+
+dd119 <- read.dta13("data/dta_119.dta", select.cols = c("disp", "drat"))
+dd118 <- read.dta13("data/dta_118.dta", select.cols = c("disp", "drat"))
+dd117 <- read.dta13("data/dta_117.dta", select.cols = c("disp", "drat"))
+dd115 <- read.dta13("data/dta_115.dta", select.cols = c("disp", "drat"))
+dd114 <- read.dta13("data/dta_114.dta", select.cols = c("disp", "drat"))
+dd113 <- read.dta13("data/dta_113.dta", select.cols = c("disp", "drat"))
+dd112 <- read.dta13("data/dta_112.dta", select.cols = c("disp", "drat"))
+dd111 <- read.dta13("data/dta_111.dta", select.cols = c("disp", "drat"))
+dd110 <- read.dta13("data/dta_110.dta", select.cols = c("disp", "drat"))
+dd108 <- read.dta13("data/dta_108.dta", select.cols = c("disp", "drat"))
+dd107 <- read.dta13("data/dta_107.dta", select.cols = c("disp", "drat"))
+dd106 <- read.dta13("data/dta_106.dta", select.cols = c("disp", "drat"))
+dd105 <- read.dta13("data/dta_105.dta", select.cols = c("disp", "drat"))
+dd104 <- read.dta13("data/dta_104.dta", select.cols = c("disp", "drat"))
+dd103 <- read.dta13("data/dta_103.dta", select.cols = c("disp", "drat"))
+dd102 <- read.dta13("data/dta_102.dta", select.cols = c("disp", "drat"))
+
+unlink("data", recursive = TRUE)
+
+dd <- dd[,c("disp", "drat")]
+
+test_that("select.cols = c('disp', 'drat')", {
+  # check numerics
+  expect_true(datacompare(dd, dd119))
+  expect_true(datacompare(dd, dd118))
+  expect_true(datacompare(dd, dd117))
+  expect_true(datacompare(dd, dd115))
+  expect_true(datacompare(dd, dd114))
+  expect_true(datacompare(dd, dd113))
+  expect_true(datacompare(dd, dd112))
+  expect_true(datacompare(dd, dd111))
+  expect_true(datacompare(dd, dd110))
+  expect_true(datacompare(dd, dd108))
+  expect_true(datacompare(dd, dd107))
+  expect_true(datacompare(dd, dd106))
+  expect_true(datacompare(dd, dd105))
+  expect_true(datacompare(dd, dd104))
+  expect_true(datacompare(dd, dd103))
+  expect_true(datacompare(dd, dd102))
+})
+
+# rm(list = files)

From 611a7ca3863330e0f6ebdf7b808ca002f3a98a06 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Tue, 13 Jun 2017 20:31:17 +0200
Subject: [PATCH 24/76] calculate jumpsize to maximize jumps

---
 inst/include/readstata.h | 44 ++++++++++++++++++++++++++++++++++++++++
 src/read_dta.cpp         |  9 ++++++--
 src/read_pre13_dta.cpp   |  8 ++++++--
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index b7d136fb..4ae5f58a 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -196,5 +196,49 @@ inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
   return(nselect);
 }
 
+inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
+
+  // amount of
+  Rcpp::IntegerVector vartype4;
+  int64_t val = 0;
+  bool last = 0;
+
+  uint32_t k = vartype3.size();
+
+  for (uint32_t i=0; i<k; ++i)
+  {
+
+    int32_t value = vartype3(i);
+
+    if (value < 0) {
+
+      // after start or if last was pos fill to val
+      if ( (i == 0) || (last == 1)) {
+        val = value;
+      } else {
+        val += value;
+      }
+      last = 0;
+
+    } else {
+
+      // push back if last was neg
+      if (i > 0 & last == 0)
+        vartype4.push_back(val);
+
+      val = value;
+      vartype4.push_back(val);
+
+      last = 1;
+    }
+
+    if ((i+1 == k) & (last == 0)) {
+      vartype4.push_back(val);
+    }
+
+  }
+
+    return(vartype4);
+}
 
 #endif
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 023ca3bf..94153e61 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -446,6 +446,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
 
   vartype3[nselect] = rlen2;
 
+
   // 1. create the list
   List df(kk);
   for (uint32_t i=0; i<kk; ++i)
@@ -470,6 +471,10 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     break;
     }
   }
+    
+  // calulate jumpsize
+  IntegerVector vartype4 = calc_jump(vartype3);
+  kk = vartype4.size();
 
   // 2. fill it with data
 
@@ -481,9 +486,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   {
     // reset partial index
     ii = 0;
-    for (uint32_t i=0; i<k; ++i)
+    for (uint32_t i=0; i<kk; ++i)
     {
-      int const type = vartype3[i];
+      int const type = vartype4[i];
 
       switch(((type >0) & (type < 2046)) ? 2045 : type)
       {
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index d8b72f98..fdf23e49 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -454,6 +454,10 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
   }
 
+  // calulate jumpsize
+  IntegerVector vartype4 = calc_jump(vartype3);
+  kk = vartype4.size();
+
   // 2. fill it with data
 
   // skip into the data part
@@ -464,9 +468,9 @@ List read_pre13_dta(FILE * file, const bool missing,
   {
     // reset partial index
     ii = 0;
-    for (uint16_t i=0; i<k; ++i)
+    for (uint16_t i=0; i<kk; ++i)
     {
-      int const type = vartype3[i];
+      int const type = vartype4[i];
 
 
       switch(((type >0) & (type < 244)) ? 244 : type)

From 1a26822fcd7d1ef516e8c9633bf4a1188c0baa2c Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 08:22:39 +0200
Subject: [PATCH 25/76] whitespace changes

---
 R/read.R                   |  82 ++++-----
 inst/include/readstata.h   |  60 +++---
 src/read_dta.cpp           | 368 ++++++++++++++++++-------------------
 src/read_pre13_dta.cpp     | 280 ++++++++++++++--------------
 tests/testthat/test_save.R |   2 +-
 5 files changed, 396 insertions(+), 396 deletions(-)

diff --git a/R/read.R b/R/read.R
index 6a153b33..bb717618 100644
--- a/R/read.R
+++ b/R/read.R
@@ -135,9 +135,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
   }
   if (!file.exists(filepath))
     return(message("File not found."))
-
-
-
+  
+  
+  
   # some select.row checks
   if (!is.null(select.rows)) {
     # check that it is a numeric
@@ -147,11 +147,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       # guard against negative values
       if (any(select.rows < 0) )
         select.rows <- abs(select.rows)
-
+      
       # check that lenght is not > 2
       if (length(select.rows) > 2)
         return(message("select.rows must be of length 1 or 2."))
-
+      
       # if lenght 1 start at row 1
       if (length(select.rows) == 1)
         select.rows <- c(1, select.rows)
@@ -159,7 +159,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     # reorder if 2 is bigger than 1
     if (select.rows[2] < select.rows[1])
       select.rows <- c(select.rows[2], select.rows[1])
-
+    
     # make sure to start at index position 1 if select.rows[2] > 0
     if (select.rows[2] > 0 & select.rows[1] == 0)
       select.rows[1] <- 1
@@ -167,15 +167,15 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     # set a value
     select.rows <- c(0,0)
   }
-
+  
   if (is.null(select.cols)){
     select.cols <- ""
   }
-
+  
   data <- stata_read(filepath, missing.type, select.rows, select.cols)
-
+  
   version <- attr(data, "version")
-
+  
   sstr     <- 2045
   sstrl    <- 32768
   sdouble  <- 65526
@@ -183,7 +183,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
   slong    <- 65528
   sint     <- 65529
   sbyte    <- 65530
-
+  
   if (version < 117) {
     sstr    <- 244
     sstrl   <- 255
@@ -193,20 +193,20 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     sint    <- 252
     sbyte   <- 251
   }
-
+  
   if (convert.underscore)
     names(data) <- gsub("_", ".", names(data))
-
+  
   types <- attr(data, "types")
   val.labels <- attr(data, "val.labels")
   label <- attr(data, "label.table")
-
+  
   if (missing.type) {
     stata.na <- data.frame(type = sdouble:sbyte,
                            min = c(101, 32741, 2147483621, 2 ^ 127, 2 ^ 1023),
                            inc = c(1, 1, 1, 2 ^ 115, 2 ^ 1011)
     )
-
+    
     if (version >= 113L & version < 117L) {
       missings <- vector("list", length(data))
       names(missings) <- names(data)
@@ -240,33 +240,33 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         warning("'missing.type' only applicable to version >= 8 files")
     }
   }
-
+  
   var.labels <- attr(data, "var.labels")
-
+  
   ## Encoding
   if(!is.null(encoding)) {
-
+    
     # set from encoding by dta version
     if(is.null(fromEncoding)) {
       fromEncoding <- "CP1252"
       if(attr(data, "version") >= 118L)
         fromEncoding <- "UTF-8"
     }
-
+    
     # varnames
     names(data) <- read.encoding(names(data), fromEncoding, encoding)
-
+    
     # var.labels
     attr(data, "var.labels") <- read.encoding(var.labels, fromEncoding,
                                               encoding)
-
+    
     # val.labels
     names(val.labels) <- read.encoding(val.labels, fromEncoding, encoding)
     attr(data, "val.labels") <- val.labels
-
+    
     # label
     names(label) <- read.encoding(names(label), fromEncoding, encoding)
-
+    
     if (length(label) > 0) {
       for (i in 1:length(label))  {
         names(label[[i]]) <- read.encoding(names(label[[i]]), fromEncoding,
@@ -274,12 +274,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       }
       attr(data, "label.table") <- label
     }
-
+    
     # recode character variables
     for (v in (1:ncol(data))[types <= sstr]) {
       data[, v] <- iconv(data[, v], from=fromEncoding, to=encoding, sub="byte")
     }
-
+    
     # expansion.field
     efi <- attr(data, "expansion.fields")
     if (length(efi) > 0) {
@@ -289,7 +289,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       }
       attr(data, "expansion.fields") <- efi
     }
-
+    
     if (version >= 117L) {
       #strl
       strl <- attr(data, "strl")
@@ -301,9 +301,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       }
     }
   }
-
+  
   var.labels <- attr(data, "var.labels")
-
+  
   if (replace.strl & version >= 117L) {
     strl <- c("")
     names(strl) <- "00000000000000000000"
@@ -314,19 +314,19 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     # if strls are in data.frame remove attribute strl
     attr(data, "strl") <- NULL
   }
-
-
+  
+  
   if (convert.dates) {
     convert_dt_c <- function(x)
       as.POSIXct((x + 0.1) / 1000, origin = "1960-01-01") # avoid rounding down
-
+    
     convert_dt_C <- function(x) {
       ls <- .leap.seconds + seq_along(.leap.seconds) + 315619200
       z <- (x + 0.1) / 1000 # avoid rounding down
       z <- z - rowSums(outer(z, ls, ">="))
       as.POSIXct(z, origin = "1960-01-01")
     }
-
+    
     ff <- attr(data, "formats")
     ## dates <- grep("%-*d", ff)
     ## Stata 12 introduced 'business dates'
@@ -336,16 +336,16 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     ## 'Stata has an old *%d* format notation and some datasets
     ##  still have them. Format *%d*... is equivalent to modern
     ##  format *%td*... and *%-d*... is equivalent to *%-td*...'
-
+    
     dates <- grep("^%(-|)(d|td)", ff)
     ## avoid as.Date in case strptime is messed up
     base <- structure(-3653L, class = "Date") # Stata dates are integer vars
     for (v in dates) data[[v]] <- structure(base + data[[v]], class = "Date")
-
+    
     for (v in grep("%tc", ff)) data[[v]] <- convert_dt_c(data[[v]])
     for (v in grep("%tC", ff)) data[[v]] <- convert_dt_C(data[[v]])
   }
-
+  
   if (convert.factors) {
     vnames <- names(data)
     for (i in seq_along(val.labels)) {
@@ -364,7 +364,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         varunique <- na.omit(unique(data[, i]))
         # assign label if label set is complete
         if (all(varunique %in% labtable)) {
-
+          
           #check for duplicated labels
           labcount <- table(names(labtable))
           if(any(labcount > 1)) {
@@ -373,28 +373,28 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
             # generate unique labels from assigned label and code number
             names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
           }
-
+          
           data[, i] <- factor(data[, i], levels=labtable,
                               labels=names(labtable))
           # else generate labels from codes
         } else if (generate.factors) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
-
+          
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
-
+          
         } else {
           warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
         }
       }
     }
   }
-
+  
   if (add.rownames) {
     rownames(data) <- data[[1]]
     data[[1]] <- NULL
   }
-
+  
   return(data)
 }
diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 4ae5f58a..129395b8 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -72,9 +72,9 @@ T readuint48( T t , FILE * file, bool swapit)
   } else if (ferror(file)){
     Rcpp::warning("num: a binary read error occurred.");
   }
-
+  
   t = *(uint64_t *)&uint48;
-
+  
   if (swapit==0)
     return(t);
   else
@@ -90,7 +90,7 @@ static void readstring(std::string &mystring, FILE * fp, int nchar)
 inline void test(std::string testme, FILE * file)
 {
   std::string test(testme.size(), '\0');
-
+  
   readstring(test,file, test.size());
   if (testme.compare(test)!=0)
   {
@@ -116,19 +116,19 @@ static void writebin(T t, std::fstream& dta, bool swapit)
 template <typename T>
 static void writestr(std::string val_s, T len, std::fstream& dta)
 {
-
+  
   std::stringstream val_stream;
   val_stream << std::left << std::setw(len) << std::setfill('\0') << val_s;
   std::string val_strl = val_stream.str();
-
+  
   dta.write(val_strl.c_str(),val_strl.length());
-
+  
 }
 
 inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
-
+  
   uint32_t k = vartype.size();
-
+  
   Rcpp::IntegerVector rlen(k);
   // calculate row length in byte
   for (uint32_t i=0; i<k; ++i)
@@ -157,7 +157,7 @@ inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
     break;
     }
   }
-
+  
   return(rlen);
 }
 
@@ -165,18 +165,18 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
                                   Rcpp::CharacterVector y)
 {
   Rcpp::IntegerVector mm = Rcpp::match(x, y);
-
+  
   if (Rcpp::any(Rcpp::is_na(mm))) {
     Rcpp::LogicalVector ll = !Rcpp::is_na(mm);
-
+    
     Rcpp::CharacterVector ms = x[ll==0];
-
+    
     Rcpp::Rcout << "Variable " <<  ms <<
       " was not found in dta-file." << std::endl;
-
+    
     mm = mm[ll==1];
   }
-
+  
   return(mm);
 }
 
@@ -189,29 +189,29 @@ inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
   for (uint32_t i=0; i<select.size(); ++i) {
     vec.erase(std::remove(vec.begin(), vec.end(), select(i)), vec.end());
   }
-
+  
   Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
   nselect = nselect -1;
-
+  
   return(nselect);
 }
 
 inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
-
+  
   // amount of
   Rcpp::IntegerVector vartype4;
   int64_t val = 0;
   bool last = 0;
-
+  
   uint32_t k = vartype3.size();
-
+  
   for (uint32_t i=0; i<k; ++i)
   {
-
+    
     int32_t value = vartype3(i);
-
+    
     if (value < 0) {
-
+      
       // after start or if last was pos fill to val
       if ( (i == 0) || (last == 1)) {
         val = value;
@@ -219,26 +219,26 @@ inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
         val += value;
       }
       last = 0;
-
+      
     } else {
-
+      
       // push back if last was neg
       if (i > 0 & last == 0)
         vartype4.push_back(val);
-
+      
       val = value;
       vartype4.push_back(val);
-
+      
       last = 1;
     }
-
+    
     if ((i+1 == k) & (last == 0)) {
       vartype4.push_back(val);
     }
-
+    
   }
-
-    return(vartype4);
+  
+  return(vartype4);
 }
 
 #endif
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 94153e61..e9c706c6 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -25,36 +25,36 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   // stata_dta><header>
   test("stata_dta><header>", file);
   test("<release>", file);
-
+  
   /*
   * version is a 4 byte character e.g. "117"
   */
-
+  
   int8_t fversion = 117L; //f = first
   int8_t lversion = 119L; //l = last
-
+  
   std::string version(3, '\0');
   readstring(version, file, version.size());
-
+  
   int8_t const release = atoi(version.c_str());
-
+  
   IntegerVector versionIV(1);
   versionIV(0) = release;
-
+  
   // check the release version.
   if (release<fversion || release>lversion)
   {
     Rcpp::warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release);
     return -1;
   }
-
+  
   uint8_t nvarnameslen = 0;
   int8_t nformatslen = 0;
   uint8_t nvalLabelslen = 0;
   uint16_t nvarLabelslen = 0;
   int32_t chlen = 0;
   uint8_t lbllen = 0;
-
+  
   switch(release)
   {
   case 117:
@@ -75,113 +75,113 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     lbllen = 129;
     break;
   }
-
+  
   // </release>
   test("</release>", file);
   test("<byteorder>", file);
-
+  
   /*
   * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data.
   */
-
+  
   std::string byteorder(3, '\0');
   readstring(byteorder,file, byteorder.size());
-
+  
   // </byteorder>
   test("</byteorder>", file);
   test("<K>", file);
-
+  
   bool swapit = 0;
   swapit = strcmp(byteorder.c_str(), sbyteorder);
-
+  
   /*
   * Number of Variables
   */
-
+  
   uint32_t k = 0;
   if (release < 119)
     k = readbin((uint16_t)k, file, swapit);
   if (release == 119)
     k = readbin(k, file, swapit);
-
+  
   //</K>
   test("</K>", file);
   test("<N>", file);
-
+  
   /*
   * Number of Observations
   */
-
+  
   uint64_t n = 0;
-
+  
   if (release == 117)
     n = readbin((uint32_t)n, file, swapit);
   if ((release == 118) | (release == 119))
     n = readbin(n, file, swapit);
-
+  
   //</N>
   test("</N>", file);
   test("<label>", file);
-
+  
   // dim to return original dim for partial read files
   IntegerVector dim(2);
   dim(0) = n;
   dim(1) = k;
-
+  
   /*
   * A dataset may have a label e.g. "Written by R".
   * First we read its length (ndlabel), later the actual label (datalabel).
   * ndlabel:   length of datalabel (excl. binary 0)
   * datalabel: string max length 80
   */
-
+  
   uint16_t ndlabel = 0;
-
+  
   if ((release == 118) | (release == 119))
     ndlabel = readbin(ndlabel, file, swapit);
   if (release == 117)
     ndlabel = readbin((int8_t)ndlabel, file, swapit);
-
+  
   std::string datalabel(ndlabel, '\0');
-
+  
   if (ndlabel>0)
   {
     readstring(datalabel, file, datalabel.size());
   } else {
     datalabel = "";
   }
-
+  
   CharacterVector datalabelCV(1);
   datalabelCV(0) = datalabel;
-
+  
   //</label>
   test("</label>", file);
   test("<timestamp>", file);
-
+  
   /*
   * A dataset may have a timestamp. If it has a timestamp the length of the
   * timestamp (ntimestamp) is 17. Else it is zero.
   * ntimestamp: 0 or 17
   * timestamp: empty or 17 byte string
   */
-
+  
   uint8_t ntimestamp = 0;
   ntimestamp = readbin(ntimestamp, file, swapit);
-
+  
   std::string timestamp(17, '\0');
-
+  
   if (ntimestamp == 17) // ntimestap is 0 or 17
   {
     readstring(timestamp, file, timestamp.size());
   } else {
     timestamp = "";
   }
-
+  
   CharacterVector timestampCV = timestamp;
   //</timestamp></header>
   test("</timestamp></header>", file);
   test("<map>", file);
-
+  
   /*
   * Stata stores the byteposition of certain areas of the file here. Currently
   * this is of no use to us.
@@ -200,7 +200,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * 13. </stata_data>
   * 14. end-of-file
   */
-
+  
   NumericVector map(14);
   for (int i=0; i <14; ++i)
   {
@@ -208,11 +208,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nmap = readbin(nmap, file, swapit);
     map[i] = nmap;
   }
-
+  
   //</map>
   test("</map>", file);
   test("<variable_types>", file);
-
+  
   /*
   * vartypes.
   * 0-2045: strf (String: Max length 2045)
@@ -223,7 +223,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * 65529:  int
   * 65530:  byte
   */
-
+  
   IntegerVector vartype(k);
   for (uint32_t i=0; i<k; ++i)
   {
@@ -231,37 +231,37 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nvartype = readbin(nvartype, file, swapit);
     vartype[i] = nvartype;
   }
-
+  
   //</variable_types>
   test("</variable_types>", file);
   test("<varnames>", file);
-
+  
   /*
   * varnames.
   */
-
+  
   std::string nvarnames(nvarnameslen, '\0');
-
+  
   CharacterVector varnames(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvarnames, file, nvarnames.size());
     varnames[i] = nvarnames;
   }
-
+  
   //</varnames>
   test("</varnames>", file);
   test("<sortlist>", file);
-
+  
   /*
   * sortlist. Stata stores the information which variable of a dataset was
   * sorted. Depending on byteorder sortlist is written different. Currently we
   * do not use this information.
   * Vector size is k+1.
   */
-
+  
   uint64_t big_k = k+1;
-
+  
   IntegerVector sortlist(big_k);
   for (uint64_t i=0; i<big_k; ++i)
   {
@@ -269,65 +269,65 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
   }
-
+  
   //</sortlist>
   test("</sortlist>", file);
   test("<formats>", file);
-
+  
   /*
   * formats handle how Stata prints a variable. Currently we do not use this
   * information.
   */
-
+  
   std::string nformats(nformatslen, '\0');
-
+  
   CharacterVector formats(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nformats, file, nformats.size());
     formats[i] = nformats;
   }
-
+  
   //</formats>
   test("</formats>", file);
   test("<value_label_names>",file);
-
+  
   /*
   * value_label_names. Stata stores variable labels by names.
   * nvalLabels: length of the value_label_name
   * valLabels:
   */
-
+  
   std::string nvalLabels(nvalLabelslen, '\0');
-
+  
   CharacterVector valLabels(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvalLabels, file, nvalLabels.size());
     valLabels[i] = nvalLabels;
   }
-
+  
   //</value_label_names>
   test("</value_label_names>", file);
   test("<variable_labels>", file);
-
+  
   /*
   * variabel_labels
   */
-
+  
   std::string nvarLabels (nvarLabelslen, '\0');
-
+  
   CharacterVector varLabels(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvarLabels, file, nvarLabels.size());
     varLabels[i] = nvarLabels;
   }
-
+  
   //</variable_labels>
   test("</variable_labels>", file);
   test("<characteristics>", file);
-
+  
   /*
   * characteristics. Stata can store additional information this way. It may
   * contain notes (for the dataset or a variable) or about label language sets.
@@ -338,89 +338,89 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * chcharact:    characteristicsname (binary 0 terminated)
   * nnocharacter: contes (binary 0 terminated)
   */
-
+  
   std::string chtag = "<ch>";
-
+  
   std::string tago(4, '\0');
   readstring(tago, file, tago.size());
-
+  
   List ch = List();
   CharacterVector chs(3);
-
+  
   while (chtag.compare(tago)==0)
   {
     uint32_t nocharacter = 0;
     nocharacter = readbin(nocharacter, file, swapit);
-
+    
     std::string chvarname(chlen, '\0');
     std::string chcharact(chlen, '\0');
     std::string nnocharacter(nocharacter-chlen*2, '\0');
-
+    
     readstring(chvarname, file, chvarname.size());
     readstring(chcharact, file, chcharact.size());
     readstring(nnocharacter, file, nnocharacter.size());
-
+    
     // chs vector
     CharacterVector chs(3);
     chs[0] = chvarname;
     chs[1] = chcharact;
     chs[2] = nnocharacter;
-
+    
     // add characteristics to the list
     ch.push_front( chs );
-
+    
     // </ch>
     test("</ch>", file);
-
+    
     // read next tag
     readstring(tago, file, tago.size());
   }
-
+  
   //[</ch]aracteristics>
   test("aracteristics>", file);
   test("<data>", file);
-
+  
   /*
   * data. First a list is created with vectors. The vector type is defined by
   * vartype. Stata stores data columnwise so we loop over it and store the
   * data in the list of the first step. Third variable- and row-names are
   * attatched and the list type is changed to data.frame.
   */
-
+  
   uint64_t nmin = selectrows(0);
   uint64_t nmax = selectrows(1);
   uint64_t nn   = 0;
-
+  
   // if  selectrows is c(0,0) use full data
   if ((nmin == 0) && (nmax == 0)){
     nmin = 1;
     nmax = n;
   }
-
+  
   // make sure that n is not greater nmax
   if (n < nmax)
     nmax = n;
-
+  
   // neither should nmin be greater
   if (n < nmin)
     nmin = n;
-
+  
   Rcpp::IntegerVector cvec = seq(1, k);
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
-
+  
   // use c indexing starting at 0
   nmin = nmin -1;
   nmax = nmax -1;
-
+  
   // calculate length of variables and of row
   IntegerVector rlen = calc_rowlength(vartype);
   uint64_t rlength = sum(rlen);
-
+  
   // check if vars are selected
   std::string selcols = as<std::string>(selectcols(0));
   bool noselectvars = selcols == "";
-
+  
   // select vars: either select every var or only matched cases
   IntegerVector select;
   if (noselectvars) {
@@ -428,59 +428,59 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   } else {
     select = choose(selectcols, varnames);
   }
-
+  
   // match returns r index
   IntegerVector select_c = select -1;
-
+  
   uint32_t kk = select.size();
-
+  
   // shrink variables
   CharacterVector varnames_kk = varnames[select_c];
   IntegerVector vartype_kk = vartype[select_c];
   IntegerVector vartype3 = vartype;
-
+  
   IntegerVector nselect = which_pos(cvec, select);
-
+  
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
-
+  
   vartype3[nselect] = rlen2;
-
-
+  
+  
   // 1. create the list
   List df(kk);
   for (uint32_t i=0; i<kk; ++i)
   {
     int const type = vartype_kk[i];
-
+    
     switch(type)
     {
     case STATA_DOUBLE:
     case STATA_FLOAT:
       SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
       break;
-
+      
     case STATA_INT:
     case STATA_SHORTINT:
     case STATA_BYTE:
       SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
       break;
-
+      
     default:
       SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
     break;
     }
   }
-    
+  
   // calulate jumpsize
   IntegerVector vartype4 = calc_jump(vartype3);
   kk = vartype4.size();
-
+  
   // 2. fill it with data
-
+  
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
-
+  
   uint32_t ii = 0;
   for(uint64_t j=0; j<nn; ++j)
   {
@@ -489,7 +489,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     for (uint32_t i=0; i<kk; ++i)
     {
       int const type = vartype4[i];
-
+      
       switch(((type >0) & (type < 2046)) ? 2045 : type)
       {
         // double
@@ -497,12 +497,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         double val_d = 0;
         val_d = readbin(val_d, file, swapit);
-
+        
         if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_d;
-
+        
         break;
       }
         // float
@@ -510,12 +510,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         float val_f = 0;
         val_f = readbin(val_f, file, swapit);
-
+        
         if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_f;
-
+        
         break;
       }
         // long
@@ -523,12 +523,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         int32_t val_l = 0;
         val_l = readbin(val_l, file, swapit);
-
+        
         if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
-
+        
         break;
       }
         // int
@@ -536,12 +536,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         int16_t val_i = 0;
         val_i = readbin(val_i, file, swapit);
-
+        
         if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
-
+        
         break;
       }
         // byte
@@ -549,12 +549,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         int8_t val_b = 0;
         val_b = readbin(val_b, file, swapit);
-
+        
         if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
-
+        
         break;
       }
         // strings with 2045 or fewer characters
@@ -563,7 +563,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         int32_t len = 0;
         len = vartype[i];
         std::string val_s (len, '\0');
-
+        
         readstring(val_s, file, val_s.size());
         as<CharacterVector>(df[ii])[j] = val_s;
         break;
@@ -571,78 +571,78 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         // string of any length
       case STATA_STRL:
       {// strL 2*4bit or 2 + 6 bit
-
+        
         // FixMe: Strl in 118
         switch (release)
       {
-
+        
       case 117:
       {
         uint32_t v = 0, o = 0;
-
+        
         v = readbin(v, file, swapit);
         o = readbin(o, file, swapit);
-
+        
         stringstream val_stream;
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
-
+        
         as<CharacterVector>(df[ii])[j] = val_strl;
-
+        
         break;
       }
       case 118:
       {
         int16_t v = 0;
         int64_t o = 0, z = 0;
-
+        
         z = readbin(z, file, swapit);
-
+        
         // works for LSF on little- and big-endian
         if (byteorder.compare("LSF")==0) {
           v = (int16_t)z;
           o = (z >> 16);
         }
-
+        
         // works if we read a big-endian file on little-endian
         if (byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 16) - 1);
           o = z & ((1 << 16) - 1);
         }
-
+        
         stringstream val_stream;
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
-
+        
         as<CharacterVector>(df[ii])[j] = val_strl;
-
+        
         break;
       }
       case 119:
       {
         int32_t v = 0;
         int64_t o = 0, z = 0;
-
+        
         z = readbin(z, file, swapit);
-
+        
         // works for LSF on little- and big-endian
         if (byteorder.compare("LSF")==0) {
           v = (int32_t)z & ((1 << 24) - 1);
           o = (z >> 24);
         }
-
+        
         // FixMe: works if we read a big-endian file on little-endian
         if (byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 24) - 1);
           o = z & ((1 << 24) - 1);
         }
-
+        
         stringstream val_stream;
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
-
+        
         as<CharacterVector>(df[ii])[j] = val_strl;
-
+        
         break;
       }
       }
@@ -656,25 +656,25 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         break;
       }
       }
-
+      
       if (type >= 0) ii += 1;
-
+      
       Rcpp::checkUserInterrupt();
     }
   }
-
+  
   // skip to end of data part
   fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
-
+  
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
   df.attr("names") = varnames_kk;
   df.attr("class") = "data.frame";
-
+  
   //</data>
   test("</data>", file);
   test("<strls>", file);
-
+  
   /*
   * strL. Stata 13 introduced long strings up to 2 billon characters. strLs are
   * sperated by "GSO".
@@ -683,35 +683,35 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * len:   length of the strL.
   * strl:  long string.
   */
-
+  
   std::string gso = "GSO";
-
+  
   std::string tags(3, '\0');
   readstring(tags, file, tags.size());
-
+  
   //put strLs into a named vector
   CharacterVector strlvalues(0);
   CharacterVector strlnames(0);
-
+  
   while(gso.compare(tags)==0)
   {
     CharacterVector strls(2);
     string ref;
-
+    
     // FixMe: Strl in 118
     switch (release)
     {
     case 117:
     {
       uint32_t v = 0, o = 0;
-
+      
       v = readbin(v, file, swapit);
       o = readbin(o, file, swapit);
-
+      
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-
+      
       break;
     }
     case 118:
@@ -719,44 +719,44 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     {
       uint32_t v = 0;
       uint64_t o = 0;
-
+      
       v = readbin(v, file, swapit);
       o = readbin(o, file, swapit);
-
+      
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-
+      
       break;
     }
     }
-
+    
     // (129 = binary) | (130 = ascii)
     uint8_t t = 0;
     t = readbin(t, file, swapit);
-
+    
     uint32_t len = 0;
     len = readbin(len, file, swapit);
-
+    
     // 129 len = len; 130 len = len +'\0';
-
+    
     std::string strl(len, '\0');
     readstring(strl, file, strl.size());
-
+    
     strlvalues.push_back( strl );
     strlnames.push_back( ref );
-
+    
     readstring(tags, file, tags.size());
   }
-
+  
   // set identifier as name
   strlvalues.attr("names") = strlnames;
-
+  
   // after strls
   //[</s]trls>
   test("trls>", file);
   test("<value_labels>", file);
-
+  
   /*
   * labels are seperated by <lbl>-tags. Labels may appear in any order e.g.
   * 2 "female" 1 "male 9 "missing". They are stored as tables.
@@ -766,34 +766,34 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * txtlen:   length of the label text.
   * off:      offset defines where to read a new label in txtlen.
   */
-
+  
   std::string lbltag = "<lbl>";
-
+  
   std::string tag(5, '\0');
   readstring(tag, file, tag.size());
-
+  
   List labelList = List(); //put labels into this list
-
+  
   while(lbltag.compare(tag)==0)
   {
     int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0;
-
+    
     // length of value_label_table
     nlen = readbin(nlen, file, swapit);
-
+    
     // name of this label set
-
+    
     std::string nlabname(lbllen, '\0');
-
+    
     readstring(nlabname, file, nlabname.size());
-
+    
     //padding
     fseek(file, 3, SEEK_CUR);
-
+    
     // value_label_table for actual label set
     labn = readbin(labn, file, swapit);
     txtlen = readbin(txtlen, file, swapit);
-
+    
     // offset for each label
     // off0 : label 0 starts at off0
     // off1 : label 1 starts at off1 ...
@@ -802,43 +802,43 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       noff = readbin(noff, file, swapit);
       off[i] = noff;
     }
-
+    
     // needed for match
     IntegerVector laborder = clone(off);
     //laborder.erase(labn+1);
     IntegerVector labordersort = clone(off);
     //labordersort.erase(labn+1);
     std::sort(labordersort.begin(), labordersort.end());
-
+    
     // needs txtlen for loop
     off.push_back(txtlen);
-
+    
     // sort offsets so we can read labels sequentially
     std::sort(off.begin(), off.end());
-
+    
     // create an index to sort lables along the code values
     // this is done while factor creation
     IntegerVector indx(labn);
     indx = match(laborder,labordersort);
-
+    
     // code for each label
     IntegerVector code(labn);
     for (int i=0; i < labn; ++i) {
       val = readbin(val, file, swapit);
       code[i] = val;
     }
-
+    
     // label text
     CharacterVector label(labn);
     for (int i=0; i < labn; ++i) {
       int lablen = off[i+1]-off[i];
-
+      
       std::string lab (lablen, '\0');
-
+      
       readstring(lab, file, lablen);
       label[i] = lab;
     }
-
+    
     // sort labels according to indx
     CharacterVector labelo(labn);
     for (int i=0; i < labn; ++i) {
@@ -847,32 +847,32 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     // create table for actual label set
     string const labset = nlabname;
     code.attr("names") = labelo;
-
+    
     // add this set to output list
     labelList.push_front( code, labset);
-
+    
     fseek(file, 6, SEEK_CUR); //</lbl>
-
+    
     readstring(tag, file, tag.size());
   }
-
+  
   /*
-   * Final test if we reached the end of the file
-   * close the file
-   */
-
+  * Final test if we reached the end of the file
+  * close the file
+  */
+  
   // [</val]ue_labels>
   test("ue_labels>", file);
   test("</stata_dta>", file);
-
+  
   /*
    * assign attributes to the resulting data.frame
    */
-
+  
   formats = formats[select_c];
   valLabels = valLabels[select_c];
   varLabels = varLabels[select_c];
-
+  
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
   df.attr("formats") = formats;
@@ -885,6 +885,6 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   df.attr("strl") = strlvalues;
   df.attr("byteorder") = wrap(byteorder);
   df.attr("orig.dim") = dim;
-
+  
   return df;
 }
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index fdf23e49..70bfe4e8 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -25,20 +25,20 @@ List read_pre13_dta(FILE * file, const bool missing,
                     const CharacterVector selectcols)
 {
   int8_t release = 0;
-
+  
   rewind(file);
   release = readbin(release, file, 0);
-
+  
   if (release<102 || release == 109 || release>115)
     Rcpp::stop("First byte: Not a dta-file we can read.");
-
+  
   IntegerVector versionIV(1);
   versionIV(0) = release;
-
+  
   /*
   * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data.
   */
-
+  
   uint16_t ndlabel = 81;
   uint8_t nvarnameslen = 33;
   int8_t nformatslen = 49;
@@ -46,7 +46,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   uint16_t nvarLabelslen = 81;
   int32_t chlen = 33;
   uint8_t lbllen = 33;
-
+  
   switch(release)
   {
   case 102:
@@ -87,70 +87,70 @@ List read_pre13_dta(FILE * file, const bool missing,
     nformatslen = 12;
     break;
   }
-
+  
   CharacterVector byteorderC(1);
   IntegerVector byteorderI(1);
   bool swapit = 0;
-
+  
   int8_t byteorder = 0;
   byteorder = readbin(byteorder, file, 0);
   // 1 = MSF 2 = LSF
   swapit = std::abs(SBYTEORDER-byteorder);
   byteorderI(0) = byteorder;
-
+  
   // filetype: unnown?
   int8_t ft = 0;
   ft = readbin(ft, file, swapit);
-
+  
   int8_t unused = 0;
   unused = readbin(unused, file, swapit);
-
-
+  
+  
   /*
   * Number of Variables
   */
-
+  
   uint16_t k = 0;
   k = readbin(k, file, swapit);
-
-
+  
+  
   /*
   * Number of Observations
   */
-
+  
   uint32_t n = 0;
   n = readbin(n, file, swapit);
-
+  
   // dim to return original dim for partial read files
   IntegerVector dim(2);
   dim(0) = n;
   dim(1) = k;
-
+  
   /*
   * A dataset may have a label e.g. "Written by R".
   * First we read its length (ndlabel), later the actual label (datalabel).
   * ndlabel:   length of datalabel (excl. binary 0)
   * datalabel: string max length 80
   */
-
-
+  
+  
   CharacterVector datalabelCV(1);
-
+  
   std::string datalabel(ndlabel, '\0');
-
+  
   if (ndlabel > 0)
     readstring(datalabel, file, datalabel.size());
   else
     datalabel = "";
-
+  
   datalabelCV(0) = datalabel;
-
+  
   CharacterVector timestampCV(1);
   std::string timestamp(18, '\0');
-
+  
   switch (release)
   {
-
+    
   case 102:
   case 103:
   case 104:
@@ -158,16 +158,16 @@ List read_pre13_dta(FILE * file, const bool missing,
     timestamp = "";
     break;
   }
-
+    
   default:
   {
     readstring(timestamp, file, timestamp.size());
     break;
   }
   }
-
+  
   timestampCV(0) = timestamp;
-
+  
   /*
   * vartypes.
   * 0-2045: strf (String: Max length 2045)
@@ -178,12 +178,12 @@ List read_pre13_dta(FILE * file, const bool missing,
   * 65529:  int
   * 65530:  byte
   */
-
+  
   IntegerVector vartype(k);
-
+  
   switch (release)
   {
-
+    
   case 102:
   case 103:
   case 104:
@@ -195,11 +195,11 @@ List read_pre13_dta(FILE * file, const bool missing,
   case 112:
   {
     uint8_t nvartypec = 0;
-
+    
     for (uint16_t i=0; i<k; ++i)
     {
       nvartypec = readbin(nvartypec, file, swapit);
-
+      
       if(nvartypec== 98) // b
         vartype[i] = 251;
       if(nvartypec==105) // i
@@ -215,14 +215,14 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
     break;
   }
-
+    
   case 111:
   case 113:
   case 114:
   case 115:
   {
     uint8_t nvartype = 0;
-
+    
     for (uint16_t i=0; i<k; ++i)
     {
       nvartype = readbin(nvartype, file, swapit);
@@ -230,34 +230,34 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
     break;
   }
-
+    
   }
-
+  
   // FixMe: Needs clone otherwise missing.type would not work
   IntegerVector types = clone(vartype);
-
+  
   /*
   * varnames. Max length 33.
   */
-
+  
   std::string nvarnames(nvarnameslen, '\0');
-
+  
   CharacterVector varnames(k);
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nvarnames, file, nvarnames.size());
     varnames[i] = nvarnames;
   }
-
+  
   /*
   * sortlist. Stata stores the information which variable of a dataset was
   * sorted. Depending on byteorder sortlist is written different. Currently we
   * do not use this information.
   * Vector size is k+1.
   */
-
+  
   uint32_t big_k = k+1;
-
+  
   IntegerVector sortlist(big_k);
   for (uint32_t i=0; i<big_k; ++i)
   {
@@ -265,100 +265,100 @@ List read_pre13_dta(FILE * file, const bool missing,
     nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
   }
-
+  
   /*
   * formats handle how Stata prints a variable. Currently we do not use this
   * information.
   */
-
+  
   CharacterVector formats(k);
   std::string nformats(nformatslen, '\0');
-
+  
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nformats, file, nformats.size());
     formats[i] = nformats;
   }
-
+  
   /*
   * value_label_names. Stata stores variable labels by names.
   * nvalLabels: length of the value_label_name
   * valLabels:  Char of max length 33
   */
-
+  
   CharacterVector valLabels(k);
   std::string nvalLabels(nvalLabelslen, '\0');
-
+  
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nvalLabels, file, nvalLabels.size());
     valLabels[i] = nvalLabels;
   }
-
+  
   /*
   * variabel_labels
   */
-
+  
   CharacterVector varLabels(k);
   std::string nvarLabels (nvarLabelslen, '\0');
-
+  
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nvarLabels, file, nvarLabels.size());
     varLabels[i] = nvarLabels;
   }
-
+  
   /* <characteristics> ... </characteristics> */
-
+  
   List ch = List();
   if (release > 104)
   {
     int8_t datatype = 0;
     uint32_t len = 0;
-
+    
     datatype = readbin(datatype, file, swapit);
     if (release <= 108)
       len = readbin((uint16_t)len, file, swapit);
     else
       len = readbin(len, file, swapit);
-
-
+    
+    
     while (!(datatype==0) && !(len==0))
     {
       std::string chvarname(chlen, '\0');
       std::string chcharact(chlen, '\0');
       std::string nnocharacter(len-chlen*2, '\0');
-
+      
       readstring(chvarname, file, chvarname.size());
       readstring(chcharact, file, chcharact.size());
       readstring(nnocharacter, file, nnocharacter.size());
-
+      
       // chs vector
       CharacterVector chs(3);
       chs[0] = chvarname;
       chs[1] = chcharact;
       chs[2] = nnocharacter;
-
+      
       // add characteristics to the list
       ch.push_front( chs );
-
+      
       datatype = readbin(datatype, file, swapit);
-
+      
       if (release <= 108)
         len = readbin((uint16_t)len, file, swapit);
       else
         len = readbin(len, file, swapit);
     }
   }
-
-
+  
+  
   /*
   * data. First a list is created with vectors. The vector type is defined by
   * vartype. Stata stores data columnwise so we loop over it and store the
   * data in the list of the first step. Third variable- and row-names are
   * attatched and the list type is changed to data.frame.
   */
-
+  
   /* replace vartypes of Stata 8 - 12 with Stata 13 values. */
   // 117 contains new variable types (longer strings and strL)
   std::replace (vartype.begin(), vartype.end(), 251, STATA_BYTE);
@@ -366,42 +366,42 @@ List read_pre13_dta(FILE * file, const bool missing,
   std::replace (vartype.begin(), vartype.end(), 253, STATA_INT);
   std::replace (vartype.begin(), vartype.end(), 254, STATA_FLOAT);
   std::replace (vartype.begin(), vartype.end(), 255, STATA_DOUBLE);
-
-
+  
+  
   uint64_t nmin = selectrows(0);
   uint64_t nmax = selectrows(1);
   uint64_t nn   = 0;
-
+  
   // if  selectrows is c(0,0) use full data
   if ((nmin == 0) && (nmax == 0)){
     nmin = 1;
     nmax = n;
   }
-
+  
   // make sure that n is not greater nmax
   if (n < nmax)
     nmax = n;
-
+  
   // neither should nmin be greater
   if (n < nmin)
     nmin = n;
-
+  
   Rcpp::IntegerVector cvec = seq(1, k);
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
-
+  
   // use c indexing starting at 0
   nmin = nmin -1;
   nmax = nmax -1;
-
+  
   // calculate length of variables and of row
   IntegerVector rlen = calc_rowlength(vartype);
   uint64_t rlength = sum(rlen);
-
+  
   // check if vars are selected
   std::string selcols = as<std::string>(selectcols(0));
   bool noselectvars = selcols == "";
-
+  
   // select vars: either select every var or only matched cases
   IntegerVector select;
   if (noselectvars) {
@@ -409,60 +409,60 @@ List read_pre13_dta(FILE * file, const bool missing,
   } else {
     select = choose(selectcols, varnames);
   }
-
+  
   // match returns r index
   IntegerVector select_c = select -1;
-
+  
   uint32_t kk = select.size();
-
+  
   // shrink variables
   CharacterVector varnames_kk = varnames[select_c];
   IntegerVector vartype_kk = vartype[select_c];
   IntegerVector types_kk = types[select_c];
   IntegerVector vartype3 = vartype;
-
-
+  
+  
   IntegerVector nselect = which_pos(cvec, select);
-
+  
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
-
+  
   vartype3[nselect] = rlen2;
-
+  
   // 1. create the list
   List df(kk);
   for (uint32_t i=0; i<kk; ++i)
   {
     int const type = vartype_kk[i];
-
+    
     switch(type)
     {
     case STATA_DOUBLE:
     case STATA_FLOAT:
       SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
       break;
-
+      
     case STATA_INT:
     case STATA_SHORTINT:
     case STATA_BYTE:
       SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
       break;
-
+      
     default:
       SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
     break;
     }
   }
-
+  
   // calulate jumpsize
   IntegerVector vartype4 = calc_jump(vartype3);
   kk = vartype4.size();
-
+  
   // 2. fill it with data
-
+  
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
-
+  
   uint32_t ii = 0;
   for(uint32_t j=0; j<nn; ++j)
   {
@@ -471,8 +471,8 @@ List read_pre13_dta(FILE * file, const bool missing,
     for (uint16_t i=0; i<kk; ++i)
     {
       int const type = vartype4[i];
-
-
+      
+      
       switch(((type >0) & (type < 244)) ? 244 : type)
       {
         // double
@@ -480,12 +480,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         double val_d = 0;
         val_d = readbin(val_d, file, swapit);
-
+        
         if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_d<STATA_DOUBLE_NA_MIN) | (val_d>STATA_DOUBLE_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_d;
-
+        
         break;
       }
         // float
@@ -493,12 +493,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         float val_f = 0;
         val_f = readbin(val_f, file, swapit);
-
+        
         if ((missing == FALSE) & ((val_f<STATA_FLOAT_NA_MIN) | (val_f>STATA_FLOAT_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_f;
-
+        
         break;
       }
         // long
@@ -506,13 +506,13 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         int32_t val_l = 0;
         val_l = readbin(val_l, file, swapit);
-
-
+        
+        
         if ((missing == FALSE) & ((val_l<STATA_INT_NA_MIN) | (val_l>STATA_INT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j]  = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
-
+        
         break;
       }
         // int
@@ -520,12 +520,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         int16_t val_i = 0;
         val_i = readbin(val_i, file, swapit);
-
+        
         if ((missing == FALSE) & ((val_i<STATA_SHORTINT_NA_MIN) | (val_i>STATA_SHORTINT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
-
+        
         break;
       }
         // byte
@@ -533,12 +533,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         int8_t val_b = 0;
         val_b = readbin(val_b, file, swapit);
-
+        
         if ((missing == FALSE) & ( (val_b<STATA_BYTE_NA_MIN) | (val_b>STATA_BYTE_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
-
+        
         break;
       }
         // strings with 244 or fewer characters
@@ -547,11 +547,11 @@ List read_pre13_dta(FILE * file, const bool missing,
         int32_t len = 0;
         len = vartype[i];
         std::string val_s (len, '\0');
-
+        
         readstring(val_s, file, val_s.size());
-
+        
         as<CharacterVector>(df[ii])[j] = val_s;
-
+        
         break;
       }
         // case < 0:
@@ -562,20 +562,20 @@ List read_pre13_dta(FILE * file, const bool missing,
         break;
       }
       }
-
+      
       if (type >= 0) ii += 1;
       Rcpp::checkUserInterrupt();
     }
   }
-
+  
   // skip to end of data part
   fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
-
+  
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
   df.attr("names") = varnames_kk;
   df.attr("class") = "data.frame";
-
+  
   /*
   * labels are seperated by <lbl>-tags. Labels may appear in any order e.g.
   * 2 "female" 1 "male 9 "missing". They are stored as tables.
@@ -585,39 +585,39 @@ List read_pre13_dta(FILE * file, const bool missing,
   * txtlen:   length of the label text.
   * off:      offset defines where to read a new label in txtlen.
   */
-
+  
   List labelList = List(); //put labels into this list
-
+  
   if (release>105) {
     // FixMe: the while statement differs and the final check
-
-
+    
+    
     int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0;
     std::string tag(5, '\0');
-
+    
     bool haslabel = false;
-
+    
     // length of value_label_table
     nlen = readbin(nlen, file, swapit);
-
+    
     if (!(feof(file) || ferror(file)))
       haslabel = true;
-
+    
     while(haslabel)
     {
-
+      
       // name of this label set
       std::string nlabname(lbllen, '\0');
-
+      
       readstring(nlabname, file, nlabname.size());
-
+      
       //padding
       fseek(file, 3, SEEK_CUR);
-
+      
       // value_label_table for actual label set
       labn = readbin(labn, file, swapit);
       txtlen = readbin(txtlen, file, swapit);
-
+      
       // offset for each label
       // off0 : label 0 starts at off0
       // off1 : label 1 starts at off1 ...
@@ -626,43 +626,43 @@ List read_pre13_dta(FILE * file, const bool missing,
         noff = readbin(noff, file, swapit);
         off[i] = noff;
       }
-
+      
       // needed for match
       IntegerVector laborder = clone(off);
       //laborder.erase(labn+1);
       IntegerVector labordersort = clone(off);
       //labordersort.erase(labn+1);
       std::sort(labordersort.begin(), labordersort.end());
-
+      
       // needs txtlen for loop
       off.push_back(txtlen);
-
+      
       // sort offsets so we can read labels sequentially
       std::sort(off.begin(), off.end());
-
+      
       // create an index to sort lables along the code values
       // this is done while factor creation
       IntegerVector indx(labn);
       indx = match(laborder,labordersort);
-
+      
       // code for each label
       IntegerVector code(labn);
       for (int i=0; i < labn; ++i) {
         val = readbin(val, file, swapit);
         code[i] = val;
       }
-
+      
       // label text
       CharacterVector label(labn);
       for (int i=0; i < labn; ++i) {
         int lablen = off[i+1]-off[i];
-
+        
         std::string lab (lablen, '\0');
-
+        
         readstring(lab, file, lablen);
         label[i] = lab;
       }
-
+      
       // sort labels according to indx
       CharacterVector labelo(labn);
       for (int i=0; i < labn; ++i) {
@@ -671,26 +671,26 @@ List read_pre13_dta(FILE * file, const bool missing,
       // create table for actual label set
       string const labset = nlabname;
       code.attr("names") = labelo;
-
+      
       // add this set to output list
       labelList.push_front( code, labset);
-
+      
       // length of value_label_table
       nlen = readbin(nlen, file, swapit);
-
+      
       if (feof(file) || ferror(file))
         break;
     }
   }
-
+  
   /*
    * assign attributes to the resulting data.frame
    */
-
+  
   formats = formats[select_c];
   valLabels = valLabels[select_c];
   varLabels = varLabels[select_c];
-
+  
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
   df.attr("formats") = formats;
@@ -702,6 +702,6 @@ List read_pre13_dta(FILE * file, const bool missing,
   df.attr("expansion.fields") = ch;
   df.attr("byteorder") = byteorderI;
   df.attr("orig.dim") = dim;
-
+  
   return df;
 }
diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R
index 2f68b42c..65b86d0a 100644
--- a/tests/testthat/test_save.R
+++ b/tests/testthat/test_save.R
@@ -364,7 +364,7 @@ test_that("add.rownames TRUE", {
   expect_true(identical(rownames(dd), rownames(dd104)))
   expect_true(identical(rownames(dd), rownames(dd103)))
   expect_true(identical(rownames(dd), rownames(dd102)))
-
+  
   # Check that data is identical
   expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))

From 522761a916fec552e780759a6dd428212cf2ab1c Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 08:31:59 +0200
Subject: [PATCH 26/76] Revert "whitespace changes"

This reverts commit 1a26822fcd7d1ef516e8c9633bf4a1188c0baa2c.
---
 R/read.R                   |  82 ++++-----
 inst/include/readstata.h   |  60 +++---
 src/read_dta.cpp           | 368 ++++++++++++++++++-------------------
 src/read_pre13_dta.cpp     | 280 ++++++++++++++--------------
 tests/testthat/test_save.R |   2 +-
 5 files changed, 396 insertions(+), 396 deletions(-)

diff --git a/R/read.R b/R/read.R
index bb717618..6a153b33 100644
--- a/R/read.R
+++ b/R/read.R
@@ -135,9 +135,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
   }
   if (!file.exists(filepath))
     return(message("File not found."))
-  
-  
-  
+
+
+
   # some select.row checks
   if (!is.null(select.rows)) {
     # check that it is a numeric
@@ -147,11 +147,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       # guard against negative values
       if (any(select.rows < 0) )
         select.rows <- abs(select.rows)
-      
+
       # check that lenght is not > 2
       if (length(select.rows) > 2)
         return(message("select.rows must be of length 1 or 2."))
-      
+
       # if lenght 1 start at row 1
       if (length(select.rows) == 1)
         select.rows <- c(1, select.rows)
@@ -159,7 +159,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     # reorder if 2 is bigger than 1
     if (select.rows[2] < select.rows[1])
       select.rows <- c(select.rows[2], select.rows[1])
-    
+
     # make sure to start at index position 1 if select.rows[2] > 0
     if (select.rows[2] > 0 & select.rows[1] == 0)
       select.rows[1] <- 1
@@ -167,15 +167,15 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     # set a value
     select.rows <- c(0,0)
   }
-  
+
   if (is.null(select.cols)){
     select.cols <- ""
   }
-  
+
   data <- stata_read(filepath, missing.type, select.rows, select.cols)
-  
+
   version <- attr(data, "version")
-  
+
   sstr     <- 2045
   sstrl    <- 32768
   sdouble  <- 65526
@@ -183,7 +183,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
   slong    <- 65528
   sint     <- 65529
   sbyte    <- 65530
-  
+
   if (version < 117) {
     sstr    <- 244
     sstrl   <- 255
@@ -193,20 +193,20 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     sint    <- 252
     sbyte   <- 251
   }
-  
+
   if (convert.underscore)
     names(data) <- gsub("_", ".", names(data))
-  
+
   types <- attr(data, "types")
   val.labels <- attr(data, "val.labels")
   label <- attr(data, "label.table")
-  
+
   if (missing.type) {
     stata.na <- data.frame(type = sdouble:sbyte,
                            min = c(101, 32741, 2147483621, 2 ^ 127, 2 ^ 1023),
                            inc = c(1, 1, 1, 2 ^ 115, 2 ^ 1011)
     )
-    
+
     if (version >= 113L & version < 117L) {
       missings <- vector("list", length(data))
       names(missings) <- names(data)
@@ -240,33 +240,33 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         warning("'missing.type' only applicable to version >= 8 files")
     }
   }
-  
+
   var.labels <- attr(data, "var.labels")
-  
+
   ## Encoding
   if(!is.null(encoding)) {
-    
+
     # set from encoding by dta version
     if(is.null(fromEncoding)) {
       fromEncoding <- "CP1252"
       if(attr(data, "version") >= 118L)
         fromEncoding <- "UTF-8"
     }
-    
+
     # varnames
     names(data) <- read.encoding(names(data), fromEncoding, encoding)
-    
+
     # var.labels
     attr(data, "var.labels") <- read.encoding(var.labels, fromEncoding,
                                               encoding)
-    
+
     # val.labels
     names(val.labels) <- read.encoding(val.labels, fromEncoding, encoding)
     attr(data, "val.labels") <- val.labels
-    
+
     # label
     names(label) <- read.encoding(names(label), fromEncoding, encoding)
-    
+
     if (length(label) > 0) {
       for (i in 1:length(label))  {
         names(label[[i]]) <- read.encoding(names(label[[i]]), fromEncoding,
@@ -274,12 +274,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       }
       attr(data, "label.table") <- label
     }
-    
+
     # recode character variables
     for (v in (1:ncol(data))[types <= sstr]) {
       data[, v] <- iconv(data[, v], from=fromEncoding, to=encoding, sub="byte")
     }
-    
+
     # expansion.field
     efi <- attr(data, "expansion.fields")
     if (length(efi) > 0) {
@@ -289,7 +289,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       }
       attr(data, "expansion.fields") <- efi
     }
-    
+
     if (version >= 117L) {
       #strl
       strl <- attr(data, "strl")
@@ -301,9 +301,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       }
     }
   }
-  
+
   var.labels <- attr(data, "var.labels")
-  
+
   if (replace.strl & version >= 117L) {
     strl <- c("")
     names(strl) <- "00000000000000000000"
@@ -314,19 +314,19 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     # if strls are in data.frame remove attribute strl
     attr(data, "strl") <- NULL
   }
-  
-  
+
+
   if (convert.dates) {
     convert_dt_c <- function(x)
       as.POSIXct((x + 0.1) / 1000, origin = "1960-01-01") # avoid rounding down
-    
+
     convert_dt_C <- function(x) {
       ls <- .leap.seconds + seq_along(.leap.seconds) + 315619200
       z <- (x + 0.1) / 1000 # avoid rounding down
       z <- z - rowSums(outer(z, ls, ">="))
       as.POSIXct(z, origin = "1960-01-01")
     }
-    
+
     ff <- attr(data, "formats")
     ## dates <- grep("%-*d", ff)
     ## Stata 12 introduced 'business dates'
@@ -336,16 +336,16 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     ## 'Stata has an old *%d* format notation and some datasets
     ##  still have them. Format *%d*... is equivalent to modern
     ##  format *%td*... and *%-d*... is equivalent to *%-td*...'
-    
+
     dates <- grep("^%(-|)(d|td)", ff)
     ## avoid as.Date in case strptime is messed up
     base <- structure(-3653L, class = "Date") # Stata dates are integer vars
     for (v in dates) data[[v]] <- structure(base + data[[v]], class = "Date")
-    
+
     for (v in grep("%tc", ff)) data[[v]] <- convert_dt_c(data[[v]])
     for (v in grep("%tC", ff)) data[[v]] <- convert_dt_C(data[[v]])
   }
-  
+
   if (convert.factors) {
     vnames <- names(data)
     for (i in seq_along(val.labels)) {
@@ -364,7 +364,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         varunique <- na.omit(unique(data[, i]))
         # assign label if label set is complete
         if (all(varunique %in% labtable)) {
-          
+
           #check for duplicated labels
           labcount <- table(names(labtable))
           if(any(labcount > 1)) {
@@ -373,28 +373,28 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
             # generate unique labels from assigned label and code number
             names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
           }
-          
+
           data[, i] <- factor(data[, i], levels=labtable,
                               labels=names(labtable))
           # else generate labels from codes
         } else if (generate.factors) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
-          
+
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
-          
+
         } else {
           warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
         }
       }
     }
   }
-  
+
   if (add.rownames) {
     rownames(data) <- data[[1]]
     data[[1]] <- NULL
   }
-  
+
   return(data)
 }
diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 129395b8..4ae5f58a 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -72,9 +72,9 @@ T readuint48( T t , FILE * file, bool swapit)
   } else if (ferror(file)){
     Rcpp::warning("num: a binary read error occurred.");
   }
-  
+
   t = *(uint64_t *)&uint48;
-  
+
   if (swapit==0)
     return(t);
   else
@@ -90,7 +90,7 @@ static void readstring(std::string &mystring, FILE * fp, int nchar)
 inline void test(std::string testme, FILE * file)
 {
   std::string test(testme.size(), '\0');
-  
+
   readstring(test,file, test.size());
   if (testme.compare(test)!=0)
   {
@@ -116,19 +116,19 @@ static void writebin(T t, std::fstream& dta, bool swapit)
 template <typename T>
 static void writestr(std::string val_s, T len, std::fstream& dta)
 {
-  
+
   std::stringstream val_stream;
   val_stream << std::left << std::setw(len) << std::setfill('\0') << val_s;
   std::string val_strl = val_stream.str();
-  
+
   dta.write(val_strl.c_str(),val_strl.length());
-  
+
 }
 
 inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
-  
+
   uint32_t k = vartype.size();
-  
+
   Rcpp::IntegerVector rlen(k);
   // calculate row length in byte
   for (uint32_t i=0; i<k; ++i)
@@ -157,7 +157,7 @@ inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
     break;
     }
   }
-  
+
   return(rlen);
 }
 
@@ -165,18 +165,18 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
                                   Rcpp::CharacterVector y)
 {
   Rcpp::IntegerVector mm = Rcpp::match(x, y);
-  
+
   if (Rcpp::any(Rcpp::is_na(mm))) {
     Rcpp::LogicalVector ll = !Rcpp::is_na(mm);
-    
+
     Rcpp::CharacterVector ms = x[ll==0];
-    
+
     Rcpp::Rcout << "Variable " <<  ms <<
       " was not found in dta-file." << std::endl;
-    
+
     mm = mm[ll==1];
   }
-  
+
   return(mm);
 }
 
@@ -189,29 +189,29 @@ inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
   for (uint32_t i=0; i<select.size(); ++i) {
     vec.erase(std::remove(vec.begin(), vec.end(), select(i)), vec.end());
   }
-  
+
   Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
   nselect = nselect -1;
-  
+
   return(nselect);
 }
 
 inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
-  
+
   // amount of
   Rcpp::IntegerVector vartype4;
   int64_t val = 0;
   bool last = 0;
-  
+
   uint32_t k = vartype3.size();
-  
+
   for (uint32_t i=0; i<k; ++i)
   {
-    
+
     int32_t value = vartype3(i);
-    
+
     if (value < 0) {
-      
+
       // after start or if last was pos fill to val
       if ( (i == 0) || (last == 1)) {
         val = value;
@@ -219,26 +219,26 @@ inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
         val += value;
       }
       last = 0;
-      
+
     } else {
-      
+
       // push back if last was neg
       if (i > 0 & last == 0)
         vartype4.push_back(val);
-      
+
       val = value;
       vartype4.push_back(val);
-      
+
       last = 1;
     }
-    
+
     if ((i+1 == k) & (last == 0)) {
       vartype4.push_back(val);
     }
-    
+
   }
-  
-  return(vartype4);
+
+    return(vartype4);
 }
 
 #endif
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index e9c706c6..94153e61 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -25,36 +25,36 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   // stata_dta><header>
   test("stata_dta><header>", file);
   test("<release>", file);
-  
+
   /*
   * version is a 4 byte character e.g. "117"
   */
-  
+
   int8_t fversion = 117L; //f = first
   int8_t lversion = 119L; //l = last
-  
+
   std::string version(3, '\0');
   readstring(version, file, version.size());
-  
+
   int8_t const release = atoi(version.c_str());
-  
+
   IntegerVector versionIV(1);
   versionIV(0) = release;
-  
+
   // check the release version.
   if (release<fversion || release>lversion)
   {
     Rcpp::warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release);
     return -1;
   }
-  
+
   uint8_t nvarnameslen = 0;
   int8_t nformatslen = 0;
   uint8_t nvalLabelslen = 0;
   uint16_t nvarLabelslen = 0;
   int32_t chlen = 0;
   uint8_t lbllen = 0;
-  
+
   switch(release)
   {
   case 117:
@@ -75,113 +75,113 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     lbllen = 129;
     break;
   }
-  
+
   // </release>
   test("</release>", file);
   test("<byteorder>", file);
-  
+
   /*
   * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data.
   */
-  
+
   std::string byteorder(3, '\0');
   readstring(byteorder,file, byteorder.size());
-  
+
   // </byteorder>
   test("</byteorder>", file);
   test("<K>", file);
-  
+
   bool swapit = 0;
   swapit = strcmp(byteorder.c_str(), sbyteorder);
-  
+
   /*
   * Number of Variables
   */
-  
+
   uint32_t k = 0;
   if (release < 119)
     k = readbin((uint16_t)k, file, swapit);
   if (release == 119)
     k = readbin(k, file, swapit);
-  
+
   //</K>
   test("</K>", file);
   test("<N>", file);
-  
+
   /*
   * Number of Observations
   */
-  
+
   uint64_t n = 0;
-  
+
   if (release == 117)
     n = readbin((uint32_t)n, file, swapit);
   if ((release == 118) | (release == 119))
     n = readbin(n, file, swapit);
-  
+
   //</N>
   test("</N>", file);
   test("<label>", file);
-  
+
   // dim to return original dim for partial read files
   IntegerVector dim(2);
   dim(0) = n;
   dim(1) = k;
-  
+
   /*
   * A dataset may have a label e.g. "Written by R".
   * First we read its length (ndlabel), later the actual label (datalabel).
   * ndlabel:   length of datalabel (excl. binary 0)
   * datalabel: string max length 80
   */
-  
+
   uint16_t ndlabel = 0;
-  
+
   if ((release == 118) | (release == 119))
     ndlabel = readbin(ndlabel, file, swapit);
   if (release == 117)
     ndlabel = readbin((int8_t)ndlabel, file, swapit);
-  
+
   std::string datalabel(ndlabel, '\0');
-  
+
   if (ndlabel>0)
   {
     readstring(datalabel, file, datalabel.size());
   } else {
     datalabel = "";
   }
-  
+
   CharacterVector datalabelCV(1);
   datalabelCV(0) = datalabel;
-  
+
   //</label>
   test("</label>", file);
   test("<timestamp>", file);
-  
+
   /*
   * A dataset may have a timestamp. If it has a timestamp the length of the
   * timestamp (ntimestamp) is 17. Else it is zero.
   * ntimestamp: 0 or 17
   * timestamp: empty or 17 byte string
   */
-  
+
   uint8_t ntimestamp = 0;
   ntimestamp = readbin(ntimestamp, file, swapit);
-  
+
   std::string timestamp(17, '\0');
-  
+
   if (ntimestamp == 17) // ntimestap is 0 or 17
   {
     readstring(timestamp, file, timestamp.size());
   } else {
     timestamp = "";
   }
-  
+
   CharacterVector timestampCV = timestamp;
   //</timestamp></header>
   test("</timestamp></header>", file);
   test("<map>", file);
-  
+
   /*
   * Stata stores the byteposition of certain areas of the file here. Currently
   * this is of no use to us.
@@ -200,7 +200,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * 13. </stata_data>
   * 14. end-of-file
   */
-  
+
   NumericVector map(14);
   for (int i=0; i <14; ++i)
   {
@@ -208,11 +208,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nmap = readbin(nmap, file, swapit);
     map[i] = nmap;
   }
-  
+
   //</map>
   test("</map>", file);
   test("<variable_types>", file);
-  
+
   /*
   * vartypes.
   * 0-2045: strf (String: Max length 2045)
@@ -223,7 +223,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * 65529:  int
   * 65530:  byte
   */
-  
+
   IntegerVector vartype(k);
   for (uint32_t i=0; i<k; ++i)
   {
@@ -231,37 +231,37 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nvartype = readbin(nvartype, file, swapit);
     vartype[i] = nvartype;
   }
-  
+
   //</variable_types>
   test("</variable_types>", file);
   test("<varnames>", file);
-  
+
   /*
   * varnames.
   */
-  
+
   std::string nvarnames(nvarnameslen, '\0');
-  
+
   CharacterVector varnames(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvarnames, file, nvarnames.size());
     varnames[i] = nvarnames;
   }
-  
+
   //</varnames>
   test("</varnames>", file);
   test("<sortlist>", file);
-  
+
   /*
   * sortlist. Stata stores the information which variable of a dataset was
   * sorted. Depending on byteorder sortlist is written different. Currently we
   * do not use this information.
   * Vector size is k+1.
   */
-  
+
   uint64_t big_k = k+1;
-  
+
   IntegerVector sortlist(big_k);
   for (uint64_t i=0; i<big_k; ++i)
   {
@@ -269,65 +269,65 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
   }
-  
+
   //</sortlist>
   test("</sortlist>", file);
   test("<formats>", file);
-  
+
   /*
   * formats handle how Stata prints a variable. Currently we do not use this
   * information.
   */
-  
+
   std::string nformats(nformatslen, '\0');
-  
+
   CharacterVector formats(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nformats, file, nformats.size());
     formats[i] = nformats;
   }
-  
+
   //</formats>
   test("</formats>", file);
   test("<value_label_names>",file);
-  
+
   /*
   * value_label_names. Stata stores variable labels by names.
   * nvalLabels: length of the value_label_name
   * valLabels:
   */
-  
+
   std::string nvalLabels(nvalLabelslen, '\0');
-  
+
   CharacterVector valLabels(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvalLabels, file, nvalLabels.size());
     valLabels[i] = nvalLabels;
   }
-  
+
   //</value_label_names>
   test("</value_label_names>", file);
   test("<variable_labels>", file);
-  
+
   /*
   * variabel_labels
   */
-  
+
   std::string nvarLabels (nvarLabelslen, '\0');
-  
+
   CharacterVector varLabels(k);
   for (uint32_t i=0; i<k; ++i)
   {
     readstring(nvarLabels, file, nvarLabels.size());
     varLabels[i] = nvarLabels;
   }
-  
+
   //</variable_labels>
   test("</variable_labels>", file);
   test("<characteristics>", file);
-  
+
   /*
   * characteristics. Stata can store additional information this way. It may
   * contain notes (for the dataset or a variable) or about label language sets.
@@ -338,89 +338,89 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * chcharact:    characteristicsname (binary 0 terminated)
   * nnocharacter: contes (binary 0 terminated)
   */
-  
+
   std::string chtag = "<ch>";
-  
+
   std::string tago(4, '\0');
   readstring(tago, file, tago.size());
-  
+
   List ch = List();
   CharacterVector chs(3);
-  
+
   while (chtag.compare(tago)==0)
   {
     uint32_t nocharacter = 0;
     nocharacter = readbin(nocharacter, file, swapit);
-    
+
     std::string chvarname(chlen, '\0');
     std::string chcharact(chlen, '\0');
     std::string nnocharacter(nocharacter-chlen*2, '\0');
-    
+
     readstring(chvarname, file, chvarname.size());
     readstring(chcharact, file, chcharact.size());
     readstring(nnocharacter, file, nnocharacter.size());
-    
+
     // chs vector
     CharacterVector chs(3);
     chs[0] = chvarname;
     chs[1] = chcharact;
     chs[2] = nnocharacter;
-    
+
     // add characteristics to the list
     ch.push_front( chs );
-    
+
     // </ch>
     test("</ch>", file);
-    
+
     // read next tag
     readstring(tago, file, tago.size());
   }
-  
+
   //[</ch]aracteristics>
   test("aracteristics>", file);
   test("<data>", file);
-  
+
   /*
   * data. First a list is created with vectors. The vector type is defined by
   * vartype. Stata stores data columnwise so we loop over it and store the
   * data in the list of the first step. Third variable- and row-names are
   * attatched and the list type is changed to data.frame.
   */
-  
+
   uint64_t nmin = selectrows(0);
   uint64_t nmax = selectrows(1);
   uint64_t nn   = 0;
-  
+
   // if  selectrows is c(0,0) use full data
   if ((nmin == 0) && (nmax == 0)){
     nmin = 1;
     nmax = n;
   }
-  
+
   // make sure that n is not greater nmax
   if (n < nmax)
     nmax = n;
-  
+
   // neither should nmin be greater
   if (n < nmin)
     nmin = n;
-  
+
   Rcpp::IntegerVector cvec = seq(1, k);
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
-  
+
   // use c indexing starting at 0
   nmin = nmin -1;
   nmax = nmax -1;
-  
+
   // calculate length of variables and of row
   IntegerVector rlen = calc_rowlength(vartype);
   uint64_t rlength = sum(rlen);
-  
+
   // check if vars are selected
   std::string selcols = as<std::string>(selectcols(0));
   bool noselectvars = selcols == "";
-  
+
   // select vars: either select every var or only matched cases
   IntegerVector select;
   if (noselectvars) {
@@ -428,59 +428,59 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   } else {
     select = choose(selectcols, varnames);
   }
-  
+
   // match returns r index
   IntegerVector select_c = select -1;
-  
+
   uint32_t kk = select.size();
-  
+
   // shrink variables
   CharacterVector varnames_kk = varnames[select_c];
   IntegerVector vartype_kk = vartype[select_c];
   IntegerVector vartype3 = vartype;
-  
+
   IntegerVector nselect = which_pos(cvec, select);
-  
+
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
-  
+
   vartype3[nselect] = rlen2;
-  
-  
+
+
   // 1. create the list
   List df(kk);
   for (uint32_t i=0; i<kk; ++i)
   {
     int const type = vartype_kk[i];
-    
+
     switch(type)
     {
     case STATA_DOUBLE:
     case STATA_FLOAT:
       SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
       break;
-      
+
     case STATA_INT:
     case STATA_SHORTINT:
     case STATA_BYTE:
       SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
       break;
-      
+
     default:
       SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
     break;
     }
   }
-  
+    
   // calulate jumpsize
   IntegerVector vartype4 = calc_jump(vartype3);
   kk = vartype4.size();
-  
+
   // 2. fill it with data
-  
+
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
-  
+
   uint32_t ii = 0;
   for(uint64_t j=0; j<nn; ++j)
   {
@@ -489,7 +489,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     for (uint32_t i=0; i<kk; ++i)
     {
       int const type = vartype4[i];
-      
+
       switch(((type >0) & (type < 2046)) ? 2045 : type)
       {
         // double
@@ -497,12 +497,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         double val_d = 0;
         val_d = readbin(val_d, file, swapit);
-        
+
         if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_d;
-        
+
         break;
       }
         // float
@@ -510,12 +510,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         float val_f = 0;
         val_f = readbin(val_f, file, swapit);
-        
+
         if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_f;
-        
+
         break;
       }
         // long
@@ -523,12 +523,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         int32_t val_l = 0;
         val_l = readbin(val_l, file, swapit);
-        
+
         if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
-        
+
         break;
       }
         // int
@@ -536,12 +536,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         int16_t val_i = 0;
         val_i = readbin(val_i, file, swapit);
-        
+
         if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
-        
+
         break;
       }
         // byte
@@ -549,12 +549,12 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       {
         int8_t val_b = 0;
         val_b = readbin(val_b, file, swapit);
-        
+
         if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
-        
+
         break;
       }
         // strings with 2045 or fewer characters
@@ -563,7 +563,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         int32_t len = 0;
         len = vartype[i];
         std::string val_s (len, '\0');
-        
+
         readstring(val_s, file, val_s.size());
         as<CharacterVector>(df[ii])[j] = val_s;
         break;
@@ -571,78 +571,78 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         // string of any length
       case STATA_STRL:
       {// strL 2*4bit or 2 + 6 bit
-        
+
         // FixMe: Strl in 118
         switch (release)
       {
-        
+
       case 117:
       {
         uint32_t v = 0, o = 0;
-        
+
         v = readbin(v, file, swapit);
         o = readbin(o, file, swapit);
-        
+
         stringstream val_stream;
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
-        
+
         as<CharacterVector>(df[ii])[j] = val_strl;
-        
+
         break;
       }
       case 118:
       {
         int16_t v = 0;
         int64_t o = 0, z = 0;
-        
+
         z = readbin(z, file, swapit);
-        
+
         // works for LSF on little- and big-endian
         if (byteorder.compare("LSF")==0) {
           v = (int16_t)z;
           o = (z >> 16);
         }
-        
+
         // works if we read a big-endian file on little-endian
         if (byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 16) - 1);
           o = z & ((1 << 16) - 1);
         }
-        
+
         stringstream val_stream;
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
-        
+
         as<CharacterVector>(df[ii])[j] = val_strl;
-        
+
         break;
       }
       case 119:
       {
         int32_t v = 0;
         int64_t o = 0, z = 0;
-        
+
         z = readbin(z, file, swapit);
-        
+
         // works for LSF on little- and big-endian
         if (byteorder.compare("LSF")==0) {
           v = (int32_t)z & ((1 << 24) - 1);
           o = (z >> 24);
         }
-        
+
         // FixMe: works if we read a big-endian file on little-endian
         if (byteorder.compare("MSF")==0) {
           v = (z >> 48) & ((1 << 24) - 1);
           o = z & ((1 << 24) - 1);
         }
-        
+
         stringstream val_stream;
         val_stream << v << '_' << o;
         string val_strl = val_stream.str();
-        
+
         as<CharacterVector>(df[ii])[j] = val_strl;
-        
+
         break;
       }
       }
@@ -656,25 +656,25 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         break;
       }
       }
-      
+
       if (type >= 0) ii += 1;
-      
+
       Rcpp::checkUserInterrupt();
     }
   }
-  
+
   // skip to end of data part
   fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
-  
+
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
   df.attr("names") = varnames_kk;
   df.attr("class") = "data.frame";
-  
+
   //</data>
   test("</data>", file);
   test("<strls>", file);
-  
+
   /*
   * strL. Stata 13 introduced long strings up to 2 billon characters. strLs are
   * sperated by "GSO".
@@ -683,35 +683,35 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * len:   length of the strL.
   * strl:  long string.
   */
-  
+
   std::string gso = "GSO";
-  
+
   std::string tags(3, '\0');
   readstring(tags, file, tags.size());
-  
+
   //put strLs into a named vector
   CharacterVector strlvalues(0);
   CharacterVector strlnames(0);
-  
+
   while(gso.compare(tags)==0)
   {
     CharacterVector strls(2);
     string ref;
-    
+
     // FixMe: Strl in 118
     switch (release)
     {
     case 117:
     {
       uint32_t v = 0, o = 0;
-      
+
       v = readbin(v, file, swapit);
       o = readbin(o, file, swapit);
-      
+
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-      
+
       break;
     }
     case 118:
@@ -719,44 +719,44 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     {
       uint32_t v = 0;
       uint64_t o = 0;
-      
+
       v = readbin(v, file, swapit);
       o = readbin(o, file, swapit);
-      
+
       stringstream val_stream;
       val_stream << v << '_' << o;
       ref.assign(val_stream.str());
-      
+
       break;
     }
     }
-    
+
     // (129 = binary) | (130 = ascii)
     uint8_t t = 0;
     t = readbin(t, file, swapit);
-    
+
     uint32_t len = 0;
     len = readbin(len, file, swapit);
-    
+
     // 129 len = len; 130 len = len +'\0';
-    
+
     std::string strl(len, '\0');
     readstring(strl, file, strl.size());
-    
+
     strlvalues.push_back( strl );
     strlnames.push_back( ref );
-    
+
     readstring(tags, file, tags.size());
   }
-  
+
   // set identifier as name
   strlvalues.attr("names") = strlnames;
-  
+
   // after strls
   //[</s]trls>
   test("trls>", file);
   test("<value_labels>", file);
-  
+
   /*
   * labels are seperated by <lbl>-tags. Labels may appear in any order e.g.
   * 2 "female" 1 "male 9 "missing". They are stored as tables.
@@ -766,34 +766,34 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * txtlen:   length of the label text.
   * off:      offset defines where to read a new label in txtlen.
   */
-  
+
   std::string lbltag = "<lbl>";
-  
+
   std::string tag(5, '\0');
   readstring(tag, file, tag.size());
-  
+
   List labelList = List(); //put labels into this list
-  
+
   while(lbltag.compare(tag)==0)
   {
     int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0;
-    
+
     // length of value_label_table
     nlen = readbin(nlen, file, swapit);
-    
+
     // name of this label set
-    
+
     std::string nlabname(lbllen, '\0');
-    
+
     readstring(nlabname, file, nlabname.size());
-    
+
     //padding
     fseek(file, 3, SEEK_CUR);
-    
+
     // value_label_table for actual label set
     labn = readbin(labn, file, swapit);
     txtlen = readbin(txtlen, file, swapit);
-    
+
     // offset for each label
     // off0 : label 0 starts at off0
     // off1 : label 1 starts at off1 ...
@@ -802,43 +802,43 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       noff = readbin(noff, file, swapit);
       off[i] = noff;
     }
-    
+
     // needed for match
     IntegerVector laborder = clone(off);
     //laborder.erase(labn+1);
     IntegerVector labordersort = clone(off);
     //labordersort.erase(labn+1);
     std::sort(labordersort.begin(), labordersort.end());
-    
+
     // needs txtlen for loop
     off.push_back(txtlen);
-    
+
     // sort offsets so we can read labels sequentially
     std::sort(off.begin(), off.end());
-    
+
     // create an index to sort lables along the code values
     // this is done while factor creation
     IntegerVector indx(labn);
     indx = match(laborder,labordersort);
-    
+
     // code for each label
     IntegerVector code(labn);
     for (int i=0; i < labn; ++i) {
       val = readbin(val, file, swapit);
       code[i] = val;
     }
-    
+
     // label text
     CharacterVector label(labn);
     for (int i=0; i < labn; ++i) {
       int lablen = off[i+1]-off[i];
-      
+
       std::string lab (lablen, '\0');
-      
+
       readstring(lab, file, lablen);
       label[i] = lab;
     }
-    
+
     // sort labels according to indx
     CharacterVector labelo(labn);
     for (int i=0; i < labn; ++i) {
@@ -847,32 +847,32 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     // create table for actual label set
     string const labset = nlabname;
     code.attr("names") = labelo;
-    
+
     // add this set to output list
     labelList.push_front( code, labset);
-    
+
     fseek(file, 6, SEEK_CUR); //</lbl>
-    
+
     readstring(tag, file, tag.size());
   }
-  
+
   /*
-  * Final test if we reached the end of the file
-  * close the file
-  */
-  
+   * Final test if we reached the end of the file
+   * close the file
+   */
+
   // [</val]ue_labels>
   test("ue_labels>", file);
   test("</stata_dta>", file);
-  
+
   /*
    * assign attributes to the resulting data.frame
    */
-  
+
   formats = formats[select_c];
   valLabels = valLabels[select_c];
   varLabels = varLabels[select_c];
-  
+
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
   df.attr("formats") = formats;
@@ -885,6 +885,6 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   df.attr("strl") = strlvalues;
   df.attr("byteorder") = wrap(byteorder);
   df.attr("orig.dim") = dim;
-  
+
   return df;
 }
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 70bfe4e8..fdf23e49 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -25,20 +25,20 @@ List read_pre13_dta(FILE * file, const bool missing,
                     const CharacterVector selectcols)
 {
   int8_t release = 0;
-  
+
   rewind(file);
   release = readbin(release, file, 0);
-  
+
   if (release<102 || release == 109 || release>115)
     Rcpp::stop("First byte: Not a dta-file we can read.");
-  
+
   IntegerVector versionIV(1);
   versionIV(0) = release;
-  
+
   /*
   * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data.
   */
-  
+
   uint16_t ndlabel = 81;
   uint8_t nvarnameslen = 33;
   int8_t nformatslen = 49;
@@ -46,7 +46,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   uint16_t nvarLabelslen = 81;
   int32_t chlen = 33;
   uint8_t lbllen = 33;
-  
+
   switch(release)
   {
   case 102:
@@ -87,70 +87,70 @@ List read_pre13_dta(FILE * file, const bool missing,
     nformatslen = 12;
     break;
   }
-  
+
   CharacterVector byteorderC(1);
   IntegerVector byteorderI(1);
   bool swapit = 0;
-  
+
   int8_t byteorder = 0;
   byteorder = readbin(byteorder, file, 0);
   // 1 = MSF 2 = LSF
   swapit = std::abs(SBYTEORDER-byteorder);
   byteorderI(0) = byteorder;
-  
+
   // filetype: unnown?
   int8_t ft = 0;
   ft = readbin(ft, file, swapit);
-  
+
   int8_t unused = 0;
   unused = readbin(unused, file, swapit);
-  
-  
+
+
   /*
   * Number of Variables
   */
-  
+
   uint16_t k = 0;
   k = readbin(k, file, swapit);
-  
-  
+
+
   /*
   * Number of Observations
   */
-  
+
   uint32_t n = 0;
   n = readbin(n, file, swapit);
-  
+
   // dim to return original dim for partial read files
   IntegerVector dim(2);
   dim(0) = n;
   dim(1) = k;
-  
+
   /*
   * A dataset may have a label e.g. "Written by R".
   * First we read its length (ndlabel), later the actual label (datalabel).
   * ndlabel:   length of datalabel (excl. binary 0)
   * datalabel: string max length 80
   */
-  
-  
+
+
   CharacterVector datalabelCV(1);
-  
+
   std::string datalabel(ndlabel, '\0');
-  
+
   if (ndlabel > 0)
     readstring(datalabel, file, datalabel.size());
   else
     datalabel = "";
-  
+
   datalabelCV(0) = datalabel;
-  
+
   CharacterVector timestampCV(1);
   std::string timestamp(18, '\0');
-  
+
   switch (release)
   {
-    
+
   case 102:
   case 103:
   case 104:
@@ -158,16 +158,16 @@ List read_pre13_dta(FILE * file, const bool missing,
     timestamp = "";
     break;
   }
-    
+
   default:
   {
     readstring(timestamp, file, timestamp.size());
     break;
   }
   }
-  
+
   timestampCV(0) = timestamp;
-  
+
   /*
   * vartypes.
   * 0-2045: strf (String: Max length 2045)
@@ -178,12 +178,12 @@ List read_pre13_dta(FILE * file, const bool missing,
   * 65529:  int
   * 65530:  byte
   */
-  
+
   IntegerVector vartype(k);
-  
+
   switch (release)
   {
-    
+
   case 102:
   case 103:
   case 104:
@@ -195,11 +195,11 @@ List read_pre13_dta(FILE * file, const bool missing,
   case 112:
   {
     uint8_t nvartypec = 0;
-    
+
     for (uint16_t i=0; i<k; ++i)
     {
       nvartypec = readbin(nvartypec, file, swapit);
-      
+
       if(nvartypec== 98) // b
         vartype[i] = 251;
       if(nvartypec==105) // i
@@ -215,14 +215,14 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
     break;
   }
-    
+
   case 111:
   case 113:
   case 114:
   case 115:
   {
     uint8_t nvartype = 0;
-    
+
     for (uint16_t i=0; i<k; ++i)
     {
       nvartype = readbin(nvartype, file, swapit);
@@ -230,34 +230,34 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
     break;
   }
-    
+
   }
-  
+
   // FixMe: Needs clone otherwise missing.type would not work
   IntegerVector types = clone(vartype);
-  
+
   /*
   * varnames. Max length 33.
   */
-  
+
   std::string nvarnames(nvarnameslen, '\0');
-  
+
   CharacterVector varnames(k);
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nvarnames, file, nvarnames.size());
     varnames[i] = nvarnames;
   }
-  
+
   /*
   * sortlist. Stata stores the information which variable of a dataset was
   * sorted. Depending on byteorder sortlist is written different. Currently we
   * do not use this information.
   * Vector size is k+1.
   */
-  
+
   uint32_t big_k = k+1;
-  
+
   IntegerVector sortlist(big_k);
   for (uint32_t i=0; i<big_k; ++i)
   {
@@ -265,100 +265,100 @@ List read_pre13_dta(FILE * file, const bool missing,
     nsortlist = readbin(nsortlist, file, swapit);
     sortlist[i] = nsortlist;
   }
-  
+
   /*
   * formats handle how Stata prints a variable. Currently we do not use this
   * information.
   */
-  
+
   CharacterVector formats(k);
   std::string nformats(nformatslen, '\0');
-  
+
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nformats, file, nformats.size());
     formats[i] = nformats;
   }
-  
+
   /*
   * value_label_names. Stata stores variable labels by names.
   * nvalLabels: length of the value_label_name
   * valLabels:  Char of max length 33
   */
-  
+
   CharacterVector valLabels(k);
   std::string nvalLabels(nvalLabelslen, '\0');
-  
+
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nvalLabels, file, nvalLabels.size());
     valLabels[i] = nvalLabels;
   }
-  
+
   /*
   * variabel_labels
   */
-  
+
   CharacterVector varLabels(k);
   std::string nvarLabels (nvarLabelslen, '\0');
-  
+
   for (uint16_t i=0; i<k; ++i)
   {
     readstring(nvarLabels, file, nvarLabels.size());
     varLabels[i] = nvarLabels;
   }
-  
+
   /* <characteristics> ... </characteristics> */
-  
+
   List ch = List();
   if (release > 104)
   {
     int8_t datatype = 0;
     uint32_t len = 0;
-    
+
     datatype = readbin(datatype, file, swapit);
     if (release <= 108)
       len = readbin((uint16_t)len, file, swapit);
     else
       len = readbin(len, file, swapit);
-    
-    
+
+
     while (!(datatype==0) && !(len==0))
     {
       std::string chvarname(chlen, '\0');
       std::string chcharact(chlen, '\0');
       std::string nnocharacter(len-chlen*2, '\0');
-      
+
       readstring(chvarname, file, chvarname.size());
       readstring(chcharact, file, chcharact.size());
       readstring(nnocharacter, file, nnocharacter.size());
-      
+
       // chs vector
       CharacterVector chs(3);
       chs[0] = chvarname;
       chs[1] = chcharact;
       chs[2] = nnocharacter;
-      
+
       // add characteristics to the list
       ch.push_front( chs );
-      
+
       datatype = readbin(datatype, file, swapit);
-      
+
       if (release <= 108)
         len = readbin((uint16_t)len, file, swapit);
       else
         len = readbin(len, file, swapit);
     }
   }
-  
-  
+
+
   /*
   * data. First a list is created with vectors. The vector type is defined by
   * vartype. Stata stores data columnwise so we loop over it and store the
   * data in the list of the first step. Third variable- and row-names are
   * attatched and the list type is changed to data.frame.
   */
-  
+
   /* replace vartypes of Stata 8 - 12 with Stata 13 values. */
   // 117 contains new variable types (longer strings and strL)
   std::replace (vartype.begin(), vartype.end(), 251, STATA_BYTE);
@@ -366,42 +366,42 @@ List read_pre13_dta(FILE * file, const bool missing,
   std::replace (vartype.begin(), vartype.end(), 253, STATA_INT);
   std::replace (vartype.begin(), vartype.end(), 254, STATA_FLOAT);
   std::replace (vartype.begin(), vartype.end(), 255, STATA_DOUBLE);
-  
-  
+
+
   uint64_t nmin = selectrows(0);
   uint64_t nmax = selectrows(1);
   uint64_t nn   = 0;
-  
+
   // if  selectrows is c(0,0) use full data
   if ((nmin == 0) && (nmax == 0)){
     nmin = 1;
     nmax = n;
   }
-  
+
   // make sure that n is not greater nmax
   if (n < nmax)
     nmax = n;
-  
+
   // neither should nmin be greater
   if (n < nmin)
     nmin = n;
-  
+
   Rcpp::IntegerVector cvec = seq(1, k);
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
-  
+
   // use c indexing starting at 0
   nmin = nmin -1;
   nmax = nmax -1;
-  
+
   // calculate length of variables and of row
   IntegerVector rlen = calc_rowlength(vartype);
   uint64_t rlength = sum(rlen);
-  
+
   // check if vars are selected
   std::string selcols = as<std::string>(selectcols(0));
   bool noselectvars = selcols == "";
-  
+
   // select vars: either select every var or only matched cases
   IntegerVector select;
   if (noselectvars) {
@@ -409,60 +409,60 @@ List read_pre13_dta(FILE * file, const bool missing,
   } else {
     select = choose(selectcols, varnames);
   }
-  
+
   // match returns r index
   IntegerVector select_c = select -1;
-  
+
   uint32_t kk = select.size();
-  
+
   // shrink variables
   CharacterVector varnames_kk = varnames[select_c];
   IntegerVector vartype_kk = vartype[select_c];
   IntegerVector types_kk = types[select_c];
   IntegerVector vartype3 = vartype;
-  
-  
+
+
   IntegerVector nselect = which_pos(cvec, select);
-  
+
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
-  
+
   vartype3[nselect] = rlen2;
-  
+
   // 1. create the list
   List df(kk);
   for (uint32_t i=0; i<kk; ++i)
   {
     int const type = vartype_kk[i];
-    
+
     switch(type)
     {
     case STATA_DOUBLE:
     case STATA_FLOAT:
       SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
       break;
-      
+
     case STATA_INT:
     case STATA_SHORTINT:
     case STATA_BYTE:
       SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
       break;
-      
+
     default:
       SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
     break;
     }
   }
-  
+
   // calulate jumpsize
   IntegerVector vartype4 = calc_jump(vartype3);
   kk = vartype4.size();
-  
+
   // 2. fill it with data
-  
+
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
-  
+
   uint32_t ii = 0;
   for(uint32_t j=0; j<nn; ++j)
   {
@@ -471,8 +471,8 @@ List read_pre13_dta(FILE * file, const bool missing,
     for (uint16_t i=0; i<kk; ++i)
     {
       int const type = vartype4[i];
-      
-      
+
+
       switch(((type >0) & (type < 244)) ? 244 : type)
       {
         // double
@@ -480,12 +480,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         double val_d = 0;
         val_d = readbin(val_d, file, swapit);
-        
+
         if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_d<STATA_DOUBLE_NA_MIN) | (val_d>STATA_DOUBLE_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_d;
-        
+
         break;
       }
         // float
@@ -493,12 +493,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         float val_f = 0;
         val_f = readbin(val_f, file, swapit);
-        
+
         if ((missing == FALSE) & ((val_f<STATA_FLOAT_NA_MIN) | (val_f>STATA_FLOAT_NA_MAX)) )
           REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
         else
           REAL(VECTOR_ELT(df,ii))[j] = val_f;
-        
+
         break;
       }
         // long
@@ -506,13 +506,13 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         int32_t val_l = 0;
         val_l = readbin(val_l, file, swapit);
-        
-        
+
+
         if ((missing == FALSE) & ((val_l<STATA_INT_NA_MIN) | (val_l>STATA_INT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j]  = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
-        
+
         break;
       }
         // int
@@ -520,12 +520,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         int16_t val_i = 0;
         val_i = readbin(val_i, file, swapit);
-        
+
         if ((missing == FALSE) & ((val_i<STATA_SHORTINT_NA_MIN) | (val_i>STATA_SHORTINT_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
-        
+
         break;
       }
         // byte
@@ -533,12 +533,12 @@ List read_pre13_dta(FILE * file, const bool missing,
       {
         int8_t val_b = 0;
         val_b = readbin(val_b, file, swapit);
-        
+
         if ((missing == FALSE) & ( (val_b<STATA_BYTE_NA_MIN) | (val_b>STATA_BYTE_NA_MAX)) )
           INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
         else
           INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
-        
+
         break;
       }
         // strings with 244 or fewer characters
@@ -547,11 +547,11 @@ List read_pre13_dta(FILE * file, const bool missing,
         int32_t len = 0;
         len = vartype[i];
         std::string val_s (len, '\0');
-        
+
         readstring(val_s, file, val_s.size());
-        
+
         as<CharacterVector>(df[ii])[j] = val_s;
-        
+
         break;
       }
         // case < 0:
@@ -562,20 +562,20 @@ List read_pre13_dta(FILE * file, const bool missing,
         break;
       }
       }
-      
+
       if (type >= 0) ii += 1;
       Rcpp::checkUserInterrupt();
     }
   }
-  
+
   // skip to end of data part
   fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
-  
+
   // 3. Create a data.frame
   df.attr("row.names") = rvec;
   df.attr("names") = varnames_kk;
   df.attr("class") = "data.frame";
-  
+
   /*
   * labels are seperated by <lbl>-tags. Labels may appear in any order e.g.
   * 2 "female" 1 "male 9 "missing". They are stored as tables.
@@ -585,39 +585,39 @@ List read_pre13_dta(FILE * file, const bool missing,
   * txtlen:   length of the label text.
   * off:      offset defines where to read a new label in txtlen.
   */
-  
+
   List labelList = List(); //put labels into this list
-  
+
   if (release>105) {
     // FixMe: the while statement differs and the final check
-    
-    
+
+
     int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0;
     std::string tag(5, '\0');
-    
+
     bool haslabel = false;
-    
+
     // length of value_label_table
     nlen = readbin(nlen, file, swapit);
-    
+
     if (!(feof(file) || ferror(file)))
       haslabel = true;
-    
+
     while(haslabel)
     {
-      
+
       // name of this label set
       std::string nlabname(lbllen, '\0');
-      
+
       readstring(nlabname, file, nlabname.size());
-      
+
       //padding
       fseek(file, 3, SEEK_CUR);
-      
+
       // value_label_table for actual label set
       labn = readbin(labn, file, swapit);
       txtlen = readbin(txtlen, file, swapit);
-      
+
       // offset for each label
       // off0 : label 0 starts at off0
       // off1 : label 1 starts at off1 ...
@@ -626,43 +626,43 @@ List read_pre13_dta(FILE * file, const bool missing,
         noff = readbin(noff, file, swapit);
         off[i] = noff;
       }
-      
+
       // needed for match
       IntegerVector laborder = clone(off);
       //laborder.erase(labn+1);
       IntegerVector labordersort = clone(off);
       //labordersort.erase(labn+1);
       std::sort(labordersort.begin(), labordersort.end());
-      
+
       // needs txtlen for loop
       off.push_back(txtlen);
-      
+
       // sort offsets so we can read labels sequentially
       std::sort(off.begin(), off.end());
-      
+
       // create an index to sort lables along the code values
       // this is done while factor creation
       IntegerVector indx(labn);
       indx = match(laborder,labordersort);
-      
+
       // code for each label
       IntegerVector code(labn);
       for (int i=0; i < labn; ++i) {
         val = readbin(val, file, swapit);
         code[i] = val;
       }
-      
+
       // label text
       CharacterVector label(labn);
       for (int i=0; i < labn; ++i) {
         int lablen = off[i+1]-off[i];
-        
+
         std::string lab (lablen, '\0');
-        
+
         readstring(lab, file, lablen);
         label[i] = lab;
       }
-      
+
       // sort labels according to indx
       CharacterVector labelo(labn);
       for (int i=0; i < labn; ++i) {
@@ -671,26 +671,26 @@ List read_pre13_dta(FILE * file, const bool missing,
       // create table for actual label set
       string const labset = nlabname;
       code.attr("names") = labelo;
-      
+
       // add this set to output list
       labelList.push_front( code, labset);
-      
+
       // length of value_label_table
       nlen = readbin(nlen, file, swapit);
-      
+
       if (feof(file) || ferror(file))
         break;
     }
   }
-  
+
   /*
    * assign attributes to the resulting data.frame
    */
-  
+
   formats = formats[select_c];
   valLabels = valLabels[select_c];
   varLabels = varLabels[select_c];
-  
+
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
   df.attr("formats") = formats;
@@ -702,6 +702,6 @@ List read_pre13_dta(FILE * file, const bool missing,
   df.attr("expansion.fields") = ch;
   df.attr("byteorder") = byteorderI;
   df.attr("orig.dim") = dim;
-  
+
   return df;
 }
diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R
index 65b86d0a..2f68b42c 100644
--- a/tests/testthat/test_save.R
+++ b/tests/testthat/test_save.R
@@ -364,7 +364,7 @@ test_that("add.rownames TRUE", {
   expect_true(identical(rownames(dd), rownames(dd104)))
   expect_true(identical(rownames(dd), rownames(dd103)))
   expect_true(identical(rownames(dd), rownames(dd102)))
-  
+
   # Check that data is identical
   expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))

From 7ec39be72f5a89e7c1f651d75295b445f413edc1 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 08:33:30 +0200
Subject: [PATCH 27/76] the actual whitespace fix I was intending

---
 inst/include/readstata.h | 2 +-
 src/read_dta.cpp         | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 4ae5f58a..6ef64623 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -238,7 +238,7 @@ inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
 
   }
 
-    return(vartype4);
+  return(vartype4);
 }
 
 #endif
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 94153e61..6c10f09f 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -471,7 +471,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     break;
     }
   }
-    
+
   // calulate jumpsize
   IntegerVector vartype4 = calc_jump(vartype3);
   kk = vartype4.size();
@@ -857,9 +857,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   }
 
   /*
-   * Final test if we reached the end of the file
-   * close the file
-   */
+  * Final test if we reached the end of the file
+  * close the file
+  */
 
   // [</val]ue_labels>
   test("ue_labels>", file);

From bc8e2d52e76d5bcf168627732d195f5f823f1fb9 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 08:36:58 +0200
Subject: [PATCH 28/76] pedantic

---
 inst/include/readstata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 6ef64623..b1c3d5b5 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -223,7 +223,7 @@ inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
     } else {
 
       // push back if last was neg
-      if (i > 0 & last == 0)
+      if ((i > 0) & (last == 0))
         vartype4.push_back(val);
 
       val = value;

From ff85b8332c30fa1940c1c8193203358b0d9d3930 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 08:50:07 +0200
Subject: [PATCH 29/76] comment code

---
 inst/include/readstata.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index b1c3d5b5..6b159b6c 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -161,6 +161,9 @@ inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
   return(rlen);
 }
 
+// return only the matched positions. Either Rcpps in() can't handle Character-
+// Vectors or I could not make it work. Wanted to select the selected varname
+// position from the varnames vector.
 inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
                                   Rcpp::CharacterVector y)
 {
@@ -180,25 +183,32 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
   return(mm);
 }
 
-
+// return only the positions of variables, we have selected.
 inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
                                      Rcpp::IntegerVector select)
 {
   // integer position of not selected variables
+  // This drops all the positions we do not need. Initially I wanted something
+  // like cvec[select], but that somehow did not work, possibly this could be
+  // improved.
   std::vector<int> vec = Rcpp::as< std::vector<int> >(cvec);
   for (uint32_t i=0; i<select.size(); ++i) {
     vec.erase(std::remove(vec.begin(), vec.end(), select(i)), vec.end());
   }
-
   Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
+
+  // return to C-index
   nselect = nselect -1;
 
   return(nselect);
 }
 
+// calculate the maximum jump. This calculates the maximum space we can skip if
+// reading only a single variable. Before we skipped over each variable. Now we
+// skip over them combined. Therefore if a value in vartype3 is positive push it
+// into a new vector. If negative, sum the length up.
 inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
 
-  // amount of
   Rcpp::IntegerVector vartype4;
   int64_t val = 0;
   bool last = 0;

From 8141362b44bbe5de225042cc8e08f445e7e77ae3 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 09:28:26 +0200
Subject: [PATCH 30/76] comments

---
 inst/include/readstata.h |  9 ++++---
 src/read_dta.cpp         | 55 ++++++++++++++++++--------------------
 src/read_pre13_dta.cpp   | 57 ++++++++++++++++++----------------------
 3 files changed, 57 insertions(+), 64 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 6b159b6c..791cb920 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -167,6 +167,9 @@ inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
 inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
                                   Rcpp::CharacterVector y)
 {
+  // ToDo: Maybe we can skip the select and nselect in read_dta.cpp if we match
+  // the other way around and use Rcpp::is_na on the result which then could be
+  // used as an additional index
   Rcpp::IntegerVector mm = Rcpp::match(x, y);
 
   if (Rcpp::any(Rcpp::is_na(mm))) {
@@ -180,6 +183,9 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
     mm = mm[ll==1];
   }
 
+  // match returns R index
+  mm = mm -1;
+
   return(mm);
 }
 
@@ -197,9 +203,6 @@ inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
   }
   Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
 
-  // return to C-index
-  nselect = nselect -1;
-
   return(nselect);
 }
 
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 6c10f09f..40c88683 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -387,8 +387,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * attatched and the list type is changed to data.frame.
   */
 
-  uint64_t nmin = selectrows(0);
-  uint64_t nmax = selectrows(1);
+  uint64_t nmin = selectrows(0), nmax = selectrows(1);
   uint64_t nn   = 0;
 
   // if  selectrows is c(0,0) use full data
@@ -397,15 +396,14 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nmax = n;
   }
 
-  // make sure that n is not greater nmax
+  // make sure that n is not greater than nmax or nmin
   if (n < nmax)
     nmax = n;
-
-  // neither should nmin be greater
   if (n < nmin)
     nmin = n;
 
-  Rcpp::IntegerVector cvec = seq(1, k);
+  // sequences of colum and row
+  Rcpp::IntegerVector cvec = seq(0, (k-1));
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
 
@@ -413,38 +411,35 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   nmin = nmin -1;
   nmax = nmax -1;
 
-  // calculate length of variables and of row
+  // calculate length of each variable stored in file. Calculate row length
   IntegerVector rlen = calc_rowlength(vartype);
   uint64_t rlength = sum(rlen);
 
   // check if vars are selected
   std::string selcols = as<std::string>(selectcols(0));
-  bool noselectvars = selcols == "";
+  bool selectvars = selcols != "";
 
-  // select vars: either select every var or only matched cases
-  IntegerVector select;
-  if (noselectvars) {
-    select = cvec;
-  } else {
+  // select vars: either select every var or only matched cases. This will
+  // return index positions of the selected variables. If non are selected the
+  // index position is cvec
+  IntegerVector select = cvec;
+  if (selectvars)
     select = choose(selectcols, varnames);
-  }
-
-  // match returns r index
-  IntegerVector select_c = select -1;
 
   uint32_t kk = select.size();
 
-  // shrink variables
-  CharacterVector varnames_kk = varnames[select_c];
-  IntegerVector vartype_kk = vartype[select_c];
-  IntegerVector vartype3 = vartype;
+  // shrink variables to selected size
+  CharacterVector varnames_kk = varnames[select];
+  IntegerVector vartype_kk = vartype[select];
+  IntegerVector vartype_s = vartype;
 
+  // integer positions of variables not selected. Their position in vartype is
+  // filled with the negative size of their variable.
   IntegerVector nselect = which_pos(cvec, select);
 
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
-
-  vartype3[nselect] = rlen2;
+  vartype_s[nselect] = rlen2;
 
 
   // 1. create the list
@@ -472,9 +467,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     }
   }
 
-  // calulate jumpsize
-  IntegerVector vartype4 = calc_jump(vartype3);
-  kk = vartype4.size();
+  // Use vartype_s to calulate jumpsize
+  IntegerVector vartype_sj = calc_jump(vartype_s);
+  kk = vartype_sj.size();
 
   // 2. fill it with data
 
@@ -488,7 +483,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     ii = 0;
     for (uint32_t i=0; i<kk; ++i)
     {
-      int const type = vartype4[i];
+      int const type = vartype_sj[i];
 
       switch(((type >0) & (type < 2046)) ? 2045 : type)
       {
@@ -869,9 +864,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
    * assign attributes to the resulting data.frame
    */
 
-  formats = formats[select_c];
-  valLabels = valLabels[select_c];
-  varLabels = varLabels[select_c];
+  formats = formats[select];
+  valLabels = valLabels[select];
+  varLabels = varLabels[select];
 
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index fdf23e49..5497b287 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -368,8 +368,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   std::replace (vartype.begin(), vartype.end(), 255, STATA_DOUBLE);
 
 
-  uint64_t nmin = selectrows(0);
-  uint64_t nmax = selectrows(1);
+  uint64_t nmin = selectrows(0), nmax = selectrows(1);
   uint64_t nn   = 0;
 
   // if  selectrows is c(0,0) use full data
@@ -378,15 +377,14 @@ List read_pre13_dta(FILE * file, const bool missing,
     nmax = n;
   }
 
-  // make sure that n is not greater nmax
+  // make sure that n is not greater than nmax or nmin
   if (n < nmax)
     nmax = n;
-
-  // neither should nmin be greater
   if (n < nmin)
     nmin = n;
 
-  Rcpp::IntegerVector cvec = seq(1, k);
+  // sequences of colum and row
+  Rcpp::IntegerVector cvec = seq(0, (k-1));
   Rcpp::IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
 
@@ -394,40 +392,37 @@ List read_pre13_dta(FILE * file, const bool missing,
   nmin = nmin -1;
   nmax = nmax -1;
 
-  // calculate length of variables and of row
+  // calculate length of each variable stored in file. Calculate row length
   IntegerVector rlen = calc_rowlength(vartype);
   uint64_t rlength = sum(rlen);
 
   // check if vars are selected
   std::string selcols = as<std::string>(selectcols(0));
-  bool noselectvars = selcols == "";
+  bool selectvars = selcols != "";
 
-  // select vars: either select every var or only matched cases
-  IntegerVector select;
-  if (noselectvars) {
-    select = cvec;
-  } else {
+  // select vars: either select every var or only matched cases. This will
+  // return index positions of the selected variables. If non are selected the
+  // index position is cvec
+  IntegerVector select = cvec;
+  if (selectvars)
     select = choose(selectcols, varnames);
-  }
-
-  // match returns r index
-  IntegerVector select_c = select -1;
 
   uint32_t kk = select.size();
 
-  // shrink variables
-  CharacterVector varnames_kk = varnames[select_c];
-  IntegerVector vartype_kk = vartype[select_c];
-  IntegerVector types_kk = types[select_c];
-  IntegerVector vartype3 = vartype;
-
+  // shrink variables to selected size
+  CharacterVector varnames_kk = varnames[select];
+  IntegerVector vartype_kk = vartype[select];
+  IntegerVector vartype_s = vartype;
+  IntegerVector types_kk = types[select];
 
+  // integer positions of variables not selected. Their position in vartype is
+  // filled with the negative size of their variable.
   IntegerVector nselect = which_pos(cvec, select);
 
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
+  vartype_s[nselect] = rlen2;
 
-  vartype3[nselect] = rlen2;
 
   // 1. create the list
   List df(kk);
@@ -454,9 +449,9 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
   }
 
-  // calulate jumpsize
-  IntegerVector vartype4 = calc_jump(vartype3);
-  kk = vartype4.size();
+  // Use vartype_s to calulate jumpsize
+  IntegerVector vartype_sj = calc_jump(vartype_s);
+  kk = vartype_sj.size();
 
   // 2. fill it with data
 
@@ -470,7 +465,7 @@ List read_pre13_dta(FILE * file, const bool missing,
     ii = 0;
     for (uint16_t i=0; i<kk; ++i)
     {
-      int const type = vartype4[i];
+      int const type = vartype_sj[i];
 
 
       switch(((type >0) & (type < 244)) ? 244 : type)
@@ -687,9 +682,9 @@ List read_pre13_dta(FILE * file, const bool missing,
    * assign attributes to the resulting data.frame
    */
 
-  formats = formats[select_c];
-  valLabels = valLabels[select_c];
-  varLabels = varLabels[select_c];
+  formats = formats[select];
+  valLabels = valLabels[select];
+  varLabels = varLabels[select];
 
   df.attr("datalabel") = datalabelCV;
   df.attr("time.stamp") = timestampCV;

From 6e9a7319c30cbf982cf987d3e3c4d9ca99b23738 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 14 Jun 2017 09:40:53 +0200
Subject: [PATCH 31/76] update naming

---
 inst/include/readstata.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 791cb920..48968288 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -190,16 +190,16 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
 }
 
 // return only the positions of variables, we have selected.
-inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
-                                     Rcpp::IntegerVector select)
+inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector x,
+                                     Rcpp::IntegerVector y)
 {
   // integer position of not selected variables
   // This drops all the positions we do not need. Initially I wanted something
-  // like cvec[select], but that somehow did not work, possibly this could be
+  // like x[y], but that somehow did not work, possibly this could be
   // improved.
-  std::vector<int> vec = Rcpp::as< std::vector<int> >(cvec);
-  for (uint32_t i=0; i<select.size(); ++i) {
-    vec.erase(std::remove(vec.begin(), vec.end(), select(i)), vec.end());
+  std::vector<int> vec = Rcpp::as< std::vector<int> >(x);
+  for (uint32_t i=0; i<y.size(); ++i) {
+    vec.erase(std::remove(vec.begin(), vec.end(), y(i)), vec.end());
   }
   Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
 
@@ -208,20 +208,20 @@ inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector cvec,
 
 // calculate the maximum jump. This calculates the maximum space we can skip if
 // reading only a single variable. Before we skipped over each variable. Now we
-// skip over them combined. Therefore if a value in vartype3 is positive push it
+// skip over them combined. Therefore if a value in x is positive push it
 // into a new vector. If negative, sum the length up.
-inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
+inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector x) {
 
-  Rcpp::IntegerVector vartype4;
+  Rcpp::IntegerVector y;
   int64_t val = 0;
   bool last = 0;
 
-  uint32_t k = vartype3.size();
+  uint32_t k = x.size();
 
   for (uint32_t i=0; i<k; ++i)
   {
 
-    int32_t value = vartype3(i);
+    int32_t value = x(i);
 
     if (value < 0) {
 
@@ -237,21 +237,21 @@ inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector vartype3) {
 
       // push back if last was neg
       if ((i > 0) & (last == 0))
-        vartype4.push_back(val);
+        y.push_back(val);
 
       val = value;
-      vartype4.push_back(val);
+      y.push_back(val);
 
       last = 1;
     }
 
     if ((i+1 == k) & (last == 0)) {
-      vartype4.push_back(val);
+      y.push_back(val);
     }
 
   }
 
-  return(vartype4);
+  return(y);
 }
 
 #endif

From 048e6cfbe5a015d872e284339131fc2379973ffa Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sun, 18 Jun 2017 15:41:55 +0200
Subject: [PATCH 32/76] improve selection code using Rcpp subsetting; this
 avoids the which_pos helper and should improve readability

---
 inst/include/readstata.h | 26 +++++---------------------
 src/read_dta.cpp         | 12 +++++++-----
 src/read_pre13_dta.cpp   | 12 +++++++-----
 3 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 48968288..201b8fe4 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -179,31 +179,15 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
 
     Rcpp::Rcout << "Variable " <<  ms <<
       " was not found in dta-file." << std::endl;
-
-    mm = mm[ll==1];
   }
 
-  // match returns R index
-  mm = mm -1;
+  // report position for found cases
+  mm = Rcpp::match(y, x);
 
-  return(mm);
-}
+//   // match returns R index
+//   mm = mm -1;
 
-// return only the positions of variables, we have selected.
-inline Rcpp::IntegerVector which_pos(Rcpp::IntegerVector x,
-                                     Rcpp::IntegerVector y)
-{
-  // integer position of not selected variables
-  // This drops all the positions we do not need. Initially I wanted something
-  // like x[y], but that somehow did not work, possibly this could be
-  // improved.
-  std::vector<int> vec = Rcpp::as< std::vector<int> >(x);
-  for (uint32_t i=0; i<y.size(); ++i) {
-    vec.erase(std::remove(vec.begin(), vec.end(), y(i)), vec.end());
-  }
-  Rcpp::IntegerVector nselect = Rcpp::wrap(vec);
-
-  return(nselect);
+  return(mm);
 }
 
 // calculate the maximum jump. This calculates the maximum space we can skip if
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 40c88683..8b1e2103 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -422,10 +422,15 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   // select vars: either select every var or only matched cases. This will
   // return index positions of the selected variables. If non are selected the
   // index position is cvec
-  IntegerVector select = cvec;
+  IntegerVector select = cvec, nselect;
   if (selectvars)
     select = choose(selectcols, varnames);
 
+  // separaet selected from not selected cases
+  LogicalVector ll = is_na(select);
+  nselect = cvec[ll == 1];
+  select = cvec[ll == 0];
+
   uint32_t kk = select.size();
 
   // shrink variables to selected size
@@ -433,10 +438,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   IntegerVector vartype_kk = vartype[select];
   IntegerVector vartype_s = vartype;
 
-  // integer positions of variables not selected. Their position in vartype is
-  // filled with the negative size of their variable.
-  IntegerVector nselect = which_pos(cvec, select);
-
+  // replace not selected cases with their negative size values
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
   vartype_s[nselect] = rlen2;
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 5497b287..ab4d23da 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -403,10 +403,15 @@ List read_pre13_dta(FILE * file, const bool missing,
   // select vars: either select every var or only matched cases. This will
   // return index positions of the selected variables. If non are selected the
   // index position is cvec
-  IntegerVector select = cvec;
+  IntegerVector select = cvec, nselect;
   if (selectvars)
     select = choose(selectcols, varnames);
 
+  // separaet selected from not selected cases
+  LogicalVector ll = is_na(select);
+  nselect = cvec[ll == 1];
+  select = cvec[ll == 0];
+
   uint32_t kk = select.size();
 
   // shrink variables to selected size
@@ -415,10 +420,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   IntegerVector vartype_s = vartype;
   IntegerVector types_kk = types[select];
 
-  // integer positions of variables not selected. Their position in vartype is
-  // filled with the negative size of their variable.
-  IntegerVector nselect = which_pos(cvec, select);
-
+  // replace not selected cases with their negative size values
   IntegerVector rlen2 = rlen[nselect];
   rlen2 = -rlen2;
   vartype_s[nselect] = rlen2;

From e5850da99a9ffb69e5b65c941555f15242b2ea15 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sun, 18 Jun 2017 15:51:59 +0200
Subject: [PATCH 33/76] cleaning. new statadefines for str and short_str

---
 inst/include/readstata.h    |  3 ---
 inst/include/statadefines.h |  2 ++
 src/read_dta.cpp            | 12 ++++++------
 src/read_pre13_dta.cpp      | 12 ++++++------
 4 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 201b8fe4..01401981 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -184,9 +184,6 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
   // report position for found cases
   mm = Rcpp::match(y, x);
 
-//   // match returns R index
-//   mm = mm -1;
-
   return(mm);
 }
 
diff --git a/inst/include/statadefines.h b/inst/include/statadefines.h
index 856a83c6..2bda8635 100644
--- a/inst/include/statadefines.h
+++ b/inst/include/statadefines.h
@@ -54,6 +54,8 @@
 #define STATA_FLOAT 65527
 #define STATA_DOUBLE 65526
 
+#define STATA_STR 2045
+#define STATA_SHORT_STR 244
 #define STATA_STRL 32768
 
 #endif
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 8b1e2103..bdd68b5b 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -44,7 +44,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   // check the release version.
   if (release<fversion || release>lversion)
   {
-    Rcpp::warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release);
+    warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release);
     return -1;
   }
 
@@ -403,8 +403,8 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     nmin = n;
 
   // sequences of colum and row
-  Rcpp::IntegerVector cvec = seq(0, (k-1));
-  Rcpp::IntegerVector rvec = seq(nmin, nmax);
+  IntegerVector cvec = seq(0, (k-1));
+  IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
 
   // use c indexing starting at 0
@@ -487,7 +487,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     {
       int const type = vartype_sj[i];
 
-      switch(((type >0) & (type < 2046)) ? 2045 : type)
+      switch(((type >0) & (type < 2046)) ? STATA_STR : type)
       {
         // double
       case STATA_DOUBLE:
@@ -555,7 +555,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         break;
       }
         // strings with 2045 or fewer characters
-      case 2045:
+      case STATA_STR:
       {
         int32_t len = 0;
         len = vartype[i];
@@ -656,7 +656,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
 
       if (type >= 0) ii += 1;
 
-      Rcpp::checkUserInterrupt();
+      checkUserInterrupt();
     }
   }
 
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index ab4d23da..6bcff13b 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -30,7 +30,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   release = readbin(release, file, 0);
 
   if (release<102 || release == 109 || release>115)
-    Rcpp::stop("First byte: Not a dta-file we can read.");
+    stop("First byte: Not a dta-file we can read.");
 
   IntegerVector versionIV(1);
   versionIV(0) = release;
@@ -384,8 +384,8 @@ List read_pre13_dta(FILE * file, const bool missing,
     nmin = n;
 
   // sequences of colum and row
-  Rcpp::IntegerVector cvec = seq(0, (k-1));
-  Rcpp::IntegerVector rvec = seq(nmin, nmax);
+  IntegerVector cvec = seq(0, (k-1));
+  IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
 
   // use c indexing starting at 0
@@ -470,7 +470,7 @@ List read_pre13_dta(FILE * file, const bool missing,
       int const type = vartype_sj[i];
 
 
-      switch(((type >0) & (type < 244)) ? 244 : type)
+      switch(((type >0) & (type < 245)) ? STATA_SHORT_STR : type)
       {
         // double
       case STATA_DOUBLE:
@@ -539,7 +539,7 @@ List read_pre13_dta(FILE * file, const bool missing,
         break;
       }
         // strings with 244 or fewer characters
-      case 244:
+      case STATA_SHORT_STR:
       {
         int32_t len = 0;
         len = vartype[i];
@@ -561,7 +561,7 @@ List read_pre13_dta(FILE * file, const bool missing,
       }
 
       if (type >= 0) ii += 1;
-      Rcpp::checkUserInterrupt();
+      checkUserInterrupt();
     }
   }
 

From 09a86436bce3021c268ab047fdf7b473e622cbc9 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sun, 18 Jun 2017 16:01:55 +0200
Subject: [PATCH 34/76] comment

---
 inst/include/readstata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
index 01401981..4eb125b7 100644
--- a/inst/include/readstata.h
+++ b/inst/include/readstata.h
@@ -177,7 +177,8 @@ inline Rcpp::IntegerVector choose(Rcpp::CharacterVector x,
 
     Rcpp::CharacterVector ms = x[ll==0];
 
-    Rcpp::Rcout << "Variable " <<  ms <<
+    // does not work if ms contains multiple names: Rcpp::as<std::string>(ms)
+    Rcpp::Rcout << "Variable " << ms <<
       " was not found in dta-file." << std::endl;
   }
 

From 777a64e55b8e40c5d0f14faba667b7837f0915a5 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Wed, 21 Jun 2017 00:40:26 +0200
Subject: [PATCH 35/76] use macro

---
 R/read.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/read.R b/R/read.R
index 6a153b33..fafeb619 100644
--- a/R/read.R
+++ b/R/read.R
@@ -308,7 +308,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     strl <- c("")
     names(strl) <- "00000000000000000000"
     strl <- c(strl, attr(data,"strl"))
-    for (j in seq(ncol(data))[types == 32768] ) {
+    for (j in seq(ncol(data))[types == sstrl] ) {
       data[, j] <- strl[data[,j]]
     }
     # if strls are in data.frame remove attribute strl

From 3c738ce29a18f6803ba14d8a37ddc9b1f0f7b1f4 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sat, 15 Jul 2017 14:53:07 +0200
Subject: [PATCH 36/76] check for lang_v fields

---
 R/tools.R | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index 28be9ae7..c4b86cfa 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -117,9 +117,21 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) {
     names(labelsets) <- vnames
   } else if (is.character(lang)) {
     ex <- attr(dat, "expansion.fields")
-    varname <- sapply(ex[grep(paste0("_lang_l_", lang), ex)],
+
+    test_lang_v <-
+      identical(integer(0),
+                unlist(lapply(ex, grep, pattern ="_lang_l_"))
+      )
+
+    langstr <- "_lang_l_"
+
+    if (test_lang_v)
+      langstr <- "_lang_v_"
+
+    varname <- sapply(ex[grep(paste0(langstr, lang), ex)],
                       function(x) x[1])
-    labelsets.tmp <- sapply(ex[grep(paste0("_lang_l_", lang), ex)],
+
+    labelsets.tmp <- sapply(ex[grep(paste0(langstr, lang), ex)],
                             function(x) x[3])
     names(labelsets.tmp) <- varname
 

From a944020a8fc1503dd89adb5bd43244b74c1ad6db Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sat, 15 Jul 2017 15:19:52 +0200
Subject: [PATCH 37/76] revert to old style, but check if lang_l exists.
 otherwise lang is variables. if no lang_l found, just return "".

---
 R/tools.R | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index c4b86cfa..9d93e551 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -118,20 +118,18 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) {
   } else if (is.character(lang)) {
     ex <- attr(dat, "expansion.fields")
 
-    test_lang_v <-
-      identical(integer(0),
-                unlist(lapply(ex, grep, pattern ="_lang_l_"))
-      )
+    has_no_label_lang <- identical(
+      integer(0),
+      unlist(lapply(ex, grep, pattern ="_lang_l_"))
+    )
 
-    langstr <- "_lang_l_"
-
-    if (test_lang_v)
-      langstr <- "_lang_v_"
+    if (has_no_label_lang) {
+      return("")
+    }
 
-    varname <- sapply(ex[grep(paste0(langstr, lang), ex)],
+    varname <- sapply(ex[grep(paste0("_lang_l_", lang), ex)],
                       function(x) x[1])
-
-    labelsets.tmp <- sapply(ex[grep(paste0(langstr, lang), ex)],
+    labelsets.tmp <- sapply(ex[grep(paste0("_lang_l_", lang), ex)],
                             function(x) x[3])
     names(labelsets.tmp) <- varname
 

From 56567f77122e8b238cdfb3c43b9609579ac9fc08 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Mon, 17 Jul 2017 11:24:30 +0200
Subject: [PATCH 38/76] update Rcpp

---
 R/RcppExports.R     |  6 +++---
 src/RcppExports.cpp | 18 +++++++++++++++---
 src/register.c      | 28 ----------------------------
 3 files changed, 18 insertions(+), 34 deletions(-)
 delete mode 100644 src/register.c

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 87a2fd10..50edc8fa 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -2,14 +2,14 @@
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
 stata_pre13_save <- function(filePath, dat) {
-    .Call(readstata13_stata_pre13_save, filePath, dat)
+    .Call(`_readstata13_stata_pre13_save`, filePath, dat)
 }
 
 stata_read <- function(filePath, missing, selectrows, selectcols) {
-    .Call(readstata13_stata_read, filePath, missing, selectrows, selectcols)
+    .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols)
 }
 
 stata_save <- function(filePath, dat) {
-    .Call(readstata13_stata_save, filePath, dat)
+    .Call(`_readstata13_stata_save`, filePath, dat)
 }
 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 2461b76c..5c4214f8 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -7,7 +7,7 @@ using namespace Rcpp;
 
 // stata_pre13_save
 int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat);
-RcppExport SEXP readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) {
+RcppExport SEXP _readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -19,7 +19,7 @@ END_RCPP
 }
 // stata_read
 List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols);
-RcppExport SEXP readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP) {
+RcppExport SEXP _readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -33,7 +33,7 @@ END_RCPP
 }
 // stata_save
 int stata_save(const char * filePath, Rcpp::DataFrame dat);
-RcppExport SEXP readstata13_stata_save(SEXP filePathSEXP, SEXP datSEXP) {
+RcppExport SEXP _readstata13_stata_save(SEXP filePathSEXP, SEXP datSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -43,3 +43,15 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+
+static const R_CallMethodDef CallEntries[] = {
+    {"_readstata13_stata_pre13_save", (DL_FUNC) &_readstata13_stata_pre13_save, 2},
+    {"_readstata13_stata_read", (DL_FUNC) &_readstata13_stata_read, 4},
+    {"_readstata13_stata_save", (DL_FUNC) &_readstata13_stata_save, 2},
+    {NULL, NULL, 0}
+};
+
+RcppExport void R_init_readstata13(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/register.c b/src/register.c
deleted file mode 100644
index 5a20b782..00000000
--- a/src/register.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <R.h>
-#include <Rinternals.h>
-#include <stdlib.h> // for NULL
-#include <R_ext/Rdynload.h>
-
-/* FIXME:
-*    Check these declarations against the C/Fortran source code.
-*    */
-
-/* .Call calls */
-
-extern SEXP readstata13_stata_pre13_save(SEXP, SEXP);
-extern SEXP readstata13_stata_read(SEXP, SEXP);
-extern SEXP readstata13_stata_save(SEXP, SEXP);
-
-static const R_CallMethodDef CallEntries[] = {
-    {"readstata13_stata_pre13_save", (DL_FUNC) &readstata13_stata_pre13_save, 2},
-    {"readstata13_stata_read",       (DL_FUNC) &readstata13_stata_read,       4},
-    {"readstata13_stata_save",       (DL_FUNC) &readstata13_stata_save,       2},
-    {NULL, NULL, 0}
-};
-
-void R_init_readstata13(DllInfo *dll)
-{
-    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
-    R_useDynamicSymbols(dll, FALSE);
-}
-

From c01c06e59cda4caaf8b32e5aedbf85038332bc5d Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 12:46:16 +0100
Subject: [PATCH 39/76] rename cpp-files

---
 src/{rcpp_readstata.cpp => read.cpp}                 | 0
 src/{rcpp_savestata.cpp => save_dta.cpp}             | 0
 src/{rcpp_pre13_savestata.cpp => save_pre13_dta.cpp} | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename src/{rcpp_readstata.cpp => read.cpp} (100%)
 rename src/{rcpp_savestata.cpp => save_dta.cpp} (100%)
 rename src/{rcpp_pre13_savestata.cpp => save_pre13_dta.cpp} (100%)

diff --git a/src/rcpp_readstata.cpp b/src/read.cpp
similarity index 100%
rename from src/rcpp_readstata.cpp
rename to src/read.cpp
diff --git a/src/rcpp_savestata.cpp b/src/save_dta.cpp
similarity index 100%
rename from src/rcpp_savestata.cpp
rename to src/save_dta.cpp
diff --git a/src/rcpp_pre13_savestata.cpp b/src/save_pre13_dta.cpp
similarity index 100%
rename from src/rcpp_pre13_savestata.cpp
rename to src/save_pre13_dta.cpp

From c5debf22d98b808abddaeb80b183dbfcc22c1876 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 12:46:27 +0100
Subject: [PATCH 40/76] update RcppExports

---
 R/RcppExports.R     |  8 ++++----
 src/RcppExports.cpp | 26 +++++++++++++-------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 50edc8fa..3eb6fc0f 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,10 +1,6 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-stata_pre13_save <- function(filePath, dat) {
-    .Call(`_readstata13_stata_pre13_save`, filePath, dat)
-}
-
 stata_read <- function(filePath, missing, selectrows, selectcols) {
     .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols)
 }
@@ -13,3 +9,7 @@ stata_save <- function(filePath, dat) {
     .Call(`_readstata13_stata_save`, filePath, dat)
 }
 
+stata_pre13_save <- function(filePath, dat) {
+    .Call(`_readstata13_stata_pre13_save`, filePath, dat)
+}
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 5c4214f8..122fd89a 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -5,18 +5,6 @@
 
 using namespace Rcpp;
 
-// stata_pre13_save
-int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat);
-RcppExport SEXP _readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) {
-BEGIN_RCPP
-    Rcpp::RObject rcpp_result_gen;
-    Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP);
-    Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP);
-    rcpp_result_gen = Rcpp::wrap(stata_pre13_save(filePath, dat));
-    return rcpp_result_gen;
-END_RCPP
-}
 // stata_read
 List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols);
 RcppExport SEXP _readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP) {
@@ -43,11 +31,23 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// stata_pre13_save
+int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat);
+RcppExport SEXP _readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP);
+    Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP);
+    rcpp_result_gen = Rcpp::wrap(stata_pre13_save(filePath, dat));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
-    {"_readstata13_stata_pre13_save", (DL_FUNC) &_readstata13_stata_pre13_save, 2},
     {"_readstata13_stata_read", (DL_FUNC) &_readstata13_stata_read, 4},
     {"_readstata13_stata_save", (DL_FUNC) &_readstata13_stata_save, 2},
+    {"_readstata13_stata_pre13_save", (DL_FUNC) &_readstata13_stata_pre13_save, 2},
     {NULL, NULL, 0}
 };
 

From 701a7a9f1079e22edabbcfd86a0b3b9d87f6cfcc Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 14:35:35 +0100
Subject: [PATCH 41/76] correct some spelling

---
 src/read_dta.cpp       | 28 +++++++++++++++-------------
 src/read_pre13_dta.cpp | 18 +++++++++---------
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index bdd68b5b..654ea7e1 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -81,7 +81,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   test("<byteorder>", file);
 
   /*
-  * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data.
+  * byteorder is a 4 byte character e.g. "LSF". MSF refers to big-endian.
   */
 
   std::string byteorder(3, '\0');
@@ -255,7 +255,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
 
   /*
   * sortlist. Stata stores the information which variable of a dataset was
-  * sorted. Depending on byteorder sortlist is written different. Currently we
+  * sorted. Depending on byteorder sortlist is written differently. Currently we
   * do not use this information.
   * Vector size is k+1.
   */
@@ -332,7 +332,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * characteristics. Stata can store additional information this way. It may
   * contain notes (for the dataset or a variable) or about label language sets.
   * Characteristics are not documented. We export them as attribute:
-  * expansion.fields. Characteristics are seperated by <ch> tags. Each <ch> has:
+  * expansion.fields. Characteristics are separated by <ch> tags. Each <ch> has:
   * nocharacter:  length of the characteristics
   * chvarname:    varname (binary 0 terminated)
   * chcharact:    characteristicsname (binary 0 terminated)
@@ -384,7 +384,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   * data. First a list is created with vectors. The vector type is defined by
   * vartype. Stata stores data columnwise so we loop over it and store the
   * data in the list of the first step. Third variable- and row-names are
-  * attatched and the list type is changed to data.frame.
+  * attached and the list type is changed to data.frame.
   */
 
   uint64_t nmin = selectrows(0), nmax = selectrows(1);
@@ -402,7 +402,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   if (n < nmin)
     nmin = n;
 
-  // sequences of colum and row
+  // sequences of column and row
   IntegerVector cvec = seq(0, (k-1));
   IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
@@ -426,7 +426,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   if (selectvars)
     select = choose(selectcols, varnames);
 
-  // separaet selected from not selected cases
+ // separate the selected from the not selected cases
   LogicalVector ll = is_na(select);
   nselect = cvec[ll == 1];
   select = cvec[ll == 0];
@@ -469,7 +469,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     }
   }
 
-  // Use vartype_s to calulate jumpsize
+  // Use vartype_s to calculate jump
   IntegerVector vartype_sj = calc_jump(vartype_s);
   kk = vartype_sj.size();
 
@@ -673,8 +673,8 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   test("<strls>", file);
 
   /*
-  * strL. Stata 13 introduced long strings up to 2 billon characters. strLs are
-  * sperated by "GSO".
+  * strL. Stata 13 introduced long strings up to 2 billion characters. strLs are
+  * separated by "GSO".
   * (v,o): Position in the data.frame.
   * t:     129/130 defines whether or not the strL is stored with a binary 0.
   * len:   length of the strL.
@@ -728,16 +728,18 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
     }
     }
 
-    // (129 = binary) | (130 = ascii)
+    // (129 = binary) | (130 = ascii) Note:
+    // if 130 full len contains the string. if 130 len includes trailing \0.
+    // that does not affect us. we read the full len, and if \0 occurs R
+    // will print only the string up to that position. we write 129
     uint8_t t = 0;
     t = readbin(t, file, swapit);
 
     uint32_t len = 0;
     len = readbin(len, file, swapit);
 
-    // 129 len = len; 130 len = len +'\0';
-
     std::string strl(len, '\0');
+
     readstring(strl, file, strl.size());
 
     strlvalues.push_back( strl );
@@ -755,7 +757,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   test("<value_labels>", file);
 
   /*
-  * labels are seperated by <lbl>-tags. Labels may appear in any order e.g.
+  * labels are separated by <lbl>-tags. Labels may appear in any order e.g.
   * 2 "female" 1 "male 9 "missing". They are stored as tables.
   * nlen:     length of label.
   * nlabname: label name.
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 6bcff13b..9ce16835 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -36,7 +36,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   versionIV(0) = release;
 
   /*
-  * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data.
+  * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-endian.
   */
 
   uint16_t ndlabel = 81;
@@ -98,7 +98,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   swapit = std::abs(SBYTEORDER-byteorder);
   byteorderI(0) = byteorder;
 
-  // filetype: unnown?
+  // filetype: unknown?
   int8_t ft = 0;
   ft = readbin(ft, file, swapit);
 
@@ -251,7 +251,7 @@ List read_pre13_dta(FILE * file, const bool missing,
 
   /*
   * sortlist. Stata stores the information which variable of a dataset was
-  * sorted. Depending on byteorder sortlist is written different. Currently we
+  * sorted. Depending on byteorder sortlist is written differently. Currently we
   * do not use this information.
   * Vector size is k+1.
   */
@@ -356,7 +356,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   * data. First a list is created with vectors. The vector type is defined by
   * vartype. Stata stores data columnwise so we loop over it and store the
   * data in the list of the first step. Third variable- and row-names are
-  * attatched and the list type is changed to data.frame.
+  * attached and the list type is changed to data.frame.
   */
 
   /* replace vartypes of Stata 8 - 12 with Stata 13 values. */
@@ -383,7 +383,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   if (n < nmin)
     nmin = n;
 
-  // sequences of colum and row
+  // sequences of column and row
   IntegerVector cvec = seq(0, (k-1));
   IntegerVector rvec = seq(nmin, nmax);
   nn = rvec.size();
@@ -407,7 +407,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   if (selectvars)
     select = choose(selectcols, varnames);
 
-  // separaet selected from not selected cases
+  // separate the selected from the not selected cases
   LogicalVector ll = is_na(select);
   nselect = cvec[ll == 1];
   select = cvec[ll == 0];
@@ -451,7 +451,7 @@ List read_pre13_dta(FILE * file, const bool missing,
     }
   }
 
-  // Use vartype_s to calulate jumpsize
+  // Use vartype_s to calulate jump
   IntegerVector vartype_sj = calc_jump(vartype_s);
   kk = vartype_sj.size();
 
@@ -574,7 +574,7 @@ List read_pre13_dta(FILE * file, const bool missing,
   df.attr("class") = "data.frame";
 
   /*
-  * labels are seperated by <lbl>-tags. Labels may appear in any order e.g.
+  * labels are separated by <lbl>-tags. Labels may appear in any order e.g.
   * 2 "female" 1 "male 9 "missing". They are stored as tables.
   * nlen:     length of label.
   * nlabname: label name.
@@ -637,7 +637,7 @@ List read_pre13_dta(FILE * file, const bool missing,
       // sort offsets so we can read labels sequentially
       std::sort(off.begin(), off.end());
 
-      // create an index to sort lables along the code values
+      // create an index to sort labels along the code values
       // this is done while factor creation
       IntegerVector indx(labn);
       indx = match(laborder,labordersort);

From 4bb44edfb68447507d4aeffeeae4d9eef975d1c1 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 14:42:28 +0100
Subject: [PATCH 42/76] move some code around

---
 src/read_dta.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 654ea7e1..474d25b3 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -137,10 +137,10 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
 
   uint16_t ndlabel = 0;
 
-  if ((release == 118) | (release == 119))
-    ndlabel = readbin(ndlabel, file, swapit);
   if (release == 117)
     ndlabel = readbin((int8_t)ndlabel, file, swapit);
+  if ((release == 118) | (release == 119))
+    ndlabel = readbin(ndlabel, file, swapit);
 
   std::string datalabel(ndlabel, '\0');
 
@@ -479,7 +479,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   fseeko64(file, rlength * nmin, SEEK_CUR);
 
   uint32_t ii = 0;
-  for(uint64_t j=0; j<nn; ++j)
+  for (uint64_t j=0; j<nn; ++j)
   {
     // reset partial index
     ii = 0;
@@ -690,7 +690,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   CharacterVector strlvalues(0);
   CharacterVector strlnames(0);
 
-  while(gso.compare(tags)==0)
+  while (gso.compare(tags)==0)
   {
     CharacterVector strls(2);
     string ref;
@@ -773,7 +773,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
 
   List labelList = List(); //put labels into this list
 
-  while(lbltag.compare(tag)==0)
+  while (lbltag.compare(tag)==0)
   {
     int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0;
 

From fed6fb1ee72bb3eb278cab581ee4641b9cb31c51 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 15:01:40 +0100
Subject: [PATCH 43/76] limit to lines of 80

---
 R/dbcal.R       |   8 +++-
 R/read.R        |  55 +++++++++++++++----------
 R/readstata13.R |   4 +-
 R/save.R        |   8 ++--
 R/tools.R       | 107 +++++++++++++++++++++++++++++-------------------
 5 files changed, 110 insertions(+), 72 deletions(-)

diff --git a/R/dbcal.R b/R/dbcal.R
index aac4e585..3381feaa 100644
--- a/R/dbcal.R
+++ b/R/dbcal.R
@@ -126,8 +126,12 @@ stbcal <- function(stbcalfile) {
   # In case centerdate is not rangestart:
   stbcal$buisdays <- NA
   stbcal$buisdays[stbcal$range==centerdate] <- 0
-  stbcal$buisdays[stbcal$range<centerdate] <- seq(from=-length(stbcal$range[stbcal$range<centerdate]), to=-1)
-  stbcal$buisdays[stbcal$range>centerdate] <- seq(from=1, to=length(stbcal$range[stbcal$range>centerdate]))
+  stbcal$buisdays[stbcal$range<centerdate] <- seq(
+    from=-length(stbcal$range[stbcal$range<centerdate]),
+    to=-1)
+  stbcal$buisdays[stbcal$range>centerdate] <- seq(
+    from=1,
+    to=length(stbcal$range[stbcal$range>centerdate]))
 
   # Add purpose
   if (any(grepl("purpose", x))) {
diff --git a/R/read.R b/R/read.R
index fafeb619..a8b08ca5 100644
--- a/R/read.R
+++ b/R/read.R
@@ -23,13 +23,14 @@
 #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata
 #'  value labels are created.
 #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is
-#'  TRUE, missing factor labels are created from integers. If duplicated labels are found,
-#'  unique labels will be generated according the following scheme: "label_(integer code)".
-#' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8
-#'  to system encoding. Options are "latin1" or "UTF-8" to specify target
-#'  encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings
-#'   which can't be displayed in the current locale.
-#'   Set encoding=NULL to stop reencoding.
+#'  TRUE, missing factor labels are created from integers. If duplicated labels
+#'  are found, unique labels will be generated according to the following
+#'  scheme: "label_(integer code)".
+#' @param encoding \emph{character.} Strings can be converted from Windows-1252
+#'  or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
+#'  target encoding explicitly. Stata 14 files are UTF-8 encoded and may contain
+#'  strings which can't be displayed in the current locale.
+#'  Set encoding=NULL to stop reencoding.
 #' @param fromEncoding \emph{character.} We expect strings to be encoded as
 #'  "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14
 #'  or newer "UTF-8" is used. In some situation the used encoding can differ for
@@ -56,8 +57,9 @@
 #' @details If the filename is a url, the file will be downloaded as a temporary
 #'  file and read afterwards.
 #'
-#' Stata files are encoded in ansinew. Depending on your system's default encoding
-#'  certain characters may appear wrong. Using a correct encoding may fix these.
+#' Stata files are encoded in ansinew. Depending on your system's default
+#'  encoding certain characters may appear wrong. Using a correct encoding may
+#'  fix these.
 #'
 #' Variable names stored in the dta-file will be used in the resulting
 #'  data.frame. Stata types char, byte, and int will become integer; float and
@@ -71,15 +73,15 @@
 #'
 #' Stata 13 introduced a new character type called strL. strLs are able to store
 #'  strings up to 2 billion characters.  While R is able to store
-#'  strings of this size in a character vector, the printed representation of such
-#'  vectors looks rather cluttered, so it's possible to save only a reference in the
-#'  data.frame with option \code{replace.strl=FALSE}.
+#'  strings of this size in a character vector, the printed representation of
+#'  such vectors looks rather cluttered, so it's possible to save only a
+#'  reference in the data.frame with option \code{replace.strl=FALSE}.
 #'
 #' In R, you may use rownames to store characters (see for instance
 #'  \code{data(swiss)}). In Stata, this is not possible and rownames have to be
 #'  stored as a variable. If you want to use rownames, set add.rownames to TRUE.
-#'  Then the first variable of the dta-file will hold the rownames of the resulting
-#'  data.frame.
+#'  Then the first variable of the dta-file will hold the rownames of the
+#'  resulting data.frame.
 #'
 #' Reading dta-files of older and newer versions than 13 was introduced
 #'  with version 0.8.
@@ -96,8 +98,8 @@
 #'   \item{var.labels:}{Variable labels}
 #'   \item{version:}{dta file format version}
 #'   \item{label.table:}{List of value labels.}
-#'   \item{strl:}{Character vector with long strings for the new strl string variable
-#'    type. The name of every element is the identifier.}
+#'   \item{strl:}{Character vector with long strings for the new strl string
+#'    variable type. The name of every element is the identifier.}
 #'   \item{expansion.fields:}{list providing variable name, characteristic name
 #'    and the contents of Stata characteristic field.}
 #'   \item{missing:}{List of numeric vectors with Stata missing type for each
@@ -107,8 +109,10 @@
 #' }
 #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members
 #'  from foreign::read.dta().
-#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
-#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
+#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and
+#'  \code{memisc} for dta files from Stata
+#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version
+#'  >= 13.
 #' @references Stata Corp (2014): Description of .dta file format
 #'  \url{http://www.stata.com/help.cgi?dta}
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -356,7 +360,10 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       if (labname %in% names(label)) {
         if((vartype == sdouble | vartype == sfloat)) {
           if(!nonint.factors) {
-            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double or float detected - no labels assigned.\n  Set option nonint.factors to TRUE to assign labels anyway.\n"))
+            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double",
+                           " or float detected - no labels assigned.\n  Set",
+                           " option nonint.factors to TRUE to assign labels",
+                           " anyway.\n"))
             next
           }
         }
@@ -368,10 +375,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
           #check for duplicated labels
           labcount <- table(names(labtable))
           if(any(labcount > 1)) {
-            warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels detected - generating unique labels.\n"))
+            warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels",
+                           " detected - generating unique labels.\n"))
             labdups <- names(labtable) %in% names(labcount[labcount > 1])
             # generate unique labels from assigned label and code number
-            names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
+            names(labtable)[labdups] <- paste0(names(labtable)[labdups],
+                                               "_(", labtable[labdups], ")")
           }
 
           data[, i] <- factor(data[, i], levels=labtable,
@@ -385,7 +394,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                               labels=names(gen.lab))
 
         } else {
-          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
+          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no ",
+                         "labels assigned.\n  Set option generate.factors=T to",
+                         " generate labels."))
         }
       }
     }
diff --git a/R/readstata13.R b/R/readstata13.R
index d444448a..f5ec6f8f 100644
--- a/R/readstata13.R
+++ b/R/readstata13.R
@@ -11,6 +11,6 @@
 #' @useDynLib readstata13, .registration = TRUE
 #' @import Rcpp
 #' @note If you catch a bug, please do not sue us, we do not have any money.
-#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata
-#' Versions < 13
+#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from
+#'  Stata Versions < 13
 NULL
diff --git a/R/save.R b/R/save.R
index 95f12d8f..4396ff3e 100644
--- a/R/save.R
+++ b/R/save.R
@@ -54,8 +54,9 @@
 #'    type. The first element is the identifier and the second element the
 #'    string.}
 #' }
-#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
-#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
+#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and
+#'  \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in
+#'  package \code{haven} for Stata version >= 13.
 #' @references Stata Corp (2014): Description of .dta file format
 #'  \url{http://www.stata.com/help.cgi?dta}
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -170,7 +171,8 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
       hasfactors <- sapply(data, is.factor)
 
       if (any(hasfactors))
-        warning("dta-format < 106 does not handle factors. Labels are not saved!")
+        warning(paste("dta-format < 106 can not handle factors.",
+                      "Labels are not saved!"))
     }
     # If our data.frame contains factors, we create a label.table
     factors <- which(sapply(data, is.factor))
diff --git a/R/tools.R b/R/tools.R
index 28be9ae7..079123aa 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -45,13 +45,13 @@ dir.exists13 <-function(x) {
 # @param path path to dta file
 # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
-get.filepath <- function(path=""){
-  if(substring(path, 1, 1) == "~") {
+get.filepath <- function(path="") {
+  if (substring(path, 1, 1) == "~") {
     filepath <- path.expand(path)
   } else {
     filepath <- path
   }
-  if(!file.exists(filepath)) {
+  if (!file.exists(filepath)) {
     return("File does not exist.")
   }
 
@@ -63,14 +63,16 @@ get.filepath <- function(path=""){
 #' Displays informations about the defined label languages.
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
-#' @param print \emph{logical.} If \code{TRUE}, print available languages and default language.
+#' @param print \emph{logical.} If \code{TRUE}, print available languages and
+#'  default language.
 #' @return Returns a list with two components:
 #' \describe{
 #' \item{languages:}{Vector of label languages used in the dataset}
 #' \item{default:}{Name of the actual default label language, otherwise NA}
 #' }
-#' @details Stata allows to define multiple label sets in different languages. This functions reports the
-#' available languages and the selected default language.
+#' @details Stata allows to define multiple label sets in different languages.
+#'  This function reports the available languages and the selected default
+#'  language.
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
 #' @export
@@ -78,7 +80,7 @@ get.lang <- function(dat, print=T) {
   ex <- attr(dat, "expansion.fields")
 
   lang <- list()
-  if(length(grep("_lang_list", ex)) > 0) {
+  if (length(grep("_lang_list", ex)) > 0) {
     lang$languages <- strsplit(ex[[grep("_lang_list", ex)]][3], " ")[[1]]
   } else {
     lang$languages <- NA
@@ -87,7 +89,7 @@ get.lang <- function(dat, print=T) {
                          ex[[grep("_lang_c", ex)]][3],
                          NA)
 
-  if(print) {
+  if (print) {
     cat("Available languages:\n ")
     cat(paste0(lang$languages, "\n"))
     cat("\nDefault language:\n")
@@ -100,13 +102,17 @@ get.lang <- function(dat, print=T) {
 
 #' Get Names of Stata Label Set
 #'
-#' Retrieves the Stata label set in the dataset for all or an vector of variable names.
+#' Retrieves the Stata label set in the dataset for all or a vector of variable
+#' names.
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
-#' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get names of all label sets.
-#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA
+#' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get
+#'  names of all label sets.
+#' @param lang \emph{character.} Label language. Default language defined by
+#'  \code{\link{get.lang}} is used if NA
 #' @return Returns an named vector of variable labels
-#' @details Stata stores factor labels in variable independent labels sets.  This function retrieves the name of the label set for a variable.
+#' @details Stata stores factor labels in variable independent labels sets. This
+#'  function retrieves the name of the label set for a variable.
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
 #' @export
@@ -128,7 +134,7 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) {
     labelsets[varname] <- labelsets.tmp[varname]
   }
 
-  if(is.null(var.name)) {
+  if (is.null(var.name)) {
     return(labelsets)
   } else {
     return(labelsets[var.name])
@@ -140,9 +146,12 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) {
 #' Recreates the code numbers of a factor as stored in the Stata dataset.
 #'
 #' @param x \emph{factor.} Factor to obtain code for
-#' @param label.table \emph{table.} Table with factor levels obtained by \code{\link{get.label}}.
+#' @param label.table \emph{table.} Table with factor levels obtained by
+#'  \code{\link{get.label}}.
 #' @return Returns an integer with original codes
-#' @details While converting numeric variables into factors, the original code numbers are lost.  This function reconstructs the codes from the attribute \code{label.table}.
+#' @details While converting numeric variables into factors, the original code
+#'  numbers are lost. This function reconstructs the codes from the attribute
+#'  \code{label.table}.
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
 #' @examples
@@ -155,7 +164,7 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) {
 #' as.integer(dat$type)
 #' @export
 get.origin.codes <- function(x, label.table) {
-  if(is.factor(x)) {
+  if (is.factor(x)) {
     fac <- as.character(x)
     return(as.integer(label.table[fac]))
   } else {
@@ -170,8 +179,9 @@ get.origin.codes <- function(x, label.table) {
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
 #' @param label.name \emph{character.} Name of the Stata label set
 #' @return Returns a named vector of code numbers
-#' @details This function returns the table of factor levels which represent a Stata label set.
-#' The name of a label set for a variable can be obtained by \code{\link{get.label.name}}.
+#' @details This function returns the table of factor levels which represent
+#'  a Stata label set. The name of a label set for a variable can be obtained
+#'  by \code{\link{get.label.name}}.
 #' @examples
 #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
 #' labname <- get.label.name(dat,"type")
@@ -185,15 +195,18 @@ get.label <- function(dat, label.name) {
 
 #' Assign Stata Labels to a Variable
 #'
-#' Assign value labels from a Stata label set to a variable. If duplicated labels are found, 
-#' unique labels will be generated according the following scheme: "label_(integer code)".
+#' Assign value labels from a Stata label set to a variable. If duplicated
+#'  labels are found, unique labels will be generated according to the
+#'  following scheme: "label_(integer code)".
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
 #' @param var.name \emph{character.} Name of the variable in the data.frame
-#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA
+#' @param lang \emph{character.} Label language. Default language defined by
+#'  \code{\link{get.lang}} is used if NA
 #' @return Returns a labeled factor
 #' @examples
-#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE)
+#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"),
+#'                   convert.factors=FALSE)
 #'
 #' # compare vectors
 #' set.label(dat, "type")
@@ -203,8 +216,9 @@ get.label <- function(dat, label.name) {
 #' set.label(dat, "type", "de")
 #' @export
 set.label <- function(dat, var.name, lang=NA) {
-  if(is.factor(dat[,var.name])) {
-    tmp <- get.origin.codes(dat[,var.name], get.label(dat, get.label.name(dat, var.name)))
+  if (is.factor(dat[,var.name])) {
+    tmp <- get.origin.codes(dat[,var.name],
+                            get.label(dat, get.label.name(dat, var.name)))
   } else {
     tmp <- dat[,var.name]
   }
@@ -213,15 +227,17 @@ set.label <- function(dat, var.name, lang=NA) {
 
   #check for duplicated labels
   labcount <- table(names(labtable))
-  if(any(labcount > 1)) {
-    
-    
-    warning(paste0("\n  ",var.name, ":\n  Duplicated factor levels detected - generating unique labels.\n"))
+  if (any(labcount > 1)) {
+
+
+    warning(paste0("\n  ",var.name, ":\n  Duplicated factor levels detected -",
+                   " generating unique labels.\n"))
     labdups <- names(labtable) %in% names(labcount[labcount > 1])
     # generate unique labels from assigned label and code number
-    names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
+    names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(",
+                                       labtable[labdups], ")")
   }
-  
+
   return(factor(tmp, levels=labtable,
                 labels=names(labtable))
   )
@@ -234,8 +250,10 @@ set.label <- function(dat, var.name, lang=NA) {
 #' @name varlabel
 #' @rdname varlabel
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
-#' @param var.name \emph{character vector.} Variable names. If NULL, get label for all variables.
-#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA
+#' @param var.name \emph{character vector.} Variable names. If NULL, get label
+#'  for all variables.
+#' @param lang \emph{character.} Label language. Default language defined by
+#'  \code{\link{get.lang}} is used if NA
 #' @param value \emph{character vector.} Vector of variable names.
 #' @return Returns an named vector of variable labels
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -257,7 +275,7 @@ varlabel <- function(dat, var.name=NULL, lang=NA) {
     varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3])
     names(varlabel) <- varname
   }
-  if(is.null(var.name)) {
+  if (is.null(var.name)) {
     # order by data.frame columns and return
     return(varlabel[vnames])
   } else {
@@ -269,7 +287,7 @@ varlabel <- function(dat, var.name=NULL, lang=NA) {
 #' @export
 'varlabel<-' <- function(dat, value) {
   nlabs <- length(attr(dat, "var.labels"))
-  if(length(value)==nlabs) {
+  if (length(value)==nlabs) {
     attr(x, "var.labels") <- value
   } else {
       warning(paste("Vector of new labels must have",nlabs,"entries."))
@@ -283,8 +301,10 @@ varlabel <- function(dat, var.name=NULL, lang=NA) {
 #' Changes default label language for a dataset.
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
-#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA
-#' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels are generated.
+#' @param lang \emph{character.} Label language. Default language defined by
+#'  \code{\link{get.lang}} is used if NA
+#' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels
+#'  are generated.
 #' @return Returns a data.frame with value labels in language "lang".
 #' @examples
 #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
@@ -316,14 +336,14 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
     pb <- txtProgressBar(min=1,max=length(val.labels)+1)
 
     for (i in seq_along(val.labels)) {
-      if(val.labels[i]!="") {
+      if (val.labels[i]!="") {
         labname <- val.labels[i]
         vartype <- types[i]
         labtable <- label[[labname]]
         varname <- names(val.labels)[i]
 
         # get old codes
-        if(is.factor(dat[, varname])) {
+        if (is.factor(dat[, varname])) {
           oldlabname <- get.label.name(dat, varname)
           oldlabtab <- get.label(dat, oldlabname)
           codes <- get.origin.codes(dat[,varname], oldlabtab)
@@ -332,7 +352,8 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
           varunique <- na.omit(unique(dat[,varname]))
         }
 
-        if(labname %in% names(label) & vartype > 65527 & is.factor(dat[,varname])) {
+        if (labname %in% names(label) & vartype > 65527 &
+           is.factor(dat[,varname])) {
           # assign label if label set is complete
           if (all(varunique %in% labtable)) {
 
@@ -340,7 +361,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
                                     labels=names(labtable))
           }
           # else generate labels from codes
-        } else if(generate.factors) {
+        } else if (generate.factors) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
 
@@ -361,7 +382,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
     vnames <- names(oldval.labels)
     names(oldval.labels) <- NULL
     tmp <- list()
-    for (i in seq_along(val.labels)){
+    for (i in seq_along(val.labels)) {
       tmp[[i]] <- c(vnames[i],paste0("_lang_l_",oldlang), oldval.labels[i])
     }
     attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp)
@@ -369,7 +390,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
     # variable label
     old.varlabel <- attr(dat, "var.labels")
     tmp <- list()
-    for (i in seq_along(old.varlabel)){
+    for (i in seq_along(old.varlabel)) {
       tmp[[i]] <- c(vnames[i],paste0("_lang_v_", oldlang), old.varlabel[i])
     }
     attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp)
@@ -422,7 +443,7 @@ maxchar <- function(x) {
   z <- max(nchar(x, type="byte"), na.rm = TRUE)
 
   # Stata does not allow storing a string of size 0
-  if(is.infinite(z) | (z == 0))
+  if (is.infinite(z) | (z == 0))
     z <- 1
 
   z

From d14fff94f6f419323cb98edd5b5a126cc9451325 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 15:04:38 +0100
Subject: [PATCH 44/76] update documentation (not really required; only changes
 in markdown due to the linebreak changes previously)

---
 man/get.label.Rd        |  5 +++--
 man/get.label.name.Rd   | 12 ++++++++----
 man/get.lang.Rd         |  8 +++++---
 man/get.origin.codes.Rd |  7 +++++--
 man/read.dta13.Rd       | 40 ++++++++++++++++++++++------------------
 man/readstata13.Rd      |  4 ++--
 man/save.dta13.Rd       |  5 +++--
 man/set.label.Rd        | 11 +++++++----
 man/set.lang.Rd         |  6 ++++--
 man/varlabel.Rd         |  6 ++++--
 10 files changed, 63 insertions(+), 41 deletions(-)

diff --git a/man/get.label.Rd b/man/get.label.Rd
index 51535074..34829a41 100644
--- a/man/get.label.Rd
+++ b/man/get.label.Rd
@@ -18,8 +18,9 @@ Returns a named vector of code numbers
 Retrieve the value labels for a specific Stata label set.
 }
 \details{
-This function returns the table of factor levels which represent a Stata label set.
-The name of a label set for a variable can be obtained by \code{\link{get.label.name}}.
+This function returns the table of factor levels which represent
+ a Stata label set. The name of a label set for a variable can be obtained
+ by \code{\link{get.label.name}}.
 }
 \examples{
 dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
diff --git a/man/get.label.name.Rd b/man/get.label.name.Rd
index fd3e0224..f90cfcd9 100644
--- a/man/get.label.name.Rd
+++ b/man/get.label.name.Rd
@@ -9,18 +9,22 @@ get.label.name(dat, var.name = NULL, lang = NA)
 \arguments{
 \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.}
 
-\item{var.name}{\emph{character vector.} Variable names. If \code{NULL}, get names of all label sets.}
+\item{var.name}{\emph{character vector.} Variable names. If \code{NULL}, get
+names of all label sets.}
 
-\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA}
+\item{lang}{\emph{character.} Label language. Default language defined by
+\code{\link{get.lang}} is used if NA}
 }
 \value{
 Returns an named vector of variable labels
 }
 \description{
-Retrieves the Stata label set in the dataset for all or an vector of variable names.
+Retrieves the Stata label set in the dataset for all or a vector of variable
+names.
 }
 \details{
-Stata stores factor labels in variable independent labels sets.  This function retrieves the name of the label set for a variable.
+Stata stores factor labels in variable independent labels sets. This
+ function retrieves the name of the label set for a variable.
 }
 \author{
 Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
diff --git a/man/get.lang.Rd b/man/get.lang.Rd
index 12577494..11be34c9 100644
--- a/man/get.lang.Rd
+++ b/man/get.lang.Rd
@@ -9,7 +9,8 @@ get.lang(dat, print = T)
 \arguments{
 \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.}
 
-\item{print}{\emph{logical.} If \code{TRUE}, print available languages and default language.}
+\item{print}{\emph{logical.} If \code{TRUE}, print available languages and
+default language.}
 }
 \value{
 Returns a list with two components:
@@ -22,8 +23,9 @@ Returns a list with two components:
 Displays informations about the defined label languages.
 }
 \details{
-Stata allows to define multiple label sets in different languages. This functions reports the
-available languages and the selected default language.
+Stata allows to define multiple label sets in different languages.
+ This function reports the available languages and the selected default
+ language.
 }
 \author{
 Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
diff --git a/man/get.origin.codes.Rd b/man/get.origin.codes.Rd
index 33ee7d35..59822e78 100644
--- a/man/get.origin.codes.Rd
+++ b/man/get.origin.codes.Rd
@@ -9,7 +9,8 @@ get.origin.codes(x, label.table)
 \arguments{
 \item{x}{\emph{factor.} Factor to obtain code for}
 
-\item{label.table}{\emph{table.} Table with factor levels obtained by \code{\link{get.label}}.}
+\item{label.table}{\emph{table.} Table with factor levels obtained by
+\code{\link{get.label}}.}
 }
 \value{
 Returns an integer with original codes
@@ -18,7 +19,9 @@ Returns an integer with original codes
 Recreates the code numbers of a factor as stored in the Stata dataset.
 }
 \details{
-While converting numeric variables into factors, the original code numbers are lost.  This function reconstructs the codes from the attribute \code{label.table}.
+While converting numeric variables into factors, the original code
+ numbers are lost. This function reconstructs the codes from the attribute
+ \code{label.table}.
 }
 \examples{
 dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
index 6867fb8a..7a3e8fc5 100644
--- a/man/read.dta13.Rd
+++ b/man/read.dta13.Rd
@@ -17,14 +17,15 @@ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE,
 value labels are created.}
 
 \item{generate.factors}{\emph{logical.} If \code{TRUE} and convert.factors is
-TRUE, missing factor labels are created from integers. If duplicated labels are found,
-unique labels will be generated according the following scheme: "label_(integer code)".}
+TRUE, missing factor labels are created from integers. If duplicated labels
+are found, unique labels will be generated according to the following
+scheme: "label_(integer code)".}
 
-\item{encoding}{\emph{character.} Strings can be converted from Windows-1252 or UTF-8
-to system encoding. Options are "latin1" or "UTF-8" to specify target
-encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings
- which can't be displayed in the current locale.
- Set encoding=NULL to stop reencoding.}
+\item{encoding}{\emph{character.} Strings can be converted from Windows-1252
+or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
+target encoding explicitly. Stata 14 files are UTF-8 encoded and may contain
+strings which can't be displayed in the current locale.
+Set encoding=NULL to stop reencoding.}
 
 \item{fromEncoding}{\emph{character.} We expect strings to be encoded as
 "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14
@@ -71,8 +72,8 @@ The function returns a data.frame with attributes. The attributes
   \item{var.labels:}{Variable labels}
   \item{version:}{dta file format version}
   \item{label.table:}{List of value labels.}
-  \item{strl:}{Character vector with long strings for the new strl string variable
-   type. The name of every element is the identifier.}
+  \item{strl:}{Character vector with long strings for the new strl string
+   variable type. The name of every element is the identifier.}
   \item{expansion.fields:}{list providing variable name, characteristic name
    and the contents of Stata characteristic field.}
   \item{missing:}{List of numeric vectors with Stata missing type for each
@@ -89,8 +90,9 @@ The function returns a data.frame with attributes. The attributes
 If the filename is a url, the file will be downloaded as a temporary
  file and read afterwards.
 
-Stata files are encoded in ansinew. Depending on your system's default encoding
- certain characters may appear wrong. Using a correct encoding may fix these.
+Stata files are encoded in ansinew. Depending on your system's default
+ encoding certain characters may appear wrong. Using a correct encoding may
+ fix these.
 
 Variable names stored in the dta-file will be used in the resulting
  data.frame. Stata types char, byte, and int will become integer; float and
@@ -104,15 +106,15 @@ dates.
 
 Stata 13 introduced a new character type called strL. strLs are able to store
  strings up to 2 billion characters.  While R is able to store
- strings of this size in a character vector, the printed representation of such
- vectors looks rather cluttered, so it's possible to save only a reference in the
- data.frame with option \code{replace.strl=FALSE}.
+ strings of this size in a character vector, the printed representation of
+ such vectors looks rather cluttered, so it's possible to save only a
+ reference in the data.frame with option \code{replace.strl=FALSE}.
 
 In R, you may use rownames to store characters (see for instance
  \code{data(swiss)}). In Stata, this is not possible and rownames have to be
  stored as a variable. If you want to use rownames, set add.rownames to TRUE.
- Then the first variable of the dta-file will hold the rownames of the resulting
- data.frame.
+ Then the first variable of the dta-file will hold the rownames of the
+ resulting data.frame.
 
 Reading dta-files of older and newer versions than 13 was introduced
  with version 0.8.
@@ -126,8 +128,10 @@ Stata Corp (2014): Description of .dta file format
  \url{http://www.stata.com/help.cgi?dta}
 }
 \seealso{
-\code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
-versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
+\code{\link[foreign]{read.dta}} in package \code{foreign} and
+ \code{memisc} for dta files from Stata
+versions < 13 and \code{read_dta} in package \code{haven} for Stata version
+ >= 13.
 }
 \author{
 Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
diff --git a/man/readstata13.Rd b/man/readstata13.Rd
index 62dafbde..09b47117 100644
--- a/man/readstata13.Rd
+++ b/man/readstata13.Rd
@@ -12,8 +12,8 @@ Function to read the Stata file format into a data.frame.
 If you catch a bug, please do not sue us, we do not have any money.
 }
 \seealso{
-\code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata
-Versions < 13
+\code{\link[foreign]{read.dta}} and \code{memisc} for dta files from
+ Stata Versions < 13
 }
 \author{
 Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
diff --git a/man/save.dta13.Rd b/man/save.dta13.Rd
index 32831383..3b698a70 100644
--- a/man/save.dta13.Rd
+++ b/man/save.dta13.Rd
@@ -67,8 +67,9 @@ Stata Corp (2014): Description of .dta file format
  \url{http://www.stata.com/help.cgi?dta}
 }
 \seealso{
-\code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
-versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
+\code{\link[foreign]{read.dta}} in package \code{foreign} and
+ \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in
+ package \code{haven} for Stata version >= 13.
 }
 \author{
 Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
diff --git a/man/set.label.Rd b/man/set.label.Rd
index 65ca65c0..84b7493a 100644
--- a/man/set.label.Rd
+++ b/man/set.label.Rd
@@ -11,17 +11,20 @@ set.label(dat, var.name, lang = NA)
 
 \item{var.name}{\emph{character.} Name of the variable in the data.frame}
 
-\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA}
+\item{lang}{\emph{character.} Label language. Default language defined by
+\code{\link{get.lang}} is used if NA}
 }
 \value{
 Returns a labeled factor
 }
 \description{
-Assign value labels from a Stata label set to a variable. If duplicated labels are found, 
-unique labels will be generated according the following scheme: "label_(integer code)".
+Assign value labels from a Stata label set to a variable. If duplicated
+ labels are found, unique labels will be generated according to the following
+ scheme: "label_(integer code)".
 }
 \examples{
-dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE)
+dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"),
+                  convert.factors=FALSE)
 
 # compare vectors
 set.label(dat, "type")
diff --git a/man/set.lang.Rd b/man/set.lang.Rd
index 3afa187a..f4e055cb 100644
--- a/man/set.lang.Rd
+++ b/man/set.lang.Rd
@@ -9,9 +9,11 @@ set.lang(dat, lang = NA, generate.factors = FALSE)
 \arguments{
 \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.}
 
-\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA}
+\item{lang}{\emph{character.} Label language. Default language defined by
+\code{\link{get.lang}} is used if NA}
 
-\item{generate.factors}{\emph{logical.} If \code{TRUE}, missing factor levels are generated.}
+\item{generate.factors}{\emph{logical.} If \code{TRUE}, missing factor levels
+are generated.}
 }
 \value{
 Returns a data.frame with value labels in language "lang".
diff --git a/man/varlabel.Rd b/man/varlabel.Rd
index 316d7c6b..0968cc9d 100644
--- a/man/varlabel.Rd
+++ b/man/varlabel.Rd
@@ -13,9 +13,11 @@ varlabel(dat) <- value
 \arguments{
 \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.}
 
-\item{var.name}{\emph{character vector.} Variable names. If NULL, get label for all variables.}
+\item{var.name}{\emph{character vector.} Variable names. If NULL, get label
+for all variables.}
 
-\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA}
+\item{lang}{\emph{character.} Label language. Default language defined by
+\code{\link{get.lang}} is used if NA}
 
 \item{value}{\emph{character vector.} Vector of variable names.}
 }

From d5b567a654ea75461b6cf33ee613a0d77ad2d1db Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 16:14:23 +0100
Subject: [PATCH 45/76] fix chlen for reading v108 file:
 https://www.stata.com/manual/chapter28.dta

---
 src/read_pre13_dta.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 9ce16835..cd30b723 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -75,6 +75,7 @@ List read_pre13_dta(FILE * file, const bool missing,
     break;
   case 107:
   case 108:
+    chlen = 9;
     nvarnameslen = 9;
     nformatslen = 12;
     nvalLabelslen = 9;

From 97a3ff1418fd2aaa71eaab94eb8a1037b5e1087c Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 17:10:52 +0100
Subject: [PATCH 46/76] fix writing characteristics and add unit tests

---
 src/read_pre13_dta.cpp     |  1 +
 src/save_pre13_dta.cpp     |  7 +--
 tests/testthat/test_save.R | 88 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index cd30b723..59f1f7a0 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -66,6 +66,7 @@ List read_pre13_dta(FILE * file, const bool missing,
     break;
   case 105:
   case 106:
+    chlen = 9;
     ndlabel = 32;
     nvarnameslen = 9;
     nformatslen = 12;
diff --git a/src/save_pre13_dta.cpp b/src/save_pre13_dta.cpp
index 29a5337c..f3bf97e6 100644
--- a/src/save_pre13_dta.cpp
+++ b/src/save_pre13_dta.cpp
@@ -47,7 +47,6 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
   List varLabels = dat.attr("var.labels");
   List vartypes = dat.attr("types");
 
-
   int8_t version = as<int>(dat.attr("version"));
 
 
@@ -85,6 +84,7 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
       break;
     case 105:
     case 106:// unknown version (SE?)
+      chlen = 9;
       ndlabel = 32;
       nvarnameslen = 9;
       nformatslen = 12;
@@ -93,6 +93,7 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
       break;
     case 107: // unknown version (SE?)
     case 108:
+      chlen = 9;
       nvarnameslen = 9;
       nformatslen = 12;
       nvalLabelslen = 9;
@@ -245,8 +246,8 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
       int8_t datatype = 0;
       uint32_t len = 0;
 
-      if (chs.size()>0){
-        for (int32_t i = 0; i<chs.size(); ++i){
+      if (chs.size()>0) {
+        for (int32_t i = 0; i<chs.size(); ++i) {
 
           CharacterVector ch = as<CharacterVector>(chs[i]);
 
diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R
index 2f68b42c..5800e08d 100644
--- a/tests/testthat/test_save.R
+++ b/tests/testthat/test_save.R
@@ -872,3 +872,91 @@ test_that("select.cols = c('disp', 'drat')", {
 })
 
 # rm(list = files)
+
+
+#### expansion.fields ####
+
+if (readstata13:::dir.exists13("data"))
+  unlink("data", recursive = TRUE)
+dir.create("data")
+
+dd <- mtcars
+
+# create expansion.fields: In stata use command notes: They are constructed as
+# follows:
+#
+# 1. on what is the note : can be _dta or a variable name
+# 2. string "note" + number of note
+# 3. the note
+
+# initialization of a one line note on a dta-file is done using: Ordering does
+# not matter:
+#
+# line1: _dta note0 1
+#
+# line2: _dta note1 a note attached to the dta
+
+ef <- list(
+  c("_dta", "note1", "note written in R"),
+  c("_dta", "note0", "1"),
+  c("mpg", "note1", "Miles/(US) gallon"),
+  c("mpg", "note0", "1")
+)
+
+attr(dd, "expansion.fields") <- ef
+
+save.dta13(dd, "data/dta_119.dta", version = 119)
+save.dta13(dd, "data/dta_118.dta", version = 118)
+save.dta13(dd, "data/dta_117.dta", version = 117)
+save.dta13(dd, "data/dta_115.dta", version = 115)
+save.dta13(dd, "data/dta_114.dta", version = 114)
+save.dta13(dd, "data/dta_113.dta", version = 113)
+save.dta13(dd, "data/dta_112.dta", version = 112)
+save.dta13(dd, "data/dta_111.dta", version = 111)
+save.dta13(dd, "data/dta_110.dta", version = 110)
+save.dta13(dd, "data/dta_108.dta", version = 108)
+save.dta13(dd, "data/dta_107.dta", version = 107)
+save.dta13(dd, "data/dta_106.dta", version = 106)
+save.dta13(dd, "data/dta_105.dta", version = 105)
+# save.dta13(dd, "data/dta_104.dta", version = 104)
+# save.dta13(dd, "data/dta_103.dta", version = 103)
+# save.dta13(dd, "data/dta_102.dta", version = 102)
+
+dd119 <- attr(read.dta13("data/dta_119.dta"), "expansion.fields")
+dd118 <- attr(read.dta13("data/dta_118.dta"), "expansion.fields")
+dd117 <- attr(read.dta13("data/dta_117.dta"), "expansion.fields")
+dd115 <- attr(read.dta13("data/dta_115.dta"), "expansion.fields")
+dd114 <- attr(read.dta13("data/dta_114.dta"), "expansion.fields")
+dd113 <- attr(read.dta13("data/dta_113.dta"), "expansion.fields")
+dd112 <- attr(read.dta13("data/dta_112.dta"), "expansion.fields")
+dd111 <- attr(read.dta13("data/dta_111.dta"), "expansion.fields")
+dd110 <- attr(read.dta13("data/dta_110.dta"), "expansion.fields")
+dd108 <- attr(read.dta13("data/dta_108.dta"), "expansion.fields")
+dd107 <- attr(read.dta13("data/dta_107.dta"), "expansion.fields")
+dd106 <- attr(read.dta13("data/dta_106.dta"), "expansion.fields")
+dd105 <- attr(read.dta13("data/dta_105.dta"), "expansion.fields")
+# dd104 <- read.dta13("data/dta_104.dta")
+# dd103 <- read.dta13("data/dta_103.dta")
+# dd102 <- read.dta13("data/dta_102.dta")
+
+unlink("data", recursive = TRUE)
+
+test_that("expansinon.fields", {
+  # check numerics
+  expect_equal(ef, dd119)
+  expect_equal(ef, dd118)
+  expect_equal(ef, dd117)
+  expect_equal(ef, dd115)
+  expect_equal(ef, dd114)
+  expect_equal(ef, dd113)
+  expect_equal(ef, dd112)
+  expect_equal(ef, dd111)
+  expect_equal(ef, dd110)
+  expect_equal(ef, dd108)
+  expect_equal(ef, dd107)
+  expect_equal(ef, dd106)
+  expect_equal(ef, dd105)
+  # expect_equal(ef, dd104)
+  # expect_equal(ef, dd103)
+  # expect_equal(ef, dd102)
+})

From 3c3c364354ee4dea41b0101fb9a99ecac4830346 Mon Sep 17 00:00:00 2001
From: Marvin <jmg@edna>
Date: Fri, 24 Nov 2017 19:17:52 +0100
Subject: [PATCH 47/76] vartype[i] is no longer correct at this point

---
 src/read_dta.cpp       | 2 +-
 src/read_pre13_dta.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 474d25b3..c474f199 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -558,7 +558,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       case STATA_STR:
       {
         int32_t len = 0;
-        len = vartype[i];
+        len = vartype_sj[i];
         std::string val_s (len, '\0');
 
         readstring(val_s, file, val_s.size());
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 59f1f7a0..67d09f26 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -544,7 +544,7 @@ List read_pre13_dta(FILE * file, const bool missing,
       case STATA_SHORT_STR:
       {
         int32_t len = 0;
-        len = vartype[i];
+        len = vartype_sj[i];
         std::string val_s (len, '\0');
 
         readstring(val_s, file, val_s.size());

From 8e13c1f1fb566d1ed2e656ce45dfbe04d3a1eec7 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@rub.de>
Date: Thu, 11 Jan 2018 17:51:45 +0100
Subject: [PATCH 48/76] WIP: parallel read (#58)

* use same read_data part in post and pre 13 read_*.cpp files

* move df creation part to read_data()
---
 inst/include/read_data.h |  29 +++++
 src/read_data.cpp        | 241 +++++++++++++++++++++++++++++++++++++++
 src/read_dta.cpp         | 209 +--------------------------------
 src/read_pre13_dta.cpp   | 148 +++---------------------
 4 files changed, 287 insertions(+), 340 deletions(-)
 create mode 100644 inst/include/read_data.h
 create mode 100644 src/read_data.cpp

diff --git a/inst/include/read_data.h b/inst/include/read_data.h
new file mode 100644
index 00000000..7067150e
--- /dev/null
+++ b/inst/include/read_data.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef READDATA_H
+#define READDATA_H
+
+Rcpp::List read_data(FILE * file,
+                     const Rcpp::IntegerVector vartype_kk,
+                     const bool missing,
+                     const int8_t release,
+                     const uint64_t nn, uint32_t kk,
+                     const Rcpp::IntegerVector vartype_sj,
+                     const std::string byteorder, const bool swapit);
+
+#endif
diff --git a/src/read_data.cpp b/src/read_data.cpp
new file mode 100644
index 00000000..2b50e21d
--- /dev/null
+++ b/src/read_data.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "readstata.h"
+
+using namespace Rcpp;
+using namespace std;
+
+List read_data(FILE * file,
+               const IntegerVector vartype_kk,
+               const bool missing, const int8_t release,
+               const uint64_t nn, uint32_t kk,
+               const IntegerVector vartype_sj,
+               const std::string byteorder, const bool swapit) {
+
+  // 1. create the list
+  List df(kk);
+  for (uint32_t i=0; i<kk; ++i)
+  {
+    int const type = vartype_kk[i];
+
+    switch(type)
+    {
+    case STATA_DOUBLE:
+    case STATA_FLOAT:
+      SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
+      break;
+
+    case STATA_INT:
+    case STATA_SHORTINT:
+    case STATA_BYTE:
+      SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
+      break;
+
+    default:
+      SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
+    break;
+    }
+  }
+
+  // updated kk to reflect the jump size
+  kk = vartype_sj.size();
+
+  uint32_t ii = 0;
+  for (uint64_t j=0; j<nn; ++j)
+  {
+    // reset partial index
+    ii = 0;
+    for (uint32_t i=0; i<kk; ++i)
+    {
+      int const type = vartype_sj[i];
+
+      switch(((type >0) & (type < 2046)) ? STATA_STR : type)
+      {
+        // double
+      case STATA_DOUBLE:
+      {
+        double val_d = 0;
+        val_d = readbin(val_d, file, swapit);
+
+        if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
+          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
+        else
+          REAL(VECTOR_ELT(df,ii))[j] = val_d;
+
+        break;
+      }
+        // float
+      case STATA_FLOAT:
+      {
+        float val_f = 0;
+        val_f = readbin(val_f, file, swapit);
+
+        if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
+          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
+        else
+          REAL(VECTOR_ELT(df,ii))[j] = val_f;
+
+        break;
+      }
+        // long
+      case STATA_INT:
+      {
+        int32_t val_l = 0;
+        val_l = readbin(val_l, file, swapit);
+
+        if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
+
+        break;
+      }
+        // int
+      case STATA_SHORTINT:
+      {
+        int16_t val_i = 0;
+        val_i = readbin(val_i, file, swapit);
+
+        if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
+
+        break;
+      }
+        // byte
+      case STATA_BYTE:
+      {
+        int8_t val_b = 0;
+        val_b = readbin(val_b, file, swapit);
+
+        if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
+          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
+        else
+          INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
+
+        break;
+      }
+        // strings with 2045 or fewer characters
+      case STATA_STR:
+      {
+        int32_t len = 0;
+        len = vartype_sj[i];
+        std::string val_s (len, '\0');
+
+        readstring(val_s, file, val_s.size());
+        as<CharacterVector>(df[ii])[j] = val_s;
+        break;
+      }
+        // string of any length
+      case STATA_STRL:
+      {// strL 2*4bit or 2 + 6 bit
+
+        // FixMe: Strl in 118
+        switch (release)
+      {
+
+      case 117:
+      {
+        uint32_t v = 0, o = 0;
+
+        v = readbin(v, file, swapit);
+        o = readbin(o, file, swapit);
+
+        stringstream val_stream;
+        val_stream << v << '_' << o;
+        string val_strl = val_stream.str();
+
+        as<CharacterVector>(df[ii])[j] = val_strl;
+
+        break;
+      }
+      case 118:
+      {
+        int16_t v = 0;
+        int64_t o = 0, z = 0;
+
+        z = readbin(z, file, swapit);
+
+        // works for LSF on little- and big-endian
+        if (byteorder.compare("LSF")==0) {
+          v = (int16_t)z;
+          o = (z >> 16);
+        }
+
+        // works if we read a big-endian file on little-endian
+        if (byteorder.compare("MSF")==0) {
+          v = (z >> 48) & ((1 << 16) - 1);
+          o = z & ((1 << 16) - 1);
+        }
+
+        stringstream val_stream;
+        val_stream << v << '_' << o;
+        string val_strl = val_stream.str();
+
+        as<CharacterVector>(df[ii])[j] = val_strl;
+
+        break;
+      }
+      case 119:
+      {
+        int32_t v = 0;
+        int64_t o = 0, z = 0;
+
+        z = readbin(z, file, swapit);
+
+        // works for LSF on little- and big-endian
+        if (byteorder.compare("LSF")==0) {
+          v = (int32_t)z & ((1 << 24) - 1);
+          o = (z >> 24);
+        }
+
+        // FixMe: works if we read a big-endian file on little-endian
+        if (byteorder.compare("MSF")==0) {
+          v = (z >> 48) & ((1 << 24) - 1);
+          o = z & ((1 << 24) - 1);
+        }
+
+        stringstream val_stream;
+        val_stream << v << '_' << o;
+        string val_strl = val_stream.str();
+
+        as<CharacterVector>(df[ii])[j] = val_strl;
+
+        break;
+      }
+      }
+        break;
+      }
+        // case < 0:
+      default:
+      {
+        // skip to the next valid case
+        fseeko64(file, abs(type), SEEK_CUR);
+        break;
+      }
+      }
+
+      if (type >= 0) ii += 1;
+
+      checkUserInterrupt();
+    }
+  }
+
+  return(df);
+}
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index c474f199..5caf1536 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -16,6 +16,7 @@
  */
 
 #include "readstata.h"
+#include "read_data.h"
 
 using namespace Rcpp;
 using namespace std;
@@ -444,221 +445,17 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   vartype_s[nselect] = rlen2;
 
 
-  // 1. create the list
-  List df(kk);
-  for (uint32_t i=0; i<kk; ++i)
-  {
-    int const type = vartype_kk[i];
-
-    switch(type)
-    {
-    case STATA_DOUBLE:
-    case STATA_FLOAT:
-      SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
-      break;
-
-    case STATA_INT:
-    case STATA_SHORTINT:
-    case STATA_BYTE:
-      SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
-      break;
-
-    default:
-      SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
-    break;
-    }
-  }
 
   // Use vartype_s to calculate jump
   IntegerVector vartype_sj = calc_jump(vartype_s);
-  kk = vartype_sj.size();
 
   // 2. fill it with data
 
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
 
-  uint32_t ii = 0;
-  for (uint64_t j=0; j<nn; ++j)
-  {
-    // reset partial index
-    ii = 0;
-    for (uint32_t i=0; i<kk; ++i)
-    {
-      int const type = vartype_sj[i];
-
-      switch(((type >0) & (type < 2046)) ? STATA_STR : type)
-      {
-        // double
-      case STATA_DOUBLE:
-      {
-        double val_d = 0;
-        val_d = readbin(val_d, file, swapit);
-
-        if ((missing == 0) && !(val_d == R_NegInf) && ((val_d<STATA_DOUBLE_NA_MIN) || (val_d>STATA_DOUBLE_NA_MAX)) )
-          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
-        else
-          REAL(VECTOR_ELT(df,ii))[j] = val_d;
-
-        break;
-      }
-        // float
-      case STATA_FLOAT:
-      {
-        float val_f = 0;
-        val_f = readbin(val_f, file, swapit);
-
-        if ((missing == 0) && ((val_f<STATA_FLOAT_NA_MIN) || (val_f>STATA_FLOAT_NA_MAX)) )
-          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
-        else
-          REAL(VECTOR_ELT(df,ii))[j] = val_f;
-
-        break;
-      }
-        // long
-      case STATA_INT:
-      {
-        int32_t val_l = 0;
-        val_l = readbin(val_l, file, swapit);
-
-        if ((missing == 0) && ((val_l<STATA_INT_NA_MIN) || (val_l>STATA_INT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
-        else
-          INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
-
-        break;
-      }
-        // int
-      case STATA_SHORTINT:
-      {
-        int16_t val_i = 0;
-        val_i = readbin(val_i, file, swapit);
-
-        if ((missing == 0) && ((val_i<STATA_SHORTINT_NA_MIN) || (val_i>STATA_SHORTINT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
-        else
-          INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
-
-        break;
-      }
-        // byte
-      case STATA_BYTE:
-      {
-        int8_t val_b = 0;
-        val_b = readbin(val_b, file, swapit);
-
-        if (missing == 0 && ( (val_b<STATA_BYTE_NA_MIN) || (val_b>STATA_BYTE_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
-        else
-          INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
-
-        break;
-      }
-        // strings with 2045 or fewer characters
-      case STATA_STR:
-      {
-        int32_t len = 0;
-        len = vartype_sj[i];
-        std::string val_s (len, '\0');
-
-        readstring(val_s, file, val_s.size());
-        as<CharacterVector>(df[ii])[j] = val_s;
-        break;
-      }
-        // string of any length
-      case STATA_STRL:
-      {// strL 2*4bit or 2 + 6 bit
-
-        // FixMe: Strl in 118
-        switch (release)
-      {
-
-      case 117:
-      {
-        uint32_t v = 0, o = 0;
-
-        v = readbin(v, file, swapit);
-        o = readbin(o, file, swapit);
-
-        stringstream val_stream;
-        val_stream << v << '_' << o;
-        string val_strl = val_stream.str();
-
-        as<CharacterVector>(df[ii])[j] = val_strl;
-
-        break;
-      }
-      case 118:
-      {
-        int16_t v = 0;
-        int64_t o = 0, z = 0;
-
-        z = readbin(z, file, swapit);
-
-        // works for LSF on little- and big-endian
-        if (byteorder.compare("LSF")==0) {
-          v = (int16_t)z;
-          o = (z >> 16);
-        }
-
-        // works if we read a big-endian file on little-endian
-        if (byteorder.compare("MSF")==0) {
-          v = (z >> 48) & ((1 << 16) - 1);
-          o = z & ((1 << 16) - 1);
-        }
-
-        stringstream val_stream;
-        val_stream << v << '_' << o;
-        string val_strl = val_stream.str();
-
-        as<CharacterVector>(df[ii])[j] = val_strl;
-
-        break;
-      }
-      case 119:
-      {
-        int32_t v = 0;
-        int64_t o = 0, z = 0;
-
-        z = readbin(z, file, swapit);
-
-        // works for LSF on little- and big-endian
-        if (byteorder.compare("LSF")==0) {
-          v = (int32_t)z & ((1 << 24) - 1);
-          o = (z >> 24);
-        }
-
-        // FixMe: works if we read a big-endian file on little-endian
-        if (byteorder.compare("MSF")==0) {
-          v = (z >> 48) & ((1 << 24) - 1);
-          o = z & ((1 << 24) - 1);
-        }
-
-        stringstream val_stream;
-        val_stream << v << '_' << o;
-        string val_strl = val_stream.str();
-
-        as<CharacterVector>(df[ii])[j] = val_strl;
-
-        break;
-      }
-      }
-        break;
-      }
-        // case < 0:
-      default:
-      {
-        // skip to the next valid case
-        fseeko64(file, abs(type), SEEK_CUR);
-        break;
-      }
-      }
-
-      if (type >= 0) ii += 1;
-
-      checkUserInterrupt();
-    }
-  }
+  List df = read_data(file, vartype_kk, missing, release, nn, kk,
+                      vartype_sj, byteorder, swapit);
 
   // skip to end of data part
   fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);
diff --git a/src/read_pre13_dta.cpp b/src/read_pre13_dta.cpp
index 67d09f26..c8e211a2 100644
--- a/src/read_pre13_dta.cpp
+++ b/src/read_pre13_dta.cpp
@@ -16,6 +16,7 @@
  */
 
 #include "readstata.h"
+#include "read_data.h"
 
 using namespace Rcpp;
 using namespace std;
@@ -94,11 +95,18 @@ List read_pre13_dta(FILE * file, const bool missing,
   IntegerVector byteorderI(1);
   bool swapit = 0;
 
-  int8_t byteorder = 0;
-  byteorder = readbin(byteorder, file, 0);
+  int8_t byteorder_i = 0;
+  byteorder_i = readbin(byteorder_i, file, 0);
   // 1 = MSF 2 = LSF
-  swapit = std::abs(SBYTEORDER-byteorder);
-  byteorderI(0) = byteorder;
+  swapit = std::abs(SBYTEORDER-byteorder_i);
+  byteorderI(0) = byteorder_i;
+
+  std::string byteorder(3, '\0');
+
+  if (byteorder_i == 1)
+    byteorder = "MSF";
+  else
+    byteorder = "LSF";
 
   // filetype: unknown?
   int8_t ft = 0;
@@ -428,144 +436,16 @@ List read_pre13_dta(FILE * file, const bool missing,
   vartype_s[nselect] = rlen2;
 
 
-  // 1. create the list
-  List df(kk);
-  for (uint32_t i=0; i<kk; ++i)
-  {
-    int const type = vartype_kk[i];
-
-    switch(type)
-    {
-    case STATA_DOUBLE:
-    case STATA_FLOAT:
-      SET_VECTOR_ELT(df, i, NumericVector(no_init(nn)));
-      break;
-
-    case STATA_INT:
-    case STATA_SHORTINT:
-    case STATA_BYTE:
-      SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
-      break;
-
-    default:
-      SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
-    break;
-    }
-  }
-
   // Use vartype_s to calulate jump
   IntegerVector vartype_sj = calc_jump(vartype_s);
-  kk = vartype_sj.size();
 
   // 2. fill it with data
 
   // skip into the data part
   fseeko64(file, rlength * nmin, SEEK_CUR);
 
-  uint32_t ii = 0;
-  for(uint32_t j=0; j<nn; ++j)
-  {
-    // reset partial index
-    ii = 0;
-    for (uint16_t i=0; i<kk; ++i)
-    {
-      int const type = vartype_sj[i];
-
-
-      switch(((type >0) & (type < 245)) ? STATA_SHORT_STR : type)
-      {
-        // double
-      case STATA_DOUBLE:
-      {
-        double val_d = 0;
-        val_d = readbin(val_d, file, swapit);
-
-        if ((missing == FALSE) & !(val_d == R_NegInf) & ((val_d<STATA_DOUBLE_NA_MIN) | (val_d>STATA_DOUBLE_NA_MAX)) )
-          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
-        else
-          REAL(VECTOR_ELT(df,ii))[j] = val_d;
-
-        break;
-      }
-        // float
-      case STATA_FLOAT:
-      {
-        float val_f = 0;
-        val_f = readbin(val_f, file, swapit);
-
-        if ((missing == FALSE) & ((val_f<STATA_FLOAT_NA_MIN) | (val_f>STATA_FLOAT_NA_MAX)) )
-          REAL(VECTOR_ELT(df,ii))[j] = NA_REAL;
-        else
-          REAL(VECTOR_ELT(df,ii))[j] = val_f;
-
-        break;
-      }
-        // long
-      case STATA_INT:
-      {
-        int32_t val_l = 0;
-        val_l = readbin(val_l, file, swapit);
-
-
-        if ((missing == FALSE) & ((val_l<STATA_INT_NA_MIN) | (val_l>STATA_INT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,ii))[j]  = NA_INTEGER;
-        else
-          INTEGER(VECTOR_ELT(df,ii))[j] = val_l;
-
-        break;
-      }
-        // int
-      case STATA_SHORTINT:
-      {
-        int16_t val_i = 0;
-        val_i = readbin(val_i, file, swapit);
-
-        if ((missing == FALSE) & ((val_i<STATA_SHORTINT_NA_MIN) | (val_i>STATA_SHORTINT_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
-        else
-          INTEGER(VECTOR_ELT(df,ii))[j] = val_i;
-
-        break;
-      }
-        // byte
-      case STATA_BYTE:
-      {
-        int8_t val_b = 0;
-        val_b = readbin(val_b, file, swapit);
-
-        if ((missing == FALSE) & ( (val_b<STATA_BYTE_NA_MIN) | (val_b>STATA_BYTE_NA_MAX)) )
-          INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER;
-        else
-          INTEGER(VECTOR_ELT(df,ii))[j] = val_b;
-
-        break;
-      }
-        // strings with 244 or fewer characters
-      case STATA_SHORT_STR:
-      {
-        int32_t len = 0;
-        len = vartype_sj[i];
-        std::string val_s (len, '\0');
-
-        readstring(val_s, file, val_s.size());
-
-        as<CharacterVector>(df[ii])[j] = val_s;
-
-        break;
-      }
-        // case < 0:
-      default:
-      {
-        // skip to the next valid case
-        fseeko64(file, abs(type), SEEK_CUR);
-        break;
-      }
-      }
-
-      if (type >= 0) ii += 1;
-      checkUserInterrupt();
-    }
-  }
+  List df = read_data(file, vartype_kk, missing, release, nn, kk,
+                      vartype_sj, byteorder, swapit);
 
   // skip to end of data part
   fseeko64(file, rlength * (n - nmax -1), SEEK_CUR);

From 7650c2039f8c9faa8430ad90d3ee0294695f4a8f Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 16 Mar 2018 12:16:44 +0100
Subject: [PATCH 49/76] Check for duplicate labels if generate.factors=T

---
 R/read.R | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/read.R b/R/read.R
index a8b08ca5..2a9960db 100644
--- a/R/read.R
+++ b/R/read.R
@@ -369,20 +369,20 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         }
         # get unique values / omit NA
         varunique <- na.omit(unique(data[, i]))
-        # assign label if label set is complete
-        if (all(varunique %in% labtable)) {
 
-          #check for duplicated labels
-          labcount <- table(names(labtable))
-          if(any(labcount > 1)) {
-            warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels",
-                           "detected - generating unique labels.\n"))
-            labdups <- names(labtable) %in% names(labcount[labcount > 1])
-            # generate unique labels from assigned label and code number
-            names(labtable)[labdups] <- paste0(names(labtable)[labdups],
-                                               "_(", labtable[labdups], ")")
-          }
+        #check for duplicated labels
+        labcount <- table(names(labtable))
+        if(any(labcount > 1)) {
+          warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels",
+                         "detected - generating unique labels.\n"))
+          labdups <- names(labtable) %in% names(labcount[labcount > 1])
+          # generate unique labels from assigned label and code number
+          names(labtable)[labdups] <- paste0(names(labtable)[labdups],
+                                             "_(", labtable[labdups], ")")
+        }
 
+        # assign label if label set is complete
+        if (all(varunique %in% labtable)) {
           data[, i] <- factor(data[, i], levels=labtable,
                               labels=names(labtable))
           # else generate labels from codes

From d2fcf21f7f8692676d484d3f16553f3ab8200f1f Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 16 Mar 2018 15:07:35 +0100
Subject: [PATCH 50/76] New function get.label.tables: show all Stata label
 sets

---
 NAMESPACE |  1 +
 R/tools.R | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index d93352b7..3ee43a47 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export("varlabel<-")
 export(as.caldays)
 export(get.label)
 export(get.label.name)
+export(get.label.tables)
 export(get.lang)
 export(get.origin.codes)
 export(read.dta13)
diff --git a/R/tools.R b/R/tools.R
index 079123aa..dd8e9910 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -193,6 +193,25 @@ get.label <- function(dat, label.name) {
   return(attr(dat, "label.table")[label.name][[1]])
 }
 
+#' Get all Stata Label Sets for a Data.frame
+#'
+#' Retrieve the value labels for all variables.
+#'
+#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
+#' @return Returns a named list of label tables
+#' @details This function returns the factor levels which represent
+#'  a Stata label set for all variables.
+#' @examples
+#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
+#' get.label.tables(dat)
+#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
+#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
+#' @export
+get.label.tables <- function(dat) {
+  varnames <- setNames(names(dat), names(dat))
+  lapply(varnames, function(varname) get.label(dat, get.label.name(dat, varname)))
+}
+
 #' Assign Stata Labels to a Variable
 #'
 #' Assign value labels from a Stata label set to a variable. If duplicated

From 5c3abbfefd9118a18f7ebc0bdbdba85531a41a38 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 16 Mar 2018 19:14:34 +0100
Subject: [PATCH 51/76] push Rd-file to please check()

---
 man/get.label.tables.Rd | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 man/get.label.tables.Rd

diff --git a/man/get.label.tables.Rd b/man/get.label.tables.Rd
new file mode 100644
index 00000000..3321167e
--- /dev/null
+++ b/man/get.label.tables.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tools.R
+\name{get.label.tables}
+\alias{get.label.tables}
+\title{Get all Stata Label Sets for a Data.frame}
+\usage{
+get.label.tables(dat)
+}
+\arguments{
+\item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.}
+}
+\value{
+Returns a named list of label tables
+}
+\description{
+Retrieve the value labels for all variables.
+}
+\details{
+This function returns the factor levels which represent
+ a Stata label set for all variables.
+}
+\examples{
+dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
+get.label.tables(dat)
+}
+\author{
+Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
+
+Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de}
+}

From 162bc7fe68d826e8b8c56464b46fcd450875c9f0 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 16 Mar 2018 19:14:34 +0100
Subject: [PATCH 52/76] push Rd-file to please check()

---
 NAMESPACE               |  1 +
 R/tools.R               |  1 +
 man/get.label.tables.Rd | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 man/get.label.tables.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 3ee43a47..8b070e9b 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,6 +16,7 @@ export(varlabel)
 import(Rcpp)
 importFrom(stats,complete.cases)
 importFrom(stats,na.omit)
+importFrom(stats,setNames)
 importFrom(utils,download.file)
 importFrom(utils,localeToCharset)
 importFrom(utils,setTxtProgressBar)
diff --git a/R/tools.R b/R/tools.R
index e7502d94..1eca8f14 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -216,6 +216,7 @@ get.label <- function(dat, label.name) {
 #' get.label.tables(dat)
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
+#' @importFrom stats setNames
 #' @export
 get.label.tables <- function(dat) {
   varnames <- setNames(names(dat), names(dat))
diff --git a/man/get.label.tables.Rd b/man/get.label.tables.Rd
new file mode 100644
index 00000000..3321167e
--- /dev/null
+++ b/man/get.label.tables.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tools.R
+\name{get.label.tables}
+\alias{get.label.tables}
+\title{Get all Stata Label Sets for a Data.frame}
+\usage{
+get.label.tables(dat)
+}
+\arguments{
+\item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.}
+}
+\value{
+Returns a named list of label tables
+}
+\description{
+Retrieve the value labels for all variables.
+}
+\details{
+This function returns the factor levels which represent
+ a Stata label set for all variables.
+}
+\examples{
+dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))
+get.label.tables(dat)
+}
+\author{
+Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de}
+
+Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de}
+}

From f031163b38568e8d9b59b10d10a602997e3b043c Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 11 May 2018 13:35:19 +0200
Subject: [PATCH 53/76] option to allow export of binary files stored in strl

---
 R/RcppExports.R         |  4 ++--
 R/read.R                | 10 ++++++++--
 inst/include/read_dta.h |  4 +++-
 man/read.dta13.Rd       |  7 ++++++-
 src/RcppExports.cpp     | 10 ++++++----
 src/read.cpp            |  6 ++++--
 src/read_dta.cpp        | 28 +++++++++++++++++++++++-----
 7 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index 3eb6fc0f..1ef45a93 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,8 +1,8 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-stata_read <- function(filePath, missing, selectrows, selectcols) {
-    .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols)
+stata_read <- function(filePath, missing, selectrows, selectcols, strlexport, strlpath) {
+    .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols, strlexport, strlpath)
 }
 
 stata_save <- function(filePath, dat) {
diff --git a/R/read.R b/R/read.R
index 2a9960db..db422b10 100644
--- a/R/read.R
+++ b/R/read.R
@@ -53,6 +53,9 @@
 #'  value rows from 1:val are selected. If two values of a range are selected
 #'  the rows in range will be selected.
 #' @param select.cols \emph{character:} Vector of variables to select.
+#' @param strlexport \emph{logical:} Should strl content be exported as binary
+#'  files?
+#' @param strlpath \emph{character:} Path for strl export.
 #'
 #' @details If the filename is a url, the file will be downloaded as a temporary
 #'  file and read afterwards.
@@ -126,7 +129,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                        convert.underscore = FALSE, missing.type = FALSE,
                        convert.dates = TRUE, replace.strl = TRUE,
                        add.rownames = FALSE, nonint.factors=FALSE,
-                       select.rows = NULL, select.cols = NULL) {
+                       select.rows = NULL, select.cols = NULL,
+                       strlexport = FALSE, strlpath = ".") {
+
   # Check if path is a url
   if (length(grep("^(http|ftp|https)://", file))) {
     tmp <- tempfile()
@@ -176,7 +181,8 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     select.cols <- ""
   }
 
-  data <- stata_read(filepath, missing.type, select.rows, select.cols)
+  data <- stata_read(filepath, missing.type, select.rows, select.cols,
+                     strlexport, strlpath)
 
   version <- attr(data, "version")
 
diff --git a/inst/include/read_dta.h b/inst/include/read_dta.h
index 7369903f..f0a8ee27 100644
--- a/inst/include/read_dta.h
+++ b/inst/include/read_dta.h
@@ -20,6 +20,8 @@
 
 Rcpp::List read_dta(FILE * file, const bool missing,
                     const Rcpp::IntegerVector selectrows,
-                    const Rcpp::CharacterVector selectcols);
+                    const Rcpp::CharacterVector selectcols,
+                    const bool strlexport,
+                    const Rcpp::CharacterVector strlpath);
 
 #endif
diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
index 7a3e8fc5..98534404 100644
--- a/man/read.dta13.Rd
+++ b/man/read.dta13.Rd
@@ -8,7 +8,7 @@ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE,
   encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE,
   missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE,
   add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL,
-  select.cols = NULL)
+  select.cols = NULL, strlexport = FALSE, strlpath = ".")
 }
 \arguments{
 \item{file}{\emph{character.} Path to the dta file you want to import.}
@@ -57,6 +57,11 @@ value rows from 1:val are selected. If two values of a range are selected
 the rows in range will be selected.}
 
 \item{select.cols}{\emph{character:} Vector of variables to select.}
+
+\item{strlexport}{\emph{logical:} Should strl content be exported as binary
+files?}
+
+\item{strlpath}{\emph{character:} Path for strl export.}
 }
 \value{
 The function returns a data.frame with attributes. The attributes
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 122fd89a..0f12b52a 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -6,8 +6,8 @@
 using namespace Rcpp;
 
 // stata_read
-List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols);
-RcppExport SEXP _readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP) {
+List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols, const bool strlexport, const CharacterVector strlpath);
+RcppExport SEXP _readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP, SEXP strlexportSEXP, SEXP strlpathSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -15,7 +15,9 @@ BEGIN_RCPP
     Rcpp::traits::input_parameter< const bool >::type missing(missingSEXP);
     Rcpp::traits::input_parameter< const IntegerVector >::type selectrows(selectrowsSEXP);
     Rcpp::traits::input_parameter< const CharacterVector >::type selectcols(selectcolsSEXP);
-    rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows, selectcols));
+    Rcpp::traits::input_parameter< const bool >::type strlexport(strlexportSEXP);
+    Rcpp::traits::input_parameter< const CharacterVector >::type strlpath(strlpathSEXP);
+    rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows, selectcols, strlexport, strlpath));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -45,7 +47,7 @@ END_RCPP
 }
 
 static const R_CallMethodDef CallEntries[] = {
-    {"_readstata13_stata_read", (DL_FUNC) &_readstata13_stata_read, 4},
+    {"_readstata13_stata_read", (DL_FUNC) &_readstata13_stata_read, 6},
     {"_readstata13_stata_save", (DL_FUNC) &_readstata13_stata_save, 2},
     {"_readstata13_stata_pre13_save", (DL_FUNC) &_readstata13_stata_pre13_save, 2},
     {NULL, NULL, 0}
diff --git a/src/read.cpp b/src/read.cpp
index 9ec251ff..bc6b2269 100644
--- a/src/read.cpp
+++ b/src/read.cpp
@@ -28,7 +28,8 @@ using namespace Rcpp;
 // [[Rcpp::export]]
 List stata_read(const char * filePath, const bool missing,
                 const IntegerVector selectrows,
-                const CharacterVector selectcols)
+                const CharacterVector selectcols,
+                const bool strlexport, const CharacterVector strlpath)
 {
   FILE *file = NULL;    // File pointer
 
@@ -54,7 +55,8 @@ List stata_read(const char * filePath, const bool missing,
   List df(0);
 
   if (fbit.compare(expfbit) == 0)
-    df = read_dta(file, missing, selectrows, selectcols);
+    df = read_dta(file, missing, selectrows, selectcols,
+                  strlexport, strlpath);
   else
     df = read_pre13_dta(file, missing, selectrows, selectcols);
 
diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 5caf1536..26cb15f0 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -22,7 +22,9 @@ using namespace Rcpp;
 using namespace std;
 
 List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
-              const CharacterVector selectcols) {
+              const CharacterVector selectcols,
+              const bool strlexport, const CharacterVector strlpath)
+{
   // stata_dta><header>
   test("stata_dta><header>", file);
   test("<release>", file);
@@ -427,7 +429,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   if (selectvars)
     select = choose(selectcols, varnames);
 
- // separate the selected from the not selected cases
+  // separate the selected from the not selected cases
   LogicalVector ll = is_na(select);
   nselect = cvec[ll == 1];
   select = cvec[ll == 0];
@@ -539,6 +541,22 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
 
     readstring(strl, file, strl.size());
 
+    // write strl to file. Stata allows binary files in strls
+    if (strlexport) {
+
+      std::string path = Rcpp::as<std::string>(strlpath);
+      std::string outputpath = path + "/" + ref;
+
+      ofstream file1(outputpath, ios::out | ios::binary);
+      if (file1.good()) {
+        file1.write(strl.c_str(), strl.size());
+        file1.close();
+      } else {
+        std::cout << "file error write" << endl;
+      }
+
+    }
+
     strlvalues.push_back( strl );
     strlnames.push_back( ref );
 
@@ -653,9 +671,9 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
   }
 
   /*
-  * Final test if we reached the end of the file
-  * close the file
-  */
+   * Final test if we reached the end of the file
+   * close the file
+   */
 
   // [</val]ue_labels>
   test("ue_labels>", file);

From b11b58c0516bfd472b26aa806d2d74bd1818a4bb Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 11 May 2018 13:49:04 +0200
Subject: [PATCH 54/76] R does not like cout

---
 src/read_dta.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index 26cb15f0..fe0615cf 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -552,7 +552,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
         file1.write(strl.c_str(), strl.size());
         file1.close();
       } else {
-        std::cout << "file error write" << endl;
+        Rcpp::Rcout << "strl export failed" << std::endl;
       }
 
     }

From 6ba176c57691c34d1ab54b333cba26202e5fb2c3 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 11 May 2018 14:25:11 +0200
Subject: [PATCH 55/76] fix for c++ < 11

---
 src/read_dta.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/read_dta.cpp b/src/read_dta.cpp
index fe0615cf..6578f756 100644
--- a/src/read_dta.cpp
+++ b/src/read_dta.cpp
@@ -547,7 +547,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows,
       std::string path = Rcpp::as<std::string>(strlpath);
       std::string outputpath = path + "/" + ref;
 
-      ofstream file1(outputpath, ios::out | ios::binary);
+      ofstream file1(outputpath.c_str(), ios::out | ios::binary);
       if (file1.good()) {
         file1.write(strl.c_str(), strl.size());
         file1.close();

From 29f2ae1fe60a936949a1c29dfc56f8d68ae6d78e Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 11 May 2018 14:41:44 +0200
Subject: [PATCH 56/76] Update news

---
 NEWS      | 1 +
 README.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/NEWS b/NEWS
index 7e28c8c7..a72b43e7 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,7 @@
 - allow reading only pre-selected variables
 - experimental support for format 119
 - improve partial reading
+- export of binary data from dta-files
 
 [0.9.0]
 - generate unique factor labels to prevent errors in factor definition
diff --git a/README.md b/README.md
index ce5684a6..23dfe244 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 * [testing] Allow reading only pre-selected variables
 * [testing] Experimental support for format 119
 * [testing] Improvements to partial reading. Idea by Kevin Jin
+* [testing] Export of binary data from dta-files
 * [0.9.0] Generate unique factor labels to prevent errors in factor definition
 * [0.9.0] check interrupt for long read. Patch by Giovanni Righi
 * [0.9.0] updates to notes, roxygen and register

From 8588dfcfff83085460cfac237cdf206a7e7d5485 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 11 May 2018 22:50:01 +0200
Subject: [PATCH 57/76] avoid multiple slow calls, assume everythings fine and
 carry on

---
 R/tools.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index 1eca8f14..17b84b90 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -365,6 +365,10 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
     cat("Replacing value labels. This might take some time...\n")
     pb <- txtProgressBar(min=1,max=length(val.labels)+1)
 
+
+    oldlabname_f <- get.label.name(dat)
+    oldlabtab_f <- lapply(oldlabname_f, function(x) get.label(dat, x))
+
     for (i in seq_along(val.labels)) {
       if (val.labels[i]!="") {
         labname <- val.labels[i]
@@ -374,8 +378,8 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
 
         # get old codes
         if (is.factor(dat[, varname])) {
-          oldlabname <- get.label.name(dat, varname)
-          oldlabtab <- get.label(dat, oldlabname)
+          oldlabname <- oldlabname_f[oldlabname_f == varname]
+          oldlabtab <- oldlabtab_f[[names(oldlabname)]]
           codes <- get.origin.codes(dat[,varname], oldlabtab)
           varunique <- na.omit(unique(codes))
         } else {

From df1c1b1e2e9be1536b7d62ffceac56d4c51d1a25 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Fri, 11 May 2018 22:59:38 +0200
Subject: [PATCH 58/76] thinko

---
 R/tools.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/tools.R b/R/tools.R
index 17b84b90..70c1a91a 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -378,7 +378,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
 
         # get old codes
         if (is.factor(dat[, varname])) {
-          oldlabname <- oldlabname_f[oldlabname_f == varname]
+          oldlabname <- oldlabname_f[names(oldlabname_f) == varname]
           oldlabtab <- oldlabtab_f[[names(oldlabname)]]
           codes <- get.origin.codes(dat[,varname], oldlabtab)
           varunique <- na.omit(unique(codes))

From 45994f604c0473e5f63b4e0efb4d070d94b589a6 Mon Sep 17 00:00:00 2001
From: Jan Marvin Garbuszus <jan.garbuszus@ruhr-uni-bochum.de>
Date: Sat, 12 May 2018 01:33:01 +0200
Subject: [PATCH 59/76] cleanup

---
 R/tools.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index 70c1a91a..d0b95aea 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -369,8 +369,8 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
     oldlabname_f <- get.label.name(dat)
     oldlabtab_f <- lapply(oldlabname_f, function(x) get.label(dat, x))
 
-    for (i in seq_along(val.labels)) {
-      if (val.labels[i]!="") {
+    for (i in which(val.labels != "")) {
+
         labname <- val.labels[i]
         vartype <- types[i]
         labtable <- label[[labname]]
@@ -407,7 +407,6 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
         }
 
         setTxtProgressBar(pb, i)
-        }
     }
     close(pb)
 

From 8d9aeaa3f1b595be518a55480f726ce0a5779dc1 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 12:31:13 +0200
Subject: [PATCH 60/76] Update DESCRIPTION and NEWS for release

---
 DESCRIPTION |  2 +-
 NEWS        |  4 +++-
 README.md   | 22 ++++++++++++++++------
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a0bcccd9..a04ab6ff 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: readstata13
 Type: Package
 Title: Import 'Stata' Data Files
-Version: 0.9.0
+Version: 0.9.1
 Authors@R: c(
     person("Jan Marvin", "Garbuszus",
     email = "jan.garbuszus@ruhr-uni-bochum.de", role = c("aut")),
diff --git a/NEWS b/NEWS
index a72b43e7..b8d1ead1 100644
--- a/NEWS
+++ b/NEWS
@@ -1,8 +1,10 @@
-[testing]
+[0.9.1]
 - allow reading only pre-selected variables
 - experimental support for format 119
 - improve partial reading
 - export of binary data from dta-files
+- new function get.label.tables() to show all Stata label sets
+- fix check for duplicate labels
 
 [0.9.0]
 - generate unique factor labels to prevent errors in factor definition
diff --git a/README.md b/README.md
index 23dfe244..9bb1b955 100644
--- a/README.md
+++ b/README.md
@@ -35,12 +35,12 @@ users need to install
 
 ```R
 # install.packages("devtools")
-devtools::install_github("sjewo/readstata13", ref="0.9.0")
+devtools::install_github("sjewo/readstata13", ref="0.9.1")
 ```
 
 Older Versions of devtools require a username option:
 ```R
-install_github("readstata13", username="sjewo", ref="0.9.0")
+install_github("readstata13", username="sjewo", ref="0.9.1")
 ```
 
 To install the current development version from github:
@@ -56,10 +56,14 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13)
 
 ### Working features
-* [testing] Allow reading only pre-selected variables
-* [testing] Experimental support for format 119
-* [testing] Improvements to partial reading. Idea by Kevin Jin
-* [testing] Export of binary data from dta-files
+
+* [0.9.1] Allow reading only pre-selected variables
+* [0.9.1] Experimental support for format 119
+* [0.9.1] Improvements to partial reading. Idea by Kevin Jin
+* [0.9.1] Export of binary data from dta-files
+* [0.9.1] new function get.label.tables() to show all Stata label sets
+* [0.9.1] fix check for duplicate labels
+
 * [0.9.0] Generate unique factor labels to prevent errors in factor definition
 * [0.9.0] check interrupt for long read. Patch by Giovanni Righi
 * [0.9.0] updates to notes, roxygen and register
@@ -67,22 +71,28 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 * [0.9.0] fix saving characters containing missings. Bug reported by Eivind H. Olsen
 * [0.9.0] adjustments to convert.underscore. Patch by luke-m-olson
 * [0.9.0] alow partial reading of selected rows
+
 * [0.8.5] fix errors on big-endians systems
+
 * [0.8.4] fix valgrind errors. converting from dta.write to writestr
 * [0.8.4] fix for empty data label
 * [0.8.4] make replace.strl default
+
 * [0.8.3] restrict length of varnames to 32 chars for compatibility with Stata 14
 * [0.8.3] add many function tests
 * [0.8.3] avoid converting of double to floats while writing compressed files
+
 * [0.8.2] save NA values in character vector as empty string
 * [0.8.2] convert.underscore=T will convert all non-literal characters to underscores
 * [0.8.2] fix saving of Dates
 * [0.8.2] save with convert.factors by default
 * [0.8.2] test for NaN and inf values while writing missing values and replace with NA
 * [0.8.2] remove message about saving factors
+
 * [0.8.1] convert non-integer variables to factors (```nonint.factors=T```) 
 * [0.8.1] handle large datasets
 * [0.8.1] working with strL variables is now a lot faster
+
 * reading data files from disk or url and create a data.frame
 * saving dta files to disk - most features of the dta file format are supported
 * assign variable names

From 19b3c0d4928b9e9eda5fbb138061186549b61199 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 12:51:16 +0200
Subject: [PATCH 61/76] Update Readme

---
 README.md | 100 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 9bb1b955..95490119 100644
--- a/README.md
+++ b/README.md
@@ -55,55 +55,57 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 [![Build Status](https://travis-ci.org/sjewo/readstata13.svg?branch=master)](https://travis-ci.org/sjewo/readstata13)
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13)
 
-### Working features
-
-* [0.9.1] Allow reading only pre-selected variables
-* [0.9.1] Experimental support for format 119
-* [0.9.1] Improvements to partial reading. Idea by Kevin Jin
-* [0.9.1] Export of binary data from dta-files
-* [0.9.1] new function get.label.tables() to show all Stata label sets
-* [0.9.1] fix check for duplicate labels
-
-* [0.9.0] Generate unique factor labels to prevent errors in factor definition
-* [0.9.0] check interrupt for long read. Patch by Giovanni Righi
-* [0.9.0] updates to notes, roxygen and register
-* [0.9.0] fixed size of character length. Bug reported by Yiming (Paul) Li
-* [0.9.0] fix saving characters containing missings. Bug reported by Eivind H. Olsen
-* [0.9.0] adjustments to convert.underscore. Patch by luke-m-olson
-* [0.9.0] alow partial reading of selected rows
-
-* [0.8.5] fix errors on big-endians systems
-
-* [0.8.4] fix valgrind errors. converting from dta.write to writestr
-* [0.8.4] fix for empty data label
-* [0.8.4] make replace.strl default
-
-* [0.8.3] restrict length of varnames to 32 chars for compatibility with Stata 14
-* [0.8.3] add many function tests
-* [0.8.3] avoid converting of double to floats while writing compressed files
-
-* [0.8.2] save NA values in character vector as empty string
-* [0.8.2] convert.underscore=T will convert all non-literal characters to underscores
-* [0.8.2] fix saving of Dates
-* [0.8.2] save with convert.factors by default
-* [0.8.2] test for NaN and inf values while writing missing values and replace with NA
-* [0.8.2] remove message about saving factors
-
-* [0.8.1] convert non-integer variables to factors (```nonint.factors=T```) 
-* [0.8.1] handle large datasets
-* [0.8.1] working with strL variables is now a lot faster
-
-* reading data files from disk or url and create a data.frame
-* saving dta files to disk - most features of the dta file format are supported
-* assign variable names
-* read the new strL strings and save them as attribute 
-* convert stata label to factors and save them as attribute
-* read some meta data (timestamp, dataset label, formats,...)
-* convert strings to system encoding
-* handle different NA values
-* handle multiple label languages
-* convert dates
-* reading business calendar files
+### Changelog and Features
+
+| Version | Changes                                                                       |
+| ------  | ----------------------------------------------------                          |
+| 0.9.1   | Allow reading only pre-selected variables                                     |
+| 0.9.1   | Experimental support for format 119                                           |
+| 0.9.1   | Improvements to partial reading. Idea by Kevin Jin                            |
+| 0.9.1   | Export of binary data from dta-files                                          |
+| 0.9.1   | new function get.label.tables() to show all Stata label sets                  |
+| 0.9.1   | fix check for duplicate labels                                                |
+|         |
+| 0.9.0   | Generate unique factor labels to prevent errors in factor definition          |
+| 0.9.0   | check interrupt for long read. Patch by Giovanni Righi                        |
+| 0.9.0   | updates to notes, roxygen and register                                        |
+| 0.9.0   | fixed size of character length. Bug reported by Yiming (Paul) Li              |
+| 0.9.0   | fix saving characters containing missings. Bug reported by Eivind H. Olsen    |
+| 0.9.0   | adjustments to convert.underscore. Patch by luke-m-olson                      |
+| 0.9.0   | allow partial reading of selected rows                                        |
+|         |
+| 0.8.5   | fix errors on big-endian systems                                              |
+|         |
+| 0.8.4   | fix valgrind errors. converting from dta.write to writestr                    |
+| 0.8.4   | fix for empty data label                                                      |
+| 0.8.4   | make replace.strl default                                                     |
+|         |
+| 0.8.3   | restrict length of varnames to 32 chars for compatibility with Stata 14       |
+| 0.8.3   | add many function tests                                                       |
+| 0.8.3   | avoid converting of double to floats while writing compressed files           |
+|         |
+| 0.8.2   | save NA values in character vector as empty string                            |
+| 0.8.2   | convert.underscore=T will convert all non-literal characters to underscores   |
+| 0.8.2   | fix saving of Dates                                                           |
+| 0.8.2   | save with convert.factors by default                                          |
+| 0.8.2   | test for NaN and inf values while writing missing values and replace with NA  |
+| 0.8.2   | remove message about saving factors                                           |
+|         |
+| 0.8.1   | convert non-integer variables to factors (```nonint.factors=T```)             |
+| 0.8.1   | handle large datasets                                                         |
+| 0.8.1   | working with strL variables is now a lot faster                               |
+|         |                                                                               |
+| <0.8.1  | reading data files from disk or url and create a data.frame                   |
+| <0.8.1  | saving dta files to disk - most features of the dta file format are supported |
+| <0.8.1  | assign variable names                                                         |
+| <0.8.1  | read the new strL strings and save them as attribute                          |
+| <0.8.1  | convert stata label to factors and save them as attribute                     |
+| <0.8.1  | read some meta data (timestamp, dataset label, formats,...)                   |
+| <0.8.1  | convert strings to system encoding                                            |
+| <0.8.1  | handle different NA values                                                    |
+| <0.8.1  | handle multiple label languages                                               |
+| <0.8.1  | convert dates                                                                 |
+| <0.8.1  | reading business calendar files                                               |
 
 ### Todo
 

From 0e7c8807a0e3dd6937329156bd9f5d0267d359ee Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 12:52:44 +0200
Subject: [PATCH 62/76] update affiliation

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 95490119..20e3c451 100644
--- a/README.md
+++ b/README.md
@@ -134,7 +134,7 @@ Map(identical,r12,r13)
 
 ## Authors
 
-[Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de) (both Ruhr-Universität Bochum)
+[Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de)([sjewo](https://github.com/sjewo)) 
 
 ## Licence
 

From 346977bf973355c4f5fd9810024cc8248b74698e Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 12:53:27 +0200
Subject: [PATCH 63/76] Add ORCID

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a04ab6ff..6a873e43 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -6,7 +6,7 @@ Authors@R: c(
     person("Jan Marvin", "Garbuszus",
     email = "jan.garbuszus@ruhr-uni-bochum.de", role = c("aut")),
     person("Sebastian", "Jeworutzki",
-    email="Sebastian.Jeworutzki@ruhr-uni-bochum.de", role = c("aut", "cre")),
+    email="Sebastian.Jeworutzki@ruhr-uni-bochum.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2671-5253")),
     person("R Core Team", role="cph"),
     person("Magnus Thor", "Torfason", role="ctb"),
     person("Luke M.", "Olson", role="ctb"),

From 9b595f4b861cde44a2dc1fc6578bd6e6b09709e7 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <Sebastian.Jeworutzki@rub.de>
Date: Fri, 25 May 2018 12:55:58 +0200
Subject: [PATCH 64/76] fix typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 20e3c451..540b3b60 100644
--- a/README.md
+++ b/README.md
@@ -134,7 +134,7 @@ Map(identical,r12,r13)
 
 ## Authors
 
-[Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de)([sjewo](https://github.com/sjewo)) 
+[Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de) ([sjewo](https://github.com/sjewo)) 
 
 ## Licence
 

From c51c844e82d6936651446c59f2aa87840937358b Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 13:35:24 +0200
Subject: [PATCH 65/76] Support only up to 118

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 20e3c451..b75b4c16 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # readstata13
 
 Package to read and write all Stata file formats (version 15 and older) into a
-R data.frame. The dta file format versions 102 to 119 are supported.
+R data.frame. The dta file format versions 102 to 118 are supported.
 
 The function ```read.dta``` from the foreign package imports only dta files from
 Stata versions <= 12. Due to the different structure and features of dta 117

From 8a3b815d50f3ba4cd7707af9260db735880bd1de Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 13:35:41 +0200
Subject: [PATCH 66/76] cleanup code

---
 R/tools.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index d0b95aea..6457b8ca 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -360,14 +360,14 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
     val.labels <- get.label.name(dat, NULL, lang)
     oldval.labels <- get.label.name(dat)
     oldval.labels <- oldval.labels[!is.na(oldval.labels)]
+    oldval.labtab <- lapply(oldval.labels, function(x) get.label(dat, x))
+
     oldlang <- get.lang(dat, F)$default
 
     cat("Replacing value labels. This might take some time...\n")
     pb <- txtProgressBar(min=1,max=length(val.labels)+1)
 
 
-    oldlabname_f <- get.label.name(dat)
-    oldlabtab_f <- lapply(oldlabname_f, function(x) get.label(dat, x))
 
     for (i in which(val.labels != "")) {
 
@@ -378,8 +378,8 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
 
         # get old codes
         if (is.factor(dat[, varname])) {
-          oldlabname <- oldlabname_f[names(oldlabname_f) == varname]
-          oldlabtab <- oldlabtab_f[[names(oldlabname)]]
+          oldlabname <- oldval.labels[names(oldval.labels) == varname]
+          oldlabtab <- oldval.labtab[[names(oldlabname)]]
           codes <- get.origin.codes(dat[,varname], oldlabtab)
           varunique <- na.omit(unique(codes))
         } else {

From f2f310c65096b790ebb84035b0b974512f41fd7b Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 13:35:50 +0200
Subject: [PATCH 67/76] typo

---
 R/read.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/read.R b/R/read.R
index db422b10..c320c4be 100644
--- a/R/read.R
+++ b/R/read.R
@@ -157,11 +157,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       if (any(select.rows < 0) )
         select.rows <- abs(select.rows)
 
-      # check that lenght is not > 2
+      # check that length is not > 2
       if (length(select.rows) > 2)
         return(message("select.rows must be of length 1 or 2."))
 
-      # if lenght 1 start at row 1
+      # if length 1 start at row 1
       if (length(select.rows) == 1)
         select.rows <- c(1, select.rows)
     }

From f3764042533368b4a10af039ac204c8d55326ce4 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 13:42:21 +0200
Subject: [PATCH 68/76] fix typo

---
 R/save.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/save.R b/R/save.R
index 4396ff3e..ecca73fa 100644
--- a/R/save.R
+++ b/R/save.R
@@ -73,7 +73,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   if (!is.data.frame(data))
     stop("The object \"data\" must have class data.frame")
   if (!dir.exists13(dirname(file)))
-    stop("Path is invalid. Possibly a non existend directory.")
+    stop("Path is invalid. Possibly a non-existing directory.")
 
   # Allow writing version as Stata version not Stata format
   if (version==15L)
@@ -94,7 +94,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
     version <- 108
 
   if (version<102 | version == 109 | version == 116 | version>119)
-    stop("Version missmatch abort execution. No Data was saved.")
+    stop("Version mismatch abort execution. No Data was saved.")
 
   sstr     <- 2045
   sstrl    <- 32768

From 4fcfbbf38e860facf30831d262315102f4f9e11e Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:17:42 +0200
Subject: [PATCH 69/76] Fixed spacing in messages

---
 R/read.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/read.R b/R/read.R
index c320c4be..d2e00249 100644
--- a/R/read.R
+++ b/R/read.R
@@ -28,7 +28,7 @@
 #'  "label_(integer code)".
 #' @param encoding \emph{character.} Strings can be converted from Windows-1252
 #'  or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
-#'  target encoding explicitly. Stata 14 files are UTF-8 encoded and may contain
+#'  target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain
 #'  strings which can't be displayed in the current locale.
 #'  Set encoding=NULL to stop reencoding.
 #' @param fromEncoding \emph{character.} We expect strings to be encoded as
@@ -366,9 +366,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       if (labname %in% names(label)) {
         if((vartype == sdouble | vartype == sfloat)) {
           if(!nonint.factors) {
-            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double",
-                           "or float detected - no labels assigned.\n  Set",
-                           "option nonint.factors to TRUE to assign labels",
+            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double ",
+                           "or float detected - no labels assigned.\n  Set ",
+                           "option nonint.factors to TRUE to assign labels ",
                            "anyway.\n"))
             next
           }
@@ -379,7 +379,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
         #check for duplicated labels
         labcount <- table(names(labtable))
         if(any(labcount > 1)) {
-          warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels",
+          warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels ",
                          "detected - generating unique labels.\n"))
           labdups <- names(labtable) %in% names(labcount[labcount > 1])
           # generate unique labels from assigned label and code number
@@ -400,8 +400,8 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                               labels=names(gen.lab))
 
         } else {
-          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no",
-                         "labels assigned.\n  Set option generate.factors=T to",
+          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no ",
+                         "labels assigned.\n  Set option generate.factors=T to ",
                          "generate labels."))
         }
       }

From 22ffd757ab535a77dbeb1ca85ae3089e2d94b5bf Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:18:13 +0200
Subject: [PATCH 70/76] Add generated labels to label.table

---
 R/read.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/R/read.R b/R/read.R
index d2e00249..68cd6822 100644
--- a/R/read.R
+++ b/R/read.R
@@ -398,6 +398,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
 
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
+          
+          # add generated labels to label.table
+          gen.lab.name <- paste0("gen_",vnames[i])
+          attr(data, "label.table")[[gen.lab.name]] <- gen.lab 
+          attr(data, "val.labels")[i] <- gen.lab.name
 
         } else {
           warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no ",

From a35913e40ac965880183b284f3997dc546479208 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:19:41 +0200
Subject: [PATCH 71/76] Add note about experimental support for Stata 15/MP

---
 R/save.R | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/R/save.R b/R/save.R
index ecca73fa..1fd39cee 100644
--- a/R/save.R
+++ b/R/save.R
@@ -39,7 +39,9 @@
 #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will
 #'  use all of Statas numeric-vartypes.
 #' @param version \emph{numeric.} Stata format for the resulting dta-file either
-#'  the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 15.
+#'  Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). 
+#'  Experimental support for large datasets: Use version="15mp" to save the dataset
+#'  in the new Stata 15/MP file format. This feature is not thoroughly tested yet.
 #' @return The function writes a dta-file to disk. The following features of the
 #'  dta file format are supported:
 #' \describe{
@@ -76,8 +78,10 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
     stop("Path is invalid. Possibly a non-existing directory.")
 
   # Allow writing version as Stata version not Stata format
-  if (version==15L)
+  if (version=="15mp")
     version <- 119
+  if (version==15L)
+    version <- 118
   if (version==14L)
     version <- 118
   if (version==13L)
@@ -93,6 +97,9 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   if (version==6)
     version <- 108
 
+  if (version == 119)
+    message("Support for Stata 15/MP (119) format is experimental and not thoroughly tested.")
+
   if (version<102 | version == 109 | version == 116 | version>119)
     stop("Version mismatch abort execution. No Data was saved.")
 

From 414859bdc7d044f1ae503754288d7b82f14da7a0 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:20:15 +0200
Subject: [PATCH 72/76] Update manpages

---
 R/tools.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index 6457b8ca..cc05c72c 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -227,7 +227,7 @@ get.label.tables <- function(dat) {
 #'
 #' Assign value labels from a Stata label set to a variable. If duplicated
 #'  labels are found, unique labels will be generated according the following
-#'  scheme: "label_(integer code)".
+#'  scheme: "label_(integer code)". Levels without labels will become <NA>.
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
 #' @param var.name \emph{character.} Name of the variable in the data.frame
@@ -328,7 +328,8 @@ varlabel <- function(dat, var.name=NULL, lang=NA) {
 
 #' Assign Stata Language Labels
 #'
-#' Changes default label language for a dataset.
+#' Changes default label language for a dataset. 
+#' Variables with generated labels (option generate.factors=TRUE) are kept unchanged.
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
 #' @param lang \emph{character.} Label language. Default language defined by

From 2cbf15469e02799bed75576a30bcb3662ec6a077 Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:23:05 +0200
Subject: [PATCH 73/76] Change label for all factors and ignore vartype

---
 R/tools.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/tools.R b/R/tools.R
index cc05c72c..631b58e8 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -387,8 +387,8 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
           varunique <- na.omit(unique(dat[,varname]))
         }
 
-        if (labname %in% names(label) & vartype > 65527 &
-           is.factor(dat[,varname])) {
+        if (labname %in% names(label) & is.factor(dat[,varname])) {
+                     
           # assign label if label set is complete
           if (all(varunique %in% labtable)) {
 

From 7b1d1961ce67eae3b1bcb9f4b852d0a6dd13b82e Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:23:33 +0200
Subject: [PATCH 74/76] Fix factor generation if generate.factors is true in
 set.lang

---
 R/tools.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/tools.R b/R/tools.R
index 631b58e8..a29eddfb 100644
--- a/R/tools.R
+++ b/R/tools.R
@@ -400,7 +400,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
 
-          dat[,varname] <- factor(dat[,varname], levels=gen.lab,
+          dat[,varname] <- factor(codes, levels=gen.lab,
                                   labels=names(gen.lab))
         } else {
           warning(paste(vnames[i], "Missing factor labels - no labels assigned.

From 888c2661430a834d7330b4b4da8ed0e1265f169e Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:26:00 +0200
Subject: [PATCH 75/76] add test for option version="15mp"

---
 tests/testthat/test_save.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/testthat/test_save.R b/tests/testthat/test_save.R
index 5800e08d..786944ee 100644
--- a/tests/testthat/test_save.R
+++ b/tests/testthat/test_save.R
@@ -40,6 +40,7 @@ dir.create("data")
 
 dd <- mtcars
 
+save.dta13(dd, "data/dta_15mp.dta", version = "15mp")
 save.dta13(dd, "data/dta_119.dta", version = 119)
 save.dta13(dd, "data/dta_118.dta", version = 118)
 save.dta13(dd, "data/dta_117.dta", version = 117)
@@ -57,6 +58,7 @@ save.dta13(dd, "data/dta_104.dta", version = 104)
 save.dta13(dd, "data/dta_103.dta", version = 103)
 save.dta13(dd, "data/dta_102.dta", version = 102)
 
+dd15mp<- read.dta13("data/dta_15mp.dta")
 dd119 <- read.dta13("data/dta_119.dta")
 dd118 <- read.dta13("data/dta_118.dta")
 dd117 <- read.dta13("data/dta_117.dta")
@@ -78,6 +80,7 @@ dd102 <- read.dta13("data/dta_102.dta")
 unlink("data", recursive = TRUE)
 
 test_that("version", {
+  expect_true(datacompare(dd, dd15mp))
   expect_true(datacompare(dd, dd119))
   expect_true(datacompare(dd, dd118))
   expect_true(datacompare(dd, dd117))

From 7ec4d24c81e9df98c7e49c7c78c666234ea3c8ec Mon Sep 17 00:00:00 2001
From: Sebastian Jeworutzki <sebastian.jeworutzki@ruhr-uni-bochum.de>
Date: Fri, 25 May 2018 15:28:55 +0200
Subject: [PATCH 76/76] update docs

---
 NEWS              | 1 +
 README.md         | 2 +-
 man/read.dta13.Rd | 2 +-
 man/save.dta13.Rd | 4 +++-
 man/set.label.Rd  | 2 +-
 man/set.lang.Rd   | 3 ++-
 6 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/NEWS b/NEWS
index b8d1ead1..1173a2b7 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,7 @@
 - export of binary data from dta-files
 - new function get.label.tables() to show all Stata label sets
 - fix check for duplicate labels
+- fixes in set.lang
 
 [0.9.0]
 - generate unique factor labels to prevent errors in factor definition
diff --git a/README.md b/README.md
index 5fbfd216..de4d8909 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 | 0.9.1   | Improvements to partial reading. Idea by Kevin Jin                            |
 | 0.9.1   | Export of binary data from dta-files                                          |
 | 0.9.1   | new function get.label.tables() to show all Stata label sets                  |
-| 0.9.1   | fix check for duplicate labels                                                |
+| 0.9.1   | fix check for duplicate labels; fixes in set.lang()                           |
 |         |
 | 0.9.0   | Generate unique factor labels to prevent errors in factor definition          |
 | 0.9.0   | check interrupt for long read. Patch by Giovanni Righi                        |
diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
index 98534404..f4b20f9e 100644
--- a/man/read.dta13.Rd
+++ b/man/read.dta13.Rd
@@ -23,7 +23,7 @@ are found, unique labels will be generated according the following scheme:
 
 \item{encoding}{\emph{character.} Strings can be converted from Windows-1252
 or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
-target encoding explicitly. Stata 14 files are UTF-8 encoded and may contain
+target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain
 strings which can't be displayed in the current locale.
 Set encoding=NULL to stop reencoding.}
 
diff --git a/man/save.dta13.Rd b/man/save.dta13.Rd
index 3b698a70..635a9f1e 100644
--- a/man/save.dta13.Rd
+++ b/man/save.dta13.Rd
@@ -37,7 +37,9 @@ will be added to the dta-file.}
 use all of Statas numeric-vartypes.}
 
 \item{version}{\emph{numeric.} Stata format for the resulting dta-file either
-the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 15.}
+Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). 
+Experimental support for large datasets: Use version="15mp" to save the dataset
+in the new Stata 15/MP file format. This feature is not thoroughly tested yet.}
 
 \item{convert.underscore}{\emph{logical.} If \code{TRUE}, all non numerics or
 non alphabet characters will be converted to underscores.}
diff --git a/man/set.label.Rd b/man/set.label.Rd
index 84b7493a..f57b1afb 100644
--- a/man/set.label.Rd
+++ b/man/set.label.Rd
@@ -20,7 +20,7 @@ Returns a labeled factor
 \description{
 Assign value labels from a Stata label set to a variable. If duplicated
  labels are found, unique labels will be generated according the following
- scheme: "label_(integer code)".
+ scheme: "label_(integer code)". Levels without labels will become <NA>.
 }
 \examples{
 dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"),
diff --git a/man/set.lang.Rd b/man/set.lang.Rd
index f4e055cb..1f615718 100644
--- a/man/set.lang.Rd
+++ b/man/set.lang.Rd
@@ -19,7 +19,8 @@ are generated.}
 Returns a data.frame with value labels in language "lang".
 }
 \description{
-Changes default label language for a dataset.
+Changes default label language for a dataset. 
+Variables with generated labels (option generate.factors=TRUE) are kept unchanged.
 }
 \examples{
 dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"))