Merge branch 'testing'

sjewo · Nov 14, 2015 · 9da6cc0 · 9da6cc0
2 parents 49572ac + 856250d
commit 9da6cc0
Show file tree

Hide file tree

Showing 11 changed files with 147 additions and 100 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,14 @@
 Package: readstata13
 Type: Package
 Title: Import Stata Data Files
-Version: 0.8
+Version: 0.8.1
 Authors@R: c(
     person("Jan Marvin", "Garbuszus",
     email = "[email protected]", role = c("aut")),
     person("Sebastian", "Jeworutzki",
     email="[email protected]", role = c("aut", "cre")),
-    person("R Core Team", role="cph")
+    person("R Core Team", role="cph"),
+    person("Magnus Thor", "Torfason", role="ctb")
     )
 Description: Function to read and write the Stata file format.
 URL: https://github.com/sjewo/readstata13

diff --git a/R/read.R b/R/read.R
@@ -43,11 +43,13 @@
 #'  converted.
 #' @param add.rownames \emph{logical.} If \code{TRUE}, the first column will be
 #'  used as rownames. Variable will be dropped afterwards.
+#' @param nonint.factors \emph{logical.} If \code{TRUE}, factor labels
+#'  will be assigned to variables of type float and double.
 #'
 #' @details If the filename is a url, the file will be downloaded as a temporary
 #'  file and read afterwards.
 #'
-#' Stata files are encoded in ansinew. Depending on your system default encoding
+#' Stata files are encoded in ansinew. Depending on your system's default encoding
 #'  certain characters may appear wrong. Using a correct encoding may fix these.
 #'
 #' Variable names stored in the dta-file will be used in the resulting
@@ -61,20 +63,19 @@
 #' dates.
 #'
 #' Stata 13 introduced a new character type called strL. strLs are able to store
-#'  strings of any size up to 2 billion characters.  While R is able to store
-#'  strings of this size in a character, certain data.frames may appear messed,
-#'  if long strings are inserted default is \code{FALSE}.
+#'  strings up to 2 billion characters.  While R is able to store
+#'  strings of this size in a character vector, the printed representation of such 
+#'  vectors looks rather cluttered, so by default only a reference is saved in the 
+#'  data.frame (\code{replace.strl=FALSE}). 
 #'
 #' In R, you may use rownames to store characters (see for instance
 #'  \code{data(swiss)}). In Stata, this is not possible and rownames have to be
-#'  stored as a variable.  If this is the case for your file and you want to use
-#'  rownames, \code{add.rownames=TRUE} will convert the first variable of the
-#'  dta-file into rownames of the resulting data.frame.
+#'  stored as a variable. If you want to use rownames, set add.rownames to TRUE. 
+#'  Then the first variable of the dta-file will hold the rownames of the resulting 
+#'  data.frame.
 #'
-#' Beginning with Stata 13 (format 117), a new dta-format was introduced, which
-#'  was not handled by foreign at the time. It was implemented in this package
-#'  therefore the package got its name. Reading dta-files from earlier Stata
-#'  versions was not implemented until version 0.8.
+#' Reading dta-files of older and newer versions than 13 was introduced 
+#'  with version 0.8.
 #' @return The function returns a data.frame with attributes. The attributes
 #'  include
 #' \describe{
@@ -88,18 +89,17 @@
 #'   \item{var.labels:}{Variable labels}
 #'   \item{version:}{dta file format version}
 #'   \item{label.table:}{List of value labels.}
-#'   \item{strl:}{List of character vectors for the new strl string variable
-#'    type. The first element is the identifier and
-#'    the second element the string.}
+#'   \item{strl:}{Character vector with long strings for the new strl string variable
+#'    type. The name of every element is the identifier.}
 #'   \item{expansion.fields:}{list providing variable name, characteristic name
 #'    and the contents of Stata characteristic field.}
 #'   \item{missing:}{List of numeric vectors with Stata missing type for each
 #'    variable.}
 #' }
 #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members
 #'  from foreign::read.dta().
-#' @seealso \code{\link{read.dta}} and \code{memisc} for dta files from Stata
-#' versions < 13.
+#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
+#' versions < 13 and \code{\link[haven]{read_dta}} in package \code{haven} for Stata version >= 13.
 #' @references Stata Corp (2014): Description of .dta file format
 #'  \url{http://www.stata.com/help.cgi?dta}
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -112,7 +112,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                        encoding = NULL, fromEncoding=NULL,
                        convert.underscore = FALSE, missing.type = FALSE,
                        convert.dates = TRUE, replace.strl = FALSE,
-                       add.rownames = FALSE) {
+                       add.rownames = FALSE, nonint.factors=FALSE) {
   # Check if path is a url
   if (length(grep("^(http|ftp|https)://", file))) {
     tmp <- tempfile()
@@ -260,21 +260,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
 
   if (replace.strl) {
     if (version >= 117L) {
-      strl <- do.call(rbind, attr(data,"strl"))
+      strl <- c("")
+      names(strl) <- "00000000000000000000"
+      strl <- c(strl, attr(data,"strl"))
       for (j in seq(ncol(data))[types == 32768] ) {
-        refs <- unique(data[, j])
-        for (ref in refs) {
-          if (length(strl[strl[,1] == ref,2]) != 0){
-            data[data[, j] == ref, j] <- strl[strl[, 1] == ref, 2]
-          }
-        }
-      }
-
-      # recode strL 0 to void
-      for (v in (1:ncol(data))[types == sstrl]) {
-        data[[v]] <- gsub("00000000000000000000","", data[[v]] )
+        data[, j] <- strl[data[,j]]
       }
-
       # if strls are in data.frame remove attribute strl
       attr(data, "strl") <- NULL
     } else {
@@ -321,8 +312,13 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       vartype <- types[i]
       labtable <- label[[labname]]
       #don't convert columns of type double or float to factor
-      if (labname %in% names(label) & !(vartype == sdouble | vartype == sfloat))
-      {
+      if (labname %in% names(label)) {
+        if((vartype == sdouble | vartype == sfloat)) {
+          if(!nonint.factors) {
+            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double or float detected - no labels assigned.\n  Set option nonint.factors to TRUE to assign labels anyway."))
+            next
+          }
+        }
         # get unique values / omit NA
         varunique <- na.omit(unique(data[, i]))
         # assign label if label set is complete
@@ -337,13 +333,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
         } else {
-          warning(paste(vnames[i], "Missing factor labels - no labels assigned.
-                        Set option generate.factors=T to generate labels."))
+          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
         }
       }
     }
   }
-
+  
   if (add.rownames) {
     rownames(data) <- data[[1]]
     data[[1]] <- NULL

diff --git a/R/save.R b/R/save.R
@@ -54,8 +54,8 @@
 #'    type. The first element is the identifier and the second element the
 #'    string.}
 #' }
-#' @seealso \code{\link[foreign]{write.dta}} and \code{memisc} for dta files
-#' from Stata versions < 13.
+#' @seealso \code{\link[foreign]{write.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
+#' versions < 13 and \code{\link[haven]{write_dta}} in package \code{haven} for Stata version >= 13.
 #' @references Stata Corp (2014): Description of .dta file format
 #'  \url{http://www.stata.com/help.cgi?dta}
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -71,7 +71,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
 
   if (!is.data.frame(data))
     stop("The object \"data\" must have class data.frame")
-  if (!dir.exists(dirname(file)))
+  if (!dir.exists13(dirname(file)))
     stop("Path is invalid. Possibly a non existend directory.")
 
   # Allow writing version as Stata version not Stata format

diff --git a/R/tools.R b/R/tools.R
@@ -33,6 +33,12 @@ save.encoding <- function(x, encoding) {
         sub="byte")
 }
 
+# Check whether each path in x is an existing directory. x is tested as given
+# (the caller already passes dirname(file)); a regular file yields FALSE.
+dir.exists13 <- function(x) {
+  is.dir <- file.info(x)$isdir  # NA when the path does not exist
+  !is.na(is.dir) & is.dir
+}
 
 # Construct File Path
 #

diff --git a/README.md b/README.md
@@ -35,12 +35,12 @@ users need to install
 
 ```R
 # install.packages("devtools")
-devtools::install_github("sjewo/readstata13", ref="0.8")
+devtools::install_github("sjewo/readstata13", ref="0.8.1")
 ```
 
 Older Versions of devtools require a username option:
 ```R
-install_github("readstata13", username="sjewo", ref="0.8")
+install_github("readstata13", username="sjewo", ref="0.8.1")
 ```
 
 To install the current development version from github:
@@ -53,13 +53,18 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 ## Current Status
 
 [![Build Status](https://travis-ci.org/sjewo/readstata13.svg?branch=master)](https://travis-ci.org/sjewo/readstata13)
+[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/web/packages/readstata13/index.html)
 
 ### Working features
 
+* [new in 0.8.1] convert non-integer variables to factors (```nonint.factors=T```) 
+* [new in 0.8.1] handle large datasets
+* [new in 0.8.1] working with strL variables is now a lot faster
+
 * reading data files from disk or url and create a data.frame
 * saving dta files to disk - most features of the dta file format are supported
 * assign variable names
-* read the new strL strings and save them as attribute
+* read the new strL strings and save them as attribute 
 * convert stata label to factors and save them as attribute
 * read some meta data (timestamp, dataset label, formats,...)
 * convert strings to system encoding

diff --git a/inst/include/readstata.h b/inst/include/readstata.h
@@ -21,7 +21,23 @@
 #include <Rcpp.h>
 #include <fstream>
 #include <string>
+
+#define GCC_VERSION (__GNUC__ * 10000 \
++ __GNUC_MINOR__ * 100                \
++ __GNUC_PATCHLEVEL__)
+
+/* Test for GCC < 4.9.0 */
+#if GCC_VERSION < 40900 & !__clang__
+    typedef signed char int8_t;
+    typedef unsigned char uint8_t;
+    typedef signed short int16_t;
+    typedef unsigned short uint16_t;
+    typedef signed int int32_t;
+    typedef unsigned int uint32_t;
+#else
 #include <stdint.h>
+#endif
+
 
 #include "read_dta.h"
 #include "read_pre13_dta.h"
@@ -76,6 +92,8 @@ inline void test(std::string testme, FILE * file)
   readstring(test,file, test.size());
   if (testme.compare(test)!=0)
   {
+    fclose(file);
+    Rcpp::warning("\n testme:%s \n test: %s\n", testme.c_str(), test.c_str());
     Rcpp::stop("When attempting to read %s: Something went wrong!", testme.c_str());
   }
 }

diff --git a/inst/include/swap_endian.h b/inst/include/swap_endian.h
@@ -1,7 +1,7 @@
 #ifndef SWAP_ENDIAN
 #define SWAP_ENDIAN
 
-#include <stdint.h>
+/*#include <stdint.h>*/
 #include <typeinfo>
 
 #define GCC_VERSION (__GNUC__ * 10000 \

diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd
@@ -7,7 +7,7 @@
 read.dta13(file, convert.factors = TRUE, generate.factors = FALSE,
   encoding = NULL, fromEncoding = NULL, convert.underscore = FALSE,
   missing.type = FALSE, convert.dates = TRUE, replace.strl = FALSE,
-  add.rownames = FALSE)
+  add.rownames = FALSE, nonint.factors = FALSE)
 }
 \arguments{
 \item{file}{\emph{character.} Path to the dta file you want to import.}
@@ -42,7 +42,10 @@ a strL string in the data.frame with the actual value. The strl attribute
 will be removed from the data.frame.}
 
 \item{add.rownames}{\emph{logical.} If \code{TRUE}, the first column will be
- used as rownames. Variable will be dropped afterwards.}
+used as rownames. Variable will be dropped afterwards.}
+
+\item{nonint.factors}{\emph{logical.} If \code{TRUE}, factor labels
+ will be assigned to variables of type float and double.}
 }
 \value{
 The function returns a data.frame with attributes. The attributes
@@ -58,9 +61,8 @@ The function returns a data.frame with attributes. The attributes
   \item{var.labels:}{Variable labels}
   \item{version:}{dta file format version}
   \item{label.table:}{List of value labels.}
-  \item{strl:}{List of character vectors for the new strl string variable
-   type. The first element is the identifier and
-   the second element the string.}
+  \item{strl:}{Character vector with long strings for the new strl string variable
+   type. The name of every element is the identifier.}
   \item{expansion.fields:}{list providing variable name, characteristic name
    and the contents of Stata characteristic field.}
   \item{missing:}{List of numeric vectors with Stata missing type for each
@@ -75,7 +77,7 @@ The function returns a data.frame with attributes. The attributes
 If the filename is a url, the file will be downloaded as a temporary
  file and read afterwards.
 
-Stata files are encoded in ansinew. Depending on your system default encoding
+Stata files are encoded in ansinew. Depending on your system's default encoding
  certain characters may appear wrong. Using a correct encoding may fix these.
 
 Variable names stored in the dta-file will be used in the resulting
@@ -89,20 +91,19 @@ Stata dates are converted to R's Date class the same way foreign handles
 dates.
 
 Stata 13 introduced a new character type called strL. strLs are able to store
- strings of any size up to 2 billion characters.  While R is able to store
- strings of this size in a character, certain data.frames may appear messed,
- if long strings are inserted default is \code{FALSE}.
+ strings up to 2 billion characters.  While R is able to store
+ strings of this size in a character vector, the printed representation of such
+ vectors looks rather cluttered, so by default only a reference is saved in the
+ data.frame (\code{replace.strl=FALSE}).
 
 In R, you may use rownames to store characters (see for instance
  \code{data(swiss)}). In Stata, this is not possible and rownames have to be
- stored as a variable.  If this is the case for your file and you want to use
- rownames, \code{add.rownames=TRUE} will convert the first variable of the
- dta-file into rownames of the resulting data.frame.
-
-Beginning with Stata 13 (format 117), a new dta-format was introduced, which
- was not handled by foreign at the time. It was implemented in this package
- therefore the package got its name. Reading dta-files from earlier Stata
- versions was not implemented until version 0.8.
+ stored as a variable. If you want to use rownames, set add.rownames to TRUE.
+ Then the first variable of the dta-file will hold the rownames of the resulting
+ data.frame.
+
+Reading dta-files of older and newer versions than 13 was introduced
+ with version 0.8.
 }
 \note{
 read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members
@@ -118,7 +119,7 @@ Stata Corp (2014): Description of .dta file format
  \url{http://www.stata.com/help.cgi?dta}
 }
 \seealso{
-\code{\link{read.dta}} and \code{memisc} for dta files from Stata
-versions < 13.
+\code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
+versions < 13 and \code{\link[haven]{read_dta}} in package \code{haven} for Stata version >= 13.
 }
 
diff --git a/man/save.dta13.Rd b/man/save.dta13.Rd
@@ -72,7 +72,7 @@ Stata Corp (2014): Description of .dta file format
  \url{http://www.stata.com/help.cgi?dta}
 }
 \seealso{
-\code{\link[foreign]{write.dta}} and \code{memisc} for dta files
-from Stata versions < 13.
+\code{\link[foreign]{write.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
+versions < 13 and \code{\link[haven]{write_dta}} in package \code{haven} for Stata version >= 13.
 }
 
diff --git a/src/rcpp_pre13_savestata.cpp b/src/rcpp_pre13_savestata.cpp
@@ -121,7 +121,7 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
 
     /* write a datalabel */
     if (datalabel.size() > ndlabel)
-      Rcpp::warning("Datalabel to long. Resizing. Max size is %d.",
+      Rcpp::warning("Datalabel too long. Resizing. Max size is %d.",
                     ndlabel - 1);
 
     dta.write(datalabel.c_str(), ndlabel);
@@ -131,7 +131,7 @@ int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat)
     {
       if (timestamp.size() > 18)
       {
-        Rcpp::warning("Timestamp to long. Dropping.");
+        Rcpp::warning("Timestamp too long. Dropping.");
         timestamp = "";
       }
       dta.write(timestamp.c_str(),timestamp.size());