Merge pull request #54 from sjewo/testing

Merge for 0.9.1
sjewo · May 25, 2018 · 4c122aa · 4c122aa
2 parents 36e5803 + 7ec4d24
commit 4c122aa
Show file tree

Hide file tree

Showing 36 changed files with 1,263 additions and 674 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,16 +1,17 @@
 Package: readstata13
 Type: Package
 Title: Import 'Stata' Data Files
-Version: 0.9.0
+Version: 0.9.1
 Authors@R: c(
     person("Jan Marvin", "Garbuszus",
     email = "[email protected]", role = c("aut")),
     person("Sebastian", "Jeworutzki",
-    email="[email protected]", role = c("aut", "cre")),
+    email="[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2671-5253")),
     person("R Core Team", role="cph"),
     person("Magnus Thor", "Torfason", role="ctb"),
     person("Luke M.", "Olson", role="ctb"),
-    person("Giovanni", "Righi", role="ctb")
+    person("Giovanni", "Righi", role="ctb"),
+    person("Kevin", "Jin", role="ctb")
     )
 Description: Function to read and write the 'Stata' file format.
 URL: https://github.com/sjewo/readstata13

diff --git a/NAMESPACE b/NAMESPACE
@@ -4,6 +4,7 @@ export("varlabel<-")
 export(as.caldays)
 export(get.label)
 export(get.label.name)
+export(get.label.tables)
 export(get.lang)
 export(get.origin.codes)
 export(read.dta13)
@@ -15,6 +16,7 @@ export(varlabel)
 import(Rcpp)
 importFrom(stats,complete.cases)
 importFrom(stats,na.omit)
+importFrom(stats,setNames)
 importFrom(utils,download.file)
 importFrom(utils,localeToCharset)
 importFrom(utils,setTxtProgressBar)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,12 @@
+[0.9.1]
+- allow reading only pre-selected variables
+- experimental support for format 119
+- improve partial reading
+- export of binary data from dta-files
+- new function get.label.tables() to show all Stata label sets
+- fix check for duplicate labels
+- fixes in set.lang
+
 [0.9.0]
 - generate unique factor labels to prevent errors in factor definition
 - check interrupt for long read

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,15 +1,15 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-stata_pre13_save <- function(filePath, dat) {
-    .Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat)
+stata_read <- function(filePath, missing, selectrows, selectcols, strlexport, strlpath) {
+    .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols, strlexport, strlpath)
 }
 
-stata_read <- function(filePath, missing, selectrows) {
-    .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows)
+stata_save <- function(filePath, dat) {
+    .Call(`_readstata13_stata_save`, filePath, dat)
 }
 
-stata_save <- function(filePath, dat) {
-    .Call('readstata13_stata_save', PACKAGE = 'readstata13', filePath, dat)
+stata_pre13_save <- function(filePath, dat) {
+    .Call(`_readstata13_stata_pre13_save`, filePath, dat)
 }
 
diff --git a/R/dbcal.R b/R/dbcal.R
@@ -126,8 +126,12 @@ stbcal <- function(stbcalfile) {
   # In case centerdate is not rangestart:
   stbcal$buisdays <- NA
   stbcal$buisdays[stbcal$range==centerdate] <- 0
-  stbcal$buisdays[stbcal$range<centerdate] <- seq(from=-length(stbcal$range[stbcal$range<centerdate]), to=-1)
-  stbcal$buisdays[stbcal$range>centerdate] <- seq(from=1, to=length(stbcal$range[stbcal$range>centerdate]))
+  stbcal$buisdays[stbcal$range<centerdate] <- seq(
+    from=-length(stbcal$range[stbcal$range<centerdate]),
+    to=-1)
+  stbcal$buisdays[stbcal$range>centerdate] <- seq(
+    from=1,
+    to=length(stbcal$range[stbcal$range>centerdate]))
 
   # Add purpose
   if (any(grepl("purpose", x))) {

diff --git a/R/read.R b/R/read.R
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
 # Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley
 #
 # This program is free software; you can redistribute it and/or modify it
@@ -23,13 +23,14 @@
 #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata
 #'  value labels are created.
 #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is
-#'  TRUE, missing factor labels are created from integers. If duplicated labels are found, 
-#'  unique labels will be generated according the following scheme: "label_(integer code)".
-#' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8
-#'  to system encoding. Options are "latin1" or "UTF-8" to specify target
-#'  encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings
-#'   which can't be displayed in the current locale.
-#'   Set encoding=NULL to stop reencoding.
+#'  TRUE, missing factor labels are created from integers. If duplicated labels
+#'  are found, unique labels will be generated according the following scheme:
+#'  "label_(integer code)".
+#' @param encoding \emph{character.} Strings can be converted from Windows-1252
+#'  or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
+#'  target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain
+#'  strings which can't be displayed in the current locale.
+#'  Set encoding=NULL to stop reencoding.
 #' @param fromEncoding \emph{character.} We expect strings to be encoded as
 #'  "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14
 #'  or newer "UTF-8" is used. In some situation the used encoding can differ for
@@ -51,12 +52,17 @@
 #' @param select.rows \emph{integer.} Vector of one or two numbers. If single
 #'  value rows from 1:val are selected. If two values of a range are selected
 #'  the rows in range will be selected.
+#' @param select.cols \emph{character:} Vector of variables to select.
+#' @param strlexport \emph{logical:} Should strl content be exported as binary
+#'  files?
+#' @param strlpath \emph{cahracter:} Path for strl export.
 #'
 #' @details If the filename is a url, the file will be downloaded as a temporary
 #'  file and read afterwards.
 #'
-#' Stata files are encoded in ansinew. Depending on your system's default encoding
-#'  certain characters may appear wrong. Using a correct encoding may fix these.
+#' Stata files are encoded in ansinew. Depending on your system's default
+#'  encoding certain characters may appear wrong. Using a correct encoding may
+#'  fix these.
 #'
 #' Variable names stored in the dta-file will be used in the resulting
 #'  data.frame. Stata types char, byte, and int will become integer; float and
@@ -70,15 +76,15 @@
 #'
 #' Stata 13 introduced a new character type called strL. strLs are able to store
 #'  strings up to 2 billion characters.  While R is able to store
-#'  strings of this size in a character vector, the printed representation of such
-#'  vectors looks rather cluttered, so it's possible to save only a reference in the
-#'  data.frame with option \code{replace.strl=FALSE}.
+#'  strings of this size in a character vector, the printed representation of
+#'  such vectors looks rather cluttered, so it's possible to save only a
+#'  reference in the data.frame with option \code{replace.strl=FALSE}.
 #'
 #' In R, you may use rownames to store characters (see for instance
 #'  \code{data(swiss)}). In Stata, this is not possible and rownames have to be
 #'  stored as a variable. If you want to use rownames, set add.rownames to TRUE.
-#'  Then the first variable of the dta-file will hold the rownames of the resulting
-#'  data.frame.
+#'  Then the first variable of the dta-file will hold the rownames of the
+#'  resulting data.frame.
 #'
 #' Reading dta-files of older and newer versions than 13 was introduced
 #'  with version 0.8.
@@ -95,17 +101,21 @@
 #'   \item{var.labels:}{Variable labels}
 #'   \item{version:}{dta file format version}
 #'   \item{label.table:}{List of value labels.}
-#'   \item{strl:}{Character vector with long strings for the new strl string variable
-#'    type. The name of every element is the identifier.}
+#'   \item{strl:}{Character vector with long strings for the new strl string
+#'    variable type. The name of every element is the identifier.}
 #'   \item{expansion.fields:}{list providing variable name, characteristic name
 #'    and the contents of Stata characteristic field.}
 #'   \item{missing:}{List of numeric vectors with Stata missing type for each
 #'    variable.}
+#'   \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.}
+#'   \item{orig.dim:}{Dimension recorded inside the dta-file.}
 #' }
 #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members
 #'  from foreign::read.dta().
-#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
-#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
+#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and
+#'  \code{memisc} for dta files from Stata
+#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version
+#'  >= 13.
 #' @references Stata Corp (2014): Description of .dta file format
 #'  \url{http://www.stata.com/help.cgi?dta}
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -119,7 +129,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                        convert.underscore = FALSE, missing.type = FALSE,
                        convert.dates = TRUE, replace.strl = TRUE,
                        add.rownames = FALSE, nonint.factors=FALSE,
-                       select.rows = NULL) {
+                       select.rows = NULL, select.cols = NULL,
+                       strlexport = FALSE, strlpath = ".") {
+
   # Check if path is a url
   if (length(grep("^(http|ftp|https)://", file))) {
     tmp <- tempfile()
@@ -145,11 +157,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       if (any(select.rows < 0) )
         select.rows <- abs(select.rows)
 
-      # check that lenght is not > 2
+      # check that length is not > 2
       if (length(select.rows) > 2)
         return(message("select.rows must be of length 1 or 2."))
 
-      # if lenght 1 start at row 1
+      # if length 1 start at row 1
       if (length(select.rows) == 1)
         select.rows <- c(1, select.rows)
     }
@@ -165,7 +177,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     select.rows <- c(0,0)
   }
 
-  data <- stata_read(filepath, missing.type, select.rows)
+  if (is.null(select.cols)){
+    select.cols <- ""
+  }
+
+  data <- stata_read(filepath, missing.type, select.rows, select.cols,
+                     strlexport, strlpath)
 
   version <- attr(data, "version")
 
@@ -301,7 +318,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
     strl <- c("")
     names(strl) <- "00000000000000000000"
     strl <- c(strl, attr(data,"strl"))
-    for (j in seq(ncol(data))[types == 32768] ) {
+    for (j in seq(ncol(data))[types == sstrl] ) {
       data[, j] <- strl[data[,j]]
     }
     # if strls are in data.frame remove attribute strl
@@ -349,36 +366,48 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       if (labname %in% names(label)) {
         if((vartype == sdouble | vartype == sfloat)) {
           if(!nonint.factors) {
-            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double or float detected - no labels assigned.\n  Set option nonint.factors to TRUE to assign labels anyway.\n"))
+            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double ",
+                           "or float detected - no labels assigned.\n  Set ",
+                           "option nonint.factors to TRUE to assign labels ",
+                           "anyway.\n"))
             next
           }
         }
         # get unique values / omit NA
         varunique <- na.omit(unique(data[, i]))
+
+        #check for duplicated labels
+        labcount <- table(names(labtable))
+        if(any(labcount > 1)) {
+          warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels ",
+                         "detected - generating unique labels.\n"))
+          labdups <- names(labtable) %in% names(labcount[labcount > 1])
+          # generate unique labels from assigned label and code number
+          names(labtable)[labdups] <- paste0(names(labtable)[labdups],
+                                             "_(", labtable[labdups], ")")
+        }
+
         # assign label if label set is complete
         if (all(varunique %in% labtable)) {
-
-          #check for duplicated labels
-          labcount <- table(names(labtable))
-          if(any(labcount > 1)) {
-            warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels detected - generating unique labels.\n"))
-            labdups <- names(labtable) %in% names(labcount[labcount > 1])
-            # generate unique labels from assigned label and code number
-            names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
-          }
-
           data[, i] <- factor(data[, i], levels=labtable,
                               labels=names(labtable))
           # else generate labels from codes
         } else if (generate.factors) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
-          
+
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
 
+          # add generated labels to label.table
+          gen.lab.name <- paste0("gen_",vnames[i])
+          attr(data, "label.table")[[gen.lab.name]] <- gen.lab 
+          attr(data, "val.labels")[i] <- gen.lab.name
+
         } else {
-          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
+          warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no ",
+                         "labels assigned.\n  Set option generate.factors=T to ",
+                         "generate labels."))
         }
       }
     }

diff --git a/R/readstata13.R b/R/readstata13.R
@@ -11,6 +11,6 @@
 #' @useDynLib readstata13, .registration = TRUE
 #' @import Rcpp
 #' @note If you catch a bug, please do not sue us, we do not have any money.
-#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata
-#' Versions < 13
+#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from
+#'  Stata Versions < 13
 NULL
diff --git a/R/save.R b/R/save.R
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
+# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by the
@@ -39,7 +39,9 @@
 #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will
 #'  use all of Statas numeric-vartypes.
 #' @param version \emph{numeric.} Stata format for the resulting dta-file either
-#'  the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.
+#'  Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). 
+#'  Experimental support for large datasets: Use version="15mp" to save the dataset
+#'  in the new Stata 15/MP file format. This feature is not thoroughly tested yet.
 #' @return The function writes a dta-file to disk. The following features of the
 #'  dta file format are supported:
 #' \describe{
@@ -54,8 +56,9 @@
 #'    type. The first element is the identifier and the second element the
 #'    string.}
 #' }
-#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
-#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
+#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and
+#'  \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in
+#'  package \code{haven} for Stata version >= 13.
 #' @references Stata Corp (2014): Description of .dta file format
 #'  \url{http://www.stata.com/help.cgi?dta}
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
@@ -72,9 +75,13 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   if (!is.data.frame(data))
     stop("The object \"data\" must have class data.frame")
   if (!dir.exists13(dirname(file)))
-    stop("Path is invalid. Possibly a non existend directory.")
+    stop("Path is invalid. Possibly a non-existing directory.")
 
   # Allow writing version as Stata version not Stata format
+  if (version=="15mp")
+    version <- 119
+  if (version==15L)
+    version <- 118
   if (version==14L)
     version <- 118
   if (version==13L)
@@ -90,8 +97,11 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   if (version==6)
     version <- 108
 
-  if (version<102 | version == 109 | version == 116 | version>118)
-    stop("Version missmatch abort execution. No Data was saved.")
+  if (version == 119)
+    message("Support for Stata 15/MP (119) format is experimental and not thoroughly tested.")
+
+  if (version<102 | version == 109 | version == 116 | version>119)
+    stop("Version mismatch abort execution. No Data was saved.")
 
   sstr     <- 2045
   sstrl    <- 32768
@@ -168,7 +178,8 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
       hasfactors <- sapply(data, is.factor)
 
       if (any(hasfactors))
-        warning("dta-format < 106 does not handle factors. Labels are not saved!")
+        warning(paste("dta-format < 106 can not handle factors.",
+                      "Labels are not saved!"))
     }
     # If our data.frame contains factors, we create a label.table
     factors <- which(sapply(data, is.factor))
@@ -278,7 +289,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
     vartypen[empty] <- sbyte
   }
 
-  # recode character variables. 118 wants utf-8, so encoding may be required
+  # recode character variables. >118 wants utf-8, so encoding may be required
   if(doRecode) {
     #TODO: use seq_len ?
     for(v in (1:ncol(data))[vartypen == "character"]) {