diff --git a/DESCRIPTION b/DESCRIPTION index fea0e9bf..6a873e43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,17 @@ Package: readstata13 Type: Package Title: Import 'Stata' Data Files -Version: 0.9.0 +Version: 0.9.1 Authors@R: c( person("Jan Marvin", "Garbuszus", email = "jan.garbuszus@ruhr-uni-bochum.de", role = c("aut")), person("Sebastian", "Jeworutzki", - email="Sebastian.Jeworutzki@ruhr-uni-bochum.de", role = c("aut", "cre")), + email="Sebastian.Jeworutzki@ruhr-uni-bochum.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2671-5253")), person("R Core Team", role="cph"), person("Magnus Thor", "Torfason", role="ctb"), person("Luke M.", "Olson", role="ctb"), - person("Giovanni", "Righi", role="ctb") + person("Giovanni", "Righi", role="ctb"), + person("Kevin", "Jin", role="ctb") ) Description: Function to read and write the 'Stata' file format. URL: https://github.com/sjewo/readstata13 diff --git a/NAMESPACE b/NAMESPACE index d93352b7..8b070e9b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export("varlabel<-") export(as.caldays) export(get.label) export(get.label.name) +export(get.label.tables) export(get.lang) export(get.origin.codes) export(read.dta13) @@ -15,6 +16,7 @@ export(varlabel) import(Rcpp) importFrom(stats,complete.cases) importFrom(stats,na.omit) +importFrom(stats,setNames) importFrom(utils,download.file) importFrom(utils,localeToCharset) importFrom(utils,setTxtProgressBar) diff --git a/NEWS b/NEWS index f151b3b7..1173a2b7 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,12 @@ +[0.9.1] +- allow reading only pre-selected variables +- experimental support for format 119 +- improve partial reading +- export of binary data from dta-files +- new function get.label.tables() to show all Stata label sets +- fix check for duplicate labels +- fixes in set.lang + [0.9.0] - generate unique factor labels to prevent errors in factor definition - check interrupt for long read diff --git a/R/RcppExports.R b/R/RcppExports.R index 42ed113c..1ef45a93 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,15 +1,15 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -stata_pre13_save <- function(filePath, dat) { - .Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat) +stata_read <- function(filePath, missing, selectrows, selectcols, strlexport, strlpath) { + .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols, strlexport, strlpath) } -stata_read <- function(filePath, missing, selectrows) { - .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows) +stata_save <- function(filePath, dat) { + .Call(`_readstata13_stata_save`, filePath, dat) } -stata_save <- function(filePath, dat) { - .Call('readstata13_stata_save', PACKAGE = 'readstata13', filePath, dat) +stata_pre13_save <- function(filePath, dat) { + .Call(`_readstata13_stata_pre13_save`, filePath, dat) } diff --git a/R/dbcal.R b/R/dbcal.R index aac4e585..3381feaa 100644 --- a/R/dbcal.R +++ b/R/dbcal.R @@ -126,8 +126,12 @@ stbcal <- function(stbcalfile) { # In case centerdate is not rangestart: stbcal$buisdays <- NA stbcal$buisdays[stbcal$range==centerdate] <- 0 - stbcal$buisdays[stbcal$rangecenterdate] <- seq(from=1, to=length(stbcal$range[stbcal$range>centerdate])) + stbcal$buisdays[stbcal$rangecenterdate] <- seq( + from=1, + to=length(stbcal$range[stbcal$range>centerdate])) # Add purpose if (any(grepl("purpose", x))) { diff --git a/R/read.R b/R/read.R index 8df7e895..68cd6822 100644 --- a/R/read.R +++ b/R/read.R @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki +# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki # Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley # # This program is free software; you can redistribute it and/or modify it @@ -23,13 +23,14 @@ #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata #' value labels are created. #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is -#' TRUE, missing factor labels are created from integers. If duplicated labels are found, -#' unique labels will be generated according the following scheme: "label_(integer code)". -#' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8 -#' to system encoding. Options are "latin1" or "UTF-8" to specify target -#' encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings -#' which can't be displayed in the current locale. -#' Set encoding=NULL to stop reencoding. +#' TRUE, missing factor labels are created from integers. If duplicated labels +#' are found, unique labels will be generated according the following scheme: +#' "label_(integer code)". +#' @param encoding \emph{character.} Strings can be converted from Windows-1252 +#' or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify +#' target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain +#' strings which can't be displayed in the current locale. +#' Set encoding=NULL to stop reencoding. #' @param fromEncoding \emph{character.} We expect strings to be encoded as #' "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14 #' or newer "UTF-8" is used. In some situation the used encoding can differ for @@ -51,12 +52,17 @@ #' @param select.rows \emph{integer.} Vector of one or two numbers. If single #' value rows from 1:val are selected. If two values of a range are selected #' the rows in range will be selected. +#' @param select.cols \emph{character:} Vector of variables to select. +#' @param strlexport \emph{logical:} Should strl content be exported as binary +#' files? +#' @param strlpath \emph{cahracter:} Path for strl export. #' #' @details If the filename is a url, the file will be downloaded as a temporary #' file and read afterwards. #' -#' Stata files are encoded in ansinew. Depending on your system's default encoding -#' certain characters may appear wrong. Using a correct encoding may fix these. +#' Stata files are encoded in ansinew. Depending on your system's default +#' encoding certain characters may appear wrong. Using a correct encoding may +#' fix these. #' #' Variable names stored in the dta-file will be used in the resulting #' data.frame. Stata types char, byte, and int will become integer; float and @@ -70,15 +76,15 @@ #' #' Stata 13 introduced a new character type called strL. strLs are able to store #' strings up to 2 billion characters. While R is able to store -#' strings of this size in a character vector, the printed representation of such -#' vectors looks rather cluttered, so it's possible to save only a reference in the -#' data.frame with option \code{replace.strl=FALSE}. +#' strings of this size in a character vector, the printed representation of +#' such vectors looks rather cluttered, so it's possible to save only a +#' reference in the data.frame with option \code{replace.strl=FALSE}. #' #' In R, you may use rownames to store characters (see for instance #' \code{data(swiss)}). In Stata, this is not possible and rownames have to be #' stored as a variable. If you want to use rownames, set add.rownames to TRUE. -#' Then the first variable of the dta-file will hold the rownames of the resulting -#' data.frame. +#' Then the first variable of the dta-file will hold the rownames of the +#' resulting data.frame. #' #' Reading dta-files of older and newer versions than 13 was introduced #' with version 0.8. @@ -95,17 +101,21 @@ #' \item{var.labels:}{Variable labels} #' \item{version:}{dta file format version} #' \item{label.table:}{List of value labels.} -#' \item{strl:}{Character vector with long strings for the new strl string variable -#' type. The name of every element is the identifier.} +#' \item{strl:}{Character vector with long strings for the new strl string +#' variable type. The name of every element is the identifier.} #' \item{expansion.fields:}{list providing variable name, characteristic name #' and the contents of Stata characteristic field.} #' \item{missing:}{List of numeric vectors with Stata missing type for each #' variable.} +#' \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.} +#' \item{orig.dim:}{Dimension recorded inside the dta-file.} #' } #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members #' from foreign::read.dta(). -#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata -#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. +#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and +#' \code{memisc} for dta files from Stata +#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version +#' >= 13. #' @references Stata Corp (2014): Description of .dta file format #' \url{http://www.stata.com/help.cgi?dta} #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} @@ -119,7 +129,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors=FALSE, - select.rows = NULL) { + select.rows = NULL, select.cols = NULL, + strlexport = FALSE, strlpath = ".") { + # Check if path is a url if (length(grep("^(http|ftp|https)://", file))) { tmp <- tempfile() @@ -145,11 +157,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, if (any(select.rows < 0) ) select.rows <- abs(select.rows) - # check that lenght is not > 2 + # check that length is not > 2 if (length(select.rows) > 2) return(message("select.rows must be of length 1 or 2.")) - # if lenght 1 start at row 1 + # if length 1 start at row 1 if (length(select.rows) == 1) select.rows <- c(1, select.rows) } @@ -165,7 +177,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, select.rows <- c(0,0) } - data <- stata_read(filepath, missing.type, select.rows) + if (is.null(select.cols)){ + select.cols <- "" + } + + data <- stata_read(filepath, missing.type, select.rows, select.cols, + strlexport, strlpath) version <- attr(data, "version") @@ -301,7 +318,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, strl <- c("") names(strl) <- "00000000000000000000" strl <- c(strl, attr(data,"strl")) - for (j in seq(ncol(data))[types == 32768] ) { + for (j in seq(ncol(data))[types == sstrl] ) { data[, j] <- strl[data[,j]] } # if strls are in data.frame remove attribute strl @@ -349,36 +366,48 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, if (labname %in% names(label)) { if((vartype == sdouble | vartype == sfloat)) { if(!nonint.factors) { - warning(paste0("\n ",vnames[i], ":\n Factor codes of type double or float detected - no labels assigned.\n Set option nonint.factors to TRUE to assign labels anyway.\n")) + warning(paste0("\n ",vnames[i], ":\n Factor codes of type double ", + "or float detected - no labels assigned.\n Set ", + "option nonint.factors to TRUE to assign labels ", + "anyway.\n")) next } } # get unique values / omit NA varunique <- na.omit(unique(data[, i])) + + #check for duplicated labels + labcount <- table(names(labtable)) + if(any(labcount > 1)) { + warning(paste0("\n ",vnames[i], ":\n Duplicated factor levels ", + "detected - generating unique labels.\n")) + labdups <- names(labtable) %in% names(labcount[labcount > 1]) + # generate unique labels from assigned label and code number + names(labtable)[labdups] <- paste0(names(labtable)[labdups], + "_(", labtable[labdups], ")") + } + # assign label if label set is complete if (all(varunique %in% labtable)) { - - #check for duplicated labels - labcount <- table(names(labtable)) - if(any(labcount > 1)) { - warning(paste0("\n ",vnames[i], ":\n Duplicated factor levels detected - generating unique labels.\n")) - labdups <- names(labtable) %in% names(labcount[labcount > 1]) - # generate unique labels from assigned label and code number - names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") - } - data[, i] <- factor(data[, i], levels=labtable, labels=names(labtable)) # else generate labels from codes } else if (generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) - + data[, i] <- factor(data[, i], levels=gen.lab, labels=names(gen.lab)) + # add generated labels to label.table + gen.lab.name <- paste0("gen_",vnames[i]) + attr(data, "label.table")[[gen.lab.name]] <- gen.lab + attr(data, "val.labels")[i] <- gen.lab.name + } else { - warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no labels assigned.\n Set option generate.factors=T to generate labels.")) + warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no ", + "labels assigned.\n Set option generate.factors=T to ", + "generate labels.")) } } } diff --git a/R/readstata13.R b/R/readstata13.R index d444448a..f5ec6f8f 100644 --- a/R/readstata13.R +++ b/R/readstata13.R @@ -11,6 +11,6 @@ #' @useDynLib readstata13, .registration = TRUE #' @import Rcpp #' @note If you catch a bug, please do not sue us, we do not have any money. -#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata -#' Versions < 13 +#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from +#' Stata Versions < 13 NULL diff --git a/R/save.R b/R/save.R index 0d2f0d43..1fd39cee 100644 --- a/R/save.R +++ b/R/save.R @@ -1,5 +1,5 @@ # -# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki +# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -39,7 +39,9 @@ #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will #' use all of Statas numeric-vartypes. #' @param version \emph{numeric.} Stata format for the resulting dta-file either -#' the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14. +#' Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). +#' Experimental support for large datasets: Use version="15mp" to save the dataset +#' in the new Stata 15/MP file format. This feature is not thoroughly tested yet. #' @return The function writes a dta-file to disk. The following features of the #' dta file format are supported: #' \describe{ @@ -54,8 +56,9 @@ #' type. The first element is the identifier and the second element the #' string.} #' } -#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata -#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. +#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and +#' \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in +#' package \code{haven} for Stata version >= 13. #' @references Stata Corp (2014): Description of .dta file format #' \url{http://www.stata.com/help.cgi?dta} #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} @@ -72,9 +75,13 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, if (!is.data.frame(data)) stop("The object \"data\" must have class data.frame") if (!dir.exists13(dirname(file))) - stop("Path is invalid. Possibly a non existend directory.") + stop("Path is invalid. Possibly a non-existing directory.") # Allow writing version as Stata version not Stata format + if (version=="15mp") + version <- 119 + if (version==15L) + version <- 118 if (version==14L) version <- 118 if (version==13L) @@ -90,8 +97,11 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, if (version==6) version <- 108 - if (version<102 | version == 109 | version == 116 | version>118) - stop("Version missmatch abort execution. No Data was saved.") + if (version == 119) + message("Support for Stata 15/MP (119) format is experimental and not thoroughly tested.") + + if (version<102 | version == 109 | version == 116 | version>119) + stop("Version mismatch abort execution. No Data was saved.") sstr <- 2045 sstrl <- 32768 @@ -168,7 +178,8 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, hasfactors <- sapply(data, is.factor) if (any(hasfactors)) - warning("dta-format < 106 does not handle factors. Labels are not saved!") + warning(paste("dta-format < 106 can not handle factors.", + "Labels are not saved!")) } # If our data.frame contains factors, we create a label.table factors <- which(sapply(data, is.factor)) @@ -278,7 +289,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, vartypen[empty] <- sbyte } - # recode character variables. 118 wants utf-8, so encoding may be required + # recode character variables. >118 wants utf-8, so encoding may be required if(doRecode) { #TODO: use seq_len ? for(v in (1:ncol(data))[vartypen == "character"]) { diff --git a/R/tools.R b/R/tools.R index 28be9ae7..a29eddfb 100644 --- a/R/tools.R +++ b/R/tools.R @@ -45,13 +45,13 @@ dir.exists13 <-function(x) { # @param path path to dta file # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} -get.filepath <- function(path=""){ - if(substring(path, 1, 1) == "~") { +get.filepath <- function(path="") { + if (substring(path, 1, 1) == "~") { filepath <- path.expand(path) } else { filepath <- path } - if(!file.exists(filepath)) { + if (!file.exists(filepath)) { return("File does not exist.") } @@ -63,14 +63,16 @@ get.filepath <- function(path=""){ #' Displays informations about the defined label languages. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. -#' @param print \emph{logical.} If \code{TRUE}, print available languages and default language. +#' @param print \emph{logical.} If \code{TRUE}, print available languages and +#' default language. #' @return Returns a list with two components: #' \describe{ #' \item{languages:}{Vector of label languages used in the dataset} #' \item{default:}{Name of the actual default label language, otherwise NA} #' } -#' @details Stata allows to define multiple label sets in different languages. This functions reports the -#' available languages and the selected default language. +#' @details Stata allows to define multiple label sets in different languages. +#' This functions reports the available languages and the selected default +#' language. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export @@ -78,7 +80,7 @@ get.lang <- function(dat, print=T) { ex <- attr(dat, "expansion.fields") lang <- list() - if(length(grep("_lang_list", ex)) > 0) { + if (length(grep("_lang_list", ex)) > 0) { lang$languages <- strsplit(ex[[grep("_lang_list", ex)]][3], " ")[[1]] } else { lang$languages <- NA @@ -87,7 +89,7 @@ get.lang <- function(dat, print=T) { ex[[grep("_lang_c", ex)]][3], NA) - if(print) { + if (print) { cat("Available languages:\n ") cat(paste0(lang$languages, "\n")) cat("\nDefault language:\n") @@ -100,13 +102,17 @@ get.lang <- function(dat, print=T) { #' Get Names of Stata Label Set #' -#' Retrieves the Stata label set in the dataset for all or an vector of variable names. +#' Retrieves the Stata label set in the dataset for all or an vector of variable +#' names. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. -#' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get names of all label sets. -#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA +#' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get +#' names of all label sets. +#' @param lang \emph{character.} Label language. Default language defined by +#' \code{\link{get.lang}} is used if NA #' @return Returns an named vector of variable labels -#' @details Stata stores factor labels in variable independent labels sets. This function retrieves the name of the label set for a variable. +#' @details Stata stores factor labels in variable independent labels sets. This +#' function retrieves the name of the label set for a variable. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export @@ -117,6 +123,16 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) { names(labelsets) <- vnames } else if (is.character(lang)) { ex <- attr(dat, "expansion.fields") + + has_no_label_lang <- identical( + integer(0), + unlist(lapply(ex, grep, pattern ="_lang_l_")) + ) + + if (has_no_label_lang) { + return("") + } + varname <- sapply(ex[grep(paste0("_lang_l_", lang), ex)], function(x) x[1]) labelsets.tmp <- sapply(ex[grep(paste0("_lang_l_", lang), ex)], @@ -128,7 +144,7 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) { labelsets[varname] <- labelsets.tmp[varname] } - if(is.null(var.name)) { + if (is.null(var.name)) { return(labelsets) } else { return(labelsets[var.name]) @@ -140,9 +156,12 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) { #' Recreates the code numbers of a factor as stored in the Stata dataset. #' #' @param x \emph{factor.} Factor to obtain code for -#' @param label.table \emph{table.} Table with factor levels obtained by \code{\link{get.label}}. +#' @param label.table \emph{table.} Table with factor levels obtained by +#' \code{\link{get.label}}. #' @return Returns an integer with original codes -#' @details While converting numeric variables into factors, the original code numbers are lost. This function reconstructs the codes from the attribute \code{label.table}. +#' @details While converting numeric variables into factors, the original code +#' numbers are lost. This function reconstructs the codes from the attribute +#' \code{label.table}. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples @@ -155,7 +174,7 @@ get.label.name <- function(dat, var.name=NULL, lang=NA) { #' as.integer(dat$type) #' @export get.origin.codes <- function(x, label.table) { - if(is.factor(x)) { + if (is.factor(x)) { fac <- as.character(x) return(as.integer(label.table[fac])) } else { @@ -170,8 +189,9 @@ get.origin.codes <- function(x, label.table) { #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param label.name \emph{character.} Name of the Stata label set #' @return Returns a named vector of code numbers -#' @details This function returns the table of factor levels which represent a Stata label set. -#' The name of a label set for a variable can be obtained by \code{\link{get.label.name}}. +#' @details This function returns the table of factor levels which represent +#' a Stata label set. The name of a label set for a variable can be obtained +#' by \code{\link{get.label.name}}. #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' labname <- get.label.name(dat,"type") @@ -183,17 +203,40 @@ get.label <- function(dat, label.name) { return(attr(dat, "label.table")[label.name][[1]]) } +#' Get all Stata Label Sets for a Data.frame +#' +#' Retrieve the value labels for all variables. +#' +#' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. +#' @return Returns a named list of label tables +#' @details This function returns the factor levels which represent +#' a Stata label set for all variables. +#' @examples +#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) +#' get.label.tables(dat) +#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} +#' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} +#' @importFrom stats setNames +#' @export +get.label.tables <- function(dat) { + varnames <- setNames(names(dat), names(dat)) + lapply(varnames, function(varname) get.label(dat, get.label.name(dat, varname))) +} + #' Assign Stata Labels to a Variable #' -#' Assign value labels from a Stata label set to a variable. If duplicated labels are found, -#' unique labels will be generated according the following scheme: "label_(integer code)". +#' Assign value labels from a Stata label set to a variable. If duplicated +#' labels are found, unique labels will be generated according the following +#' scheme: "label_(integer code)". Levels without labels will become . #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character.} Name of the variable in the data.frame -#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA +#' @param lang \emph{character.} Label language. Default language defined by +#' \code{\link{get.lang}} is used if NA #' @return Returns a labeled factor #' @examples -#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE) +#' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), +#' convert.factors=FALSE) #' #' # compare vectors #' set.label(dat, "type") @@ -203,8 +246,9 @@ get.label <- function(dat, label.name) { #' set.label(dat, "type", "de") #' @export set.label <- function(dat, var.name, lang=NA) { - if(is.factor(dat[,var.name])) { - tmp <- get.origin.codes(dat[,var.name], get.label(dat, get.label.name(dat, var.name))) + if (is.factor(dat[,var.name])) { + tmp <- get.origin.codes(dat[,var.name], + get.label(dat, get.label.name(dat, var.name))) } else { tmp <- dat[,var.name] } @@ -213,15 +257,17 @@ set.label <- function(dat, var.name, lang=NA) { #check for duplicated labels labcount <- table(names(labtable)) - if(any(labcount > 1)) { - - - warning(paste0("\n ",var.name, ":\n Duplicated factor levels detected - generating unique labels.\n")) + if (any(labcount > 1)) { + + + warning(paste0("\n ",var.name, ":\n Duplicated factor levels detected -", + "generating unique labels.\n")) labdups <- names(labtable) %in% names(labcount[labcount > 1]) # generate unique labels from assigned label and code number - names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") + names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", + labtable[labdups], ")") } - + return(factor(tmp, levels=labtable, labels=names(labtable)) ) @@ -234,8 +280,10 @@ set.label <- function(dat, var.name, lang=NA) { #' @name varlabel #' @rdname varlabel #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. -#' @param var.name \emph{character vector.} Variable names. If NULL, get label for all variables. -#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA +#' @param var.name \emph{character vector.} Variable names. If NULL, get label +#' for all variables. +#' @param lang \emph{character.} Label language. Default language defined by +#' \code{\link{get.lang}} is used if NA #' @param value \emph{character vector.} Vector of variable names. #' @return Returns an named vector of variable labels #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} @@ -257,7 +305,7 @@ varlabel <- function(dat, var.name=NULL, lang=NA) { varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3]) names(varlabel) <- varname } - if(is.null(var.name)) { + if (is.null(var.name)) { # order by data.frame columns and return return(varlabel[vnames]) } else { @@ -269,7 +317,7 @@ varlabel <- function(dat, var.name=NULL, lang=NA) { #' @export 'varlabel<-' <- function(dat, value) { nlabs <- length(attr(dat, "var.labels")) - if(length(value)==nlabs) { + if (length(value)==nlabs) { attr(x, "var.labels") <- value } else { warning(paste("Vector of new labels must have",nlabs,"entries.")) @@ -280,11 +328,14 @@ varlabel <- function(dat, var.name=NULL, lang=NA) { #' Assign Stata Language Labels #' -#' Changes default label language for a dataset. +#' Changes default label language for a dataset. +#' Variables with generated labels (option generate.labels=TRUE) are kept unchanged. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. -#' @param lang \emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA -#' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels are generated. +#' @param lang \emph{character.} Label language. Default language defined by +#' \code{\link{get.lang}} is used if NA +#' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels +#' are generated. #' @return Returns a data.frame with value labels in language "lang". #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) @@ -310,29 +361,34 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) { val.labels <- get.label.name(dat, NULL, lang) oldval.labels <- get.label.name(dat) oldval.labels <- oldval.labels[!is.na(oldval.labels)] + oldval.labtab <- lapply(oldval.labels, function(x) get.label(dat, x)) + oldlang <- get.lang(dat, F)$default cat("Replacing value labels. This might take some time...\n") pb <- txtProgressBar(min=1,max=length(val.labels)+1) - for (i in seq_along(val.labels)) { - if(val.labels[i]!="") { + + + for (i in which(val.labels != "")) { + labname <- val.labels[i] vartype <- types[i] labtable <- label[[labname]] varname <- names(val.labels)[i] # get old codes - if(is.factor(dat[, varname])) { - oldlabname <- get.label.name(dat, varname) - oldlabtab <- get.label(dat, oldlabname) + if (is.factor(dat[, varname])) { + oldlabname <- oldval.labels[names(oldval.labels) == varname] + oldlabtab <- oldval.labtab[[names(oldlabname)]] codes <- get.origin.codes(dat[,varname], oldlabtab) varunique <- na.omit(unique(codes)) } else { varunique <- na.omit(unique(dat[,varname])) } - if(labname %in% names(label) & vartype > 65527 & is.factor(dat[,varname])) { + if (labname %in% names(label) & is.factor(dat[,varname])) { + # assign label if label set is complete if (all(varunique %in% labtable)) { @@ -340,11 +396,11 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) { labels=names(labtable)) } # else generate labels from codes - } else if(generate.factors) { + } else if (generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) - dat[,varname] <- factor(dat[,varname], levels=gen.lab, + dat[,varname] <- factor(codes, levels=gen.lab, labels=names(gen.lab)) } else { warning(paste(vnames[i], "Missing factor labels - no labels assigned. @@ -352,7 +408,6 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) { } setTxtProgressBar(pb, i) - } } close(pb) @@ -361,7 +416,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) { vnames <- names(oldval.labels) names(oldval.labels) <- NULL tmp <- list() - for (i in seq_along(val.labels)){ + for (i in seq_along(val.labels)) { tmp[[i]] <- c(vnames[i],paste0("_lang_l_",oldlang), oldval.labels[i]) } attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp) @@ -369,7 +424,7 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) { # variable label old.varlabel <- attr(dat, "var.labels") tmp <- list() - for (i in seq_along(old.varlabel)){ + for (i in seq_along(old.varlabel)) { tmp[[i]] <- c(vnames[i],paste0("_lang_v_", oldlang), old.varlabel[i]) } attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp) @@ -422,7 +477,7 @@ maxchar <- function(x) { z <- max(nchar(x, type="byte"), na.rm = TRUE) # Stata does not allow storing a string of size 0 - if(is.infinite(z) | (z == 0)) + if (is.infinite(z) | (z == 0)) z <- 1 z diff --git a/README.md b/README.md index f4eea0a8..de4d8909 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # readstata13 -Package to read and write all Stata file formats (version 14 and older) into a +Package to read and write all Stata file formats (version 15 and older) into a R data.frame. The dta file format versions 102 to 118 are supported. The function ```read.dta``` from the foreign package imports only dta files from @@ -35,12 +35,12 @@ users need to install ```R # install.packages("devtools") -devtools::install_github("sjewo/readstata13", ref="0.9.0") +devtools::install_github("sjewo/readstata13", ref="0.9.1") ``` Older Versions of devtools require a username option: ```R -install_github("readstata13", username="sjewo", ref="0.9.0") +install_github("readstata13", username="sjewo", ref="0.9.1") ``` To install the current development version from github: @@ -55,41 +55,57 @@ devtools::install_github("sjewo/readstata13", ref="testing") [![Build Status](https://travis-ci.org/sjewo/readstata13.svg?branch=master)](https://travis-ci.org/sjewo/readstata13) [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13) -### Working features -* [0.9.0] Generate unique factor labels to prevent errors in factor definition -* [0.9.0] check interrupt for long read. Patch by Giovanni Righi -* [0.9.0] updates to notes, roxygen and register -* [0.9.0] fixed size of character length. Bug reported by Yiming (Paul) Li -* [0.9.0] fix saving characters containing missings. Bug reported by Eivind H. Olsen -* [0.9.0] adjustments to convert.underscore. Patch by luke-m-olson -* [0.9.0] alow partial reading of selected rows -* [0.8.5] fix errors on big-endians systems -* [0.8.4] fix valgrind errors. converting from dta.write to writestr -* [0.8.4] fix for empty data label -* [0.8.4] make replace.strl default -* [0.8.3] restrict length of varnames to 32 chars for compatibility with Stata 14 -* [0.8.3] add many function tests -* [0.8.3] avoid converting of double to floats while writing compressed files -* [0.8.2] save NA values in character vector as empty string -* [0.8.2] convert.underscore=T will convert all non-literal characters to underscores -* [0.8.2] fix saving of Dates -* [0.8.2] save with convert.factors by default -* [0.8.2] test for NaN and inf values while writing missing values and replace with NA -* [0.8.2] remove message about saving factors -* [0.8.1] convert non-integer variables to factors (```nonint.factors=T```) -* [0.8.1] handle large datasets -* [0.8.1] working with strL variables is now a lot faster -* reading data files from disk or url and create a data.frame -* saving dta files to disk - most features of the dta file format are supported -* assign variable names -* read the new strL strings and save them as attribute -* convert stata label to factors and save them as attribute -* read some meta data (timestamp, dataset label, formats,...) -* convert strings to system encoding -* handle different NA values -* handle multiple label languages -* convert dates -* reading business calendar files +### Changelog and Features + +| Version | Changes | +| ------ | ---------------------------------------------------- | +| 0.9.1 | Allow reading only pre-selected variables | +| 0.9.1 | Experimental support for format 119 | +| 0.9.1 | Improvements to partial reading. Idea by Kevin Jin | +| 0.9.1 | Export of binary data from dta-files | +| 0.9.1 | new function get.label.tables() to show all Stata label sets | +| 0.9.1 | fix check for duplicate labels and in set.lang() | +| | +| 0.9.0 | Generate unique factor labels to prevent errors in factor definition | +| 0.9.0 | check interrupt for long read. Patch by Giovanni Righi | +| 0.9.0 | updates to notes, roxygen and register | +| 0.9.0 | fixed size of character length. Bug reported by Yiming (Paul) Li | +| 0.9.0 | fix saving characters containing missings. Bug reported by Eivind H. Olsen | +| 0.9.0 | adjustments to convert.underscore. Patch by luke-m-olson | +| 0.9.0 | alow partial reading of selected rows | +| | +| 0.8.5 | fix errors on big-endians systems | +| | +| 0.8.4 | fix valgrind errors. converting from dta.write to writestr | +| 0.8.4 | fix for empty data label | +| 0.8.4 | make replace.strl default | +| | +| 0.8.3 | restrict length of varnames to 32 chars for compatibility with Stata 14 | +| 0.8.3 | add many function tests | +| 0.8.3 | avoid converting of double to floats while writing compressed files | +| | +| 0.8.2 | save NA values in character vector as empty string | +| 0.8.2 | convert.underscore=T will convert all non-literal characters to underscores | +| 0.8.2 | fix saving of Dates | +| 0.8.2 | save with convert.factors by default | +| 0.8.2 | test for NaN and inf values while writing missing values and replace with NA | +| 0.8.2 | remove message about saving factors | +| | +| 0.8.1 | convert non-integer variables to factors (```nonint.factors=T```) | +| 0.8.1 | handle large datasets | +| 0.8.1 | working with strL variables is now a lot faster | +| | | +| <0.8.1 | reading data files from disk or url and create a data.frame | +| <0.8.1 | saving dta files to disk - most features of the dta file format are supported | +| <0.8.1 | assign variable names | +| <0.8.1 | read the new strL strings and save them as attribute | +| <0.8.1 | convert stata label to factors and save them as attribute | +| <0.8.1 | read some meta data (timestamp, dataset label, formats,...) | +| <0.8.1 | convert strings to system encoding | +| <0.8.1 | handle different NA values | +| <0.8.1 | handle multiple label languages | +| <0.8.1 | convert dates | +| <0.8.1 | reading business calendar files | ### Todo @@ -118,7 +134,7 @@ Map(identical,r12,r13) ## Authors -[Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de) (both Ruhr-Universität Bochum) +[Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de) ([sjewo](https://github.com/sjewo)) ## Licence diff --git a/inst/include/read_data.h b/inst/include/read_data.h new file mode 100644 index 00000000..7067150e --- /dev/null +++ b/inst/include/read_data.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ + +#ifndef READDATA_H +#define READDATA_H + +Rcpp::List read_data(FILE * file, + const Rcpp::IntegerVector vartype_kk, + const bool missing, + const int8_t release, + const uint64_t nn, uint32_t kk, + const Rcpp::IntegerVector vartype_sj, + const std::string byteorder, const bool swapit); + +#endif diff --git a/inst/include/read_dta.h b/inst/include/read_dta.h index 18ee0856..f0a8ee27 100644 --- a/inst/include/read_dta.h +++ b/inst/include/read_dta.h @@ -18,6 +18,10 @@ #ifndef READDTA_H #define READDTA_H -Rcpp::List read_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows); +Rcpp::List read_dta(FILE * file, const bool missing, + const Rcpp::IntegerVector selectrows, + const Rcpp::CharacterVector selectcols, + const bool strlexport, + const Rcpp::CharacterVector strlpath); #endif diff --git a/inst/include/read_pre13_dta.h b/inst/include/read_pre13_dta.h index 624b725e..108dd492 100644 --- a/inst/include/read_pre13_dta.h +++ b/inst/include/read_pre13_dta.h @@ -18,6 +18,8 @@ #ifndef READPRE13DTA_H #define READPRE13DTA_H -Rcpp::List read_pre13_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows); +Rcpp::List read_pre13_dta(FILE * file, const bool missing, + const Rcpp::IntegerVector selectrows, + const Rcpp::CharacterVector selectcols); #endif diff --git a/inst/include/readstata.h b/inst/include/readstata.h index 548c6d25..4eb125b7 100644 --- a/inst/include/readstata.h +++ b/inst/include/readstata.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -30,12 +30,12 @@ /* Test for GCC < 4.9.0 */ #if GCC_VERSION < 40900 & !__clang__ - typedef signed char int8_t; - typedef unsigned char uint8_t; - typedef signed short int16_t; - typedef unsigned short uint16_t; - typedef signed int int32_t; - typedef unsigned int uint32_t; +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef signed short int16_t; +typedef unsigned short uint16_t; +typedef signed int int32_t; +typedef unsigned int uint32_t; #else #include #endif @@ -125,4 +125,115 @@ static void writestr(std::string val_s, T len, std::fstream& dta) } +inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) { + + uint32_t k = vartype.size(); + + Rcpp::IntegerVector rlen(k); + // calculate row length in byte + for (uint32_t i=0; i(ms) + Rcpp::Rcout << "Variable " << ms << + " was not found in dta-file." << std::endl; + } + + // report position for found cases + mm = Rcpp::match(y, x); + + return(mm); +} + +// calculate the maximum jump. This calculates the maximum space we can skip if +// reading only a single variable. Before we skipped over each variable. Now we +// skip over them combined. Therefore if a value in x is positive push it +// into a new vector. If negative, sum the length up. +inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector x) { + + Rcpp::IntegerVector y; + int64_t val = 0; + bool last = 0; + + uint32_t k = x.size(); + + for (uint32_t i=0; i 0) & (last == 0)) + y.push_back(val); + + val = value; + y.push_back(val); + + last = 1; + } + + if ((i+1 == k) & (last == 0)) { + y.push_back(val); + } + + } + + return(y); +} + #endif diff --git a/inst/include/statadefines.h b/inst/include/statadefines.h index 856a83c6..2bda8635 100644 --- a/inst/include/statadefines.h +++ b/inst/include/statadefines.h @@ -54,6 +54,8 @@ #define STATA_FLOAT 65527 #define STATA_DOUBLE 65526 +#define STATA_STR 2045 +#define STATA_SHORT_STR 244 #define STATA_STRL 32768 #endif diff --git a/man/get.label.Rd b/man/get.label.Rd index 51535074..34829a41 100644 --- a/man/get.label.Rd +++ b/man/get.label.Rd @@ -18,8 +18,9 @@ Returns a named vector of code numbers Retrieve the value labels for a specific Stata label set. } \details{ -This function returns the table of factor levels which represent a Stata label set. -The name of a label set for a variable can be obtained by \code{\link{get.label.name}}. +This function returns the table of factor levels which represent + a Stata label set. The name of a label set for a variable can be obtained + by \code{\link{get.label.name}}. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) diff --git a/man/get.label.name.Rd b/man/get.label.name.Rd index fd3e0224..f90cfcd9 100644 --- a/man/get.label.name.Rd +++ b/man/get.label.name.Rd @@ -9,18 +9,22 @@ get.label.name(dat, var.name = NULL, lang = NA) \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} -\item{var.name}{\emph{character vector.} Variable names. If \code{NULL}, get names of all label sets.} +\item{var.name}{\emph{character vector.} Variable names. If \code{NULL}, get +names of all label sets.} -\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} +\item{lang}{\emph{character.} Label language. Default language defined by +\code{\link{get.lang}} is used if NA} } \value{ Returns an named vector of variable labels } \description{ -Retrieves the Stata label set in the dataset for all or an vector of variable names. +Retrieves the Stata label set in the dataset for all or an vector of variable +names. } \details{ -Stata stores factor labels in variable independent labels sets. This function retrieves the name of the label set for a variable. +Stata stores factor labels in variable independent labels sets. This + function retrieves the name of the label set for a variable. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} diff --git a/man/get.label.tables.Rd b/man/get.label.tables.Rd new file mode 100644 index 00000000..3321167e --- /dev/null +++ b/man/get.label.tables.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tools.R +\name{get.label.tables} +\alias{get.label.tables} +\title{Get all Stata Label Sets for a Data.frame} +\usage{ +get.label.tables(dat) +} +\arguments{ +\item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} +} +\value{ +Returns a named list of label tables +} +\description{ +Retrieve the value labels for all variables. +} +\details{ +This function returns the factor levels which represent + a Stata label set for all variables. +} +\examples{ +dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) +get.label.tables(dat) +} +\author{ +Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} + +Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} +} diff --git a/man/get.lang.Rd b/man/get.lang.Rd index 12577494..11be34c9 100644 --- a/man/get.lang.Rd +++ b/man/get.lang.Rd @@ -9,7 +9,8 @@ get.lang(dat, print = T) \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} -\item{print}{\emph{logical.} If \code{TRUE}, print available languages and default language.} +\item{print}{\emph{logical.} If \code{TRUE}, print available languages and +default language.} } \value{ Returns a list with two components: @@ -22,8 +23,9 @@ Returns a list with two components: Displays informations about the defined label languages. } \details{ -Stata allows to define multiple label sets in different languages. This functions reports the -available languages and the selected default language. +Stata allows to define multiple label sets in different languages. + This functions reports the available languages and the selected default + language. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} diff --git a/man/get.origin.codes.Rd b/man/get.origin.codes.Rd index 33ee7d35..59822e78 100644 --- a/man/get.origin.codes.Rd +++ b/man/get.origin.codes.Rd @@ -9,7 +9,8 @@ get.origin.codes(x, label.table) \arguments{ \item{x}{\emph{factor.} Factor to obtain code for} -\item{label.table}{\emph{table.} Table with factor levels obtained by \code{\link{get.label}}.} +\item{label.table}{\emph{table.} Table with factor levels obtained by +\code{\link{get.label}}.} } \value{ Returns an integer with original codes @@ -18,7 +19,9 @@ Returns an integer with original codes Recreates the code numbers of a factor as stored in the Stata dataset. } \details{ -While converting numeric variables into factors, the original code numbers are lost. This function reconstructs the codes from the attribute \code{label.table}. +While converting numeric variables into factors, the original code + numbers are lost. This function reconstructs the codes from the attribute + \code{label.table}. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) diff --git a/man/read.dta13.Rd b/man/read.dta13.Rd index a4b2ab6b..f4b20f9e 100644 --- a/man/read.dta13.Rd +++ b/man/read.dta13.Rd @@ -7,7 +7,8 @@ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE, encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, - add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL) + add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL, + select.cols = NULL, strlexport = FALSE, strlpath = ".") } \arguments{ \item{file}{\emph{character.} Path to the dta file you want to import.} @@ -16,14 +17,15 @@ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE, value labels are created.} \item{generate.factors}{\emph{logical.} If \code{TRUE} and convert.factors is -TRUE, missing factor labels are created from integers. If duplicated labels are found, -unique labels will be generated according the following scheme: "label_(integer code)".} +TRUE, missing factor labels are created from integers. If duplicated labels +are found, unique labels will be generated according the following scheme: +"label_(integer code)".} -\item{encoding}{\emph{character.} Strings can be converted from Windows-1252 or UTF-8 -to system encoding. Options are "latin1" or "UTF-8" to specify target -encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings - which can't be displayed in the current locale. - Set encoding=NULL to stop reencoding.} +\item{encoding}{\emph{character.} Strings can be converted from Windows-1252 +or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify +target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain +strings which can't be displayed in the current locale. +Set encoding=NULL to stop reencoding.} \item{fromEncoding}{\emph{character.} We expect strings to be encoded as "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14 @@ -53,6 +55,13 @@ will be assigned to variables of type float and double.} \item{select.rows}{\emph{integer.} Vector of one or two numbers. If single value rows from 1:val are selected. If two values of a range are selected the rows in range will be selected.} + +\item{select.cols}{\emph{character:} Vector of variables to select.} + +\item{strlexport}{\emph{logical:} Should strl content be exported as binary +files?} + +\item{strlpath}{\emph{cahracter:} Path for strl export.} } \value{ The function returns a data.frame with attributes. The attributes @@ -68,12 +77,14 @@ The function returns a data.frame with attributes. The attributes \item{var.labels:}{Variable labels} \item{version:}{dta file format version} \item{label.table:}{List of value labels.} - \item{strl:}{Character vector with long strings for the new strl string variable - type. The name of every element is the identifier.} + \item{strl:}{Character vector with long strings for the new strl string + variable type. The name of every element is the identifier.} \item{expansion.fields:}{list providing variable name, characteristic name and the contents of Stata characteristic field.} \item{missing:}{List of numeric vectors with Stata missing type for each variable.} + \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.} + \item{orig.dim:}{Dimension recorded inside the dta-file.} } } \description{ @@ -84,8 +95,9 @@ The function returns a data.frame with attributes. The attributes If the filename is a url, the file will be downloaded as a temporary file and read afterwards. -Stata files are encoded in ansinew. Depending on your system's default encoding - certain characters may appear wrong. Using a correct encoding may fix these. +Stata files are encoded in ansinew. Depending on your system's default + encoding certain characters may appear wrong. Using a correct encoding may + fix these. Variable names stored in the dta-file will be used in the resulting data.frame. Stata types char, byte, and int will become integer; float and @@ -99,15 +111,15 @@ dates. Stata 13 introduced a new character type called strL. strLs are able to store strings up to 2 billion characters. While R is able to store - strings of this size in a character vector, the printed representation of such - vectors looks rather cluttered, so it's possible to save only a reference in the - data.frame with option \code{replace.strl=FALSE}. + strings of this size in a character vector, the printed representation of + such vectors looks rather cluttered, so it's possible to save only a + reference in the data.frame with option \code{replace.strl=FALSE}. In R, you may use rownames to store characters (see for instance \code{data(swiss)}). In Stata, this is not possible and rownames have to be stored as a variable. If you want to use rownames, set add.rownames to TRUE. - Then the first variable of the dta-file will hold the rownames of the resulting - data.frame. + Then the first variable of the dta-file will hold the rownames of the + resulting data.frame. Reading dta-files of older and newer versions than 13 was introduced with version 0.8. @@ -121,8 +133,10 @@ Stata Corp (2014): Description of .dta file format \url{http://www.stata.com/help.cgi?dta} } \seealso{ -\code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata -versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. +\code{\link[foreign]{read.dta}} in package \code{foreign} and + \code{memisc} for dta files from Stata +versions < 13 and \code{read_dta} in package \code{haven} for Stata version + >= 13. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} diff --git a/man/readstata13.Rd b/man/readstata13.Rd index 62dafbde..09b47117 100644 --- a/man/readstata13.Rd +++ b/man/readstata13.Rd @@ -12,8 +12,8 @@ Function to read the Stata file format into a data.frame. If you catch a bug, please do not sue us, we do not have any money. } \seealso{ -\code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata -Versions < 13 +\code{\link[foreign]{read.dta}} and \code{memisc} for dta files from + Stata Versions < 13 } \author{ Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} diff --git a/man/save.dta13.Rd b/man/save.dta13.Rd index e4c9808b..635a9f1e 100644 --- a/man/save.dta13.Rd +++ b/man/save.dta13.Rd @@ -37,7 +37,9 @@ will be added to the dta-file.} use all of Statas numeric-vartypes.} \item{version}{\emph{numeric.} Stata format for the resulting dta-file either -the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.} +Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). +Experimental support for large datasets: Use version="15mp" to save the dataset +in the new Stata 15/MP file format. This feature is not thoroughly tested yet.} \item{convert.underscore}{\emph{logical.} If \code{TRUE}, all non numerics or non alphabet characters will be converted to underscores.} @@ -67,8 +69,9 @@ Stata Corp (2014): Description of .dta file format \url{http://www.stata.com/help.cgi?dta} } \seealso{ -\code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata -versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. +\code{\link[foreign]{read.dta}} in package \code{foreign} and + \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in + package \code{haven} for Stata version >= 13. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} diff --git a/man/set.label.Rd b/man/set.label.Rd index 65ca65c0..f57b1afb 100644 --- a/man/set.label.Rd +++ b/man/set.label.Rd @@ -11,17 +11,20 @@ set.label(dat, var.name, lang = NA) \item{var.name}{\emph{character.} Name of the variable in the data.frame} -\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} +\item{lang}{\emph{character.} Label language. Default language defined by +\code{\link{get.lang}} is used if NA} } \value{ Returns a labeled factor } \description{ -Assign value labels from a Stata label set to a variable. If duplicated labels are found, -unique labels will be generated according the following scheme: "label_(integer code)". +Assign value labels from a Stata label set to a variable. If duplicated + labels are found, unique labels will be generated according the following + scheme: "label_(integer code)". Levels without labels will become . } \examples{ -dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE) +dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), + convert.factors=FALSE) # compare vectors set.label(dat, "type") diff --git a/man/set.lang.Rd b/man/set.lang.Rd index 3afa187a..1f615718 100644 --- a/man/set.lang.Rd +++ b/man/set.lang.Rd @@ -9,15 +9,18 @@ set.lang(dat, lang = NA, generate.factors = FALSE) \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} -\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} +\item{lang}{\emph{character.} Label language. Default language defined by +\code{\link{get.lang}} is used if NA} -\item{generate.factors}{\emph{logical.} If \code{TRUE}, missing factor levels are generated.} +\item{generate.factors}{\emph{logical.} If \code{TRUE}, missing factor levels +are generated.} } \value{ Returns a data.frame with value labels in language "lang". } \description{ -Changes default label language for a dataset. +Changes default label language for a dataset. +Variables with generated labels (option generate.labels=TRUE) are kept unchanged. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) diff --git a/man/varlabel.Rd b/man/varlabel.Rd index 316d7c6b..0968cc9d 100644 --- a/man/varlabel.Rd +++ b/man/varlabel.Rd @@ -13,9 +13,11 @@ varlabel(dat) <- value \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} -\item{var.name}{\emph{character vector.} Variable names. If NULL, get label for all variables.} +\item{var.name}{\emph{character vector.} Variable names. If NULL, get label +for all variables.} -\item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} +\item{lang}{\emph{character.} Label language. Default language defined by +\code{\link{get.lang}} is used if NA} \item{value}{\emph{character vector.} Vector of variable names.} } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 33e3c458..0f12b52a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,40 +5,55 @@ using namespace Rcpp; -// stata_pre13_save -int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat); -RcppExport SEXP readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) { +// stata_read +List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols, const bool strlexport, const CharacterVector strlpath); +RcppExport SEXP _readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP, SEXP strlexportSEXP, SEXP strlpathSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); - Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); - rcpp_result_gen = Rcpp::wrap(stata_pre13_save(filePath, dat)); + Rcpp::traits::input_parameter< const bool >::type missing(missingSEXP); + Rcpp::traits::input_parameter< const IntegerVector >::type selectrows(selectrowsSEXP); + Rcpp::traits::input_parameter< const CharacterVector >::type selectcols(selectcolsSEXP); + Rcpp::traits::input_parameter< const bool >::type strlexport(strlexportSEXP); + Rcpp::traits::input_parameter< const CharacterVector >::type strlpath(strlpathSEXP); + rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows, selectcols, strlexport, strlpath)); return rcpp_result_gen; END_RCPP } -// stata_read -List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows); -RcppExport SEXP readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP) { +// stata_save +int stata_save(const char * filePath, Rcpp::DataFrame dat); +RcppExport SEXP _readstata13_stata_save(SEXP filePathSEXP, SEXP datSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); - Rcpp::traits::input_parameter< const bool >::type missing(missingSEXP); - Rcpp::traits::input_parameter< const IntegerVector >::type selectrows(selectrowsSEXP); - rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows)); + Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); + rcpp_result_gen = Rcpp::wrap(stata_save(filePath, dat)); return rcpp_result_gen; END_RCPP } -// stata_save -int stata_save(const char * filePath, Rcpp::DataFrame dat); -RcppExport SEXP readstata13_stata_save(SEXP filePathSEXP, SEXP datSEXP) { +// stata_pre13_save +int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat); +RcppExport SEXP _readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); - rcpp_result_gen = Rcpp::wrap(stata_save(filePath, dat)); + rcpp_result_gen = Rcpp::wrap(stata_pre13_save(filePath, dat)); return rcpp_result_gen; END_RCPP } + +static const R_CallMethodDef CallEntries[] = { + {"_readstata13_stata_read", (DL_FUNC) &_readstata13_stata_read, 6}, + {"_readstata13_stata_save", (DL_FUNC) &_readstata13_stata_save, 2}, + {"_readstata13_stata_pre13_save", (DL_FUNC) &_readstata13_stata_pre13_save, 2}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_readstata13(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/rcpp_readstata.cpp b/src/read.cpp similarity index 83% rename from src/rcpp_readstata.cpp rename to src/read.cpp index d00a619e..bc6b2269 100644 --- a/src/rcpp_readstata.cpp +++ b/src/read.cpp @@ -27,7 +27,9 @@ using namespace Rcpp; // @export // [[Rcpp::export]] List stata_read(const char * filePath, const bool missing, - const IntegerVector selectrows) + const IntegerVector selectrows, + const CharacterVector selectcols, + const bool strlexport, const CharacterVector strlpath) { FILE *file = NULL; // File pointer @@ -53,9 +55,10 @@ List stata_read(const char * filePath, const bool missing, List df(0); if (fbit.compare(expfbit) == 0) - df = read_dta(file, missing, selectrows); + df = read_dta(file, missing, selectrows, selectcols, + strlexport, strlpath); else - df = read_pre13_dta(file, missing, selectrows); + df = read_pre13_dta(file, missing, selectrows, selectcols); fclose(file); diff --git a/src/read_data.cpp b/src/read_data.cpp new file mode 100644 index 00000000..2b50e21d --- /dev/null +++ b/src/read_data.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . + */ + +#include "readstata.h" + +using namespace Rcpp; +using namespace std; + +List read_data(FILE * file, + const IntegerVector vartype_kk, + const bool missing, const int8_t release, + const uint64_t nn, uint32_t kk, + const IntegerVector vartype_sj, + const std::string byteorder, const bool swapit) { + + // 1. create the list + List df(kk); + for (uint32_t i=0; i0) & (type < 2046)) ? STATA_STR : type) + { + // double + case STATA_DOUBLE: + { + double val_d = 0; + val_d = readbin(val_d, file, swapit); + + if ((missing == 0) && !(val_d == R_NegInf) && ((val_dSTATA_DOUBLE_NA_MAX)) ) + REAL(VECTOR_ELT(df,ii))[j] = NA_REAL; + else + REAL(VECTOR_ELT(df,ii))[j] = val_d; + + break; + } + // float + case STATA_FLOAT: + { + float val_f = 0; + val_f = readbin(val_f, file, swapit); + + if ((missing == 0) && ((val_fSTATA_FLOAT_NA_MAX)) ) + REAL(VECTOR_ELT(df,ii))[j] = NA_REAL; + else + REAL(VECTOR_ELT(df,ii))[j] = val_f; + + break; + } + // long + case STATA_INT: + { + int32_t val_l = 0; + val_l = readbin(val_l, file, swapit); + + if ((missing == 0) && ((val_lSTATA_INT_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,ii))[j] = val_l; + + break; + } + // int + case STATA_SHORTINT: + { + int16_t val_i = 0; + val_i = readbin(val_i, file, swapit); + + if ((missing == 0) && ((val_iSTATA_SHORTINT_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,ii))[j] = val_i; + + break; + } + // byte + case STATA_BYTE: + { + int8_t val_b = 0; + val_b = readbin(val_b, file, swapit); + + if (missing == 0 && ( (val_bSTATA_BYTE_NA_MAX)) ) + INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER; + else + INTEGER(VECTOR_ELT(df,ii))[j] = val_b; + + break; + } + // strings with 2045 or fewer characters + case STATA_STR: + { + int32_t len = 0; + len = vartype_sj[i]; + std::string val_s (len, '\0'); + + readstring(val_s, file, val_s.size()); + as(df[ii])[j] = val_s; + break; + } + // string of any length + case STATA_STRL: + {// strL 2*4bit or 2 + 6 bit + + // FixMe: Strl in 118 + switch (release) + { + + case 117: + { + uint32_t v = 0, o = 0; + + v = readbin(v, file, swapit); + o = readbin(o, file, swapit); + + stringstream val_stream; + val_stream << v << '_' << o; + string val_strl = val_stream.str(); + + as(df[ii])[j] = val_strl; + + break; + } + case 118: + { + int16_t v = 0; + int64_t o = 0, z = 0; + + z = readbin(z, file, swapit); + + // works for LSF on little- and big-endian + if (byteorder.compare("LSF")==0) { + v = (int16_t)z; + o = (z >> 16); + } + + // works if we read a big-endian file on little-endian + if (byteorder.compare("MSF")==0) { + v = (z >> 48) & ((1 << 16) - 1); + o = z & ((1 << 16) - 1); + } + + stringstream val_stream; + val_stream << v << '_' << o; + string val_strl = val_stream.str(); + + as(df[ii])[j] = val_strl; + + break; + } + case 119: + { + int32_t v = 0; + int64_t o = 0, z = 0; + + z = readbin(z, file, swapit); + + // works for LSF on little- and big-endian + if (byteorder.compare("LSF")==0) { + v = (int32_t)z & ((1 << 24) - 1); + o = (z >> 24); + } + + // FixMe: works if we read a big-endian file on little-endian + if (byteorder.compare("MSF")==0) { + v = (z >> 48) & ((1 << 24) - 1); + o = z & ((1 << 24) - 1); + } + + stringstream val_stream; + val_stream << v << '_' << o; + string val_strl = val_stream.str(); + + as(df[ii])[j] = val_strl; + + break; + } + } + break; + } + // case < 0: + default: + { + // skip to the next valid case + fseeko64(file, abs(type), SEEK_CUR); + break; + } + } + + if (type >= 0) ii += 1; + + checkUserInterrupt(); + } + } + + return(df); +} diff --git a/src/read_dta.cpp b/src/read_dta.cpp index 0a582640..6578f756 100644 --- a/src/read_dta.cpp +++ b/src/read_dta.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki + * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -16,11 +16,15 @@ */ #include "readstata.h" +#include "read_data.h" using namespace Rcpp; using namespace std; -List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { +List read_dta(FILE * file, const bool missing, const IntegerVector selectrows, + const CharacterVector selectcols, + const bool strlexport, const CharacterVector strlpath) +{ // stata_dta>
test("stata_dta>
", file); test("", file); @@ -30,7 +34,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { */ int8_t fversion = 117L; //f = first - int8_t lversion = 118L; //l = last + int8_t lversion = 119L; //l = last std::string version(3, '\0'); readstring(version, file, version.size()); @@ -43,7 +47,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { // check the release version. if (releaselversion) { - Rcpp::warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release); + warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release); return -1; } @@ -65,6 +69,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { lbllen = 33; break; case 118: + case 119: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; @@ -79,7 +84,7 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { test("", file); /* - * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-memory data. + * byteorder is a 4 byte character e.g. "LSF". MSF refers to big-endian. */ std::string byteorder(3, '\0'); @@ -96,8 +101,11 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { * Number of Variables */ - uint16_t k = 0; - k = readbin(k, file, swapit); + uint32_t k = 0; + if (release < 119) + k = readbin((uint16_t)k, file, swapit); + if (release == 119) + k = readbin(k, file, swapit); // test("", file); @@ -109,17 +117,20 @@ List read_dta(FILE * file, const bool missing, const IntegerVector selectrows) { uint64_t n = 0; - if(release==117) { + if (release == 117) n = readbin((uint32_t)n, file, swapit); - } - if (release ==118) { + if ((release == 118) | (release == 119)) n = readbin(n, file, swapit); - } // test("", file); test("