Skip to content

Commit

Permalink
Merge pull request #54 from sjewo/testing
Browse files Browse the repository at this point in the history
Merge for 0.9.1
  • Loading branch information
sjewo authored May 25, 2018
2 parents 36e5803 + 7ec4d24 commit 4c122aa
Show file tree
Hide file tree
Showing 36 changed files with 1,263 additions and 674 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
Package: readstata13
Type: Package
Title: Import 'Stata' Data Files
Version: 0.9.0
Version: 0.9.1
Authors@R: c(
person("Jan Marvin", "Garbuszus",
email = "[email protected]", role = c("aut")),
person("Sebastian", "Jeworutzki",
email="[email protected]", role = c("aut", "cre")),
email="[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2671-5253")),
person("R Core Team", role="cph"),
person("Magnus Thor", "Torfason", role="ctb"),
person("Luke M.", "Olson", role="ctb"),
person("Giovanni", "Righi", role="ctb")
person("Giovanni", "Righi", role="ctb"),
person("Kevin", "Jin", role="ctb")
)
Description: Function to read and write the 'Stata' file format.
URL: https://github.com/sjewo/readstata13
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export("varlabel<-")
export(as.caldays)
export(get.label)
export(get.label.name)
export(get.label.tables)
export(get.lang)
export(get.origin.codes)
export(read.dta13)
Expand All @@ -15,6 +16,7 @@ export(varlabel)
import(Rcpp)
importFrom(stats,complete.cases)
importFrom(stats,na.omit)
importFrom(stats,setNames)
importFrom(utils,download.file)
importFrom(utils,localeToCharset)
importFrom(utils,setTxtProgressBar)
Expand Down
9 changes: 9 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
[0.9.1]
- allow reading only pre-selected variables
- experimental support for format 119
- improve partial reading
- export of binary data from dta-files
- new function get.label.tables() to show all Stata label sets
- fix check for duplicate labels
- fixes in set.lang

[0.9.0]
- generate unique factor labels to prevent errors in factor definition
- check interrupt for long read
Expand Down
12 changes: 6 additions & 6 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

stata_pre13_save <- function(filePath, dat) {
.Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat)
stata_read <- function(filePath, missing, selectrows, selectcols, strlexport, strlpath) {
.Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols, strlexport, strlpath)
}

stata_read <- function(filePath, missing, selectrows) {
.Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows)
stata_save <- function(filePath, dat) {
.Call(`_readstata13_stata_save`, filePath, dat)
}

stata_save <- function(filePath, dat) {
.Call('readstata13_stata_save', PACKAGE = 'readstata13', filePath, dat)
stata_pre13_save <- function(filePath, dat) {
.Call(`_readstata13_stata_pre13_save`, filePath, dat)
}

8 changes: 6 additions & 2 deletions R/dbcal.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,12 @@ stbcal <- function(stbcalfile) {
# In case centerdate is not rangestart:
stbcal$buisdays <- NA
stbcal$buisdays[stbcal$range==centerdate] <- 0
stbcal$buisdays[stbcal$range<centerdate] <- seq(from=-length(stbcal$range[stbcal$range<centerdate]), to=-1)
stbcal$buisdays[stbcal$range>centerdate] <- seq(from=1, to=length(stbcal$range[stbcal$range>centerdate]))
stbcal$buisdays[stbcal$range<centerdate] <- seq(
from=-length(stbcal$range[stbcal$range<centerdate]),
to=-1)
stbcal$buisdays[stbcal$range>centerdate] <- seq(
from=1,
to=length(stbcal$range[stbcal$range>centerdate]))

# Add purpose
if (any(grepl("purpose", x))) {
Expand Down
103 changes: 66 additions & 37 deletions R/read.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
# Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley
#
# This program is free software; you can redistribute it and/or modify it
Expand All @@ -23,13 +23,14 @@
#' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata
#' value labels are created.
#' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is
#' TRUE, missing factor labels are created from integers. If duplicated labels are found,
#' unique labels will be generated according the following scheme: "label_(integer code)".
#' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8
#' to system encoding. Options are "latin1" or "UTF-8" to specify target
#' encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings
#' which can't be displayed in the current locale.
#' Set encoding=NULL to stop reencoding.
#' TRUE, missing factor labels are created from integers. If duplicated labels
#' are found, unique labels will be generated according the following scheme:
#' "label_(integer code)".
#' @param encoding \emph{character.} Strings can be converted from Windows-1252
#' or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
#' target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain
#' strings which can't be displayed in the current locale.
#' Set encoding=NULL to stop reencoding.
#' @param fromEncoding \emph{character.} We expect strings to be encoded as
#' "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14
#' or newer "UTF-8" is used. In some situation the used encoding can differ for
Expand All @@ -51,12 +52,17 @@
#' @param select.rows \emph{integer.} Vector of one or two numbers. If single
#' value rows from 1:val are selected. If two values of a range are selected
#' the rows in range will be selected.
#' @param select.cols \emph{character:} Vector of variables to select.
#' @param strlexport \emph{logical:} Should strl content be exported as binary
#' files?
#' @param strlpath \emph{cahracter:} Path for strl export.
#'
#' @details If the filename is a url, the file will be downloaded as a temporary
#' file and read afterwards.
#'
#' Stata files are encoded in ansinew. Depending on your system's default encoding
#' certain characters may appear wrong. Using a correct encoding may fix these.
#' Stata files are encoded in ansinew. Depending on your system's default
#' encoding certain characters may appear wrong. Using a correct encoding may
#' fix these.
#'
#' Variable names stored in the dta-file will be used in the resulting
#' data.frame. Stata types char, byte, and int will become integer; float and
Expand All @@ -70,15 +76,15 @@
#'
#' Stata 13 introduced a new character type called strL. strLs are able to store
#' strings up to 2 billion characters. While R is able to store
#' strings of this size in a character vector, the printed representation of such
#' vectors looks rather cluttered, so it's possible to save only a reference in the
#' data.frame with option \code{replace.strl=FALSE}.
#' strings of this size in a character vector, the printed representation of
#' such vectors looks rather cluttered, so it's possible to save only a
#' reference in the data.frame with option \code{replace.strl=FALSE}.
#'
#' In R, you may use rownames to store characters (see for instance
#' \code{data(swiss)}). In Stata, this is not possible and rownames have to be
#' stored as a variable. If you want to use rownames, set add.rownames to TRUE.
#' Then the first variable of the dta-file will hold the rownames of the resulting
#' data.frame.
#' Then the first variable of the dta-file will hold the rownames of the
#' resulting data.frame.
#'
#' Reading dta-files of older and newer versions than 13 was introduced
#' with version 0.8.
Expand All @@ -95,17 +101,21 @@
#' \item{var.labels:}{Variable labels}
#' \item{version:}{dta file format version}
#' \item{label.table:}{List of value labels.}
#' \item{strl:}{Character vector with long strings for the new strl string variable
#' type. The name of every element is the identifier.}
#' \item{strl:}{Character vector with long strings for the new strl string
#' variable type. The name of every element is the identifier.}
#' \item{expansion.fields:}{list providing variable name, characteristic name
#' and the contents of Stata characteristic field.}
#' \item{missing:}{List of numeric vectors with Stata missing type for each
#' variable.}
#' \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.}
#' \item{orig.dim:}{Dimension recorded inside the dta-file.}
#' }
#' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members
#' from foreign::read.dta().
#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and
#' \code{memisc} for dta files from Stata
#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version
#' >= 13.
#' @references Stata Corp (2014): Description of .dta file format
#' \url{http://www.stata.com/help.cgi?dta}
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
Expand All @@ -119,7 +129,9 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
convert.underscore = FALSE, missing.type = FALSE,
convert.dates = TRUE, replace.strl = TRUE,
add.rownames = FALSE, nonint.factors=FALSE,
select.rows = NULL) {
select.rows = NULL, select.cols = NULL,
strlexport = FALSE, strlpath = ".") {

# Check if path is a url
if (length(grep("^(http|ftp|https)://", file))) {
tmp <- tempfile()
Expand All @@ -145,11 +157,11 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
if (any(select.rows < 0) )
select.rows <- abs(select.rows)

# check that lenght is not > 2
# check that length is not > 2
if (length(select.rows) > 2)
return(message("select.rows must be of length 1 or 2."))

# if lenght 1 start at row 1
# if length 1 start at row 1
if (length(select.rows) == 1)
select.rows <- c(1, select.rows)
}
Expand All @@ -165,7 +177,12 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
select.rows <- c(0,0)
}

data <- stata_read(filepath, missing.type, select.rows)
if (is.null(select.cols)){
select.cols <- ""
}

data <- stata_read(filepath, missing.type, select.rows, select.cols,
strlexport, strlpath)

version <- attr(data, "version")

Expand Down Expand Up @@ -301,7 +318,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
strl <- c("")
names(strl) <- "00000000000000000000"
strl <- c(strl, attr(data,"strl"))
for (j in seq(ncol(data))[types == 32768] ) {
for (j in seq(ncol(data))[types == sstrl] ) {
data[, j] <- strl[data[,j]]
}
# if strls are in data.frame remove attribute strl
Expand Down Expand Up @@ -349,36 +366,48 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
if (labname %in% names(label)) {
if((vartype == sdouble | vartype == sfloat)) {
if(!nonint.factors) {
warning(paste0("\n ",vnames[i], ":\n Factor codes of type double or float detected - no labels assigned.\n Set option nonint.factors to TRUE to assign labels anyway.\n"))
warning(paste0("\n ",vnames[i], ":\n Factor codes of type double ",
"or float detected - no labels assigned.\n Set ",
"option nonint.factors to TRUE to assign labels ",
"anyway.\n"))
next
}
}
# get unique values / omit NA
varunique <- na.omit(unique(data[, i]))

#check for duplicated labels
labcount <- table(names(labtable))
if(any(labcount > 1)) {
warning(paste0("\n ",vnames[i], ":\n Duplicated factor levels ",
"detected - generating unique labels.\n"))
labdups <- names(labtable) %in% names(labcount[labcount > 1])
# generate unique labels from assigned label and code number
names(labtable)[labdups] <- paste0(names(labtable)[labdups],
"_(", labtable[labdups], ")")
}

# assign label if label set is complete
if (all(varunique %in% labtable)) {

#check for duplicated labels
labcount <- table(names(labtable))
if(any(labcount > 1)) {
warning(paste0("\n ",vnames[i], ":\n Duplicated factor levels detected - generating unique labels.\n"))
labdups <- names(labtable) %in% names(labcount[labcount > 1])
# generate unique labels from assigned label and code number
names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
}

data[, i] <- factor(data[, i], levels=labtable,
labels=names(labtable))
# else generate labels from codes
} else if (generate.factors) {
names(varunique) <- as.character(varunique)
gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable))

data[, i] <- factor(data[, i], levels=gen.lab,
labels=names(gen.lab))

# add generated labels to label.table
gen.lab.name <- paste0("gen_",vnames[i])
attr(data, "label.table")[[gen.lab.name]] <- gen.lab
attr(data, "val.labels")[i] <- gen.lab.name

} else {
warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no labels assigned.\n Set option generate.factors=T to generate labels."))
warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no ",
"labels assigned.\n Set option generate.factors=T to ",
"generate labels."))
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions R/readstata13.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
#' @useDynLib readstata13, .registration = TRUE
#' @import Rcpp
#' @note If you catch a bug, please do not sue us, we do not have any money.
#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata
#' Versions < 13
#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from
#' Stata Versions < 13
NULL
29 changes: 20 additions & 9 deletions R/save.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
Expand Down Expand Up @@ -39,7 +39,9 @@
#' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will
#' use all of Statas numeric-vartypes.
#' @param version \emph{numeric.} Stata format for the resulting dta-file either
#' the internal Stata dta-format (e.g. 117 for Stata 13) or versions 6 - 14.
#' Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13).
#' Experimental support for large datasets: Use version="15mp" to save the dataset
#' in the new Stata 15/MP file format. This feature is not thoroughly tested yet.
#' @return The function writes a dta-file to disk. The following features of the
#' dta file format are supported:
#' \describe{
Expand All @@ -54,8 +56,9 @@
#' type. The first element is the identifier and the second element the
#' string.}
#' }
#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata
#' versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13.
#' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and
#' \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in
#' package \code{haven} for Stata version >= 13.
#' @references Stata Corp (2014): Description of .dta file format
#' \url{http://www.stata.com/help.cgi?dta}
#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
Expand All @@ -72,9 +75,13 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
if (!is.data.frame(data))
stop("The object \"data\" must have class data.frame")
if (!dir.exists13(dirname(file)))
stop("Path is invalid. Possibly a non existend directory.")
stop("Path is invalid. Possibly a non-existing directory.")

# Allow writing version as Stata version not Stata format
if (version=="15mp")
version <- 119
if (version==15L)
version <- 118
if (version==14L)
version <- 118
if (version==13L)
Expand All @@ -90,8 +97,11 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
if (version==6)
version <- 108

if (version<102 | version == 109 | version == 116 | version>118)
stop("Version missmatch abort execution. No Data was saved.")
if (version == 119)
message("Support for Stata 15/MP (119) format is experimental and not thoroughly tested.")

if (version<102 | version == 109 | version == 116 | version>119)
stop("Version mismatch abort execution. No Data was saved.")

sstr <- 2045
sstrl <- 32768
Expand Down Expand Up @@ -168,7 +178,8 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
hasfactors <- sapply(data, is.factor)

if (any(hasfactors))
warning("dta-format < 106 does not handle factors. Labels are not saved!")
warning(paste("dta-format < 106 can not handle factors.",
"Labels are not saved!"))
}
# If our data.frame contains factors, we create a label.table
factors <- which(sapply(data, is.factor))
Expand Down Expand Up @@ -278,7 +289,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
vartypen[empty] <- sbyte
}

# recode character variables. 118 wants utf-8, so encoding may be required
# recode character variables. >118 wants utf-8, so encoding may be required
if(doRecode) {
#TODO: use seq_len ?
for(v in (1:ncol(data))[vartypen == "character"]) {
Expand Down
Loading

0 comments on commit 4c122aa

Please sign in to comment.