diff --git a/DESCRIPTION b/DESCRIPTION index 7d11bef..994ef84 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: blink Type: Package Title: Record Linkage for Empirically Motivated Priors -Version: 0.1.0 +Version: 1.1.0 Authors@R: person("Rebecca", "Steorts", email = "beka@stat.duke.edu", role = c("aut", "cre")) Depends: R (>= 3.0.2), diff --git a/data-raw/.Rapp.history b/data-raw/.Rapp.history new file mode 100644 index 0000000..9c1584d --- /dev/null +++ b/data-raw/.Rapp.history @@ -0,0 +1,15 @@ +library(RecordLinkage) +data(RLdata500) +save(file="RLdata500-new.csv", header=TRUE, sep=",") +write.table(file="RLdata500-new.csv", header=TRUE, sep=",") +write.table(file="RLdata500-new.csv") +head(RLdata500) +write.table(RLdata500, file="RLdata500-new.csv", header=TRUE, sep=",", row.names=FALSE) +write.table(RLdata500, file="RLdata500-new.csv", sep=",", row.names=FALSE) +write.table(RLdata500, file="RLdata500-new.csv", sep=",", row.names=FALSE, quote="FALSE") +?write.table() +write.table(RLdata500, file="RLdata500-new.csv", sep=",", row.names=FALSE, quote=FALSE) +write.table(RLdata500, file="RLdata500-new.csv", sep=",", row.names=FALSE, quote=FALSE) +write.table(identity.RLdata500, file="identity.RLdata500.csv", quote=FALSE) +write.table(identity.RLdata500, file="identity.RLdata500.csv", quote=FALSE, row.names=FALSE) +write.table(identity.RLdata500, file="identity.RLdata500.csv", quote=FALSE, row.names=FALSE) diff --git a/vignettes/introEBLink.R b/vignettes/introEBLink.R new file mode 100644 index 0000000..8a7668e --- /dev/null +++ b/vignettes/introEBLink.R @@ -0,0 +1,55 @@ +## ---- echo = FALSE------------------------------------------------------------ +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") + +## ---- echo=TRUE, message=FALSE, knitr::opts_chunk$set(cache=TRUE)------------- +library(blink) +data(RLdata500) +head(RLdata500) + +## ----------------------------------------------------------------------------- +# X.c contains the 
categorical variables +# X.s contains the string variables +# p.c is the number of categorical variables +# p.s contains the number of string variables +X.c <- RLdata500[c("by","bm","bd")] +X.c <- as.matrix(RLdata500[,"bd"],ncol=1) +p.c <- ncol(X.c) +X.s <- as.matrix(RLdata500[-c(2,4,7)]) +p.s <- ncol(X.s) + +## ----------------------------------------------------------------------------- +# File number identifier +# Note: Recall that X.c and X.s include all files "stacked" on top of each other. +# The vector below keeps track of which rows of X.c and X.s are in which files. +file.num <- rep(c(1,2,3),c(200,150,150)) + +## ----------------------------------------------------------------------------- +# Subjective choices for distortion probability prior +a <-1 +b <- 999 + +## ----------------------------------------------------------------------------- +d <- function(string1,string2){adist(string1,string2)} + +## ----------------------------------------------------------------------------- +c <- 1 + +## ----results="hide"----------------------------------------------------------- +library(knitr) +library(blink) +library(plyr) +Sys.setenv(TMPDIR="/tmp/") +configure.vars="TMPDIR=/tmp/" +lam.gs <- rl.gibbs(file.num=file.num,X.s=X.s,X.c=X.c,num.gs=2,a=a,b=b,c=c,d=d, M=500) +#system.time(lam.gs <- rl.gibbs(file.num=file.num,X.s=X.s,X.c=X.c,num.gs=2,a=a,b=b,c=c,d=d, M=500)) + +## ---- fig.show="hold", fig.cap="The red line is the ground truth (450), which is not close to the estimate (500) since we only ran 10 Gibbs sampling iterations."---- +#estLink <- tempfile(pattern = "lam.gs") +estLink <- lam.gs +estPopSize <- apply(estLink , 1, function(x) {length(unique(x))}) +plot(density(estPopSize),xlim=c(300,500),main="",lty=1, "Observed Population Size", ylim= c(0,1)) +abline(v=450,col="red") +abline(v=mean(estPopSize),col="black",lty=2) +mean(estPopSize) +sd(estPopSize) + diff --git a/vignettes/introEBLink.html b/vignettes/introEBLink.html new file mode 100644 index 
0000000..378bb2a --- /dev/null +++ b/vignettes/introEBLink.html @@ -0,0 +1,429 @@ + + + + +
+ + + + + + + + + + + +We present a small example from “Entity Resolution with Empirically Motivated Priors”, Bayesian Analysis, (10),4:849-975. We will be using the RecordLinkage package in R and the RLdata500 data set.
+The blink package removes duplicate entries from multiple databases using the method outlined in the paper above. We illustrate the use of this package with a German data set consisting of first and last names and full dates of birth.
+Our goals include
+The RLdata500 data set exists already in the RecordLinkage package in R. We review this data set for the user.
+The RLdata500 data consists of 500 records with 10 percent duplication. Thus, there are 450 unique individuals. There is full information on each record containing first name, last name, and full date of birth.
+We first load the RecordLinkage package and the RLdata500 data set. We also provide the first few lines of the data.
+library(blink)
+data(RLdata500)
+head(RLdata500)
+#> fname_c1 fname_c2 lname_c1 lname_c2 by bm bd
+#> 1 CARSTEN <NA> MEIER <NA> 1949 7 22
+#> 2 GERD <NA> BAUER <NA> 1968 7 27
+#> 3 ROBERT <NA> HARTMANN <NA> 1930 4 30
+#> 4 STEFAN <NA> WOLFF <NA> 1957 9 2
+#> 5 RALF <NA> KRUEGER <NA> 1966 1 13
+#> 6 JUERGEN <NA> FRANKE <NA> 1929 7 4
Next, we prepare the data for working with the blink package.
+# X.c contains the categorical variables
+# X.s contains the string variables
+# p.c is the number of categorical variables
+# p.s contains the number of string variables
+X.c <- RLdata500[c("by","bm","bd")]
+X.c <- as.matrix(RLdata500[,"bd"],ncol=1)
+p.c <- ncol(X.c)
+X.s <- as.matrix(RLdata500[-c(2,4,7)])
+p.s <- ncol(X.s)
Now, we give a small example for setting the tuning parameters before running the Gibbs sampler.
+First, we work with a file number identifier.
+# File number identifier
+# Note: Recall that X.c and X.s include all files "stacked" on top of each other.
+# The vector below keeps track of which rows of X.c and X.s are in which files.
+file.num <- rep(c(1,2,3),c(200,150,150))
Next, we work with the parameters that tune the prior on the amount of distortion that goes into the model.
+ +Then we write a function for the Edit distance between two strings. Other distance functions could be used, such as Jaro-Winkler.
+ +For the steepness parameter, we recommend
+ +We now run a test version of the Gibbs sampler using blink, with 2 Gibbs iterations and a maximum size of M=500 (assuming the overall known population size is 500).
+ +Let’s read in the estimated linkage structure obtained from the 2 Gibbs iterations.
+#estLink <- tempfile(pattern = "lam.gs")
+estLink <- lam.gs
+estPopSize <- apply(estLink , 1, function(x) {length(unique(x))})
+plot(density(estPopSize),xlim=c(300,500),main="",lty=1, "Observed Population Size", ylim= c(0,1))
+abline(v=450,col="red")
+abline(v=mean(estPopSize),col="black",lty=2)
+mean(estPopSize)
+#> [1] 499.5
+sd(estPopSize)
+#> [1] 0.7071068
For more information, such as how to use the recall, precision, and other summary statistics, please see the paper.
+