-
Notifications
You must be signed in to change notification settings - Fork 1
/
dtalink_example.do
478 lines (396 loc) · 15.9 KB
/
dtalink_example.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
cd "`c(sysdir_personal)'/d"
cd ..\..\..\dtalink\code-dtalink\
cap log close dtalink_example
clear all
cls
set linesize 160
cap nois log using "dtalink_example.log", replace name(dtalink_example)
*! dtalink_example.do
*! Probabilistic record linkage routine - examples
*!
*! This progam shows how the dtalink command might be used.
*!
*! By Keith Kranker
*
* Copyright (C) Mathematica Policy Research, Inc. This code cannot be copied, distributed or used without the express written permission of Mathematica Policy Research, Inc.
pwd
about
which dtalink
set processors `c(processors_max)'
***************************************
* Examples w/ baseball players
***************************************
input byte id str8 first str6 middle str9 last byte yankee byte file
1 "M." "" "Mantle" 1 0
1 "Mickey" "" "Mantle" 1 0
1 "Mickey" "" "Mantle" 1 0
2 "Henry" "Louis" "Gehrig" 1 0
2 "Lou" "" "Gehrig" 1 0
3 "Babe" "" "Ruth" 1 0
3 "George" "Herman" "Ruth, Jr." 1 0
3 "George" "Herman" "Ruth" 0 0
4 "Ted" "" "Williams" 0 0
4 "Theodore" "Samuel" "Williams" 0 0
5 "William" "Ted" "Cox" 0 0
5 "Ted" "" "Cox" 0 0
6 "Stan" "" "Williams" 0 0
7 "M." "" "Mantle" 1 1
7 "Mickey" "" "Mantle" 1 1
8 "Henry" "Louis" "Gehrig" 1 1
8 "Lou" "" "Gehrig" 1 1
9 "Babe" "" "Ruth" 1 1
9 "Babe" "" "Ruth, Jr" 1 1
9 "George" "Herman" "Ruth, Jr" 0 1
10 "Ted" "" "Williams" 0 1
10 "Theodore" "Samuel" "Williams" 0 1
11 "William" "Ted" "Cox" 0 1
11 "Ted" "" "Cox" 0 1
12 "Stan" "" "Williams" 0 1
end
bys file (id): list, sepby(id) noobs
preserve
// basic dedup
drop id file
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 0 -1, cutoff(4) examples(16) describe
// basic dedup - with id() variable and the "fillunmatched" option
restore, preserve
keep if file==0
drop file
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 0 -1, cutoff(4) id(id) fillunmatched describe examples(16)
cap nois list , sepby(_matchID)
// same thing with one caliper matching variable
restore, preserve
tostring id , replace format(%03.0f)
replace id = id + "XX"
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 0 -1 1, cutoff(4) id(id) fillunmatched describe examples(16) calcweights
cap nois list , sepby(_matchID)
return list
// dedup where we keep only the "best" _matchid for each id --- notice that we no longer have matches between 4 and 6, 4 and 10, 4 and 12, or 10 and 12
restore, preserve
drop file
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 0 -1, cutoff(4) id(id) bestmatch noweight
list , sepby(_matchID)
// dedup with "combined" matched sets --- notice that 4, 6, 10, and 12 are now in a single matched set
restore, preserve
drop file
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 0 -1, cutoff(4) id(id) combinesets noweight
list , sepby(_matchID)
// dedup with ties
restore, preserve
drop file
dtalink first 3 -3 middle 3 -3 last 3 -3 yankee 3 -3, cutoff(4) id(id) bestmatch noweight ties
list , sepby(_matchID)
// dedup with ties
restore, preserve
drop file
dtalink first 3 -3 middle 3 -3 last 3 -3 yankee 3 -3, cutoff(4) id(id) bestmatch noweight ties combinesets
list , sepby(_matchID)
// basic dedup - wide format
restore, preserve
keep if file==0
drop file id
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 0 -1, cutoff(4) wide nomerge describe examples(16)
// basic dedup with blocks
restore, preserve
keep if file==0
drop file id
replace yankee = . in 4/5
dtalink first 3 -3 middle 2 -1 last 6 -5 yankee 2 0, cutoff(3) block(yankee | last) describe examples(16)
// basic merge
restore, preserve
drop id
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) source(file) describe
gsort -_score _matchID file _id
list, sepby(_matchID)
// basic merge - with id() variable
restore, preserve
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) id(id) source(file) describe
list , sepby(_matchID)
// basic merge - with id() variable, keep all scores
restore, preserve
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) id(id) source(file) allscores describe
list , sepby(_matchID)
return list
// basic merge where we keep only the "best" _matchid for each id --- notice that 4 and 12 are no longer matched together, since 4 was (only) matched to 12
restore, preserve
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) id(id) source(file) noweight bestmatch describe
list , sepby(_matchID)
// same thing with srcbestmatch(0)
restore, preserve
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) id(id) source(file) noweight srcbestmatch(0)
list , sepby(_matchID)
// same thing with srcbestmatch(1)
restore, preserve
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) id(id) source(file) noweight srcbestmatch(1)
list , sepby(_matchID)
// same thing with bestmatch(0) & ties & combinesets
restore, preserve
dtalink first 3 -3 middle 3 -3 last 3 -3 yankee 3 -3, cutoff(3) id(id) source(file) noweight bestmatch ties
list , sepby(_matchID)
// same thing with srcbestmatch(0)
restore, preserve
dtalink first 3 -3 middle 3 -3 last 3 -3 yankee 3 -3, cutoff(3) id(id) source(file) noweight srcbestmatch(0) ties
list , sepby(_matchID)
// same thing with srcbestmatch(1)
restore, preserve
dtalink first 3 -3 middle 3 -3 last 3 -3 yankee 3 -3, cutoff(3) id(id) source(file) noweight srcbestmatch(1) ties
list , sepby(_matchID)
// basic merge with "combined" matched sets --- notice that 4, 6, 10, and 12 are all matched together in a single _matchID
restore, preserve
tostring id , replace format(%03.0f)
replace id = id + "XX"
dtalink first 3 -3 middle 3 -1 last 8 -5 yankee 2 0, cutoff(3) id(id) source(file) combinesets noweight
list , sepby(_matchID)
// basic merge with `using' syntax
restore, preserve
tempfile file2
keep if file==0
drop file
save "`file2'"
restore, preserve
drop if file==0
drop file
dtalink last 3 -3 yankee 1 -1 using "`file2'", cutoff(3) describe
// the parallel prefix might speed things up:
restore
drop if file==0
drop file
gen subsetfile = floor(_n/5)
sort subsetfile, stable
cap nois {
parallel setclusters 2
parallel, by(subsetfile) processors(2): dtalink last 3 -3 yankee 1 -1 using "`file2'", cutoff(3)
tab _score, plot
}
***************************************
* Trivial example
***************************************
clear
set obs 10
set seed 1
gen byte file = _n <= .45*_N
gen int var1 = floor(runiform()*3) if runiform()<.75
tab var1 file, mi
dtalink var1 1 0 var1 1 0 0, source(file) wide cutoff(1)
list
*********************************************
* Example with fake "birth certificate" data
*********************************************
* This program shows how the dtalink command might be used to verify the quality of a data linkage.
* Suppose someone else links a new file to (variables dob ssn lastname and firstname) onto your
* existing dataset (variables mpr_dob mpr_ssn mpr_lastname and mpr_firstname). After a little work to
* reshape the dataset, you can use dtalink to score each matched pairs. The trick is to use the
* block() option to restrict the matched pairs to those that were found by the other person.
clear
input byte trueID byte fileid str16 dob_str double ssn str16 lastname str9 firstname byte mofb int yofb str4 lastname_soundex str8 firstname_nysiis
1 0 "01/05/1985" 1000000000 "Doe" "Jane" 1 1985 "D000" "jan"
1 1 "01/06/1985" 1000000000 "Doe" "Jane" 1 1985 "D000" "jan"
2 0 "02/07/1985" 1000000001 "Smith" "Mary" 2 1985 "S530" "mary"
2 1 "02/07/1985" . "Smoth" "Mary" 2 1985 "S530" "mary"
3 0 "05/05/1985" 1000000002 "Johnson" "Catherine" 5 1985 "J525" "cataran"
3 1 "05/05/1985" 1000000002 "Jonson" "Katie" 5 1985 "J525" "caty"
4 0 "05/05/1985" 1000000003 "Jones" "Elizabeth" 5 1985 "J520" "elasabat"
4 1 "05/05/1985" 1000000003 "Jones" "" 5 1985 "J520" ""
5 0 "01/01/1983" . "Sanchez" "Maria" 1 1983 "S522" "mar"
5 1 "01/01/1983" 1000000004 "Sanchez-Martinez" "Maria" 1 1983 "S522" "mar"
6 0 "05/05/1985" 1000000005 "Johnson" "Jane" 5 1985 "J525" "jan"
6 1 "05/05/1985" 1000000005 "Johnson" "Jane" 5 1985 "J525" "jan"
7 0 "11/25/1985" 1000000006 "Miller" "Amy" 11 1985 "M460" "any"
8 0 "08/05/2000" 1000000007 "Miller" "Amy" 8 2000 "M460" "any"
8 1 "05/01/1980" 2000000007 "Miller" "Anne" 5 1980 "M460" "an"
end
gen int dob = date(dob_str,"MDY")
format dob %tdN/D/CY
format ssn %12.0f
order dob, before(dob_str)
drop dob_str
label define fileid 0 "A" 1 "B"
label val fileid fileid
label var trueID "Study ID"
label var fileid "File A or B"
label var dob "Date of birth"
label var ssn "Social security number"
label var lastname "Last name"
label var firstname "First name"
label var mofb "Month of birth"
label var yofb "Year of birth"
label var lastname_soundex "Soundex code for last name"
label var firstname_nysiis "NYSIIS code for first name"
// show reshaped dataset
desc
list,sepby(trueID) noobs
preserve
// score the linkages
dtalink dob 6.2 -1.7 yofb 1 -1 mofb 1 -1 ssn 15 -5 lastname 3.2 -1.0 lastname_soundex 3.2 -.9 firstname 2.5 -1.8 firstname_nysiis 2.5 -1.7, ///
source(fileid) cutoff(-9999) describe examples(16)
list if _matchflag!=1, sepby(_matchID)
// score the linkages with the distance() option too
restore, preserve
dtalink dob 6.2 -1.7 dob 20 0 30 dob 10 -5 365 yofb 1 -1 mofb 1 -1 ssn 15 -5 lastname 3.2 -1.0 lastname_soundex 3.2 -.9 firstname 2.5 -1.8 firstname_nysiis 2.5 -1.7, ///
source(fileid) cutoff(-9999) describe examples(16)
list if _matchflag!=1, sepby(_matchID)
// score the linkages with only the distance() option
restore, preserve
dtalink dob 20 0 30 dob 10 -5 365 , source(fileid) cutoff(-9999) describe examples(16)
list if _matchflag!=1, sepby(_matchID)
// calc weights as if this were a training dataset
restore, preserve
dtalink trueID 1 -1 dob 0 0 30 dob 0 0 365 yofb 0 0 mofb 0 0 ssn 0 0 lastname 0 0 lastname_soundex 0 0 firstname 0 0 firstname_nysiis 0 0, source(fileid) cutoff(0) calcweight
// remove weights for trueID
local newwgts = r(new_wgt_specs)
di "`macval(_newwgts)'"
gettoken trsh newwgts : newwgts
gettoken trsh newwgts : newwgts
gettoken trsh newwgts : newwgts
di "`macval(_newwgts)'"
// see what happens when we use recommended weights
// (after picking a new cutoff)
restore
dtalink `newwgts', source(fileid) cutoff(6) bestmatch
// It worked!
sort _matchID trueID
list _matchflag _matchID trueID, sepby(_matchID)
by _matchID (trueID): assert trueID[_n]==trueID[_N] if _matchflag
***************************************
* Short program for 1-line timers
***************************************
program define timer99
_on_colon_parse `0'
timer clear 99
timer on 99
`s(after)'
timer off 99
qui timer list 99
di as txt " Time to run = " _c
if r(t99) < 90 di as res =round(r(t99) ,.01) " sec"
else if r(t99)/60 < 400 di as res =round(r(t99)/60 ,.01) " min"
else di as res =round(r(t99)/60/60,.01) " hr"
timer clear 99
end
***************************************
* Randomly generated data
***************************************
* ----------- Tall example dataset ----------
clear
set seed 1
set obs 2000
gen int x1 = floor(runiform(0,30))
gen int x2 = floor(runiform(0,30))
gen int x3 = floor(runiform(0,30))
gen x4 = cond(runiform()<.5,"male","female")
gen byte b1 = runiform()<.5
gen byte b2 = runiform()<.5
gen byte b3 = runiform()<.5
gen str2 b4 = cond(runiform()<.5,"TX","CA")
compress
local myfloor = 4
* ----------- without missing data ----------
preserve
dtalink x1 1 -1 x2 2 -2 x3 3 -3 x4 .1 -5 in 1/20, cutoff(4) describe examples(16)
restore, preserve
dtalink x1 1 -1 x2 2 -2 x3 3 -3 x4 .1 -5 in 1/20, cutoff(4) block(b1 b2 | b3 | b1 b3) describe examples(16)
restore, preserve
timer99: dtalink x1 1 -1 x2 2 -2 x3 3 -3 x4 .1 -5 , cutoff(4) describe examples(16)
restore, preserve
timer99: dtalink x1 1 -1 x2 2 -2 x3 3 -3 x4 .1 -5 , cutoff(4) block(b1 b2 | b3 | b1 b3 | b2 b4) wide describe examples(16)
restore, preserve
qui {
dtalink x1 1 -1 x2 2 -2 x3 3 -3 2 x4 .1 -5, cutoff(2) describe examples(16) calcweights
restore, preserve
dtalink `r(new_wgt_specs)' , cutoff(2) describe examples(16) calcweights
restore, preserve
dtalink `r(new_wgt_specs)' , cutoff(2) describe examples(16) calcweights
restore, preserve
}
dtalink `r(new_wgt_specs)' , cutoff(2) describe examples(16) calcweights
return list
* ----------- with missing data ----------
restore
replace x1=. in 1/2
replace x2=. in 2/3
set seed 2
replace x1=. if runiform()<.05
replace x2=. if runiform()<.05
preserve
dtalink x1 1 -1 x2 2 -2 x3 3 -3 x4 .1 -2 in 1/10, cutoff(0) merge describe examples(16)
restore
timer99: dtalink x1 1 -1 x2 2 -2 x3 3 -3 , cutoff(4) merge describe examples(16)
* ----------- Wide example dataset ----------
clear
set seed 3
set obs 2000
forvalues k = 1/500 {
gen int var`k'=floor(runiform(0,10))
if `k' <= 100 local v1_100 `v1_100' var`k' 1 0
local v1_500 `v1_500' var`k' 1 0
}
qui compress
preserve
dtalink `v1_100', cutoff(20) noweighttable wide describe examples(16)
restore
timer99: dtalink `v1_500', cutoff(20) noweighttable wide describe examples(16)
* ----------- Lots of blocks example ----------
clear
set seed 4
set obs 50000
forvalues k = 1/10 {
gen var`k'=floor(runiform(0,20))
local v1_k `v1_k' var`k' 1 0
if `k'>4 continue
gen blk`k' = floor(runiform(0,10000))
}
qui compress
gen f = runiform()<.5
gen g = floor(runiform(0,10))
gen dob = floor(runiform(0,400))
sort f g dob
preserve
timer99: dtalink `v1_k' , cutoff(5) block(blk1 | blk2 | blk3 | f) wide describe
restore, preserve
timer99: dtalink `v1_k' , cutoff(5) block(blk1 | blk2 | blk3) source(f) wide describe
restore, preserve
timer99: dtalink `v1_k' dob 20 0 30 dob 10 -5 365 , cutoff(5) block(blk1 | blk2 | blk3 ) noweighttable
// audit scores to see if I'm getting right results
bys _score _matchID (_id) : gen diff = abs(dob[1]-dob[_N])
by _score _matchID (_id) : gen scoreE = (var1[1]==var1[_N])+(var2[1]==var2[_N])+(var3[1]==var3[_N])+(var4[1]==var4[_N])+(var5[1]==var5[_N])+(var6[1]==var6[_N])+(var7[1]==var7[_N])+(var8[1]==var8[_N])+(var9[1]==var9[_N])+(var10[1]==var10[_N])
by _score _matchID (_id) : gen scoreD = cond(diff<=30, 30, cond(diff<=365, 10, -5))
gen score = scoreE + scoreD
by _score _matchID (_id) : gen s1 = (_n==1 * runiform()<.00002)
by _score _matchID (_id) : egen s2 = max(s1)
count if s2
// list _* dob diff-score if s2 | score!=_score, sepby(_matchID) noobs
assert score==_score
restore, preserve
timer99: dtalink `v1_k' dob 20 0 30 dob 10 -5 365 , cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable source(f)
// the first dtalink example above takes so long because the variable f makes huge blocks. See, this takes almost as long:
restore, preserve
timer99: dtalink `v1_k', cutoff(5) block(f) wide describe
// when you only have one block, the parallel prefix might speed things up:
restore, preserve
cap nois {
parallel setclusters 2
timer99: ///
parallel, by(f) processors(2): dtalink `v1_k', cutoff(5) wide
tab _score, plot
}
// EM example
restore, preserve
di as input "`v1_k'"
dtalink `v1_k' dob 20 0 30 dob 10 -5 365 , cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable calcweights
di as input =r(new_wgt_specs)
restore, preserve
dtalink `r(new_wgt_specs)', cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable calcweights
restore, preserve
di as input =r(new_wgt_specs)
dtalink `r(new_wgt_specs)', cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable calcweights
restore, preserve
di as input "`v1_k'"
qui dtalink `v1_k' dob 20 0 30 dob 10 -5 365 , cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable source(f) calcweights
di as input =r(new_wgt_specs)
restore, preserve
qui dtalink `r(new_wgt_specs)', cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable source(f) calcweights
di as input =r(new_wgt_specs)
restore, preserve
dtalink `r(new_wgt_specs)', cutoff(5) block(blk1 | blk2 | blk3 ) wide noweighttable source(f) calcweights
return list
log close dtalink_example