-
Notifications
You must be signed in to change notification settings - Fork 3
/
addNewHES.py
105 lines (89 loc) · 3.93 KB
/
addNewHES.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import argparse
from datetime import datetime
from dateutil.relativedelta import relativedelta
import json
import numpy as np
import os
import pandas as pd
import scipy.stats
from subprocess import call
import sys
def _str2bool(value):
    """Convert a command-line token such as 'True', 'false' or '0' to a bool.

    argparse's ``type=bool`` simply calls ``bool()`` on the raw string, so
    every non-empty token -- including the literal 'False' -- would be parsed
    as True.  This converter interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False, which is what bool('') used to give.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (e.g. True/False), got %r" % (value,))


# Command-line interface: four required positional paths plus two options.
# add_help=False means argparse does not register -h/--help on this parser.
parser = argparse.ArgumentParser(
    description="""Join existing demographic and health data file with new Hospital Episode Statistics data
for subsequent health analyses""",
    add_help=False
)
# required args
parser.add_argument('inCSV', type=str,
                    help="""CSV file containing current data
If the path contains spaces, it must be enclosed in
quote marks (e.g. "/data/dph.../data sets/")""")
parser.add_argument('hesCSV', type=str, help="hospital episode statistics csv")
parser.add_argument('outCSV', type=str, help="""output for analysis csv""")
# NOTE: this is a required positional (no nargs='?'), so argparse never falls
# back to the default below; the value must always be given on the command line.
parser.add_argument('diseaseJSON', type=str, default="icdGroups.json", help="""target ICD10/ICD9 groups json""")
# optional args
# _str2bool (not bool) so that "--incident_prevalent False" really yields False.
parser.add_argument('--incident_prevalent', type=_str2bool, default=False, help="""Should columns for incident and prevalent disease be added?""")
parser.add_argument('--date_column', type=str, default='endTime', help="""Name of date column in 'inCSV'""")
# parse arguments
# sys.argv[0] is the script itself, so four or fewer entries cannot cover the
# four positional arguments: print the usage text and stop with a non-zero
# exit status instead of letting argparse report the error.
if len(sys.argv) <= 4:
    parser.print_help()
    sys.exit(-1)

args = parser.parse_args()
# Add date parsers
def hes_format_parser(x):
    """Parse day/month/year date strings (e.g. "31/12/2015") with pandas.

    Anything that does not match the "%d/%m/%Y" format is coerced to NaT
    rather than raising an error.
    """
    return pd.to_datetime(x, format="%d/%m/%Y", errors="coerce")
# Read file of current data
print('read ', args.inCSV, sep='')
dAll = pd.read_csv(args.inCSV)

# The participant ID column is mandatory: it becomes the index used for joins.
if 'eid' not in dAll.columns:
    sys.exit("inCSV must contain a participant ID column under 'eid'")

# The date column is only needed (and only parsed) when incident/prevalent
# columns were requested on the command line.
if args.incident_prevalent:
    if str(args.date_column) not in dAll.columns:
        sys.exit('Date column needs to be a column of inCSV in order to define incident and prevalent disease.')
    # Values that do not match the timestamp format become NaT.
    dAll[args.date_column] = pd.to_datetime(
        dAll[args.date_column],
        format="%Y-%m-%d %H:%M:%S",
        errors="coerce",
    )
    print(dAll[args.date_column].head())

dAll = dAll.set_index('eid')
# Read HES file
print('read and clean ' + args.hesCSV)
dHES = pd.read_csv(args.hesCSV)
# Parse the two date columns explicitly instead of through read_csv's
# `date_parser=` argument, which is deprecated in pandas 2.x. The same
# day/month/year parser is applied, so unparseable values still become NaT.
for dateCol in ('epistart', 'disdate'):
    dHES[dateCol] = hes_format_parser(dHES[dateCol])
# restrict to participants in dAll; .copy() gives an independent frame so that
# later in-place assignments do not hit pandas' SettingWithCopy ambiguity
dHES = dHES[dHES['eid'].isin(dAll.index)].copy()
print(len(dHES), 'len dataframe')
# Load the disease definitions; the context manager closes the file handle
# (the previous open(...).read() left it to the garbage collector).
with open(args.diseaseJSON) as diseaseFile:
    diseaseList = json.load(diseaseFile)
def cleanHESstr(s):
    """Normalise a disease label into a compact token.

    Surrounding whitespace is stripped, then ampersands, apostrophes and
    commas are removed and every remaining space is replaced by a hyphen.
    """
    cleaned = s.strip()
    # Applied in this order; each pair is (text to find, replacement).
    for old, new in (('&', ''), ("'", ''), (' ', '-'), (',', '')):
        cleaned = cleaned.replace(old, new)
    return cleaned
print('Finding participants with: ')
# Episodes with no start date fall back to their discharge date.
dHES.loc[dHES['epistart'].isnull(), 'epistart'] = dHES['disdate']

# Subset of episodes with level == 1, built lazily on first use and then
# reused for every "primary" outcome instead of being rebuilt each iteration.
dHESPrimary = None

# check for history of specific diseases
for outcome in diseaseList:
    outcomeName = cleanHESstr(outcome['disease'])

    # Choose which episodes are searched. An unrecognised level used to leave
    # `e` undefined (NameError) or, worse, silently reuse the previous
    # disease's matches; it is now reported explicitly.
    if outcome['level'] == "all":
        episodes = dHES
    elif outcome['level'] == "primary":
        if dHESPrimary is None:
            dHESPrimary = dHES[dHES['level'] == 1]
        episodes = dHESPrimary
    else:
        sys.exit("Unknown level %r for disease %r in diseaseJSON: expected 'all' or 'primary'"
                 % (outcome['level'], outcome['disease']))

    # An episode matches when either its ICD-10 or its ICD-9 code matches the
    # outcome's pattern; missing codes count as non-matches (na=False).
    matches = (episodes['diag_icd10'].str.contains(outcome['icd10'], na=False)
               | episodes['diag_icd9'].str.contains(outcome['icd9'], na=False))
    e = episodes.loc[matches, ['eid', 'epistart']]

    # Earliest matching episode date per participant, stored under the
    # cleaned disease name.
    outcomePts = e[['epistart']].groupby(e['eid']).min()
    outcomePts.columns = [outcomeName]
    print(outcomeName)
    dAll = dAll.join(outcomePts)

    if args.incident_prevalent:
        firstDate = dAll[outcomeName]
        referenceDate = dAll[args.date_column]
        hasEvent = firstDate.notnull()
        # incident: first episode strictly after the reference date;
        # prevalent: first episode on or before it. Rows with no episode
        # (or an unparseable reference date) stay 0 in both columns.
        dAll[outcomeName + "-incident"] = 0
        dAll.loc[hasEvent & (firstDate > referenceDate), outcomeName + '-incident'] = 1
        dAll[outcomeName + "-prevalent"] = 0
        dAll.loc[hasEvent & (firstDate <= referenceDate), outcomeName + '-prevalent'] = 1

    # Number of participants with at least one matching episode.
    print(outcomeName, ', n = ', len(dAll[dAll[outcomeName].notnull()]))
# Write final output file...
# dAll is indexed by 'eid', so the participant ID is written as the first column.
print('write final cleaned file to ', args.outCSV, sep='')
dAll.to_csv(args.outCSV)
print('finished')