diff --git a/.gitignore b/.gitignore index d7fd1d8..578b833 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ json.zip **/*.pyc /.venv/ +/config.txt.tmp_local diff --git a/build_dictionary.py b/build_dictionary.py index c766d1e..7b48ce8 100755 --- a/build_dictionary.py +++ b/build_dictionary.py @@ -91,23 +91,10 @@ def createRootEntry(self, type): res["flags"] = flags return res - def init(self, settings): - global globalSymbolsFile - global icd9File - global pntFile - global ccs_diag_file - global ccs_proc_file - global productFile - global packageFile - globalSymbolsFile = settings['filename'] - icd9File = settings['icd9'] - pntFile = settings['pnt'] - ccs_diag_file = settings['ccs_diag'] - ccs_proc_file = settings['ccs_proc'] - productFile = settings['ndc_prod'] - packageFile = settings['ndc_package'] + def init(self, settings, settingsFile): for k in self._baseTypes.keys(): - self._codeTables[k] = self._baseTypes[k].init() + self._codeTables[k] = self._baseTypes[k].init(settings) + util.save_config(settings, settingsFile) dictionary = EntryCreator() @@ -124,10 +111,10 @@ def flags(self): return {} def addCodeType(self, code, codeType): self._codeTypes[code] = codeType - def init(self): + def init(self, settings): res = {} for code in self._codeTypes.keys(): - res[code] = self._codeTypes[code].init() + res[code] = self._codeTypes[code].init(settings) return res def create(self, symbols, type, id, code): candidate = None @@ -161,7 +148,7 @@ def create(self, symbols, type, id, code): return (candidate, code) class TypeCode(object): - def init(self): + def init(self, settings): raise NotImplementedError() def create(self, symbols, type, id): raise NotImplementedError @@ -183,9 +170,9 @@ def create(self, symbols, type, id): if len(id) == 2: return createUnknownEntry(symbols, type, id, pid, code=self.code) return toEntry(id, pid, id, "Provider Number: {0}".format(id)) - def init(self): + def init(self, settings): res = {} - file = util.get_file(pntFile, debugOutput) + file = get_file(settings, 'pnt', 'code/pnt/pnt.txt') if not os.path.isfile(file): return res with open(file, 'r') as pnt: @@ -213,9 +200,29 @@ class CmsPhysicianCode(TypeCode): def create(self, symbols, type, id): pid = "" return createUnknownEntry(symbols, type, id, pid, code=self.code) - def init(self): + def init(self, settings): return {} +@dictionary.codeType("physician", "ibc") +class CmsPhysicianCode(TypeCode): + def create(self, symbols, type, id): + pid = "" + if id in symbols: + return toEntry(id, pid, symbols[id], symbols[id]) + return createUnknownEntry(symbols, type, id, pid, code=self.code) + def init(self, settings): + res = {} + spec_file = get_file(settings, 'ibc_speciality', '/m/CODES/specialty/specialty_headers.txt') + if os.path.isfile(spec_file): + with open(spec_file, 'r') as file: + for line in file: + l = line.strip() + spl = l.split('#', 1) + if len(spl) < 2: + continue + res[spl[0]] = spl[1] + return res + ### prescribed ### @dictionary.baseType("prescribed") class TypePrescribed(TypeBase): @@ -232,9 +239,9 @@ def create(self, symbols, type, id): l = symbols[id] return toEntry(id, pid, l["nonp"], l["nonp"]+" ["+l["desc"]+"] ("+l["prop"]+") "+l["subst"]+" - "+l["pharm"]+" - "+l["pType"], l["alias"] if "alias" in l else None) return createUnknownEntry(symbols, type, id, pid, code=self.code) - def init(self): + def init(self, settings): prescribeLookup = {} - fileA = util.get_file(productFile, debugOutput) + fileA = get_file(settings, 'ndc_prod', 'code/ndc/product.txt') if not os.path.isfile(fileA): return prescribeLookup uidLookup = {} @@ -287,7 +294,7 @@ def init(self): prescribeLookup[ndc] = obj prescribeLookup[normndc] = obj prescribeLookup[fullndc] = obj - fileB = util.get_file(packageFile, debugOutput) + fileB = get_file(settings, 'ndc_package', 'code/ndc/package.txt') if not os.path.isfile(fileB): return prescribeLookup with open(fileB, 'r') as paFile: @@ -356,8 +363,18 @@ def create(self, symbols, type, id): if id in symbols: return toEntry(id, pid, symbols[id], symbols[id]) return createUnknownEntry(symbols, type, id, pid, code=self.code) - def init(self): - return getGlobalSymbols() + def init(self, settings): + res = getGlobalSymbols(settings) + loinc_file = get_file(settings, 'loinc', '/m/CODES/loinc/loinc_file.all.headers') + if os.path.isfile(loinc_file): + with open(loinc_file, 'r') as file: + for line in file: + l = line.strip() + spl = l.split('#', 1) + if len(spl) < 2: + continue + res[spl[0]] = spl[1] + return res ### diagnosis ### @dictionary.baseType("diagnosis") @@ -380,10 +397,10 @@ def create(self, symbols, type, id): return toEntry(id, pid, symbols[prox_id], symbols[prox_id], id.replace(".", "") if "HIERARCHY" not in id else None) prox_id = prox_id[:-1] return createUnknownEntry(symbols, type, id, pid, code=self.code) - def init(self): - codes = getGlobalSymbols() - codes.update(getICD9()) - self._parents = readCCS(util.get_file(ccs_diag_file, debugOutput), codes) + def init(self, settings): + codes = getGlobalSymbols(settings) + codes.update(getICD9(settings, True)) + self._parents = readCCS(get_file(settings, 'ccs_diag', 'code/ccs/multi_diag.txt'), codes) return codes ### procedure ### @@ -407,10 +424,10 @@ def create(self, symbols, type, id): return toEntry(id, pid, symbols[prox_id], symbols[prox_id], id.replace(".", "") if "HIERARCHY" not in id else None) prox_id = prox_id[:-1] return createUnknownEntry(symbols, type, id, pid) - def init(self): - codes = getGlobalSymbols() - codes.update(getICD9()) - self._parents = readCCS(util.get_file(ccs_proc_file, debugOutput), codes) + def init(self, settings): + codes = getGlobalSymbols(settings) + codes.update(getICD9(settings, False)) + self._parents = readCCS(get_file(settings, 'ccs_proc', 'code/ccs/multi_proc.txt'), codes) return codes ### info ### @@ -426,7 +443,7 @@ class InfoInfoCode(TypeCode): def create(self, symbols, type, id): pid = "" return toEntry(id, pid, id, "Info: " + id) - def init(self): + def init(self, settings): return {} @@ -439,7 +456,7 @@ def name(self): return UNKNOWN def color(self): return "red" - def init(self): + def init(self, settings): raise NotImplementedError() def create(self, symbols, type, id, code): return createUnknownEntry(symbols, type, id, code=code) @@ -468,17 +485,37 @@ def toEntry(id, pid, name, desc, alias=None): ### icd9 ### -globalICD9 = {} +globalICD9 = { + 'diagnosis': {}, + 'procedure': {} +} -def getICD9(): - global globalICD9 - if not len(globalICD9.keys()): - globalICD9 = initICD9() - return globalICD9.copy() +def getICD9(settings, isDiagnosis): + k = 'diagnosis' if isDiagnosis else 'procedure' + if not len(globalICD9[k].keys()): + fileKeyS = k + '_icd9' + fileKeyL = k + '_icd9_long' + fileDefaultS = '/m/CODES/icd9/ICD-9-CM-v32-master-descriptions/' + ('CMS32_DESC_SHORT_DX.txt' if isDiagnosis else 'CMS32_DESC_SHORT_SG.txt') + fileDefaultL = '/m/CODES/icd9/ICD-9-CM-v32-master-descriptions/' + ('CMS32_DESC_LONG_DX.txt' if isDiagnosis else 'CMS32_DESC_LONG_SG.txt') + fileS = get_file(settings, fileKeyS, fileDefaultS) + fileL = get_file(settings, fileKeyL, fileDefaultL) + if not os.path.isfile(fileS) and not os.path.isfile(fileL): + globalICD9[k] = initICD9(settings) + else: + symbols = globalICD9[k] + f = fileS if not os.path.isfile(fileL) else fileL + with open(f, 'r') as file: + for line in file: + l = line.strip() + spl = l.split(' ', 1) + if len(spl) < 2: + continue + symbols[spl[0].strip()] = spl[1].strip() + return globalICD9[k].copy() -def initICD9(): +def initICD9(settings): codes = {} - f = util.get_file(icd9File, debugOutput) + f = get_file(settings, 'icd9', 'code/icd9/ucod.txt') if not os.path.isfile(f): return codes with open(f, 'r') as file: @@ -535,15 +572,15 @@ def readCCS(ccsFile, codes): globalSymbols = {} -def getGlobalSymbols(): +def getGlobalSymbols(settings): global globalSymbols if not len(globalSymbols.keys()): - globalSymbols = initGlobalSymbols() + globalSymbols = initGlobalSymbols(settings) return globalSymbols.copy() -def initGlobalSymbols(): +def initGlobalSymbols(settings): codes_dict = {} - f = util.get_file(globalSymbolsFile, debugOutput) + f = get_file(settings, 'filename', 'code/code_names.txt') if not os.path.isfile(f): return codes_dict with open(f, 'r') as file: @@ -581,19 +618,18 @@ def enrichDict(file, mid): with util.OutWrapper(file) as out: print(json.dumps(dict, indent=2, sort_keys=True), file=out) -def init(settings): - dictionary.init(settings) +def init(settings, settingsFile): + dictionary.init(settings, settingsFile) ### argument API -icd9File = 'code/icd9/ucod.txt' -ccs_diag_file = 'code/ccs/multi_diag.txt' -ccs_proc_file = 'code/ccs/multi_proc.txt' -productFile = 'code/ndc/product.txt' -packageFile = 'code/ndc/package.txt' -pntFile = 'code/pnt/pnt.txt' -globalSymbolsFile = 'code/code_names.txt' -globalMid = '2507387001' +def get_file(settings, key, default): + if key in settings: + file = settings[key] + else: + file = default + settings[key] = file + return util.get_file(file, debugOutput) def usage(): print("{0}: [--debug] -p -c -o [-h|--help] [--lookup ]".format(sys.argv[0]), file=sys.stderr) @@ -605,21 +641,12 @@ def usage(): print("-h|--help: prints this help.", file=sys.stderr) sys.exit(1) -defaultSettings = { - 'filename': globalSymbolsFile, - 'ndc_prod': productFile, - 'ndc_package': packageFile, - 'icd9': icd9File, - 'pnt': pntFile, - 'ccs_diag': ccs_diag_file, - 'ccs_proc': ccs_proc_file, -} - def interpretArgs(): global debugOutput - settings = defaultSettings + settings = {} + settingsFile = None info = { - 'mid': globalMid, + 'mid': '-', 'output': '-' } lookupMode = False @@ -638,7 +665,8 @@ def interpretArgs(): if not args: print('-c requires argument', file=sys.stderr) usage() - util.read_config(settings, args.pop(0), debugOutput) + settingsFile = args.pop(0) + util.read_config(settings, settingsFile, debugOutput) elif val == '-o': if not args: print('-o requires argument', file=sys.stderr) @@ -652,11 +680,11 @@ def interpretArgs(): else: print('illegal argument '+val, file=sys.stderr) usage() - return (settings, info, lookupMode, args) + return (settings, settingsFile, info, lookupMode, args) if __name__ == '__main__': - (settings, info, lookupMode, rest) = interpretArgs() - dictionary.init(settings) + (settings, settingsFile, info, lookupMode, rest) = interpretArgs() + dictionary.init(settings, settingsFile) if lookupMode: dict = {} diff --git a/feature_extraction/cohort.py b/feature_extraction/cohort.py index 77bfa9b..ce2a1ef 100755 --- a/feature_extraction/cohort.py +++ b/feature_extraction/cohort.py @@ -510,7 +510,8 @@ def usage(): if __name__ == '__main__': output = '-' - settings = build_dictionary.defaultSettings + settingsFile = None + settings = {} settings['delim'] = ',' settings['quote'] = '"' query = "" @@ -553,7 +554,8 @@ def usage(): if not args or args[0] == '--': print('-c requires argument', file=sys.stderr) usage() - util.read_config(settings, args.pop(0), build_dictionary.debugOutput) + settingsFile = args.pop(0) + util.read_config(settings, settingsFile, build_dictionary.debugOutput) elif arg == '--debug': build_dictionary.debugOutput = True else: @@ -564,7 +566,7 @@ def usage(): print('query is required', file=sys.stderr) usage() - build_dictionary.init(settings) + build_dictionary.init(settings, settingsFile) allPaths = [] while args: diff --git a/feature_extraction/extract.py b/feature_extraction/extract.py index ee9eb5f..20a8236 100755 --- a/feature_extraction/extract.py +++ b/feature_extraction/extract.py @@ -233,7 +233,8 @@ def usage(): if __name__ == '__main__': output = '-' - settings = build_dictionary.defaultSettings + settingsFile = None + settings = {} settings['delim'] = ',' settings['quote'] = '"' whitelist = None @@ -295,7 +296,8 @@ def usage(): if not args or args[0] == '--': print('-c requires argument', file=sys.stderr) usage() - util.read_config(settings, args.pop(0), build_dictionary.debugOutput) + settingsFile = args.pop(0) + util.read_config(settings, settingsFile, build_dictionary.debugOutput) elif arg == '--debug': build_dictionary.debugOutput = True else: @@ -303,7 +305,7 @@ def usage(): usage() build_dictionary.reportMissingEntries = False - build_dictionary.init(settings) + build_dictionary.init(settings, settingsFile) allPaths = [] while args: diff --git a/setup.sh b/setup.sh index 561ee83..0a96d88 100755 --- a/setup.sh +++ b/setup.sh @@ -291,6 +291,7 @@ pip_install() { source ${venv_activate} test_fail $? echo "install python packages" + pip install --upgrade pip pip install -r requirements.txt test_fail $? } diff --git a/test/etc/.gitignore b/test/etc/.gitignore index 3362443..0471319 100644 --- a/test/etc/.gitignore +++ b/test/etc/.gitignore @@ -1 +1,2 @@ /cohort.txt +/config.txt.tmp diff --git a/test/etc/config.txt b/test/etc/config.txt index 107dbca..fde41cf 100644 --- a/test/etc/config.txt +++ b/test/etc/config.txt @@ -1,11 +1,17 @@ { + "ccs_diag": "../code/ccs/multi_diag.txt", "ccs_proc": "../code/ccs/multi_proc.txt", - "quote": "\"", "delim": ",", - "pnt": "../code/pnt/pnt.txt", + "diagnosis_icd9": "../code/not_found.txt", + "diagnosis_icd9_long": "../code/not_found.txt", "filename": "../code/code_names.txt", - "ndc_package": "../code/ndc/package.txt", + "ibc_speciality": "../code/code_names.txt", "icd9": "../code/icd9/ucod.txt", + "loinc": "../code/code_names.txt", + "ndc_package": "../code/ndc/package.txt", "ndc_prod": "../code/ndc/product.txt", - "ccs_diag": "../code/ccs/multi_diag.txt" + "pnt": "../code/pnt/pnt.txt", + "procedure_icd9": "../code/not_found.txt", + "procedure_icd9_long": "../code/not_found.txt", + "quote": "\"" } diff --git a/test/local.sh b/test/local.sh index 7a7f9e0..22a287b 100755 --- a/test/local.sh +++ b/test/local.sh @@ -13,7 +13,8 @@ FEATURE_EXTRACT="../feature_extraction" cohort="./etc/cohort.txt" format="../format.json" -config="../config.txt" +config="../config.txt.tmp_local" +config_regr="../config.txt" print() { echo "$@" 2>&1 @@ -36,12 +37,18 @@ check_file() { } print "test: training predictive model" +cp "${config_regr}" "${config}" if [ ! -f "${cohort}" ]; then print "building cohort" ${FEATURE_EXTRACT}/cohort.py --debug --query-file "${FEATURE_EXTRACT}/cases.txt" -f "${format}" -c "${config}" -o "${OUTPUT}/cohort_cases.txt.tmp_local" -- "$CMS_DIR" + check $? + check_file "${config_regr}" "${config}" ${FEATURE_EXTRACT}/cohort.py --debug --query-file "${FEATURE_EXTRACT}/control.txt" -f "${format}" -c "${config}" -o "${OUTPUT}/cohort_control.txt.tmp_local" -- "$CMS_DIR" + check $? + check_file "${config_regr}" "${config}" ${FEATURE_EXTRACT}/merge.py --cases "${OUTPUT}/cohort_cases.txt.tmp_local" --control "${OUTPUT}/cohort_control.txt.tmp_local" -o "${cohort}" --test 30 --seed 0 + check $? else print "use existing cohort at ${cohort}" fi @@ -49,10 +56,11 @@ fi ${FEATURE_EXTRACT}/extract.py --debug -w "${cohort}" --age-time 20100101 --to 20100101 -o - -f "${format}" -c "${config}" -- "$CMS_DIR" | \ ${FEATURE_EXTRACT}/train.py -w --in - --out "${OUTPUT}/model" --seed 0 --model reg -v 20 2> "${OUTPUT}/train.txt.tmp_local" check $? +check_file "${config_regr}" "${config}" check_file "${OUTPUT}/train.txt" "${OUTPUT}/train.txt.tmp_local" -rm -- ${OUTPUT}/*.tmp_local -rm -r -- ${OUTPUT}/model/ +rm -- "${config}" ${OUTPUT}/*.tmp_local +rm -r -- "${OUTPUT}/model/" print "all tests successful!" exec 3>&- # don't really need to close the FD diff --git a/test/run.sh b/test/run.sh index 16a75ca..cc97ade 100755 --- a/test/run.sh +++ b/test/run.sh @@ -15,7 +15,8 @@ FEATURE_EXTRACT="../feature_extraction" format="../format.json" style_classes="../style_classes.json" etc="./etc" -config="${etc}/config.txt" +config="${etc}/config.txt.tmp" +config_regr="${etc}/config.txt" print() { echo "$@" 2>&1 @@ -53,6 +54,7 @@ convert_patient() { ../build_dictionary.py --debug -p "${OUTPUT}/${id}.json.tmp" -c "${config}" -o "${OUTPUT}/dictionary.json.tmp" check $? check_file "${OUTPUT}/dictionary.json" "${OUTPUT}/dictionary.json.tmp" + check_file "${config_regr}" "${config}" } create_predictive_model() { @@ -65,24 +67,30 @@ create_predictive_model() { ${FEATURE_EXTRACT}/cohort.py --debug --query-file "${etc}/cases.txt" -f "${format}" -c "${config}" -o "${OUTPUT}/cohort_cases.txt.tmp" -- "${CMS_DIR}" check $? check_file "${OUTPUT}/cohort_cases.txt" "${OUTPUT}/cohort_cases.txt.tmp" + check_file "${config_regr}" "${config}" ${FEATURE_EXTRACT}/cohort.py --debug --query-file "${etc}/control.txt" -f "${format}" -c "${config}" -o "${OUTPUT}/cohort_control.txt.tmp" -- "${CMS_DIR}" check $? check_file "${OUTPUT}/cohort_control.txt" "${OUTPUT}/cohort_control.txt.tmp" + check_file "${config_regr}" "${config}" ${FEATURE_EXTRACT}/merge.py --cases "${OUTPUT}/cohort_cases.txt.tmp" --control "${OUTPUT}/cohort_control.txt.tmp" -o "${OUTPUT}/cohort.txt.tmp" --test 30 --seed 0 2> $ERR_FILE check $? check_file "${OUTPUT}/cohort.txt" "${OUTPUT}/cohort.txt.tmp" ${FEATURE_EXTRACT}/extract.py --debug -w "${OUTPUT}/cohort.txt.tmp" --num-cutoff 1 --age-time 20100101 --to 20100101 -o "${OUTPUT}/output.csv.tmp" -f "${format}" -c "${config}" -- "${CMS_DIR}" check $? check_file "${OUTPUT}/output.csv" "${OUTPUT}/output.csv.tmp" + check_file "${config_regr}" "${config}" head -n 1 "${OUTPUT}/output.csv.tmp" | sed "s/,/ /g" | ../build_dictionary.py --debug -o "${OUTPUT}/headers.json.tmp" -c "${config}" --lookup - check $? check_file "${OUTPUT}/headers.json" "${OUTPUT}/headers.json.tmp" + check_file "${config_regr}" "${config}" } +cp "${config_regr}" "${config}" + convert_patient "8CDC0C5ACBDFC9CE" create_predictive_model -rm -- ${ERR_FILE} ${OUTPUT}/*.tmp +rm -- "${ERR_FILE}" "${config}" ${OUTPUT}/*.tmp print "all tests successful!" exec 3>&- # don't really need to close the FD diff --git a/util.py b/util.py index 3fd7510..3284c9c 100755 --- a/util.py +++ b/util.py @@ -98,6 +98,8 @@ def get_file(file, debugOutput=False): def read_config(settings, file, debugOutput=False): global _path_correction + if file is None: + return _path_correction = os.path.dirname(os.path.abspath(file)) config = {} if debugOutput: @@ -106,9 +108,39 @@ def read_config(settings, file, debugOutput=False): with open(file, 'r') as input: config = json.loads(input.read()) settings.update(config) - if set(settings.keys()) - set(config.keys()): + save_on_change(settings, config, file) + +def save_config(settings, file): + global _path_correction + if file is None: + return + _path_correction = os.path.dirname(os.path.abspath(file)) + config = {} + if os.path.isfile(file): + with open(file, 'r') as input: + config = json.loads(input.read()) + save_on_change(settings, config, file) + +def save_on_change(local, original, file): + same = True + lk = local.keys() + ok = original.keys() + if len(lk) != len(ok): + same = False + else: + for k in lk: + if k not in original or local[k] != original[k]: + same = False + break + if same: + # small number of keys so it is not bad to iterate twice + for k in ok: + if k not in local or original[k] != local[k]: + same = False + break + if not same: with open(file, 'w') as output: - print(json.dumps(settings, indent=2, sort_keys=True), file=output) + print(json.dumps(local, indent=2, sort_keys=True), file=output) def read_format(file, input_format, usage): if not os.path.isfile(file):