vladd-bit committed Jul 25, 2024
2 parents 849de4a + bcd1f7e commit 3687116
Showing 4 changed files with 91 additions and 24 deletions.
88 changes: 66 additions & 22 deletions nifi/user-scripts/cogstack_cohort_generate_data.py
@@ -268,18 +268,34 @@ def multiprocess_annotation_records(input_annotations: dict):
        else:
            record_chunks = input_annotations

        counter = 0
        for record_chunk in record_chunks:
            rec_que.put(record_chunk)
            annotation_process_pool_results.append(annotations_process_pool.starmap_async(_process_annotation_records, [(rec_que.get(),)], error_callback=logging.error))
            counter += 1

        for result in annotation_process_pool_results:
            result_data = result.get(timeout=TIMEOUT)

            _cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
            cui2ptt_pos.update(_cui2ptt_pos)
            cui2ptt_tsp.update(_cui2ptt_tsp)
        for result in annotation_process_pool_results:
            result_data = result.get(timeout=TIMEOUT)

            _cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]

            for cui, patient_id_count_vals in _cui2ptt_pos.items():
                if cui not in cui2ptt_pos.keys():
                    cui2ptt_pos[cui] = patient_id_count_vals
                else:
                    for patient_id, count in patient_id_count_vals.items():
                        if patient_id not in cui2ptt_pos[cui].keys():
                            cui2ptt_pos[cui][patient_id] = count
                        else:
                            cui2ptt_pos[cui][patient_id] += count

            for cui, patient_id_timestamps in _cui2ptt_tsp.items():
                if cui not in cui2ptt_tsp.keys():
                    cui2ptt_tsp[cui] = patient_id_timestamps
                else:
                    for patient_id, timestamp in patient_id_timestamps.items():
                        if patient_id not in cui2ptt_tsp[cui].keys():
                            cui2ptt_tsp[cui][patient_id] = timestamp
                        else:
                            cui2ptt_tsp[cui][patient_id] = timestamp

    except Exception as exception:
        time = datetime.now()
@@ -347,6 +363,13 @@ def multiprocess_annotation_records(input_annotations: dict):
global_doc2ptt = json.loads(global_doc2ptt)

if INPUT_ANNOTATIONS_RECORDS_FILE_NAME_PATTERN:

    # cui2ptt_pos.jsonl each line is a dictionary of cui and the value is a dictionary of patients with a count {<cui>: {<patient_id>:<count>, ...}}\n...
    cui2ptt_pos = defaultdict(Counter)  # store the count of a SNOMED term for a patient

    # cui2ptt_tsp.jsonl each line is a dictionary of cui and the value is a dictionary of patients with a timestamp {<cui>: {<patient_id>:<tsp>, ...}}\n...
    cui2ptt_tsp = defaultdict(lambda: defaultdict(int))  # store the first mention timestamp of a SNOMED term for a patient

    # read each of the patient record files one by one
    for root, sub_directories, files in os.walk(INPUT_FOLDER_PATH):
        for file_name in files:
@@ -358,21 +381,42 @@ def multiprocess_annotation_records(input_annotations: dict):
                with open(f_path, mode="r+") as f:
                    contents = json.loads(f.read())

                cui2ptt_pos, cui2ptt_tsp = multiprocess_annotation_records(contents)
                with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
                    for k,v in cui2ptt_pos.items():
                        o = {k: v}
                        json_obj = json.loads(json.dumps(o))
                        json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
                        print('', file=outfile)

                with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
                    for k,v in cui2ptt_tsp.items():
                        o = {k: v}
                        json_obj = json.loads(json.dumps(o))
                        json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
                        print('', file=outfile)
                _cui2ptt_pos, _cui2ptt_tsp = multiprocess_annotation_records(contents)

                with open(log_file_path, "a+") as log_file:
                    time = datetime.now()
                    log_file.write("\n" + str(time) + ": processed file " + str(file_name))

                for cui, patient_id_count_vals in _cui2ptt_pos.items():
                    if cui not in cui2ptt_pos.keys():
                        cui2ptt_pos[cui] = patient_id_count_vals
                    else:
                        for patient_id, count in patient_id_count_vals.items():
                            if patient_id not in cui2ptt_pos[cui]:
                                cui2ptt_pos[cui][patient_id] = count
                            else:
                                cui2ptt_pos[cui][patient_id] += count

                for cui, patient_id_timestamps in _cui2ptt_tsp.items():
                    if cui not in cui2ptt_tsp.keys():
                        cui2ptt_tsp[cui] = patient_id_timestamps
                    else:
                        for patient_id, timestamp in patient_id_timestamps.items():
                            if patient_id not in cui2ptt_tsp[cui].keys():
                                cui2ptt_tsp[cui][patient_id] = timestamp
                            else:
                                cui2ptt_tsp[cui][patient_id] = timestamp

    with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
        for k,v in cui2ptt_pos.items():
            o = {k: v}
            json_obj = json.loads(json.dumps(o))
            json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
            print('', file=outfile)

    with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
        for k,v in cui2ptt_tsp.items():
            o = {k: v}
            json_obj = json.loads(json.dumps(o))
            json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
            print('', file=outfile)
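
To make the intent of the new merge loops easier to follow, here is a minimal standalone sketch of the same accumulation step, written against the {<cui>: {<patient_id>: <count>}} and {<cui>: {<patient_id>: <timestamp>}} shapes documented in the comments above. The helper name, the sample CUI and the patient ids are illustrative only, and keeping the earliest timestamp via setdefault follows the "first mention timestamp" comment rather than copying the committed loops verbatim:

from collections import Counter, defaultdict

def merge_chunk_result(cui2ptt_pos, cui2ptt_tsp, chunk_pos, chunk_tsp):
    # add the per-chunk mention counts onto the global per-patient counters
    for cui, patient_counts in chunk_pos.items():
        cui2ptt_pos.setdefault(cui, Counter()).update(patient_counts)
    # keep the first timestamp seen for each (cui, patient_id) pair
    for cui, patient_timestamps in chunk_tsp.items():
        target = cui2ptt_tsp.setdefault(cui, {})
        for patient_id, timestamp in patient_timestamps.items():
            target.setdefault(patient_id, timestamp)

cui2ptt_pos, cui2ptt_tsp = defaultdict(Counter), defaultdict(dict)
merge_chunk_result(cui2ptt_pos, cui2ptt_tsp,
                   {"C0011849": {"p1": 2}}, {"C0011849": {"p1": 1721865600}})
merge_chunk_result(cui2ptt_pos, cui2ptt_tsp,
                   {"C0011849": {"p1": 1, "p2": 3}}, {"C0011849": {"p2": 1721952000}})
# cui2ptt_pos["C0011849"] == {"p1": 3, "p2": 3}
# the corresponding cui2ptt_pos.jsonl line would be: {"C0011849":{"p1":3,"p2":3}}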
23 changes: 23 additions & 0 deletions nifi/user-scripts/tests/generate_big_ann_file.py
@@ -0,0 +1,23 @@
import glob
import json

f_path = "../../../data/cogstack-cohort/medical_reports_anns_medcat_medmen__*.json"


def chunk(input_list: list, num_slices: int):
    for i in range(0, len(input_list), num_slices):
        yield input_list[i:i + num_slices]


# assumed loading step: the committed script leaves `contents` unset before
# indexing it, so resolve the wildcard above and read the first matching export
contents = None
with open(glob.glob(f_path)[0], mode="r") as f:
    contents = json.load(f)

add_records = 400000

# duplicate the first annotation record to inflate the file for load testing
first_annotation = contents[0]

for i in range(add_records):
    contents.append(first_annotation)

export_path = "../../../data/medical_reports_anns_medcat_medmen__test_big.json"

with open(export_path, mode="w+") as f:
    f.write(json.dumps(contents))
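
The chunk generator defined in this test script is not called here; it yields consecutive fixed-size slices of a list, and despite its name the num_slices parameter is the length of each slice rather than the number of slices. A small illustration with made-up values:

def chunk(input_list: list, num_slices: int):
    # same generator as in generate_big_ann_file.py: despite the name,
    # num_slices is the length of each slice, not the number of slices
    for i in range(0, len(input_list), num_slices):
        yield input_list[i:i + num_slices]

records = list(range(10))
for piece in chunk(records, 4):
    print(piece)
# [0, 1, 2, 3]
# [4, 5, 6, 7]
# [8, 9]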
2 changes: 1 addition & 1 deletion scripts/installation_utils/install_docker_and_utils.sh
@@ -16,7 +16,7 @@ then
sudo apt-get install -y htop iotop sysstat

sudo apt-get install -y --no-install-recommends libreoffice-core libreoffice-writer
sudo apt-get install -y jq wget curl gnupg-agent git ca-certificates apt-transport-https python3 python3-pip libssl-dev zip unzip tar nano gcc make python3-dev build-essential software-properties-common
sudo apt-get install -y jq wget curl gnupg-agent git ca-certificates apt-transport-https python3 python3-pip python3-full libssl-dev zip unzip tar nano gcc make python3-dev build-essential software-properties-common

sudo add-apt-repository -y "deb [arch=amd64] https://download.docker.com/linux/$os_distribution $(lsb_release -cs) stable"
