From 0a7b98f69dd224ebe58e448b08bcc9d5eddfd0e7 Mon Sep 17 00:00:00 2001
From: vladd-bit
Date: Wed, 24 Jul 2024 20:37:32 +0100
Subject: [PATCH 1/4] Scripts: install utils script update for Ubuntu > 22.04.

---
 scripts/installation_utils/install_docker_and_utils.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/installation_utils/install_docker_and_utils.sh b/scripts/installation_utils/install_docker_and_utils.sh
index 23b005c4..e8df1dd8 100644
--- a/scripts/installation_utils/install_docker_and_utils.sh
+++ b/scripts/installation_utils/install_docker_and_utils.sh
@@ -16,7 +16,7 @@ then
     sudo apt-get install -y htop iotop sysstat
     sudo apt-get install -y --no-install-recommends libreoffice-core libreoffice-writer
 
-    sudo apt-get install -y jq wget curl gnupg-agent git ca-certificates apt-transport-https python3 python3-pip libssl-dev zip unzip tar nano gcc make python3-dev build-essential software-properties-common
+    sudo apt-get install -y jq wget curl gnupg-agent git ca-certificates apt-transport-https python3 python3-pip python3-full libssl-dev zip unzip tar nano gcc make python3-dev build-essential software-properties-common
 
     sudo add-apt-repository -y "deb [arch=amd64] https://download.docker.com/linux/$os_distribution $(lsb_release -cs) stable"
 

From ffea6e8765f63a989ba5e7a8cc94275af9c1440b Mon Sep 17 00:00:00 2001
From: vladd-bit
Date: Wed, 24 Jul 2024 20:42:17 +0100
Subject: [PATCH 2/4] NiFi scripts: added big ann file gen util.

---
 .../tests/generate_big_ann_file.py | 27 +++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 nifi/user-scripts/tests/generate_big_ann_file.py

diff --git a/nifi/user-scripts/tests/generate_big_ann_file.py b/nifi/user-scripts/tests/generate_big_ann_file.py
new file mode 100644
index 00000000..b3dcf807
--- /dev/null
+++ b/nifi/user-scripts/tests/generate_big_ann_file.py
@@ -0,0 +1,27 @@
+import glob
+import json
+
+f_path = "../../../data/cogstack-cohort/medical_reports_anns_medcat_medmen__*.json"
+
+
+def chunk(input_list: list, num_slices: int):
+    for i in range(0, len(input_list), num_slices):
+        yield input_list[i:i + num_slices]
+
+
+# load the first annotation file matching the glob pattern
+with open(glob.glob(f_path)[0], mode="r") as f:
+    contents = json.loads(f.read())
+
+add_records = 400000
+
+# repeat the first annotation to inflate the file for load testing
+first_annotation = contents[0]
+
+for i in range(add_records):
+    contents.append(first_annotation)
+
+export_path = "../../../data/medical_reports_anns_medcat_medmen__test_big.json"
+
+with open(export_path, mode="w+") as f:
+    f.write(json.dumps(contents))

From 435f84da916463ba3a25a72a2fc59c3553cbc0e2 Mon Sep 17 00:00:00 2001
From: Git bot
Date: Wed, 24 Jul 2024 19:44:07 +0000
Subject: [PATCH 3/4] Auto updated submodule references

---
 services/jupyter-hub/notebooks/working_with_cogstack | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/jupyter-hub/notebooks/working_with_cogstack b/services/jupyter-hub/notebooks/working_with_cogstack
index 1740e274..581b838d 160000
--- a/services/jupyter-hub/notebooks/working_with_cogstack
+++ b/services/jupyter-hub/notebooks/working_with_cogstack
@@ -1 +1 @@
-Subproject commit 1740e274342c1ac2424cb9300ab55f126ee702bc
+Subproject commit 581b838d9ea3bf7a67513af2a1d942b030b88c26

From bcd1f7e240d723252d4e07bbdc6d490d7196ece9 Mon Sep 17 00:00:00 2001
From: vladd-bit
Date: Thu, 25 Jul 2024 11:15:57 +0100
Subject: [PATCH 4/4] NiFi scripts: cohort export possible annotation patient count fix.
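
Per-chunk results were previously merged into cui2ptt_pos /
cui2ptt_tsp with dict.update(), which replaces the whole per-patient
dictionary of a CUI that appears in more than one chunk, and the
JSONL outputs were appended once per input annotation file, so
patient counts could be lost or duplicated across files. Results are
now merged per (cui, patient_id) pair and written out once, after
all input files have been processed.

Intended merge semantics, sketched with made-up values (merge_pos is
shorthand for the new accumulation loop, not a function in this
patch):

    merge_pos({"C001": {"p1": 2}}, {"C001": {"p1": 3, "p2": 1}})
    # counts are summed per patient -> {"C001": {"p1": 5, "p2": 1}}
    # timestamps keep the earliest mention per patient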
---
 .../cogstack_cohort_generate_data.py          | 91 +++++++++++++++-----
 1 file changed, 69 insertions(+), 22 deletions(-)

diff --git a/nifi/user-scripts/cogstack_cohort_generate_data.py b/nifi/user-scripts/cogstack_cohort_generate_data.py
index 658896ca..68678b6e 100644
--- a/nifi/user-scripts/cogstack_cohort_generate_data.py
+++ b/nifi/user-scripts/cogstack_cohort_generate_data.py
@@ -261,18 +261,35 @@ def multiprocess_annotation_records(input_annotations: dict):
         else:
             record_chunks = input_annotations
 
-        counter = 0
         for record_chunk in record_chunks:
             rec_que.put(record_chunk)
             annotation_process_pool_results.append(annotations_process_pool.starmap_async(_process_annotation_records, [(rec_que.get(),)], error_callback=logging.error))
-            counter += 1
 
-            for result in annotation_process_pool_results:
-                result_data = result.get(timeout=TIMEOUT)
-
-                _cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
-                cui2ptt_pos.update(_cui2ptt_pos)
-                cui2ptt_tsp.update(_cui2ptt_tsp)
+        for result in annotation_process_pool_results:
+            result_data = result.get(timeout=TIMEOUT)
+
+            _cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
+
+            for cui, patient_id_count_vals in _cui2ptt_pos.items():
+                if cui not in cui2ptt_pos:
+                    cui2ptt_pos[cui] = patient_id_count_vals
+                else:
+                    for patient_id, count in patient_id_count_vals.items():
+                        if patient_id not in cui2ptt_pos[cui]:
+                            cui2ptt_pos[cui][patient_id] = count
+                        else:
+                            cui2ptt_pos[cui][patient_id] += count
+
+            for cui, patient_id_timestamps in _cui2ptt_tsp.items():
+                if cui not in cui2ptt_tsp:
+                    cui2ptt_tsp[cui] = patient_id_timestamps
+                else:
+                    for patient_id, timestamp in patient_id_timestamps.items():
+                        if patient_id not in cui2ptt_tsp[cui]:
+                            cui2ptt_tsp[cui][patient_id] = timestamp
+                        else:
+                            # keep the earliest (first) mention of the term
+                            cui2ptt_tsp[cui][patient_id] = min(timestamp, cui2ptt_tsp[cui][patient_id])
 
     except Exception as exception:
         time = datetime.now()
@@ -340,6 +357,14 @@ def multiprocess_annotation_records(input_annotations: dict):
 global_doc2ptt = json.loads(global_doc2ptt)
 
 if INPUT_ANNOTATIONS_RECORDS_FILE_NAME_PATTERN:
+
+    # cui2ptt_pos.jsonl: each line maps a CUI to a dictionary of patients with a count, {<cui>: {<patient_id>: <count>, ...}}\n...
+    cui2ptt_pos = defaultdict(Counter)  # stores the count of a SNOMED term per patient
+
+    # cui2ptt_tsp.jsonl: each line maps a CUI to a dictionary of patients with a timestamp, {<cui>: {<patient_id>: <timestamp>, ...}}\n...
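+    # (per-file results are merged below: counts are summed per patient, timestamps keep the earliest mention)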
+    cui2ptt_tsp = defaultdict(lambda: defaultdict(int))  # stores the first mention timestamp of a SNOMED term per patient
+
     # read each of the patient record files one by one
     for root, sub_directories, files in os.walk(INPUT_FOLDER_PATH):
         for file_name in files:
@@ -351,21 +376,43 @@
             with open(f_path, mode="r+") as f:
                 contents = json.loads(f.read())
 
-            cui2ptt_pos, cui2ptt_tsp = multiprocess_annotation_records(contents)
-            with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
-                for k, v in cui2ptt_pos.items():
-                    o = {k: v}
-                    json_obj = json.loads(json.dumps(o))
-                    json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
-                    print('', file=outfile)
-
-            with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
-                for k, v in cui2ptt_tsp.items():
-                    o = {k: v}
-                    json_obj = json.loads(json.dumps(o))
-                    json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
-                    print('', file=outfile)
+            _cui2ptt_pos, _cui2ptt_tsp = multiprocess_annotation_records(contents)
 
             with open(log_file_path, "a+") as log_file:
                 time = datetime.now()
                 log_file.write("\n" + str(time) + ": processed file " + str(file_name))
+
+            for cui, patient_id_count_vals in _cui2ptt_pos.items():
+                if cui not in cui2ptt_pos:
+                    cui2ptt_pos[cui] = patient_id_count_vals
+                else:
+                    for patient_id, count in patient_id_count_vals.items():
+                        if patient_id not in cui2ptt_pos[cui]:
+                            cui2ptt_pos[cui][patient_id] = count
+                        else:
+                            cui2ptt_pos[cui][patient_id] += count
+
+            for cui, patient_id_timestamps in _cui2ptt_tsp.items():
+                if cui not in cui2ptt_tsp:
+                    cui2ptt_tsp[cui] = patient_id_timestamps
+                else:
+                    for patient_id, timestamp in patient_id_timestamps.items():
+                        if patient_id not in cui2ptt_tsp[cui]:
+                            cui2ptt_tsp[cui][patient_id] = timestamp
+                        else:
+                            # keep the earliest (first) mention of the term
+                            cui2ptt_tsp[cui][patient_id] = min(timestamp, cui2ptt_tsp[cui][patient_id])
+
+    with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
+        for k, v in cui2ptt_pos.items():
+            o = {k: v}
+            json_obj = json.loads(json.dumps(o))
+            json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
+            print('', file=outfile)
+
+    with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
+        for k, v in cui2ptt_tsp.items():
+            o = {k: v}
+            json_obj = json.loads(json.dumps(o))
+            json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
+            print('', file=outfile)
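
Note on the merge logic in PATCH 4/4: it can be exercised outside NiFi
with a small standalone check. This is a minimal sketch under the
assumption that per-file results are plain dicts shaped like
{<cui>: {<patient_id>: <count-or-timestamp>, ...}}; merge_results is a
hypothetical helper written for this illustration, not a function in
cogstack_cohort_generate_data.py.

    from collections import Counter, defaultdict

    def merge_results(per_file_results):
        # counts are summed per (cui, patient_id); timestamps keep the earliest mention
        cui2ptt_pos = defaultdict(Counter)
        cui2ptt_tsp = defaultdict(dict)
        for pos, tsp in per_file_results:
            for cui, patient_counts in pos.items():
                cui2ptt_pos[cui].update(patient_counts)  # Counter.update() sums values
            for cui, patient_timestamps in tsp.items():
                for patient_id, ts in patient_timestamps.items():
                    prev = cui2ptt_tsp[cui].get(patient_id)
                    cui2ptt_tsp[cui][patient_id] = ts if prev is None else min(prev, ts)
        return cui2ptt_pos, cui2ptt_tsp

    pos, tsp = merge_results([
        ({"C001": {"p1": 2}}, {"C001": {"p1": 1700000000}}),
        ({"C001": {"p1": 3, "p2": 1}}, {"C001": {"p1": 1690000000}}),
    ])
    assert pos["C001"] == Counter({"p1": 5, "p2": 1})
    assert tsp["C001"]["p1"] == 1690000000  # earliest mention wins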