From 0a7b98f69dd224ebe58e448b08bcc9d5eddfd0e7 Mon Sep 17 00:00:00 2001
From: vladd-bit
Date: Wed, 24 Jul 2024 20:37:32 +0100
Subject: [PATCH 1/4] Scripts: install utils script update for Ubuntu > 22.04.

---
 scripts/installation_utils/install_docker_and_utils.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/installation_utils/install_docker_and_utils.sh b/scripts/installation_utils/install_docker_and_utils.sh
index 23b005c4..e8df1dd8 100644
--- a/scripts/installation_utils/install_docker_and_utils.sh
+++ b/scripts/installation_utils/install_docker_and_utils.sh
@@ -16,7 +16,7 @@ then
     sudo apt-get install -y htop iotop sysstat
     sudo apt-get install -y --no-install-recommends libreoffice-core libreoffice-writer
 
-    sudo apt-get install -y jq wget curl gnupg-agent git ca-certificates apt-transport-https python3 python3-pip libssl-dev zip unzip tar nano gcc make python3-dev build-essential software-properties-common
+    sudo apt-get install -y jq wget curl gnupg-agent git ca-certificates apt-transport-https python3 python3-pip python3-full libssl-dev zip unzip tar nano gcc make python3-dev build-essential software-properties-common
 
     sudo add-apt-repository -y "deb [arch=amd64] https://download.docker.com/linux/$os_distribution $(lsb_release -cs) stable"
 

From ffea6e8765f63a989ba5e7a8cc94275af9c1440b Mon Sep 17 00:00:00 2001
From: vladd-bit
Date: Wed, 24 Jul 2024 20:42:17 +0100
Subject: [PATCH 2/4] NiFi scripts: added big ann file gen util.

---
 .../tests/generate_big_ann_file.py | 27 +++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 nifi/user-scripts/tests/generate_big_ann_file.py

diff --git a/nifi/user-scripts/tests/generate_big_ann_file.py b/nifi/user-scripts/tests/generate_big_ann_file.py
new file mode 100644
index 00000000..b3dcf807
--- /dev/null
+++ b/nifi/user-scripts/tests/generate_big_ann_file.py
@@ -0,0 +1,27 @@
+import glob
+import json
+
+f_path = "../../../data/cogstack-cohort/medical_reports_anns_medcat_medmen__*.json"
+
+
+def chunk(input_list: list, num_slices: int):
+    for i in range(0, len(input_list), num_slices):
+        yield input_list[i:i + num_slices]
+
+
+# load the first annotation file matching the glob pattern
+with open(glob.glob(f_path)[0], mode="r") as f:
+    contents = json.loads(f.read())
+
+add_records = 400000
+
+# repeat the first annotation to inflate the file for load testing
+first_annotation = contents[0]
+
+for i in range(add_records):
+    contents.append(first_annotation)
+
+export_path = "../../../data/medical_reports_anns_medcat_medmen__test_big.json"
+
+with open(export_path, mode="w+") as f:
+    f.write(json.dumps(contents))

From 435f84da916463ba3a25a72a2fc59c3553cbc0e2 Mon Sep 17 00:00:00 2001
From: Git bot
Date: Wed, 24 Jul 2024 19:44:07 +0000
Subject: [PATCH 3/4] Auto updated submodule references

---
 services/jupyter-hub/notebooks/working_with_cogstack | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/jupyter-hub/notebooks/working_with_cogstack b/services/jupyter-hub/notebooks/working_with_cogstack
index 1740e274..581b838d 160000
--- a/services/jupyter-hub/notebooks/working_with_cogstack
+++ b/services/jupyter-hub/notebooks/working_with_cogstack
@@ -1 +1 @@
-Subproject commit 1740e274342c1ac2424cb9300ab55f126ee702bc
+Subproject commit 581b838d9ea3bf7a67513af2a1d942b030b88c26

From bcd1f7e240d723252d4e07bbdc6d490d7196ece9 Mon Sep 17 00:00:00 2001
From: vladd-bit
Date: Thu, 25 Jul 2024 11:15:57 +0100
Subject: [PATCH 4/4] NiFi scripts: cohort export possible annotation patient count fix.
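
Per-chunk results were previously merged into cui2ptt_pos /
cui2ptt_tsp with dict.update(), which replaces the whole per-patient
dictionary of a CUI that appears in more than one chunk, and the
JSONL outputs were appended once per input annotation file, so
patient counts could be lost or duplicated across files. Results are
now merged per (cui, patient_id) pair and written out once, after
all input files have been processed.

Intended merge semantics, sketched with made-up values (merge_pos is
shorthand for the new accumulation loop, not a function in this
patch):

    merge_pos({"C001": {"p1": 2}}, {"C001": {"p1": 3, "p2": 1}})
    # counts are summed per patient -> {"C001": {"p1": 5, "p2": 1}}
    # timestamps keep the earliest mention per patient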
---
 .../cogstack_cohort_generate_data.py          | 91 +++++++++++++++-----
 1 file changed, 69 insertions(+), 22 deletions(-)

diff --git a/nifi/user-scripts/cogstack_cohort_generate_data.py b/nifi/user-scripts/cogstack_cohort_generate_data.py
index 658896ca..68678b6e 100644
--- a/nifi/user-scripts/cogstack_cohort_generate_data.py
+++ b/nifi/user-scripts/cogstack_cohort_generate_data.py
@@ -261,18 +261,35 @@ def multiprocess_annotation_records(input_annotations: dict):
         else:
             record_chunks = input_annotations
 
-        counter = 0
         for record_chunk in record_chunks:
             rec_que.put(record_chunk)
             annotation_process_pool_results.append(annotations_process_pool.starmap_async(_process_annotation_records, [(rec_que.get(),)], error_callback=logging.error))
-            counter += 1
 
-            for result in annotation_process_pool_results:
-                result_data = result.get(timeout=TIMEOUT)
-
-                _cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
-                cui2ptt_pos.update(_cui2ptt_pos)
-                cui2ptt_tsp.update(_cui2ptt_tsp)
+        for result in annotation_process_pool_results:
+            result_data = result.get(timeout=TIMEOUT)
+
+            _cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
+
+            for cui, patient_id_count_vals in _cui2ptt_pos.items():
+                if cui not in cui2ptt_pos:
+                    cui2ptt_pos[cui] = patient_id_count_vals
+                else:
+                    for patient_id, count in patient_id_count_vals.items():
+                        if patient_id not in cui2ptt_pos[cui]:
+                            cui2ptt_pos[cui][patient_id] = count
+                        else:
+                            cui2ptt_pos[cui][patient_id] += count
+
+            for cui, patient_id_timestamps in _cui2ptt_tsp.items():
+                if cui not in cui2ptt_tsp:
+                    cui2ptt_tsp[cui] = patient_id_timestamps
+                else:
+                    for patient_id, timestamp in patient_id_timestamps.items():
+                        if patient_id not in cui2ptt_tsp[cui]:
+                            cui2ptt_tsp[cui][patient_id] = timestamp
+                        else:
+                            # keep the earliest (first) mention of the term
+                            cui2ptt_tsp[cui][patient_id] = min(timestamp, cui2ptt_tsp[cui][patient_id])
 
     except Exception as exception:
         time = datetime.now()
@@ -340,6 +357,14 @@ def multiprocess_annotation_records(input_annotations: dict):
 global_doc2ptt = json.loads(global_doc2ptt)
 
 if INPUT_ANNOTATIONS_RECORDS_FILE_NAME_PATTERN:
+
+    # cui2ptt_pos.jsonl: each line maps a CUI to a dictionary of patients with a count, {<cui>: {<patient_id>: <count>, ...}}\n...
+    cui2ptt_pos = defaultdict(Counter)  # stores the count of a SNOMED term per patient
+
+    # cui2ptt_tsp.jsonl: each line maps a CUI to a dictionary of patients with a timestamp, {<cui>: {<patient_id>: <timestamp>, ...}}\n...
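+    # (per-file results are merged below: counts are summed per patient, timestamps keep the earliest mention)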
+    cui2ptt_tsp = defaultdict(lambda: defaultdict(int))  # stores the first mention timestamp of a SNOMED term per patient
+
     # read each of the patient record files one by one
     for root, sub_directories, files in os.walk(INPUT_FOLDER_PATH):
         for file_name in files:
@@ -351,21 +376,43 @@
             with open(f_path, mode="r+") as f:
                 contents = json.loads(f.read())
 
-            cui2ptt_pos, cui2ptt_tsp = multiprocess_annotation_records(contents)
-            with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
-                for k, v in cui2ptt_pos.items():
-                    o = {k: v}
-                    json_obj = json.loads(json.dumps(o))
-                    json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
-                    print('', file=outfile)
-
-            with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
-                for k, v in cui2ptt_tsp.items():
-                    o = {k: v}
-                    json_obj = json.loads(json.dumps(o))
-                    json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
-                    print('', file=outfile)
+            _cui2ptt_pos, _cui2ptt_tsp = multiprocess_annotation_records(contents)
 
             with open(log_file_path, "a+") as log_file:
                 time = datetime.now()
                 log_file.write("\n" + str(time) + ": processed file " + str(file_name))
+
+            for cui, patient_id_count_vals in _cui2ptt_pos.items():
+                if cui not in cui2ptt_pos:
+                    cui2ptt_pos[cui] = patient_id_count_vals
+                else:
+                    for patient_id, count in patient_id_count_vals.items():
+                        if patient_id not in cui2ptt_pos[cui]:
+                            cui2ptt_pos[cui][patient_id] = count
+                        else:
+                            cui2ptt_pos[cui][patient_id] += count
+
+            for cui, patient_id_timestamps in _cui2ptt_tsp.items():
+                if cui not in cui2ptt_tsp:
+                    cui2ptt_tsp[cui] = patient_id_timestamps
+                else:
+                    for patient_id, timestamp in patient_id_timestamps.items():
+                        if patient_id not in cui2ptt_tsp[cui]:
+                            cui2ptt_tsp[cui][patient_id] = timestamp
+                        else:
+                            # keep the earliest (first) mention of the term
+                            cui2ptt_tsp[cui][patient_id] = min(timestamp, cui2ptt_tsp[cui][patient_id])
+
+    with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
+        for k, v in cui2ptt_pos.items():
+            o = {k: v}
+            json_obj = json.loads(json.dumps(o))
+            json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
+            print('', file=outfile)
+
+    with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
+        for k, v in cui2ptt_tsp.items():
+            o = {k: v}
+            json_obj = json.loads(json.dumps(o))
+            json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
+            print('', file=outfile)
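
Note on the merge logic in PATCH 4/4: it can be exercised outside NiFi
with a small standalone check. This is a minimal sketch under the
assumption that per-file results are plain dicts shaped like
{<cui>: {<patient_id>: <count-or-timestamp>, ...}}; merge_results is a
hypothetical helper written for this illustration, not a function in
cogstack_cohort_generate_data.py.

    from collections import Counter, defaultdict

    def merge_results(per_file_results):
        # counts are summed per (cui, patient_id); timestamps keep the earliest mention
        cui2ptt_pos = defaultdict(Counter)
        cui2ptt_tsp = defaultdict(dict)
        for pos, tsp in per_file_results:
            for cui, patient_counts in pos.items():
                cui2ptt_pos[cui].update(patient_counts)  # Counter.update() sums values
            for cui, patient_timestamps in tsp.items():
                for patient_id, ts in patient_timestamps.items():
                    prev = cui2ptt_tsp[cui].get(patient_id)
                    cui2ptt_tsp[cui][patient_id] = ts if prev is None else min(prev, ts)
        return cui2ptt_pos, cui2ptt_tsp

    pos, tsp = merge_results([
        ({"C001": {"p1": 2}}, {"C001": {"p1": 1700000000}}),
        ({"C001": {"p1": 3, "p2": 1}}, {"C001": {"p1": 1690000000}}),
    ])
    assert pos["C001"] == Counter({"p1": 5, "p2": 1})
    assert tsp["C001"]["p1"] == 1690000000  # earliest mention wins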