remove dependency on primary index by converting queries that are against the id field to gets. (#335)

remove dependency on primary index by converting queries that are
against the id field to gets. These are queries for job documents, which
aren't all that large. Changing them from queries to direct key-value
fetches makes them much more efficient and removes the need for either
a special index or a primary index.
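
The pattern behind the change, as a minimal sketch using the Couchbase Python SDK. The connection details, bucket/scope/collection names, and the job document key below are hypothetical, for illustration only:

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions


def ids_via_query(cluster, job_document_id):
    # Before: a N1QL query whose only predicate is meta().id. The query
    # service still has to plan and execute it, and it fails outright
    # unless a primary (or specially built) index exists.
    stmnt = (
        "SELECT ingest_document_ids FROM `vxdata`._default.METAR "
        f'WHERE meta().id = "{job_document_id}"'
    )
    return list(cluster.query(stmnt))[0]["ingest_document_ids"]


def ids_via_get(collection, job_document_id):
    # After: a key-value get. The key routes straight to the data
    # service, so no index of any kind is consulted, and the whole job
    # document comes back in one fetch.
    return collection.get(job_document_id).content_as[dict]["ingest_document_ids"]


# Hypothetical connection details, for illustration only.
cluster = Cluster(
    "couchbase://localhost",
    ClusterOptions(PasswordAuthenticator("user", "password")),
)
collection = cluster.bucket("vxdata").scope("_default").collection("METAR")
ids = ids_via_get(collection, "JOB:V01:EXAMPLE")  # hypothetical document key

The get also returns the whole job document, which is why the grib2 and netcdf variants below can pull file_mask and input_data_path out of the same fetch instead of issuing a second query.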
randytpierce authored Feb 21, 2024
2 parents 7f4539c + 375a4be commit 1d7240a
Showing 6 changed files with 21 additions and 36 deletions.
10 changes: 3 additions & 7 deletions src/vxingest/ctc_to_cb/run_ingest_threads.py
@@ -195,14 +195,10 @@ def runit(self, args, log_queue: Queue, log_configurer: Callable[[Queue], None])
  self.cb_credentials = self.get_credentials(self.load_spec)
  # establish connections to cb, collection
  self.connect_cb()
- bucket = self.load_spec["cb_connection"]["bucket"]
- scope = self.load_spec["cb_connection"]["scope"]
- collection = self.load_spec["cb_connection"]["collection"]
-
  # load the ingest document ids into the load_spec (this might be redundant)
- stmnt = f'Select ingest_document_ids from `{bucket}`.{scope}.{collection} where meta().id = "{self.job_document_id}"'
- result = self.cluster.query(stmnt)
- self.load_spec["ingest_document_ids"] = list(result)[0][
+ ingest_document_result = self.collection.get(self.job_document_id)
+ ingest_document = ingest_document_result.content_as[dict]
+ self.load_spec["ingest_document_ids"] = ingest_document[
      "ingest_document_ids"
  ]
  # put all the ingest documents into the load_spec too
14 changes: 6 additions & 8 deletions src/vxingest/grib2_to_cb/run_ingest_threads.py
@@ -188,23 +188,21 @@ def runit(self, args, log_queue: Queue, log_configurer: Callable[[Queue], None])
  # establish connections to cb, collection
  self.connect_cb()
  # load the ingest document ids into the load_spec (this might be redundant)
- stmnt = f"Select ingest_document_ids from `{self.cb_credentials['bucket']}`.{self.cb_credentials['scope']}.{self.cb_credentials['collection']} where meta().id = \"{self.job_document_id}\""
- result = self.cluster.query(stmnt)
- self.load_spec["ingest_document_ids"] = list(result)[0][
+ ingest_document_result = self.collection.get(self.job_document_id)
+ ingest_document = ingest_document_result.content_as[dict]
+ self.load_spec["ingest_document_ids"] = ingest_document[
      "ingest_document_ids"
  ]
+
  # put all the ingest documents into the load_spec too
  self.load_spec["ingest_documents"] = {}
  for _id in self.load_spec["ingest_document_ids"]:
      self.load_spec["ingest_documents"][_id] = self.collection.get(
          _id
      ).content_as[dict]
  # load the fmask and input_data_path into the load_spec
- stmnt = f"Select file_mask, input_data_path from `{self.cb_credentials['bucket']}`.{self.cb_credentials['scope']}.{self.cb_credentials['collection']} where meta().id = \"{self.job_document_id}\""
- result = self.cluster.query(stmnt)
- result_list = list(result)
- self.fmask = result_list[0]["file_mask"]
- self.path = result_list[0]["input_data_path"]
+ self.fmask = ingest_document["file_mask"]
+ self.path = ingest_document["input_data_path"]
  self.load_spec["fmask"] = self.fmask
  self.load_spec["input_data_path"] = self.path
  # stash the load_job in the load_spec
6 changes: 2 additions & 4 deletions src/vxingest/main.py
@@ -228,8 +228,7 @@ def get_job_docs(
"LOWER(META().id) as name, "
"run_priority, "
"offset_minutes, "
"LOWER(subType) as sub_type, "
"input_data_path as input_data_path "
"LOWER(subType) as sub_type "
f"FROM {creds['cb_bucket']}.{creds['cb_scope']}.{creds['cb_collection']} "
f"WHERE id='{job_id}' "
"AND (type = 'JOB-TEST' or type = 'JOB') "
@@ -247,8 +246,7 @@ def get_job_docs(
"LOWER(META().id) as name, "
"run_priority, "
"offset_minutes, "
"LOWER(subType) as sub_type, "
"input_data_path as input_data_path "
"LOWER(subType) as sub_type "
f"FROM {creds['cb_bucket']}.{creds['cb_scope']}.{creds['cb_collection']} "
"LET millis = ROUND(CLOCK_MILLIS()), "
"sched = SPLIT(schedule,' '), "
15 changes: 6 additions & 9 deletions src/vxingest/netcdf_to_cb/run_ingest_threads.py
@@ -171,13 +171,13 @@ def runit(self, args, log_queue: Queue, log_configurer: Callable[[Queue], None])
  # establish connections to cb, collection
  self.connect_cb()
  logger.info("connected to cb")
- collection = self.load_spec["cb_connection"]["collection"]
  bucket = self.load_spec["cb_connection"]["bucket"]
  scope = self.load_spec["cb_connection"]["scope"]
+ collection = self.load_spec["cb_connection"]["collection"]
  # load the ingest document ids into the load_spec (this might be redundant)
- stmnt = f'Select ingest_document_ids from `{bucket}`.{scope}.{collection} where meta().id = "{self.job_document_id}"'
- result = self.cluster.query(stmnt)
- self.load_spec["ingest_document_ids"] = list(result)[0][
+ ingest_document_result = self.collection.get(self.job_document_id)
+ ingest_document = ingest_document_result.content_as[dict]
+ self.load_spec["ingest_document_ids"] = ingest_document[
      "ingest_document_ids"
  ]
  # put all the ingest documents into the load_spec too
@@ -187,11 +187,8 @@ def runit(self, args, log_queue: Queue, log_configurer: Callable[[Queue], None])
          _id
      ).content_as[dict]
  # load the fmask and input_data_path into the load_spec
- stmnt = f'Select file_mask, input_data_path from `{bucket}`.{scope}.{collection} where meta().id = "{self.job_document_id}"'
- result = self.cluster.query(stmnt)
- result_list = list(result)
- self.fmask = result_list[0]["file_mask"]
- self.path = result_list[0]["input_data_path"]
+ self.fmask = ingest_document["file_mask"]
+ self.path = ingest_document["input_data_path"]
  self.load_spec["fmask"] = self.fmask
  self.load_spec["input_data_path"] = self.path
  # stash the load_job in the load_spec
10 changes: 3 additions & 7 deletions src/vxingest/partial_sums_to_cb/run_ingest_threads.py
@@ -192,14 +192,10 @@ def runit(self, args, log_queue: Queue, log_configurer: Callable[[Queue], None])
  self.cb_credentials = self.get_credentials(self.load_spec)
  # establish connections to cb, collection
  self.connect_cb()
- bucket = self.load_spec["cb_connection"]["bucket"]
- scope = self.load_spec["cb_connection"]["scope"]
- collection = self.load_spec["cb_connection"]["collection"]
-
  # load the ingest document ids into the load_spec (this might be redundant)
- stmnt = f'Select ingest_document_ids from `{bucket}`.{scope}.{collection} where meta().id = "{self.job_document_id}"'
- result = self.cluster.query(stmnt)
- self.load_spec["ingest_document_ids"] = list(result)[0][
+ ingest_document_result = self.collection.get(self.job_document_id)
+ ingest_document = ingest_document_result.content_as[dict]
+ self.load_spec["ingest_document_ids"] = ingest_document[
      "ingest_document_ids"
  ]
  # put all the ingest documents into the load_spec too
@@ -64,4 +64,4 @@ WHEN ov.name = mv.name
  END
  ) FOR mv IN m.m0data
  END
- WHERE m.mfve = o.ofve
+ WHERE m.mfve = o.ofve
