[DC-3367] Fixing branch

all-of-us · Jan 9, 2024 · 6a2723f · 6a2723f
1 parent 259d932
commit 6a2723f
Showing 1 changed file with 126 additions and 0 deletions.
diff --git a/data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py b/data_steward/analytics/cdr_ops/ad_hoc_analyses/search_sandbox_for_ids.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.7.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# Purpose: Use this notebook to search for ids in sandbox datasets
+
+# + tags=["parameters"]
+project_id = '' # Project passed to BigQueryClient and used to qualify the sandbox dataset in every query
+sandbox_dataset_id = '' # Sandbox dataset to search in for the problem ids
+search_field = '' # field in the sandbox tables expected to contain the ids. Example: observation_id
+run_as = '' # Passed to auth.get_impersonation_credentials; presumably the service account to impersonate - confirm
+
+# +
+from utils import auth
+import pandas as pd
+from gcloud.bq import BigQueryClient
+from common import JINJA_ENV
+from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message
+
+pd.set_option('display.max_rows', None)
+# -
+
+# Obtain credentials for the run_as identity, requesting IMPERSONATION_SCOPES.
+impersonation_creds = auth.get_impersonation_credentials(
+    run_as, target_scopes=IMPERSONATION_SCOPES)
+
+# Every query in this notebook is executed through this client.
+client = BigQueryClient(project_id, credentials=impersonation_creds)
+
+# # Create list of ids to search
+# Run the following cell to create the list of ids to search for. The query must return a column named after `search_field`. Consider adding a LIMIT to the query if the list would be very large.<br>
+# OR <br>
+# Manually create a list of ids called `ids_list`.
+
+# +
+# Replace the {INSERT QUERY HERE} placeholder with a query that selects the ids
+# to look for. The result must expose a column named by search_field, because
+# that column is what gets read into ids_list below.
+tpl = JINJA_ENV.from_string('''
+{INSERT QUERY HERE}
+''')
+query = tpl.render()
+# NOTE(review): execute() appears to return a pandas DataFrame (it is indexed by
+# column name and .to_list() is called on the result) - confirm in notebook_utils.
+ids = execute(client, query)
+
+# Plain Python list of the ids; it is rendered into the search queries later on.
+ids_list = ids[search_field].to_list()
+
+
+# -
+
+# # Get the tables that contain the search_field, from the sandbox dataset
+#
+# The query returns the sandbox tables ordered by creation time, earliest to latest.
+
+# +
+# Find every column named search_field in the sandbox dataset and join it to its
+# table's creation_time, so the tables come back ordered from earliest to latest.
+tpl = JINJA_ENV.from_string('''
+ 
+SELECT
+  c.*, t.creation_time
+  , ROW_NUMBER() OVER (ORDER BY t.creation_time) as run_order
+FROM
+  `{{project_id}}.{{sandbox_dataset_id}}.INFORMATION_SCHEMA.COLUMNS` AS c
+JOIN
+  `{{project_id}}.{{sandbox_dataset_id}}.INFORMATION_SCHEMA.TABLES` AS t
+ON
+  c.table_name = t.table_name
+WHERE
+  c.column_name = '{{search_field}}'
+ORDER BY
+  t.creation_time;
+    
+''')
+query = tpl.render(sandbox_dataset_id=sandbox_dataset_id,
+                   project_id=project_id,
+                   search_field=search_field)
+tables_in_dataset = execute(client, query)
+
+# Table names in creation-time order; the search and ordering cells rely on this order.
+tables_list = tables_in_dataset['table_name'].to_list()
+# Bare expression so the notebook displays the list.
+tables_list
+# -
+
+# # Search in each sandbox table and print results
+
+# Count, per sandbox table, how many rows have search_field in ids_list.
+# The template is compiled once because only the table name varies per render.
+tpl = JINJA_ENV.from_string('''    
+    SELECT 
+    '{{table}}' as table,
+    COUNT(*) AS n_{{search_field}}s_found
+    FROM
+    `{{project_id}}.{{sandbox_dataset_id}}.{{table}}`
+    WHERE {{search_field}} IN UNNEST ({{ids_list}})
+    ''')
+# ids_list is rendered as the Python list's text form inside UNNEST (...).
+# NOTE(review): assumes that text form is a valid BigQuery array literal for
+# the id type in use - confirm for non-integer ids.
+queries = [
+    tpl.render(sandbox_dataset_id=sandbox_dataset_id,
+               project_id=project_id,
+               table=table,
+               ids_list=ids_list,
+               search_field=search_field) for table in tables_list
+]
+if queries:
+    # One round trip: each per-table count becomes a row of the combined result.
+    df = execute(client, '\nUNION ALL\n'.join(queries))
+else:
+    # No sandbox table has the search_field column. Submitting an empty query
+    # string would fail, so build an empty result with the expected columns.
+    df = pd.DataFrame(columns=['table', f'n_{search_field}s_found'])
+
+
+# # Order and view the results
+
+# +
+# Tag each result row with an ordered categorical built from tables_list, so
+# sorting follows the order of tables_list instead of alphabetical order.
+df['run_order'] = pd.Categorical(df['table'],
+                                 categories=tables_list,
+                                 ordered=True)
+
+# Sort by that order, keep only the first two columns, then renumber the rows
+# from zero.
+sorted_df = df.sort_values(by='run_order')
+ordered_df = sorted_df.iloc[:, :2].reset_index(drop=True)
+
+ordered_df
+# -
+
+