Enable users to store sample-lookup-optimized tables
 + Add input flag and validate its value.
 + Add BQ queries to flatten the call column.
 + Extract the schema of the flattened table.
 + Add unit tests to verify the correctness of the extracted schema.
 + Add a new function `table_empty()` to check whether a table is empty.
samanvp committed Apr 25, 2020
1 parent 671c4b9 commit c74ad88
Showing 5 changed files with 431 additions and 53 deletions.
162 changes: 159 additions & 3 deletions gcp_variant_transforms/libs/bigquery_util.py
@@ -14,6 +14,7 @@

"""Constants and simple utility functions related to BigQuery."""

from concurrent.futures import TimeoutError
import enum
import exceptions
import logging
@@ -40,17 +41,32 @@
_RANGE_END_SIG_DIGITS = 4
_RANGE_INTERVAL_SIG_DIGITS = 1

START_POSITION_COLUMN = 'start_position'
_BQ_CREATE_PARTITIONED_TABLE_COMMAND = (
'bq mk --table --range_partitioning='
'{PARTITION_COLUMN},0,{RANGE_END},{RANGE_INTERVAL} '
'--clustering_fields=start_position,end_position '
'{FULL_TABLE_ID} {SCHEMA_FILE_PATH}')
_BQ_DELETE_TABLE_COMMAND = 'bq rm -f -t {FULL_TABLE_ID}'
_BQ_EXTRACT_SCHEMA_COMMAND = (
'bq show --schema --format=prettyjson {FULL_TABLE_ID} > {SCHEMA_FILE_PATH}')
_GCS_DELETE_FILES_COMMAND = 'gsutil -m rm -f -R {ROOT_PATH}'
_BQ_LOAD_JOB_NUM_RETRIES = 5
_BQ_NUM_RETRIES = 3
_MAX_NUM_CONCURRENT_BQ_LOAD_JOBS = 4

_GET_COLUMN_NAMES_QUERY = (
'SELECT column_name '
'FROM `{PROJECT_ID}`.{DATASET_ID}.INFORMATION_SCHEMA.COLUMNS '
'WHERE table_name = "{TABLE_ID}"')
_GET_CALL_SUB_FIELDS_QUERY = (
'SELECT field_path '
'FROM `{PROJECT_ID}`.{DATASET_ID}.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS '
'WHERE table_name = "{TABLE_ID}" AND column_name="{CALL_COLUMN}"')
_MAIN_TABLE_ALIAS = 'main_table'
_CALL_TABLE_ALIAS = 'call_table'
_FLATTEN_CALL_QUERY = (
'SELECT {SELECT_COLUMNS} '
'FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}` as {MAIN_TABLE_ALIAS}, '
'UNNEST({CALL_COLUMN}) as {CALL_TABLE_ALIAS}')
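
# For illustration: given a table with columns (reference_name, call) where
# call has sub-fields (sample_id, genotype), _FLATTEN_CALL_QUERY would render
# to something like the following sketch (hypothetical table names, not taken
# from this change):
#
#   SELECT main_table.reference_name AS reference_name,
#          call_table.sample_id AS call_sample_id,
#          call_table.genotype AS call_genotype
#   FROM `my-project.my_dataset.my_table` as main_table,
#   UNNEST(call) as call_table
#
# The UNNEST() cross join emits one output row per (variant row, call) pair,
# which is what makes the resulting table sample-lookup friendly.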

class ColumnKeyConstants(object):
"""Constants for column names in the BigQuery schema."""
@@ -189,6 +205,37 @@ def table_exist(client, project_id, dataset_id, table_id):
raise
return True

def table_empty(project_id, dataset_id, table_id):
  # type: (str, str, str) -> bool
  client = bigquery.Client(project=project_id)
  query = 'SELECT count(0) AS num_rows FROM {DATASET_ID}.{TABLE_ID}'.format(
      DATASET_ID=dataset_id, TABLE_ID=table_id)
  query_job = client.query(query)
  num_retries = 0
  while True:
    try:
      iterator = query_job.result(timeout=300)
    except TimeoutError:
      logging.warning('Time out waiting for query: %s', query)
      if num_retries < _BQ_NUM_RETRIES:
        num_retries += 1
        time.sleep(90)
      else:
        raise
    else:
      break

  rows = list(iterator)
  if len(rows) != 1:
    logging.error('Query did not return the expected number of rows: %s',
                  query)
    raise ValueError('Expected 1 row in query result, got {}'.format(len(rows)))

  row = rows[0]
  if len(row) != 1:
    logging.error('Query did not return the expected number of columns: %s',
                  query)
    raise ValueError(
        'Expected 1 column in query result, got {}'.format(len(row)))

  return row.get('num_rows') == 0
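
# A minimal usage sketch for table_empty(); the project, dataset, and table
# IDs below are hypothetical placeholders:
#
#   if table_empty('my-project', 'my_dataset', 'my_table'):
#     logging.info('Source table is empty; nothing to export.')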

def get_bigquery_type_from_vcf_type(vcf_type):
  # type: (str) -> str
  vcf_type = vcf_type.lower()
@@ -416,7 +463,7 @@ def _cancel_all_running_load_jobs(self):
load_job.cancel()

  def _handle_failed_load_job(self, suffix, load_job):
-    if self._num_load_jobs_retries < _BQ_LOAD_JOB_NUM_RETRIES:
+    if self._num_load_jobs_retries < _BQ_NUM_RETRIES:
      self._num_load_jobs_retries += 1
      # Retry the failed job after 5 minutes wait.
      time.sleep(300)
Expand Down Expand Up @@ -447,6 +494,115 @@ def _monitor_load_jobs(self):
self._start_one_load_job(next_suffix)


class FlattenCallColumn(object):
  """Flattens the call column of a variants table to extract its schema."""

  def __init__(self, full_table_id):
    (self._project_id,
     self._dataset_id,
     self._table_id) = parse_table_reference(full_table_id)

    self._client = bigquery.Client(project=self._project_id)

  def _run_query(self, query):
    query_job = self._client.query(query)
    num_retries = 0
    while True:
      try:
        iterator = query_job.result(timeout=300)
      except TimeoutError:
        logging.warning('Time out waiting for query: %s', query)
        if num_retries < _BQ_NUM_RETRIES:
          num_retries += 1
          time.sleep(90)
        else:
          raise
      else:
        break
    result = []
    for i in iterator:
      result.append(str(i.values()[0]))
    return result
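
  # Note: this timeout/retry loop also appears in table_empty() and
  # _create_temp_flatten_table(). A hedged refactoring sketch of a shared
  # helper that could replace all three copies (not part of this commit):
  #
  #   def _wait_for_query_result(self, query_job, query):
  #     num_retries = 0
  #     while True:
  #       try:
  #         return query_job.result(timeout=300)
  #       except TimeoutError:
  #         logging.warning('Time out waiting for query: %s', query)
  #         if num_retries >= _BQ_NUM_RETRIES:
  #           raise
  #         num_retries += 1
  #         time.sleep(90)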

  def _get_column_names(self):
    query = _GET_COLUMN_NAMES_QUERY.format(PROJECT_ID=self._project_id,
                                           DATASET_ID=self._dataset_id,
                                           TABLE_ID=self._table_id)
    return self._run_query(query)

  def _get_call_sub_fields(self):
    query = _GET_CALL_SUB_FIELDS_QUERY.format(
        PROJECT_ID=self._project_id,
        DATASET_ID=self._dataset_id,
        TABLE_ID=self._table_id,
        CALL_COLUMN=ColumnKeyConstants.CALLS)
    # Returned list will be [call, call.name, call.genotype, call.phaseset, ...]
    result = self._run_query(query)[1:]  # Drop the first element, 'call'.
    return [sub_field.split('.')[1] for sub_field in result]
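
  # For example (hypothetical data): a field_path result of
  # ['call', 'call.sample_id', 'call.genotype', 'call.phaseset'] yields
  # sub-field names ['sample_id', 'genotype', 'phaseset'].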

  def _get_flatten_column_names(self):
    column_names = self._get_column_names()
    assert len(column_names) > 0
    sub_fields = self._get_call_sub_fields()
    assert len(sub_fields) > 0
    select_list = []
    for column in column_names:
      if column != ColumnKeyConstants.CALLS:
        select_list.append(_MAIN_TABLE_ALIAS + '.' + column + ' AS ' + column)
      else:
        for s_f in sub_fields:
          select_list.append(_CALL_TABLE_ALIAS + '.' + s_f + ' AS ' +
                             ColumnKeyConstants.CALLS + '_' + s_f)
    return ', '.join(select_list)

  def _create_temp_flatten_table(self):
    temp_suffix = time.strftime('%Y%m%d_%H%M%S')
    temp_table_id = '{}{}'.format(self._table_id, temp_suffix)
    full_table_id = '{}.{}.{}'.format(
        self._project_id, self._dataset_id, temp_table_id)
    job_config = bigquery.QueryJobConfig(destination=full_table_id)

    select_columns = self._get_flatten_column_names()
    sql = _FLATTEN_CALL_QUERY.format(SELECT_COLUMNS=select_columns,
                                     PROJECT_ID=self._project_id,
                                     DATASET_ID=self._dataset_id,
                                     TABLE_ID=self._table_id,
                                     MAIN_TABLE_ALIAS=_MAIN_TABLE_ALIAS,
                                     CALL_COLUMN=ColumnKeyConstants.CALLS,
                                     CALL_TABLE_ALIAS=_CALL_TABLE_ALIAS)
    sql += ' LIMIT 1'  # We need this table only to extract its schema.
    query_job = self._client.query(sql, job_config=job_config)
    num_retries = 0
    while True:
      try:
        _ = query_job.result(timeout=300)
      except TimeoutError:
        logging.warning('Time out waiting for query: %s', sql)
        if num_retries < _BQ_NUM_RETRIES:
          num_retries += 1
          time.sleep(90)
        else:
          raise
      else:
        break
    logging.info('A new table with only 1 row was created: %s', full_table_id)
    logging.info(
        'This table is used to extract the schema of the flattened table.')
    return temp_table_id

  def get_flatten_table_schema(self, schema_file_path):
    # type: (str) -> int
    temp_table_id = self._create_temp_flatten_table()
    full_table_id = '{}:{}.{}'.format(
        self._project_id, self._dataset_id, temp_table_id)
    bq_command = _BQ_EXTRACT_SCHEMA_COMMAND.format(
        FULL_TABLE_ID=full_table_id,
        SCHEMA_FILE_PATH=schema_file_path)
    result = os.system(bq_command)
    if result != 0:
      logging.error('Failed to extract the flattened table schema using '
                    '"%s" command.', bq_command)
    else:
      logging.info('Successfully extracted the schema of the flattened table.')
    if _delete_table(full_table_id) == 0:
      logging.info('Successfully deleted temporary table: %s', full_table_id)
    else:
      logging.error('Was not able to delete temporary table: %s', full_table_id)
    return result
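
# A minimal end-to-end sketch (the table ID and output path below are
# hypothetical placeholders, not taken from this change):
#
#   flatter = FlattenCallColumn('my-project:my_dataset.my_table')
#   if flatter.get_flatten_table_schema('/tmp/flatten_schema.json') != 0:
#     raise RuntimeError('Failed to extract the flattened table schema.')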

def create_output_table(full_table_id,  # type: str
                        partition_column,  # type: str
                        range_end,  # type: int
61 changes: 61 additions & 0 deletions gcp_variant_transforms/libs/bigquery_util_test.py
@@ -14,6 +14,8 @@

from __future__ import absolute_import

import filecmp
import tempfile
import unittest
from apitools.base.py import exceptions

@@ -439,6 +441,17 @@ def test_does_table_exist(self):
bigquery_util.table_exist,
client, 'project', 'dataset', 'table')

  def test_table_empty(self):
    project_id = 'gcp-variant-transforms-test'
    dataset_id = 'bq_to_vcf_integration_tests'
    table_with_3_rows = 'merge_option_move_to_calls___chr20'
    self.assertFalse(bigquery_util.table_empty(project_id, dataset_id,
                                               table_with_3_rows))

    table_with_0_rows = 'merge_option_move_to_calls___chr21'
    self.assertTrue(bigquery_util.table_empty(project_id, dataset_id,
                                              table_with_0_rows))

  def test_raise_error_if_dataset_not_exists(self):
    client = mock.Mock()
    client.datasets.Get.return_value = bigquery.Dataset(
@@ -502,3 +515,51 @@ def test_calculate_optimal_partition_size(self):
    self.assertEqual(expected_partition_size *
                     (bigquery_util._MAX_BQ_NUM_PARTITIONS - 1),
                     total_base_pairs_enlarged)

class FlattenCallColumnTest(unittest.TestCase):
  """Test cases for class `FlattenCallColumn`."""

  def setUp(self):
    self._input_table = (
        'gcp-variant-transforms-test:'
        'bq_to_vcf_integration_tests.merge_option_move_to_calls___chr20')
    self._flatter = bigquery_util.FlattenCallColumn(self._input_table)

  def test_get_column_names(self):
    expected_column_names = ['reference_name', 'start_position',
                             'end_position', 'reference_bases',
                             'alternate_bases', 'names', 'quality', 'filter',
                             'call', 'NS', 'DP', 'AA', 'DB', 'H2']
    self.assertEqual(expected_column_names, self._flatter._get_column_names())

  def test_get_call_sub_fields(self):
    expected_sub_fields = [
        'sample_id', 'genotype', 'phaseset', 'DP', 'GQ', 'HQ']
    self.assertEqual(expected_sub_fields, self._flatter._get_call_sub_fields())

  def test_get_flatten_column_names(self):
    expected_select = (
        'main_table.reference_name AS reference_name, '
        'main_table.start_position AS start_position, '
        'main_table.end_position AS end_position, '
        'main_table.reference_bases AS reference_bases, '
        'main_table.alternate_bases AS alternate_bases, '
        'main_table.names AS names, '
        'main_table.quality AS quality, '
        'main_table.filter AS filter, '
        'call_table.sample_id AS call_sample_id, '
        'call_table.genotype AS call_genotype, '
        'call_table.phaseset AS call_phaseset, '
        'call_table.DP AS call_DP, '
        'call_table.GQ AS call_GQ, '
        'call_table.HQ AS call_HQ, '
        'main_table.NS AS NS, '
        'main_table.DP AS DP, '
        'main_table.AA AS AA, '
        'main_table.DB AS DB, '
        'main_table.H2 AS H2')
    self.assertEqual(expected_select,
                     self._flatter._get_flatten_column_names())

  def test_get_flatten_table_schema(self):
    generated_flatten_schema = tempfile.mkstemp()[1]
    self._flatter.get_flatten_table_schema(generated_flatten_schema)
    expected_flatten_schema = (
        'gcp_variant_transforms/testing/data/schema/'
        'flatten_merge_option_move_to_calls.json')
    self.assertTrue(
        filecmp.cmp(generated_flatten_schema, expected_flatten_schema),
        'Schema file does not match the expected schema.')
