Creates PoC pipeline to estimate disk usage of vcf_to_bq on Dataflow.
The pipeline uses the raw file size plus the raw and encoded sizes of a short snippet at the
beginning of each VCF file to estimate the encoded size of the input. The major blocking bug is
that when the snippets are read from VCFs in an encoded format, lines are read more than once.
hanjohn committed Aug 17, 2018
1 parent 6420e96 commit 0415861
Showing 7 changed files with 373 additions and 0 deletions.
128 changes: 128 additions & 0 deletions gcp_variant_transforms/beam_io/vcf_snippet_io.py
@@ -0,0 +1,128 @@
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A source for reading VCF file headers."""

from __future__ import absolute_import

from typing import Iterable, Tuple  # pylint: disable=unused-import

from apache_beam.io import filebasedsource
from apache_beam.io import range_trackers # pylint: disable=unused-import
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems
from apache_beam.io.iobase import Read
from apache_beam.transforms import PTransform

from gcp_variant_transforms.beam_io import vcf_parser, vcfio


class _VcfSnippetSource(filebasedsource.FileBasedSource):
  """A source for reading a limited number of variants from a set of VCF files.

  Lines that are malformed are skipped.

  Parses VCF files (version 4) using PyVCF library.
  """

  DEFAULT_VCF_READ_BUFFER_SIZE = 65536  # 64kB

  def __init__(self,
               file_pattern,
               snippet_size,
               compression_type=CompressionTypes.AUTO,
               validate=True):
    # type: (str, int, str, bool) -> None
    super(_VcfSnippetSource, self).__init__(file_pattern,
                                            compression_type=compression_type,
                                            validate=validate,
                                            splittable=False)
    self._compression_type = compression_type
    self._snippet_size = snippet_size

  def read_records(
      self,
      file_name,  # type: str
      range_tracker  # type: range_trackers.UnsplittableRangeTracker
      ):
    # type: (...) -> Iterable[Tuple[str, str, vcfio.Variant]]
    # Iterator that emits each line parsed into a `Variant` object.
    record_iterator = vcf_parser.PyVcfParser(
        file_name,
        range_tracker,
        self._pattern,
        self._compression_type,
        allow_malformed_records=True,
        representative_header_lines=None,
        buffer_size=self.DEFAULT_VCF_READ_BUFFER_SIZE,
        skip_header_lines=0)

    # Open a distinct channel to read the same lines as raw bytestrings.
    with FileSystems.open(file_name, self._compression_type) as raw_reader:
      line = raw_reader.readline()
      while line and line.startswith('#'):
        # Skip headers; assume the header size is negligible.
        line = raw_reader.readline()

      # Advance the raw reader in lockstep with the record iterator so both
      # channels stay aligned on the same variant line. The first data line
      # was already consumed by the header-skipping loop above, so reuse it
      # for the first record instead of reading a fresh line.
      count = 0
      raw_record = line
      for encoded_record in record_iterator:
        if count >= self._snippet_size:
          break
        if isinstance(encoded_record, vcfio.Variant):
          count += 1
          yield file_name, raw_record, encoded_record
        raw_record = raw_reader.readline()


class ReadVcfSnippet(PTransform):
  """A PTransform for reading a limited number of lines from a set of VCF files.

  The output is a PCollection of `(file name, raw line, Variant)` tuples,
  containing the first `snippet_size` lines of each file that are not
  malformed, both as a raw bytestring and parsed as a `Variant` object.

  Parses VCF files (version 4) using PyVCF library.
  """

  def __init__(
      self,
      file_pattern,  # type: str
      snippet_size,  # type: int
      compression_type=CompressionTypes.AUTO,  # type: str
      validate=True,  # type: bool
      **kwargs  # type: **str
      ):
    # type: (...) -> None
    """Initialize the :class:`ReadVcfSnippet` transform.

    Args:
      file_pattern: The file path to read from either as a single file or a
        glob pattern.
      snippet_size: The number of lines that should be read from each file.
      compression_type: Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
        underlying file_path's extension will be used to detect the
        compression.
      validate: Flag to verify that the files exist during the pipeline
        creation time.
    """
    super(ReadVcfSnippet, self).__init__(**kwargs)
    self._source = _VcfSnippetSource(
        file_pattern, snippet_size, compression_type, validate=validate)

  def expand(self, pvalue):
    return pvalue.pipeline | Read(self._source)
Empty file.
14 changes: 14 additions & 0 deletions gcp_variant_transforms/libs/preprocess_reporter.py
@@ -26,6 +26,8 @@
TODO(allieychen): Eventually, it will also contain the resource estimation.
Output example (assuming opening in a spreadsheet):
Estimated disk usage by Dataflow: 4846 GB. The total raw file sizes summed up
to 1231 GB.
Header Conflicts
ID Category Conflicts File Paths Proposed Resolution
NS INFO num=1 type=Float file1 num=1 type=Float
@@ -43,6 +45,7 @@
File Path Variant Record Error Message
file 1 rs6 G A 29 PASS NS=3; invalid literal for int() with base 10.
"""
import math

from typing import Dict, List, Optional, Union # pylint: disable=unused-import

@@ -77,6 +80,7 @@ class _HeaderLine(object):
def generate_report(
    header_definitions,  # type: merge_header_definitions.VcfHeaderDefinitions
    file_path,  # type: str
    disk_usage_estimate,  # type: FileSizeInfo
    resolved_headers=None,  # type: vcf_header_io.VcfHeader
    inferred_headers=None,  # type: vcf_header_io.VcfHeader
    malformed_records=None  # type: List[vcfio.MalformedVcfRecord]
@@ -97,6 +101,7 @@
"""
resolved_headers = resolved_headers or vcf_header_io.VcfHeader()
with filesystems.FileSystems.create(file_path) as file_to_write:
_append_disk_usage_estimate_to_report(file_to_write, disk_usage_estimate)
_append_conflicting_headers_to_report(file_to_write, header_definitions,
resolved_headers)
_append_inferred_headers_to_report(file_to_write, inferred_headers)
@@ -272,6 +277,15 @@ def _format_definition(num_value, type_value):
  ]
  return ' '.join(formatted_definition)

def _append_disk_usage_estimate_to_report(file_to_write, disk_usage_estimate):
  # type: (file, FileSizeInfo) -> None
  if disk_usage_estimate is None:
    return
  file_to_write.write(
      'Estimated disk usage by Dataflow: {} GB. The total raw file sizes '
      'summed up to {} GB.\n'.format(
          int(math.ceil(disk_usage_estimate.encoded / 1e9)),
          int(math.ceil(disk_usage_estimate.raw / 1e9))))

def _append_to_report(file_to_write, error_type, header, contents):
  # type: (file, str, str, List[str]) -> None
1 change: 1 addition & 0 deletions gcp_variant_transforms/libs/preprocess_reporter_test.py
@@ -47,6 +47,7 @@ def _generate_report_and_assert_contents_equal(
        PreprocessReporterTest._REPORT_NAME)
    preprocess_reporter.generate_report(header_definitions,
                                        file_path,
                                        None,
                                        resolved_headers,
                                        inferred_headers,
                                        malformed_records)
180 changes: 180 additions & 0 deletions gcp_variant_transforms/libs/resource_estimator.py
@@ -0,0 +1,180 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for estimating the resources that a vcf_to_bq will require.
Currently, the resource estimator only estimates the disk usage that a Dataflow
pipeline will take along with the `MergeVariants` step, since this can cause
expensive pipeline failures late in the run.
"""
import logging

import apache_beam as beam
from apache_beam import coders
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems

from gcp_variant_transforms.beam_io import vcfio


# TODO(hanjohn): Add unit tests.

def _convert_variant_snippets_to_bytesize(variant):
  # type: (vcfio.Variant) -> int
  return coders.registry.get_coder(vcfio.Variant).estimate_size(variant)


class SnippetSizeInfo(object):
  def __init__(self,
               raw_snippet_size,  # type: int
               encoded_snippet_size,  # type: int
              ):
    # type: (...) -> None
    self.raw = raw_snippet_size
    self.encoded = encoded_snippet_size


class FileSizeInfo(object):
  def __init__(self, raw_file_size, encoded_file_size=None):
    # type: (int, int) -> None
    self.raw = raw_file_size
    self.encoded = encoded_file_size

  def calculate_encoded_file_size(self, snippet_size_info):
    # type: (SnippetSizeInfo) -> None
    """Estimates the VCF file's encoded size based on snippet analysis.

    Given the raw file size and measurements of several VCF lines from the
    file, estimate how much disk the file will take after expansion due to
    encoding lines as `vcfio.Variant` objects. The estimated encoded file
    size is stored in `self.encoded`.

    This is a simple ratio problem, solving for encoded_file_size, the only
    unknown:
    encoded_file_size / raw_file_size = encoded_snippet_size / raw_snippet_size
    """
    if snippet_size_info.raw == 0:
      # Propagate the in-band error state to avoid a divide-by-zero.
      self.encoded = 0
      self.raw = 0
    else:
      self.encoded = (self.raw * snippet_size_info.encoded /
                      snippet_size_info.raw)


def measure_variant_size(element):
  # type: (Tuple[str, str, vcfio.Variant]) -> Tuple[str, SnippetSizeInfo]
  """Measures the lengths of the raw and encoded representations of a Variant.

  Given a tuple of a file path with the raw (bytestring) and vcfio.Variant-
  encoded representations of a variant line, map the file path to a
  SnippetSizeInfo holding the (raw, encoded) representation sizes.

  The file path keys are not expected to be unique.
  """
  file_path, raw_variant, encoded_variant = element
  encoded_variant_size = _convert_variant_snippets_to_bytesize(encoded_variant)
  raw_variant_size = len(raw_variant)
  return file_path, SnippetSizeInfo(raw_variant_size, encoded_variant_size)


def estimate_file_encoded_size(element):
  # type: (Tuple[str, Dict[str, Any]]) -> Tuple[str, FileSizeInfo]
  file_name, metrics = element
  file_size_info = metrics['whole_file_raw_size'][0]  # type: FileSizeInfo
  snippet_size_info = metrics['snippet_stats'][0]  # type: SnippetSizeInfo

  # Assume that the ratio of encoded size to raw disk size is roughly the same
  # throughout the file as in the first several lines.
  file_size_info.calculate_encoded_file_size(snippet_size_info)
  if file_size_info.raw == 0:
    logging.error("VCF file %s reported with 0 well-formed variant lines; "
                  "its contribution to disk resource usage will be "
                  "ignored.", file_name)

  logging.debug("File %s has raw file size %d, raw snippet size %d, encoded "
                "snippet size %d. Estimated encoded file size: %d",
                file_name, file_size_info.raw, snippet_size_info.raw,
                snippet_size_info.encoded, file_size_info.encoded)
  return file_name, file_size_info

def get_file_sizes(input_pattern):
  # type: (str) -> List[Tuple[str, FileSizeInfo]]
  match_results = FileSystems.match([input_pattern])
  file_sizes = []
  for match in match_results:
    for file_metadata in match.metadata_list:
      compression_type = CompressionTypes.detect_compression_type(
          file_metadata.path)
      if compression_type != CompressionTypes.UNCOMPRESSED:
        logging.error("VCF file %s is compressed; the disk requirement "
                      "estimate will not be accurate.", file_metadata.path)

      file_sizes.append((file_metadata.path,
                         FileSizeInfo(file_metadata.size_in_bytes),))
  return file_sizes


class SnippetSizeInfoSumFn(beam.CombineFn):
  """Combiner function to sum up the size fields of SnippetSizeInfos.

  Example: [SnippetSizeInfo(a, b), SnippetSizeInfo(c, d)] ->
           SnippetSizeInfo(a+c, b+d)
  """
  def create_accumulator(self):
    # type: () -> Tuple[int, int]
    return (0, 0)  # (raw, encoded) sums

  def add_input(self, sums, snippet_size_info):
    # type: (Tuple[int, int], SnippetSizeInfo) -> Tuple[int, int]
    return sums[0] + snippet_size_info.raw, sums[1] + snippet_size_info.encoded

  def merge_accumulators(self, accumulators):
    # type: (Iterable[Tuple[int, int]]) -> Tuple[int, int]
    raw, encoded = zip(*accumulators)
    return sum(raw), sum(encoded)

  def extract_output(self, sums):
    # type: (Tuple[int, int]) -> SnippetSizeInfo
    return SnippetSizeInfo(*sums)


class FileSizeInfoSumFn(beam.CombineFn):
  """Combiner function to sum up the size fields of Tuple[str, FileSizeInfo]s.

  Unlike SnippetSizeInfoSumFn, the input is a PCollection of (str,
  FileSizeInfo) pairs, so each input is a tuple with the FileSizeInfo as the
  second field. The output drops the str key, which represents the file path.

  Example: [('/path/a', FileSizeInfo(a, b)), ('/path/b', FileSizeInfo(c, d))]
           -> FileSizeInfo(a+c, b+d)
  """
  def create_accumulator(self):
    # type: () -> Tuple[int, int]
    return (0, 0)  # (raw, encoded) sums

  def add_input(self, raw_encoded, path_and_file_size_info):
    # type: (Tuple[int, int], Tuple[str, FileSizeInfo]) -> Tuple[int, int]
    raw, encoded = raw_encoded
    _, file_size_info = path_and_file_size_info
    return raw + file_size_info.raw, encoded + file_size_info.encoded

  def merge_accumulators(self, accumulators):
    # type: (Iterable[Tuple[int, int]]) -> Tuple[int, int]
    raw, encoded = zip(*accumulators)
    return sum(raw), sum(encoded)

  def extract_output(self, raw_encoded):
    # type: (Tuple[int, int]) -> FileSizeInfo
    raw, encoded = raw_encoded
    return FileSizeInfo(raw, encoded)
7 changes: 7 additions & 0 deletions gcp_variant_transforms/options/variant_transform_options.py
@@ -424,6 +424,13 @@ def add_arguments(self, parser):
        help=('The full path of the resolved headers. The file will not be '
              'generated if unspecified. Otherwise, please provide a local '
              'path if run locally, or a cloud path if run on Dataflow.'))
    parser.add_argument(
        '--estimate_disk_usage',
        type='bool', default=False, nargs='?', const=True,
        help=('By default, disk resource usage will not be estimated. '
              'If true, the preprocessor will estimate the maximum disk usage '
              'consumed at any step in the pipeline, which could lead to '
              'out-of-disk errors at a shuffle step, e.g. MergeVariants.'))

class PartitionOptions(VariantTransformsOptions):
  """Options for partitioning Variant records."""
