Improve MQ calculation accuracy (#4969)
Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better accuracy at sites with many uninformative reads, or single-sample variants called with homRef genotypes. Note that incorporating this change into a pipeline requires updating GenomicsDBImport and GenotypeGVCFs to this version as well.
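
For context, a minimal sketch of the idea behind the tuple representation (illustrative only, not the GATK implementation; the class and method names below are hypothetical): keeping the two raw components separate means shards and samples can be merged by simple addition, and the root-mean-square MQ is only finalized at the end, so uninformative reads and hom-ref genotypes contribute depth without distorting the average.

// Illustrative sketch: why (sumSquaredMQs, totalDepth) composes correctly
// across samples/shards, while a pre-computed RMS value does not.
final class RawMQExample {
    // Raw annotation carried per site: sum of squared mapping qualities and
    // the total depth of the reads that contributed to it.
    private double sumSquaredMQs = 0;
    private long totalDepth = 0;

    void addRead(final int mappingQuality) {
        sumSquaredMQs += (double) mappingQuality * mappingQuality;
        totalDepth++;
    }

    // Merging two shards (e.g. during joint import) is component-wise addition.
    void combine(final RawMQExample other) {
        sumSquaredMQs += other.sumSquaredMQs;
        totalDepth += other.totalDepth;
    }

    // The finalized MQ (root mean square) is computed only once, at the end.
    double finalizedMQ() {
        return totalDepth == 0 ? 0 : Math.sqrt(sumSquaredMQs / totalDepth);
    }
}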
ldgauthier authored Oct 9, 2018
1 parent 158f7f7 commit 4dd7ba8
Showing 39 changed files with 50,129 additions and 816 deletions.
src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java
@@ -1,10 +1,11 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
@@ -19,25 +20,18 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;



/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
@@ -68,11 +62,6 @@
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);

/**
* identifies a path as a GenomicsDB URI
*/
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";

/**
* Feature reader used to retrieve records from our file
*/
@@ -288,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
* @param path String containing the path to test
* @return true if path represent a GenomicsDB URI, otherwise false
*/
public static boolean isGenomicsDBPath(final String path) {
return path != null && path.startsWith(GENOMIC_DB_URI_SCHEME);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
@@ -368,7 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
}

private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
if (!isGenomicsDBPath(path)) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
}
@@ -404,149 +385,6 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
}
}

private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
final File callsetJson, final File vidmapJson,
final File vcfHeader) {
final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
.setWorkspace(workspace.getAbsolutePath())
.setReferenceGenome(reference.getAbsolutePath())
.setVidMappingFile(vidmapJson.getAbsolutePath())
.setCallsetMappingFile(callsetJson.getAbsolutePath())
.setVcfHeaderFilename(vcfHeader.getAbsolutePath())
.setProduceGTField(false)
.setProduceGTWithMinPLValueForSpanningDeletions(false)
.setSitesOnlyQuery(false)
.setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();

// For the multi-interval support, we create multiple arrays (directories) in a single workspace -
// one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
// you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the
// partition bounds.

// During the read phase, the user only supplies the workspace. The array names are obtained by scanning
// the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2",
// 50, 50M), then only the second array is queried.

// In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version
// will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found,
// the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the
// directory entries.
if (Files.exists(arrayFolder)) {
exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
} else {
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
}

//Sample code snippet to show how combine operations for INFO fields can be specified using the Protobuf
//API
//
//References
//GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
//Protobuf generated Java code guide:
//https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
//https://developers.google.com/protocol-buffers/docs/reference/java-generated

//Parse the vid json and create an in-memory Protobuf structure representing the
//information in the JSON file
GenomicsDBVidMapProto.VidMappingPB vidMapPB;
try {
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
} catch (final IOException e) {
throw new UserException("Could not open vid json file " + vidmapJson, e);
}

//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
//We iterate over the list and create a field name to list index map
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

//Example: set MQ combine operation to median (default is also median, but this is just an example)
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
"MQ", "median");
if (vidMapPB != null) {
//Use rebuilt vidMap in exportConfiguration
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
//in the JSON file
exportConfigurationBuilder.setVidMapping(vidMapPB);
}

return exportConfigurationBuilder.build();
}

/**
* Parse the vid json and create an in-memory Protobuf structure representing the
* information in the JSON file
*
* @param vidmapJson vid JSON file
* @return Protobuf object
*/
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
throws IOException {
final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
try (final FileReader reader = new FileReader(vidmapJson)) {
JsonFormat.merge(reader, vidMapBuilder);
}
return vidMapBuilder.build();
}

/**
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
* We iterate over the list and create a field name to list index map
*
* @param vidMapPB Protobuf vid mapping object
* @return map from field name to index in vidMapPB.fields list
*/
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<>();
for (int fieldIdx = 0; fieldIdx < vidMapPB.getFieldsCount(); ++fieldIdx) {
fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx);
}
return fieldNameToIndexInVidFieldsList;
}

/**
* Update vid Protobuf object with new combine operation for field
*
* @param vidMapPB input vid object
* @param fieldNameToIndexInVidFieldsList name to index in list
* @param fieldName INFO field name
* @param newCombineOperation combine op ("sum", "median")
* @return updated vid Protobuf object if field exists, else null
*/
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
final String fieldName,
final String newCombineOperation) {
final int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
if (fieldIdx >= 0) {
//Would need to rebuild vidMapPB - so get top level builder first
final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
//To update the list element corresponding to fieldName, we get the builder for that specific list element
final GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder =
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
//And update its combine operation
fieldBuilder.setVCFFieldCombineOperation(newCombineOperation);

//Shorter way of writing the same operation
/*
updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
.setVCFFieldCombineOperation(newCombineOperation);
*/

//Rebuild full vidMap
return updatedVidMapBuilder.build();
}
return null;
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java
@@ -4,6 +4,7 @@
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

@@ -244,8 +245,8 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
* creates a name from the given filePath by finding the absolute path of the given input
*/
private static String makeIntoAbsolutePath(final String filePath){
if(FeatureDataSource.isGenomicsDBPath(filePath)){
return FeatureDataSource.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(FeatureDataSource.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
if(GenomicsDBUtils.isGenomicsDBPath(filePath)){
return GenomicsDBUtils.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(GenomicsDBUtils.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
} else if (URI.create(filePath).getScheme() != null) {
return IOUtils.getPath(filePath).toAbsolutePath().toUri().toString();
} else {