Skip to content

Commit

Permalink
Merge pull request #1806 from broadinstitute/ak_2bitReferenceFromBucket
Browse files Browse the repository at this point in the history
add BROADCAST tests and use 2bit reference from bucket
  • Loading branch information
akiezun committed May 14, 2016
2 parents 2c43d13 + ee484c7 commit 4780534
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ public void setTestVerbosity(){
// Absolute path of publicTestDirRelative resolved against gatkDirectory, always ending in "/".
public static final String publicTestDir = new File(gatkDirectory, publicTestDirRelative).getAbsolutePath() + "/";
// publicTestDir with the publicTestDirRelative portion removed.
public static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, "");

// Common "gs://" URL prefix shared by the bucket-hosted test resources declared below.
public static final String GSC_GATK_TEST_RESOURCES = "gs://hellbender/test/resources/";

// 2bit reference files located under GSC_GATK_TEST_RESOURCES.
// NOTE(review): judging by the names, the first is the b37 reference and the second its chr20/chr21 subset -- confirm.
public static final String GCS_b37_REFERENCE_2BIT = GSC_GATK_TEST_RESOURCES + "benchmark/human_g1k_v37.2bit";
public static final String GCS_b37_CHR20_21_REFERENCE_2BIT = GSC_GATK_TEST_RESOURCES + "human_g1k_v37.20.21.2bit";

/**
* LARGE FILES FOR TESTING (MANAGED BY GIT LFS)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ public Object[][] createBQSRTestData() {
final String dbSNPb37_chr2021 = dbsnp_138_b37_20_21_vcf;

final String hg19Chr171Mb = publicTestDir + "human_g1k_v37.chr17_1Mb.fasta";
final String hg19Chr171Mb_2bit = publicTestDir + "human_g1k_v37.chr17_1Mb.2bit";
final String HiSeqBam_chr17 = localResources + "NA12878.chr17_69k_70k.dictFix.bam";
final String dbSNPb37_chr17 = localResources + "dbsnp_132.b37.excluding_sites_after_129.chr17_69k_70k.vcf";
final String more17Sites = localResources + "bqsr.fakeSitesForTesting.b37.chr17.vcf";
Expand Down Expand Up @@ -102,14 +103,16 @@ public Object[][] createBQSRTestData() {
// multiple known sites with SHUFFLE; entire test case shared with walker version
{new BQSRTest(hg19Chr171Mb, HiSeqBam_chr17, dbSNPb37_chr17, " --joinStrategy SHUFFLE -knownSites " + more17Sites, getResourceDir() + "expected.NA12878.chr17_69k_70k.2inputs.txt")},

// multiple known sites with BROADCAST; entire test case shared with walker version
{new BQSRTest(hg19Chr171Mb_2bit, HiSeqBam_chr17, dbSNPb37_chr17, " --joinStrategy BROADCAST -knownSites " + more17Sites, getResourceDir() + "expected.NA12878.chr17_69k_70k.2inputs.txt")},

// local input/computation, 2Bit Reference, BROADCAST
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_1read, dbSNPb37_chr2021, "--joinStrategy BROADCAST", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "--joinStrategy BROADCAST", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "--joinStrategy BROADCAST --indels_context_size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "--joinStrategy BROADCAST --low_quality_tail 5", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "--joinStrategy BROADCAST --quantizing_levels 6", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "--joinStrategy BROADCAST --mismatches_context_size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "--joinStrategy BROADCAST --mismatches_context_size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)},
// multiple known sites with 2bit BROADCAST; same output used for multiple known sites SHUFFLE test above
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_20_21_100000, more20Sites, " -knownSites " + more21Sites, getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.ch21.10m-10m100.recal.txt")},
// Can't use 2 bit reference with a CRAM file: https://github.com/broadinstitute/gatk/issues/1443
Expand All @@ -119,11 +122,22 @@ public Object[][] createBQSRTestData() {
};
}

/**
 * Executes one case supplied by the "BQSRTest" data provider.
 *
 * The command line comes from {@code params.getCommandLineNoApiKey()} and is run as an
 * {@link IntegrationTestSpec} whose only expected file is {@code params.expectedFileName}.
 * The test name is "testBQSRSpark-" followed by the case's extra arguments.
 *
 * @param params the reference, input, known-sites, extra-argument and expected-file settings of the case
 * @throws IOException declared for the spec execution
 */
@Test(dataProvider = "BQSRTest", groups = "spark")
public void testBQSRSpark(BQSRTest params) throws IOException {
    final String commandLine = new ArgumentsBuilder()
            .add(params.getCommandLineNoApiKey())
            .getString();
    new IntegrationTestSpec(commandLine, Arrays.asList(params.expectedFileName))
            .executeTest("testBQSRSpark-" + params.args, this);
}

//This data provider is for tests that use reference (but not BAM) files stored in buckets
@DataProvider(name = "BQSRCloudTest")
public Object[][] createBQSRCloudTestData() {
final String localResources = getResourceDir();

final String GRCh37RefCloud = ReferenceAPISource.URL_PREFIX + ReferenceAPISource.GRCH37_REF_ID;
final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
final String hiSeqBam_chr20 = localResources + WGS_B37_CH20_1M_1M1K_BAM;
final String hiSeqBam_1read = localResources + "overlappingRead.bam";
final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF;
Expand All @@ -135,27 +149,26 @@ public Object[][] createBQSRCloudTestData() {
// See MathUtilsUniTest.testAddDoubles for a demonstration of how that can change the results.
// See RecalDatum for explanation of why the multiplier is needed.

// local input/computation, cloud reference, SHUFFLE
// local input/computation, using the GA4GH reference API, SHUFFLE
{new BQSRTest(GRCh37RefCloud, hiSeqBam_1read, dbSNPb37_chr2021, " --joinStrategy SHUFFLE", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)},
{new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE --indels_context_size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)},
{new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE --low_quality_tail 5", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)},
{new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE --quantizing_levels 6", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)},
{new BQSRTest(GRCh37RefCloud, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE --mismatches_context_size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)},
};
}

@Test(dataProvider = "BQSRTest", groups = "spark")
public void testBQSRSpark(BQSRTest params) throws IOException {
ArgumentsBuilder ab = new ArgumentsBuilder().add(params.getCommandLineNoApiKey());
IntegrationTestSpec spec = new IntegrationTestSpec(
ab.getString(),
Arrays.asList(params.expectedFileName));
spec.executeTest("testBQSRSpark-" + params.args, this);
// local input/computation, using a 2bit reference file in a GCS bucket, BROADCAST
{new BQSRTest(chr2021Reference2bit, hiSeqBam_1read, dbSNPb37_chr2021, " --joinStrategy BROADCAST", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)},
{new BQSRTest(chr2021Reference2bit, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy BROADCAST", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(chr2021Reference2bit, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy BROADCAST --indels_context_size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)},
{new BQSRTest(chr2021Reference2bit, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy BROADCAST --low_quality_tail 5", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)},
{new BQSRTest(chr2021Reference2bit, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy BROADCAST --quantizing_levels 6", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_QUANTIZING_LEVELS_6_RECAL)},
{new BQSRTest(chr2021Reference2bit, hiSeqBam_chr20, dbSNPb37_chr20, " --joinStrategy BROADCAST --mismatches_context_size 4", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_MISMATCHES_CONTEXT_SIZE_4_RECAL)},
};
}

@Test(dataProvider = "BQSRCloudTest", groups = {"spark", "cloud"})
public void testBQSRSparkCloud(BQSRTest params) throws IOException {
@Test(dataProvider = "BQSRCloudTest", groups = {"cloud", "spark"})
public void testBQSRSparkCloud(final BQSRTest params) throws IOException {
ArgumentsBuilder ab = new ArgumentsBuilder().add(params.getCommandLine());
IntegrationTestSpec spec = new IntegrationTestSpec(
ab.getString(),
Expand All @@ -179,16 +192,19 @@ public void testBlowUpOnBroadcastIncompatibleReference() throws IOException {
spec.executeTest("testBQSR-" + params.args, this);
}

//This data provider is for tests that use BAM files stored in buckets
@DataProvider(name = "BQSRTestBucket")
public Object[][] createBQSRTestDataBucket() {
final String GRCh37Ref = ReferenceAPISource.URL_PREFIX + ReferenceAPISource.GRCH37_REF_ID;
final String localResources = getResourceDir();
final String GRCh37RefCloud = ReferenceAPISource.URL_PREFIX + ReferenceAPISource.GRCH37_REF_ID;
final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
final String localResources = getResourceDir();
final String HiSeqBamCloud_chr20 = getCloudInputs() + WGS_B37_CH20_1M_1M1K_BAM;
final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF;

return new Object[][]{
// input in cloud, computation local.
{new BQSRTest(GRCh37Ref, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(GRCh37RefCloud, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --joinStrategy SHUFFLE", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(chr2021Reference2bit, HiSeqBamCloud_chr20, dbSNPb37_chr20, " --joinStrategy BROADCAST", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
};
}

Expand All @@ -199,29 +215,29 @@ public void testBQSRBucket(BQSRTest params) throws IOException {
IntegrationTestSpec spec = new IntegrationTestSpec(
ab.getString(),
Arrays.asList(params.expectedFileName));
spec.executeTest("testBQSR-" + params.args, this);
spec.executeTest("testBQSRBucket-" + params.args, this);
}

// TODO: This test is disabled because a new expected result needs to be created.
@Test(description = "This is to test https://github.com/broadinstitute/hellbender/issues/322", groups = {"spark", "cloud"}, enabled = false)
@Test(description = "This is to test https://github.com/broadinstitute/hellbender/issues/322", groups = {"cloud", "spark"}, enabled = false)
public void testPlottingWorkflow() throws IOException {
final String resourceDir = getTestDataDir() + "/" + "BQSR" + "/";
final String GRCh37Ref = ReferenceAPISource.GRCH37_REF_ID; // that's the "full" version
final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
final String dbSNPb37_chr2021 = resourceDir + DBSNP_138_B37_CH20_1M_1M1K_VCF;
final String HiSeqBam_chr20 = getResourceDir() + WGS_B37_CH20_1M_1M1K_BAM;

final File actualHiSeqBam_recalibrated = createTempFile("actual.recalibrated", ".bam");

final String tablePre = createTempFile("gatk4.pre.cols", ".table").getAbsolutePath();
final String argPre = " -R " + ReferenceAPISource.URL_PREFIX + GRCh37Ref + " -knownSites " + dbSNPb37_chr2021 + " -I " + HiSeqBam_chr20
final String argPre = " -R " + ReferenceAPISource.URL_PREFIX + chr2021Reference2bit + " -knownSites " + dbSNPb37_chr2021 + " -I " + HiSeqBam_chr20
+ " -O " + tablePre + " " + " --apiKey " + getGCPTestApiKey();
new BaseRecalibratorSpark().instanceMain(Utils.escapeExpressions(argPre));

final String argApply = "-I " + HiSeqBam_chr20 + " --bqsr_recal_file " + tablePre + " -O " + actualHiSeqBam_recalibrated.getAbsolutePath() + " --apiKey " + getGCPTestApiKey();
new ApplyBQSRSpark().instanceMain(Utils.escapeExpressions(argApply));

final File actualTablePost = createTempFile("gatk4.post.cols", ".table");
final String argsPost = " -R " + ReferenceAPISource.URL_PREFIX + GRCh37Ref + " -knownSites " + dbSNPb37_chr2021 + " -I " + actualHiSeqBam_recalibrated.getAbsolutePath()
final String argsPost = " -R " + ReferenceAPISource.URL_PREFIX + chr2021Reference2bit + " -knownSites " + dbSNPb37_chr2021 + " -I " + actualHiSeqBam_recalibrated.getAbsolutePath()
+ " -O " + actualTablePost.getAbsolutePath() + " " + " --apiKey " + getGCPTestApiKey();
new BaseRecalibratorSpark().instanceMain(Utils.escapeExpressions(argsPost));

Expand All @@ -238,11 +254,11 @@ public void testBQSRFailWithoutDBSNP() throws IOException {
final String resourceDir = getTestDataDir() + "/" + "BQSR" + "/";
final String localResources = getResourceDir();

final String GRCh37Ref = ReferenceAPISource.URL_PREFIX + ReferenceAPISource.GRCH37_REF_ID; // that's the "full" version
final String chr2021Reference2bit = GCS_b37_CHR20_21_REFERENCE_2BIT;
final String HiSeqBam_chr17 = resourceDir + "NA12878.chr17_69k_70k.dictFix.bam";

final String NO_DBSNP = "";
final BQSRTest params = new BQSRTest(GRCh37Ref, HiSeqBam_chr17, NO_DBSNP, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL);
final BQSRTest params = new BQSRTest(chr2021Reference2bit, HiSeqBam_chr17, NO_DBSNP, "", localResources + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL);
IntegrationTestSpec spec = new IntegrationTestSpec(
params.getCommandLine(),
1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;

public class BQSRPipelineSparkIntegrationTest extends CommandLineProgramTest {
Expand Down Expand Up @@ -75,17 +76,19 @@ public Object[][] createBQSRLocalRefTestData() {
//Note: these output files were created by running GATK3
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_20, ".bam", "--joinStrategy BROADCAST", getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recalibrated.DIQ.bam")},
{new BQSRTest(GRCh37Ref_2021, hiSeqBam_chr20, dbSNPb37_20, ".bam", "--joinStrategy SHUFFLE", getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recalibrated.DIQ.bam")},
{new BQSRTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, dbSNPb37_20, ".bam", "--joinStrategy BROADCAST", getResourceDir() + "expected.CEUTrio.HiSeq.WGS.b37.ch20.1m-1m1k.NA12878.recalibrated.DIQ.bam")},

//Output generated with GATK4 (resulting BAM has 4 differences with GATK3)
{new BQSRTest(b37_reference_20_21 , hiSeqBam_20_21_100000, more20Sites, ".bam", "--joinStrategy SHUFFLE -knownSites " + more21Sites, getResourceDir() + "expected.MultiSite.bqsr.pipeline.bam")},
{new BQSRTest(b37_reference_20_21 , hiSeqCram_20_21_100000, more20Sites, ".cram", "--joinStrategy SHUFFLE -knownSites " + more21Sites, getResourceDir() + "expected.MultiSite.bqsr.pipeline.cram")},
{new BQSRTest(b37_2bit_reference_20_21 , hiSeqBam_20_21_100000, more20Sites, ".bam", "--joinStrategy BROADCAST -knownSites " + more21Sites, getResourceDir() + "expected.MultiSite.bqsr.pipeline.bam")},
};
}

@Test(dataProvider = "BQSRLocalRefTest", groups = "spark")
public void testBQSRLocalRef(BQSRTest params) throws IOException {
File outFile = BaseTest.createTempFile("bqsrSparkPipelineTest", params.outputExtension);
final ArrayList<String> args = new ArrayList<>();
final List<String> args = new ArrayList<>();

args.add("-I");
args.add(new File(params.bam).getAbsolutePath());
Expand All @@ -101,7 +104,7 @@ public void testBQSRLocalRef(BQSRTest params) throws IOException {
args.add("--knownSites");
args.add(params.knownSites);
if (params.args != null) {
Stream.of(params.args.split(" ")).forEach(arg -> args.add(arg));
Stream.of(params.args.trim().split(" ")).forEach(args::add);
}

runCommandLine(args);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ public Object[][] createReadsPipelineSparkTestData() {
// Output generated with GATK4
{new PipelineTest(GRCh37Ref_2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--joinStrategy SHUFFLE --knownSites " + more20Sites, getResourceDir() + expectedMultipleKnownSites)},
{new PipelineTest(GRCh37Ref_2021, hiSeqCram_chr20, ".cram", dbSNPb37_20, "--joinStrategy SHUFFLE --knownSites " + more20Sites, getResourceDir() + expectedMultipleKnownSitesCram)},
{new PipelineTest(GRCh37Ref2bit_chr2021, hiSeqBam_chr20, ".bam", dbSNPb37_20, "--joinStrategy BROADCAST --knownSites " + more20Sites, getResourceDir() + expectedMultipleKnownSites)},
};
}

Expand All @@ -101,7 +102,7 @@ public void testReadsPipelineSpark(PipelineTest params) throws IOException {
args.add("--knownSites");
args.add(params.knownSites);
if (params.args != null) {
Stream.of(params.args.split(" ")).forEach(arg -> args.add(arg));
Stream.of(params.args.trim().split(" ")).forEach(args::add);
}

runCommandLine(args);
Expand Down
Binary file added src/test/resources/human_g1k_v37.chr17_1Mb.2bit
Binary file not shown.

0 comments on commit 4780534

Please sign in to comment.