Changes for Issues #442, #520, #528
Massive mega-change updating the AlleleParsers to extract AN, AC and HOM from the population frequency sources.
Add new Gnomad3 and Gnomad4 AlleleParsers.
Add new DirectoryArchive to enable parsing multiple VCF files in a single directory, e.g. for gnomad-v3 and gnomad-v4.
Remove the dbSNP, ExAC and ESP data sources, as these are now merged into gnomAD (dbSNP was used as the source of the 1K Genomes and TOPMed data).
Update hg19 to use gnomad-v2.1.
Update hg38 to use gnomad-v4 (#528).
Update pathogenicity sources to use dbNSFP v4.5a and include AlphaMissense (#520). Remove M-CAP, MPC and PrimateAI.
julesjacobsen committed Jan 25, 2024 · 1 parent b2c2ad8 · commit 8099fa0
Showing 58 changed files with 2,478 additions and 886 deletions.
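The headline change is that the allele parsers now carry through the raw AN (allele number), AC (allele count) and HOM (homozygote count) values from each population source, rather than only a pre-computed percentage frequency. As a rough illustration of what extracting those counts from a gnomAD-style VCF involves, here is a self-contained sketch with hypothetical types, not the project's actual AlleleParser API; it assumes the AN_<pop>/AC_<pop>/nhomalt_<pop> INFO keys used by recent gnomAD releases, where nhomalt corresponds to HOM:

import java.util.Map;

public class GnomadInfoSketch {

    // hypothetical stand-in for the per-population counts now stored per allele
    record Frequency(String population, int ac, int an, int hom, float af) {}

    static Frequency parsePopulation(Map<String, String> info, String pop) {
        int an = Integer.parseInt(info.getOrDefault("AN_" + pop, "0"));
        int ac = Integer.parseInt(info.getOrDefault("AC_" + pop, "0"));
        int hom = Integer.parseInt(info.getOrDefault("nhomalt_" + pop, "0"));
        // keep the raw counts - the percentage frequency can always be re-derived
        float af = an == 0 ? 0f : 100f * ac / an;
        return new Frequency(pop, ac, an, hom, af);
    }

    public static void main(String[] args) {
        Map<String, String> info = Map.of("AN_afr", "16256", "AC_afr", "12", "nhomalt_afr", "1");
        System.out.println(parsePopulation(info, "afr"));
    }
}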
@@ -231,36 +231,38 @@ class CommandLineJobReaderTest {
.build()
))
.frequencySources(ImmutableSet.of(
FrequencySource.THOUSAND_GENOMES,
FrequencySource.TOPMED,
// commented out values are legacy/founder populations and not required.
// Why keep them? I don't know, for historical purposes perhaps?
// FrequencySource.THOUSAND_GENOMES,
// FrequencySource.TOPMED,
FrequencySource.UK10K,

FrequencySource.ESP_AFRICAN_AMERICAN,
FrequencySource.ESP_EUROPEAN_AMERICAN,
FrequencySource.ESP_ALL,
// FrequencySource.ESP_AA,
// FrequencySource.ESP_EA,
// FrequencySource.ESP_ALL,

FrequencySource.EXAC_AFRICAN_INC_AFRICAN_AMERICAN,
FrequencySource.EXAC_AMERICAN,
FrequencySource.EXAC_SOUTH_ASIAN,
FrequencySource.EXAC_EAST_ASIAN,
FrequencySource.EXAC_FINNISH,
FrequencySource.EXAC_NON_FINNISH_EUROPEAN,
FrequencySource.EXAC_OTHER,
// FrequencySource.EXAC_AFRICAN_INC_AFRICAN_AMERICAN,
// FrequencySource.EXAC_AMERICAN,
// FrequencySource.EXAC_SOUTH_ASIAN,
// FrequencySource.EXAC_EAST_ASIAN,
// FrequencySource.EXAC_FINNISH,
// FrequencySource.EXAC_NON_FINNISH_EUROPEAN,
// FrequencySource.EXAC_OTHER,

FrequencySource.GNOMAD_E_AFR,
FrequencySource.GNOMAD_E_AMR,
FrequencySource.GNOMAD_E_EAS,
FrequencySource.GNOMAD_E_FIN,
// FrequencySource.GNOMAD_E_FIN,
FrequencySource.GNOMAD_E_NFE,
FrequencySource.GNOMAD_E_OTH,
// FrequencySource.GNOMAD_E_OTH,
FrequencySource.GNOMAD_E_SAS,

FrequencySource.GNOMAD_G_AFR,
FrequencySource.GNOMAD_G_AMR,
FrequencySource.GNOMAD_G_EAS,
FrequencySource.GNOMAD_G_FIN,
// FrequencySource.GNOMAD_G_FIN,
FrequencySource.GNOMAD_G_NFE,
FrequencySource.GNOMAD_G_OTH,
// FrequencySource.GNOMAD_G_OTH,
FrequencySource.GNOMAD_G_SAS
))
.pathogenicitySources(ImmutableSet.of(REVEL, MVP))
@@ -74,7 +74,7 @@ public class BuildCommand implements Callable<Integer> {
private Path buildClinVar;
@Option(names = "--transcripts", converter = TranscriptSourceConverter.class, split = ",", arity = "0..1", fallbackValue = "ensembl,refseq,ucsc", description = "List of transcript databases to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}")
private List<TranscriptSource> transcriptSources;
@Option(names = "--variants", split = ",", arity = "0..1", fallbackValue = "esp,exac,uk10k,topmed,dbsnp,gnomad-exome,gnomad-genome,dbnsfp", description = "List of variant data sources to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}")
@Option(names = "--variants", split = ",", arity = "0..1", fallbackValue = "esp,exac,uk10k,topmed,dbsnp,gnomad-exome,gnomad-genome,gnomad-mito,alfa,dbnsfp", description = "List of variant data sources to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}")
private List<String> variantSources;
@Option(names = "--genome", description = "Flag to trigger building of genome data.")
private boolean buildGenome;
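For reference, this is how picocli's fallbackValue interacts with split and arity = "0..1" in the option above: a bare --variants expands to the full fallback list, while an explicit comma-separated argument is split into its parts. A minimal runnable sketch (a hypothetical demo class with a trimmed fallback list, not part of this codebase):

import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import java.util.List;

@Command(name = "variants-demo")
class VariantsOptionDemo implements Runnable {

    @Option(names = "--variants", split = ",", arity = "0..1",
            fallbackValue = "gnomad-exome,gnomad-genome,gnomad-mito,alfa,dbnsfp")
    private List<String> variantSources;

    @Override
    public void run() {
        System.out.println(variantSources);
    }

    public static void main(String[] args) {
        // expected to print the fallback list: [gnomad-exome, gnomad-genome, gnomad-mito, alfa, dbnsfp]
        new CommandLine(new VariantsOptionDemo()).execute("--variants");
        // expected to print [gnomad-genome, dbnsfp]
        new CommandLine(new VariantsOptionDemo()).execute("--variants", "gnomad-genome,dbnsfp");
    }
}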
@@ -43,6 +43,9 @@
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -104,16 +107,20 @@ public void run() {
// e.g. V1.3.1__insert_sv_path
// truncate and drop original resource tables
// TODO: ADD gnomAD pLOF, pLI, HI, triplosensitivity, genic intolerance, gene constraint scores.
// // TODO: do we want to do this? Don't we want each resource to handle its own sources, parsing and writing?
// // e.g. dbVar has 3-4 nr_ source files, clinVar has one file for both assemblies and only the correct
// // lines should be converted and written.


//build genome.h2.db
Path databasePath = outputPath.resolve(String.format("%s_genome", buildInfo.getBuildString()));
DataSource dataSource = createDataSource(databasePath);
logger.info("Created database: {}", databasePath);
migrateDatabase(dataSource);
// try compacting the database once everything is loaded to reduce size on disk
try(Connection connection = dataSource.getConnection();
CallableStatement statement = connection.prepareCall("SHUTDOWN COMPACT;")) {
logger.info("Shutting down and compacting genome database");
statement.execute();
} catch (SQLException e) {
logger.error("", e);
}
logger.info("Finished importing genome data");
}
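
H2's SHUTDOWN COMPACT statement closes the database and rewrites the file to reclaim unused space, so issuing it once after all of the genome data has been imported keeps the on-disk size of the built database down.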

@@ -165,7 +172,7 @@ private void downloadResource(URL source, Path destination) {
}

private DataSource createDataSource(Path databasePath) {
String initSql = "MODE=PostgreSQL;LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0;MV_STORE=FALSE;";
String initSql = "MODE=PostgreSQL;LOG=0;CACHE_SIZE=65536;LOCK_MODE=0;UNDO_LOG=0;";
String url = String.format("jdbc:h2:file:%s;%s", databasePath.toAbsolutePath(), initSql);
return DataSourceBuilder.create()
.type(HikariDataSource.class)
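Dropping MV_STORE=FALSE from the init string means the H2 database is now created with H2's default MVStore engine rather than the legacy PageStore format.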
@@ -74,9 +74,6 @@ public void run() {
throw new IllegalStateException("Error writing to MVStore " + fileName, e);
}

MVMap<AlleleKey, AlleleProperties> alleleMVMap = MvStoreUtil.openAlleleMVMap(mvStore);
logger.info("Written {} alleles to store", alleleMVMap.size());

// super-important step for producing as small a store as possible - the store could double in size (or more?) while this is in progress
logger.info("Compacting store...");
MVStoreTool.compact(fileName, true);
@@ -78,10 +78,11 @@ public Map<String, AlleleResource> hg19AlleleResources() {

alleleResources.put("gnomad-genome", gnomadGenomeAlleleResource());
alleleResources.put("gnomad-exome", gnomadExomeAlleleResource());
// TOPMed removed as this is now part of dbSNP
alleleResources.put("dbsnp", dbSnpAlleleResource());
alleleResources.put("gnomad-mito", gnomadMitoAlleleResource());
// TOPMed removed as this is now part of gnomAD v2.1
// dbSNP removed as this mostly adds a lot of empty data with only rsids
alleleResources.put("uk10k", uk10kAlleleResource());
alleleResources.put("exac", exacAlleleResource());
// ExAC removed as this is part of gnomad-exomes
alleleResources.put("esp", espAlleleResource());
alleleResources.put("dbnsfp", dbnsfpAlleleResource());
// ClinVar removed - now handled as a separate data source
@@ -132,6 +133,10 @@ public GnomadExomeAlleleResource gnomadExomeAlleleResource() {
return alleleResource(GnomadExomeAlleleResource.class, "hg19.gnomad-exome");
}

public Gnomad3MitoAlleleResource gnomadMitoAlleleResource() {
return alleleResource(Gnomad3MitoAlleleResource.class, "hg19.gnomad-mito");
}

private List<SvResource> hg19SvResources(Path genomeProcessPath) {
return List.of(
clinvarSvResource(genomeProcessPath),
@@ -79,13 +79,16 @@ public Path genomeProcessPath() {
public Map<String, AlleleResource> hg38AlleleResources() {
ImmutableMap.Builder<String, AlleleResource> alleleResources = new ImmutableMap.Builder<>();

// thousand genomes removed as this is part of gnomAD v2.1
alleleResources.put("gnomad-genome", gnomadGenomeAlleleResource());
alleleResources.put("gnomad-exome", gnomadExomeAlleleResource());
// TOPMed removed as this is now part of dbSNP
alleleResources.put("dbsnp", dbSnpAlleleResource());
alleleResources.put("gnomad-mito", gnomadMitoAlleleResource());
alleleResources.put("alfa", alfaAlleleResource());
// TOPMed removed as this is part of gnomAD v2.1
// dbSNP removed as this mostly adds a lot of empty data with only rsids
alleleResources.put("uk10k", uk10kAlleleResource());
alleleResources.put("exac", exacAlleleResource());
alleleResources.put("esp", espAlleleResource());
// ExAC removed as this is part of gnomad-exomes v2.1
// ESP removed as this is part of gnomad-exomes v4
alleleResources.put("dbnsfp", dbnsfpAlleleResource());
// ClinVar removed - now handled as a separate data source

@@ -127,21 +130,30 @@ public Uk10kAlleleResource uk10kAlleleResource() {
return alleleResource(Uk10kAlleleResource.class, "hg38.uk10k");
}

public GnomadGenomeAlleleResource gnomadGenomeAlleleResource() {
return alleleResource(GnomadGenomeAlleleResource.class, "hg38.gnomad-genome");
public Gnomad4GenomeAlleleResource gnomadGenomeAlleleResource() {
return alleleResource(Gnomad4GenomeAlleleResource.class, "hg38.gnomad-genome");
}

public GnomadExomeAlleleResource gnomadExomeAlleleResource() {
return alleleResource(GnomadExomeAlleleResource.class, "hg38.gnomad-exome");
public Gnomad4ExomeAlleleResource gnomadExomeAlleleResource() {
return alleleResource(Gnomad4ExomeAlleleResource.class, "hg38.gnomad-exome");
}

public Gnomad3MitoAlleleResource gnomadMitoAlleleResource() {
return alleleResource(Gnomad3MitoAlleleResource.class, "hg38.gnomad-mito");
}

public AlfaAlleleResource alfaAlleleResource() {
return alleleResource(AlfaAlleleResource.class, "hg38.alfa");
}

public List<SvResource> hg38SvResources(Path genomeProcessPath) {
// gnomAD hg38 is part of dbVar, GoNL is hg19 only
return List.of(
clinvarSvResource(genomeProcessPath),
dbVarFrequencyResource(genomeProcessPath),
dgvSvResource(genomeProcessPath),
decipherSvResource(genomeProcessPath),
gonlSvFrequencyResource(genomeProcessPath),
// gonlSvFrequencyResource(genomeProcessPath),
gnomadSvFrequencyResource(genomeProcessPath)
);
}
@@ -28,14 +28,21 @@
import org.monarchinitiative.exomiser.core.proto.AlleleProto.ClinVar;
import org.monarchinitiative.exomiser.data.genome.model.Allele;
import org.monarchinitiative.exomiser.data.genome.model.AlleleProperty;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;

/**
* @author Jules Jacobsen <[email protected]>
*/
public class AlleleConverter {

private static final Logger logger = LoggerFactory.getLogger(AlleleConverter.class);

private AlleleConverter() {
//static utility class
}
@@ -50,24 +57,77 @@ public static AlleleKey toAlleleKey(Allele allele) {
}

public static AlleleProperties mergeProperties(AlleleProperties originalProperties, AlleleProperties properties) {
String updatedRsId = (originalProperties.getRsId()
.isEmpty()) ? properties.getRsId() : originalProperties.getRsId();
return AlleleProperties.newBuilder()
.mergeFrom(originalProperties)
.mergeFrom(properties)
//original rsid would have been overwritten by the new one - we don't necessarily want that, so re-set it now.
if (originalProperties.equals(properties)) {
return originalProperties;
}
logger.debug("Merging {} with {}", originalProperties, properties);
//original rsid would have been overwritten by the new one - we don't necessarily want that, so re-set it now.
String updatedRsId = originalProperties.getRsId().isEmpty() ? properties.getRsId() : originalProperties.getRsId();

// unfortunately since changing from a map to a list-based representation of the frequencies and pathogenicity scores
// this is more manual than simply calling .mergeFrom(originalProperties) / .mergeFrom(properties) as these will
// append the lists resulting in possible duplicates, hence we're merging them manually to avoid duplicates and
// overwrite any existing values with the newer version for that source.
Collection<AlleleProto.Frequency> mergedFrequencies = mergeFrequencies(originalProperties.getFrequenciesList(), properties.getFrequenciesList());
Collection<AlleleProto.PathogenicityScore> mergedPathScores = mergePathScores(originalProperties.getPathogenicityScoresList(), properties.getPathogenicityScoresList());

AlleleProperties.Builder mergedProperties = originalProperties.toBuilder();
if (!mergedProperties.hasClinVar() && properties.hasClinVar()) {
mergedProperties.setClinVar(properties.getClinVar());
}
return mergedProperties
.clearFrequencies()
.addAllFrequencies(mergedFrequencies)
.clearPathogenicityScores()
.addAllPathogenicityScores(mergedPathScores)
.setRsId(updatedRsId)
.build();
}

private static Collection<AlleleProto.PathogenicityScore> mergePathScores(List<AlleleProto.PathogenicityScore> originalPathScores, List<AlleleProto.PathogenicityScore> currentPathScores) {
if (originalPathScores.isEmpty()) {
return currentPathScores;
}
Map<AlleleProto.PathogenicitySource, AlleleProto.PathogenicityScore> mergedPaths = new EnumMap<>(AlleleProto.PathogenicitySource.class);
mergePaths(originalPathScores, mergedPaths);
mergePaths(currentPathScores, mergedPaths);
return mergedPaths.values();
}

private static void mergePaths(List<AlleleProto.PathogenicityScore> originalPathScores, Map<AlleleProto.PathogenicitySource, AlleleProto.PathogenicityScore> mergedPaths) {
for (int i = 0; i < originalPathScores.size(); i++) {
var pathScore = originalPathScores.get(i);
mergedPaths.put(pathScore.getPathogenicitySource(), pathScore);
}
}

private static Collection<AlleleProto.Frequency> mergeFrequencies(List<AlleleProto.Frequency> originalFreqs, List<AlleleProto.Frequency> currentFreqs) {
if (originalFreqs.isEmpty()) {
return currentFreqs;
}
Map<AlleleProto.FrequencySource, AlleleProto.Frequency> mergedFreqs = new EnumMap<>(AlleleProto.FrequencySource.class);
mergeFreqs(originalFreqs, mergedFreqs);
mergeFreqs(currentFreqs, mergedFreqs);
return mergedFreqs.values();
}

private static void mergeFreqs(List<AlleleProto.Frequency> originalFreqs, Map<AlleleProto.FrequencySource, AlleleProto.Frequency> mergedFreqs) {
for (int i = 0; i < originalFreqs.size(); i++) {
var freq = originalFreqs.get(i);
mergedFreqs.put(freq.getFrequencySource(), freq);
}
}

public static AlleleProperties toAlleleProperties(Allele allele) {
AlleleProperties.Builder builder = AlleleProperties.newBuilder();
builder.setRsId(allele.getRsId());
addAllelePropertyValues(builder, allele.getValues());
builder.addAllFrequencies(allele.getFrequencies());
builder.addAllPathogenicityScores(allele.getPathogenicityScores());
addClinVarData(builder, allele);
return builder.build();
}

@Deprecated(since = "14.0.0")
private static void addAllelePropertyValues(AlleleProperties.Builder builder, Map<AlleleProperty, Float> values) {
for (Map.Entry<AlleleProperty, Float> entry : values.entrySet()) {
builder.putProperties(entry.getKey().toString(), entry.getValue());
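The comment in mergeProperties above is the crux of this rewrite: protobuf's mergeFrom appends repeated fields, so naively merging two property sets would duplicate frequency and pathogenicity entries. The EnumMap-based helpers instead implement a last-writer-wins merge keyed by source. A self-contained sketch of the same idea, using toy stand-ins rather than the generated AlleleProto types:

import java.util.Collection;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;

public class MergeSketch {

    enum FrequencySource { GNOMAD_G_AFR, UK10K }

    // toy stand-in for AlleleProto.Frequency
    record Frequency(FrequencySource source, int ac, int an) {}

    static Collection<Frequency> merge(List<Frequency> original, List<Frequency> current) {
        if (original.isEmpty()) {
            return current;
        }
        Map<FrequencySource, Frequency> merged = new EnumMap<>(FrequencySource.class);
        original.forEach(f -> merged.put(f.source(), f));
        // a newer entry for the same source overwrites the older one instead of duplicating it
        current.forEach(f -> merged.put(f.source(), f));
        return merged.values();
    }

    public static void main(String[] args) {
        var original = List.of(new Frequency(FrequencySource.UK10K, 1, 7562));
        var current = List.of(new Frequency(FrequencySource.UK10K, 2, 7562),
                new Frequency(FrequencySource.GNOMAD_G_AFR, 5, 16256));
        // prints one UK10K entry (the newer one) plus the gnomAD entry
        System.out.println(merge(original, current));
    }
}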
@@ -38,4 +38,5 @@ default void index(Resource<T> resource) {

void write(T type);

long count();
}
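
The new count() method on the writer interface presumably stands in for the alleleMVMap.size() logging removed from the MvStore writer above, letting each writer report how many records it has written.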