From 14915b21c9ed8d31821468b7482d6ff45a72deb1 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Fri, 2 Feb 2024 01:27:23 +0000 Subject: [PATCH 01/13] ingest assembly in batches --- .../contigalias/ContigAliasApplication.java | 2 + .../datasource/ENAAssemblyDataSource.java | 46 +++++- .../datasource/NCBIAssemblyDataSource.java | 51 +++++++ .../dus/ENAAssemblyReportReader.java | 46 ++++-- .../dus/NCBIAssemblyReportReader.java | 120 +++++++++++---- .../contigalias/scheduler/ChecksumSetter.java | 5 +- .../contigalias/service/AssemblyService.java | 141 ++++++++++++++++-- .../AssemblyServiceIntegrationTest.java | 6 +- 8 files changed, 357 insertions(+), 60 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java b/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java index 3f286d2e..bd1f1109 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java @@ -23,10 +23,12 @@ import org.springframework.hateoas.config.EnableHypermediaSupport; import org.springframework.retry.annotation.EnableRetry; import org.springframework.scheduling.annotation.EnableScheduling; +import org.springframework.transaction.annotation.EnableTransactionManagement; @EnableScheduling @SpringBootApplication @EnableRetry +@EnableTransactionManagement @EnableHypermediaSupport(type = EnableHypermediaSupport.HypermediaType.HAL) public class ContigAliasApplication extends SpringBootServletInitializer { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index 37a5c791..eafa94a6 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -39,6 +39,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -96,6 +97,24 @@ public Optional getAssemblyByAccession(String accession) throws } + public Optional downloadAssemblyReport(String accession) throws IOException { + ENABrowser enaBrowser = factory.build(); + enaBrowser.connect(); + Optional downloadPath; + try { + enaBrowser.connect(); + downloadPath = downloadAssemblyReport(enaBrowser, accession); + } finally { + try { + enaBrowser.disconnect(); + } catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ") : " + e); + } + } + + return downloadPath; + } + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) public Optional downloadAssemblyReport(ENABrowser enaBrowser, String accession) throws IOException { String dirPath = enaBrowser.getAssemblyDirPath(accession); @@ -105,18 +124,37 @@ public Optional downloadAssemblyReport(ENABrowser enaBrowser, String acces try { boolean success = enaBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); if (success) { - logger.info("ENA assembly report downloaded successfully for accession "+ accession); + logger.info("ENA assembly report downloaded successfully for accession " + accession); return Optional.of(downloadFilePath); } else { - logger.warn("ENA assembly report could not be downloaded successfully for accession "+accession); + logger.warn("ENA assembly report could not be downloaded successfully for accession " + accession); return Optional.empty(); } } catch (IOException | DownloadFailedException e) { - logger.warn("Error downloading ENA assembly report for accession "+ accession + e); + logger.warn("Error downloading ENA assembly report for accession " + accession + e); return Optional.empty(); } } + public List getChromosomeEntityList(AssemblyEntity assemblyEntity, List chrDataList) { + List chromosomeEntityList = new ArrayList<>(); + for (String chrData : chrDataList) { + ChromosomeEntity chromosomeEntity = getChromosomeEntity(assemblyEntity, chrData); + if (chromosomeEntity != null) { + chromosomeEntityList.add(chromosomeEntity); + } + } + return chromosomeEntityList; + } + + public ChromosomeEntity getChromosomeEntity(AssemblyEntity assemblyEntity, String chrLine) { + ChromosomeEntity chromosomeEntity = ENAAssemblyReportReader.getChromosomeEntity(chrLine); + if (chromosomeEntity != null) { + chromosomeEntity.setAssembly(assemblyEntity); + } + return chromosomeEntity; + } + /** * Adds ENA sequence names to chromosomes and scaffolds in an assembly. Will modify the AssemblyEntity in-place. * @@ -144,7 +182,7 @@ public boolean hasAllEnaSequenceNames(AssemblyEntity assembly) { return chromosomes.stream().allMatch(sequence -> sequence.getEnaSequenceName() != null); } - private void addENASequenceNames( + public void addENASequenceNames( List sourceSequences, List targetSequences) { Map insdcToSequenceEntity = new HashMap<>(); for (SequenceEntity targetSeq : targetSequences) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index 08226ff2..9a42031f 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -29,6 +29,7 @@ import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import java.io.FileInputStream; import java.io.IOException; @@ -36,7 +37,10 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; @Repository("NCBIDataSource") public class NCBIAssemblyDataSource implements AssemblyDataSource { @@ -85,6 +89,53 @@ public Optional getAssemblyByAccession( return Optional.of(assemblyEntity); } + public AssemblyEntity getAssemblyEntity(Path downloadFilePath) throws IOException { + List asmDataLines = Files.lines(downloadFilePath) + .filter(line -> line.startsWith("#")) + .collect(Collectors.toList()); + return getAssemblyEntity(asmDataLines); + } + + public AssemblyEntity getAssemblyEntity(List asmDataLines) { + return NCBIAssemblyReportReader.getAssemblyEntity(asmDataLines); + } + + public List getChromosomeEntityList(AssemblyEntity assemblyEntity, List chrDataList) { + List chromosomeEntityList = new ArrayList<>(); + for (String chrData : chrDataList) { + ChromosomeEntity chromosomeEntity = getChromosomeEntity(assemblyEntity, chrData); + if (chromosomeEntity != null) { + chromosomeEntityList.add(chromosomeEntity); + } + } + return chromosomeEntityList; + } + + public ChromosomeEntity getChromosomeEntity(AssemblyEntity assemblyEntity, String chrLine) { + ChromosomeEntity chromosomeEntity = NCBIAssemblyReportReader.getChromosomeEntity(chrLine); + if (chromosomeEntity != null) { + chromosomeEntity.setAssembly(assemblyEntity); + } + return chromosomeEntity; + } + + public Optional downloadAssemblyReport(String accession) throws IOException { + NCBIBrowser ncbiBrowser = factory.build(); + Optional downloadPath; + try { + ncbiBrowser.connect(); + downloadPath = downloadAssemblyReport(accession, ncbiBrowser); + } finally { + try { + ncbiBrowser.disconnect(); + } catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ") : " + e); + } + } + + return downloadPath; + } + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) public Optional downloadAssemblyReport(String accession, NCBIBrowser ncbiBrowser) throws IOException { Optional directory = ncbiBrowser.getGenomeReportDirectory(accession); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java index af5be00e..29fb45e1 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java @@ -59,13 +59,11 @@ protected void parseReport() throws IOException, NullPointerException { } // Not present in ENA assembly reports - protected void parseAssemblyData(String line) {} + protected void parseAssemblyData(String line) { + } protected void parseChromosomeLine(String[] columns) { - ChromosomeEntity chromosomeEntity = new ChromosomeEntity(); - - chromosomeEntity.setInsdcAccession(columns[0]); - chromosomeEntity.setEnaSequenceName(columns[1]); + ChromosomeEntity chromosomeEntity = getChromosome(columns); if (assemblyEntity == null) { assemblyEntity = new AssemblyEntity(); @@ -82,10 +80,7 @@ protected void parseChromosomeLine(String[] columns) { } protected void parseScaffoldLine(String[] columns) { - ChromosomeEntity scaffoldEntity = new ChromosomeEntity(); - - scaffoldEntity.setInsdcAccession(columns[0]); - scaffoldEntity.setEnaSequenceName(columns[1]); + ChromosomeEntity scaffoldEntity = getScaffold(columns); if (assemblyEntity == null) { assemblyEntity = new AssemblyEntity(); @@ -101,4 +96,37 @@ protected void parseScaffoldLine(String[] columns) { scaffolds.add(scaffoldEntity); } + public static ChromosomeEntity getChromosomeEntity(String line) { + if (!line.startsWith("accession")) { + String[] columns = line.split("\t", -1); + if (columns.length >= 6) { + if (columns[5].equals("Chromosome") && columns[3].equals("assembled-molecule")) { + return getChromosome(columns); + } else { + return getScaffold(columns); + } + } + } + + return null; + } + + public static ChromosomeEntity getChromosome(String[] columns) { + ChromosomeEntity chromosomeEntity = new ChromosomeEntity(); + chromosomeEntity.setInsdcAccession(columns[0]); + chromosomeEntity.setEnaSequenceName(columns[1]); + chromosomeEntity.setContigType(SequenceEntity.ContigType.CHROMOSOME); + + return chromosomeEntity; + } + + public static ChromosomeEntity getScaffold(String[] columns) { + ChromosomeEntity scaffoldEntity = new ChromosomeEntity(); + scaffoldEntity.setInsdcAccession(columns[0]); + scaffoldEntity.setEnaSequenceName(columns[1]); + scaffoldEntity.setContigType(SequenceEntity.ContigType.SCAFFOLD); + + return scaffoldEntity; + } + } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java index 28ad9329..f2c0d502 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java @@ -24,6 +24,8 @@ import java.io.InputStreamReader; import java.util.LinkedList; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; public class NCBIAssemblyReportReader extends AssemblyReportReader { @@ -95,6 +97,93 @@ protected void parseAssemblyData(String line) { } protected void parseChromosomeLine(String[] columns) { + ChromosomeEntity chromosomeEntity = getChromosome(columns); + + if (assemblyEntity == null) { + assemblyEntity = new AssemblyEntity(); + } + chromosomeEntity.setAssembly(this.assemblyEntity); + + List chromosomes = this.assemblyEntity.getChromosomes(); + if (chromosomes == null) { + chromosomes = new LinkedList<>(); + assemblyEntity.setChromosomes(chromosomes); + } + chromosomes.add(chromosomeEntity); + } + + protected void parseScaffoldLine(String[] columns) { + ChromosomeEntity scaffoldEntity = getScaffold(columns); + + if (assemblyEntity == null) { + assemblyEntity = new AssemblyEntity(); + } + scaffoldEntity.setAssembly(this.assemblyEntity); + + List scaffolds = this.assemblyEntity.getChromosomes(); + if (scaffolds == null) { + scaffolds = new LinkedList<>(); + assemblyEntity.setChromosomes(scaffolds); + } + scaffolds.add(scaffoldEntity); + } + + public static AssemblyEntity getAssemblyEntity(List lines) { + Map tagAndValuesMap = lines.stream() + .filter(line -> line.startsWith("#")) + .filter(line -> line.indexOf(':') != -1) + .collect(Collectors.toMap(l -> l.substring(2, l.indexOf(':')), l -> l.substring(l.indexOf(':') + 1).trim())); + + AssemblyEntity asmEntity = new AssemblyEntity(); + for (Map.Entry entry : tagAndValuesMap.entrySet()) { + String tag = entry.getKey(); + String tagData = entry.getValue(); + switch (tag) { + case "Assembly name": { + asmEntity.setName(tagData); + break; + } + case "Organism name": { + asmEntity.setOrganism(tagData); + break; + } + case "Taxid": { + asmEntity.setTaxid(Long.parseLong(tagData)); + break; + } + case "GenBank assembly accession": { + asmEntity.setInsdcAccession(tagData); + break; + } + case "RefSeq assembly accession": { + asmEntity.setRefseq(tagData); + break; + } + case "RefSeq assembly and GenBank assemblies identical": { + asmEntity.setGenbankRefseqIdentical(tagData.equals("yes")); + break; + } + } + } + + return asmEntity; + } + + public static ChromosomeEntity getChromosomeEntity(String line) { + String[] columns = line.split("\t", -1); + if (columns.length >= 6 && (columns[5].equals("=") || columns[5].equals("<>")) && + (columns[4] != null && !columns[4].isEmpty() && !columns[4].equals("na"))) { + if (columns[3].equals("Chromosome") && columns[1].equals("assembled-molecule")) { + return getChromosome(columns); + } else { + return getScaffold(columns); + } + } + + return null; + } + + public static ChromosomeEntity getChromosome(String[] columns) { ChromosomeEntity chromosomeEntity = new ChromosomeEntity(); chromosomeEntity.setGenbankSequenceName(columns[0]); @@ -104,7 +193,6 @@ protected void parseChromosomeLine(String[] columns) { } else { chromosomeEntity.setRefseq(columns[6]); } - if (columns.length > 8) { try { Long seqLength = Long.parseLong(columns[8]); @@ -113,26 +201,15 @@ protected void parseChromosomeLine(String[] columns) { } } - if (columns.length > 9 && !columns[9].equals("na")) { chromosomeEntity.setUcscName(columns[9]); } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - chromosomeEntity.setAssembly(this.assemblyEntity); chromosomeEntity.setContigType(SequenceEntity.ContigType.CHROMOSOME); - List chromosomes = this.assemblyEntity.getChromosomes(); - if (chromosomes == null) { - chromosomes = new LinkedList<>(); - assemblyEntity.setChromosomes(chromosomes); - } - chromosomes.add(chromosomeEntity); + return chromosomeEntity; } - protected void parseScaffoldLine(String[] columns) { + public static ChromosomeEntity getScaffold(String[] columns) { ChromosomeEntity scaffoldEntity = new ChromosomeEntity(); scaffoldEntity.setGenbankSequenceName(columns[0]); @@ -142,7 +219,6 @@ protected void parseScaffoldLine(String[] columns) { } else { scaffoldEntity.setRefseq(columns[6]); } - if (columns.length > 8) { try { Long seqLength = Long.parseLong(columns[8]); @@ -151,27 +227,15 @@ protected void parseScaffoldLine(String[] columns) { } } - - if (columns.length >= 10) { String ucscName = columns[9]; if (!ucscName.equals("na")) { scaffoldEntity.setUcscName(ucscName); } } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - scaffoldEntity.setAssembly(this.assemblyEntity); scaffoldEntity.setContigType(SequenceEntity.ContigType.SCAFFOLD); - List scaffolds = this.assemblyEntity.getChromosomes(); - if (scaffolds == null) { - scaffolds = new LinkedList<>(); - assemblyEntity.setChromosomes(scaffolds); - } - scaffolds.add(scaffoldEntity); + return scaffoldEntity; } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java index 27bb2b92..564b50ca 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java @@ -5,7 +5,6 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.jdbc.core.ResultSetExtractor; -import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; @@ -38,7 +37,7 @@ public ChecksumSetter(ChromosomeService chromosomeService, Md5ChecksumRetriever this.jdbcTemplate = jdbcTemplate; } - @Scheduled(cron = "0 0 1 ? * TUE") + //@Scheduled(cron = "0 0 1 ? * TUE") public void updateMd5CheckSumForAllAssemblies() { List assemblyList = chromosomeService.getAssembliesWhereChromosomeMd5ChecksumIsNull(); logger.info("List of assemblies to be updated for MD5 Checksum: " + assemblyList); @@ -111,6 +110,8 @@ public void updateMD5ChecksumForAllChromosomesInAssembly(String assembly) { logger.info("Chromosomes Updated till now: " + chromosomeUpdated); } + logger.info("Finished updating md5checksum for assembly: " + assembly); + return null; }); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index e97adc2f..3d2a509c 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -29,10 +29,15 @@ import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblyException; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; +import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; import javax.transaction.Transactional; +import java.io.BufferedReader; +import java.io.FileReader; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -45,7 +50,9 @@ @Service public class AssemblyService { - private final AssemblyRepository repository; + private final AssemblyRepository assemblyRepository; + + private final ChromosomeRepository chromosomeRepository; private final NCBIAssemblyDataSource ncbiDataSource; @@ -53,48 +60,150 @@ public class AssemblyService { private final ChecksumSetter checksumSetter; + private final int BATCH_SIZE = 100; + private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); @Autowired - public AssemblyService(AssemblyRepository repository, NCBIAssemblyDataSource ncbiDataSource, - ENAAssemblyDataSource enaDataSource, ChecksumSetter checksumSetter) { - this.repository = repository; + public AssemblyService(AssemblyRepository repository, ChromosomeRepository chromosomeRepository, + NCBIAssemblyDataSource ncbiDataSource, ENAAssemblyDataSource enaDataSource, + ChecksumSetter checksumSetter) { + this.assemblyRepository = repository; + this.chromosomeRepository = chromosomeRepository; this.ncbiDataSource = ncbiDataSource; this.enaDataSource = enaDataSource; this.checksumSetter = checksumSetter; } public Optional getAssemblyByInsdcAccession(String insdcAccession) { - Optional entity = repository.findAssemblyEntityByInsdcAccession(insdcAccession); + Optional entity = assemblyRepository.findAssemblyEntityByInsdcAccession(insdcAccession); stripAssemblyFromChromosomes(entity); return entity; } public Optional getAssemblyByRefseq(String refseq) { - Optional entity = repository.findAssemblyEntityByRefseq(refseq); + Optional entity = assemblyRepository.findAssemblyEntityByRefseq(refseq); stripAssemblyFromChromosomes(entity); return entity; } public Page getAssembliesByTaxid(long taxid, Pageable request) { - Page page = repository.findAssemblyEntitiesByTaxid(taxid, request); + Page page = assemblyRepository.findAssemblyEntitiesByTaxid(taxid, request); page.forEach(this::stripAssemblyFromChromosomes); return page; } public void putAssemblyChecksumsByAccession(String accession, String md5, String trunc512) { - Optional entity = repository.findAssemblyEntityByAccession(accession); + Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (!entity.isPresent()) { throw new IllegalArgumentException( "No assembly corresponding to accession " + accession + " found in the database"); } AssemblyEntity assemblyEntity = entity.get(); assemblyEntity.setMd5checksum(md5).setTrunc512checksum(trunc512); - repository.save(assemblyEntity); + assemblyRepository.save(assemblyEntity); } public void fetchAndInsertAssembly(String accession) throws IOException { - Optional entity = repository.findAssemblyEntityByAccession(accession); + // check if assembly already exists in db + Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); + if (entity.isPresent()) { + throw duplicateAssemblyInsertionException(accession, entity.get()); + } + + Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); + Path downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(accession); + Path downloadedENAFilePath = downloadENAFilePathOpt.orElse(null); + + long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); + logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); + + // parse file and save data + parseFileAndInsertAssembly(downloadedNCBIFilePath, downloadedENAFilePath); + logger.info("Successfully inserted assembly for accession " + accession); + + // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) + checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); + + Files.deleteIfExists(downloadedNCBIFilePath); + if (downloadedENAFilePath != null) { + Files.deleteIfExists(downloadedENAFilePath); + } + } + + //TODO: put it somewhere else where transaction works + @Transactional + public void parseFileAndInsertAssembly(Path downloadedNCBIFilePath, Path downloadedENAFilePath) throws IOException { + AssemblyEntity assemblyEntity = ncbiDataSource.getAssemblyEntity(downloadedNCBIFilePath); + assemblyRepository.save(assemblyEntity); + + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { + List chrLines = new ArrayList<>(); + String line; + long chromosomesSavedTillNow = 0; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == BATCH_SIZE) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + if (downloadedENAFilePath != null) { + addENASequenceNameToChromosomes(assemblyEntity, chromosomeEntityList, downloadedENAFilePath); + } + chromosomeRepository.saveAll(chromosomeEntityList); + chromosomesSavedTillNow += chromosomeEntityList.size(); + logger.info("Number of total chromosomes saved till now : " + chromosomesSavedTillNow); + + chrLines = new ArrayList<>(); + } + } + + if (!chrLines.isEmpty()) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + if (downloadedENAFilePath != null) { + addENASequenceNameToChromosomes(assemblyEntity, chromosomeEntityList, downloadedENAFilePath); + } + chromosomeRepository.saveAll(chromosomeEntityList); + chromosomesSavedTillNow += chromosomeEntityList.size(); + logger.info("Number of total chromosomes saved till now : " + chromosomesSavedTillNow); + } + } + } + + public void addENASequenceNameToChromosomes(AssemblyEntity assemblyEntity, List ncbiChromosomeList, + Path downloadedENAFilePath) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("accession")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == BATCH_SIZE) { + List enaChromosomeList = enaDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + enaDataSource.addENASequenceNames( + !enaChromosomeList.isEmpty() ? enaChromosomeList : Collections.emptyList(), + !ncbiChromosomeList.isEmpty() ? ncbiChromosomeList : Collections.emptyList() + ); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + List enaChromosomeList = enaDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + enaDataSource.addENASequenceNames( + !enaChromosomeList.isEmpty() ? enaChromosomeList : Collections.emptyList(), + !ncbiChromosomeList.isEmpty() ? ncbiChromosomeList : Collections.emptyList() + ); + } + } + } + + public void fetchAndInsertAssemblyOld(String accession) throws IOException { + Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (entity.isPresent()) { throw duplicateAssemblyInsertionException(accession, entity.get()); } @@ -127,7 +236,7 @@ public Map> getMD5ChecksumUpdateTaskStatus() { } public Optional getAssemblyByAccession(String accession) { - Optional entity = repository.findAssemblyEntityByAccession(accession); + Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (entity.isPresent()) { stripAssemblyFromChromosomes(entity); return entity; @@ -157,7 +266,7 @@ public void insertAssembly(AssemblyEntity entity) { if (isEntityPresent(entity)) { throw duplicateAssemblyInsertionException(null, entity); } else { - repository.save(entity); + assemblyRepository.save(entity); } } @@ -168,7 +277,7 @@ public boolean isEntityPresent(AssemblyEntity entity) { if (insdcAccession == null && refseq == null) { return false; } - Optional existingAssembly = repository.findAssemblyEntityByInsdcAccessionOrRefseq( + Optional existingAssembly = assemblyRepository.findAssemblyEntityByInsdcAccessionOrRefseq( // Setting to invalid prevents finding random accessions with null GCA/GCF insdcAccession == null ? "##########" : insdcAccession, refseq == null ? "##########" : refseq); @@ -197,11 +306,11 @@ public Map> fetchAndInsertAssembly(List accessions) } public void deleteAssemblyByInsdcAccession(String insdcAccession) { - repository.deleteAssemblyEntityByInsdcAccession(insdcAccession); + assemblyRepository.deleteAssemblyEntityByInsdcAccession(insdcAccession); } public void deleteAssemblyByRefseq(String refseq) { - repository.deleteAssemblyEntityByRefseq(refseq); + assemblyRepository.deleteAssemblyEntityByRefseq(refseq); } public void deleteAssemblyByAccession(String accession) { @@ -210,7 +319,7 @@ public void deleteAssemblyByAccession(String accession) { } public void deleteAssembly(AssemblyEntity entity) { - repository.delete(entity); + assemblyRepository.delete(entity); } private DuplicateAssemblyException duplicateAssemblyInsertionException(String accession, AssemblyEntity present) { diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java index d713e2e9..311c6952 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java @@ -31,6 +31,7 @@ import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; +import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; import java.io.IOException; @@ -57,6 +58,9 @@ public class AssemblyServiceIntegrationTest { @Autowired AssemblyRepository repository; + @Autowired + ChromosomeRepository chromosomeRepository; + @Autowired private AssemblyService service; @@ -75,7 +79,7 @@ void setup() throws IOException { Mockito.when(mockChecksumSetter.updateMd5CheckSumForAssemblyAsync(generate.getInsdcAccession())) .thenReturn(new CompletableFuture<>()); } - service = new AssemblyService(repository, mockNcbiDataSource, mockEnaDataSource, mockChecksumSetter); + service = new AssemblyService(repository, chromosomeRepository, mockNcbiDataSource, mockEnaDataSource, mockChecksumSetter); } @AfterEach From 2534ee240a19869333c271e69b9cf199f34940cc Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Fri, 2 Feb 2024 08:41:17 +0000 Subject: [PATCH 02/13] put right batch --- .../uk/ac/ebi/eva/contigalias/service/AssemblyService.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index 3d2a509c..cedad343 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -60,7 +60,7 @@ public class AssemblyService { private final ChecksumSetter checksumSetter; - private final int BATCH_SIZE = 100; + private final int BATCH_SIZE = 100000; private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); @@ -132,7 +132,6 @@ public void fetchAndInsertAssembly(String accession) throws IOException { } } - //TODO: put it somewhere else where transaction works @Transactional public void parseFileAndInsertAssembly(Path downloadedNCBIFilePath, Path downloadedENAFilePath) throws IOException { AssemblyEntity assemblyEntity = ncbiDataSource.getAssemblyEntity(downloadedNCBIFilePath); From c6315523e16d2d060211fcb10399fa413beb362a Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Fri, 2 Feb 2024 11:24:42 +0000 Subject: [PATCH 03/13] fix transactional issue --- .../datasource/ENAAssemblyDataSource.java | 55 +++++--- .../datasource/NCBIAssemblyDataSource.java | 64 ++++++++++ .../dus/ENAAssemblyReportReader.java | 6 + .../dus/NCBIAssemblyReportReader.java | 6 + .../contigalias/service/AssemblyService.java | 117 +----------------- 5 files changed, 117 insertions(+), 131 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index eafa94a6..b4c01557 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -33,7 +33,9 @@ import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; import uk.ac.ebi.eva.contigalias.exception.DownloadFailedException; +import java.io.BufferedReader; import java.io.FileInputStream; +import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -41,11 +43,11 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.stream.Collectors; @Repository("ENADataSource") public class ENAAssemblyDataSource implements AssemblyDataSource { @@ -136,10 +138,10 @@ public Optional downloadAssemblyReport(ENABrowser enaBrowser, String acces } } - public List getChromosomeEntityList(AssemblyEntity assemblyEntity, List chrDataList) { + public List getChromosomeEntityList(List chrDataList) { List chromosomeEntityList = new ArrayList<>(); for (String chrData : chrDataList) { - ChromosomeEntity chromosomeEntity = getChromosomeEntity(assemblyEntity, chrData); + ChromosomeEntity chromosomeEntity = getChromosomeEntity(chrData); if (chromosomeEntity != null) { chromosomeEntityList.add(chromosomeEntity); } @@ -147,12 +149,8 @@ public List getChromosomeEntityList(AssemblyEntity assemblyEnt return chromosomeEntityList; } - public ChromosomeEntity getChromosomeEntity(AssemblyEntity assemblyEntity, String chrLine) { - ChromosomeEntity chromosomeEntity = ENAAssemblyReportReader.getChromosomeEntity(chrLine); - if (chromosomeEntity != null) { - chromosomeEntity.setAssembly(assemblyEntity); - } - return chromosomeEntity; + public ChromosomeEntity getChromosomeEntity(String chrLine) { + return ENAAssemblyReportReader.getChromosomeEntity(chrLine); } /** @@ -184,16 +182,41 @@ public boolean hasAllEnaSequenceNames(AssemblyEntity assembly) { public void addENASequenceNames( List sourceSequences, List targetSequences) { - Map insdcToSequenceEntity = new HashMap<>(); - for (SequenceEntity targetSeq : targetSequences) { - insdcToSequenceEntity.put(targetSeq.getInsdcAccession(), targetSeq); + if (targetSequences == null || sourceSequences == null || targetSequences.isEmpty() || sourceSequences.isEmpty()) { + return; } + Map insdcToSequenceEntityMap = targetSequences.stream() + .collect(Collectors.toMap(s->s.getInsdcAccession(), s->s)); + for (SequenceEntity sourceSeq : sourceSequences) { String sourceInsdcAccession = sourceSeq.getInsdcAccession(); - if (insdcToSequenceEntity.containsKey(sourceInsdcAccession)) { - insdcToSequenceEntity.get(sourceInsdcAccession).setEnaSequenceName(sourceSeq.getEnaSequenceName()); - } else { - insdcToSequenceEntity.put(sourceInsdcAccession, sourceSeq); + if (insdcToSequenceEntityMap.containsKey(sourceInsdcAccession)) { + insdcToSequenceEntityMap.get(sourceInsdcAccession).setEnaSequenceName(sourceSeq.getEnaSequenceName()); + } + } + } + + public void addENASequenceNameToChromosomes(List ncbiChromosomeList, + Path downloadedENAFilePath, final int BATCH_SIZE) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { + List chrLines = new ArrayList<>(); + List enaChromosomeList; + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("accession")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == BATCH_SIZE) { + enaChromosomeList = getChromosomeEntityList(chrLines); + addENASequenceNames(enaChromosomeList, ncbiChromosomeList); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + enaChromosomeList = getChromosomeEntityList(chrLines); + addENASequenceNames(enaChromosomeList, ncbiChromosomeList); } } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index 9a42031f..f798dd66 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -30,8 +30,14 @@ import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; +import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; +import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; +import javax.transaction.Transactional; +import java.io.BufferedReader; import java.io.FileInputStream; +import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -47,6 +53,8 @@ public class NCBIAssemblyDataSource implements AssemblyDataSource { private final Logger logger = LoggerFactory.getLogger(NCBIAssemblyDataSource.class); + private final int BATCH_SIZE = 100000; + private final NCBIBrowserFactory factory; private final NCBIAssemblyReportReaderFactory readerFactory; @@ -157,4 +165,60 @@ public Optional downloadAssemblyReport(String accession, NCBIBrowser ncbiB } } + @Transactional + public void parseFileAndInsertAssembly(String accession, ENAAssemblyDataSource enaDataSource, + AssemblyRepository assemblyRepository, ChromosomeRepository chromosomeRepository) throws IOException { + Optional downloadNCBIFilePathOpt = downloadAssemblyReport(accession); + Path downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(accession); + Path downloadedENAFilePath = downloadENAFilePathOpt.orElse(null); + + long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); + logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); + + AssemblyEntity assemblyEntity = getAssemblyEntity(downloadedNCBIFilePath); + assemblyRepository.save(assemblyEntity); + + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { + long chromosomesSavedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == BATCH_SIZE) { + // add ena sequence name and save + addENASequenceNameAndSave(assemblyEntity, chrLines, enaDataSource, downloadedENAFilePath, chromosomeRepository); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + // add ena sequence name and save + addENASequenceNameAndSave(assemblyEntity, chrLines, enaDataSource, downloadedENAFilePath, chromosomeRepository); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + } + } + + // delete the files after assembly insertion + Files.deleteIfExists(downloadedNCBIFilePath); + if (downloadedENAFilePath != null) { + Files.deleteIfExists(downloadedENAFilePath); + } + } + + public void addENASequenceNameAndSave(AssemblyEntity assemblyEntity, List chrLines, + ENAAssemblyDataSource enaDataSource, Path downloadedENAFilePath, + ChromosomeRepository chromosomeRepository) throws IOException { + List chromosomeEntityList = getChromosomeEntityList(assemblyEntity, chrLines); + if (downloadedENAFilePath != null) { + enaDataSource.addENASequenceNameToChromosomes(chromosomeEntityList, downloadedENAFilePath, BATCH_SIZE); + } + chromosomeRepository.saveAll(chromosomeEntityList); + } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java index 29fb45e1..3e857885 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java @@ -64,6 +64,9 @@ protected void parseAssemblyData(String line) { protected void parseChromosomeLine(String[] columns) { ChromosomeEntity chromosomeEntity = getChromosome(columns); + if (chromosomeEntity == null) { + return; + } if (assemblyEntity == null) { assemblyEntity = new AssemblyEntity(); @@ -81,6 +84,9 @@ protected void parseChromosomeLine(String[] columns) { protected void parseScaffoldLine(String[] columns) { ChromosomeEntity scaffoldEntity = getScaffold(columns); + if (scaffoldEntity == null) { + return; + } if (assemblyEntity == null) { assemblyEntity = new AssemblyEntity(); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java index f2c0d502..28417ab7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java @@ -98,6 +98,9 @@ protected void parseAssemblyData(String line) { protected void parseChromosomeLine(String[] columns) { ChromosomeEntity chromosomeEntity = getChromosome(columns); + if (chromosomeEntity == null) { + return; + } if (assemblyEntity == null) { assemblyEntity = new AssemblyEntity(); @@ -114,6 +117,9 @@ protected void parseChromosomeLine(String[] columns) { protected void parseScaffoldLine(String[] columns) { ChromosomeEntity scaffoldEntity = getScaffold(columns); + if (scaffoldEntity == null) { + return; + } if (assemblyEntity == null) { assemblyEntity = new AssemblyEntity(); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index cedad343..2365181b 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -33,11 +33,7 @@ import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; import javax.transaction.Transactional; -import java.io.BufferedReader; -import java.io.FileReader; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -60,8 +56,6 @@ public class AssemblyService { private final ChecksumSetter checksumSetter; - private final int BATCH_SIZE = 100000; - private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); @Autowired @@ -111,119 +105,12 @@ public void fetchAndInsertAssembly(String accession) throws IOException { throw duplicateAssemblyInsertionException(accession, entity.get()); } - Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); - Path downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); - Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(accession); - Path downloadedENAFilePath = downloadENAFilePathOpt.orElse(null); - - long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); - logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); - - // parse file and save data - parseFileAndInsertAssembly(downloadedNCBIFilePath, downloadedENAFilePath); + // download file and save assembly and chromosome data + ncbiDataSource.parseFileAndInsertAssembly(accession, enaDataSource, assemblyRepository, chromosomeRepository); logger.info("Successfully inserted assembly for accession " + accession); // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); - - Files.deleteIfExists(downloadedNCBIFilePath); - if (downloadedENAFilePath != null) { - Files.deleteIfExists(downloadedENAFilePath); - } - } - - @Transactional - public void parseFileAndInsertAssembly(Path downloadedNCBIFilePath, Path downloadedENAFilePath) throws IOException { - AssemblyEntity assemblyEntity = ncbiDataSource.getAssemblyEntity(downloadedNCBIFilePath); - assemblyRepository.save(assemblyEntity); - - try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { - List chrLines = new ArrayList<>(); - String line; - long chromosomesSavedTillNow = 0; - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - chrLines.add(line); - if (chrLines.size() == BATCH_SIZE) { - List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - if (downloadedENAFilePath != null) { - addENASequenceNameToChromosomes(assemblyEntity, chromosomeEntityList, downloadedENAFilePath); - } - chromosomeRepository.saveAll(chromosomeEntityList); - chromosomesSavedTillNow += chromosomeEntityList.size(); - logger.info("Number of total chromosomes saved till now : " + chromosomesSavedTillNow); - - chrLines = new ArrayList<>(); - } - } - - if (!chrLines.isEmpty()) { - List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - if (downloadedENAFilePath != null) { - addENASequenceNameToChromosomes(assemblyEntity, chromosomeEntityList, downloadedENAFilePath); - } - chromosomeRepository.saveAll(chromosomeEntityList); - chromosomesSavedTillNow += chromosomeEntityList.size(); - logger.info("Number of total chromosomes saved till now : " + chromosomesSavedTillNow); - } - } - } - - public void addENASequenceNameToChromosomes(AssemblyEntity assemblyEntity, List ncbiChromosomeList, - Path downloadedENAFilePath) throws IOException { - try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { - List chrLines = new ArrayList<>(); - String line; - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("accession")) { - continue; - } - chrLines.add(line); - if (chrLines.size() == BATCH_SIZE) { - List enaChromosomeList = enaDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - enaDataSource.addENASequenceNames( - !enaChromosomeList.isEmpty() ? enaChromosomeList : Collections.emptyList(), - !ncbiChromosomeList.isEmpty() ? ncbiChromosomeList : Collections.emptyList() - ); - - chrLines = new ArrayList<>(); - } - } - if (!chrLines.isEmpty()) { - List enaChromosomeList = enaDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - enaDataSource.addENASequenceNames( - !enaChromosomeList.isEmpty() ? enaChromosomeList : Collections.emptyList(), - !ncbiChromosomeList.isEmpty() ? ncbiChromosomeList : Collections.emptyList() - ); - } - } - } - - public void fetchAndInsertAssemblyOld(String accession) throws IOException { - Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); - if (entity.isPresent()) { - throw duplicateAssemblyInsertionException(accession, entity.get()); - } - Optional fetchAssembly = ncbiDataSource.getAssemblyByAccession(accession); - if (!fetchAssembly.isPresent()) { - throw new AssemblyNotFoundException(accession); - } - if (fetchAssembly.isPresent()) { - AssemblyEntity assemblyEntity = fetchAssembly.get(); - enaDataSource.addENASequenceNamesToAssembly(assemblyEntity); - if (assemblyEntity.getChromosomes() != null && assemblyEntity.getChromosomes().size() > 0) { - insertAssembly(assemblyEntity); - logger.info("Successfully inserted assembly for accession " + accession); - // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) - checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); - } else { - logger.error("Skipping inserting assembly : No chromosome in assembly " + accession); - } - } else { - logger.error("Could not get assembly from NCBI"); - } } public void retrieveAndInsertMd5ChecksumForAssembly(String assembly) { From c0a851aa3b89f1d3b5ab96355e1f2e8d5eb2424b Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Fri, 2 Feb 2024 11:58:42 +0000 Subject: [PATCH 04/13] handle exception when ena assembly report is not found --- .../contigalias/datasource/ENAAssemblyDataSource.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index b4c01557..5423a505 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -102,19 +102,19 @@ public Optional getAssemblyByAccession(String accession) throws public Optional downloadAssemblyReport(String accession) throws IOException { ENABrowser enaBrowser = factory.build(); enaBrowser.connect(); - Optional downloadPath; try { enaBrowser.connect(); - downloadPath = downloadAssemblyReport(enaBrowser, accession); - } finally { + return downloadAssemblyReport(enaBrowser, accession); + } catch (Exception e){ + logger.warn("Could not fetch Assembly Report from ENA for accession " + accession + "Exception: " + e); + return Optional.empty(); + }finally { try { enaBrowser.disconnect(); } catch (IOException e) { logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ") : " + e); } } - - return downloadPath; } @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) From dadb92878c223be8fc456b548e0a1b5bc36c34f9 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Mon, 5 Feb 2024 12:29:50 +0000 Subject: [PATCH 05/13] manually reverting changes in case of failures --- .../datasource/NCBIAssemblyDataSource.java | 2 -- .../exception/AssemblyIngestionException.java | 8 +++++++ .../exception/ControllerExceptionHandler.java | 5 ++++ .../contigalias/repo/AssemblyRepository.java | 16 +++++++++++-- .../repo/ChromosomeRepository.java | 6 +++++ .../contigalias/scheduler/ChecksumSetter.java | 3 +-- .../contigalias/service/AssemblyService.java | 23 +++++++++++++++---- 7 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index f798dd66..b6438d75 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -34,7 +34,6 @@ import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; -import javax.transaction.Transactional; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileReader; @@ -165,7 +164,6 @@ public Optional downloadAssemblyReport(String accession, NCBIBrowser ncbiB } } - @Transactional public void parseFileAndInsertAssembly(String accession, ENAAssemblyDataSource enaDataSource, AssemblyRepository assemblyRepository, ChromosomeRepository chromosomeRepository) throws IOException { Optional downloadNCBIFilePathOpt = downloadAssemblyReport(accession); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java new file mode 100644 index 00000000..c60b42ac --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblyIngestionException.java @@ -0,0 +1,8 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class AssemblyIngestionException extends RuntimeException { + + public AssemblyIngestionException(String accession) { + super("Error Ingesting assembly with accession " + accession); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java index 93d5c868..2d6c0d3b 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/ControllerExceptionHandler.java @@ -30,4 +30,9 @@ public ResponseEntity handleExceptions(DownloadFailedException exception return new ResponseEntity<>(exception.getMessage(), HttpStatus.INTERNAL_SERVER_ERROR); } + @ExceptionHandler(AssemblyIngestionException.class) + public ResponseEntity handleExceptions(AssemblyIngestionException exception, WebRequest webRequest){ + return new ResponseEntity<>(exception.getMessage(), HttpStatus.INTERNAL_SERVER_ERROR); + } + } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java index 9ffe65d8..62881539 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblyRepository.java @@ -20,6 +20,9 @@ import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.data.jpa.repository.JpaSpecificationExecutor; +import org.springframework.data.jpa.repository.Modifying; +import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; @@ -46,8 +49,17 @@ default Optional findAssemblyEntityByAccession(String accession) Page findAssemblyEntitiesByTaxid(long taxid, Pageable pageable); @Transactional - void deleteAssemblyEntityByInsdcAccession(String insdcAccession); + @Modifying + @Query("DELETE FROM AssemblyEntity a WHERE a.insdcAccession=:asmInsdcAccession") + void deleteAssemblyEntityByInsdcAccession(@Param("asmInsdcAccession") String asmInsdcAccession); @Transactional - void deleteAssemblyEntityByRefseq(String refseq); + @Modifying + @Query("DELETE FROM AssemblyEntity a WHERE a.refseq=:asmRefSeq") + void deleteAssemblyEntityByRefseq(@Param("asmRefSeq") String asmRefSeq); + + @Transactional + @Modifying + @Query("DELETE FROM AssemblyEntity a WHERE a.insdcAccession=:asmAccession OR a.refseq=:asmAccession") + void deleteAssemblyEntityByInsdcAccessionOrRefseq(@Param("asmAccession") String asmAccession); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index 2a1ae338..bf3d2784 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -27,6 +27,7 @@ import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import javax.transaction.Transactional; import java.util.List; @Repository @@ -50,6 +51,11 @@ public interface ChromosomeRepository extends JpaRepository findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request); Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java index 564b50ca..f6ff960f 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java @@ -17,7 +17,6 @@ import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutionException; @Component public class ChecksumSetter { @@ -48,7 +47,7 @@ public void updateMd5CheckSumForAllAssemblies() { CompletableFuture future = updateMd5CheckSumForAssemblyAsync(assembly); try { future.get(); - } catch (InterruptedException | ExecutionException e) { + } catch (Exception e) { logger.error("Encountered an error when running MD5Checksum update for assembly: " + assembly); } finally { scheduledToRunMD5ChecksumUpdateTasks.remove(assembly); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index 2365181b..ad373682 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -26,6 +26,7 @@ import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblyIngestionException; import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblyException; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; @@ -105,12 +106,24 @@ public void fetchAndInsertAssembly(String accession) throws IOException { throw duplicateAssemblyInsertionException(accession, entity.get()); } - // download file and save assembly and chromosome data - ncbiDataSource.parseFileAndInsertAssembly(accession, enaDataSource, assemblyRepository, chromosomeRepository); - logger.info("Successfully inserted assembly for accession " + accession); + try { + // download file and save assembly and chromosome data + ncbiDataSource.parseFileAndInsertAssembly(accession, enaDataSource, assemblyRepository, chromosomeRepository); + logger.info("Successfully inserted assembly for accession " + accession); + + // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) + checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); + } catch (Exception e) { + // roll back inserted entries in case of any exception or error + logger.error("Exception while inserting assembly " + accession + " Rolling back changes"); + deleteEntriesForAssembly(accession); + throw new AssemblyIngestionException(accession); + } + } - // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) - checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); + public void deleteEntriesForAssembly(String accession) { + chromosomeRepository.deleteChromosomeEntitiesByAssembly_InsdcAccession(accession); + assemblyRepository.deleteAssemblyEntityByInsdcAccessionOrRefseq(accession); } public void retrieveAndInsertMd5ChecksumForAssembly(String assembly) { From 5515132921bd5577460f922221e5052fa1c49637 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Thu, 8 Feb 2024 23:06:03 +0000 Subject: [PATCH 06/13] scheduled jobs for ena sequence name and md5 checksum retriever --- .../controller/admin/AdminController.java | 33 +++-- .../controller/admin/AdminHandler.java | 13 +- .../contigalias/ContigAliasHandler.java | 2 +- .../datasource/ENAAssemblyDataSource.java | 29 +--- .../datasource/NCBIAssemblyDataSource.java | 63 -------- .../contigalias/entities/AssemblyEntity.java | 10 +- .../contigalias/entities/SequenceEntity.java | 3 +- .../repo/ChromosomeRepository.java | 4 + .../scheduler/ApplicationContextHolder.java | 21 +++ .../contigalias/scheduler/ChecksumSetter.java | 137 ------------------ .../scheduler/ChromosomeUpdater.java | 60 ++++++++ .../scheduler/ENASequenceNameUpdater.java | 90 ++++++++++++ .../ac/ebi/eva/contigalias/scheduler/Job.java | 19 +++ .../scheduler/JobSubmittedEvent.java | 10 ++ .../eva/contigalias/scheduler/JobType.java | 6 + .../scheduler/MD5ChecksumUpdater.java | 88 +++++++++++ .../scheduler/Md5ChecksumRetriever.java | 29 ---- .../contigalias/service/AssemblyService.java | 110 +++++++++----- .../service/ChromosomeService.java | 42 +++++- src/main/resources/application.properties | 1 + .../AssemblyServiceIntegrationTest.java | 13 +- 21 files changed, 456 insertions(+), 327 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ApplicationContextHolder.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index 37e2d29a..e32d73b7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -33,8 +33,6 @@ import java.io.IOException; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; @RequestMapping("/v1/admin") @RestController @@ -101,24 +99,35 @@ public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly(@PathVaria handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession); return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " + "in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " + - "this might take some time to complete"); + "and other scheduled jobs, this might take some time to complete"); } catch (AssemblyNotFoundException e) { return ResponseEntity.ok("Could not find assembly " + asmAccession + ". Please insert the assembly first (md5checksum will be updated as part of the insertion process"); } } + @ApiOperation(value = "Given an assembly accession, retrieve ENA sequence name for all chromosomes belonging to assembly and update") + @PutMapping(value = "assemblies/{accession}/enaSequenceName") + public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathVariable(name = "accession") + @ApiParam(value = "INSDC or RefSeq assembly accession. " + + "Eg: GCA_000001405.10") String asmAccession) { + try { + handler.getAssemblyByAccession(asmAccession); + handler.retrieveAndInsertENASequenceNameForAssembly(asmAccession); + return ResponseEntity.ok("A task has been submitted for updating ENA Sequence Name for all chromosomes " + + "in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " + + "and other scheduled jobs, this might take some time to complete"); + } catch (AssemblyNotFoundException e) { + return ResponseEntity.ok("Could not find assembly " + asmAccession + + ". Please insert the assembly first (ENA sequence name will be updated as part of the insertion process"); + } + } + @ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ") @GetMapping(value = "assemblies/md5checksum/status") - public ResponseEntity getMD5ChecksumUpdateTaskStatus() { - Map> md5ChecksumUpdateTasks = handler.getMD5ChecksumUpdateTaskStatus(); - Set runningTasks = md5ChecksumUpdateTasks.get("running"); - Set scheduledTasks = md5ChecksumUpdateTasks.get("scheduled"); - String runningTaskRes = runningTasks == null || runningTasks.isEmpty() ? "No running MD5 checksum update tasks" : - runningTasks.stream().collect(Collectors.joining(",")); - String scheduledTaskRes = scheduledTasks == null || scheduledTasks.isEmpty() ? "No scheduled MD5 checksum update tasks" : - scheduledTasks.stream().collect(Collectors.joining(",")); - return ResponseEntity.ok("running: " + runningTaskRes + "\nscheduled: " + scheduledTaskRes); + public ResponseEntity> getMD5ChecksumUpdateTaskStatus() { + List scheduledJobStatus = handler.getScheduledJobStatus(); + return ResponseEntity.ok(scheduledJobStatus); } // This endpoint can be enabled in the future when checksums for assemblies are added to the project. diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java index 2007cd17..d68f4c92 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java @@ -19,16 +19,13 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.web.PagedResourcesAssembler; import org.springframework.stereotype.Service; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.service.AssemblyService; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; -import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; @Service public class AdminHandler { @@ -52,7 +49,7 @@ public Optional getAssemblyByAccession(String accession) { return assemblyService.getAssemblyByAccession(accession); } - public void fetchAndInsertAssemblyByAccession(String accession) throws IOException { + public void fetchAndInsertAssemblyByAccession(String accession) { assemblyService.fetchAndInsertAssembly(accession); } @@ -64,8 +61,12 @@ public void retrieveAndInsertMd5ChecksumForAssembly(String accession) { assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accession); } - public Map> getMD5ChecksumUpdateTaskStatus() { - return assemblyService.getMD5ChecksumUpdateTaskStatus(); + public void retrieveAndInsertENASequenceNameForAssembly(String accession) { + assemblyService.retrieveAndInsertENASequenceNameForAssembly(accession); + } + + public List getScheduledJobStatus() { + return assemblyService.getScheduledJobStatus(); } public void deleteAssemblyByAccession(String accession) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java index f280ef16..11f07306 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/contigalias/ContigAliasHandler.java @@ -19,7 +19,6 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.PageImpl; -import org.springframework.data.domain.PageRequest; import org.springframework.data.domain.Pageable; import org.springframework.data.web.PagedResourcesAssembler; import org.springframework.hateoas.EntityModel; @@ -83,6 +82,7 @@ public PagedModel> getAssemblyByRefseq(String refseq public PagedModel> getAssembliesByTaxid(long taxid, Pageable request) { Page page = assemblyService.getAssembliesByTaxid(taxid, request); + page.forEach(it->it.setChromosomes(null)); return generatePagedModelFromPage(page, assemblyAssembler); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index 5423a505..5bdc32a7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -33,9 +33,7 @@ import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; import uk.ac.ebi.eva.contigalias.exception.DownloadFailedException; -import java.io.BufferedReader; import java.io.FileInputStream; -import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -112,7 +110,7 @@ public Optional downloadAssemblyReport(String accession) throws IOExceptio try { enaBrowser.disconnect(); } catch (IOException e) { - logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ") : " + e); + logger.warn("Error while trying to disconnect - enaBrowser (assembly: " + accession + ") : " + e); } } } @@ -195,29 +193,4 @@ public void addENASequenceNames( } } } - - public void addENASequenceNameToChromosomes(List ncbiChromosomeList, - Path downloadedENAFilePath, final int BATCH_SIZE) throws IOException { - try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { - List chrLines = new ArrayList<>(); - List enaChromosomeList; - String line; - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("accession")) { - continue; - } - chrLines.add(line); - if (chrLines.size() == BATCH_SIZE) { - enaChromosomeList = getChromosomeEntityList(chrLines); - addENASequenceNames(enaChromosomeList, ncbiChromosomeList); - - chrLines = new ArrayList<>(); - } - } - if (!chrLines.isEmpty()) { - enaChromosomeList = getChromosomeEntityList(chrLines); - addENASequenceNames(enaChromosomeList, ncbiChromosomeList); - } - } - } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index b6438d75..22f5c2a4 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -30,13 +30,8 @@ import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; -import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; -import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; -import java.io.BufferedReader; import java.io.FileInputStream; -import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -52,8 +47,6 @@ public class NCBIAssemblyDataSource implements AssemblyDataSource { private final Logger logger = LoggerFactory.getLogger(NCBIAssemblyDataSource.class); - private final int BATCH_SIZE = 100000; - private final NCBIBrowserFactory factory; private final NCBIAssemblyReportReaderFactory readerFactory; @@ -163,60 +156,4 @@ public Optional downloadAssemblyReport(String accession, NCBIBrowser ncbiB return Optional.empty(); } } - - public void parseFileAndInsertAssembly(String accession, ENAAssemblyDataSource enaDataSource, - AssemblyRepository assemblyRepository, ChromosomeRepository chromosomeRepository) throws IOException { - Optional downloadNCBIFilePathOpt = downloadAssemblyReport(accession); - Path downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); - Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(accession); - Path downloadedENAFilePath = downloadENAFilePathOpt.orElse(null); - - long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); - logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); - - AssemblyEntity assemblyEntity = getAssemblyEntity(downloadedNCBIFilePath); - assemblyRepository.save(assemblyEntity); - - try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { - long chromosomesSavedTillNow = 0l; - List chrLines = new ArrayList<>(); - String line; - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - chrLines.add(line); - if (chrLines.size() == BATCH_SIZE) { - // add ena sequence name and save - addENASequenceNameAndSave(assemblyEntity, chrLines, enaDataSource, downloadedENAFilePath, chromosomeRepository); - chromosomesSavedTillNow += chrLines.size(); - logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); - - chrLines = new ArrayList<>(); - } - } - if (!chrLines.isEmpty()) { - // add ena sequence name and save - addENASequenceNameAndSave(assemblyEntity, chrLines, enaDataSource, downloadedENAFilePath, chromosomeRepository); - chromosomesSavedTillNow += chrLines.size(); - logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); - } - } - - // delete the files after assembly insertion - Files.deleteIfExists(downloadedNCBIFilePath); - if (downloadedENAFilePath != null) { - Files.deleteIfExists(downloadedENAFilePath); - } - } - - public void addENASequenceNameAndSave(AssemblyEntity assemblyEntity, List chrLines, - ENAAssemblyDataSource enaDataSource, Path downloadedENAFilePath, - ChromosomeRepository chromosomeRepository) throws IOException { - List chromosomeEntityList = getChromosomeEntityList(assemblyEntity, chrLines); - if (downloadedENAFilePath != null) { - enaDataSource.addENASequenceNameToChromosomes(chromosomeEntityList, downloadedENAFilePath, BATCH_SIZE); - } - chromosomeRepository.saveAll(chromosomeEntityList); - } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java index 7ada7de5..4609d941 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java @@ -24,6 +24,7 @@ import javax.persistence.CascadeType; import javax.persistence.Column; import javax.persistence.Entity; +import javax.persistence.FetchType; import javax.persistence.Id; import javax.persistence.OneToMany; import javax.persistence.Table; @@ -63,8 +64,8 @@ public class AssemblyEntity { @JsonInclude(JsonInclude.Include.NON_NULL) @ApiModelProperty(value = "List of all chromosomes of the assembly present in the database.") - @LazyCollection(LazyCollectionOption.FALSE) - @OneToMany(mappedBy = "assembly", cascade = CascadeType.ALL) + @LazyCollection(LazyCollectionOption.TRUE) + @OneToMany(mappedBy = "assembly", cascade = CascadeType.REMOVE, fetch = FetchType.LAZY) private List chromosomes; public AssemblyEntity() { @@ -178,11 +179,6 @@ public String toString() { .append("trunc512checksum :\t") .append(this.trunc512checksum) .append("\n"); - if (this.chromosomes != null) { - builder.append("Number of chromosomes :\t") - .append(this.chromosomes.size()) - .append("\n"); - } return builder.toString(); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java index e5afdbdc..641b3af4 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java @@ -19,7 +19,6 @@ import com.fasterxml.jackson.annotation.JsonInclude; import io.swagger.annotations.ApiModelProperty; -import javax.persistence.CascadeType; import javax.persistence.Column; import javax.persistence.EnumType; import javax.persistence.Enumerated; @@ -68,7 +67,7 @@ public enum ContigType { @Id @JsonInclude(JsonInclude.Include.NON_NULL) @ApiModelProperty(value = "Assembly that this sequence belongs to.") - @ManyToOne(cascade = CascadeType.ALL) + @ManyToOne private AssemblyEntity assembly; public String getGenbankSequenceName() { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index bf3d2784..5979af07 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -51,6 +51,10 @@ public interface ChromosomeRepository extends JpaRepository> runningMD5ChecksumUpdateTasks = new ConcurrentHashMap<>(); - private Set scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>(); - private int DEFAULT_PAGE_SIZE = 10000; - private JdbcTemplate jdbcTemplate; - private ChromosomeService chromosomeService; - private Md5ChecksumRetriever md5ChecksumRetriever; - - @Autowired - public ChecksumSetter(ChromosomeService chromosomeService, Md5ChecksumRetriever md5ChecksumRetriever, - JdbcTemplate jdbcTemplate) { - this.chromosomeService = chromosomeService; - this.md5ChecksumRetriever = md5ChecksumRetriever; - this.jdbcTemplate = jdbcTemplate; - } - - //@Scheduled(cron = "0 0 1 ? * TUE") - public void updateMd5CheckSumForAllAssemblies() { - List assemblyList = chromosomeService.getAssembliesWhereChromosomeMd5ChecksumIsNull(); - logger.info("List of assemblies to be updated for MD5 Checksum: " + assemblyList); - scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>(assemblyList); - - for (String assembly : assemblyList) { - scheduledToRunMD5ChecksumUpdateTasks.remove(assembly); - CompletableFuture future = updateMd5CheckSumForAssemblyAsync(assembly); - try { - future.get(); - } catch (Exception e) { - logger.error("Encountered an error when running MD5Checksum update for assembly: " + assembly); - } finally { - scheduledToRunMD5ChecksumUpdateTasks.remove(assembly); - } - } - } - - public CompletableFuture updateMd5CheckSumForAssemblyAsync(String assembly) { - logger.info("Submitted job for updating MD5 Checksum for assembly (asynchronously)"); - // Check if the async task for this assembly is already running - CompletableFuture existingTask = runningMD5ChecksumUpdateTasks.get(assembly); - if (existingTask != null && !existingTask.isDone()) { - logger.info("Async task is still running for assembly: " + assembly); - return existingTask; - } - // Start the async task (removing existing run if present) - runningMD5ChecksumUpdateTasks.remove(assembly); - CompletableFuture future = CompletableFuture.runAsync(() -> { - updateMD5ChecksumForAllChromosomesInAssembly(assembly); - }); - // Store the future in the map for the given assembly - runningMD5ChecksumUpdateTasks.put(assembly, future); - - // check the status of task upon completion and remove from running tasks - future.whenComplete((result, exception) -> { - if (exception != null) { - logger.error("Async task (MD5Checksum setter) failed for assembly: " + assembly, exception); - } else { - logger.info("Async task (MD5Checksum setter) completed successfully for assembly: " + assembly); - } - runningMD5ChecksumUpdateTasks.remove(assembly); - }); - - return future; - } - - public void updateMD5ChecksumForAllChromosomesInAssembly(String assembly) { - logger.info("Trying to update md5checksum for assembly: " + assembly); - String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly - + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; - jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { - long chromosomeUpdated = 0; - List chromosomeEntityList = new ArrayList<>(); - while (rs.next()) { - ChromosomeEntity chromosome = new ChromosomeEntity(); - chromosome.setInsdcAccession(rs.getString(1)); - chromosomeEntityList.add(chromosome); - - if (chromosomeEntityList.size() == DEFAULT_PAGE_SIZE) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeUpdated += chromosomeEntityList.size(); - logger.info("Chromosomes Updated till now: " + chromosomeUpdated); - chromosomeEntityList = new ArrayList<>(); - } - } - if (chromosomeEntityList.size() > 0) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeUpdated += chromosomeEntityList.size(); - logger.info("Chromosomes Updated till now: " + chromosomeUpdated); - } - - logger.info("Finished updating md5checksum for assembly: " + assembly); - - return null; - }); - } - - public void updateMd5ChecksumForChromosome(String assembly, List chromosomesList) { - chromosomesList.parallelStream().forEach(chromosome -> { - try { - String md5Checksum = md5ChecksumRetriever.retrieveMd5Checksum(chromosome.getInsdcAccession()); - chromosome.setMd5checksum(md5Checksum); - } catch (Exception e) { - logger.info("Could not retrieve md5Checksum for insdc accession: " + chromosome.getInsdcAccession()); - } - }); - - chromosomeService.updateMd5ChecksumForAllChromosomeInAssembly(assembly, chromosomesList); - } - - public Map> getMD5ChecksumUpdateTaskStatus() { - Map> taskStatus = new HashMap<>(); - taskStatus.put("running", runningMD5ChecksumUpdateTasks.keySet()); - taskStatus.put("scheduled", scheduledToRunMD5ChecksumUpdateTasks); - return taskStatus; - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java new file mode 100644 index 00000000..b071eb9a --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java @@ -0,0 +1,60 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.ApplicationListener; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.stream.Collectors; + +@Service +public class ChromosomeUpdater implements ApplicationListener { + private final Logger logger = LoggerFactory.getLogger(ChromosomeUpdater.class); + private final BlockingQueue jobQueue = new LinkedBlockingQueue<>(); + private final ENASequenceNameUpdater enaSequenceNameUpdater; + private final MD5ChecksumUpdater md5ChecksumUpdater; + + + @Autowired + public ChromosomeUpdater(ENASequenceNameUpdater enaSequenceNameUpdater, MD5ChecksumUpdater md5ChecksumUpdater) { + this.md5ChecksumUpdater = md5ChecksumUpdater; + this.enaSequenceNameUpdater = enaSequenceNameUpdater; + } + + public void submitJob(Job job) { + jobQueue.add(job); + logger.info("Submitted Job : " + job.getType() + " for assembly " + job.getParameter()); + JobSubmittedEvent event = new JobSubmittedEvent(this); + ApplicationContextHolder.getApplicationContext().publishEvent(event); + } + + @Override + public void onApplicationEvent(JobSubmittedEvent event) { + processJobs(); + } + + @Async + public void processJobs() { + while (!jobQueue.isEmpty()) { + try { + Job job = jobQueue.take(); + if (job.getType() == JobType.ENA_SEQUENCE_NAME_UPDATE) { + enaSequenceNameUpdater.updateENASequenceNameForAssembly(job.getParameter()); + } else if (job.getType() == JobType.MD5_CHECKSUM_UPDATE) { + md5ChecksumUpdater.updateMD5ChecksumForAssembly(job.getParameter()); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + } + + public List getScheduledJobStatus() { + return jobQueue.stream().map(j -> j.getType().toString()).collect(Collectors.toList()); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java new file mode 100644 index 00000000..7b14c7e3 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java @@ -0,0 +1,90 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Component; +import uk.ac.ebi.eva.contigalias.datasource.ENAAssemblyDataSource; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +@Component +public class ENASequenceNameUpdater { + private final Logger logger = LoggerFactory.getLogger(MD5ChecksumUpdater.class); + private final int DEFAULT_BATCH_SIZE = 100000; + private final ENAAssemblyDataSource enaDataSource; + + private final ChromosomeService chromosomeService; + + public ENASequenceNameUpdater(ENAAssemblyDataSource enaDataSource, ChromosomeService chromosomeService) { + this.enaDataSource = enaDataSource; + this.chromosomeService = chromosomeService; + } + + public void updateENASequenceNameForAssembly(String assembly) { + Path downloadedENAFilePath = Paths.get(""); + try { + logger.info("Trying to update ENA Sequence Name for assembly: " + assembly); + Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(assembly); + if (downloadENAFilePathOpt.isPresent()) { + downloadedENAFilePath = downloadENAFilePathOpt.get(); + + long numberOfChromosomesInFile = Files.lines(downloadedENAFilePath) + .filter(line -> !line.startsWith("accession")).count(); + logger.info("Number of chromosomes in assembly (" + assembly + "): " + numberOfChromosomesInFile); + + // retrieve and save ena sequence names + retrieveAndUpdateENASequenceNames(assembly, downloadedENAFilePath); + } else { + logger.warn("Could not download assembly report for assembly : " + assembly); + } + } catch (Exception e) { + logger.error("Error while updating ENA Sequence Name for assembly : " + assembly + "\n" + e); + } finally { + try { + Files.deleteIfExists(downloadedENAFilePath); + } catch (IOException e) { + logger.error("Error while deleting downloaded ENA assembly report file with path " + downloadedENAFilePath + + " for assembly : " + assembly); + } + } + } + + public void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedENAFilePath) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { + long chromosomesSavedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("accession")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == DEFAULT_BATCH_SIZE) { + List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); + chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + // add ena sequence name and save + List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); + chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + } + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java new file mode 100644 index 00000000..a5c39ec4 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java @@ -0,0 +1,19 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +public class Job { + private final JobType type; + private final String parameter; + + public Job(JobType type, String parameter) { + this.type = type; + this.parameter = parameter; + } + + public JobType getType() { + return type; + } + + public String getParameter() { + return parameter; + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java new file mode 100644 index 00000000..df9eb94a --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java @@ -0,0 +1,10 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + + +import org.springframework.context.ApplicationEvent; + +public class JobSubmittedEvent extends ApplicationEvent { + public JobSubmittedEvent(Object source) { + super(source); + } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java new file mode 100644 index 00000000..204e423e --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java @@ -0,0 +1,6 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +public enum JobType { + ENA_SEQUENCE_NAME_UPDATE, + MD5_CHECKSUM_UPDATE +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java new file mode 100644 index 00000000..978d7580 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -0,0 +1,88 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import com.fasterxml.jackson.databind.JsonNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.jdbc.core.ResultSetExtractor; +import org.springframework.stereotype.Component; +import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.util.ArrayList; +import java.util.List; + +@Component +public class MD5ChecksumUpdater { + private final Logger logger = LoggerFactory.getLogger(MD5ChecksumUpdater.class); + private final int DEFAULT_BATCH_SIZE = 10000; + private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; + private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; + private RestTemplate restTemplate; + private final JdbcTemplate jdbcTemplate; + private final ChromosomeService chromosomeService; + + @Autowired + public MD5ChecksumUpdater(RestTemplate restTemplate, JdbcTemplate jdbcTemplate, ChromosomeService chromosomeService) { + this.restTemplate = restTemplate; + this.jdbcTemplate = jdbcTemplate; + this.chromosomeService = chromosomeService; + } + + public void updateMD5ChecksumForAssembly(String assembly) { + try { + logger.info("Trying to update MD5 Checksum for assembly: " + assembly); + String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly + + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; + jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { + long chromosomeUpdated = 0; + List chromosomeEntityList = new ArrayList<>(); + while (rs.next()) { + ChromosomeEntity chromosome = new ChromosomeEntity(); + chromosome.setInsdcAccession(rs.getString(1)); + chromosomeEntityList.add(chromosome); + + if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) { + updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); + chromosomeUpdated += chromosomeEntityList.size(); + logger.info("Chromosomes Updated till now: " + chromosomeUpdated); + chromosomeEntityList = new ArrayList<>(); + } + } + if (chromosomeEntityList.size() > 0) { + updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); + chromosomeUpdated += chromosomeEntityList.size(); + logger.info("Chromosomes Updated till now: " + chromosomeUpdated); + } + + logger.info("Finished updating MD5 Checksum for assembly: " + assembly); + + return null; + }); + } catch (Exception e) { + logger.error("Error while updating MD5 Checksum for assembly : " + assembly + "\n" + e); + } + } + + public void updateMd5ChecksumForChromosome(String assembly, List chromosomesList) { + chromosomesList.parallelStream().forEach(chromosome -> { + try { + String md5Checksum = retrieveMd5Checksum(chromosome.getInsdcAccession()); + chromosome.setMd5checksum(md5Checksum); + } catch (Exception e) { + logger.info("Could not retrieve MD5 Checksum for insdc accession: " + chromosome.getInsdcAccession()); + } + }); + + chromosomeService.updateMd5ChecksumForAllChromosomeInAssembly(assembly, chromosomesList); + } + + public String retrieveMd5Checksum(String insdcAccession) { + String apiURL = INSDC_CHECKSUM_URL.replace(INSDC_ACCESSION_PLACE_HOLDER, insdcAccession); + JsonNode jsonResponse = restTemplate.getForObject(apiURL, JsonNode.class); + String md5Checksum = jsonResponse.get("metadata").get("md5").asText(); + return md5Checksum; + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java deleted file mode 100644 index 912e5d6c..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Md5ChecksumRetriever.java +++ /dev/null @@ -1,29 +0,0 @@ -package uk.ac.ebi.eva.contigalias.scheduler; - -import com.fasterxml.jackson.databind.JsonNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; -import org.springframework.web.client.RestTemplate; - -@Component -public class Md5ChecksumRetriever { - private final Logger logger = LoggerFactory.getLogger(Md5ChecksumRetriever.class); - private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; - private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; - - private RestTemplate restTemplate; - - @Autowired - public Md5ChecksumRetriever(RestTemplate restTemplate) { - this.restTemplate = restTemplate; - } - - public String retrieveMd5Checksum(String insdcAccession) { - String apiURL = INSDC_CHECKSUM_URL.replace(INSDC_ACCESSION_PLACE_HOLDER, insdcAccession); - JsonNode jsonResponse = restTemplate.getForObject(apiURL, JsonNode.class); - String md5Checksum = jsonResponse.get("metadata").get("md5").asText(); - return md5Checksum; - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index ad373682..731cdd7a 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -31,21 +31,29 @@ import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblyException; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; -import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; +import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; +import uk.ac.ebi.eva.contigalias.scheduler.Job; +import uk.ac.ebi.eva.contigalias.scheduler.JobType; import javax.transaction.Transactional; +import java.io.BufferedReader; +import java.io.FileReader; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; @Service public class AssemblyService { + private final int BATCH_SIZE = 100000; + + private final ChromosomeService chromosomeService; private final AssemblyRepository assemblyRepository; @@ -55,36 +63,34 @@ public class AssemblyService { private final ENAAssemblyDataSource enaDataSource; - private final ChecksumSetter checksumSetter; + private final ChromosomeUpdater chromosomeUpdater; private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); @Autowired - public AssemblyService(AssemblyRepository repository, ChromosomeRepository chromosomeRepository, + public AssemblyService(ChromosomeService chromosomeService, AssemblyRepository repository, ChromosomeRepository chromosomeRepository, NCBIAssemblyDataSource ncbiDataSource, ENAAssemblyDataSource enaDataSource, - ChecksumSetter checksumSetter) { + ChromosomeUpdater chromosomeUpdater) { + this.chromosomeService = chromosomeService; this.assemblyRepository = repository; this.chromosomeRepository = chromosomeRepository; this.ncbiDataSource = ncbiDataSource; this.enaDataSource = enaDataSource; - this.checksumSetter = checksumSetter; + this.chromosomeUpdater = chromosomeUpdater; } public Optional getAssemblyByInsdcAccession(String insdcAccession) { Optional entity = assemblyRepository.findAssemblyEntityByInsdcAccession(insdcAccession); - stripAssemblyFromChromosomes(entity); return entity; } public Optional getAssemblyByRefseq(String refseq) { Optional entity = assemblyRepository.findAssemblyEntityByRefseq(refseq); - stripAssemblyFromChromosomes(entity); return entity; } public Page getAssembliesByTaxid(long taxid, Pageable request) { Page page = assemblyRepository.findAssemblyEntitiesByTaxid(taxid, request); - page.forEach(this::stripAssemblyFromChromosomes); return page; } @@ -99,7 +105,7 @@ public void putAssemblyChecksumsByAccession(String accession, String md5, String assemblyRepository.save(assemblyEntity); } - public void fetchAndInsertAssembly(String accession) throws IOException { + public void fetchAndInsertAssembly(String accession) { // check if assembly already exists in db Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (entity.isPresent()) { @@ -108,58 +114,94 @@ public void fetchAndInsertAssembly(String accession) throws IOException { try { // download file and save assembly and chromosome data - ncbiDataSource.parseFileAndInsertAssembly(accession, enaDataSource, assemblyRepository, chromosomeRepository); + logger.info("Start inserting assembly for accession " + accession); + parseFileAndInsertAssembly(accession); logger.info("Successfully inserted assembly for accession " + accession); - // submit job for retrieving and updating MD5 Checksum for assembly (asynchronously) - checksumSetter.updateMd5CheckSumForAssemblyAsync(accession); + // submit job for updating ENA Sequence name for assembly (asynchronously) + Job enaSequenceNameupdateJob = new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, accession); + chromosomeUpdater.submitJob(enaSequenceNameupdateJob); + + // submit job for updating MD5 Checksum for assembly (asynchronously) + Job md5ChecksumupdateJob = new Job(JobType.MD5_CHECKSUM_UPDATE, accession); + chromosomeUpdater.submitJob(md5ChecksumupdateJob); } catch (Exception e) { // roll back inserted entries in case of any exception or error - logger.error("Exception while inserting assembly " + accession + " Rolling back changes"); + logger.error("Exception while inserting assembly " + accession + " Rolling back changes. \n" + e); deleteEntriesForAssembly(accession); throw new AssemblyIngestionException(accession); } } + public void parseFileAndInsertAssembly(String accession) throws IOException { + Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); + Path downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + + long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); + logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); + + AssemblyEntity assemblyEntity = ncbiDataSource.getAssemblyEntity(downloadedNCBIFilePath); + assemblyRepository.save(assemblyEntity); + + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { + long chromosomesSavedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == BATCH_SIZE) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + chromosomeService.saveAllChromosomes(chromosomeEntityList); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + + chrLines = new ArrayList<>(); + } + } + if (!chrLines.isEmpty()) { + // add ena sequence name and save + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + chromosomeService.saveAllChromosomes(chromosomeEntityList); + chromosomesSavedTillNow += chrLines.size(); + logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + } + } + + // delete the files after assembly insertion + Files.deleteIfExists(downloadedNCBIFilePath); + } + public void deleteEntriesForAssembly(String accession) { chromosomeRepository.deleteChromosomeEntitiesByAssembly_InsdcAccession(accession); assemblyRepository.deleteAssemblyEntityByInsdcAccessionOrRefseq(accession); } public void retrieveAndInsertMd5ChecksumForAssembly(String assembly) { - checksumSetter.updateMd5CheckSumForAssemblyAsync(assembly); + Job md5ChecksumupdateJob = new Job(JobType.MD5_CHECKSUM_UPDATE, assembly); + chromosomeUpdater.submitJob(md5ChecksumupdateJob); + } + + public void retrieveAndInsertENASequenceNameForAssembly(String assembly) { + Job enaSequenceNameupdateJob = new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, assembly); + chromosomeUpdater.submitJob(enaSequenceNameupdateJob); } - public Map> getMD5ChecksumUpdateTaskStatus() { - return checksumSetter.getMD5ChecksumUpdateTaskStatus(); + public List getScheduledJobStatus() { + return chromosomeUpdater.getScheduledJobStatus(); } public Optional getAssemblyByAccession(String accession) { Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); if (entity.isPresent()) { - stripAssemblyFromChromosomes(entity); return entity; } else { throw new AssemblyNotFoundException(accession); } } - public void stripAssemblyFromChromosomes(Optional optional) { - if (optional.isPresent()) { - AssemblyEntity entity = optional.get(); - stripAssemblyFromChromosomes(entity); - } - } - - private void stripAssemblyFromChromosomes(AssemblyEntity assembly) { - List chromosomes = assembly.getChromosomes(); - if (chromosomes != null && chromosomes.size() > 0) { - chromosomes.forEach(it -> it.setAssembly(null)); - } else { - assembly.setChromosomes(Collections.emptyList()); - } - } - @Transactional public void insertAssembly(AssemblyEntity entity) { if (isEntityPresent(entity)) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 8b33659d..7ec7585c 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -19,13 +19,16 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; +import org.springframework.jdbc.core.BatchPreparedStatementSetter; +import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Service; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import javax.transaction.Transactional; +import java.sql.PreparedStatement; +import java.sql.SQLException; import java.util.LinkedList; import java.util.List; @@ -33,10 +36,12 @@ public class ChromosomeService { private final ChromosomeRepository repository; + private final JdbcTemplate jdbcTemplate; @Autowired - public ChromosomeService(ChromosomeRepository repository) { + public ChromosomeService(ChromosomeRepository repository, JdbcTemplate jdbcTemplate) { this.repository = repository; + this.jdbcTemplate = jdbcTemplate; } @@ -71,6 +76,13 @@ public void updateMd5ChecksumForAllChromosomeInAssembly(String assembly, List chromosomeEntityList) { + for (ChromosomeEntity chromosome : chromosomeEntityList) { + repository.updateENASequenceNameByInsdcAccession(assembly, chromosome.getInsdcAccession(), chromosome.getEnaSequenceName()); + } + } + public Page getChromosomesByAssemblyRefseq(String asmRefseq, Pageable request) { Page chromosomes = repository.findChromosomeEntitiesByAssembly_Refseq(asmRefseq, request); return stripAssembliesFromChromosomes(chromosomes); @@ -272,4 +284,30 @@ public long countChromosomeEntitiesByEnaName(String enaName) { return repository.countChromosomeEntitiesByEnaSequenceName(enaName); } + void saveAllChromosomes(List chromosomeEntityList) { + String sql = "INSERT INTO chromosome (assembly_insdc_accession,contig_type,ena_sequence_name," + + "genbank_sequence_name,insdc_accession,md5checksum,refseq,seq_length,trunc512checksum,ucsc_name) " + + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; + jdbcTemplate.batchUpdate(sql, new BatchPreparedStatementSetter() { + @Override + public void setValues(PreparedStatement ps, int i) throws SQLException { + ChromosomeEntity chromosome = chromosomeEntityList.get(i); + ps.setString(1, chromosome.getAssembly().getInsdcAccession()); + ps.setString(2, chromosome.getContigType().toString()); + ps.setString(3, chromosome.getEnaSequenceName()); + ps.setString(4, chromosome.getGenbankSequenceName()); + ps.setString(5, chromosome.getInsdcAccession()); + ps.setString(6, chromosome.getMd5checksum()); + ps.setString(7, chromosome.getRefseq()); + ps.setLong(8, chromosome.getSeqLength()); + ps.setString(9, chromosome.getTrunc512checksum()); + ps.setString(10, chromosome.getUcscName()); + } + @Override + public int getBatchSize() { + return chromosomeEntityList.size(); + } + }); + } + } diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 514ac4f3..0e2bef2d 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +spring.jpa.open-in-view=false controller.auth.admin.username=@contig-alias.admin-user@ controller.auth.admin.password=@contig-alias.admin-password@ diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java index 311c6952..9fbcd04d 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java @@ -32,12 +32,11 @@ import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; -import uk.ac.ebi.eva.contigalias.scheduler.ChecksumSetter; +import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; import java.io.IOException; import java.util.List; import java.util.Optional; -import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -61,6 +60,9 @@ public class AssemblyServiceIntegrationTest { @Autowired ChromosomeRepository chromosomeRepository; + @Autowired + ChromosomeService chromosomeService; + @Autowired private AssemblyService service; @@ -68,7 +70,7 @@ public class AssemblyServiceIntegrationTest { void setup() throws IOException { NCBIAssemblyDataSource mockNcbiDataSource = mock(NCBIAssemblyDataSource.class); ENAAssemblyDataSource mockEnaDataSource = mock(ENAAssemblyDataSource.class); - ChecksumSetter mockChecksumSetter = mock(ChecksumSetter.class); + ChromosomeUpdater chromosomeUpdater = mock(ChromosomeUpdater.class); for (int i = 0; i < entities.length; i++) { AssemblyEntity generate = AssemblyGenerator.generate(i); entities[i] = generate; @@ -76,10 +78,9 @@ void setup() throws IOException { .thenReturn(Optional.of(generate)); Mockito.when(mockNcbiDataSource.getAssemblyByAccession(generate.getRefseq())) .thenReturn(Optional.of(generate)); - Mockito.when(mockChecksumSetter.updateMd5CheckSumForAssemblyAsync(generate.getInsdcAccession())) - .thenReturn(new CompletableFuture<>()); } - service = new AssemblyService(repository, chromosomeRepository, mockNcbiDataSource, mockEnaDataSource, mockChecksumSetter); + service = new AssemblyService(chromosomeService, repository, chromosomeRepository, mockNcbiDataSource, + mockEnaDataSource, chromosomeUpdater); } @AfterEach From 5ff968bc9e37b1de307be6b683acb6df7ca2ece8 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Fri, 9 Feb 2024 12:26:56 +0000 Subject: [PATCH 07/13] added endpoint to update ena sequence name and md5 checksum for multiple assemblies --- .../controller/admin/AdminController.java | 33 ++++++++++++++++--- .../controller/admin/AdminHandler.java | 8 +++++ .../scheduler/ChromosomeUpdater.java | 7 ++++ .../contigalias/service/AssemblyService.java | 16 +++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index e32d73b7..422fcbff 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -80,7 +80,7 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( "parallel manner.") @PutMapping(value = "assemblies") public ResponseEntity fetchAndInsertAssemblyByAccession( - @RequestBody(required = false) @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + + @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { if (accessions == null || accessions.size() <= 0) { return new ResponseEntity<>(HttpStatus.BAD_REQUEST); @@ -90,7 +90,7 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( } @ApiOperation(value = "Given an assembly accession, retrieve MD5 checksum for all chromosomes belonging to assembly and update") - @PutMapping(value = "assemblies/{accession}/md5checksum") + @PutMapping(value = "assemblies/md5checksum/{accession}") public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly(@PathVariable(name = "accession") @ApiParam(value = "INSDC or RefSeq assembly accession. Eg: " + "GCA_000001405.10") String asmAccession) { @@ -106,8 +106,20 @@ public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly(@PathVaria } } + @ApiOperation(value = "Given a list of assembly accessions, retrieve MD5 checksum for all chromosomes belonging to all the assemblies and update") + @PutMapping(value = "assemblies/md5checksum") + public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly( + @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { + handler.retrieveAndInsertMd5ChecksumForAssembly(accessions); + return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " + + "in assemblies " + accessions + ". Depending upon the number of chromosomes present in assembly, " + + "and other scheduled jobs, this might take some time to complete"); + + } + @ApiOperation(value = "Given an assembly accession, retrieve ENA sequence name for all chromosomes belonging to assembly and update") - @PutMapping(value = "assemblies/{accession}/enaSequenceName") + @PutMapping(value = "assemblies/enaSequenceName/{accession}") public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathVariable(name = "accession") @ApiParam(value = "INSDC or RefSeq assembly accession. " + "Eg: GCA_000001405.10") String asmAccession) { @@ -123,8 +135,21 @@ public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathV } } + @ApiOperation(value = "Given a list of assembly accessions, retrieve ENA sequence name for all chromosomes belonging to all the assemblies and update") + @PutMapping(value = "assemblies/enaSequenceName") + public ResponseEntity retrieveAndInsertENASequenceNameForAssembly( + @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { + + handler.retrieveAndInsertENASequenceNameForAssembly(accessions); + return ResponseEntity.ok("A task has been submitted for updating ENA Sequence Name for all chromosomes " + + "in assembly " + accessions + ". Depending upon the number of chromosomes present in assembly, " + + "and other scheduled jobs, this might take some time to complete"); + } + + @ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ") - @GetMapping(value = "assemblies/md5checksum/status") + @GetMapping(value = "assemblies/scheduledJobs") public ResponseEntity> getMD5ChecksumUpdateTaskStatus() { List scheduledJobStatus = handler.getScheduledJobStatus(); return ResponseEntity.ok(scheduledJobStatus); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java index d68f4c92..b0b18ccf 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminHandler.java @@ -61,10 +61,18 @@ public void retrieveAndInsertMd5ChecksumForAssembly(String accession) { assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accession); } + public void retrieveAndInsertMd5ChecksumForAssembly(List accessions) { + assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accessions); + } + public void retrieveAndInsertENASequenceNameForAssembly(String accession) { assemblyService.retrieveAndInsertENASequenceNameForAssembly(accession); } + public void retrieveAndInsertENASequenceNameForAssembly(List accessions) { + assemblyService.retrieveAndInsertENASequenceNameForAssembly(accessions); + } + public List getScheduledJobStatus() { return assemblyService.getScheduledJobStatus(); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java index b071eb9a..32376a63 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java @@ -33,6 +33,13 @@ public void submitJob(Job job) { ApplicationContextHolder.getApplicationContext().publishEvent(event); } + public void submitJob(List jobList) { + jobQueue.addAll(jobList); + jobList.stream().forEach(job -> logger.info("Submitted Job : " + job.getType() + " for assembly " + job.getParameter())); + JobSubmittedEvent event = new JobSubmittedEvent(this); + ApplicationContextHolder.getApplicationContext().publishEvent(event); + } + @Override public void onApplicationEvent(JobSubmittedEvent event) { processJobs(); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index 731cdd7a..65c06fcd 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -184,11 +184,27 @@ public void retrieveAndInsertMd5ChecksumForAssembly(String assembly) { chromosomeUpdater.submitJob(md5ChecksumupdateJob); } + public void retrieveAndInsertMd5ChecksumForAssembly(List assemblies) { + List jobsList = new ArrayList(); + for (String assembly : assemblies) { + jobsList.add(new Job(JobType.MD5_CHECKSUM_UPDATE, assembly)); + } + chromosomeUpdater.submitJob(jobsList); + } + public void retrieveAndInsertENASequenceNameForAssembly(String assembly) { Job enaSequenceNameupdateJob = new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, assembly); chromosomeUpdater.submitJob(enaSequenceNameupdateJob); } + public void retrieveAndInsertENASequenceNameForAssembly(List assemblies) { + List jobsList = new ArrayList(); + for (String assembly : assemblies) { + jobsList.add(new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, assembly)); + } + chromosomeUpdater.submitJob(jobsList); + } + public List getScheduledJobStatus() { return chromosomeUpdater.getScheduledJobStatus(); } From bd1b07c7f682aa28fdc563a97fad85df2156d0cd Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Fri, 9 Feb 2024 14:03:16 +0000 Subject: [PATCH 08/13] prevent running multiple instances of job processing --- .../contigalias/ContigAliasApplication.java | 2 + .../ApplicationContextHolder.java | 2 +- .../controller/admin/AdminController.java | 8 +++ .../scheduler/ChromosomeUpdater.java | 52 +++++++++++++------ .../scheduler/ENASequenceNameUpdater.java | 6 +-- .../contigalias/scheduler/{ => Job}/Job.java | 7 ++- .../{ => Job}/JobSubmittedEvent.java | 2 +- .../Job/JobSubmittedEventHandler.java | 23 ++++++++ .../scheduler/{ => Job}/JobType.java | 2 +- .../scheduler/MD5ChecksumUpdater.java | 2 +- .../contigalias/service/AssemblyService.java | 12 +---- 11 files changed, 83 insertions(+), 35 deletions(-) rename src/main/java/uk/ac/ebi/eva/contigalias/{scheduler => conf}/ApplicationContextHolder.java (93%) rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{ => Job}/Job.java (70%) rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{ => Job}/JobSubmittedEvent.java (79%) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{ => Job}/JobType.java (61%) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java b/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java index bd1f1109..9c4088c7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/ContigAliasApplication.java @@ -22,10 +22,12 @@ import org.springframework.boot.web.servlet.support.SpringBootServletInitializer; import org.springframework.hateoas.config.EnableHypermediaSupport; import org.springframework.retry.annotation.EnableRetry; +import org.springframework.scheduling.annotation.EnableAsync; import org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.transaction.annotation.EnableTransactionManagement; @EnableScheduling +@EnableAsync @SpringBootApplication @EnableRetry @EnableTransactionManagement diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ApplicationContextHolder.java b/src/main/java/uk/ac/ebi/eva/contigalias/conf/ApplicationContextHolder.java similarity index 93% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ApplicationContextHolder.java rename to src/main/java/uk/ac/ebi/eva/contigalias/conf/ApplicationContextHolder.java index e9080ab9..ffd62ea6 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ApplicationContextHolder.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/conf/ApplicationContextHolder.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler; +package uk.ac.ebi.eva.contigalias.conf; import org.springframework.beans.BeansException; import org.springframework.context.ApplicationContext; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index 422fcbff..6226f176 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -61,6 +61,9 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( "GCA_000001405.10") String asmAccession) throws IOException { try { handler.fetchAndInsertAssemblyByAccession(asmAccession); + // submit jobs for updating ena sequence name and md5 checksum for assembly + handler.retrieveAndInsertENASequenceNameForAssembly(asmAccession); + handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession); } catch (IllegalArgumentException e) { return new ResponseEntity<>(e.getMessage(), HttpStatus.BAD_REQUEST); } @@ -86,6 +89,11 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( return new ResponseEntity<>(HttpStatus.BAD_REQUEST); } Map> accessionResult = handler.fetchAndInsertAssemblyByAccession(accessions); + // submit jobs for updating ena sequence names and md5 checksum for all successfully inserted assemblies + if (accessionResult.get("SUCCESS").size() > 0) { + handler.retrieveAndInsertENASequenceNameForAssembly(accessionResult.get("SUCCESS")); + handler.retrieveAndInsertMd5ChecksumForAssembly(accessionResult.get("SUCCESS")); + } return new ResponseEntity<>("Accession Processing Result : " + accessionResult, HttpStatus.MULTI_STATUS); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java index 32376a63..016725b8 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java @@ -3,22 +3,29 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.context.ApplicationListener; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.conf.ApplicationContextHolder; +import uk.ac.ebi.eva.contigalias.scheduler.Job.Job; +import uk.ac.ebi.eva.contigalias.scheduler.Job.JobSubmittedEvent; +import uk.ac.ebi.eva.contigalias.scheduler.Job.JobType; +import java.util.ArrayList; import java.util.List; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; @Service -public class ChromosomeUpdater implements ApplicationListener { +public class ChromosomeUpdater { private final Logger logger = LoggerFactory.getLogger(ChromosomeUpdater.class); private final BlockingQueue jobQueue = new LinkedBlockingQueue<>(); private final ENASequenceNameUpdater enaSequenceNameUpdater; private final MD5ChecksumUpdater md5ChecksumUpdater; + private AtomicBoolean running = new AtomicBoolean(false); + private Job currentJob; @Autowired public ChromosomeUpdater(ENASequenceNameUpdater enaSequenceNameUpdater, MD5ChecksumUpdater md5ChecksumUpdater) { @@ -28,40 +35,51 @@ public ChromosomeUpdater(ENASequenceNameUpdater enaSequenceNameUpdater, MD5Check public void submitJob(Job job) { jobQueue.add(job); - logger.info("Submitted Job : " + job.getType() + " for assembly " + job.getParameter()); + logger.info("Submitted Job : " + job); JobSubmittedEvent event = new JobSubmittedEvent(this); ApplicationContextHolder.getApplicationContext().publishEvent(event); } public void submitJob(List jobList) { jobQueue.addAll(jobList); - jobList.stream().forEach(job -> logger.info("Submitted Job : " + job.getType() + " for assembly " + job.getParameter())); + jobList.stream().forEach(job -> logger.info("Submitted Job : " + job)); JobSubmittedEvent event = new JobSubmittedEvent(this); ApplicationContextHolder.getApplicationContext().publishEvent(event); } - @Override - public void onApplicationEvent(JobSubmittedEvent event) { - processJobs(); - } - @Async public void processJobs() { + running.set(true); + currentJob = null; while (!jobQueue.isEmpty()) { try { - Job job = jobQueue.take(); - if (job.getType() == JobType.ENA_SEQUENCE_NAME_UPDATE) { - enaSequenceNameUpdater.updateENASequenceNameForAssembly(job.getParameter()); - } else if (job.getType() == JobType.MD5_CHECKSUM_UPDATE) { - md5ChecksumUpdater.updateMD5ChecksumForAssembly(job.getParameter()); + Job currentJob = jobQueue.take(); + if (currentJob.getType() == JobType.ENA_SEQUENCE_NAME_UPDATE) { + enaSequenceNameUpdater.updateENASequenceNameForAssembly(currentJob.getParameter()); + } else if (currentJob.getType() == JobType.MD5_CHECKSUM_UPDATE) { + md5ChecksumUpdater.updateMD5ChecksumForAssembly(currentJob.getParameter()); } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); + } catch (Exception e) { + logger.error("Exception while running job : " + currentJob); } } + currentJob = null; + running.set(false); } public List getScheduledJobStatus() { - return jobQueue.stream().map(j -> j.getType().toString()).collect(Collectors.toList()); + List jobList = new ArrayList<>(); + if (currentJob != null) { + jobList.add(currentJob.toString()); + } + jobList.addAll(jobQueue.stream() + .map(j -> j.getType().toString() + " : " + j.getParameter()) + .collect(Collectors.toList())); + + return jobList; + } + + public AtomicBoolean isRunning() { + return running; } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java index 7b14c7e3..c7be1610 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java @@ -59,7 +59,7 @@ public void updateENASequenceNameForAssembly(String assembly) { } } - public void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedENAFilePath) throws IOException { + private void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedENAFilePath) throws IOException { try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { long chromosomesSavedTillNow = 0l; List chrLines = new ArrayList<>(); @@ -73,7 +73,7 @@ public void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedEN List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); chromosomesSavedTillNow += chrLines.size(); - logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + logger.info("Number of chromosomes updated till now : " + chromosomesSavedTillNow); chrLines = new ArrayList<>(); } @@ -83,7 +83,7 @@ public void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedEN List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); chromosomesSavedTillNow += chrLines.size(); - logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); + logger.info("Number of chromosomes updated till now : " + chromosomesSavedTillNow); } } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/Job.java similarity index 70% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/Job.java index a5c39ec4..77bd78b6 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/Job.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler; +package uk.ac.ebi.eva.contigalias.scheduler.Job; public class Job { private final JobType type; @@ -16,4 +16,9 @@ public JobType getType() { public String getParameter() { return parameter; } + + @Override + public String toString() { + return type + " : " + parameter; + } } \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEvent.java similarity index 79% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEvent.java index df9eb94a..1a86340b 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobSubmittedEvent.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEvent.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler; +package uk.ac.ebi.eva.contigalias.scheduler.Job; import org.springframework.context.ApplicationEvent; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java new file mode 100644 index 00000000..2b1dfb89 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java @@ -0,0 +1,23 @@ +package uk.ac.ebi.eva.contigalias.scheduler.Job; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.ApplicationListener; +import org.springframework.stereotype.Component; +import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; + +@Component +public class JobSubmittedEventHandler implements ApplicationListener { + private ChromosomeUpdater chromosomeUpdater; + + @Autowired + public JobSubmittedEventHandler(ChromosomeUpdater chromosomeUpdater) { + this.chromosomeUpdater = chromosomeUpdater; + } + + @Override + public void onApplicationEvent(JobSubmittedEvent event) { + if (!chromosomeUpdater.isRunning().get()) { + chromosomeUpdater.processJobs(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobType.java similarity index 61% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobType.java index 204e423e..6bf8f58b 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/JobType.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobType.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler; +package uk.ac.ebi.eva.contigalias.scheduler.Job; public enum JobType { ENA_SEQUENCE_NAME_UPDATE, diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java index 978d7580..a586dc80 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -66,7 +66,7 @@ public void updateMD5ChecksumForAssembly(String assembly) { } } - public void updateMd5ChecksumForChromosome(String assembly, List chromosomesList) { + private void updateMd5ChecksumForChromosome(String assembly, List chromosomesList) { chromosomesList.parallelStream().forEach(chromosome -> { try { String md5Checksum = retrieveMd5Checksum(chromosome.getInsdcAccession()); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index 65c06fcd..430e7a73 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -32,8 +32,8 @@ import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; -import uk.ac.ebi.eva.contigalias.scheduler.Job; -import uk.ac.ebi.eva.contigalias.scheduler.JobType; +import uk.ac.ebi.eva.contigalias.scheduler.Job.Job; +import uk.ac.ebi.eva.contigalias.scheduler.Job.JobType; import javax.transaction.Transactional; import java.io.BufferedReader; @@ -117,14 +117,6 @@ public void fetchAndInsertAssembly(String accession) { logger.info("Start inserting assembly for accession " + accession); parseFileAndInsertAssembly(accession); logger.info("Successfully inserted assembly for accession " + accession); - - // submit job for updating ENA Sequence name for assembly (asynchronously) - Job enaSequenceNameupdateJob = new Job(JobType.ENA_SEQUENCE_NAME_UPDATE, accession); - chromosomeUpdater.submitJob(enaSequenceNameupdateJob); - - // submit job for updating MD5 Checksum for assembly (asynchronously) - Job md5ChecksumupdateJob = new Job(JobType.MD5_CHECKSUM_UPDATE, accession); - chromosomeUpdater.submitJob(md5ChecksumupdateJob); } catch (Exception e) { // roll back inserted entries in case of any exception or error logger.error("Exception while inserting assembly " + accession + " Rolling back changes. \n" + e); From b61f196280ecfc20e2387e6bd62f635b19b53faf Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Sun, 11 Feb 2024 12:39:25 +0000 Subject: [PATCH 09/13] removed unused code, update old test and add new test --- .../datasource/AssemblyDataSource.java | 28 ------ .../datasource/ENAAssemblyDataSource.java | 89 +------------------ .../datasource/NCBIAssemblyDataSource.java | 32 +------ .../contigalias/entities/AssemblyEntity.java | 2 +- .../contigalias/entities/SequenceEntity.java | 3 +- .../scheduler/ChromosomeUpdater.java | 5 +- .../scheduler/ENASequenceNameUpdater.java | 3 +- .../service/ChromosomeService.java | 2 +- .../datasource/ENAAssemblyDataSourceTest.java | 34 ++++--- .../NCBIAssemblyDataSourceTest.java | 42 ++++++--- .../scheduler/ENASequenceNameUpdaterTest.java | 61 +++++++++++++ .../scheduler/MD5ChecksumUpdaterTest.java | 70 +++++++++++++++ ...lyAndChromosomeServiceIntegrationTest.java | 3 + .../AssemblyServiceIntegrationTest.java | 9 +- .../resources/application-test.properties | 1 + 15 files changed, 193 insertions(+), 191 deletions(-) delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java deleted file mode 100644 index 1c646ca8..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblyDataSource.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2020 EMBL - European Bioinformatics Institute - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package uk.ac.ebi.eva.contigalias.datasource; - -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; - -import java.io.IOException; -import java.util.Optional; - -public interface AssemblyDataSource { - - Optional getAssemblyByAccession(String accession) throws IOException; - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index 5bdc32a7..5279357a 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -28,27 +28,18 @@ import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReaderFactory; import uk.ac.ebi.eva.contigalias.dus.ENABrowser; import uk.ac.ebi.eva.contigalias.dus.ENABrowserFactory; -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; import uk.ac.ebi.eva.contigalias.exception.DownloadFailedException; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.Objects; import java.util.Optional; -import java.util.stream.Collectors; @Repository("ENADataSource") -public class ENAAssemblyDataSource implements AssemblyDataSource { +public class ENAAssemblyDataSource { private final Logger logger = LoggerFactory.getLogger(ENAAssemblyDataSource.class); @@ -66,47 +57,16 @@ public ENAAssemblyDataSource(ENABrowserFactory factory, this.readerFactory = readerFactory; } - @Override - public Optional getAssemblyByAccession(String accession) throws IOException { - ENABrowser enaBrowser = factory.build(); - enaBrowser.connect(); - try { - Optional downloadFilePath = downloadAssemblyReport(enaBrowser, accession); - if (!downloadFilePath.isPresent()) { - return Optional.empty(); - } - - AssemblyEntity assemblyEntity; - try (InputStream stream = new FileInputStream(downloadFilePath.get().toFile())) { - ENAAssemblyReportReader reader = readerFactory.build(stream); - assemblyEntity = reader.getAssemblyEntity(); - logger.info("ENA: Number of chromosomes in " + accession + " : " + assemblyEntity.getChromosomes().size()); - } finally { - try { - enaBrowser.disconnect(); - Files.deleteIfExists(downloadFilePath.get()); - } catch (IOException e) { - logger.warn("Error while trying to disconnect - enaBrowser (assembly: " + accession + ") : " + e); - } - } - return Optional.of(assemblyEntity); - } catch (Exception e) { - logger.warn("Could not fetch Assembly Report from ENA for accession " + accession + "Exception: " + e); - return Optional.empty(); - } - - } - public Optional downloadAssemblyReport(String accession) throws IOException { ENABrowser enaBrowser = factory.build(); enaBrowser.connect(); try { enaBrowser.connect(); return downloadAssemblyReport(enaBrowser, accession); - } catch (Exception e){ + } catch (Exception e) { logger.warn("Could not fetch Assembly Report from ENA for accession " + accession + "Exception: " + e); return Optional.empty(); - }finally { + } finally { try { enaBrowser.disconnect(); } catch (IOException e) { @@ -150,47 +110,4 @@ public List getChromosomeEntityList(List chrDataList) public ChromosomeEntity getChromosomeEntity(String chrLine) { return ENAAssemblyReportReader.getChromosomeEntity(chrLine); } - - /** - * Adds ENA sequence names to chromosomes and scaffolds in an assembly. Will modify the AssemblyEntity in-place. - * - * @param optional {@link AssemblyEntity} to add ENA sequence names to - * @throws IOException Passes IOException thrown by {@link #getAssemblyByAccession(String)} - */ - public void addENASequenceNamesToAssembly(AssemblyEntity targetAssembly) throws IOException { - if (!hasAllEnaSequenceNames(targetAssembly)) { - String insdcAccession = targetAssembly.getInsdcAccession(); - Optional enaAssembly = getAssemblyByAccession(insdcAccession); - - if (enaAssembly.isPresent()) { - AssemblyEntity sourceAssembly = enaAssembly.get(); - addENASequenceNames(Objects.nonNull(sourceAssembly.getChromosomes()) ? - sourceAssembly.getChromosomes() : Collections.emptyList(), - Objects.nonNull(targetAssembly.getChromosomes()) ? - targetAssembly.getChromosomes() : Collections.emptyList()); - } - } - } - - public boolean hasAllEnaSequenceNames(AssemblyEntity assembly) { - List chromosomes = Objects.nonNull(assembly.getChromosomes()) ? - assembly.getChromosomes() : Collections.emptyList(); - return chromosomes.stream().allMatch(sequence -> sequence.getEnaSequenceName() != null); - } - - public void addENASequenceNames( - List sourceSequences, List targetSequences) { - if (targetSequences == null || sourceSequences == null || targetSequences.isEmpty() || sourceSequences.isEmpty()) { - return; - } - Map insdcToSequenceEntityMap = targetSequences.stream() - .collect(Collectors.toMap(s->s.getInsdcAccession(), s->s)); - - for (SequenceEntity sourceSeq : sourceSequences) { - String sourceInsdcAccession = sourceSeq.getInsdcAccession(); - if (insdcToSequenceEntityMap.containsKey(sourceInsdcAccession)) { - insdcToSequenceEntityMap.get(sourceInsdcAccession).setEnaSequenceName(sourceSeq.getEnaSequenceName()); - } - } - } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index 22f5c2a4..0e7046f0 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -31,9 +31,7 @@ import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -43,7 +41,7 @@ import java.util.stream.Collectors; @Repository("NCBIDataSource") -public class NCBIAssemblyDataSource implements AssemblyDataSource { +public class NCBIAssemblyDataSource { private final Logger logger = LoggerFactory.getLogger(NCBIAssemblyDataSource.class); @@ -61,34 +59,6 @@ public NCBIAssemblyDataSource(NCBIBrowserFactory factory, this.readerFactory = readerFactory; } - @Override - public Optional getAssemblyByAccession( - String accession) throws IOException, IllegalArgumentException { - NCBIBrowser ncbiBrowser = factory.build(); - ncbiBrowser.connect(); - - Optional downloadFilePath = downloadAssemblyReport(accession, ncbiBrowser); - if (!downloadFilePath.isPresent()) { - return Optional.empty(); - } - - AssemblyEntity assemblyEntity; - try (InputStream stream = new FileInputStream(downloadFilePath.get().toFile())) { - NCBIAssemblyReportReader reader = readerFactory.build(stream); - assemblyEntity = reader.getAssemblyEntity(); - logger.info("NCBI: Number of chromosomes in " + accession + " : " + - (assemblyEntity.getChromosomes() != null ? assemblyEntity.getChromosomes().size() : 0)); - } finally { - try { - ncbiBrowser.disconnect(); - Files.deleteIfExists(downloadFilePath.get()); - } catch (IOException e) { - logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ") : " + e); - } - } - return Optional.of(assemblyEntity); - } - public AssemblyEntity getAssemblyEntity(Path downloadFilePath) throws IOException { List asmDataLines = Files.lines(downloadFilePath) .filter(line -> line.startsWith("#")) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java index 4609d941..129764e0 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblyEntity.java @@ -65,7 +65,7 @@ public class AssemblyEntity { @JsonInclude(JsonInclude.Include.NON_NULL) @ApiModelProperty(value = "List of all chromosomes of the assembly present in the database.") @LazyCollection(LazyCollectionOption.TRUE) - @OneToMany(mappedBy = "assembly", cascade = CascadeType.REMOVE, fetch = FetchType.LAZY) + @OneToMany(mappedBy = "assembly", cascade = CascadeType.ALL, fetch = FetchType.LAZY) private List chromosomes; public AssemblyEntity() { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java index 641b3af4..e5afdbdc 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java @@ -19,6 +19,7 @@ import com.fasterxml.jackson.annotation.JsonInclude; import io.swagger.annotations.ApiModelProperty; +import javax.persistence.CascadeType; import javax.persistence.Column; import javax.persistence.EnumType; import javax.persistence.Enumerated; @@ -67,7 +68,7 @@ public enum ContigType { @Id @JsonInclude(JsonInclude.Include.NON_NULL) @ApiModelProperty(value = "Assembly that this sequence belongs to.") - @ManyToOne + @ManyToOne(cascade = CascadeType.ALL) private AssemblyEntity assembly; public String getGenbankSequenceName() { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java index 016725b8..62c100dd 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java @@ -23,8 +23,7 @@ public class ChromosomeUpdater { private final BlockingQueue jobQueue = new LinkedBlockingQueue<>(); private final ENASequenceNameUpdater enaSequenceNameUpdater; private final MD5ChecksumUpdater md5ChecksumUpdater; - private AtomicBoolean running = new AtomicBoolean(false); - + private final AtomicBoolean running = new AtomicBoolean(false); private Job currentJob; @Autowired @@ -53,7 +52,7 @@ public void processJobs() { currentJob = null; while (!jobQueue.isEmpty()) { try { - Job currentJob = jobQueue.take(); + currentJob = jobQueue.take(); if (currentJob.getType() == JobType.ENA_SEQUENCE_NAME_UPDATE) { enaSequenceNameUpdater.updateENASequenceNameForAssembly(currentJob.getParameter()); } else if (currentJob.getType() == JobType.MD5_CHECKSUM_UPDATE) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java index c7be1610..026171d3 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java @@ -79,12 +79,13 @@ private void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedE } } if (!chrLines.isEmpty()) { - // add ena sequence name and save List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); chromosomesSavedTillNow += chrLines.size(); logger.info("Number of chromosomes updated till now : " + chromosomesSavedTillNow); } } + + logger.info("Finished updating ENA Sequence Name for assembly: " + assembly); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 7ec7585c..77e2564e 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -284,7 +284,7 @@ public long countChromosomeEntitiesByEnaName(String enaName) { return repository.countChromosomeEntitiesByEnaSequenceName(enaName); } - void saveAllChromosomes(List chromosomeEntityList) { + public void saveAllChromosomes(List chromosomeEntityList) { String sql = "INSERT INTO chromosome (assembly_insdc_accession,contig_type,ena_sequence_name," + "genbank_sequence_name,insdc_accession,md5checksum,refseq,seq_length,trunc512checksum,ucsc_name) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java index 11f7bd5c..b4b6ff26 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSourceTest.java @@ -20,16 +20,16 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @ActiveProfiles("test") @@ -41,24 +41,22 @@ public class ENAAssemblyDataSourceTest { @Autowired private ENAAssemblyDataSource enaDataSource; - @Autowired - private NCBIAssemblyDataSource ncbiDataSource; - @Test - public void getAssemblyByAccessionGCAHavingChromosomes() throws IOException { - Optional accession = enaDataSource.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES); - assertTrue(accession.isPresent()); - List chromosomes = accession.get().getChromosomes(); - assertNotNull(chromosomes); - assertFalse(chromosomes.isEmpty()); + public void testDownloadAssemblyReport() throws IOException { + Optional downloadedAssemblyReport = enaDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + assertTrue(downloadedAssemblyReport.isPresent()); + assertTrue(Files.exists(downloadedAssemblyReport.get())); } @Test - public void getENASequenceNamesForAssembly() throws IOException { - Optional assembly = ncbiDataSource.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES); - enaDataSource.addENASequenceNamesToAssembly(assembly.get()); - assertTrue(assembly.isPresent()); - assertTrue(enaDataSource.hasAllEnaSequenceNames(assembly.get())); + public void getChromosomeEntityFromAssemblyReport() throws IOException { + Optional downloadedAssemblyReport = enaDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + List chrLines = Files.lines(downloadedAssemblyReport.get()) + .filter(l -> !l.startsWith("accession")) + .collect(Collectors.toList()); + List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); + assertEquals(3143, chromosomeEntityList.size()); + chromosomeEntityList.stream().forEach(c -> assertTrue(!c.getEnaSequenceName().isEmpty())); } } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java index 589b441e..9440ca3d 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSourceTest.java @@ -20,19 +20,18 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @ActiveProfiles("test") @@ -46,23 +45,40 @@ public class NCBIAssemblyDataSourceTest { @Autowired private NCBIAssemblyDataSource dataSource; + @Test + public void testDownloadAssemblyReport() throws IOException { + Optional downloadedAssemblyReport = dataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + assertTrue(downloadedAssemblyReport.isPresent()); + assertTrue(Files.exists(downloadedAssemblyReport.get())); + } + @Test public void getAssemblyByAccessionGCAHavingChromosomes() throws IOException { - Optional accession = dataSource.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES); - assertTrue(accession.isPresent()); - List chromosomes = accession.get().getChromosomes(); - assertNotNull(chromosomes); - assertFalse(chromosomes.isEmpty()); + Optional downloadedAssemblyReport = dataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES); + AssemblyEntity assembly = dataSource.getAssemblyEntity(downloadedAssemblyReport.get()); + assertEquals(GCA_ACCESSION_HAVING_CHROMOSOMES, assembly.getInsdcAccession()); + List chrLines = Files.lines(downloadedAssemblyReport.get()) + .filter(l -> !l.startsWith("#")) + .collect(Collectors.toList()); + List chromosomeEntityList = dataSource.getChromosomeEntityList(assembly, chrLines); + assertEquals(3143, chromosomeEntityList.size()); } @Test public void getAssemblyByAccessionGCFNoChromosomes() throws IOException { - Optional accession = dataSource.getAssemblyByAccession(GCF_ACCESSION_NO_CHROMOSOMES); - assertTrue(accession.isPresent()); - List chromosomes = accession.get().getChromosomes().stream() - .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.CHROMOSOME)) + Optional downloadedAssemblyReport = dataSource.downloadAssemblyReport(GCF_ACCESSION_NO_CHROMOSOMES); + AssemblyEntity assembly = dataSource.getAssemblyEntity(downloadedAssemblyReport.get()); + assertEquals("GCA_006125015.1", assembly.getInsdcAccession()); + List chrLines = Files.lines(downloadedAssemblyReport.get()) + .filter(l -> !l.startsWith("#")) .collect(Collectors.toList()); - assertEquals(0, chromosomes.size()); + List chromosomeEntityList = dataSource.getChromosomeEntityList(assembly, chrLines); + long numOfChromosomes = chromosomeEntityList.stream() + .filter(c -> c.getContigType() == SequenceEntity.ContigType.CHROMOSOME).count(); + long numOfScaffolds = chromosomeEntityList.stream() + .filter(c -> c.getContigType() == SequenceEntity.ContigType.SCAFFOLD).count(); + assertEquals(0, numOfChromosomes); + assertEquals(2, numOfScaffolds); } } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java new file mode 100644 index 00000000..b6dd79db --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdaterTest.java @@ -0,0 +1,61 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.data.domain.PageRequest; +import org.springframework.test.annotation.DirtiesContext; +import org.springframework.test.context.ActiveProfiles; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.service.AssemblyService; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.util.LinkedList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; + + +@ActiveProfiles("test") +@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) +@SpringBootTest +public class ENASequenceNameUpdaterTest { + private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5"; + + @Autowired + private ENASequenceNameUpdater enaSequenceNameUpdater; + + @Autowired + private AssemblyService assemblyService; + + @Autowired + private ChromosomeService chromosomeService; + private final List chromosomeEntities = new LinkedList<>(); + + @BeforeEach + void setup() { + assemblyService.fetchAndInsertAssembly(GCA_ACCESSION_HAVING_CHROMOSOMES); + } + + @AfterEach + void tearDown() { + chromosomeEntities.stream().forEach(c -> chromosomeService.deleteChromosome(c)); + assemblyService.deleteAssembly(assemblyService.getAssemblyByAccession(GCA_ACCESSION_HAVING_CHROMOSOMES).get()); + } + + @Test + public void testUpdateENASequenceName() { + List chromosomeListBeforeUpdate = chromosomeService.getChromosomesByInsdcAccession(GCA_ACCESSION_HAVING_CHROMOSOMES, + PageRequest.of(0, 5000)).getContent(); + chromosomeListBeforeUpdate.stream().forEach(c -> assertNull(c.getEnaSequenceName())); + + enaSequenceNameUpdater.updateENASequenceNameForAssembly(GCA_ACCESSION_HAVING_CHROMOSOMES); + + List chromosomeListAfterUpdate = chromosomeService.getChromosomesByInsdcAccession(GCA_ACCESSION_HAVING_CHROMOSOMES, + PageRequest.of(0, 5000)).getContent(); + chromosomeListAfterUpdate.stream().forEach(c -> assertNotNull(c.getEnaSequenceName())); + } +} diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java new file mode 100644 index 00000000..6969b670 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java @@ -0,0 +1,70 @@ +package uk.ac.ebi.eva.contigalias.scheduler; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.data.domain.PageRequest; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.test.annotation.DirtiesContext; +import org.springframework.test.context.ActiveProfiles; +import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; +import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; +import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator; +import uk.ac.ebi.eva.contigalias.service.ChromosomeService; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.Mockito.mock; + +@ActiveProfiles("test") +@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) +@SpringBootTest +class MD5ChecksumUpdaterTest { + private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata"; + private AssemblyEntity assemblyEntity = AssemblyGenerator.generate(); + private List chromosomeEntityList = new ArrayList<>(); + @Autowired + private JdbcTemplate jdbcTemplate; + @Autowired + private ChromosomeService chromosomeService; + private MD5ChecksumUpdater md5ChecksumUpdater; + + @BeforeEach + void setup() throws JsonProcessingException { + RestTemplate restTemplate = mock(RestTemplate.class); + md5ChecksumUpdater = new MD5ChecksumUpdater(restTemplate, jdbcTemplate, chromosomeService); + for (int i = 0; i < 5; i++) { + ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity); + chromosomeEntityList.add(chromosomeEntity); + chromosomeService.insertChromosome(chromosomeEntity); + + String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}"; + Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER", + chromosomeEntity.getInsdcAccession()), JsonNode.class)) + .thenReturn(new ObjectMapper().readTree(jsonMD5Response)); + } + } + + @Test + void testUpdateMD5ChecksumForAssembly() { + chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), + PageRequest.of(0, 100)) + .forEach(c -> assertNull(c.getMd5checksum())); + + md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession()); + + chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), + PageRequest.of(0, 100)) + .forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum())); + } +} diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java index 02789609..b9447aaf 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java @@ -184,6 +184,9 @@ void setup() { @AfterEach void tearDown() { + for(ChromosomeEntity chromosomeEntity: chromosomeEntities){ + service.deleteChromosome(chromosomeEntity); + } for (AssemblyEntity assemblyEntity : assemblyEntities) { assemblyService.deleteAssembly(assemblyEntity); } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java index 9fbcd04d..5b1ac463 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyServiceIntegrationTest.java @@ -20,7 +20,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.data.domain.Page; @@ -34,7 +33,6 @@ import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; -import java.io.IOException; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -49,7 +47,6 @@ @DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) @SpringBootTest public class AssemblyServiceIntegrationTest { - private static final int TEST_ENTITIES_NUMBERS = 11; private final AssemblyEntity[] entities = new AssemblyEntity[TEST_ENTITIES_NUMBERS]; @@ -67,17 +64,13 @@ public class AssemblyServiceIntegrationTest { private AssemblyService service; @BeforeEach - void setup() throws IOException { + void setup() { NCBIAssemblyDataSource mockNcbiDataSource = mock(NCBIAssemblyDataSource.class); ENAAssemblyDataSource mockEnaDataSource = mock(ENAAssemblyDataSource.class); ChromosomeUpdater chromosomeUpdater = mock(ChromosomeUpdater.class); for (int i = 0; i < entities.length; i++) { AssemblyEntity generate = AssemblyGenerator.generate(i); entities[i] = generate; - Mockito.when(mockNcbiDataSource.getAssemblyByAccession(generate.getInsdcAccession())) - .thenReturn(Optional.of(generate)); - Mockito.when(mockNcbiDataSource.getAssemblyByAccession(generate.getRefseq())) - .thenReturn(Optional.of(generate)); } service = new AssemblyService(chromosomeService, repository, chromosomeRepository, mockNcbiDataSource, mockEnaDataSource, chromosomeUpdater); diff --git a/src/test/resources/application-test.properties b/src/test/resources/application-test.properties index 166fc87b..83395a0e 100644 --- a/src/test/resources/application-test.properties +++ b/src/test/resources/application-test.properties @@ -25,6 +25,7 @@ spring.h2.console.path=/console/ spring.jpa.hibernate.ddl-auto=create-drop spring.jpa.show-sql=false spring.jpa.properties.hibernate.format_sql=false +spring.jpa.open-in-view=false ftp.proxy.host=null ftp.proxy.port=0 From 92d1099062feda17f57a068dfe051e96a688337b Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Mon, 12 Feb 2024 10:05:20 +0000 Subject: [PATCH 10/13] update rest api path --- .../eva/contigalias/controller/admin/AdminController.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index 6226f176..4ddb25f3 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -127,7 +127,7 @@ public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly( } @ApiOperation(value = "Given an assembly accession, retrieve ENA sequence name for all chromosomes belonging to assembly and update") - @PutMapping(value = "assemblies/enaSequenceName/{accession}") + @PutMapping(value = "assemblies/ena-sequence-name/{accession}") public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathVariable(name = "accession") @ApiParam(value = "INSDC or RefSeq assembly accession. " + "Eg: GCA_000001405.10") String asmAccession) { @@ -144,7 +144,7 @@ public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathV } @ApiOperation(value = "Given a list of assembly accessions, retrieve ENA sequence name for all chromosomes belonging to all the assemblies and update") - @PutMapping(value = "assemblies/enaSequenceName") + @PutMapping(value = "assemblies/ena-sequence-name") public ResponseEntity retrieveAndInsertENASequenceNameForAssembly( @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { @@ -157,7 +157,7 @@ public ResponseEntity retrieveAndInsertENASequenceNameForAssembly( @ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ") - @GetMapping(value = "assemblies/scheduledJobs") + @GetMapping(value = "assemblies/scheduled-jobs") public ResponseEntity> getMD5ChecksumUpdateTaskStatus() { List scheduledJobStatus = handler.getScheduledJobStatus(); return ResponseEntity.ok(scheduledJobStatus); From 3d23017e2a39a02e4af795aa8ee620b96c3e0065 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Mon, 12 Feb 2024 13:04:28 +0000 Subject: [PATCH 11/13] check if assembly exist before trying to update chromosomes in the assembly --- .../controller/admin/AdminController.java | 92 ++++++++++++++----- .../scheduler/ENASequenceNameUpdater.java | 17 ++-- .../scheduler/MD5ChecksumUpdater.java | 10 +- .../contigalias/service/AssemblyService.java | 8 +- 4 files changed, 83 insertions(+), 44 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index 4ddb25f3..a954cf56 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -28,11 +28,13 @@ import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; -import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; +import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Optional; @RequestMapping("/v1/admin") @RestController @@ -102,15 +104,14 @@ public ResponseEntity fetchAndInsertAssemblyByAccession( public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly(@PathVariable(name = "accession") @ApiParam(value = "INSDC or RefSeq assembly accession. Eg: " + "GCA_000001405.10") String asmAccession) { - try { - handler.getAssemblyByAccession(asmAccession); - handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession); - return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " + - "in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " + - "and other scheduled jobs, this might take some time to complete"); - } catch (AssemblyNotFoundException e) { + Optional assemblyOpt = handler.getAssemblyByAccession(asmAccession); + if (assemblyOpt.isPresent()) { + handler.retrieveAndInsertMd5ChecksumForAssembly(assemblyOpt.get().getInsdcAccession()); + return ResponseEntity.ok("A task has been submitted for updating md5checksum for assembly " + asmAccession + + "\nDepending upon the size of assembly and other scheduled jobs, this might take some time to complete"); + } else { return ResponseEntity.ok("Could not find assembly " + asmAccession + - ". Please insert the assembly first (md5checksum will be updated as part of the insertion process"); + ". Please insert the assembly first. MD5 checksum will be updated as part of the insertion process"); } } @@ -119,11 +120,32 @@ public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly(@PathVaria public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly( @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { - handler.retrieveAndInsertMd5ChecksumForAssembly(accessions); - return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " + - "in assemblies " + accessions + ". Depending upon the number of chromosomes present in assembly, " + - "and other scheduled jobs, this might take some time to complete"); + if (accessions == null || accessions.size() <= 0) { + return new ResponseEntity<>(HttpStatus.BAD_REQUEST); + } + + List asmInsdcAccessionsList = new ArrayList<>(); + List asmNotPresent = new ArrayList<>(); + for (String accession : accessions) { + Optional assemblyOpt = handler.getAssemblyByAccession(accession); + if (assemblyOpt.isPresent()) { + asmInsdcAccessionsList.add(assemblyOpt.get().getInsdcAccession()); + } else { + asmNotPresent.add(accession); + } + } + + handler.retrieveAndInsertMd5ChecksumForAssembly(asmInsdcAccessionsList); + + accessions.removeAll(asmNotPresent); + String responseText = "A task has been submitted for updating MD5 checksum for assemblies: " + accessions + "." + + "\nDepending upon other scheduled jobs and the size of assembly, this might take some time to complete"; + if (!asmNotPresent.isEmpty()) { + responseText = responseText + "\nThe following assemblies are not present: " + asmNotPresent + "." + + "\nPlease insert the assembly first, MD5 Checksum will be updated as part of the insertion process"; + } + return ResponseEntity.ok(responseText); } @ApiOperation(value = "Given an assembly accession, retrieve ENA sequence name for all chromosomes belonging to assembly and update") @@ -131,15 +153,14 @@ public ResponseEntity retrieveAndInsertMd5ChecksumForAssembly( public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathVariable(name = "accession") @ApiParam(value = "INSDC or RefSeq assembly accession. " + "Eg: GCA_000001405.10") String asmAccession) { - try { - handler.getAssemblyByAccession(asmAccession); - handler.retrieveAndInsertENASequenceNameForAssembly(asmAccession); - return ResponseEntity.ok("A task has been submitted for updating ENA Sequence Name for all chromosomes " + - "in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " + - "and other scheduled jobs, this might take some time to complete"); - } catch (AssemblyNotFoundException e) { + Optional assemblyOpt = handler.getAssemblyByAccession(asmAccession); + if (assemblyOpt.isPresent()) { + handler.retrieveAndInsertENASequenceNameForAssembly(assemblyOpt.get().getInsdcAccession()); + return ResponseEntity.ok("A task has been submitted for updating ENA Sequence Name for assembly " + asmAccession + + "\nDepending upon the size of assembly and other scheduled jobs, this might take some time to complete"); + } else { return ResponseEntity.ok("Could not find assembly " + asmAccession + - ". Please insert the assembly first (ENA sequence name will be updated as part of the insertion process"); + ". Please insert the assembly first. ENA sequence name will be updated as part of the insertion process"); } } @@ -148,11 +169,32 @@ public ResponseEntity retrieveAndInsertENASequenceNameForAssembly(@PathV public ResponseEntity retrieveAndInsertENASequenceNameForAssembly( @RequestBody @ApiParam(value = "A JSON array of INSDC or RefSeq assembly accessions. " + "Eg: [\"GCA_000001405.10\",\"GCA_000001405.11\",\"GCA_000001405.12\"]") List accessions) { + if (accessions == null || accessions.size() <= 0) { + return new ResponseEntity<>(HttpStatus.BAD_REQUEST); + } + + List asmInsdcAccessionsList = new ArrayList<>(); + List asmNotPresent = new ArrayList<>(); + for (String accession : accessions) { + Optional assemblyOpt = handler.getAssemblyByAccession(accession); + if (assemblyOpt.isPresent()) { + asmInsdcAccessionsList.add(assemblyOpt.get().getInsdcAccession()); + } else { + asmNotPresent.add(accession); + } + } + + handler.retrieveAndInsertENASequenceNameForAssembly(asmInsdcAccessionsList); + + accessions.removeAll(asmNotPresent); + String responseText = "A task has been submitted for updating ENA Sequence Name for assemblies: " + accessions + + "\nDepending upon other scheduled jobs and the size of assembly, this might take some time to complete"; + if (!asmNotPresent.isEmpty()) { + responseText = responseText + "\nThe following assemblies are not present: " + asmNotPresent + "." + + "\nPlease insert the assembly first, ENA Sequence Name will be updated as part of the insertion process"; + } - handler.retrieveAndInsertENASequenceNameForAssembly(accessions); - return ResponseEntity.ok("A task has been submitted for updating ENA Sequence Name for all chromosomes " + - "in assembly " + accessions + ". Depending upon the number of chromosomes present in assembly, " + - "and other scheduled jobs, this might take some time to complete"); + return ResponseEntity.ok(responseText); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java index 026171d3..5a28fe80 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ENASequenceNameUpdater.java @@ -12,7 +12,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Optional; @@ -31,7 +30,7 @@ public ENASequenceNameUpdater(ENAAssemblyDataSource enaDataSource, ChromosomeSer } public void updateENASequenceNameForAssembly(String assembly) { - Path downloadedENAFilePath = Paths.get(""); + Path downloadedENAFilePath = null; try { logger.info("Trying to update ENA Sequence Name for assembly: " + assembly); Optional downloadENAFilePathOpt = enaDataSource.downloadAssemblyReport(assembly); @@ -51,7 +50,9 @@ public void updateENASequenceNameForAssembly(String assembly) { logger.error("Error while updating ENA Sequence Name for assembly : " + assembly + "\n" + e); } finally { try { - Files.deleteIfExists(downloadedENAFilePath); + if (downloadedENAFilePath != null) { + Files.deleteIfExists(downloadedENAFilePath); + } } catch (IOException e) { logger.error("Error while deleting downloaded ENA assembly report file with path " + downloadedENAFilePath + " for assembly : " + assembly); @@ -61,7 +62,7 @@ public void updateENASequenceNameForAssembly(String assembly) { private void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedENAFilePath) throws IOException { try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedENAFilePath.toFile()))) { - long chromosomesSavedTillNow = 0l; + long chromosomesProcessedTillNow = 0l; List chrLines = new ArrayList<>(); String line; while ((line = bufferedReader.readLine()) != null) { @@ -72,8 +73,8 @@ private void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedE if (chrLines.size() == DEFAULT_BATCH_SIZE) { List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); - chromosomesSavedTillNow += chrLines.size(); - logger.info("Number of chromosomes updated till now : " + chromosomesSavedTillNow); + chromosomesProcessedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesProcessedTillNow); chrLines = new ArrayList<>(); } @@ -81,8 +82,8 @@ private void retrieveAndUpdateENASequenceNames(String assembly, Path downloadedE if (!chrLines.isEmpty()) { List chromosomeEntityList = enaDataSource.getChromosomeEntityList(chrLines); chromosomeService.updateENASequenceNameForAllChromosomeInAssembly(assembly, chromosomeEntityList); - chromosomesSavedTillNow += chrLines.size(); - logger.info("Number of chromosomes updated till now : " + chromosomesSavedTillNow); + chromosomesProcessedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesProcessedTillNow); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java index a586dc80..b9db041d 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -37,7 +37,7 @@ public void updateMD5ChecksumForAssembly(String assembly) { String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { - long chromosomeUpdated = 0; + long chromosomeProcessed = 0; List chromosomeEntityList = new ArrayList<>(); while (rs.next()) { ChromosomeEntity chromosome = new ChromosomeEntity(); @@ -46,15 +46,15 @@ public void updateMD5ChecksumForAssembly(String assembly) { if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) { updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeUpdated += chromosomeEntityList.size(); - logger.info("Chromosomes Updated till now: " + chromosomeUpdated); + chromosomeProcessed += chromosomeEntityList.size(); + logger.info("Chromosomes Processed till now: " + chromosomeProcessed); chromosomeEntityList = new ArrayList<>(); } } if (chromosomeEntityList.size() > 0) { updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeUpdated += chromosomeEntityList.size(); - logger.info("Chromosomes Updated till now: " + chromosomeUpdated); + chromosomeProcessed += chromosomeEntityList.size(); + logger.info("Chromosomes Processed till now: " + chromosomeProcessed); } logger.info("Finished updating MD5 Checksum for assembly: " + assembly); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index 430e7a73..5b630c22 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -202,12 +202,8 @@ public List getScheduledJobStatus() { } public Optional getAssemblyByAccession(String accession) { - Optional entity = assemblyRepository.findAssemblyEntityByAccession(accession); - if (entity.isPresent()) { - return entity; - } else { - throw new AssemblyNotFoundException(accession); - } + Optional assemblyEntity = assemblyRepository.findAssemblyEntityByAccession(accession); + return assemblyEntity; } @Transactional From cb4f0f490c1e89eb30e1c14a1a34936dbb0d7d79 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Mon, 12 Feb 2024 14:28:21 +0000 Subject: [PATCH 12/13] review comments --- .../contigalias/controller/admin/AdminController.java | 2 +- .../ebi/eva/contigalias/scheduler/ChromosomeUpdater.java | 6 +++--- .../ebi/eva/contigalias/scheduler/{Job => job}/Job.java | 2 +- .../scheduler/{Job => job}/JobSubmittedEvent.java | 2 +- .../scheduler/{Job => job}/JobSubmittedEventHandler.java | 2 +- .../eva/contigalias/scheduler/{Job => job}/JobType.java | 2 +- .../ac/ebi/eva/contigalias/service/AssemblyService.java | 9 ++++----- .../ebi/eva/contigalias/service/ChromosomeService.java | 5 +++-- 8 files changed, 15 insertions(+), 15 deletions(-) rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{Job => job}/Job.java (89%) rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{Job => job}/JobSubmittedEvent.java (79%) rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{Job => job}/JobSubmittedEventHandler.java (93%) rename src/main/java/uk/ac/ebi/eva/contigalias/scheduler/{Job => job}/JobType.java (61%) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java index a954cf56..e35a6ab7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/admin/AdminController.java @@ -198,7 +198,7 @@ public ResponseEntity retrieveAndInsertENASequenceNameForAssembly( } - @ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ") + @ApiOperation(value = "Retrieve list of Jobs that are running or scheduled to run") @GetMapping(value = "assemblies/scheduled-jobs") public ResponseEntity> getMD5ChecksumUpdateTaskStatus() { List scheduledJobStatus = handler.getScheduledJobStatus(); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java index 62c100dd..f39ad94b 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChromosomeUpdater.java @@ -6,9 +6,9 @@ import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import uk.ac.ebi.eva.contigalias.conf.ApplicationContextHolder; -import uk.ac.ebi.eva.contigalias.scheduler.Job.Job; -import uk.ac.ebi.eva.contigalias.scheduler.Job.JobSubmittedEvent; -import uk.ac.ebi.eva.contigalias.scheduler.Job.JobType; +import uk.ac.ebi.eva.contigalias.scheduler.job.Job; +import uk.ac.ebi.eva.contigalias.scheduler.job.JobSubmittedEvent; +import uk.ac.ebi.eva.contigalias.scheduler.job.JobType; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/Job.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/Job.java similarity index 89% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/Job.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/Job.java index 77bd78b6..715381ba 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/Job.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/Job.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler.Job; +package uk.ac.ebi.eva.contigalias.scheduler.job; public class Job { private final JobType type; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEvent.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEvent.java similarity index 79% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEvent.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEvent.java index 1a86340b..464af04e 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEvent.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEvent.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler.Job; +package uk.ac.ebi.eva.contigalias.scheduler.job; import org.springframework.context.ApplicationEvent; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEventHandler.java similarity index 93% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEventHandler.java index 2b1dfb89..e0df9888 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobSubmittedEventHandler.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobSubmittedEventHandler.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler.Job; +package uk.ac.ebi.eva.contigalias.scheduler.job; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ApplicationListener; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobType.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobType.java similarity index 61% rename from src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobType.java rename to src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobType.java index 6bf8f58b..47397dd4 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/Job/JobType.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/job/JobType.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.eva.contigalias.scheduler.Job; +package uk.ac.ebi.eva.contigalias.scheduler.job; public enum JobType { ENA_SEQUENCE_NAME_UPDATE, diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index 5b630c22..c5a9e2e9 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -32,8 +32,8 @@ import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; import uk.ac.ebi.eva.contigalias.repo.ChromosomeRepository; import uk.ac.ebi.eva.contigalias.scheduler.ChromosomeUpdater; -import uk.ac.ebi.eva.contigalias.scheduler.Job.Job; -import uk.ac.ebi.eva.contigalias.scheduler.Job.JobType; +import uk.ac.ebi.eva.contigalias.scheduler.job.Job; +import uk.ac.ebi.eva.contigalias.scheduler.job.JobType; import javax.transaction.Transactional; import java.io.BufferedReader; @@ -146,7 +146,7 @@ public void parseFileAndInsertAssembly(String accession) throws IOException { chrLines.add(line); if (chrLines.size() == BATCH_SIZE) { List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - chromosomeService.saveAllChromosomes(chromosomeEntityList); + chromosomeService.insertAllChromosomes(chromosomeEntityList); chromosomesSavedTillNow += chrLines.size(); logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); @@ -154,9 +154,8 @@ public void parseFileAndInsertAssembly(String accession) throws IOException { } } if (!chrLines.isEmpty()) { - // add ena sequence name and save List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - chromosomeService.saveAllChromosomes(chromosomeEntityList); + chromosomeService.insertAllChromosomes(chromosomeEntityList); chromosomesSavedTillNow += chrLines.size(); logger.info("Number of chromosomes saved till now : " + chromosomesSavedTillNow); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 77e2564e..9fd11976 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -208,7 +208,7 @@ private void stripAssemblyFromChromosome(ChromosomeEntity chromosome) { public void putChromosomeChecksumsByAccession(String accession, String md5, String trunc512) { Page page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseq( accession, accession, Pageable.unpaged()); - if (page.isEmpty()){ + if (page.isEmpty()) { throw new IllegalArgumentException( "No chromosomes corresponding to accession " + accession + " found in the database"); } @@ -284,7 +284,7 @@ public long countChromosomeEntitiesByEnaName(String enaName) { return repository.countChromosomeEntitiesByEnaSequenceName(enaName); } - public void saveAllChromosomes(List chromosomeEntityList) { + public void insertAllChromosomes(List chromosomeEntityList) { String sql = "INSERT INTO chromosome (assembly_insdc_accession,contig_type,ena_sequence_name," + "genbank_sequence_name,insdc_accession,md5checksum,refseq,seq_length,trunc512checksum,ucsc_name) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"; @@ -303,6 +303,7 @@ public void setValues(PreparedStatement ps, int i) throws SQLException { ps.setString(9, chromosome.getTrunc512checksum()); ps.setString(10, chromosome.getUcscName()); } + @Override public int getBatchSize() { return chromosomeEntityList.size(); From 530f6233726c895e8f135a4f62298c4e3d8e87c6 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Mon, 12 Feb 2024 16:50:35 +0000 Subject: [PATCH 13/13] review comments - removing unused code --- .../datasource/ENAAssemblyDataSource.java | 17 +-- .../datasource/NCBIAssemblyDataSource.java | 17 +-- .../dus/ENAAssemblyReportReader.java | 84 ++---------- .../dus/ENAAssemblyReportReaderFactory.java | 39 ------ .../dus/NCBIAssemblyReportReader.java | 122 ++---------------- .../dus/NCBIAssemblyReportReaderFactory.java | 39 ------ .../dus/ENAAssemblyReportReaderTest.java | 56 ++------ .../dus/NCBIAssemblyReportReaderTest.java | 60 +++------ 8 files changed, 57 insertions(+), 377 deletions(-) delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index 5279357a..80d8bdf7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -25,7 +25,6 @@ import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReader; -import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReaderFactory; import uk.ac.ebi.eva.contigalias.dus.ENABrowser; import uk.ac.ebi.eva.contigalias.dus.ENABrowserFactory; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; @@ -34,7 +33,6 @@ import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; import java.util.List; import java.util.Optional; @@ -45,16 +43,12 @@ public class ENAAssemblyDataSource { private final ENABrowserFactory factory; - private final ENAAssemblyReportReaderFactory readerFactory; - @Value("${asm.file.download.dir}") private String asmFileDownloadDir; @Autowired - public ENAAssemblyDataSource(ENABrowserFactory factory, - ENAAssemblyReportReaderFactory readerFactory) { + public ENAAssemblyDataSource(ENABrowserFactory factory) { this.factory = factory; - this.readerFactory = readerFactory; } public Optional downloadAssemblyReport(String accession) throws IOException { @@ -97,14 +91,7 @@ public Optional downloadAssemblyReport(ENABrowser enaBrowser, String acces } public List getChromosomeEntityList(List chrDataList) { - List chromosomeEntityList = new ArrayList<>(); - for (String chrData : chrDataList) { - ChromosomeEntity chromosomeEntity = getChromosomeEntity(chrData); - if (chromosomeEntity != null) { - chromosomeEntityList.add(chromosomeEntity); - } - } - return chromosomeEntityList; + return ENAAssemblyReportReader.getChromosomeEntity(chrDataList); } public ChromosomeEntity getChromosomeEntity(String chrLine) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index 0e7046f0..2ef26d09 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -25,7 +25,6 @@ import org.springframework.retry.annotation.Retryable; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.dus.NCBIAssemblyReportReader; -import uk.ac.ebi.eva.contigalias.dus.NCBIAssemblyReportReaderFactory; import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; @@ -35,7 +34,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -47,16 +45,12 @@ public class NCBIAssemblyDataSource { private final NCBIBrowserFactory factory; - private final NCBIAssemblyReportReaderFactory readerFactory; - @Value("${asm.file.download.dir}") private String asmFileDownloadDir; @Autowired - public NCBIAssemblyDataSource(NCBIBrowserFactory factory, - NCBIAssemblyReportReaderFactory readerFactory) { + public NCBIAssemblyDataSource(NCBIBrowserFactory factory) { this.factory = factory; - this.readerFactory = readerFactory; } public AssemblyEntity getAssemblyEntity(Path downloadFilePath) throws IOException { @@ -71,13 +65,8 @@ public AssemblyEntity getAssemblyEntity(List asmDataLines) { } public List getChromosomeEntityList(AssemblyEntity assemblyEntity, List chrDataList) { - List chromosomeEntityList = new ArrayList<>(); - for (String chrData : chrDataList) { - ChromosomeEntity chromosomeEntity = getChromosomeEntity(assemblyEntity, chrData); - if (chromosomeEntity != null) { - chromosomeEntityList.add(chromosomeEntity); - } - } + List chromosomeEntityList = NCBIAssemblyReportReader.getChromosomeEntity(chrDataList); + chromosomeEntityList.stream().forEach(c -> c.setAssembly(assemblyEntity)); return chromosomeEntityList; } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java index 3e857885..96512f98 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReader.java @@ -16,90 +16,24 @@ package uk.ac.ebi.eva.contigalias.dus; -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.LinkedList; +import java.util.ArrayList; import java.util.List; -public class ENAAssemblyReportReader extends AssemblyReportReader { +public class ENAAssemblyReportReader { - public ENAAssemblyReportReader(InputStreamReader inputStreamReader, boolean isScaffoldsEnabled) { - super(inputStreamReader, isScaffoldsEnabled); - } - - protected void parseReport() throws IOException, NullPointerException { - if (reader == null) { - throw new NullPointerException("Cannot use AssemblyReportReader without having a valid InputStreamReader."); - } - String line = reader.readLine(); - while (line != null) { - if (line.startsWith("accession")) { - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - parseAssemblyData(line); - } else if (!line.startsWith("accession")) { - String[] columns = line.split("\t", -1); - if (columns.length >= 6) { - if (columns[5].equals("Chromosome") && columns[3].equals("assembled-molecule")) { - parseChromosomeLine(columns); - } else if (isScaffoldsEnabled) { - parseScaffoldLine(columns); - } - } + public static List getChromosomeEntity(List lines) { + List chromosomeEntityList = new ArrayList<>(); + for (String line : lines) { + ChromosomeEntity chromosomeEntity = getChromosomeEntity(line); + if (chromosomeEntity != null) { + chromosomeEntityList.add(chromosomeEntity); } - line = reader.readLine(); - } - reportParsed = true; - reader.close(); - } - - // Not present in ENA assembly reports - protected void parseAssemblyData(String line) { - } - - protected void parseChromosomeLine(String[] columns) { - ChromosomeEntity chromosomeEntity = getChromosome(columns); - if (chromosomeEntity == null) { - return; - } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - chromosomeEntity.setAssembly(this.assemblyEntity); - chromosomeEntity.setContigType(SequenceEntity.ContigType.CHROMOSOME); - - List chromosomes = this.assemblyEntity.getChromosomes(); - if (chromosomes == null) { - chromosomes = new LinkedList<>(); - assemblyEntity.setChromosomes(chromosomes); } - chromosomes.add(chromosomeEntity); - } - protected void parseScaffoldLine(String[] columns) { - ChromosomeEntity scaffoldEntity = getScaffold(columns); - if (scaffoldEntity == null) { - return; - } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - scaffoldEntity.setAssembly(this.assemblyEntity); - scaffoldEntity.setContigType(SequenceEntity.ContigType.SCAFFOLD); - - List scaffolds = this.assemblyEntity.getChromosomes(); - if (scaffolds == null) { - scaffolds = new LinkedList<>(); - assemblyEntity.setChromosomes(scaffolds); - } - scaffolds.add(scaffoldEntity); + return chromosomeEntityList; } public static ChromosomeEntity getChromosomeEntity(String line) { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java deleted file mode 100644 index f6d608ae..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2021 EMBL - European Bioinformatics Institute - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package uk.ac.ebi.eva.contigalias.dus; - -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; - -import java.io.InputStream; -import java.io.InputStreamReader; - -@Component -public class ENAAssemblyReportReaderFactory { - - @Value("${config.scaffolds.enabled:false}") - private boolean SCAFFOLDS_ENABLED; - - public ENAAssemblyReportReader build(InputStream inputStream) { - return new ENAAssemblyReportReader(new InputStreamReader(inputStream), SCAFFOLDS_ENABLED); - } - - public ENAAssemblyReportReader build(InputStreamReader inputStreamReader) { - return new ENAAssemblyReportReader(inputStreamReader, SCAFFOLDS_ENABLED); - } - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java index 28417ab7..fcf99462 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java @@ -20,119 +20,12 @@ import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.LinkedList; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -public class NCBIAssemblyReportReader extends AssemblyReportReader { - - public NCBIAssemblyReportReader(InputStreamReader inputStreamReader, boolean isScaffoldsEnabled) { - super(inputStreamReader, isScaffoldsEnabled); - } - - protected void parseReport() throws IOException, NullPointerException { - if (reader == null) { - throw new NullPointerException("Cannot use AssemblyReportReader without having a valid InputStreamReader."); - } - String line = reader.readLine(); - while (line != null) { - if (line.startsWith("# ")) { - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - parseAssemblyData(line); - } else if (!line.startsWith("#")) { - String[] columns = line.split("\t", -1); - if (columns.length >= 6 && (columns[5].equals("=") || columns[5].equals("<>")) && - (columns[4] != null && !columns[4].isEmpty() && !columns[4].equals("na"))) { - if (columns[3].equals("Chromosome") && columns[1].equals("assembled-molecule")) { - parseChromosomeLine(columns); - } else if (isScaffoldsEnabled) { - parseScaffoldLine(columns); - } - } - } - line = reader.readLine(); - } - reportParsed = true; - reader.close(); - } - - protected void parseAssemblyData(String line) { - int tagEnd = line.indexOf(':'); - if (tagEnd == -1) { - return; - } - String tag = line.substring(2, tagEnd); - String tagData = line.substring(tagEnd + 1).trim(); - switch (tag) { - case "Assembly name": { - assemblyEntity.setName(tagData); - break; - } - case "Organism name": { - assemblyEntity.setOrganism(tagData); - break; - } - case "Taxid": { - assemblyEntity.setTaxid(Long.parseLong(tagData)); - break; - } - case "GenBank assembly accession": { - assemblyEntity.setInsdcAccession(tagData); - break; - } - case "RefSeq assembly accession": { - assemblyEntity.setRefseq(tagData); - break; - } - case "RefSeq assembly and GenBank assemblies identical": { - assemblyEntity.setGenbankRefseqIdentical(tagData.equals("yes")); - break; - } - } - } - - protected void parseChromosomeLine(String[] columns) { - ChromosomeEntity chromosomeEntity = getChromosome(columns); - if (chromosomeEntity == null) { - return; - } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - chromosomeEntity.setAssembly(this.assemblyEntity); - - List chromosomes = this.assemblyEntity.getChromosomes(); - if (chromosomes == null) { - chromosomes = new LinkedList<>(); - assemblyEntity.setChromosomes(chromosomes); - } - chromosomes.add(chromosomeEntity); - } - - protected void parseScaffoldLine(String[] columns) { - ChromosomeEntity scaffoldEntity = getScaffold(columns); - if (scaffoldEntity == null) { - return; - } - - if (assemblyEntity == null) { - assemblyEntity = new AssemblyEntity(); - } - scaffoldEntity.setAssembly(this.assemblyEntity); - - List scaffolds = this.assemblyEntity.getChromosomes(); - if (scaffolds == null) { - scaffolds = new LinkedList<>(); - assemblyEntity.setChromosomes(scaffolds); - } - scaffolds.add(scaffoldEntity); - } +public class NCBIAssemblyReportReader { public static AssemblyEntity getAssemblyEntity(List lines) { Map tagAndValuesMap = lines.stream() @@ -175,6 +68,17 @@ public static AssemblyEntity getAssemblyEntity(List lines) { return asmEntity; } + public static List getChromosomeEntity(List lines) { + List chromosomeEntityList = new ArrayList<>(); + for (String line : lines) { + ChromosomeEntity chromosomeEntity = getChromosomeEntity(line); + if (chromosomeEntity != null) { + chromosomeEntityList.add(chromosomeEntity); + } + } + return chromosomeEntityList; + } + public static ChromosomeEntity getChromosomeEntity(String line) { String[] columns = line.split("\t", -1); if (columns.length >= 6 && (columns[5].equals("=") || columns[5].equals("<>")) && diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java deleted file mode 100644 index 6dfb49a8..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2020 EMBL - European Bioinformatics Institute - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package uk.ac.ebi.eva.contigalias.dus; - -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; - -import java.io.InputStream; -import java.io.InputStreamReader; - -@Component -public class NCBIAssemblyReportReaderFactory { - - @Value("${config.scaffolds.enabled:false}") - private boolean SCAFFOLDS_ENABLED; - - public NCBIAssemblyReportReader build(InputStream inputStream) { - return new NCBIAssemblyReportReader(new InputStreamReader(inputStream), SCAFFOLDS_ENABLED); - } - - public NCBIAssemblyReportReader build(InputStreamReader inputStreamReader) { - return new NCBIAssemblyReportReader(inputStreamReader, SCAFFOLDS_ENABLED); - } - -} diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java index 2276c1c9..5d5d71c0 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus/ENAAssemblyReportReaderTest.java @@ -16,22 +16,16 @@ package uk.ac.ebi.eva.contigalias.dus; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; @@ -52,49 +46,23 @@ class ENAAssemblyReportReaderTest { private static final String SCAFFOLD_GENBANK_ACCESSION = "GJ057137.1"; - private InputStreamReader streamReader; - - private InputStream stream; - - @Autowired - private ENAAssemblyReportReaderFactory readerFactory; - - private ENAAssemblyReportReader reader; - - @BeforeEach - void setup() throws FileNotFoundException { - stream = new FileInputStream("src/test/resources/GCA_000003055.3_sequence_report.txt"); - streamReader = new InputStreamReader(stream); - reader = readerFactory.build(streamReader); - } - - @AfterEach - void tearDown() throws IOException { - stream.close(); - streamReader.close(); - } - - @Test - void getAssemblyReportReader() throws IOException { - assertTrue(reader.ready()); - } + private static final Path assemblyReportPath = Paths.get("src/test/resources/GCA_000003055.3_sequence_report.txt"); - AssemblyEntity getAssemblyEntity() throws IOException { - return reader.getAssemblyEntity(); + List getChromosomes() throws IOException { + List lines = Files.lines(assemblyReportPath).collect(Collectors.toList()); + return ENAAssemblyReportReader.getChromosomeEntity(lines); } @Test void verifyAssemblyHasChromosomes() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); assertNotNull(chromosomes); assertEquals(3316, chromosomes.size()); } @Test void verifyChromosomeMetadata() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); ChromosomeEntity chromosome = chromosomes.get(0); assertEquals(CHROMOSOME_ENA_SEQUENCE_NAME, chromosome.getEnaSequenceName()); assertEquals(CHROMOSOME_GENBANK_ACCESSION, chromosome.getInsdcAccession()); @@ -103,8 +71,7 @@ void verifyChromosomeMetadata() throws IOException { @Test void verifyAssemblyHasScaffolds() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertEquals(3286, scaffolds.size()); @@ -112,8 +79,7 @@ void verifyAssemblyHasScaffolds() throws IOException { @Test void assertParsedScaffoldValid() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertTrue(scaffolds.size() > 0); diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java index 9107baeb..ed32c95c 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReaderTest.java @@ -16,23 +16,18 @@ package uk.ac.ebi.eva.contigalias.dus; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.ActiveProfiles; - import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; import uk.ac.ebi.eva.contigalias.entities.SequenceEntity; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; @@ -65,23 +60,11 @@ class NCBIAssemblyReportReaderTest { private static final Long CHROMOSOME_CHR1_SEQ_LENGTH = 158337067l; + private static final Path assemblyReportPath = Paths.get("src/test/resources/GCA_000003055.3_Bos_taurus_UMD_3.1_assembly_report.txt"); private ChromosomeEntity scaffoldEntity; - private InputStreamReader streamReader; - - private InputStream stream; - - @Autowired - private NCBIAssemblyReportReaderFactory readerFactory; - - private NCBIAssemblyReportReader reader; - @BeforeEach - void setup() throws FileNotFoundException { - stream = new FileInputStream( - new File("src/test/resources/GCA_000003055.3_Bos_taurus_UMD_3.1_assembly_report.txt")); - streamReader = new InputStreamReader(stream); - reader = readerFactory.build(streamReader); + void setup() { scaffoldEntity = (ChromosomeEntity) new ChromosomeEntity() .setGenbankSequenceName("ChrU_1") .setInsdcAccession("GJ057137.1") @@ -90,19 +73,18 @@ void setup() throws FileNotFoundException { .setUcscName(null); } - @AfterEach - void tearDown() throws IOException { - stream.close(); - streamReader.close(); - } - - @Test - void getAssemblyReportReader() throws IOException { - assertTrue(reader.ready()); + AssemblyEntity getAssemblyEntity() throws IOException { + List asmDataLines = Files.lines(assemblyReportPath) + .filter(line -> line.startsWith("#")) + .collect(Collectors.toList()); + return NCBIAssemblyReportReader.getAssemblyEntity(asmDataLines); } - AssemblyEntity getAssemblyEntity() throws IOException { - return reader.getAssemblyEntity(); + List getChromosomes() throws IOException { + List chrDataLines = Files.lines(assemblyReportPath) + .filter(line -> !line.startsWith("#")) + .collect(Collectors.toList()); + return NCBIAssemblyReportReader.getChromosomeEntity(chrDataLines); } @Test @@ -118,16 +100,14 @@ void verifyAssemblyMetadata() throws IOException { @Test void verifyAssemblyHasChromosomes() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); assertNotNull(chromosomes); assertEquals(3316, chromosomes.size()); } @Test void verifyChromosomeMetadata() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List chromosomes = assembly.getChromosomes(); + List chromosomes = getChromosomes(); ChromosomeEntity chromosome = chromosomes.get(0); assertEquals(CHROMOSOME_CHR1_SEQUENCE_NAME, chromosome.getGenbankSequenceName()); assertEquals(CHROMOSOME_CHR1_GENBANK_ACCESSION, chromosome.getInsdcAccession()); @@ -138,8 +118,7 @@ void verifyChromosomeMetadata() throws IOException { @Test void verifyAssemblyHasScaffolds() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertEquals(3286, scaffolds.size()); @@ -147,8 +126,7 @@ void verifyAssemblyHasScaffolds() throws IOException { @Test void assertParsedScaffoldValid() throws IOException { - AssemblyEntity assembly = getAssemblyEntity(); - List scaffolds = assembly.getChromosomes().stream() + List scaffolds = getChromosomes().stream() .filter(e -> e.getContigType().equals(SequenceEntity.ContigType.SCAFFOLD)).collect(Collectors.toList()); assertNotNull(scaffolds); assertTrue(scaffolds.size() > 0);