From 7eaacbf3c92b44a3a2628bd06f88b7f1aa86fd8d Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Wed, 14 Feb 2024 13:42:57 +0000 Subject: [PATCH] update md5-checksum-update logic to use pagination --- .../repo/ChromosomeRepository.java | 44 +++++------ .../scheduler/MD5ChecksumUpdater.java | 78 +++++++------------ .../service/ChromosomeService.java | 34 ++++---- .../scheduler/MD5ChecksumUpdaterTest.java | 39 ++++------ 4 files changed, 78 insertions(+), 117 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index 5979af07..a0ca26a3 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -33,15 +33,15 @@ @Repository public interface ChromosomeRepository extends JpaRepository { - Page findChromosomeEntitiesByInsdcAccession(String insdcAccession, Pageable request); + Page findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(String insdcAccession, Pageable request); - Page findChromosomeEntitiesByRefseq(String refseq, Pageable request); + Page findChromosomeEntitiesByRefseqOrderByInsdcAccession(String refseq, Pageable request); - Page findChromosomeEntitiesByInsdcAccessionOrRefseq(String insdcAccession, String refseq, Pageable request); + Page findChromosomeEntitiesByInsdcAccessionOrRefseqOrderByInsdcAccession(String insdcAccession, String refseq, Pageable request); - Page findChromosomeEntitiesByAssembly_InsdcAccession(String asmInsdcAccession, Pageable request); + Page findChromosomeEntitiesByAssembly_InsdcAccessionOrderByInsdcAccession(String asmInsdcAccession, Pageable request); - @Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '')") + @Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS 
NULL OR c.md5checksum = '') ORDER BY c.insdcAccession") Page findChromosomeEntitiesByAssembly_InsdcAccessionAndMd5checksumIsNullOrEmpty(@Param("asmInsdcAccession") String asmInsdcAccession, Pageable pageable); @Query("SELECT distinct c.assembly.insdcAccession FROM ChromosomeEntity c WHERE c.md5checksum IS NULL OR c.md5checksum = ''") @@ -60,33 +60,33 @@ public interface ChromosomeRepository extends JpaRepository findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request); + Page findChromosomeEntitiesByAssembly_RefseqOrderByInsdcAccession(String asmRefseq, Pageable request); - Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request); + Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly_TaxidOrderByInsdcAccession(String genbankName, long asmTaxid, Pageable request); - Page findChromosomeEntitiesByUcscNameAndAssembly_Taxid(String ucscName, long asmTaxid, - Pageable request); + Page findChromosomeEntitiesByUcscNameAndAssembly_TaxidOrderByInsdcAccession(String ucscName, long asmTaxid, + Pageable request); - Page findChromosomeEntitiesByEnaSequenceNameAndAssembly_Taxid(String enaName, long asmTaxid, - Pageable request); + Page findChromosomeEntitiesByEnaSequenceNameAndAssembly_TaxidOrderByInsdcAccession(String enaName, long asmTaxid, + Pageable request); - Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly(String genbankName, AssemblyEntity assembly, - Pageable request); + Page findChromosomeEntitiesByGenbankSequenceNameAndAssemblyOrderByInsdcAccession(String genbankName, AssemblyEntity assembly, + Pageable request); - Page findChromosomeEntitiesByUcscNameAndAssembly(String ucscName, AssemblyEntity assembly, - Pageable request); + Page findChromosomeEntitiesByUcscNameAndAssemblyOrderByInsdcAccession(String ucscName, AssemblyEntity assembly, + Pageable request); - Page findChromosomeEntitiesByEnaSequenceNameAndAssembly(String enaName, AssemblyEntity assembly, - Pageable 
request); + Page findChromosomeEntitiesByEnaSequenceNameAndAssemblyOrderByInsdcAccession(String enaName, AssemblyEntity assembly, + Pageable request); - Page findChromosomeEntitiesByGenbankSequenceName(String genbankName, Pageable request); + Page findChromosomeEntitiesByGenbankSequenceNameOrderByInsdcAccession(String genbankName, Pageable request); - Page findChromosomeEntitiesByEnaSequenceName(String enaSequenceName, Pageable request); + Page findChromosomeEntitiesByEnaSequenceNameOrderByInsdcAccession(String enaSequenceName, Pageable request); - Page findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseq(String insdcAccession, String refseq, - Pageable request); + Page findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseqOrderByInsdcAccession(String insdcAccession, String refseq, + Pageable request); - Page findChromosomeEntitiesByUcscName(String ucscName, Pageable request); + Page findChromosomeEntitiesByUcscNameOrderByInsdcAccession(String ucscName, Pageable request); long countChromosomeEntitiesByInsdcAccession(String insdcAccession); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java index d01bf71e..61ad6cdd 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -4,21 +4,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.data.domain.Page; +import org.springframework.data.domain.PageRequest; +import org.springframework.data.domain.Pageable; import org.springframework.stereotype.Component; import org.springframework.web.client.RestTemplate; -import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import 
uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; -import java.io.BufferedReader; -import java.io.FileReader; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; import java.util.List; -import java.util.Optional; +import java.util.stream.Collectors; @Component public class MD5ChecksumUpdater { @@ -26,67 +21,47 @@ public class MD5ChecksumUpdater { private final int DEFAULT_BATCH_SIZE = 10000; private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; - private final NCBIAssemblyDataSource ncbiDataSource; private final ChromosomeService chromosomeService; private RestTemplate restTemplate; @Autowired - public MD5ChecksumUpdater(ChromosomeService chromosomeService, NCBIAssemblyDataSource ncbiDataSource, RestTemplate restTemplate) { + public MD5ChecksumUpdater(ChromosomeService chromosomeService, RestTemplate restTemplate) { this.chromosomeService = chromosomeService; - this.ncbiDataSource = ncbiDataSource; this.restTemplate = restTemplate; } public void updateMD5ChecksumForAssembly(String accession) { logger.info("Start Update MD5 Checksum for assembly : " + accession); - Path downloadedNCBIFilePath = null; try { - Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); - downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + int pageNumber = 0; + Page chrPage; + long chromosomeProcessed = 0; + long chromosomeUpdated = 0; + do { + Pageable pageable = PageRequest.of(pageNumber, DEFAULT_BATCH_SIZE); + chrPage = chromosomeService.getChromosomesByAssemblyAccession(accession, pageable); - AssemblyEntity assemblyEntity = new AssemblyEntity(); - 
assemblyEntity.setInsdcAccession(accession); + List chromosomeEntityList = chrPage.getContent(); + List chromosomeEntitiesWithoutMD5 = chromosomeEntityList.stream() + .filter(c -> c.getMd5checksum() == null || c.getMd5checksum().isEmpty()) + .collect(Collectors.toList()); - long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); - logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); - - try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { - long chromosomesUpdatedTillNow = 0l; - List chrLines = new ArrayList<>(); - String line; - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - chrLines.add(line); - if (chrLines.size() == DEFAULT_BATCH_SIZE) { - List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - updateMd5ChecksumForChromosome(accession, chromosomeEntityList); - chromosomesUpdatedTillNow += chrLines.size(); - logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); - - chrLines = new ArrayList<>(); - } - } - if (!chrLines.isEmpty()) { - List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + if(!chromosomeEntitiesWithoutMD5.isEmpty()){ + /* Pass only the entities filtered above as lacking an MD5, so the selected-for-update count logged below matches what is sent. */ - updateMd5ChecksumForChromosome(accession, chromosomeEntityList); + updateMd5ChecksumForChromosome(accession, chromosomeEntitiesWithoutMD5); - chromosomesUpdatedTillNow += chrLines.size(); - logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); } - } - logger.info("MD5 Checksum update finished successfully for assembly: " + accession); + chromosomeProcessed += chromosomeEntityList.size(); + chromosomeUpdated += chromosomeEntitiesWithoutMD5.size(); + logger.info("Chromosomes Processed till now: {}, selected for update till now: {}", chromosomeProcessed, chromosomeUpdated); + + pageNumber++; + } while (chrPage.hasNext()); + + logger.info("Finished updating 
MD5 Checksum for assembly: " + accession); + } catch (Exception e) { - logger.error("Error while updating MD5 Checksum for assembly : " + accession + "\n" + e); + logger.error("Error while updating MD5 Checksum for assembly : " + accession, e); - } finally { - if (downloadedNCBIFilePath != null) { - try { - Files.deleteIfExists(downloadedNCBIFilePath); - } catch (Exception e) { - logger.warn("Could not delete file : " + downloadedNCBIFilePath); - } - } } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 9fd11976..826ea40d 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -46,17 +46,17 @@ public ChromosomeService(ChromosomeRepository repository, JdbcTemplate jdbcTempl public Page getChromosomesByInsdcAccession(String insdcAccession, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByInsdcAccession(insdcAccession, request); + Page chromosomes = repository.findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(insdcAccession, request); return stripChromosomesAndScaffoldsFromAssembly(chromosomes); } public Page getChromosomesByRefseq(String refseq, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByRefseq(refseq, request); + Page chromosomes = repository.findChromosomeEntitiesByRefseqOrderByInsdcAccession(refseq, request); return stripChromosomesAndScaffoldsFromAssembly(chromosomes); } public Page getChromosomesByAssemblyInsdcAccession(String asmInsdcAccession, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByAssembly_InsdcAccession(asmInsdcAccession, request); + Page chromosomes = repository.findChromosomeEntitiesByAssembly_InsdcAccessionOrderByInsdcAccession(asmInsdcAccession, request); return stripAssembliesFromChromosomes(chromosomes); } @@ -84,17 +84,17 @@ public void updateENASequenceNameForAllChromosomeInAssembly(String assembly, Lis } 
public Page getChromosomesByAssemblyRefseq(String asmRefseq, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByAssembly_Refseq(asmRefseq, request); + Page chromosomes = repository.findChromosomeEntitiesByAssembly_RefseqOrderByInsdcAccession(asmRefseq, request); return stripAssembliesFromChromosomes(chromosomes); } public List getAssembliesByChromosomeInsdcAccession(String chrInsdcAccession) { - Page page = repository.findChromosomeEntitiesByInsdcAccession(chrInsdcAccession, Pageable.unpaged()); + Page page = repository.findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(chrInsdcAccession, Pageable.unpaged()); return extractAssembliesFromChromosomes(page); } public List getAssembliesByChromosomeRefseq(String chrRefseq) { - Page page = repository.findChromosomeEntitiesByRefseq(chrRefseq, Pageable.unpaged()); + Page page = repository.findChromosomeEntitiesByRefseqOrderByInsdcAccession(chrRefseq, Pageable.unpaged()); return extractAssembliesFromChromosomes(page); } @@ -111,64 +111,64 @@ public List extractAssembliesFromChromosomes(Page getChromosomesByName(String name, Pageable request) { - Page page = repository.findChromosomeEntitiesByGenbankSequenceName(name, request); + Page page = repository.findChromosomeEntitiesByGenbankSequenceNameOrderByInsdcAccession(name, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByNameAndAssemblyTaxid(String name, long asmTaxid, Pageable request) { - Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(name, asmTaxid, request); + Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly_TaxidOrderByInsdcAccession(name, asmTaxid, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByNameAndAssembly( String name, AssemblyEntity assembly, Pageable request) { - Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly(name, assembly, request); + Page 
page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssemblyOrderByInsdcAccession(name, assembly, request); assembly.setChromosomes(null); return injectAssemblyIntoChromosomes(page, assembly); } public Page getChromosomesByAssemblyAccession(String accession, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseq( + Page chromosomes = repository.findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseqOrderByInsdcAccession( accession, accession, request); return stripAssembliesFromChromosomes(chromosomes); } public Page getChromosomesByUcscName(String ucscName, Pageable request) { - Page page = repository.findChromosomeEntitiesByUcscName(ucscName, request); + Page page = repository.findChromosomeEntitiesByUcscNameOrderByInsdcAccession(ucscName, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByUcscNameAndAssemblyTaxid( String ucscName, long asmTaxid, Pageable request) { Page page - = repository.findChromosomeEntitiesByUcscNameAndAssembly_Taxid(ucscName, asmTaxid, request); + = repository.findChromosomeEntitiesByUcscNameAndAssembly_TaxidOrderByInsdcAccession(ucscName, asmTaxid, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByUcscNameAndAssembly(String ucscName, AssemblyEntity assembly, Pageable request) { Page page - = repository.findChromosomeEntitiesByUcscNameAndAssembly(ucscName, assembly, request); + = repository.findChromosomeEntitiesByUcscNameAndAssemblyOrderByInsdcAccession(ucscName, assembly, request); assembly.setChromosomes(null); return injectAssemblyIntoChromosomes(page, assembly); } public Page getChromosomesByEnaName(String enaName, Pageable request) { - Page page = repository.findChromosomeEntitiesByEnaSequenceName(enaName, request); + Page page = repository.findChromosomeEntitiesByEnaSequenceNameOrderByInsdcAccession(enaName, request); return 
stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByEnaNameAndAssemblyTaxid( String enaName, long asmTaxid, Pageable request) { Page page - = repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly_Taxid(enaName, asmTaxid, request); + = repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly_TaxidOrderByInsdcAccession(enaName, asmTaxid, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByEnaNameAndAssembly( String enaName, AssemblyEntity assembly, Pageable request) { Page page - = repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly(enaName, assembly, request); + = repository.findChromosomeEntitiesByEnaSequenceNameAndAssemblyOrderByInsdcAccession(enaName, assembly, request); assembly.setChromosomes(null); return injectAssemblyIntoChromosomes(page, assembly); } @@ -206,7 +206,7 @@ private void stripAssemblyFromChromosome(ChromosomeEntity chromosome) { } public void putChromosomeChecksumsByAccession(String accession, String md5, String trunc512) { - Page page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseq( + Page page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseqOrderByInsdcAccession( accession, accession, Pageable.unpaged()); if (page.isEmpty()) { throw new IllegalArgumentException( diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java index 262ca942..b490e6cc 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java @@ -1,5 +1,6 @@ package uk.ac.ebi.eva.contigalias.scheduler; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; @@ -11,18 +12,14 @@ import 
org.springframework.test.annotation.DirtiesContext; import org.springframework.test.context.ActiveProfiles; import org.springframework.web.client.RestTemplate; -import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.service.AssemblyService; +import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; +import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -32,32 +29,22 @@ @DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) @SpringBootTest class MD5ChecksumUpdaterTest { - private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata"; - private AssemblyEntity assemblyEntity; + private AssemblyEntity assemblyEntity = AssemblyGenerator.generate(); private List chromosomeEntityList = new ArrayList<>(); @Autowired - private AssemblyService assemblyService; - @Autowired private ChromosomeService chromosomeService; - @Autowired - private NCBIAssemblyDataSource ncbiDataSource; private MD5ChecksumUpdater md5ChecksumUpdater; @BeforeEach - void setup() throws IOException { - Path assemblyReportPath = ncbiDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES).get(); - assemblyEntity = ncbiDataSource.getAssemblyEntity(assemblyReportPath); - assemblyService.insertAssembly(assemblyEntity); - List chrDataLines = Files.lines(assemblyReportPath).filter(l -> !l.startsWith("#")) - 
.collect(Collectors.toList()); - chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrDataLines); - chromosomeService.insertAllChromosomes(chromosomeEntityList); - + void setup() throws JsonProcessingException { RestTemplate restTemplate = mock(RestTemplate.class); - md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, ncbiDataSource, restTemplate); - for (int i = 0; i < chromosomeEntityList.size(); i++) { - ChromosomeEntity chromosomeEntity = chromosomeEntityList.get(i); + md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, restTemplate); + for (int i = 0; i < 5; i++) { + ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity); + chromosomeEntityList.add(chromosomeEntity); + chromosomeService.insertChromosome(chromosomeEntity); + String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}"; Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER", chromosomeEntity.getInsdcAccession()), JsonNode.class)) @@ -68,13 +55,13 @@ void setup() throws IOException { @Test void testUpdateMD5ChecksumForAssembly() { chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 5000)) + PageRequest.of(0, 10)) .forEach(c -> assertNull(c.getMd5checksum())); md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession()); chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 5000)) + PageRequest.of(0, 10)) .forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum())); } }