From 6c9fb2d3c52cbd0caa98a97d2eb29ff3b3149b0c Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Tue, 13 Feb 2024 12:27:31 +0000 Subject: [PATCH 1/3] update MD5-checksum-update logic - download file and read insdc accession from it --- .../scheduler/MD5ChecksumUpdater.java | 88 ++++++++++++------- .../scheduler/MD5ChecksumUpdaterTest.java | 40 +++++---- 2 files changed, 81 insertions(+), 47 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java index b9db041d..d01bf71e 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -4,15 +4,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.jdbc.core.JdbcTemplate; -import org.springframework.jdbc.core.ResultSetExtractor; import org.springframework.stereotype.Component; import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; +import java.io.BufferedReader; +import java.io.FileReader; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Optional; @Component public class MD5ChecksumUpdater { @@ -20,49 +26,67 @@ public class MD5ChecksumUpdater { private final int DEFAULT_BATCH_SIZE = 10000; private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; - private RestTemplate restTemplate; - private final JdbcTemplate jdbcTemplate; + private final NCBIAssemblyDataSource ncbiDataSource; private final ChromosomeService chromosomeService; + private RestTemplate restTemplate; @Autowired - public MD5ChecksumUpdater(RestTemplate restTemplate, JdbcTemplate jdbcTemplate, ChromosomeService chromosomeService) { - this.restTemplate = restTemplate; - this.jdbcTemplate = jdbcTemplate; + public MD5ChecksumUpdater(ChromosomeService chromosomeService, NCBIAssemblyDataSource ncbiDataSource, RestTemplate restTemplate) { this.chromosomeService = chromosomeService; + this.ncbiDataSource = ncbiDataSource; + this.restTemplate = restTemplate; } - public void updateMD5ChecksumForAssembly(String assembly) { + public void updateMD5ChecksumForAssembly(String accession) { + logger.info("Start Update MD5 Checksum for assembly : " + accession); + Path downloadedNCBIFilePath = null; try { - logger.info("Trying to update MD5 Checksum for assembly: " + assembly); - String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly - + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; - jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { - long chromosomeProcessed = 0; - List chromosomeEntityList = new ArrayList<>(); - while (rs.next()) { - ChromosomeEntity chromosome = new ChromosomeEntity(); - chromosome.setInsdcAccession(rs.getString(1)); - chromosomeEntityList.add(chromosome); + Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); + downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + + AssemblyEntity assemblyEntity = new AssemblyEntity(); + assemblyEntity.setInsdcAccession(accession); + + long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); + logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); - if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeProcessed += chromosomeEntityList.size(); - logger.info("Chromosomes Processed till now: " + chromosomeProcessed); - chromosomeEntityList = new ArrayList<>(); + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { + long chromosomesUpdatedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == DEFAULT_BATCH_SIZE) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + updateMd5ChecksumForChromosome(accession, chromosomeEntityList); + chromosomesUpdatedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); + + chrLines = new ArrayList<>(); } } - if (chromosomeEntityList.size() > 0) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeProcessed += chromosomeEntityList.size(); - logger.info("Chromosomes Processed till now: " + chromosomeProcessed); + if (!chrLines.isEmpty()) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + updateMd5ChecksumForChromosome(accession, chromosomeEntityList); + chromosomesUpdatedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); } + } - logger.info("Finished updating MD5 Checksum for assembly: " + assembly); - - return null; - }); + logger.info("MD5 Checksum update finished successfully for assembly: " + accession); } catch (Exception e) { - logger.error("Error while updating MD5 Checksum for assembly : " + assembly + "\n" + e); + logger.error("Error while updating MD5 Checksum for assembly : " + accession + "\n" + e); + } finally { + if (downloadedNCBIFilePath != null) { + try { + Files.deleteIfExists(downloadedNCBIFilePath); + } catch (Exception e) { + logger.warn("Could not delete file : " + downloadedNCBIFilePath); + } + } } } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java index 6969b670..262ca942 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java @@ -1,6 +1,5 @@ package uk.ac.ebi.eva.contigalias.scheduler; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; @@ -9,18 +8,21 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.data.domain.PageRequest; -import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.test.annotation.DirtiesContext; import org.springframework.test.context.ActiveProfiles; import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; -import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator; +import uk.ac.ebi.eva.contigalias.service.AssemblyService; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -30,24 +32,32 @@ @DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) @SpringBootTest class MD5ChecksumUpdaterTest { + private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata"; - private AssemblyEntity assemblyEntity = AssemblyGenerator.generate(); + private AssemblyEntity assemblyEntity; private List chromosomeEntityList = new ArrayList<>(); @Autowired - private JdbcTemplate jdbcTemplate; + private AssemblyService assemblyService; @Autowired private ChromosomeService chromosomeService; + @Autowired + private NCBIAssemblyDataSource ncbiDataSource; private MD5ChecksumUpdater md5ChecksumUpdater; @BeforeEach - void setup() throws JsonProcessingException { - RestTemplate restTemplate = mock(RestTemplate.class); - md5ChecksumUpdater = new MD5ChecksumUpdater(restTemplate, jdbcTemplate, chromosomeService); - for (int i = 0; i < 5; i++) { - ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity); - chromosomeEntityList.add(chromosomeEntity); - chromosomeService.insertChromosome(chromosomeEntity); + void setup() throws IOException { + Path assemblyReportPath = ncbiDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES).get(); + assemblyEntity = ncbiDataSource.getAssemblyEntity(assemblyReportPath); + assemblyService.insertAssembly(assemblyEntity); + List chrDataLines = Files.lines(assemblyReportPath).filter(l -> !l.startsWith("#")) + .collect(Collectors.toList()); + chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrDataLines); + chromosomeService.insertAllChromosomes(chromosomeEntityList); + RestTemplate restTemplate = mock(RestTemplate.class); + md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, ncbiDataSource, restTemplate); + for (int i = 0; i < chromosomeEntityList.size(); i++) { + ChromosomeEntity chromosomeEntity = chromosomeEntityList.get(i); String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}"; Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER", chromosomeEntity.getInsdcAccession()), JsonNode.class)) @@ -58,13 +68,13 @@ void setup() throws JsonProcessingException { @Test void testUpdateMD5ChecksumForAssembly() { chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 100)) + PageRequest.of(0, 5000)) .forEach(c -> assertNull(c.getMd5checksum())); md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession()); chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 100)) + PageRequest.of(0, 5000)) .forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum())); } } From e6eeeb9b3705c4e277a43a3eb9cdfc11798b6162 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Wed, 14 Feb 2024 13:42:57 +0000 Subject: [PATCH 2/3] update md5-checksum-update logic to use pagination --- .../repo/ChromosomeRepository.java | 44 +++++------ .../scheduler/MD5ChecksumUpdater.java | 78 +++++++------------ .../service/ChromosomeService.java | 34 ++++---- .../scheduler/MD5ChecksumUpdaterTest.java | 39 ++++------ 4 files changed, 78 insertions(+), 117 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index a715a648..928291ad 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -33,15 +33,15 @@ @Repository public interface ChromosomeRepository extends JpaRepository { - Page findChromosomeEntitiesByInsdcAccession(String insdcAccession, Pageable request); + Page findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(String insdcAccession, Pageable request); - Page findChromosomeEntitiesByRefseq(String refseq, Pageable request); + Page findChromosomeEntitiesByRefseqOrderByInsdcAccession(String refseq, Pageable request); - Page findChromosomeEntitiesByInsdcAccessionOrRefseq(String insdcAccession, String refseq, Pageable request); + Page findChromosomeEntitiesByInsdcAccessionOrRefseqOrderByInsdcAccession(String insdcAccession, String refseq, Pageable request); - Page findChromosomeEntitiesByAssembly_InsdcAccession(String asmInsdcAccession, Pageable request); + Page findChromosomeEntitiesByAssembly_InsdcAccessionOrderByInsdcAccession(String asmInsdcAccession, Pageable request); - @Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '')") + @Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '') ORDER BY c.insdcAccession") Page findChromosomeEntitiesByAssembly_InsdcAccessionAndMd5checksumIsNullOrEmpty(@Param("asmInsdcAccession") String asmInsdcAccession, Pageable pageable); @Query("SELECT distinct c.assembly.insdcAccession FROM ChromosomeEntity c WHERE c.md5checksum IS NULL OR c.md5checksum = ''") @@ -60,33 +60,33 @@ public interface ChromosomeRepository extends JpaRepository findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request); + Page findChromosomeEntitiesByAssembly_RefseqOrderByInsdcAccession(String asmRefseq, Pageable request); - Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request); + Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly_TaxidOrderByInsdcAccession(String genbankName, long asmTaxid, Pageable request); - Page findChromosomeEntitiesByUcscNameAndAssembly_Taxid(String ucscName, long asmTaxid, - Pageable request); + Page findChromosomeEntitiesByUcscNameAndAssembly_TaxidOrderByInsdcAccession(String ucscName, long asmTaxid, + Pageable request); - Page findChromosomeEntitiesByEnaSequenceNameAndAssembly_Taxid(String enaName, long asmTaxid, - Pageable request); + Page findChromosomeEntitiesByEnaSequenceNameAndAssembly_TaxidOrderByInsdcAccession(String enaName, long asmTaxid, + Pageable request); - Page findChromosomeEntitiesByGenbankSequenceNameAndAssembly(String genbankName, AssemblyEntity assembly, - Pageable request); + Page findChromosomeEntitiesByGenbankSequenceNameAndAssemblyOrderByInsdcAccession(String genbankName, AssemblyEntity assembly, + Pageable request); - Page findChromosomeEntitiesByUcscNameAndAssembly(String ucscName, AssemblyEntity assembly, - Pageable request); + Page findChromosomeEntitiesByUcscNameAndAssemblyOrderByInsdcAccession(String ucscName, AssemblyEntity assembly, + Pageable request); - Page findChromosomeEntitiesByEnaSequenceNameAndAssembly(String enaName, AssemblyEntity assembly, - Pageable request); + Page findChromosomeEntitiesByEnaSequenceNameAndAssemblyOrderByInsdcAccession(String enaName, AssemblyEntity assembly, + Pageable request); - Page findChromosomeEntitiesByGenbankSequenceName(String genbankName, Pageable request); + Page findChromosomeEntitiesByGenbankSequenceNameOrderByInsdcAccession(String genbankName, Pageable request); - Page findChromosomeEntitiesByEnaSequenceName(String enaSequenceName, Pageable request); + Page findChromosomeEntitiesByEnaSequenceNameOrderByInsdcAccession(String enaSequenceName, Pageable request); - Page findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseq(String insdcAccession, String refseq, - Pageable request); + Page findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseqOrderByInsdcAccession(String insdcAccession, String refseq, + Pageable request); - Page findChromosomeEntitiesByUcscName(String ucscName, Pageable request); + Page findChromosomeEntitiesByUcscNameOrderByInsdcAccession(String ucscName, Pageable request); Page findChromosomeEntitiesByMd5checksumOrderByInsdcAccession(String md5Checksum, Pageable request); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java index d01bf71e..61ad6cdd 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -4,21 +4,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.data.domain.Page; +import org.springframework.data.domain.PageRequest; +import org.springframework.data.domain.Pageable; import org.springframework.stereotype.Component; import org.springframework.web.client.RestTemplate; -import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; -import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; -import java.io.BufferedReader; -import java.io.FileReader; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; import java.util.List; -import java.util.Optional; +import java.util.stream.Collectors; @Component public class MD5ChecksumUpdater { @@ -26,67 +21,46 @@ public class MD5ChecksumUpdater { private final int DEFAULT_BATCH_SIZE = 10000; private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; - private final NCBIAssemblyDataSource ncbiDataSource; private final ChromosomeService chromosomeService; private RestTemplate restTemplate; @Autowired - public MD5ChecksumUpdater(ChromosomeService chromosomeService, NCBIAssemblyDataSource ncbiDataSource, RestTemplate restTemplate) { + public MD5ChecksumUpdater(ChromosomeService chromosomeService, RestTemplate restTemplate) { this.chromosomeService = chromosomeService; - this.ncbiDataSource = ncbiDataSource; this.restTemplate = restTemplate; } public void updateMD5ChecksumForAssembly(String accession) { logger.info("Start Update MD5 Checksum for assembly : " + accession); - Path downloadedNCBIFilePath = null; try { - Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); - downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + int pageNumber = 0; + Page chrPage; + long chromosomeProcessed = 0; + long chromosomeUpdated = 0; + do { + Pageable pageable = PageRequest.of(pageNumber, DEFAULT_BATCH_SIZE); + chrPage = chromosomeService.getChromosomesByAssemblyAccession(accession, pageable); - AssemblyEntity assemblyEntity = new AssemblyEntity(); - assemblyEntity.setInsdcAccession(accession); + List chromosomeEntityList = chrPage.getContent(); + List chromosomeEntitiesWithoutMD5 = chromosomeEntityList.stream() + .filter(c -> c.getMd5checksum() == null || c.getMd5checksum().isEmpty()) + .collect(Collectors.toList()); - long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); - logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); - - try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { - long chromosomesUpdatedTillNow = 0l; - List chrLines = new ArrayList<>(); - String line; - while ((line = bufferedReader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - chrLines.add(line); - if (chrLines.size() == DEFAULT_BATCH_SIZE) { - List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); - updateMd5ChecksumForChromosome(accession, chromosomeEntityList); - chromosomesUpdatedTillNow += chrLines.size(); - logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); - - chrLines = new ArrayList<>(); - } - } - if (!chrLines.isEmpty()) { - List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + if(!chromosomeEntitiesWithoutMD5.isEmpty()){ updateMd5ChecksumForChromosome(accession, chromosomeEntityList); - chromosomesUpdatedTillNow += chrLines.size(); - logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); } - } - logger.info("MD5 Checksum update finished successfully for assembly: " + accession); + chromosomeProcessed += chromosomeEntityList.size(); + chromosomeUpdated += chromosomeEntitiesWithoutMD5.size(); + logger.info("Chromosomes Processed till now: {}, selected for update till now: {}", chromosomeProcessed, chromosomeUpdated); + + pageNumber++; + } while (chrPage.hasNext()); + + logger.info("Finished updating MD5 Checksum for assembly: " + accession); + } catch (Exception e) { logger.error("Error while updating MD5 Checksum for assembly : " + accession + "\n" + e); - } finally { - if (downloadedNCBIFilePath != null) { - try { - Files.deleteIfExists(downloadedNCBIFilePath); - } catch (Exception e) { - logger.warn("Could not delete file : " + downloadedNCBIFilePath); - } - } } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 52dfd6ad..f30e27ae 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -46,17 +46,17 @@ public ChromosomeService(ChromosomeRepository repository, JdbcTemplate jdbcTempl public Page getChromosomesByInsdcAccession(String insdcAccession, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByInsdcAccession(insdcAccession, request); + Page chromosomes = repository.findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(insdcAccession, request); return stripChromosomesAndScaffoldsFromAssembly(chromosomes); } public Page getChromosomesByRefseq(String refseq, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByRefseq(refseq, request); + Page chromosomes = repository.findChromosomeEntitiesByRefseqOrderByInsdcAccession(refseq, request); return stripChromosomesAndScaffoldsFromAssembly(chromosomes); } public Page getChromosomesByAssemblyInsdcAccession(String asmInsdcAccession, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByAssembly_InsdcAccession(asmInsdcAccession, request); + Page chromosomes = repository.findChromosomeEntitiesByAssembly_InsdcAccessionOrderByInsdcAccession(asmInsdcAccession, request); return stripAssembliesFromChromosomes(chromosomes); } @@ -84,17 +84,17 @@ public void updateENASequenceNameForAllChromosomeInAssembly(String assembly, Lis } public Page getChromosomesByAssemblyRefseq(String asmRefseq, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByAssembly_Refseq(asmRefseq, request); + Page chromosomes = repository.findChromosomeEntitiesByAssembly_RefseqOrderByInsdcAccession(asmRefseq, request); return stripAssembliesFromChromosomes(chromosomes); } public List getAssembliesByChromosomeInsdcAccession(String chrInsdcAccession) { - Page page = repository.findChromosomeEntitiesByInsdcAccession(chrInsdcAccession, Pageable.unpaged()); + Page page = repository.findChromosomeEntitiesByInsdcAccessionOrderByInsdcAccession(chrInsdcAccession, Pageable.unpaged()); return extractAssembliesFromChromosomes(page); } public List getAssembliesByChromosomeRefseq(String chrRefseq) { - Page page = repository.findChromosomeEntitiesByRefseq(chrRefseq, Pageable.unpaged()); + Page page = repository.findChromosomeEntitiesByRefseqOrderByInsdcAccession(chrRefseq, Pageable.unpaged()); return extractAssembliesFromChromosomes(page); } @@ -111,64 +111,64 @@ public List extractAssembliesFromChromosomes(Page getChromosomesByName(String name, Pageable request) { - Page page = repository.findChromosomeEntitiesByGenbankSequenceName(name, request); + Page page = repository.findChromosomeEntitiesByGenbankSequenceNameOrderByInsdcAccession(name, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByNameAndAssemblyTaxid(String name, long asmTaxid, Pageable request) { - Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(name, asmTaxid, request); + Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly_TaxidOrderByInsdcAccession(name, asmTaxid, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByNameAndAssembly( String name, AssemblyEntity assembly, Pageable request) { - Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssembly(name, assembly, request); + Page page = repository.findChromosomeEntitiesByGenbankSequenceNameAndAssemblyOrderByInsdcAccession(name, assembly, request); assembly.setChromosomes(null); return injectAssemblyIntoChromosomes(page, assembly); } public Page getChromosomesByAssemblyAccession(String accession, Pageable request) { - Page chromosomes = repository.findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseq( + Page chromosomes = repository.findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyRefseqOrderByInsdcAccession( accession, accession, request); return stripAssembliesFromChromosomes(chromosomes); } public Page getChromosomesByUcscName(String ucscName, Pageable request) { - Page page = repository.findChromosomeEntitiesByUcscName(ucscName, request); + Page page = repository.findChromosomeEntitiesByUcscNameOrderByInsdcAccession(ucscName, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByUcscNameAndAssemblyTaxid( String ucscName, long asmTaxid, Pageable request) { Page page - = repository.findChromosomeEntitiesByUcscNameAndAssembly_Taxid(ucscName, asmTaxid, request); + = repository.findChromosomeEntitiesByUcscNameAndAssembly_TaxidOrderByInsdcAccession(ucscName, asmTaxid, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByUcscNameAndAssembly(String ucscName, AssemblyEntity assembly, Pageable request) { Page page - = repository.findChromosomeEntitiesByUcscNameAndAssembly(ucscName, assembly, request); + = repository.findChromosomeEntitiesByUcscNameAndAssemblyOrderByInsdcAccession(ucscName, assembly, request); assembly.setChromosomes(null); return injectAssemblyIntoChromosomes(page, assembly); } public Page getChromosomesByEnaName(String enaName, Pageable request) { - Page page = repository.findChromosomeEntitiesByEnaSequenceName(enaName, request); + Page page = repository.findChromosomeEntitiesByEnaSequenceNameOrderByInsdcAccession(enaName, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByEnaNameAndAssemblyTaxid( String enaName, long asmTaxid, Pageable request) { Page page - = repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly_Taxid(enaName, asmTaxid, request); + = repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly_TaxidOrderByInsdcAccession(enaName, asmTaxid, request); return stripChromosomesAndScaffoldsFromAssembly(page); } public Page getChromosomesByEnaNameAndAssembly( String enaName, AssemblyEntity assembly, Pageable request) { Page page - = repository.findChromosomeEntitiesByEnaSequenceNameAndAssembly(enaName, assembly, request); + = repository.findChromosomeEntitiesByEnaSequenceNameAndAssemblyOrderByInsdcAccession(enaName, assembly, request); assembly.setChromosomes(null); return injectAssemblyIntoChromosomes(page, assembly); } @@ -211,7 +211,7 @@ private void stripAssemblyFromChromosome(ChromosomeEntity chromosome) { } public void putChromosomeChecksumsByAccession(String accession, String md5, String trunc512) { - Page page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseq( + Page page = repository.findChromosomeEntitiesByInsdcAccessionOrRefseqOrderByInsdcAccession( accession, accession, Pageable.unpaged()); if (page.isEmpty()) { throw new IllegalArgumentException( diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java index 262ca942..b490e6cc 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java @@ -1,5 +1,6 @@ package uk.ac.ebi.eva.contigalias.scheduler; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; @@ -11,18 +12,14 @@ import org.springframework.test.annotation.DirtiesContext; import org.springframework.test.context.ActiveProfiles; import org.springframework.web.client.RestTemplate; -import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.service.AssemblyService; +import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; +import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; import java.util.List; -import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -32,32 +29,22 @@ @DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) @SpringBootTest class MD5ChecksumUpdaterTest { - private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata"; - private AssemblyEntity assemblyEntity; + private AssemblyEntity assemblyEntity = AssemblyGenerator.generate(); private List chromosomeEntityList = new ArrayList<>(); @Autowired - private AssemblyService assemblyService; - @Autowired private ChromosomeService chromosomeService; - @Autowired - private NCBIAssemblyDataSource ncbiDataSource; private MD5ChecksumUpdater md5ChecksumUpdater; @BeforeEach - void setup() throws IOException { - Path assemblyReportPath = ncbiDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES).get(); - assemblyEntity = ncbiDataSource.getAssemblyEntity(assemblyReportPath); - assemblyService.insertAssembly(assemblyEntity); - List chrDataLines = Files.lines(assemblyReportPath).filter(l -> !l.startsWith("#")) - .collect(Collectors.toList()); - chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrDataLines); - chromosomeService.insertAllChromosomes(chromosomeEntityList); - + void setup() throws JsonProcessingException { RestTemplate restTemplate = mock(RestTemplate.class); - md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, ncbiDataSource, restTemplate); - for (int i = 0; i < chromosomeEntityList.size(); i++) { - ChromosomeEntity chromosomeEntity = chromosomeEntityList.get(i); + md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, restTemplate); + for (int i = 0; i < 5; i++) { + ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity); + chromosomeEntityList.add(chromosomeEntity); + chromosomeService.insertChromosome(chromosomeEntity); + String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}"; Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER", chromosomeEntity.getInsdcAccession()), JsonNode.class)) @@ -68,13 +55,13 @@ void setup() throws IOException { @Test void testUpdateMD5ChecksumForAssembly() { chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 5000)) + PageRequest.of(0, 10)) .forEach(c -> assertNull(c.getMd5checksum())); md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession()); chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 5000)) + PageRequest.of(0, 10)) .forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum())); } } From eb219e82abbcda5143e830b3f3a8d6e67f2c6493 Mon Sep 17 00:00:00 2001 From: nkumar2 Date: Wed, 14 Feb 2024 14:42:43 +0000 Subject: [PATCH 3/3] update test --- .../AssemblyAndChromosomeServiceIntegrationTest.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java index b9447aaf..f0be306b 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblyAndChromosomeServiceIntegrationTest.java @@ -205,8 +205,14 @@ void getChromosomesByNameAndAssemblySameNameSameTaxid() { assertNotNull(page); assertEquals(2, page.getTotalElements()); List entityList = page.get().collect(Collectors.toList()); - assertChromosomeEntityIdentical(chromosomeEntities[0], entityList.get(0)); - assertChromosomeEntityIdentical(chromosomeEntities[1], entityList.get(1)); + ChromosomeEntity first = entityList.stream() + .filter(c->c.getInsdcAccession()==chromosomeEntities[0].getInsdcAccession()) + .findFirst().get(); + ChromosomeEntity second = entityList.stream() + .filter(c->c.getInsdcAccession()==chromosomeEntities[1].getInsdcAccession()) + .findFirst().get(); + assertChromosomeEntityIdentical(chromosomeEntities[0], first); + assertChromosomeEntityIdentical(chromosomeEntities[1], second); } /**