diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java index b9db041d..d01bf71e 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdater.java @@ -4,15 +4,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.jdbc.core.JdbcTemplate; -import org.springframework.jdbc.core.ResultSetExtractor; import org.springframework.stereotype.Component; import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; +import java.io.BufferedReader; +import java.io.FileReader; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Optional; @Component public class MD5ChecksumUpdater { @@ -20,49 +26,67 @@ public class MD5ChecksumUpdater { private final int DEFAULT_BATCH_SIZE = 10000; private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata"; - private RestTemplate restTemplate; - private final JdbcTemplate jdbcTemplate; + private final NCBIAssemblyDataSource ncbiDataSource; private final ChromosomeService chromosomeService; + private RestTemplate restTemplate; @Autowired - public MD5ChecksumUpdater(RestTemplate restTemplate, JdbcTemplate jdbcTemplate, ChromosomeService chromosomeService) { - this.restTemplate = restTemplate; - this.jdbcTemplate = jdbcTemplate; + public MD5ChecksumUpdater(ChromosomeService chromosomeService, NCBIAssemblyDataSource ncbiDataSource, RestTemplate restTemplate) { this.chromosomeService = chromosomeService; + this.ncbiDataSource = ncbiDataSource; + this.restTemplate = restTemplate; } - public void updateMD5ChecksumForAssembly(String assembly) { + public void updateMD5ChecksumForAssembly(String accession) { + logger.info("Start Update MD5 Checksum for assembly : " + accession); + Path downloadedNCBIFilePath = null; try { - logger.info("Trying to update MD5 Checksum for assembly: " + assembly); - String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly - + "' AND (c.md5checksum IS NULL OR c.md5checksum = '')"; - jdbcTemplate.query(sql, (ResultSetExtractor) rs -> { - long chromosomeProcessed = 0; - List chromosomeEntityList = new ArrayList<>(); - while (rs.next()) { - ChromosomeEntity chromosome = new ChromosomeEntity(); - chromosome.setInsdcAccession(rs.getString(1)); - chromosomeEntityList.add(chromosome); + Optional downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession); + downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession)); + + AssemblyEntity assemblyEntity = new AssemblyEntity(); + assemblyEntity.setInsdcAccession(accession); + + long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count(); + logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile); - if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeProcessed += chromosomeEntityList.size(); - logger.info("Chromosomes Processed till now: " + chromosomeProcessed); - chromosomeEntityList = new ArrayList<>(); + try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) { + long chromosomesUpdatedTillNow = 0l; + List chrLines = new ArrayList<>(); + String line; + while ((line = bufferedReader.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + chrLines.add(line); + if (chrLines.size() == DEFAULT_BATCH_SIZE) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + updateMd5ChecksumForChromosome(accession, chromosomeEntityList); + chromosomesUpdatedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); + + chrLines = new ArrayList<>(); } } - if (chromosomeEntityList.size() > 0) { - updateMd5ChecksumForChromosome(assembly, chromosomeEntityList); - chromosomeProcessed += chromosomeEntityList.size(); - logger.info("Chromosomes Processed till now: " + chromosomeProcessed); + if (!chrLines.isEmpty()) { + List chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines); + updateMd5ChecksumForChromosome(accession, chromosomeEntityList); + chromosomesUpdatedTillNow += chrLines.size(); + logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow); } + } - logger.info("Finished updating MD5 Checksum for assembly: " + assembly); - - return null; - }); + logger.info("MD5 Checksum update finished successfully for assembly: " + accession); } catch (Exception e) { - logger.error("Error while updating MD5 Checksum for assembly : " + assembly + "\n" + e); + logger.error("Error while updating MD5 Checksum for assembly : " + accession + "\n" + e); + } finally { + if (downloadedNCBIFilePath != null) { + try { + Files.deleteIfExists(downloadedNCBIFilePath); + } catch (Exception e) { + logger.warn("Could not delete file : " + downloadedNCBIFilePath); + } + } } } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java index 6969b670..262ca942 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/scheduler/MD5ChecksumUpdaterTest.java @@ -1,6 +1,5 @@ package uk.ac.ebi.eva.contigalias.scheduler; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; @@ -9,18 +8,21 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.data.domain.PageRequest; -import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.test.annotation.DirtiesContext; import org.springframework.test.context.ActiveProfiles; import org.springframework.web.client.RestTemplate; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; -import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator; -import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator; +import uk.ac.ebi.eva.contigalias.service.AssemblyService; import uk.ac.ebi.eva.contigalias.service.ChromosomeService; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; @@ -30,24 +32,32 @@ @DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS) @SpringBootTest class MD5ChecksumUpdaterTest { + private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5"; private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata"; - private AssemblyEntity assemblyEntity = AssemblyGenerator.generate(); + private AssemblyEntity assemblyEntity; private List chromosomeEntityList = new ArrayList<>(); @Autowired - private JdbcTemplate jdbcTemplate; + private AssemblyService assemblyService; @Autowired private ChromosomeService chromosomeService; + @Autowired + private NCBIAssemblyDataSource ncbiDataSource; private MD5ChecksumUpdater md5ChecksumUpdater; @BeforeEach - void setup() throws JsonProcessingException { - RestTemplate restTemplate = mock(RestTemplate.class); - md5ChecksumUpdater = new MD5ChecksumUpdater(restTemplate, jdbcTemplate, chromosomeService); - for (int i = 0; i < 5; i++) { - ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity); - chromosomeEntityList.add(chromosomeEntity); - chromosomeService.insertChromosome(chromosomeEntity); + void setup() throws IOException { + Path assemblyReportPath = ncbiDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES).get(); + assemblyEntity = ncbiDataSource.getAssemblyEntity(assemblyReportPath); + assemblyService.insertAssembly(assemblyEntity); + List chrDataLines = Files.lines(assemblyReportPath).filter(l -> !l.startsWith("#")) + .collect(Collectors.toList()); + chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrDataLines); + chromosomeService.insertAllChromosomes(chromosomeEntityList); + RestTemplate restTemplate = mock(RestTemplate.class); + md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, ncbiDataSource, restTemplate); + for (int i = 0; i < chromosomeEntityList.size(); i++) { + ChromosomeEntity chromosomeEntity = chromosomeEntityList.get(i); String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}"; Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER", chromosomeEntity.getInsdcAccession()), JsonNode.class)) @@ -58,13 +68,13 @@ void setup() throws JsonProcessingException { @Test void testUpdateMD5ChecksumForAssembly() { chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 100)) + PageRequest.of(0, 5000)) .forEach(c -> assertNull(c.getMd5checksum())); md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession()); chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(), - PageRequest.of(0, 100)) + PageRequest.of(0, 5000)) .forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum())); } }