Skip to content

Commit

Permalink
update MD5-checksum-update logic - download file and read insdc accession from it
Browse files Browse the repository at this point in the history
  • Loading branch information
nitin-ebi committed Feb 13, 2024
1 parent edd5c2e commit 3e39913
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,89 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.ResultSetExtractor;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;
import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource;
import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException;
import uk.ac.ebi.eva.contigalias.service.ChromosomeService;

import java.io.BufferedReader;
import java.io.FileReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

@Component
public class MD5ChecksumUpdater {
// NOTE(review): this listing is a rendered diff without +/- markers, so declarations from both
// sides of the change appear together (restTemplate is declared twice below). Presumably the
// JdbcTemplate field belongs to the removed side and the NCBIAssemblyDataSource field to the
// added side -- confirm against the repository.
private final Logger logger = LoggerFactory.getLogger(MD5ChecksumUpdater.class);
// Number of items accumulated in updateMD5ChecksumForAssembly before a batch is handed to
// updateMd5ChecksumForChromosome.
private final int DEFAULT_BATCH_SIZE = 10000;
// Token embedded in INSDC_CHECKSUM_URL below; presumably replaced with a chromosome's INSDC
// accession before the request is made (the replacement is not visible in this view).
private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER";
// URL template for the ENA sequence metadata endpoint, containing the placeholder token above.
private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata";
private RestTemplate restTemplate;
private final JdbcTemplate jdbcTemplate;
private final NCBIAssemblyDataSource ncbiDataSource;
private final ChromosomeService chromosomeService;
private RestTemplate restTemplate;

/**
 * Stores the injected collaborators in the corresponding fields.
 *
 * NOTE(review): two constructor signatures are interleaved here because this is a rendered diff
 * without +/- markers. Presumably the (RestTemplate, JdbcTemplate, ChromosomeService) signature
 * is the removed one and the (ChromosomeService, NCBIAssemblyDataSource, RestTemplate) signature
 * is the added one -- confirm against the repository.
 */
@Autowired
public MD5ChecksumUpdater(RestTemplate restTemplate, JdbcTemplate jdbcTemplate, ChromosomeService chromosomeService) {
this.restTemplate = restTemplate;
this.jdbcTemplate = jdbcTemplate;
public MD5ChecksumUpdater(ChromosomeService chromosomeService, NCBIAssemblyDataSource ncbiDataSource, RestTemplate restTemplate) {
this.chromosomeService = chromosomeService;
this.ncbiDataSource = ncbiDataSource;
this.restTemplate = restTemplate;
}

/**
 * Updates the MD5 checksum of the chromosomes of one assembly in batches of
 * DEFAULT_BATCH_SIZE, logging progress. Any exception is caught and logged; nothing is
 * propagated to the caller.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so two bodies are interleaved:
 *  - a JdbcTemplate body that selects the assembly's chromosome rows whose md5checksum is NULL
 *    or empty and batches them from the ResultSet;
 *  - an NCBIAssemblyDataSource body that downloads the assembly report, batches its non-'#'
 *    lines, converts each batch with ncbiDataSource.getChromosomeEntityList, and deletes the
 *    downloaded file in the finally block.
 * Presumably the JdbcTemplate lines are the removed side and the report-based lines the added
 * side (the commit title says "download file and read insdc accession from it") -- confirm.
 *
 * The parameter (named assembly in one signature, accession in the other) is the INSDC
 * accession of the assembly to process.
 */
public void updateMD5ChecksumForAssembly(String assembly) {
public void updateMD5ChecksumForAssembly(String accession) {
logger.info("Start Update MD5 Checksum for assembly : " + accession);
// Kept outside the try so the finally block can delete the file if it was downloaded.
Path downloadedNCBIFilePath = null;
try {
logger.info("Trying to update MD5 Checksum for assembly: " + assembly);
// NOTE(review): the accession is concatenated into the SQL text instead of being bound as a
// query parameter.
String sql = "select * from chromosome c where c.assembly_insdc_accession = '" + assembly
+ "' AND (c.md5checksum IS NULL OR c.md5checksum = '')";
jdbcTemplate.query(sql, (ResultSetExtractor<Void>) rs -> {
long chromosomeProcessed = 0;
List<ChromosomeEntity> chromosomeEntityList = new ArrayList<>();
while (rs.next()) {
ChromosomeEntity chromosome = new ChromosomeEntity();
// Reads column 1 of a "select *" result, so this depends on the table's column order;
// assumes the first column is the chromosome's INSDC accession -- TODO confirm.
chromosome.setInsdcAccession(rs.getString(1));
chromosomeEntityList.add(chromosome);
// An empty Optional from the download is turned into AssemblyNotFoundException.
Optional<Path> downloadNCBIFilePathOpt = ncbiDataSource.downloadAssemblyReport(accession);
downloadedNCBIFilePath = downloadNCBIFilePathOpt.orElseThrow(() -> new AssemblyNotFoundException(accession));

// Only the INSDC accession is set on this entity before it is passed to
// getChromosomeEntityList.
AssemblyEntity assemblyEntity = new AssemblyEntity();
assemblyEntity.setInsdcAccession(accession);

// Counts the lines that do not start with '#'; the count is used for logging only.
// NOTE(review): the Stream returned by Files.lines is never closed here; the JDK
// documentation asks for try-with-resources so the underlying file is released.
long numberOfChromosomesInFile = Files.lines(downloadedNCBIFilePath).filter(line -> !line.startsWith("#")).count();
logger.info("Number of chromosomes in assembly (" + accession + "): " + numberOfChromosomesInFile);

// Full batch reached: update it, then start a new list.
if (chromosomeEntityList.size() == DEFAULT_BATCH_SIZE) {
updateMd5ChecksumForChromosome(assembly, chromosomeEntityList);
chromosomeProcessed += chromosomeEntityList.size();
logger.info("Chromosomes Processed till now: " + chromosomeProcessed);
chromosomeEntityList = new ArrayList<>();
// Second pass over the same file, this time collecting lines into batches.
// NOTE(review): FileReader(File) names no charset, so the default charset is used -- confirm
// that this matches the report's encoding.
try (BufferedReader bufferedReader = new BufferedReader(new FileReader(downloadedNCBIFilePath.toFile()))) {
long chromosomesUpdatedTillNow = 0l;
List<String> chrLines = new ArrayList<>();
String line;
while ((line = bufferedReader.readLine()) != null) {
// Lines starting with '#' are skipped (presumably report header/comment lines).
if (line.startsWith("#")) {
continue;
}
chrLines.add(line);
// Full batch reached: convert the lines to entities, update them, then start a new list.
if (chrLines.size() == DEFAULT_BATCH_SIZE) {
List<ChromosomeEntity> chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines);
updateMd5ChecksumForChromosome(accession, chromosomeEntityList);
chromosomesUpdatedTillNow += chrLines.size();
logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow);

chrLines = new ArrayList<>();
}
}
// Flush the final partial batch.
if (chromosomeEntityList.size() > 0) {
updateMd5ChecksumForChromosome(assembly, chromosomeEntityList);
chromosomeProcessed += chromosomeEntityList.size();
logger.info("Chromosomes Processed till now: " + chromosomeProcessed);
// Flush the final partial batch.
if (!chrLines.isEmpty()) {
List<ChromosomeEntity> chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrLines);
updateMd5ChecksumForChromosome(accession, chromosomeEntityList);
chromosomesUpdatedTillNow += chrLines.size();
logger.info("Number of chromosomes updated till now : " + chromosomesUpdatedTillNow);
}
}

logger.info("Finished updating MD5 Checksum for assembly: " + assembly);

return null;
});
logger.info("MD5 Checksum update finished successfully for assembly: " + accession);
} catch (Exception e) {
// NOTE(review): the exception is appended to the message via string concatenation, so only
// its toString() is logged and the stack trace is lost.
logger.error("Error while updating MD5 Checksum for assembly : " + assembly + "\n" + e);
logger.error("Error while updating MD5 Checksum for assembly : " + accession + "\n" + e);
} finally {
// Best-effort cleanup of the downloaded report; a failed delete is logged as a warning only.
if (downloadedNCBIFilePath != null) {
try {
Files.deleteIfExists(downloadedNCBIFilePath);
} catch (Exception e) {
// NOTE(review): the caught exception is not included in the log message.
logger.warn("Could not delete file : " + downloadedNCBIFilePath);
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package uk.ac.ebi.eva.contigalias.scheduler;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.BeforeEach;
Expand All @@ -9,18 +8,21 @@
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.data.domain.PageRequest;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.test.annotation.DirtiesContext;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.web.client.RestTemplate;
import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource;
import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
import uk.ac.ebi.eva.contigalias.entitygenerator.AssemblyGenerator;
import uk.ac.ebi.eva.contigalias.entitygenerator.ChromosomeGenerator;
import uk.ac.ebi.eva.contigalias.service.AssemblyService;
import uk.ac.ebi.eva.contigalias.service.ChromosomeService;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
Expand All @@ -30,24 +32,32 @@
@DirtiesContext(classMode = DirtiesContext.ClassMode.BEFORE_CLASS)
@SpringBootTest
class MD5ChecksumUpdaterTest {
// NOTE(review): this listing is a rendered diff without +/- markers, so field declarations from
// both sides of the change appear together (assemblyEntity is declared twice, and the
// JdbcTemplate and AssemblyService fields share one @Autowired line). Presumably the
// AssemblyGenerator and JdbcTemplate lines are the removed side -- confirm against the repository.

// Assembly accession that setup passes to ncbiDataSource.downloadAssemblyReport.
private static final String GCA_ACCESSION_HAVING_CHROMOSOMES = "GCA_000003055.5";
// URL template; setup replaces the placeholder token with each chromosome's INSDC accession
// when stubbing RestTemplate.getForObject.
private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:INSDC_ACCESSION_PLACE_HOLDER/metadata";
private AssemblyEntity assemblyEntity = AssemblyGenerator.generate();
private AssemblyEntity assemblyEntity;
private List<ChromosomeEntity> chromosomeEntityList = new ArrayList<>();
@Autowired
private JdbcTemplate jdbcTemplate;
private AssemblyService assemblyService;
@Autowired
private ChromosomeService chromosomeService;
@Autowired
private NCBIAssemblyDataSource ncbiDataSource;
// Instance under test; constructed in setup with a mocked RestTemplate.
private MD5ChecksumUpdater md5ChecksumUpdater;

@BeforeEach
void setup() throws JsonProcessingException {
RestTemplate restTemplate = mock(RestTemplate.class);
md5ChecksumUpdater = new MD5ChecksumUpdater(restTemplate, jdbcTemplate, chromosomeService);
for (int i = 0; i < 5; i++) {
ChromosomeEntity chromosomeEntity = ChromosomeGenerator.generate(assemblyEntity);
chromosomeEntityList.add(chromosomeEntity);
chromosomeService.insertChromosome(chromosomeEntity);
void setup() throws IOException {
Path assemblyReportPath = ncbiDataSource.downloadAssemblyReport(GCA_ACCESSION_HAVING_CHROMOSOMES).get();
assemblyEntity = ncbiDataSource.getAssemblyEntity(assemblyReportPath);
assemblyService.insertAssembly(assemblyEntity);
List<String> chrDataLines = Files.lines(assemblyReportPath).filter(l -> !l.startsWith("#"))
.collect(Collectors.toList());
chromosomeEntityList = ncbiDataSource.getChromosomeEntityList(assemblyEntity, chrDataLines);
chromosomeService.insertAllChromosomes(chromosomeEntityList);

RestTemplate restTemplate = mock(RestTemplate.class);
md5ChecksumUpdater = new MD5ChecksumUpdater(chromosomeService, ncbiDataSource, restTemplate);
for (int i = 0; i < chromosomeEntityList.size(); i++) {
ChromosomeEntity chromosomeEntity = chromosomeEntityList.get(i);
String jsonMD5Response = "{\"metadata\": {\"md5\": \"" + chromosomeEntity.getInsdcAccession() + "-MD5\"}}";
Mockito.when(restTemplate.getForObject(INSDC_CHECKSUM_URL.replace("INSDC_ACCESSION_PLACE_HOLDER",
chromosomeEntity.getInsdcAccession()), JsonNode.class))
Expand All @@ -58,13 +68,13 @@ void setup() throws JsonProcessingException {
/**
 * Checks that updateMD5ChecksumForAssembly fills in the MD5 checksum of the assembly's
 * chromosomes: the checksum is null before the call and equals the chromosome's INSDC accession
 * followed by "-MD5" afterwards, which is the md5 value placed in the JSON response that setup
 * builds for each chromosome.
 *
 * NOTE(review): both PageRequest sizes (100 and 5000) appear because this is a rendered diff
 * without +/- markers; presumably 100 is the removed value and 5000 the added one -- confirm.
 * Only the first page of chromosomes is asserted on.
 */
@Test
void testUpdateMD5ChecksumForAssembly() {
// Precondition: no chromosome on the first page has a checksum yet.
chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(),
PageRequest.of(0, 100))
PageRequest.of(0, 5000))
.forEach(c -> assertNull(c.getMd5checksum()));

md5ChecksumUpdater.updateMD5ChecksumForAssembly(assemblyEntity.getInsdcAccession());

// Postcondition: every chromosome on the first page carries the expected "<accession>-MD5" value.
chromosomeService.getChromosomesByAssemblyInsdcAccession(assemblyEntity.getInsdcAccession(),
PageRequest.of(0, 100))
PageRequest.of(0, 5000))
.forEach(c -> assertEquals(c.getInsdcAccession() + "-MD5", c.getMd5checksum()));
}
}

0 comments on commit 3e39913

Please sign in to comment.