Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA3454 - Retrieve and Update Md5checksum for assemblies #121

Merged
merged 5 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@
<dependency>
<groupId>org.springframework.retry</groupId>
<artifactId>spring-retry</artifactId>
<version>1.2.5.RELEASE</version>
<version>1.3.1</version>
</dependency>

</dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
import org.springframework.hateoas.config.EnableHypermediaSupport;
import org.springframework.retry.annotation.EnableRetry;
import org.springframework.scheduling.annotation.EnableScheduling;

@EnableScheduling
@SpringBootApplication
@EnableRetry
@EnableHypermediaSupport(type = EnableHypermediaSupport.HypermediaType.HAL)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package uk.ac.ebi.eva.contigalias.conf;

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.web.client.RestTemplate;

@Configuration
public class ContigAliasConfiguration {

@Bean
public RestTemplate getRestTemplate() {
return new RestTemplate();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,20 @@
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.DeleteMapping;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

@RequestMapping("/v1/admin")
@RestController
Expand Down Expand Up @@ -87,6 +91,36 @@ public ResponseEntity<?> fetchAndInsertAssemblyByAccession(
return new ResponseEntity<>("Accession Processing Result : " + accessionResult, HttpStatus.MULTI_STATUS);
}

@ApiOperation(value = "Given an assembly accession, retrieve MD5 checksum for all chromosomes belonging to assembly and update")
@PutMapping(value = "assemblies/{accession}/md5checksum")
public ResponseEntity<String> retrieveAndInsertMd5ChecksumForAssembly(@PathVariable(name = "accession")
@ApiParam(value = "INSDC or RefSeq assembly accession. Eg: " +
"GCA_000001405.10") String asmAccession) {
try {
handler.getAssemblyByAccession(asmAccession);
handler.retrieveAndInsertMd5ChecksumForAssembly(asmAccession);
return ResponseEntity.ok("A task has been submitted for updating md5checksum for all chromosomes " +
"in assembly " + asmAccession + ". Depending upon the number of chromosomes present in assembly, " +
"this might take some time to complete");
} catch (AssemblyNotFoundException e) {
return ResponseEntity.ok("Could not find assembly " + asmAccession +
". Please insert the assembly first (md5checksum will be updated as part of the insertion process");
}
}

@ApiOperation(value = "Retrieve list of assemblies for which MD5 Checksum updates are running/going-to-run ")
@GetMapping(value = "assemblies/md5checksum/status")
public ResponseEntity<String> getMD5ChecksumUpdateTaskStatus() {
Map<String, Set<String>> md5ChecksumUpdateTasks = handler.getMD5ChecksumUpdateTaskStatus();
Set<String> runningTasks = md5ChecksumUpdateTasks.get("running");
Set<String> scheduledTasks = md5ChecksumUpdateTasks.get("scheduled");
String runningTaskRes = runningTasks == null || runningTasks.isEmpty() ? "No running MD5 checksum update tasks" :
runningTasks.stream().collect(Collectors.joining(","));
String scheduledTaskRes = scheduledTasks == null || scheduledTasks.isEmpty() ? "No scheduled MD5 checksum update tasks" :
scheduledTasks.stream().collect(Collectors.joining(","));
return ResponseEntity.ok("running: " + runningTaskRes + "\nscheduled: " + scheduledTaskRes);
}

// This endpoint can be enabled in the future when checksums for assemblies are added to the project.
// @ApiOperation(value = "Add MD5 and TRUNC512 checksums to an assembly by accession.",
// notes = "Given an INSDC or RefSeq accession along with a MD5 or a TRUNC512 checksum, this endpoint will
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

@Service
public class AdminHandler {
Expand All @@ -46,6 +48,10 @@ public AdminHandler(AssemblyService assemblyService,
this.assemblyAssembler = assemblyAssembler;
}

public Optional<AssemblyEntity> getAssemblyByAccession(String accession) {
return assemblyService.getAssemblyByAccession(accession);
}

public void fetchAndInsertAssemblyByAccession(String accession) throws IOException {
assemblyService.fetchAndInsertAssembly(accession);
}
Expand All @@ -54,6 +60,14 @@ public Map<String, List<String>> fetchAndInsertAssemblyByAccession(List<String>
return assemblyService.fetchAndInsertAssembly(accessions);
}

public void retrieveAndInsertMd5ChecksumForAssembly(String accession) {
assemblyService.retrieveAndInsertMd5ChecksumForAssembly(accession);
}

public Map<String, Set<String>> getMD5ChecksumUpdateTaskStatus() {
return assemblyService.getMD5ChecksumUpdateTaskStatus();
}

public void deleteAssemblyByAccession(String accession) {
assemblyService.deleteAssemblyByAccession(accession);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,20 +123,17 @@ public Optional<Path> downloadAssemblyReport(ENABrowser enaBrowser, String acces
* @param optional {@link AssemblyEntity} to add ENA sequence names to
* @throws IOException Passes IOException thrown by {@link #getAssemblyByAccession(String)}
*/
public void addENASequenceNamesToAssembly(Optional<AssemblyEntity> optional) throws IOException {
if (optional.isPresent()) {
AssemblyEntity targetAssembly = optional.get();
if (!hasAllEnaSequenceNames(targetAssembly)) {
String insdcAccession = targetAssembly.getInsdcAccession();
Optional<AssemblyEntity> enaAssembly = getAssemblyByAccession(insdcAccession);

if (enaAssembly.isPresent()) {
AssemblyEntity sourceAssembly = enaAssembly.get();
addENASequenceNames(Objects.nonNull(sourceAssembly.getChromosomes()) ?
sourceAssembly.getChromosomes() : Collections.emptyList(),
Objects.nonNull(targetAssembly.getChromosomes()) ?
targetAssembly.getChromosomes() : Collections.emptyList());
}
public void addENASequenceNamesToAssembly(AssemblyEntity targetAssembly) throws IOException {
if (!hasAllEnaSequenceNames(targetAssembly)) {
String insdcAccession = targetAssembly.getInsdcAccession();
Optional<AssemblyEntity> enaAssembly = getAssemblyByAccession(insdcAccession);

if (enaAssembly.isPresent()) {
AssemblyEntity sourceAssembly = enaAssembly.get();
addENASequenceNames(Objects.nonNull(sourceAssembly.getChromosomes()) ?
sourceAssembly.getChromosomes() : Collections.emptyList(),
Objects.nonNull(targetAssembly.getChromosomes()) ?
targetAssembly.getChromosomes() : Collections.emptyList());
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.Modifying;
import org.springframework.data.jpa.repository.Query;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;

import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;

import java.util.List;

@Repository
public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Long> {

Expand All @@ -35,6 +40,16 @@ public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Lo

Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccession(String asmInsdcAccession, Pageable request);

@Query("SELECT c FROM ChromosomeEntity c WHERE c.assembly.insdcAccession = :asmInsdcAccession AND (c.md5checksum IS NULL OR c.md5checksum = '')")
Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_InsdcAccessionAndMd5checksumIsNullOrEmpty(@Param("asmInsdcAccession") String asmInsdcAccession, Pageable pageable);

@Query("SELECT distinct c.assembly.insdcAccession FROM ChromosomeEntity c WHERE c.md5checksum IS NULL OR c.md5checksum = ''")
List<String> findAssembliesWhereChromosomeMd5checksumIsNullOrEmpty();

@Modifying
@Query("UPDATE ChromosomeEntity c SET c.md5checksum = :md5Checksum WHERE c.assembly.insdcAccession= :asmInsdcAccession AND c.insdcAccession = :insdcAccession")
void updateMd5ChecksumByInsdcAccession(@Param("asmInsdcAccession") String asmInsdcAccession, @Param("insdcAccession") String insdcAccession, @Param("md5Checksum") String md5Checksum);

Page<ChromosomeEntity> findChromosomeEntitiesByAssembly_Refseq(String asmRefseq, Pageable request);

Page<ChromosomeEntity> findChromosomeEntitiesByGenbankSequenceNameAndAssembly_Taxid(String genbankName, long asmTaxid, Pageable request);
Expand Down
123 changes: 123 additions & 0 deletions src/main/java/uk/ac/ebi/eva/contigalias/scheduler/ChecksumSetter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package uk.ac.ebi.eva.contigalias.scheduler;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Slice;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
import uk.ac.ebi.eva.contigalias.service.ChromosomeService;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

@Component
public class ChecksumSetter {
private final Logger logger = LoggerFactory.getLogger(ChecksumSetter.class);
private final Map<String, CompletableFuture<Void>> runningMD5ChecksumUpdateTasks = new ConcurrentHashMap<>();
private Set<String> scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>();
private int DEFAULT_PAGE_SIZE = 10000;
private ChromosomeService chromosomeService;
private Md5ChecksumRetriever md5ChecksumRetriever;

@Autowired
public ChecksumSetter(ChromosomeService chromosomeService, Md5ChecksumRetriever md5ChecksumRetriever) {
this.chromosomeService = chromosomeService;
this.md5ChecksumRetriever = md5ChecksumRetriever;
}

@Scheduled(cron = "0 0 0 ? * TUE")
public void updateMd5CheckSumForAllAssemblies() {
scheduledToRunMD5ChecksumUpdateTasks = new HashSet<>();
List<String> assemblyList = chromosomeService.getAssembliesWhereChromosomeMd5ChecksumIsNull();
logger.info("List of assemblies to be updated for MD5 Checksum: " + assemblyList);
scheduledToRunMD5ChecksumUpdateTasks.addAll(assemblyList.stream().collect(Collectors.toSet()));

for (String assembly : assemblyList) {
CompletableFuture<Void> future = updateMd5CheckSumForAssemblyAsync(assembly);
try {
future.get();
} catch (InterruptedException | ExecutionException e) {
logger.error("Encountered an error when running MD5Checksum update for assembly: " + assembly);
} finally {
scheduledToRunMD5ChecksumUpdateTasks.remove(assembly);
}
}
}

public CompletableFuture<Void> updateMd5CheckSumForAssemblyAsync(String assembly) {
logger.info("Submitted job for updating MD5 Checksum for assembly (asynchronously)");
// Check if the async task for this assembly is already running
CompletableFuture<Void> existingTask = runningMD5ChecksumUpdateTasks.get(assembly);
if (existingTask != null && !existingTask.isDone()) {
logger.info("Async task is still running for assembly: " + assembly);
return existingTask;
}
// Start the async task (removing existing run if present)
runningMD5ChecksumUpdateTasks.remove(assembly);
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
updateMD5ChecksumForAllChromosomesInAssembly(assembly);
});
// Store the future in the map for the given assembly
runningMD5ChecksumUpdateTasks.put(assembly, future);

// check the status of task upon completion and remove from running tasks
future.whenComplete((result, exception) -> {
if (exception != null) {
logger.error("Async task (MD5Checksum setter) failed for assembly: " + assembly, exception);
} else {
logger.info("Async task (MD5Checksum setter) completed successfully for assembly: " + assembly);
}
runningMD5ChecksumUpdateTasks.remove(assembly);
});

return future;
}

public void updateMD5ChecksumForAllChromosomesInAssembly(String assembly) {
logger.info("Trying to update md5checksum for assembly: " + assembly);
Slice<ChromosomeEntity> chrSlice;
Pageable pageable = PageRequest.of(0, DEFAULT_PAGE_SIZE);
long chromosomeUpdated = 0;
do {
chrSlice = chromosomeService.getChromosomesByAssemblyInsdcAccessionWhereMd5ChecksumIsNull(assembly, pageable);
List<ChromosomeEntity> chromosomeEntityList = chrSlice.getContent();
updateMd5ChecksumForChromosome(chromosomeEntityList);

chromosomeUpdated += chromosomeEntityList.size();
logger.info("Chromosomes Updated till now: " + chromosomeUpdated);
} while (chrSlice.hasNext());

logger.info("Updating md5checksum for assembly " + assembly + " completed");
}

public void updateMd5ChecksumForChromosome(List<ChromosomeEntity> chromosomesList) {
chromosomesList.parallelStream().forEach(chromosome -> {
try {
String md5Checksum = md5ChecksumRetriever.retrieveMd5Checksum(chromosome.getInsdcAccession());
chromosome.setMd5checksum(md5Checksum);
} catch (Exception e) {
logger.info("Could not retrieve md5Checksum for insdc accession: " + chromosome.getInsdcAccession());
}
});

chromosomeService.updateMd5ChecksumForAll(chromosomesList);
}

public Map<String, Set<String>> getMD5ChecksumUpdateTaskStatus() {
Map<String, Set<String>> taskStatus = new HashMap<>();
taskStatus.put("running", runningMD5ChecksumUpdateTasks.keySet());
taskStatus.put("scheduled", scheduledToRunMD5ChecksumUpdateTasks);
return taskStatus;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package uk.ac.ebi.eva.contigalias.scheduler;

import com.fasterxml.jackson.databind.JsonNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Component;
import org.springframework.web.client.RestTemplate;

@Component
public class Md5ChecksumRetriever {
private final Logger logger = LoggerFactory.getLogger(Md5ChecksumRetriever.class);
private String INSDC_ACCESSION_PLACE_HOLDER = "INSDC_ACCESSION_PLACE_HOLDER";
private String INSDC_CHECKSUM_URL = "https://www.ebi.ac.uk/ena/cram/sequence/insdc:" + INSDC_ACCESSION_PLACE_HOLDER + "/metadata";

private RestTemplate restTemplate;

@Autowired
public Md5ChecksumRetriever(RestTemplate restTemplate) {
this.restTemplate = restTemplate;
}

@Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
public String retrieveMd5Checksum(String insdcAccession) {
String apiURL = INSDC_CHECKSUM_URL.replace(INSDC_ACCESSION_PLACE_HOLDER, insdcAccession);
JsonNode jsonResponse = restTemplate.getForObject(apiURL, JsonNode.class);
String md5Checksum = jsonResponse.get("metadata").get("md5").asText();
return md5Checksum;
}
}
Loading
Loading