diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java index 7251ab71..d739dc7f 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/ENAAssemblyDataSource.java @@ -90,7 +90,7 @@ public Optional getAssemblyByAccession(String accession) throws } return Optional.of(assemblyEntity); } catch (Exception e) { - logger.info("Could not fetch Assembly Report form ENA for accession " + accession + "Exception: " + e); + logger.warn("Could not fetch Assembly Report from ENA for accession " + accession + "Exception: " + e); return Optional.empty(); } @@ -105,14 +105,14 @@ public Optional downloadAssemblyReport(ENABrowser enaBrowser, String acces try { boolean success = enaBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); if (success) { - logger.info("ENA assembly report downloaded successfully"); + logger.info("ENA assembly report downloaded successfully for accession "+ accession); return Optional.of(downloadFilePath); } else { - logger.info("ENA assembly report could not be downloaded successfully"); + logger.warn("ENA assembly report could not be downloaded successfully for accession "+accession); return Optional.empty(); } } catch (IOException | DownloadFailedException e) { - logger.info("Error downloading ENA assembly report " + e); + logger.warn("Error downloading ENA assembly report for accession "+ accession + e); return Optional.empty(); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java index ad097564..08226ff2 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblyDataSource.java @@ -33,7 +33,6 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.net.UnknownHostException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -63,12 +62,8 @@ public Optional getAssemblyByAccession( String accession) throws IOException, IllegalArgumentException { NCBIBrowser ncbiBrowser = factory.build(); ncbiBrowser.connect(); - Optional directory = ncbiBrowser.getGenomeReportDirectory(accession); - if (!directory.isPresent()) { - return Optional.empty(); - } - Optional downloadFilePath = downloadAssemblyReport(ncbiBrowser, directory.get()); + Optional downloadFilePath = downloadAssemblyReport(accession, ncbiBrowser); if (!downloadFilePath.isPresent()) { return Optional.empty(); } @@ -77,7 +72,8 @@ public Optional getAssemblyByAccession( try (InputStream stream = new FileInputStream(downloadFilePath.get().toFile())) { NCBIAssemblyReportReader reader = readerFactory.build(stream); assemblyEntity = reader.getAssemblyEntity(); - logger.info("NCBI: Number of chromosomes in " + accession + " : " + assemblyEntity.getChromosomes().size()); + logger.info("NCBI: Number of chromosomes in " + accession + " : " + + (assemblyEntity.getChromosomes() != null ? assemblyEntity.getChromosomes().size() : 0)); } finally { try { ncbiBrowser.disconnect(); @@ -89,17 +85,23 @@ public Optional getAssemblyByAccession( return Optional.of(assemblyEntity); } - @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier=2)) - public Optional downloadAssemblyReport(NCBIBrowser ncbiBrowser, String directory) throws IOException { - FTPFile ftpFile = ncbiBrowser.getNCBIAssemblyReportFile(directory); - String ftpFilePath = directory + ftpFile.getName(); + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) + public Optional downloadAssemblyReport(String accession, NCBIBrowser ncbiBrowser) throws IOException { + Optional directory = ncbiBrowser.getGenomeReportDirectory(accession); + if (!directory.isPresent()) { + return Optional.empty(); + } + logger.info("NCBI directory for assembly report download: " + directory.get()); + + FTPFile ftpFile = ncbiBrowser.getNCBIAssemblyReportFile(directory.get()); + String ftpFilePath = directory.get() + ftpFile.getName(); Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); if (success) { - logger.info("NCBI assembly report downloaded successfully"); + logger.info("NCBI assembly report downloaded successfully (" + ftpFile.getName() + ")"); return Optional.of(downloadFilePath); } else { - logger.info("NCBI assembly report could not be downloaded successfully"); + logger.error("NCBI assembly report could not be downloaded successfully(" + ftpFile.getName() + ")"); return Optional.empty(); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java index 4ce17d32..28ad9329 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIAssemblyReportReader.java @@ -44,7 +44,8 @@ protected void parseReport() throws IOException, NullPointerException { parseAssemblyData(line); } else if (!line.startsWith("#")) { String[] columns = line.split("\t", -1); - if (columns.length >= 6 && columns[5].equals("=")) { + if (columns.length >= 6 && (columns[5].equals("=") || columns[5].equals("<>")) && + (columns[4] != null && !columns[4].isEmpty() && !columns[4].equals("na"))) { if (columns[3].equals("Chromosome") && columns[1].equals("assembled-molecule")) { parseChromosomeLine(columns); } else if (isScaffoldsEnabled) { @@ -98,7 +99,11 @@ protected void parseChromosomeLine(String[] columns) { chromosomeEntity.setGenbankSequenceName(columns[0]); chromosomeEntity.setInsdcAccession(columns[4]); - chromosomeEntity.setRefseq(columns[6]); + if (columns[6] == null || columns[6].isEmpty() || columns[6].equals("na")) { + chromosomeEntity.setRefseq(null); + } else { + chromosomeEntity.setRefseq(columns[6]); + } if (columns.length > 8) { try { @@ -132,7 +137,11 @@ protected void parseScaffoldLine(String[] columns) { scaffoldEntity.setGenbankSequenceName(columns[0]); scaffoldEntity.setInsdcAccession(columns[4]); - scaffoldEntity.setRefseq(columns[6]); + if (columns[6] == null || columns[6].isEmpty() || columns[6].equals("na")) { + scaffoldEntity.setRefseq(null); + } else { + scaffoldEntity.setRefseq(columns[6]); + } if (columns.length > 8) { try { diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java index 9435ec71..e5afdbdc 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SequenceEntity.java @@ -41,7 +41,6 @@ public class SequenceEntity { @ApiModelProperty(value = "Sequence's INSDC accession.") private String insdcAccession; - @Column(nullable = false) @ApiModelProperty(value = "Sequence's RefSeq accession.") private String refseq; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java index dbb33bcb..58a68d34 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblyService.java @@ -21,7 +21,6 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; -import org.springframework.data.util.Pair; import org.springframework.stereotype.Service; import uk.ac.ebi.eva.contigalias.datasource.ENAAssemblyDataSource; import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; @@ -31,6 +30,7 @@ import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblyException; import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; +import javax.transaction.Transactional; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -40,7 +40,7 @@ import java.util.Optional; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.Future; + @Service public class AssemblyService { @@ -51,8 +51,6 @@ public class AssemblyService { private final ENAAssemblyDataSource enaDataSource; - private final ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); - private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); @Autowired @@ -102,7 +100,12 @@ public void fetchAndInsertAssembly(String accession) throws IOException { throw new AssemblyNotFoundException(accession); } enaDataSource.addENASequenceNamesToAssembly(fetchAssembly); - fetchAssembly.ifPresent(this::insertAssembly); + if (fetchAssembly.get().getChromosomes() != null && fetchAssembly.get().getChromosomes().size() > 0) { + insertAssembly(fetchAssembly.get()); + logger.info("Successfully inserted assembly for accession " + accession); + } else { + logger.error("Skipping inserting assembly : No chromosome in assembly " + accession); + } } public Optional getAssemblyByAccession(String accession) { @@ -131,6 +134,7 @@ private void stripAssemblyFromChromosomes(AssemblyEntity assembly) { } } + @Transactional public void insertAssembly(AssemblyEntity entity) { if (isEntityPresent(entity)) { throw duplicateAssemblyInsertionException(null, entity); @@ -155,15 +159,21 @@ public boolean isEntityPresent(AssemblyEntity entity) { public Map> fetchAndInsertAssembly(List accessions) { Map> accessionResult = new HashMap<>(); - List>> executorResponseList = new ArrayList<>(); + accessionResult.put("SUCCESS", new ArrayList<>()); + accessionResult.put("FAILURE", new ArrayList<>()); + for (String accession : accessions) { try { + logger.info("Started processing assembly accession : " + accession); this.fetchAndInsertAssembly(accession); - accessionResult.getOrDefault("SUCCESS", new ArrayList<>()).add(accession); + accessionResult.get("SUCCESS").add(accession); } catch (Exception e) { - accessionResult.getOrDefault("FAILURE", new ArrayList<>()).add(accession); + logger.error("Exception while loading assembly for accession " + accession + e); + accessionResult.get("FAILURE").add(accession); } } + logger.info("Success: " + accessionResult.getOrDefault("SUCCESS", Collections.emptyList())); + logger.info("Failure: " + accessionResult.getOrDefault("FAILURE", Collections.emptyList())); return accessionResult; } diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/RetryTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/RetryTest.java index e3750a64..2ba4895a 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/RetryTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/RetryTest.java @@ -41,37 +41,43 @@ public class RetryTest { @Test public void fileDownloadSuccessfulTest() throws IOException { + String mockAccession = "GCA_000012345.1"; String mockDirectory = "/src/test/resources/"; String mockFileName = "mock_file.txt"; Long mockFileSize = 1000l; + when(ncbiBrowser.getGenomeReportDirectory(mockAccession)).thenReturn(Optional.of(mockDirectory)); when(ncbiBrowser.getNCBIAssemblyReportFile(mockDirectory)).thenReturn(ftpFile); when(ftpFile.getName()).thenReturn(mockFileName); when(ftpFile.getSize()).thenReturn(mockFileSize); when(ncbiBrowser.downloadFTPFile(mockDirectory + mockFileName, Paths.get("/tmp/mock_file.txt"), mockFileSize)) .thenReturn(true); - Optional result = dataSource.downloadAssemblyReport(ncbiBrowser, mockDirectory); + Optional result = dataSource.downloadAssemblyReport(mockAccession, ncbiBrowser); assertTrue(result.isPresent()); } @Test public void fileDownloadFailedTest() throws IOException { + String mockAccession = "GCA_000012345.1"; String mockDirectory = "/src/test/resources/"; String mockFileName = "mock_file.txt"; Long mockFileSize = 1000l; + when(ncbiBrowser.getGenomeReportDirectory(mockAccession)).thenReturn(Optional.of(mockDirectory)); when(ncbiBrowser.getNCBIAssemblyReportFile(mockDirectory)).thenReturn(ftpFile); when(ftpFile.getName()).thenReturn(mockFileName); when(ftpFile.getSize()).thenReturn(mockFileSize); when(ncbiBrowser.downloadFTPFile(mockDirectory + mockFileName, Paths.get("/tmp/mock_file.txt"), mockFileSize)) .thenReturn(false); - Optional result = dataSource.downloadAssemblyReport(ncbiBrowser, mockDirectory); + Optional result = dataSource.downloadAssemblyReport(mockAccession, ncbiBrowser); assertFalse(result.isPresent()); } @Test public void fileDownloadFailedRetryTest() throws IOException { + String mockAccession = "GCA_000012345.1"; String mockDirectory = "/src/test/resources/"; String mockFileName = "mock_file.txt"; Long mockFileSize = 1000l; + when(ncbiBrowser.getGenomeReportDirectory(mockAccession)).thenReturn(Optional.of(mockDirectory)); when(ncbiBrowser.getNCBIAssemblyReportFile(mockDirectory)).thenReturn(ftpFile); when(ftpFile.getName()).thenReturn(mockFileName); when(ftpFile.getSize()).thenReturn(mockFileSize); @@ -80,7 +86,7 @@ public void fileDownloadFailedRetryTest() throws IOException { NCBIAssemblyDataSource anotherObjSpy = Mockito.spy(dataSource); DownloadFailedException thrown = Assertions.assertThrows(DownloadFailedException.class, () -> { - anotherObjSpy.downloadAssemblyReport(ncbiBrowser, mockDirectory); + anotherObjSpy.downloadAssemblyReport(mockAccession, ncbiBrowser); }); assertEquals("Download Failed", thrown.getMessage()); @@ -88,4 +94,18 @@ public void fileDownloadFailedRetryTest() throws IOException { .downloadFTPFile(mockDirectory + mockFileName, Paths.get("/tmp/mock_file.txt"), mockFileSize); } + @Test + public void fileDownloadFailedRetryTest2() throws IOException { + String mockAccession = "GCA_000012345.1"; + when(ncbiBrowser.getGenomeReportDirectory(mockAccession)).thenThrow(new IOException("Error listing files")); + + NCBIAssemblyDataSource anotherObjSpy = Mockito.spy(dataSource); + IOException thrown = Assertions.assertThrows(IOException.class, () -> { + anotherObjSpy.downloadAssemblyReport(mockAccession, ncbiBrowser); + }); + + assertEquals("Error listing files", thrown.getMessage()); + verify(ncbiBrowser, times(5)).getGenomeReportDirectory(mockAccession); + } + }