Skip to content

Commit

Permalink
Merge pull request #92 from apriltuesday/EVA-2479
Browse files Browse the repository at this point in the history
EVA-2479 - Consume ENA sequence report
  • Loading branch information
apriltuesday authored Aug 5, 2021
2 parents 9ea5335 + d182278 commit b76da16
Show file tree
Hide file tree
Showing 31 changed files with 4,338 additions and 216 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ public class ContigAliasController {

public static final String AUTHORITY_NONE = "none";

public static final String NAME_SEQUENCE_TYPE = "chromosome";
public static final String NAME_GENBANK_TYPE = "genbank";

public static final String NAME_ENA_TYPE = "ena";

public static final String NAME_UCSC_TYPE = "ucsc";

Expand Down Expand Up @@ -280,7 +282,7 @@ public ResponseEntity<PagedModel<EntityModel<SequenceEntity>>> getSequencesByAss
@ApiOperation(value = "Get chromosomes using a combination of their own name and the Taxonomic ID's of their " +
"parent assemblies.",
notes = "Given a chromosome's name and the Taxonomic ID or the GenBank/RefSeq accession of the assembly " +
"that it belongs to, this endpoint will return a non-emtpy list of chromosomes that satisfy the " +
"that it belongs to, this endpoint will return a non-empty list of chromosomes that satisfy the " +
"given parameters. If no Taxonomic ID or accession are provided then the endpoint returns a list " +
"of chromosomes which have the given name. Each chromosome will also have its parent assembly " +
"nested inside it. The endpoint will either return a list of chromosomes or it will return an " +
Expand All @@ -291,11 +293,11 @@ public ResponseEntity<PagedModel<EntityModel<SequenceEntity>>> getSequencesBySeq
@RequestParam(required = false) @ApiParam(value = "Taxonomic ID of a group of accessions. Eg: 9606") Optional<Long> taxid,
@RequestParam(required = false, name = "accession") @ApiParam(value = "Genbank or Refseq assembly " +
"accession. Eg: GCA_000001405.10") Optional<String> asmAccession,
@RequestParam(required = false, name = "name") @ApiParam(value = "Specify if the provided name is a " +
"chromosome name or a UCSC style name. The acceptable param values are " + NAME_SEQUENCE_TYPE +
" " +
"and " + NAME_UCSC_TYPE + " respectively. If this parameter is omitted then the name is assumed " +
"to be a " + NAME_SEQUENCE_TYPE + " name by default.") Optional<String> nameTypeOpt,
@RequestParam(required = false, name = "name") @ApiParam(value = "Specify if the provided name is an " +
"GenBank chromosome name, ENA name, or a UCSC style name. The acceptable param values are " +
NAME_GENBANK_TYPE + ", " + NAME_ENA_TYPE + ", and " +
NAME_UCSC_TYPE + " respectively. If this parameter is omitted then the name is assumed " +
"to be a " + NAME_GENBANK_TYPE + " name by default.") Optional<String> nameTypeOpt,
@RequestParam(required = false, name = "page") @ApiParam(value = PAGE_NUMBER_DESCRIPTION) Integer pageNumber,
@RequestParam(required = false, name = "size") @ApiParam(value = PAGE_SIZE_DESCRIPTION) Integer pageSize) {
boolean isNameValid = name != null && !name.isEmpty();
Expand All @@ -306,7 +308,7 @@ public ResponseEntity<PagedModel<EntityModel<SequenceEntity>>> getSequencesBySeq
}
PageRequest pageRequest = createPageRequest(pageNumber, pageSize);
PagedModel<EntityModel<SequenceEntity>> pagedModel;
String nameType = nameTypeOpt.orElse(NAME_SEQUENCE_TYPE);
String nameType = nameTypeOpt.orElse(NAME_GENBANK_TYPE);
if (!isTaxidValid && !isAccessionValid) {
pagedModel = handler.getSequencesByName(name, nameType, pageRequest);
} else if (isTaxidValid) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,15 @@ public PagedModel<EntityModel<SequenceEntity>> getSequencesBySequenceNameAndAsse
for (Pageable pageable : pageRequests[1]) {
pages.add(scaffoldService.getScaffoldsByUcscNameAndAssemblyTaxid(name, taxid, pageable));
}
} else if (nameType.equals(ContigAliasController.NAME_ENA_TYPE)) {
long count = chromosomeService.countChromosomeEntitiesByEnaNameAndAssembly_Taxid(name, taxid);
List<Pageable>[] pageRequests = createScaffoldsPageRequest(count, request);
for (Pageable pageable : pageRequests[0]) {
pages.add(chromosomeService.getChromosomesByEnaNameAndAssemblyTaxid(name, taxid, pageable));
}
for (Pageable pageable : pageRequests[1]) {
pages.add(scaffoldService.getScaffoldsByEnaNameAndAssemblyTaxid(name, taxid, pageable));
}
} else {
long count = chromosomeService.countChromosomeEntitiesByNameAndAssembly_Taxid(name, taxid);
List<Pageable>[] pageRequests = createScaffoldsPageRequest(count, request);
Expand Down Expand Up @@ -207,6 +216,15 @@ public PagedModel<EntityModel<SequenceEntity>> getSequencesBySequenceNameAndAsse
for (Pageable pageable : pageRequests[1]) {
pages.add(scaffoldService.getScaffoldsByUcscNameAndAssembly(name, assemblyEntity, pageable));
}
} else if (nameType.equals(ContigAliasController.NAME_ENA_TYPE)) {
long count = chromosomeService.countChromosomeEntitiesByEnaNameAndAssembly(name, assemblyEntity);
List<Pageable>[] pageRequests = createScaffoldsPageRequest(count, request);
for (Pageable pageable : pageRequests[0]) {
pages.add(chromosomeService.getChromosomesByEnaNameAndAssembly(name, assemblyEntity, pageable));
}
for (Pageable pageable : pageRequests[1]) {
pages.add(scaffoldService.getScaffoldsByEnaNameAndAssembly(name, assemblyEntity, pageable));
}
} else {
long count = chromosomeService.countChromosomeEntitiesByNameAndAssembly(name, assemblyEntity);
List<Pageable>[] pageRequests = createScaffoldsPageRequest(count, request);
Expand All @@ -233,6 +251,15 @@ public PagedModel<EntityModel<SequenceEntity>> getSequencesByName(
for (Pageable pageable : pageRequests[1]) {
pages.add(scaffoldService.getScaffoldsByUcscName(name, pageable));
}
} else if (nameType.equals(ContigAliasController.NAME_ENA_TYPE)) {
long count = chromosomeService.countChromosomeEntitiesByEnaName(name);
List<Pageable>[] pageRequests = createScaffoldsPageRequest(count, request);
for (Pageable pageable : pageRequests[0]) {
pages.add(chromosomeService.getChromosomesByEnaName(name, pageable));
}
for (Pageable pageable : pageRequests[1]) {
pages.add(scaffoldService.getScaffoldsByEnaName(name, pageable));
}
} else {
long count = chromosomeService.countChromosomeEntitiesByName(name);
List<Pageable>[] pageRequests = createScaffoldsPageRequest(count, request);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright 2021 EMBL - European Bioinformatics Institute
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package uk.ac.ebi.eva.contigalias.datasource;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;

import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReader;
import uk.ac.ebi.eva.contigalias.dus.ENAAssemblyReportReaderFactory;
import uk.ac.ebi.eva.contigalias.dus.ENABrowser;
import uk.ac.ebi.eva.contigalias.dus.ENABrowserFactory;
import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
import uk.ac.ebi.eva.contigalias.entities.ScaffoldEntity;
import uk.ac.ebi.eva.contigalias.entities.SequenceEntity;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Stream;

/**
 * {@link AssemblyDataSource} that reads assembly reports through an {@link ENABrowser} and can
 * copy the ENA sequence names found there onto an existing {@link AssemblyEntity}.
 */
@Repository("ENADataSource")
public class ENAAssemblyDataSource implements AssemblyDataSource {

    private final ENABrowserFactory factory;

    private final ENAAssemblyReportReaderFactory readerFactory;

    @Autowired
    public ENAAssemblyDataSource(ENABrowserFactory factory,
                                 ENAAssemblyReportReaderFactory readerFactory) {
        this.factory = factory;
        this.readerFactory = readerFactory;
    }

    /**
     * Opens a browser connection, reads the assembly report for the given accession and parses it
     * into an {@link AssemblyEntity}. The browser is always disconnected once the report stream
     * has been opened, whether or not reading succeeds.
     *
     * @param accession accession passed to {@link ENABrowser#getAssemblyReportInputStream}
     * @return the parsed assembly, or an empty {@link Optional} if the reader produced no entity
     * @throws IOException if connecting, opening or reading the report fails
     */
    @Override
    public Optional<AssemblyEntity> getAssemblyByAccession(String accession) throws IOException {
        ENABrowser enaBrowser = factory.build();
        enaBrowser.connect();

        AssemblyEntity assemblyEntity;
        try (InputStream stream = enaBrowser.getAssemblyReportInputStream(accession)) {
            ENAAssemblyReportReader reader = readerFactory.build(stream);
            assemblyEntity = reader.getAssemblyEntity();
        } finally {
            enaBrowser.disconnect();
        }
        // ofNullable: a reader that yields no entity results in an empty Optional, not an NPE.
        return Optional.ofNullable(assemblyEntity);
    }

    /**
     * Adds ENA sequence names to chromosomes and scaffolds in an assembly. Will modify the
     * AssemblyEntity in-place. Nothing is fetched when the optional is empty or when every
     * sequence of the assembly already has an ENA sequence name.
     *
     * @param optional {@link AssemblyEntity} to add ENA sequence names to
     * @throws IOException Passes IOException thrown by {@link #getAssemblyByAccession(String)}
     */
    public void addENASequenceNamesToAssembly(Optional<AssemblyEntity> optional) throws IOException {
        if (!optional.isPresent()) {
            return;
        }
        AssemblyEntity targetAssembly = optional.get();
        if (hasAllEnaSequenceNames(targetAssembly)) {
            return;
        }

        Optional<AssemblyEntity> enaAssembly = getAssemblyByAccession(targetAssembly.getGenbank());
        if (enaAssembly.isPresent()) {
            AssemblyEntity sourceAssembly = enaAssembly.get();
            addENASequenceNames(sourceAssembly.getChromosomes(), targetAssembly.getChromosomes());
            addENASequenceNames(sourceAssembly.getScaffolds(), targetAssembly.getScaffolds());
        }
    }

    /**
     * Checks whether every chromosome and scaffold of the assembly has a non-null ENA sequence
     * name. A missing (null) or empty sequence list contributes no sequences, so an assembly
     * without any sequences is reported as complete.
     *
     * @param assembly assembly whose sequences are inspected
     * @return {@code true} if no sequence lacks an ENA sequence name
     */
    public boolean hasAllEnaSequenceNames(AssemblyEntity assembly) {
        List<ChromosomeEntity> chromosomes = assembly.getChromosomes();
        List<ScaffoldEntity> scaffolds = assembly.getScaffolds();
        return Stream.concat(streamOrEmpty(chromosomes), streamOrEmpty(scaffolds))
                     .allMatch(sequence -> sequence.getEnaSequenceName() != null);
    }

    /**
     * Copies the ENA sequence name of each source sequence onto the target sequence that has the
     * same GenBank accession. Source sequences without a matching target are ignored; target
     * sequences without a matching source are left unchanged.
     *
     * @param sourceSequences sequences carrying ENA sequence names; may be null
     * @param targetSequences sequences to be updated in place; may be null
     */
    private void addENASequenceNames(
            List<? extends SequenceEntity> sourceSequences, List<? extends SequenceEntity> targetSequences) {
        if (sourceSequences == null || targetSequences == null) {
            return;
        }
        // Index the targets by GenBank accession so each source lookup is a single map access.
        Map<String, SequenceEntity> genbankToTarget = new HashMap<>();
        for (SequenceEntity targetSeq : targetSequences) {
            genbankToTarget.put(targetSeq.getGenbank(), targetSeq);
        }
        for (SequenceEntity sourceSeq : sourceSequences) {
            SequenceEntity targetSeq = genbankToTarget.get(sourceSeq.getGenbank());
            if (targetSeq != null) {
                targetSeq.setEnaSequenceName(sourceSeq.getEnaSequenceName());
            }
        }
    }

    /** Returns a stream over the list, or an empty stream when the list itself is null. */
    private static <T> Stream<T> streamOrEmpty(List<T> list) {
        return list == null ? Stream.<T>empty() : list.stream();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;

import uk.ac.ebi.eva.contigalias.dus.AssemblyReportReader;
import uk.ac.ebi.eva.contigalias.dus.AssemblyReportReaderFactory;
import uk.ac.ebi.eva.contigalias.dus.NCBIAssemblyReportReader;
import uk.ac.ebi.eva.contigalias.dus.NCBIAssemblyReportReaderFactory;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
Expand All @@ -34,11 +34,11 @@ public class NCBIAssemblyDataSource implements AssemblyDataSource {

private final NCBIBrowserFactory factory;

private final AssemblyReportReaderFactory readerFactory;
private final NCBIAssemblyReportReaderFactory readerFactory;

@Autowired
public NCBIAssemblyDataSource(NCBIBrowserFactory factory,
AssemblyReportReaderFactory readerFactory) {
NCBIAssemblyReportReaderFactory readerFactory) {
this.factory = factory;
this.readerFactory = readerFactory;
}
Expand All @@ -54,7 +54,7 @@ public Optional<AssemblyEntity> getAssemblyByAccession(
}
AssemblyEntity assemblyEntity;
try (InputStream stream = ncbiBrowser.getAssemblyReportInputStream(directory.get())) {
AssemblyReportReader reader = readerFactory.build(stream);
NCBIAssemblyReportReader reader = readerFactory.build(stream);
assemblyEntity = reader.getAssemblyEntity();
} finally {
ncbiBrowser.disconnect();
Expand Down
Loading

0 comments on commit b76da16

Please sign in to comment.