Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3f7e1aa
wip
IvaTutis Nov 12, 2025
7cef9f0
wip-sequence-reader
IvaTutis Nov 17, 2025
595aa82
wip
IvaTutis Nov 19, 2025
7d2d099
ENA-6648-fasta-reader
IvaTutis Nov 26, 2025
d3a9298
wip
IvaTutis Nov 26, 2025
f08e3e1
very-much-wip-unbuildable
IvaTutis Nov 26, 2025
0574fb6
index-finished
IvaTutis Dec 1, 2025
a9f1c48
sequential-entry-reader-still-a-mess-unbuildable
IvaTutis Dec 1, 2025
f0fbfb0
wiup
IvaTutis Dec 3, 2025
cac54d7
wip
IvaTutis Dec 3, 2025
090e1be
wip
IvaTutis Dec 3, 2025
7347a87
wip
IvaTutis Dec 4, 2025
e8d71d5
basic-test-success
IvaTutis Dec 5, 2025
831df4f
stricter-header-parsing-done
IvaTutis Dec 5, 2025
543fee9
capitalisation-fix
IvaTutis Dec 5, 2025
cdf967a
fixed-streaming-chars
IvaTutis Dec 9, 2025
be67810
improved-tests
IvaTutis Dec 10, 2025
381d975
draft-finished
IvaTutis Dec 10, 2025
2b55262
Merge branch 'main' of https://github.com/enasequence/gff3tools into …
IvaTutis Dec 10, 2025
8fb18b0
tests-corrected
IvaTutis Dec 10, 2025
e60efc4
comments
IvaTutis Dec 10, 2025
1f22ff9
comments
IvaTutis Dec 10, 2025
84fb369
comment-and-gap-cleanup
IvaTutis Dec 10, 2025
80afdd6
optimized-reading-a-bit-more
IvaTutis Dec 10, 2025
e13ff99
spacing-fixes-for-spotless
IvaTutis Dec 10, 2025
b483272
switched-from-optional-to-string-in-fields
IvaTutis Dec 11, 2025
8662531
comment
IvaTutis Dec 11, 2025
c206db8
renamed-function
IvaTutis Dec 11, 2025
a71080d
renamed-function
IvaTutis Dec 11, 2025
2adc6e3
renamed-function
IvaTutis Dec 11, 2025
271681a
removed-useless-bit
IvaTutis Dec 11, 2025
9e81919
added-carriage-return-ignoring
IvaTutis Dec 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
* Copyright 2025 EMBL - European Bioinformatics Institute
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package uk.ac.ebi.embl.gff3tools.exception;

public class FastaFileException extends Exception {

public FastaFileException() {}

public FastaFileException(String message) {
super(message);
}

public FastaFileException(Throwable cause) {
super(cause);
}

public FastaFileException(String message, Throwable cause) {
super(message, cause);
}
}
27 changes: 27 additions & 0 deletions src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright 2025 EMBL - European Bioinformatics Institute
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package uk.ac.ebi.embl.gff3tools.fasta;

import lombok.Getter;
import lombok.Setter;
import uk.ac.ebi.embl.gff3tools.fasta.headerutils.FastaHeader;

/**
 * Public, caller-facing summary of one FASTA record.
 *
 * <p>Instances are created and filled by {@code FastaFileService.openNewFile}, which copies the
 * identifier, header and base counts from the entries returned by the file reader. Lombok
 * generates a getter and setter for every field; the fields are also public, so both access
 * styles currently work.
 */
@Getter
@Setter
public class FastaEntry {
// Identifier under which the entry is looked up in FastaFileService.
public String submissionId;
// Not populated when the file is loaded; assigned later through FastaFileService.setAccessionId.
public String accessionId;
public FastaHeader header; // json info
// Value of SequenceIndex.totalBases() for this entry.
public long totalBases;
// totalBases minus leadingNsCount minus trailingNsCount (computed in FastaFileService.openNewFile).
public long totalBasesWithoutNBases;
// Copied from SequenceIndex.startNBasesCount.
public long leadingNsCount;
// Copied from SequenceIndex.endNBasesCount.
public long trailingNsCount;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright 2025 EMBL - European Bioinformatics Institute
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package uk.ac.ebi.embl.gff3tools.fasta;

import lombok.Getter;
import lombok.Setter;
import uk.ac.ebi.embl.gff3tools.fasta.headerutils.FastaHeader;
import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex;

/**
 * Package-private view of one FASTA record that also carries what is needed to locate its
 * sequence inside the file.
 *
 * <p>NOTE(review): this looks like the element type returned by
 * {@code SequentialFastaFileReader.readAll()}, since {@code FastaFileService.openNewFile} reads
 * {@code getSubmissionId()}, {@code getHeader()} and {@code sequenceIndex} from those elements;
 * confirm against the reader. Fields are package-private and are accessed directly as well as
 * through the Lombok-generated accessors.
 */
@Getter
@Setter
class FastaEntryInternal {
// Identifier of the record.
String submissionId;
// Accession for the record; nothing in the service copies it into FastaEntry on load.
String accessionId;
// Parsed header of the record.
FastaHeader header;
// information needed for accessing the file
long fastaStartByte; // position of '>' in the file
SequenceIndex sequenceIndex; // a smart index for querying ranges in the file
}
154 changes: 154 additions & 0 deletions src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
/*
* Copyright 2025 EMBL - European Bioinformatics Institute
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package uk.ac.ebi.embl.gff3tools.fasta;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import lombok.Getter;
import lombok.Setter;
import uk.ac.ebi.embl.gff3tools.exception.FastaFileException;
import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan;
import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex;

/**
 * Owns a {@link SequentialFastaFileReader}, keeps all entries and their sequence indexes in
 * memory, supports assigning accession IDs, and serves base-range slices by mapping a base range
 * to a byte span via the cached {@link SequenceIndex} and then asking the reader for those bytes.
 *
 * <p>The service implements {@link AutoCloseable}, so it can be used in a try-with-resources
 * statement. No synchronization is performed by this class.
 */
@Getter
@Setter
public final class FastaFileService implements AutoCloseable {

    /** Caller-facing summaries of every entry in the currently opened file, in read order. */
    public List<FastaEntry> fastaEntries = new ArrayList<>();

    /** Sequence index per submission ID; a later entry with the same ID replaces an earlier one. */
    private HashMap<String, SequenceIndex> sequenceIndexes = new HashMap<>();

    private File file;
    private SequentialFastaFileReader reader;

    public FastaFileService() {
        this.file = null;
    }

    // ---------------------------- queries ----------------------------

    /**
     * Assigns an accession ID to the first entry whose submission ID matches.
     *
     * @param submissionId submission ID to look for
     * @param accessionId accession ID to store on the matching entry
     * @return the updated entry, or an empty Optional when no entry matches
     * @throws FastaFileException declared for interface stability; not thrown by this implementation
     */
    public Optional<FastaEntry> setAccessionId(String submissionId, String accessionId) throws FastaFileException {
        Optional<FastaEntry> target = getFastaWithSubmissionId(submissionId);
        target.ifPresent(entry -> entry.setAccessionId(accessionId));
        return target;
    }

    /**
     * Finds the first entry with the given submission ID.
     *
     * @param submissionId submission ID to look for
     * @return the matching entry, or an empty Optional when there is none
     * @throws FastaFileException declared for interface stability; not thrown by this implementation
     */
    public Optional<FastaEntry> getFastaWithSubmissionId(String submissionId) throws FastaFileException {
        // Objects.equals keeps the lookup from failing with an NPE on an entry that has no ID yet.
        return fastaEntries.stream()
                .filter(entry -> Objects.equals(entry.getSubmissionId(), submissionId))
                .findFirst();
    }

    /**
     * Returns a sequence slice as a String (no EOLs) for [fromBase..toBase] inclusive.
     *
     * @param option whether the base coordinates are resolved including the edge N bases or not
     * @param submissionId ID of the entry to read from
     * @param fromBase first base of the slice
     * @param toBase last base of the slice
     * @return the bases in the requested range
     * @throws FastaFileException if no index exists for the ID or the underlying read fails
     * @throws IllegalStateException if no file is currently open
     */
    public String getSequenceSliceString(SequenceRangeOption option, String submissionId, long fromBase, long toBase)
            throws FastaFileException {
        final ByteSpan span = resolveByteSpan(option, submissionId, fromBase, toBase);
        try {
            return reader.getSequenceSliceString(span);
        } catch (IOException ioe) {
            throw new FastaFileException(
                    "I/O while reading slice for " + submissionId + " bytes " + span.start + ".." + (span.endEx - 1),
                    ioe);
        }
    }

    /**
     * Returns a sequence slice as a Reader for [fromBase..toBase] (1-based, inclusive) for the
     * given ID. Uses the cached index to translate bases to bytes, then asks the file reader to
     * stream that byte span; the file reader is expected to drop '\n' and '\r' on the fly.
     *
     * @param option whether the base coordinates are resolved including the edge N bases or not
     * @param submissionId ID of the entry to read from
     * @param fromBase first base of the slice
     * @param toBase last base of the slice
     * @return a Reader over the requested bases
     * @throws FastaFileException if no index exists for the ID
     * @throws IllegalStateException if no file is currently open
     */
    public Reader getSequenceSliceReader(SequenceRangeOption option, String submissionId, long fromBase, long toBase)
            throws FastaFileException {
        return reader.getSequenceSliceReader(resolveByteSpan(option, submissionId, fromBase, toBase));
    }

    // ---------------------------- interactions with the reader ----------------------------

    /**
     * Closes any previously opened file, then opens and fully indexes {@code fastaFile}.
     *
     * <p>The argument is validated before anything is closed, so a null argument leaves the
     * current state untouched. If reading the new file fails, the reader is closed again and the
     * caches are emptied so the service is never left half-loaded.
     *
     * @param fastaFile FASTA file to open; must not be null
     * @throws FastaFileException if the reader rejects the file content
     * @throws IOException if the file cannot be opened, read or closed
     * @throws NullPointerException if {@code fastaFile} is null
     */
    public void openNewFile(File fastaFile) throws FastaFileException, IOException {
        Objects.requireNonNull(fastaFile, "file");
        close(); // if already open, close first
        this.file = fastaFile;
        this.fastaEntries.clear();
        this.sequenceIndexes.clear();
        reader = new SequentialFastaFileReader(fastaFile);
        try {
            for (var entry : reader.readAll()) {
                fastaEntries.add(toFastaEntry(entry));
                sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex);
            }
        } catch (Exception failure) {
            // Cleanup-and-rethrow only: the original exception type is preserved by precise rethrow.
            discardPartiallyLoadedState(failure);
            throw failure;
        }
    }

    /**
     * Closes the reader. Safe to call multiple times: the reader reference is dropped before the
     * underlying close is attempted, so a failed close is not retried by a later call.
     *
     * @throws IOException if the underlying reader fails to close
     */
    @Override
    public void close() throws IOException {
        SequentialFastaFileReader current = reader;
        reader = null;
        if (current != null) {
            current.close();
        }
    }

    // ---------------------------- helpers ----------------------------

    /** Translates a base range for one entry into the byte span to read from the file. */
    private ByteSpan resolveByteSpan(SequenceRangeOption option, String submissionId, long fromBase, long toBase)
            throws FastaFileException {
        ensureFileReaderOpen();
        SequenceIndex index = sequenceIndexes.get(submissionId);
        if (index == null) {
            throw new FastaFileException("No sequence index found for submissionId " + submissionId);
        }
        Objects.requireNonNull(option, "option");
        switch (option) {
            case WHOLE_SEQUENCE:
                return index.byteSpanForBaseRangeIncludingEdgeNBases(fromBase, toBase);
            case WITHOUT_N_BASES:
                return index.byteSpanForBaseRange(fromBase, toBase);
            default:
                throw new IllegalStateException("Unknown option " + option);
        }
    }

    /** Builds the caller-facing summary for one entry produced by the file reader. */
    private static FastaEntry toFastaEntry(FastaEntryInternal entry) {
        SequenceIndex index = entry.sequenceIndex;
        FastaEntry fastaEntry = new FastaEntry();
        fastaEntry.setSubmissionId(entry.getSubmissionId());
        fastaEntry.setHeader(entry.getHeader());
        fastaEntry.setTotalBases(index.totalBases());
        fastaEntry.setLeadingNsCount(index.startNBasesCount);
        fastaEntry.setTrailingNsCount(index.endNBasesCount);
        fastaEntry.setTotalBasesWithoutNBases(index.totalBases() - index.startNBasesCount - index.endNBasesCount);
        // accessionId is not populated here; it is assigned later through setAccessionId.
        return fastaEntry;
    }

    /** Releases the reader and empties the caches after a failed load, keeping the first failure primary. */
    private void discardPartiallyLoadedState(Exception failure) {
        try {
            close();
        } catch (IOException closeFailure) {
            failure.addSuppressed(closeFailure);
        }
        fastaEntries.clear();
        sequenceIndexes.clear();
    }

    /** Fails fast when a query is made before a file has been opened. */
    private void ensureFileReaderOpen() {
        if (reader == null || !reader.readingFile()) {
            throw new IllegalStateException("Service is not open. Call openNewFile() first.");
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* Copyright 2025 EMBL - European Bioinformatics Institute
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
* file except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package uk.ac.ebi.embl.gff3tools.fasta;

/**
 * Selects how a requested base range is translated into a byte span when slicing a sequence.
 *
 * <p>NOTE(review): the exact coordinate semantics live in {@code SequenceIndex}; the notes on each
 * constant only state which index method {@code FastaFileService} calls for it.
 */
public enum SequenceRangeOption {

    /** Resolved through {@code SequenceIndex.byteSpanForBaseRangeIncludingEdgeNBases}. */
    WHOLE_SEQUENCE,

    /** Resolved through {@code SequenceIndex.byteSpanForBaseRange}. */
    WITHOUT_N_BASES;
}
Loading