/**
 * Checked exception signalling a problem while reading or interpreting a FASTA file.
 *
 * <p>Mirrors the four standard {@link Exception} constructors so callers can attach a message,
 * a cause, both, or neither.
 */
public class FastaFileException extends Exception {

    /** Explicit serial version so the serialized form does not depend on compiler-generated ids. */
    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither a detail message nor a cause. */
    public FastaFileException() {
        super();
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message human-readable description of the failure
     */
    public FastaFileException(String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps an underlying failure.
     *
     * @param cause the failure that triggered this exception
     */
    public FastaFileException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with a detail message and the underlying failure.
     *
     * @param message human-readable description of the failure
     * @param cause the failure that triggered this exception
     */
    public FastaFileException(String message, Throwable cause) {
        super(message, cause);
    }
}
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import lombok.Getter; +import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.FastaHeader; + +@Getter +@Setter +public class FastaEntry { + public String submissionId; + public String accessionId; + public FastaHeader header; // json info + public long totalBases; + public long totalBasesWithoutNBases; + public long leadingNsCount; + public long trailingNsCount; +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java new file mode 100644 index 00000000..82e68264 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java @@ -0,0 +1,27 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import lombok.Getter; +import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.FastaHeader; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; + +@Getter +@Setter +class FastaEntryInternal { + String submissionId; + String accessionId; + FastaHeader header; + // information needed for accessing the file + long fastaStartByte; // position of '>' in the file + SequenceIndex sequenceIndex; // a smart index for querying ranges in the file +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java new file mode 100644 index 00000000..94d83e4f --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -0,0 +1,154 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.*; +import lombok.Getter; +import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; + +/** + * Owns a SequentialFastaEntryReader, keeps all entries + indexes in memory, supports ID renames, + * and serves base-range slices by mapping (N..M bases) -> byte span via the cached SequenceIndex, + * then asking the reader to stream bytes while skipping newlines. + */ +@Getter +@Setter +public final class FastaFileService { + + public List fastaEntries = new ArrayList<>(); + + private HashMap sequenceIndexes = new HashMap<>(); + private File file; + private SequentialFastaFileReader reader; + + public FastaFileService() { + this.file = null; + } + + // ---------------------------- queries ---------------------------- + + public Optional setAccessionId(String submissionId, String accessionId) throws FastaFileException { + Optional target = fastaEntries.stream() + .filter(entry -> entry.getSubmissionId().equals(submissionId)) + .findFirst(); + target.ifPresent(entry -> entry.setAccessionId(accessionId)); + return target; + } + + public Optional getFastaWithSubmissionId(String submissionId) throws FastaFileException { + return fastaEntries.stream() + .filter(entry -> entry.getSubmissionId().equals(submissionId)) + .findFirst(); + } + + /** Return a sequence slice as a String (no EOLs) for [fromBase..toBase] inclusive. 
*/ + public String getSequenceSliceString(SequenceRangeOption option, String submissionId, long fromBase, long toBase) + throws FastaFileException { + ensureFileReaderOpen(); + SequenceIndex index = sequenceIndexes.get(submissionId); + if (index == null) { + throw new FastaFileException("No sequence index found for submissionId " + submissionId); + } + + final ByteSpan span; + switch (option) { + case WHOLE_SEQUENCE: + span = index.byteSpanForBaseRangeIncludingEdgeNBases(fromBase, toBase); + break; + case WITHOUT_N_BASES: + span = index.byteSpanForBaseRange(fromBase, toBase); + break; + default: + throw new IllegalStateException("Unknown option " + option); + } + + try { + return reader.getSequenceSliceString(span); + } catch (IOException ioe) { + throw new FastaFileException( + "I/O while reading slice for " + submissionId + " bytes " + span.start + ".." + (span.endEx - 1), + ioe); + } + } + + /** + * Return a sequence slice for reader [fromBase..toBase] (1-based, inclusive) for the given ID. + * Uses the cached index to translate bases -> bytes, then asks the reader to stream + * ASCII bytes while skipping '\n' and '\r' on the fly. 
+ */ + public Reader getSequenceSliceReader(SequenceRangeOption option, String submissionId, long fromBase, long toBase) + throws FastaFileException { + ensureFileReaderOpen(); + var index = sequenceIndexes.get(submissionId); + if (index == null) { + throw new FastaFileException("No sequence index found for submissionId " + submissionId); + } + + ByteSpan span; + switch (option) { + case WHOLE_SEQUENCE: + span = index.byteSpanForBaseRangeIncludingEdgeNBases(fromBase, toBase); + break; + case WITHOUT_N_BASES: + span = index.byteSpanForBaseRange(fromBase, toBase); + break; + default: + throw new IllegalStateException("Unknown option " + option); + } + + return reader.getSequenceSliceReader(span); + } + + // ---------------------------- interactions with the reader ---------------------------- + + public void openNewFile(File fastaFile) throws FastaFileException, IOException { + close(); // if already open, close first + this.file = Objects.requireNonNull(fastaFile, "file"); + this.fastaEntries.clear(); + this.sequenceIndexes.clear(); + reader = new SequentialFastaFileReader(fastaFile); + var readEntries = reader.readAll(); + for (var entry : readEntries) { + FastaEntry fastaEntry = new FastaEntry(); + fastaEntry.setSubmissionId(entry.getSubmissionId()); + fastaEntry.setHeader(entry.getHeader()); + fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); + fastaEntry.setLeadingNsCount(entry.sequenceIndex.startNBasesCount); + fastaEntry.setTrailingNsCount(entry.sequenceIndex.endNBasesCount); + long adjustedBases = entry.sequenceIndex.totalBases() + - entry.sequenceIndex.startNBasesCount + - entry.sequenceIndex.endNBasesCount; + fastaEntry.setTotalBasesWithoutNBases(adjustedBases); + fastaEntries.add(fastaEntry); + + sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex); + } + } + + /** Close the reader. Safe to call multiple times. 
*/ + public void close() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + private void ensureFileReaderOpen() { + if (reader == null || !reader.readingFile()) + throw new IllegalStateException("Service is not open. Call open() first."); + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java new file mode 100644 index 00000000..cba4a165 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java @@ -0,0 +1,16 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta; + +public enum SequenceRangeOption { + WHOLE_SEQUENCE, + WITHOUT_N_BASES +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java new file mode 100644 index 00000000..421a4416 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -0,0 +1,311 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.*; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.JsonHeaderParser; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.ParsedHeader; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndexBuilder; + +public class SequentialFastaFileReader implements AutoCloseable { + + private static final int BUFFER_SIZE = 4 * 1024 * 1024; // 4 MB + private static final int CHAR_BUF_SIZE = 512 * 1024; // 512 KB + private static final byte GT = (byte) '>'; + private static final byte LF = (byte) '\n'; + private static final byte CR = (byte) '\r'; + + private final FileChannel channel; + private final long fileSize; + private final JsonHeaderParser headerParser; + private final SequenceAlphabet alphabet; + + public SequentialFastaFileReader(File file) throws IOException { + this(file, new JsonHeaderParser(), SequenceAlphabet.defaultNucleotideAlphabet()); + } + + public SequentialFastaFileReader(File file, JsonHeaderParser parser, SequenceAlphabet alphabet) throws IOException { + Objects.requireNonNull(file, "Input FASTA file is null"); + if (!file.exists()) throw new FileNotFoundException(file.getAbsolutePath()); + if (file.isDirectory()) throw new FileNotFoundException("Directory: " + file.getAbsolutePath()); + if 
(!file.canRead()) throw new IllegalArgumentException("No read permission: " + file.getAbsolutePath()); + this.headerParser = Objects.requireNonNull(parser, "parser"); + this.alphabet = Objects.requireNonNull(alphabet, "alphabet"); + this.channel = new FileInputStream(file).getChannel(); + this.fileSize = channel.size(); + } + + @Override + public void close() throws IOException { + channel.close(); + } + + public boolean readingFile() { + return channel.isOpen(); + } + + public String getSequenceSliceString(ByteSpan span) throws IOException { + long byteStart = span.start; + long byteEndExclusive = span.endEx; + + if (byteStart < 0 || byteEndExclusive < byteStart || byteEndExclusive > fileSize) { + throw new IllegalArgumentException("Bad byte window: " + byteStart + ".." + byteEndExclusive); + } + long remain = byteEndExclusive - byteStart; + long off = byteStart; + + // pre-size builder with a sane cap (skip newlines, so content <= remain) + int expect = (int) Math.min(remain, 1_000_000L); + StringBuilder sb = new StringBuilder(expect); + + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + while (remain > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remain); + buf.limit(want); + int n = channel.read(buf, off); + if (n <= 0) break; + buf.flip(); + while (buf.hasRemaining()) { + byte b = buf.get(); + if (b == LF || b == CR) continue; + sb.append((char) (b & 0xFF)); + } + remain -= n; + off += n; + } + return sb.toString(); + } + + /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. + * Uses absolute reads; does NOT change channel.position(). 
*/ + public java.io.Reader getSequenceSliceReader(ByteSpan span) { + final long start = span.start; + final long endEx = span.endEx; + + return new java.io.Reader() { + private long pos = start; + + private final java.nio.ByteBuffer buf = java.nio.ByteBuffer.allocateDirect(CHAR_BUF_SIZE); + + { + // allocate buffer and mark it EMPTY so the very first read() refills it from the channel. + // Without this, hasRemaining() is true and we'll read uninitialized bytes (→ '\0'). + buf.limit(0); + } + + @Override + public int read( + char[] characterBuffer, int startingWriteIndexInCharacterBuffer, int maximumNumberOfCharsToRead) + throws java.io.IOException { + // --- Validate caller’s target window [off .. off + len) --- + ValidateTargetWindow(characterBuffer, startingWriteIndexInCharacterBuffer, maximumNumberOfCharsToRead); + if (maximumNumberOfCharsToRead == 0) return 0; + + int out = 0; + while (out < maximumNumberOfCharsToRead) { + // --- Prep the buffer for next read & fill it out --- + if (!buf.hasRemaining()) { + if (pos >= endEx) break; // if end of slice reached, stop reading + + buf.clear(); + int toRead = (int) Math.min(buf.capacity(), endEx - pos); + buf.limit(toRead); + + int n = channel.read(buf, pos); + if (n <= 0) break; // if no bytes were read, break + pos += n; + buf.flip(); + } + // Drain bytes + ASCII decode -> writees chars into caller’s window [off .. off+len) + while (buf.hasRemaining() && out < maximumNumberOfCharsToRead) { + byte b = buf.get(); + if (b == LF || b == CR) continue; // skip irrelevant bytes + characterBuffer[startingWriteIndexInCharacterBuffer + out] = (char) (b & 0xFF); + out++; + } + } + // If we produced nothing AND we’re at EOF, signal -1 + return (out == 0) ? 
-1 : out; + } + + private void ValidateTargetWindow( + char[] characterBuffer, int startingWriteIndexInCharacterBuffer, int maximumNumberOfCharsToRead) + throws java.io.IOException { + if (characterBuffer == null) throw new NullPointerException("characterBuffer"); + if (startingWriteIndexInCharacterBuffer < 0 + || maximumNumberOfCharsToRead < 0 + || startingWriteIndexInCharacterBuffer + maximumNumberOfCharsToRead > characterBuffer.length) { + throw new IndexOutOfBoundsException("off=" + startingWriteIndexInCharacterBuffer + " len=" + + maximumNumberOfCharsToRead + " bufLen=" + + characterBuffer.length); + } + } + + @Override + public int read() throws java.io.IOException { + char[] one = new char[1]; + int n = read(one, 0, 1); + return (n == -1) ? -1 : one[0]; + } + + @Override + public boolean ready() { + return buf.hasRemaining() || pos < endEx; + } + + @Override + public void close() { + /* no-op, channel is kept alive */ + } + }; + } + + public List readAll() throws FastaFileException, IOException { + long position = 0; + List entries = new ArrayList<>(); + while (true) { + var entry = readNext(position); + if (entry.isEmpty()) break; + entries.add(entry.get()); + position = entry.get().getSequenceIndex().lastBaseByte; // read from the end of last sequence + } + return entries; + } + + /** Reads the next FASTA entry starting at or after 'from'. 
*/ + private Optional readNext(long from) throws FastaFileException { + try { + OptionalLong headerPosOpt = seekToNextHeader(from); + if (headerPosOpt.isEmpty()) return Optional.empty(); + + long headerPos = headerPosOpt.getAsLong(); + String headerLine = readHeaderLine(headerPos); + if (headerLine == null) throw new FastaFileException("Header is malformed at byte " + headerPos); + ParsedHeader ph = headerParser.parse(headerLine); + + long sequenceStartPos = channel.position(); // first byte after header line is the sequence position + SequenceIndexBuilder sib = new SequenceIndexBuilder(channel, fileSize, alphabet); + SequenceIndexBuilder.Result res = sib.buildFrom(sequenceStartPos); + + // Move reader cursor to the sequence start position + channel.position(sequenceStartPos); + + FastaEntryInternal e = new FastaEntryInternal(); + e.setSubmissionId(ph.getId()); + e.setHeader(ph.getHeader()); + e.setFastaStartByte(headerPos); + e.setSequenceIndex(res.index); + + return Optional.of(e); + } catch (IOException io) { + long pos = safePos(); + throw new FastaFileException("I/O while reading FASTA at byte " + pos + ": " + io.getMessage(), io); + } + } + + // ------------------ header seeking & line reading ------------------ + + private OptionalLong seekToNextHeader(long from) throws IOException { + if (from >= fileSize) return OptionalLong.empty(); + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + + while (from < fileSize) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - from); + buf.limit(want); + int n = channel.read(buf, from); + if (n <= 0) break; + buf.flip(); + while (buf.hasRemaining()) { + long abs = from + buf.position(); + if (buf.get() == GT && isLineStart(abs)) { + return OptionalLong.of(abs); + } + } + from += n; + } + return OptionalLong.empty(); + } + + private boolean isLineStart(long abs) throws IOException { + if (abs == 0) return true; + if (abs > fileSize) return false; + return peek(abs - 1) == LF; + } + + private 
byte peek(long abs) throws IOException { + if (abs < 0 || abs >= fileSize) return 0; + ByteBuffer one = ByteBuffer.allocate(1); + int n = channel.read(one, abs); + return (n == 1) ? one.get(0) : 0; + } + + /** Reads one ASCII line from input position, assuming the position handed to it contains '>', advances past LF or to EOF. */ + private String readHeaderLine(long from) throws IOException { + channel.position(from); + + long scanPos = channel.position(); + if (scanPos >= fileSize) return null; + + StringBuilder sb = new StringBuilder(256); + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + + while (scanPos < fileSize) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - scanPos); + buf.limit(want); + int n = channel.read(buf, scanPos); + if (n <= 0) break; + buf.flip(); + + int lfIndex = indexOf(buf, LF); + if (lfIndex >= 0) { + appendAscii(sb, buf, lfIndex); + long nextLineStart = scanPos + lfIndex + 1; // consume LF + channel.position(nextLineStart); + return sb.toString(); + } else { + appendAscii(sb, buf, buf.remaining()); + scanPos += n; + } + } + channel.position(fileSize); + return sb.toString(); + } + + private static int indexOf(ByteBuffer buf, byte target) { + for (int i = 0; i < buf.remaining(); i++) { + if (buf.get(buf.position() + i) == target) return i; + } + return -1; + } + + private static void appendAscii(StringBuilder sb, ByteBuffer buf, int len) { + byte[] chunk = new byte[len]; + buf.get(chunk); + sb.append(new String(chunk, java.nio.charset.StandardCharsets.US_ASCII)); + } + + private long safePos() { + try { + return channel.position(); + } catch (IOException e) { + return -1; + } + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java new file mode 100644 index 00000000..148f4ca0 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java @@ -0,0 +1,16 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics 
/** Topology of a sequence as declared in the FASTA header. */
public enum Topology {
    /** The sequence is linear. */
    LINEAR,

    /** The sequence is circular. */
    CIRCULAR
}
+ */ +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; + +import lombok.Getter; +import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; + +@Getter +@Setter +public class FastaHeader { + String description; // mandatory + String moleculeType; // mandatory + Topology topology; // mandatory + String chromosomeType; // optional (doesnt have to be in the json at all) + String chromosomeLocation; // optional + String chromosomeName; // optional +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java new file mode 100644 index 00000000..5f61a309 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -0,0 +1,109 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import java.util.*; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; + +public class JsonHeaderParser { + private static final ObjectMapper MAPPER = new ObjectMapper(); + + public ParsedHeader parse(String headerLine) throws FastaFileException { + String rest = headerLine.substring(1); // headerLine starts with '>' + int pipe = rest.indexOf('|'); + String idPart = (pipe >= 0 ? rest.substring(0, pipe) : rest).trim(); + String id = idPart.isEmpty() ? "" : idPart.split("\\s+")[0]; + + FastaHeader h = new FastaHeader(); + + if (pipe >= 0) { + fillFromJson(rest.substring(pipe + 1).trim(), h); // may throw IOException + } + return new ParsedHeader(id, h); + } + + private static void fillFromJson(String raw, FastaHeader h) throws FastaFileException { + if (raw == null || raw.isEmpty()) { + throw new FastaFileException("FASTA header contains a '|', but no JSON object was provided. " + + "Expected something like: >id { \"description\": \"...\", \"moleculeType\": \"DNA\", ... }"); + } + + // Normalize curly quotes / NBSPs + + String normalized = raw.replace('\u201C', '"') + .replace('\u201D', '"') + .replace('\u2018', '\'') + .replace('\u2019', '\'') + .replace('\u00A0', ' ') + .trim(); + + JsonNode node; + try { + node = MAPPER.readTree(normalized); + if (node == null || !node.isObject()) { + throw new FastaFileException( + "FASTA header JSON did not parse into an object. " + "Received: " + normalized); + } + } catch (IOException e) { + throw new FastaFileException("Malformed FASTA header JSON. Failed to parse: " + normalized, e); + } + + // Extract fields + Map m = new HashMap<>(); + node.fields().forEachRemaining(e -> { + String key = (e.getKey() == null ? 
"" : e.getKey()) + .trim() + .toLowerCase(Locale.ROOT) + .replaceAll("[\\s_-]+", ""); + String val = e.getValue().isNull() ? null : e.getValue().asText(); + m.put(key, val); + }); + + // Assign values + h.setDescription(m.get("description")); + h.setMoleculeType(m.get("moleculetype")); + h.setTopology(parseTopology(m.get("topology"))); + h.setChromosomeType(m.get("chromosometype")); + h.setChromosomeLocation(m.get("chromosomelocation")); + h.setChromosomeName(emptyToNull(m.get("chromosomename"))); + + // Validate required fields + List missing = new ArrayList<>(); + if (h.description == null) missing.add("description"); + if (h.moleculeType == null) missing.add("moleculeType"); + if (h.topology == null) missing.add("topology (must be 'LINEAR' or 'CIRCULAR')"); + + if (!missing.isEmpty()) { + throw new FastaFileException( + "FASTA header JSON is missing required fields: " + missing + ". Parsed JSON was: " + normalized); + } + } + + private static String emptyToNull(String s) { + return (s == null || s.isEmpty()) ? null : s; + } + + private static Topology parseTopology(String s) { + if (s == null) return null; + switch (s.trim().toUpperCase(Locale.ROOT)) { + case "LINEAR": + return Topology.LINEAR; + case "CIRCULAR": + return Topology.CIRCULAR; + default: + return null; + } + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java new file mode 100644 index 00000000..b5b33a8a --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java @@ -0,0 +1,19 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; + +import lombok.Value; + +@Value +public class ParsedHeader { + String id; + FastaHeader header; +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java new file mode 100644 index 00000000..6d159370 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java @@ -0,0 +1,25 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
/** Half-open byte range {@code [start, endEx)}. */
public final class ByteSpan {

    /** First byte of the range (inclusive). */
    public final long start;

    /** Byte just past the end of the range (exclusive). */
    public final long endEx;

    /**
     * Creates a span over {@code [start, endEx)}. The bounds are stored as given.
     *
     * @param start first byte, inclusive
     * @param endEx end byte, exclusive
     */
    public ByteSpan(long start, long endEx) {
        this.endEx = endEx;
        this.start = start;
    }

    /** Returns the number of bytes in the span, i.e. {@code endEx - start}. */
    public long length() {
        final long byteCount = endEx - start;
        return byteCount;
    }
}
/** One sequence line of a FASTA record, described in both base and byte coordinates. */
public final class LineEntry {

    /** First base on this line (1-based, inclusive). */
    public long baseStart;

    /** Last base on this line (1-based, inclusive). */
    public long baseEnd;

    /** Absolute byte offset of the first base in this line. */
    public long byteStart;

    /** Absolute byte offset one past the last base. */
    public long byteEndExclusive;

    /**
     * Creates a line entry; all values are stored as given.
     *
     * @param baseStart first base, 1-based inclusive
     * @param baseEnd last base, 1-based inclusive
     * @param byteStart absolute byte offset of the first base
     * @param byteEndExclusive absolute byte offset one past the last base
     */
    public LineEntry(long baseStart, long baseEnd, long byteStart, long byteEndExclusive) {
        this.byteStart = byteStart;
        this.byteEndExclusive = byteEndExclusive;
        this.baseStart = baseStart;
        this.baseEnd = baseEnd;
    }

    /** Returns the number of bases on the line (both ends inclusive). */
    public long lengthBases() {
        return 1 + (baseEnd - baseStart);
    }

    /** Returns the number of bytes on the line; for ASCII this is the same as the base count. */
    public long lengthBytes() {
        final long byteCount = byteEndExclusive - byteStart;
        return byteCount;
    }
}
+ */ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +public final class SequenceAlphabet { + private final boolean[] allowed = new boolean[128]; + + public SequenceAlphabet(String chars) { + for (char c : chars.toCharArray()) if (c < 128) allowed[c] = true; + allowed['>'] = false; + } + + /** Fast ASCII check for is it an allowed char. */ + public boolean isAllowed(byte b) { + int i = b & 0xFF; + return i < 128 && allowed[i]; + } + + /** Fast ASCII check for 'N' or 'n' without decoding. */ + public boolean isNBase(byte b) { + return ((b | 0x20) == 'n'); + } + + public static SequenceAlphabet defaultNucleotideAlphabet() { + return new SequenceAlphabet("ACGTURYSWKMBDHVNacgturyswkmbdhvn-.*"); + } + + public String describeAllowed() { + StringBuilder sb = new StringBuilder(); + sb.append("["); + + boolean first = true; + for (int i = 0; i < allowed.length; i++) { + if (allowed[i]) { + char c = (char) i; + + // Render unprintables safely + String display; + if (c >= 32 && c < 127) { + display = Character.toString(c); + } else { + display = String.format("\\x%02X", i); // e.g. non-printable → \x1B + } + + if (!first) sb.append(", "); + sb.append(display); + first = false; + } + } + + sb.append("]"); + return sb.toString(); + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java new file mode 100644 index 00000000..2bdc9577 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java @@ -0,0 +1,90 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public final class SequenceIndex { + + public long firstBaseByte; // -1 if empty + public long startNBasesCount; + public long lastBaseByte; // -1 if empty + public long endNBasesCount; + private final List lines; + + public SequenceIndex( + long firstBaseByte, long startNBasesCount, long lastBaseByte, long endNBasesCount, List lines) { + this.firstBaseByte = firstBaseByte; + this.startNBasesCount = startNBasesCount; + this.lastBaseByte = lastBaseByte; + this.endNBasesCount = endNBasesCount; + this.lines = new ArrayList<>(lines); + } + + public List linesView() { + return Collections.unmodifiableList(lines); + } + + public long totalBases() { + if (lines.isEmpty()) return 0; + return lines.get(lines.size() - 1).baseEnd; + } + + public long totalBasesExcludingEdgeNBases() { + long bases = totalBases() - endNBasesCount - startNBasesCount; + return Math.max(0, bases); + } + + public ByteSpan byteSpanForBaseRangeIncludingEdgeNBases(long fromBase, long toBase) { + long total = totalBases(); + if (fromBase < 1 || toBase < fromBase || toBase > total) { + throw new IllegalArgumentException("bad base range: " + fromBase + ".." 
+ toBase); + } + int i = findLineByBase(fromBase); + int j = findLineByBase(toBase); + + LineEntry from = lines.get(i); + long offStart = fromBase - from.baseStart; + + LineEntry to = lines.get(j); + long offEndIncl = toBase - to.baseStart; + + long byteStart = from.byteStart + offStart; + long byteEndEx = to.byteStart + offEndIncl + 1; // half-open + + return new ByteSpan(byteStart, byteEndEx); + } + + public ByteSpan byteSpanForBaseRange(long fromBase, long toBase) { + long trimmedTotal = totalBasesExcludingEdgeNBases(); + if (fromBase < 1 || toBase < fromBase || toBase > trimmedTotal) { + throw new IllegalArgumentException("bad base range: " + fromBase + ".." + toBase); + } + long actualFromBase = startNBasesCount + fromBase; + long actualToBase = startNBasesCount + toBase; + return byteSpanForBaseRangeIncludingEdgeNBases(actualFromBase, actualToBase); + } + + private int findLineByBase(long base) { + int lo = 0, hi = lines.size() - 1, ans = hi; + while (lo <= hi) { + int mid = (lo + hi) >>> 1; + LineEntry L = lines.get(mid); + if (base < L.baseStart) hi = mid - 1; + else if (base > L.baseEnd) lo = mid + 1; + else return mid; + ans = lo; + } + return Math.max(0, Math.min(ans, lines.size() - 1)); + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java new file mode 100644 index 00000000..3a075d4a --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -0,0 +1,247 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.List; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; + +public final class SequenceIndexBuilder { + private static final int SCAN_BUF_SIZE = 4 * 1024 * 1024; // 4 MB + private static final int COUNT_BUF_SIZE = 4 * 1024 * 1024; // 2 MB + + private static final byte GT = (byte) '>'; + private static final byte LF = (byte) '\n'; + private static final byte CR = (byte) '\r'; + + public static final class Result { + public final SequenceIndex index; + public final long nextHeaderByte; // byte offset of next '>' at line start, or fileSize (EOF) + + public Result(SequenceIndex index, long nextHeaderByte) { + this.index = index; + this.nextHeaderByte = nextHeaderByte; + } + } + + private final FileChannel ch; + private final long fileSize; + private final SequenceAlphabet alphabet; + + public SequenceIndexBuilder(FileChannel ch, long fileSize, SequenceAlphabet alphabet) { + this.ch = ch; + this.fileSize = fileSize; + this.alphabet = alphabet; + } + + /** Build a SequenceIndex starting at 'startPos' (first byte after header line). 
*/ + public Result buildFrom(long startPos) throws IOException, FastaFileException { + ScanState s = new ScanState(startPos, fileSize); + ByteBuffer buf = newScanBuffer(); + + // ------------- scan raw bytes into provisional "sequence lines" ------------- + while (s.pos < fileSize) { + int n = fillBuffer(buf, s.pos); + if (n <= 0) break; + if (processBuffer(buf, s)) break; // found next header + s.pos += n; + } + commitOpenLineIfAny(s); + + // ------------- filter window & compute metadata ------------- + List filtered = filterLinesWithinWindow(s.lines, s.firstBaseByte, s.nextHdr); + + long firstBaseByte = filtered.isEmpty() ? -1 : filtered.get(0).byteStart; + long lastBaseByte = filtered.isEmpty() ? -1 : (filtered.get(filtered.size() - 1).byteEndExclusive - 1); + + long startN = 0, endN = 0; + if (!filtered.isEmpty()) { + startN = countLeadingNs(filtered.get(0)); // (3) only first line + endN = countTrailingNs(filtered.get(filtered.size() - 1)); // (4) only last line + } + + SequenceIndex idx = new SequenceIndex(firstBaseByte, startN, lastBaseByte, endN, filtered); + return new Result(idx, s.nextHdr); + } + + // ===================================================================== + // = scanning core = + // ===================================================================== + + private static final class ScanState { + long pos; // absolute scan position + long firstBaseByte = -1; // first allowed base byte seen + long lastBaseByte = -1; // last allowed base byte seen + long nextHdr; // byte of next header (or file end) + + long lineFirstByte = -1; // first allowed base byte in current line + long lineLastByte = -1; // last allowed base byte in current line + long basesSoFar = 0; + long basesInLine = 0; + + final ArrayList lines = new ArrayList<>(256); + + ScanState(long startPos, long fileSize) { + this.pos = startPos; + this.nextHdr = fileSize; + } + } + + private ByteBuffer newScanBuffer() { + return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); + } + + private 
int fillBuffer(ByteBuffer buf, long at) throws IOException { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - at); + buf.limit(want); + return ch.read(buf, at); // absolute read; does not touch ch.position() + } + + /** Returns true if we hit the next header and should stop scanning this entry. */ + private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException, FastaFileException { + buf.flip(); + while (buf.hasRemaining()) { + int idx = buf.position(); + byte b = buf.get(); + long abs = s.pos + idx; + + if (isHeaderStart(b, abs)) { + s.nextHdr = abs; // stop window at header byte + commitOpenLineIfAny(s); // finalize any in-flight line + return true; + } else if (b == LF || b == CR) { // end of a displayed sequence line or CR + commitOpenLineIfAny(s); // only lines with bases are committed + continue; + } else if (alphabet.isAllowed(b)) { + observeBase(abs, s); + } else { + throw new FastaFileException(String.format( + "Illegal character '%s' (byte value: %d) at absolute file position %d. " + + "This character is not allowed by the current FASTA alphabet. " + + "Expected only characters: %s", + (char) (b & 0xFF), b & 0xFF, abs, alphabet.describeAllowed())); + } + } + return false; + } + + private boolean isHeaderStart(byte b, long abs) throws IOException { + return b == GT && isLineStart(abs); + } + + /** header must be at file start or immediately after LF */ + private boolean isLineStart(long abs) throws IOException { + if (abs == 0) return true; + if (abs > fileSize) return false; + return peek(abs - 1) == LF; + } + + private byte peek(long abs) throws IOException { + if (abs < 0 || abs >= fileSize) return 0; + ByteBuffer one = ByteBuffer.allocate(1); + int n = ch.read(one, abs); + return (n == 1) ? 
one.get(0) : 0; + } + + private void observeBase(long abs, ScanState s) { + if (s.lineFirstByte < 0) s.lineFirstByte = abs; + s.lineLastByte = abs; + s.basesInLine++; + + if (s.firstBaseByte < 0) s.firstBaseByte = abs; + s.lastBaseByte = abs; + } + + private void commitOpenLineIfAny(ScanState s) { + if (s.basesInLine <= 0) return; // skip empty lines + long baseStart = s.basesSoFar + 1; + long baseEnd = s.basesSoFar + s.basesInLine; + long byteStart = s.lineFirstByte; + long byteEndEx = s.lineLastByte + 1; // half-open + + s.lines.add(new LineEntry(baseStart, baseEnd, byteStart, byteEndEx)); + + s.basesSoFar += s.basesInLine; + s.basesInLine = 0; + s.lineFirstByte = -1; + s.lineLastByte = -1; + } + + // ===================================================================== + // = window filter & edge N counting = + // ===================================================================== + + /** (1)+(2) Keep only lines fully inside [firstBaseByte, nextHdr) and already non-empty. */ + private List filterLinesWithinWindow(List raw, long firstBaseByte, long nextHdr) { + if (firstBaseByte < 0 || raw.isEmpty()) return List.of(); + ArrayList out = new ArrayList<>(raw.size()); + for (LineEntry L : raw) { + if (L.byteStart >= firstBaseByte && L.byteEndExclusive <= nextHdr) { + out.add(L); + } + } + return out; + } + + /** (3) count 'N'/'n' from the start of the first sequence line only. 
*/ + private long countLeadingNs(LineEntry line) throws IOException { + long remaining = line.lengthBytes(); + long offset = line.byteStart; + long count = 0; + + ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); + while (remaining > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remaining); + buf.limit(want); + int n = ch.read(buf, offset); + if (n <= 0) break; + buf.flip(); + for (int i = 0; i < n; i++) { + byte b = buf.get(); + if (alphabet.isNBase(b)) count++; + else return count; + } + remaining -= n; + offset += n; + } + return count; + } + + /** (4) count 'N'/'n' at the tail of the last sequence line only. */ + private long countTrailingNs(LineEntry line) throws IOException { + long remaining = line.lengthBytes(); + long offset = line.byteStart; + long trailing = 0; + + ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); + while (remaining > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remaining); + buf.limit(want); + int n = ch.read(buf, offset); + if (n <= 0) break; + buf.flip(); + for (int i = 0; i < n; i++) { + byte b = buf.get(); + if (alphabet.isNBase(b)) trailing++; + else trailing = 0; + } + remaining -= n; + offset += n; + } + return trailing; + } +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java new file mode 100644 index 00000000..82876f7e --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -0,0 +1,318 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import org.junit.jupiter.api.Test; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; + +class FastaFileServiceIntegrationTest { + + @Test + void readingMalformedFastaJsonFailure() throws IOException { // more tests like this in the JsonHeaderParserTest + File fasta = FastaTestResources.file("fasta", "malformed_json_fasta.txt"); + FastaFileService service = new FastaFileService(); + + assertThrows(FastaFileException.class, () -> { + service.openNewFile(fasta); + }); + + service.close(); + } + + @Test + void readingMalformedFastaSequenceFailure() throws IOException { + File fasta = FastaTestResources.file("fasta", "malformed_fasta.txt"); + FastaFileService service = new FastaFileService(); + + assertThrows(FastaFileException.class, () -> { + service.openNewFile(fasta); + }); + + service.close(); + } + + @Test + void proccessingEntriesWithCarriageReturnsCorrectly() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example_with_carriage_return_char.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); + 
assertTrue(ids.contains("AF123456.1")); + assertTrue(ids.contains("AF123455.2")); + + Optional entry1 = service.getFastaWithSubmissionId("AF123456.1"); + Optional entry2 = service.getFastaWithSubmissionId("AF123455.2"); + Optional imaginaryEntry = service.getFastaWithSubmissionId("ID3"); + assertTrue(entry1.isPresent(), "index for AF123456.1 must exist"); + assertTrue(entry2.isPresent(), "index for AF123455.2 must exist"); + assertTrue(imaginaryEntry.isEmpty(), "index for ID3 must not exist"); + + // From the sample file above: + assertEquals(9, entry1.get().leadingNsCount, "AF123456.1 leading Ns"); + assertEquals(1, entry1.get().trailingNsCount, "AF123456.1 trailing Ns"); + assertEquals(0, entry2.get().leadingNsCount, "AF123455.2 leading Ns"); + assertEquals(0, entry2.get().trailingNsCount, "AF123455.2 trailing Ns"); + + String sequence1StartSlice = + service.getSequenceSliceString(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, 11); + assertEquals("CCCGGCGCGGG", sequence1StartSlice); + + String sequence1EndSlice = service.getSequenceSliceString( + SequenceRangeOption.WITHOUT_N_BASES, + entry1.get().submissionId, + entry1.get().totalBasesWithoutNBases - 9, + entry1.get().totalBasesWithoutNBases); + assertEquals("AAAAAAAAAA", sequence1EndSlice); + + String sequence2withoutNbases = service.getSequenceSliceString( + SequenceRangeOption.WITHOUT_N_BASES, + entry2.get().submissionId, + 1, + entry2.get().totalBasesWithoutNBases); + assertEquals( + "CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC" + + "TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG" + + "AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG" + + "ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA" + + "CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA" + + "ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC" + + "ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT" + + 
"CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA" + + "AAAAAAAAAAAA", + sequence2withoutNbases); + + service.close(); + } + + @Test + void gettingSequenceSliceAsStringReturnsCorrectly() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); + assertTrue(ids.contains("ID1")); + assertTrue(ids.contains("ID2")); + + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + Optional entry2 = service.getFastaWithSubmissionId("ID2"); + Optional imaginaryEntry = service.getFastaWithSubmissionId("ID3"); + assertTrue(entry1.isPresent(), "index for ID1 must exist"); + assertTrue(entry2.isPresent(), "index for ID2 must exist"); + assertTrue(imaginaryEntry.isEmpty(), "index for ID3 must not exist"); + + service.setAccessionId("ID1", "asc1"); + service.setAccessionId("ID2", "asc2"); + assertEquals(entry1.get().accessionId, "asc1"); + assertEquals(entry2.get().accessionId, "asc2"); + + // From the sample file above: + assertEquals(2, entry1.get().leadingNsCount, "ID1 leading Ns"); + assertEquals(2, entry1.get().trailingNsCount, "ID1 trailing Ns"); + assertEquals(0, entry2.get().leadingNsCount, "ID2 leading Ns"); + assertEquals(0, entry2.get().trailingNsCount, "ID2 trailing Ns"); + + String sequence1 = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); + assertEquals("NNACACGTTTNn", sequence1); + String sequence2 = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases); + assertEquals("ACGTGGGG", sequence2); + + String sequence1withoutNbases = service.getSequenceSliceString( + 
SequenceRangeOption.WITHOUT_N_BASES, + entry1.get().submissionId, + 1, + entry1.get().totalBasesWithoutNBases); + assertEquals("ACACGTTT", sequence1withoutNbases); + + service.close(); + } + + @Test + void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); + assertTrue(ids.contains("ID1")); + assertTrue(ids.contains("ID2")); + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + Optional entry2 = service.getFastaWithSubmissionId("ID2"); + + // stream whole sequence with the reader + String streamedSequence; + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence = sb.toString(); + } + // compare + assertEquals("NNACACGTTTNn", streamedSequence); + + // stream whole sequence with the reader + String streamedSequenceWithoutNbases; + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WITHOUT_N_BASES, + entry1.get().submissionId, + 1, + entry1.get().totalBasesWithoutNBases)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequenceWithoutNbases = sb.toString(); + } + // compare + assertEquals("ACACGTTT", streamedSequenceWithoutNbases); + + // stream sequence with the reader + String streamedSequence2; + try (java.io.Reader r = service.getSequenceSliceReader( + 
SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence2 = sb.toString(); + } + // compare + assertEquals("ACGTGGGG", streamedSequence2); + + service.close(); + } + + @Test + void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); + assertTrue(ids.contains("ID1")); + assertTrue(ids.contains("ID2")); + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + Optional entry2 = service.getFastaWithSubmissionId("ID2"); + + for (long end = 2; end <= entry1.get().totalBases; end++) { + // get slice as string + String sequence = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end); + // stream sequence with the reader + String streamedSequence; + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence = sb.toString(); + } + // compare + assertEquals(sequence, streamedSequence); + } + + for (long end = 2; end <= entry2.get().totalBases; end++) { + // get slice as string + String sequence2 = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end); + // stream sequence with the reader + String streamedSequence2; + try (java.io.Reader r = 
service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence2 = sb.toString(); + } + // compare + assertEquals(sequence2, streamedSequence2); + } + + service.close(); + } + + // to run this, curl the sequence with: curl -o single_fasta_large_sequence.txt + // https://www.ebi.ac.uk/ena/cram/md5/11398cc4b68f2cceb4fd50b742d4b1ec + // then to add the fasta header run something like : + // + // tmp="$(mktemp "${TMPDIR:-/tmp}/prepend.XXXXXX")" && + // { printf '%s\n' '>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"}'; cat -- + // single_fasta_large_sequence.txt; } >"$tmp" && + // mv -f -- "$tmp" single_fasta_large_sequence.txt + // + // then just move the fasta into whatever/gff3tools/src/test/resources/fasta/ + // and run the test + // @Test + void readBigSequenceSuccessfully() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "single_fasta_large_sequence.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(1, entries.size(), "should parse 1 FASTA entry"); + + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); + assertTrue(ids.contains("ID1")); + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + + // get first 16 chars + String sequenceStart = + service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, 16); + assertEquals(sequenceStart, "GGGCTTTAAATGGCTC"); + + // get last 16 chars + String sequenceEnd = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, + entry1.get().submissionId, + entry1.get().totalBases - 15, + entry1.get().totalBases); + assertEquals(sequenceEnd, "GAATTCTGATGGCTGT"); + + 
/** Locates test resources on the classpath and exposes them as paths, files or streams. */
public final class FastaTestResources {
    private FastaTestResources() {}

    /**
     * Returns a Path to a resource like ("fasta", "example.txt").
     *
     * <p>A resource served from the file system is returned directly. Any other resource (for
     * example one inside a jar) is copied to a temporary file that is deleted when the JVM exits.
     *
     * @param dir resource directory, with or without a trailing slash
     * @param fileName resource file name
     * @return a path to the resource's content
     * @throws NullPointerException if an argument is null or the resource is not on the classpath
     * @throws IllegalStateException if the resource exists but cannot be turned into a path
     */
    public static Path path(String dir, String fileName) {
        String resource = resourceName(dir, fileName);

        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        URL url = Objects.requireNonNull(cl.getResource(resource), "Missing resource on classpath: " + resource);

        try {
            if ("file".equals(url.getProtocol())) {
                // Gradle tests: build/resources/test/...
                return Paths.get(url.toURI());
            }
            // Fallback for jar: URLs — copy to temp so callers can have a real Path/File
            try (InputStream in = cl.getResourceAsStream(resource)) {
                Objects.requireNonNull(in, "Resource stream is null: " + resource);
                Path tmp = Files.createTempFile("testres-", "-" + fileName);
                tmp.toFile().deleteOnExit();
                Files.copy(in, tmp, StandardCopyOption.REPLACE_EXISTING);
                return tmp;
            }
        } catch (Exception e) {
            // Deliberately broad: checked and unchecked failures alike are reported with the resource name.
            throw new IllegalStateException("Failed to resolve resource: " + resource, e);
        }
    }

    /**
     * Convenience if you need a File.
     *
     * @param dir resource directory, with or without a trailing slash
     * @param fileName resource file name
     * @return {@code path(dir, fileName)} converted to a File
     */
    public static File file(String dir, String fileName) {
        return path(dir, fileName).toFile();
    }

    /**
     * Stream, if you don't need a File/Path. The caller is responsible for closing it.
     *
     * @param dir resource directory, with or without a trailing slash
     * @param fileName resource file name
     * @return an open stream on the resource
     * @throws NullPointerException if an argument is null or the resource is not on the classpath
     */
    public static InputStream stream(String dir, String fileName) {
        String resource = resourceName(dir, fileName);
        InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resource);
        return Objects.requireNonNull(in, "Missing resource stream: " + resource);
    }

    /** Validates both parts and joins them with exactly one slash. */
    private static String resourceName(String dir, String fileName) {
        Objects.requireNonNull(dir, "dir");
        Objects.requireNonNull(fileName, "fileName");
        return dir.endsWith("/") ? dir + fileName : dir + "/" + fileName;
    }
}
See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package uk.ac.ebi.embl.gff3tools.fasta.headerutils;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import org.junit.jupiter.api.Test;
+import uk.ac.ebi.embl.gff3tools.exception.FastaFileException;
+import uk.ac.ebi.embl.gff3tools.fasta.Topology;
+
+public class JsonHeaderParserTest {
+
+    private final JsonHeaderParser parser = new JsonHeaderParser();
+
+    // ---------------------------------------------------------
+    // VALID CASES
+    // ---------------------------------------------------------
+
+    @Test
+    void parsesStandardHeaderWithJson() {
+        String line =
+                ">AF123456.1 | { \"description\":\"Pinus sativa\", \"molecule_type\":\"genomic\", \"topology\":\"circular\" }";
+
+        ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line));
+        assertEquals("AF123456.1", ph.getId());
+
+        FastaHeader h = ph.getHeader();
+        assertEquals("Pinus sativa", h.getDescription());
+        assertEquals("genomic", h.getMoleculeType());
+        assertEquals(Topology.CIRCULAR, h.getTopology());
+        assertNull(h.getChromosomeType());
+        assertNull(h.getChromosomeLocation());
+        assertNull(h.getChromosomeName());
+    }
+
+    @Test
+    void picksFirstTokenAsIdEvenWithExtraStuff() {
+        String line = ">AF123456.1 extra tokens here | "
+                + " {\"description\":\"x\", \"molecule_type\":\"dna\", \"topology\":\"linear\"}";
+
+        ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line));
+        assertEquals("AF123456.1", ph.getId());
+    }
+
+    @Test
+    void parsesCurlyQuotesAndWeirdSpacingInKeys() {
+        String line =
+                ">ID1 | { \u201Cdescription\u201D: \u201CPinus\u201D, \u201C molecule_type\u201D:\"genomic\", \u201Ctopology\u201D:\"CIRCULAR\" }";
+
+        ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line));
+        FastaHeader h = ph.getHeader();
+
+        assertEquals("Pinus", h.getDescription());
+        assertEquals("genomic", h.getMoleculeType());
+        assertEquals(Topology.CIRCULAR, h.getTopology());
+    }
+
+    @Test
+    void normalizesKeyVariantsAndChromosomeOptionals() {
+        String line = ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"topology\":\"linear\", "
+                + "\"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }";
+
+        ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line));
+        FastaHeader h = ph.getHeader();
+
+        assertEquals("Desc", h.getDescription());
+        assertEquals("rna", h.getMoleculeType());
+        assertEquals(Topology.LINEAR, h.getTopology());
+        assertEquals("plasmid", h.getChromosomeType());
+        assertEquals("chr12:100-200", h.getChromosomeLocation());
+        assertEquals("pX", h.getChromosomeName());
+    }
+
+    @Test
+    void handlesNbspInJson() {
+        String nbsp = "\u00A0";
+        String line = ">ID3 | {" + nbsp
+                + "\"description\"" + nbsp + ":" + nbsp + "\"Alpha" + nbsp + "Beta\"" + ","
+                + "\"molecule_type\":\"rna\", \"topology\":\"linear\"}";
+
+        ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line));
+        FastaHeader h = ph.getHeader();
+
+        assertEquals("Alpha Beta", h.getDescription());
+        assertEquals("rna", h.getMoleculeType());
+        assertEquals(Topology.LINEAR, h.getTopology());
+    }
+
+    @Test
+    void missingJsonIsFine_NoPipe() {
+        String line = ">AF999999.5 some label without json";
+
+        ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line));
+        assertEquals("AF999999.5", ph.getId());
+
+        FastaHeader h = ph.getHeader();
+        assertNull(h.getDescription());
+        assertNull(h.getMoleculeType());
+        assertNull(h.getTopology());
+    }
+
+    @Test
+    void trimsIdAndHandlesJustChevron() {
+        ParsedHeader ph1 = assertDoesNotThrow(() -> parser.parse(
+                "> AF111 | {\"description\":\"x\",\"molecule_type\":\"dna\",\"topology\":\"linear\"}"));
+        assertEquals("AF111", ph1.getId());
+
+        // No pipe: JSON not required
+        ParsedHeader ph2 = assertDoesNotThrow(() -> parser.parse(">"));
+        assertEquals("", ph2.getId());
+        assertNull(ph2.getHeader().getDescription());
+    }
+
+    // ---------------------------------------------------------
+    // INVALID CASES — MUST THROW FastaFileException
+    // ---------------------------------------------------------
+
+    @Test
+    void noJsonAfterPipeThrows() {
+        String line = ">ID5 | ";
+        assertThrows(FastaFileException.class, () -> parser.parse(line));
+    }
+
+    @Test
+    void emptyJsonAfterPipeThrows() {
+        String line = ">ID5 | {} ";
+        assertThrows(FastaFileException.class, () -> parser.parse(line));
+    }
+
+    @Test
+    void jsonWithNullValuesThrows() {
+        String line = ">ID8 | {\"description\":null, \"molecule_type\":null, \"topology\":null}";
+        assertThrows(FastaFileException.class, () -> parser.parse(line));
+    }
+
+    @Test
+    void missingRequiredFieldsThrows() {
+        String line = ">ID9 | {\"description\":\"x\"}";
+        FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line));
+        assertTrue(e.getMessage().contains("missing required"));
+    }
+
+    @Test
+    void unknownTopologyThrows() {
+        String line = ">ID4 | {\"description\":\"x\", \"molecule_type\":\"dna\", \"topology\":\"banana\"}";
+        FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line));
+        assertTrue(e.getMessage().contains("topology"));
+    }
+
+    // ---------------------------------------------------------
+    // MALFORMED JSON
+    // ---------------------------------------------------------
+
+    @Test
+    void malformedJsonThrowsAndIncludesJsonInMessage() {
+        String badJson = "{\"description\": \"x\", \"molecule_type\": \"genomic\", OOPS }";
+        String line = ">ID6 | " + badJson;
+
+        FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line));
+        assertTrue(e.getMessage().contains("OOPS"));
+        assertTrue(e.getMessage().contains("{\"description"));
+    }
+
+    @Test
+    void malformedJsonBracesThrowsAndIncludesJsonInMessage() {
+        // Unbalanced braces (no closing '}'); assumes the parser echoes the raw JSON as in the case above.
+        String badJson = "{\"description\": \"x\", \"molecule_type\": \"genomic\", \"topology\": \"linear\"";
+        String line = ">ID6 | " + badJson;
+
+        FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line));
+        assertTrue(e.getMessage().contains("{\"description"));
+    }
+
+    @Test
+    void malformedJsonWithTrailingCommaThrowsAndMentionsComma() {
+        String badJson = "{ \"description\":\"y\", \"molecule_type\":\"genomic\", }";
+        String line = ">ID7 | " + badJson;
+
+        FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line));
+        assertTrue(e.getMessage().contains("\"description\":\"y\""));
+        assertTrue(e.getMessage().contains("\"molecule_type\":\"genomic\""));
+    }
+}
diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java
new file mode 100644
index 00000000..d754dd49
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2025 EMBL - European Bioinformatics Institute
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+ * file except in compliance with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.List;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+public class SequenceIndexBuilderTest {
+
+    @TempDir
+    Path tempDir;
+
+    private static FileChannel openRead(Path p) throws IOException {
+        return FileChannel.open(p, StandardOpenOption.READ);
+    }
+
+    private static Path writeAscii(Path dir, String filename, String content) throws IOException {
+        Path p = dir.resolve(filename);
+        Files.write(p, content.getBytes(StandardCharsets.US_ASCII));
+        return p;
+    }
+
+    @Test
+    void buildsIndexCorrectly() throws Exception {
+        // Layout (US-ASCII), one fixture line per row; the three empty lines come from `empties`:
+        //   >ID1 | {"d":"x"}\n
+        //   NNAC\n
+        //   acgt\n
+        //   ttnN\n
+        //   \n
+        //   \n
+        //   \n
+        //   >NEXT\n
+        String header = ">ID1 | {\"d\":\"x\"}\n";
+        String l1 = "NNAC\n"; // leading N=2
+        String l2 = "acgt\n";
+        String l3 = "ttnN\n"; // trailing N=2
+        String empties = "\n\n\n";
+        String nextHead = ">NEXT\n";
+
+        String fasta = header + l1 + l2 + l3 + empties + nextHead;
+        Path p = writeAscii(tempDir, "idx1.fa", fasta);
+
+        try (FileChannel ch = openRead(p)) {
+            long fileSize = ch.size();
+            long seqStartPos = header.getBytes(StandardCharsets.US_ASCII).length; // first byte after header line
+
+            SequenceAlphabet alpha = SequenceAlphabet.defaultNucleotideAlphabet();
+            SequenceIndexBuilder sib = new SequenceIndexBuilder(ch, fileSize, alpha);
+
+            long beforePos = ch.position(); // should remain unchanged
+            SequenceIndexBuilder.Result res = sib.buildFrom(seqStartPos);
+            long afterPos = ch.position();
+
+            // builder must not touch channel.position()
+            assertEquals(beforePos, afterPos, "builder must not change channel.position()");
+
+            SequenceIndex idx = res.index;
+
+            // Lines: only 3 sequence lines; empties ignored
+            List<LineEntry> lines = idx.linesView();
+            assertEquals(3, lines.size(), "only non-empty sequence lines must be indexed");
+
+            // Base numbering should be contiguous across lines (4 bases per line)
+            assertEquals(1, lines.get(0).baseStart);
+            assertEquals(4, lines.get(0).baseEnd);
+            assertEquals(5, lines.get(1).baseStart);
+            assertEquals(8, lines.get(1).baseEnd);
+            assertEquals(9, lines.get(2).baseStart);
+            assertEquals(12, lines.get(2).baseEnd);
+
+            // Byte math: each line has 4 letters; byteEndExclusive = lastBaseByte + 1
+            long l1Start = seqStartPos; // begins right after header line
+            long l1EndEx = l1Start + 4;
+            long l2Start = l1EndEx + 1; // + LF between lines
+            long l2EndEx = l2Start + 4;
+            long l3Start = l2EndEx + 1;
+            long l3EndEx = l3Start + 4;
+
+            assertEquals(l1Start, lines.get(0).byteStart);
+            assertEquals(l1EndEx, lines.get(0).byteEndExclusive);
+
+            assertEquals(l2Start, lines.get(1).byteStart);
+            assertEquals(l2EndEx, lines.get(1).byteEndExclusive);
+
+            assertEquals(l3Start, lines.get(2).byteStart);
+            assertEquals(l3EndEx, lines.get(2).byteEndExclusive);
+
+            // first/last base bytes
+            assertEquals(l1Start, idx.firstBaseByte);
+            assertEquals(l3EndEx - 1, idx.lastBaseByte);
+
+            // Edge N counting: only first and last lines are inspected
+            assertEquals(2, idx.startNBasesCount, "leading Ns only from first sequence line");
+            assertEquals(2, idx.endNBasesCount, "trailing Ns only from last sequence line");
+
+            // nextHeaderByte should point to '>' of NEXT header (char count == byte count: fixture is pure ASCII)
+            long expectedNextHeader = header.length() + l1.length() + l2.length() + l3.length() + empties.length();
+            assertEquals(expectedNextHeader, res.nextHeaderByte);
+        }
+    }
+
+    @Test
+    void buildsIndexCorrectlyTest2() throws Exception {
+        String header = ">ID2\n";
+        String l1 = "NNAA\n"; // leading N=2
+        String l2 = "gggg\n";
+        String next = ">H2\n";
+
+        String fasta = header + l1 + l2 + next;
+        Path p = writeAscii(tempDir, "idx2.fa", fasta);
+
+        try (FileChannel ch = openRead(p)) {
+            long seqStart = header.getBytes(StandardCharsets.US_ASCII).length;
+            SequenceIndexBuilder sib =
+                    new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet());
+
+            SequenceIndexBuilder.Result res = sib.buildFrom(seqStart);
+            SequenceIndex idx = res.index;
+
+            // Two non-empty lines only
+            assertEquals(2, idx.linesView().size());
+
+            // leading Ns counted only on first line (here: 2)
+            assertEquals(2, idx.startNBasesCount);
+            // no trailing Ns on last line (all 'g')
+            assertEquals(0, idx.endNBasesCount);
+
+            // nextHeader should be at the '>' byte of H2
+            long expectedNext = fasta.lastIndexOf(">H2\n"); // char index == byte offset for ASCII content
+            assertEquals(expectedNext, res.nextHeaderByte);
+        }
+    }
+
+    @Test
+    void ignoresEmptyLinesCorrectly() throws Exception {
+        String header = ">ID3\n";
+        String l1 = "NACG\n"; // leading N = 1
+        String l2 = "NNNN\n"; // middle line of Ns — must NOT affect start/end N counts
+        String blanks = "\n\n";
+        String l3 = "GGGn\n"; // trailing n = 1
+        String next = ">K\n";
+
+        String fasta = header + l1 + l2 + blanks + l3 + next;
+        Path p = writeAscii(tempDir, "idx3.fa", fasta);
+
+        try (FileChannel ch = openRead(p)) {
+            long seqStart = header.getBytes(StandardCharsets.US_ASCII).length;
+            SequenceIndexBuilder sib =
+                    new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet());
+
+            long before = ch.position();
+            SequenceIndexBuilder.Result res = sib.buildFrom(seqStart);
+            long after = ch.position();
+            assertEquals(before, after, "builder must not move channel position");
+
+            SequenceIndex idx = res.index;
+            // three non-empty sequence lines: l1, l2, l3
+            assertEquals(3, idx.linesView().size());
+
+            // Edge N counts: only first and last lines considered
+            assertEquals(1, idx.startNBasesCount, "only first line leading Ns");
+            assertEquals(1, idx.endNBasesCount, "only last line trailing Ns");
+
+            // Middle line of Ns is still indexed as an ordinary 4-base line
+            assertEquals(4, idx.linesView().get(1).lengthBases());
+
+            // Total base numbering should be contiguous: 4 + 4 + 4 = 12
+            assertEquals(12, idx.totalBases());
+        }
+    }
+}
diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java
new file mode 100644
index 00000000..1072b496
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2025 EMBL - European Bioinformatics Institute
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+ * file except in compliance with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.util.List;
+import org.junit.jupiter.api.Test;
+
+public class SequenceIndexTest {
+
+    /**
+     * Synthetic line layout (ASCII, 1 byte/base; '\n' not part of lines):
+     *
+     * Line1: bases 1..4  at bytes [100,104)  -> base->byte: 1:100, 2:101, 3:102, 4:103, '\n':104
+     * Line2: bases 5..8  at bytes [105,109)  -> 5:105, 6:106, 7:107, 8:108, '\n':109
+     * Line3: bases 9..12 at bytes [110,114)  -> 9:110, 10:111, 11:112, 12:113, '\n':114
+     *
+     * So:
+     *  - first base byte = 100
+     *  - last base byte = 113
+     *  - total bases including edge Ns = 12
+     */
+    private SequenceIndex buildIndex(long startN, long endN) {
+        List<LineEntry> lines =
+                List.of(new LineEntry(1, 4, 100, 104), new LineEntry(5, 8, 105, 109), new LineEntry(9, 12, 110, 114));
+        return new SequenceIndex(
+                /*firstBaseByte*/ 100,
+                /*startNBasesCount*/ startN,
+                /*lastBaseByte*/ 113,
+                /*endNBasesCount*/ endN,
+                lines);
+    }
+
+    @Test
+    void totalsIncludingAndTrimmed() {
+        SequenceIndex idx = buildIndex(/*startN*/ 2, /*endN*/ 3);
+
+        assertEquals(12, idx.totalBases(), "totalBasesIncludingEdgeNBases");
+        assertEquals(7, idx.totalBasesExcludingEdgeNBases(), "trimmed totalBases");
+    }
+
+    @Test
+    void byteSpanIncludingEdgesSameLine() {
+        SequenceIndex idx = buildIndex(0, 0);
+
+        // [from..to] = [2..4] -> bytes [101..103], endExclusive = 104
+        ByteSpan s = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 4);
+
+        assertEquals(101, s.start);
+        assertEquals(104, s.endEx);
+        assertEquals(3, s.length());
+    }
+
+    @Test
+    void byteSpanIncludingEdgesCrossesNewline() {
+        SequenceIndex idx = buildIndex(0, 0);
+
+        // [2..5] crosses the newline between line1 and line2
+        // start = base2@101, endEx = base5@105 + 1 = 106, newline at 104 is included
+        ByteSpan s = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 5);
+
+        assertEquals(101, s.start);
+        assertEquals(106, s.endEx);
+        assertEquals(5, s.length()); // bytes of bases 2,3,4 + '\n' + base 5
+    }
+
+    @Test
+    void includingEdgesValidatesTotal() {
+        SequenceIndex idx = buildIndex(0, 0);
+        assertThrows(
+                IllegalArgumentException.class,
+                () -> idx.byteSpanForBaseRangeIncludingEdgeNBases(1, 13),
+                "toBase beyond total (including Ns) should throw");
+    }
+
+    @Test
+    void trimmedByteSpanMapsThroughStartN() {
+        SequenceIndex idx = buildIndex(2, 3);
+        assertEquals(7, idx.totalBasesExcludingEdgeNBases());
+
+        ByteSpan s = idx.byteSpanForBaseRange(1, 3); // Ignore first 2 Ns, ignore last 3 Ns
+
+        assertEquals(102, s.start);
+        assertEquals(106, s.endEx);
+        assertEquals(4, s.length()); // raw bases 3,4 + '\n' + raw base 5 (= trimmed bases 1..3)
+    }
+
+    @Test
+    void trimmedSpanCrossesMultipleLines() {
+        SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7 bases
+
+        ByteSpan s = idx.byteSpanForBaseRange(4, 7);
+
+        assertEquals(106, s.start);
+        assertEquals(111, s.endEx);
+        assertEquals(5, s.length()); // raw bases 6,7,8 + '\n' + raw base 9 (= trimmed bases 4..7)
+    }
+
+    @Test
+    void trimmedValidatesRangeAgainstTrimmedTotal() {
+        SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7
+        assertThrows(
+                IllegalArgumentException.class,
+                () -> idx.byteSpanForBaseRange(1, 8),
+                "toBase beyond trimmed total should throw");
+    }
+
+    @Test
+    void zeroEdgeNsBehaviorMatchesIncludingMethod() {
+        SequenceIndex idx = buildIndex(0, 0); // no additional N bases
+
+        ByteSpan a = idx.byteSpanForBaseRange(2, 5);
+        ByteSpan b = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 5);
+
+        assertEquals(b.start, a.start);
+        assertEquals(b.endEx, a.endEx);
+    }
+}
diff --git a/src/test/resources/fasta/example.txt b/src/test/resources/fasta/example.txt
new file mode 100644
index 00000000..a91ba778
--- /dev/null
+++ b/src/test/resources/fasta/example.txt
@@ -0,0 +1,9 @@
+>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"}
+NNAC
+ACGT
+
+TTNn
+
+>ID2 | {"description":"x", "molecule_type":"dna", "topology":"circular"}
+ACGT
+GGGG
diff --git a/src/test/resources/fasta/example_to_delete.txt b/src/test/resources/fasta/example_to_delete.txt
new file mode 100644
index 00000000..8079c4e9 --- /dev/null +++ b/src/test/resources/fasta/example_to_delete.txt @@ -0,0 +1,24 @@ + +NONSENSE +NONSENSE + +>AF123456.1 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGCATATTACAGTTGAGTGCCTCGACTTAGATTGCAATATAAGCGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA +>AF123455.2 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA diff --git a/src/test/resources/fasta/example_with_carriage_return_char.txt b/src/test/resources/fasta/example_with_carriage_return_char.txt new file mode 100644 index 00000000..2c2637c9 --- /dev/null +++ b/src/test/resources/fasta/example_with_carriage_return_char.txt @@ -0,0 +1,26 @@ +>AF123456.1 |{"description":"x", "molecule_type":"dna", "topology":"circular"} +nnnNNNNNNCCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG + +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG + 
+ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA + +CGCGTTTTGCATATTACAGTTGAGTGCCTCGACTTAGATTGCAATATAAGCGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT + +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAAN +>AF123455.2 |{"description":"x", "molecule_type":"dna", "topology":"circular"} +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG + +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC + +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA diff --git a/src/test/resources/fasta/malformed_fasta.txt b/src/test/resources/fasta/malformed_fasta.txt new file mode 100644 index 00000000..21e52862 --- /dev/null +++ b/src/test/resources/fasta/malformed_fasta.txt @@ -0,0 +1,8 @@ +>ID1 | {"description":"something", "molecule_type":"dna", "topology":"linear"} +NNAC +ACGT;';'; +TTNn + +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"linear"} +ACGT +GGGG diff --git a/src/test/resources/fasta/malformed_json_fasta.txt b/src/test/resources/fasta/malformed_json_fasta.txt new file mode 100644 index 00000000..97d3fc85 --- /dev/null +++ b/src/test/resources/fasta/malformed_json_fasta.txt @@ -0,0 +1,8 @@ +>ID1 | {"desc;';ription":"first"} +NNAC +ACGT +TTNn + +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"linear"} +ACGT +GGGG