From 3f7e1aa06a8080812d154b1b280e4cac9f27e8fc Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 12 Nov 2025 15:22:49 +0000 Subject: [PATCH 01/31] wip --- .../ac/ebi/embl/gff3tools/fasta/FASTAFile.java | 12 ++++++++++++ .../embl/gff3tools/fasta/FASTAFileReader.java | 12 ++++++++++++ .../ebi/embl/gff3tools/fasta/FastaHeader.java | 18 ++++++++++++++++++ .../embl/gff3tools/fasta/SequenceAccessor.java | 4 ++++ .../ac/ebi/embl/gff3tools/fasta/Topology.java | 6 ++++++ 5 files changed, 52 insertions(+) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java new file mode 100644 index 00000000..a69225bb --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java @@ -0,0 +1,12 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import lombok.Getter; +import lombok.Setter; + +@Getter +@Setter +public class FASTAFile { + String Id; //accessionNumber + FastaHeader header; + SequenceAccessor sequenceAccessor; +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java new file mode 100644 index 00000000..d58016aa --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java @@ -0,0 +1,12 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import java.io.File; + +public class FASTAFileReader { + + public FASTAFile readFile(File file){ + + + } + +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java new file mode 100644 index 00000000..21628b75 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import lombok.Getter; +import lombok.Setter; + +import java.util.Optional; + +@Getter +@Setter +public class FastaHeader { + String description; // mandatory (can be empty if you insist) + String moleculeType; // mandatory (can be null if empty allowed) + Topology topology; // mandatory (can be null if empty allowed) + Optional chromosomeType; // optional (open string unless you constrain) + Optional chromosomeLocation; // optional + Optional chromosomeName; // optional + // Not stored here: NCBITaxon (you said you’ll fetch it from BioSample) +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java new file mode 100644 index 00000000..4be1548c --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java @@ -0,0 +1,4 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +public class SequenceAccessor { +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java new file mode 100644 index 00000000..24901b37 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java @@ -0,0 +1,6 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +public enum Topology { + LINEAR, + CIRCULAR +} From 7cef9f0164f7be94d87fe4d3a32c1009d9ea1b97 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Mon, 17 Nov 2025 14:55:29 +0000 Subject: [PATCH 02/31] wip-sequence-reader --- .../embl/gff3tools/fasta/FASTAFileReader.java | 186 +++++++++++++++++- .../ebi/embl/gff3tools/fasta/FastaHeader.java | 3 +- .../gff3tools/fasta/SequenceAccessor.java | 38 ++++ .../gff3tools/fasta/FASTAFileReaderTest.java | 52 +++++ src/test/resources/fasta/example.txt | 24 +++ 5 files changed, 300 insertions(+), 3 deletions(-) create mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java create mode 100644 src/test/resources/fasta/example.txt diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java index d58016aa..3c77186f 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java @@ -1,12 +1,196 @@ package uk.ac.ebi.embl.gff3tools.fasta; import java.io.File; +import java.util.List; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.*; +import java.util.regex.Pattern; public class FASTAFileReader { - public FASTAFile readFile(File file){ + private static final ObjectMapper MAPPER = new ObjectMapper(); + + // normalize curly quotes / NBSP etc. + private static final Pattern CURLY_DOUBLE = Pattern.compile("[\u201C\u201D]"); + private static final Pattern CURLY_SINGLE = Pattern.compile("[\u2018\u2019]"); + private static final Pattern NBSP = Pattern.compile("\u00A0"); + + public List readFile(File file) { + List out = new ArrayList<>(); + + try (BufferedReader br = Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8)) { + String line; + long lineNo = -1; + + FASTAFile current = null; + int currentHeaderLine = -1; // for clarity + int currentSeqStartLine = -1; + + while ((line = br.readLine()) != null) { + lineNo++; + + if (line.isEmpty()) continue; + + if (line.charAt(0) == '>') { + // finalize previous record (sequence ends on the line before this header) + if (current != null && current.getSequenceAccessor() != null) { + current.getSequenceAccessor().setEndLine((int) (lineNo - 1)); + } + + // parse new header + int pipeIdx = line.indexOf('|'); + if (pipeIdx < 0) { + // No JSON? We'll still capture an ID and create an empty header. + String id = extractAccession(line.substring(1)); + FastaHeader header = new FastaHeader(); + header.setDescription(null); + header.setMoleculeType(null); + header.setTopology(null); + header.setChromosomeType(Optional.empty()); + header.setChromosomeLocation(Optional.empty()); + header.setChromosomeName(Optional.empty()); + + current = new FASTAFile(); + current.setId(id); + current.setHeader(header); + currentHeaderLine = (int) lineNo; + currentSeqStartLine = (int) lineNo + 1; + current.setSequenceAccessor(new SequenceAccessor(file, currentSeqStartLine, -1)); + out.add(current); + continue; + } + + String idPart = line.substring(1, pipeIdx); + String jsonPart = line.substring(pipeIdx + 1); + + String id = extractAccession(idPart); + FastaHeader header = parseHeaderJson(jsonPart); + + current = new FASTAFile(); + current.setId(id); + current.setHeader(header); + currentHeaderLine = (int) lineNo; + currentSeqStartLine = currentHeaderLine + 1; + + SequenceAccessor accessor = new SequenceAccessor(file, currentSeqStartLine, -1); + current.setSequenceAccessor(accessor); + out.add(current); + continue; + } + + // Any non-header line before we've seen a header is “NONSENSE”; ignore. + // Sequence lines are just skipped here; we only track their line numbers. + } + + // finalize last record at EOF + if (current != null && current.getSequenceAccessor() != null) { + current.getSequenceAccessor().setEndLine((int) lineNo); + } + + } catch (IOException e) { + throw new RuntimeException("Failed to read FASTA file: " + file, e); + } + + return out; + } + + private static String extractAccession(String betweenGtAndPipe) { + // Trim, then grab the FIRST token — “first accession number” as requested. + String trimmed = betweenGtAndPipe.trim(); + int space = trimmed.indexOf(' '); + return (space > 0) ? trimmed.substring(0, space) : trimmed; + } + + private static FastaHeader parseHeaderJson(String rawJson) { + String normalized = normalizeJson(rawJson); + + String description = null; + String moleculeType = null; + String topologyStr = null; + String chrType = null, chrLoc = null, chrName = null; + + try { + JsonNode node = MAPPER.readTree(normalized); + + // make a normalized key->value map (trim keys, lower-case, collapse [_ - space]) + Map norm = new HashMap<>(); + Iterator> it = node.fields(); + while (it.hasNext()) { + Map.Entry e = it.next(); + String key = e.getKey() == null ? "" : e.getKey(); + key = key.trim().toLowerCase(Locale.ROOT).replaceAll("[\\s_-]+", ""); + String val = e.getValue().isNull() ? null : e.getValue().asText(); + norm.put(key, val); + } + description = norm.get("description"); + moleculeType = firstNonNull(norm.get("moleculetype"), norm.get("moleculetype"), norm.get("moleculetype")); // defensive, yes it's the same key after normalization + if (moleculeType == null) moleculeType = norm.get("moleculetype"); // for safety + topologyStr = norm.get("topology"); + chrType = firstNonNull(norm.get("chromosometype"), norm.get("chromosometyp")); + chrLoc = norm.get("chromosomelocation"); + chrName = norm.get("chromosomename"); + + } catch (Exception ignore) { + // If the JSON is totally mangled, we leave fields null; better than exploding. + } + + FastaHeader header = new FastaHeader(); + header.setDescription(description); + header.setMoleculeType(moleculeType); + header.setTopology(parseTopology(topologyStr)); + + header.setChromosomeType(Optional.ofNullable(emptyToNull(chrType))); + header.setChromosomeLocation(Optional.ofNullable(emptyToNull(chrLoc))); + header.setChromosomeName(Optional.ofNullable(emptyToNull(chrName))); + + return header; } + private static String emptyToNull(String s) { + return (s == null || s.isEmpty()) ? null : s; + } + + private static String firstNonNull(String... vals) { + for (String v : vals) if (v != null) return v; + return null; + } + + private static Topology parseTopology(String s) { + if (s == null) return null; + String t = s.trim().toUpperCase(Locale.ROOT); + switch (t) { + case "LINEAR": return Topology.LINEAR; + case "CIRCULAR": return Topology.CIRCULAR; + default: return null; + } + } + + private static String normalizeJson(String s) { + if (s == null) return null; + String out = s.trim(); + + // replace curly quotes with straight quotes + out = CURLY_DOUBLE.matcher(out).replaceAll("\""); + out = CURLY_SINGLE.matcher(out).replaceAll("'"); + + // remove NBSP + out = NBSP.matcher(out).replaceAll(" "); + + // A tiny mercy: if header accidentally missed the opening brace, try to salvage + if (!out.startsWith("{") && out.contains("{")) { + out = out.substring(out.indexOf('{')); + } + + return out; + } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java index 21628b75..08b7115a 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java @@ -11,8 +11,7 @@ public class FastaHeader { String description; // mandatory (can be empty if you insist) String moleculeType; // mandatory (can be null if empty allowed) Topology topology; // mandatory (can be null if empty allowed) - Optional chromosomeType; // optional (open string unless you constrain) + Optional chromosomeType; // optional (doesnt have to be a json) Optional chromosomeLocation; // optional Optional chromosomeName; // optional - // Not stored here: NCBITaxon (you said you’ll fetch it from BioSample) } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java index 4be1548c..7372a60e 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java @@ -1,4 +1,42 @@ package uk.ac.ebi.embl.gff3tools.fasta; +import java.io.File; + public class SequenceAccessor { + private final File file; + private int startLine; // inclusive + private int endLine; // inclusive + + public SequenceAccessor(File file, int startLine, int endLine) { + this.file = file; + this.startLine = startLine; + this.endLine = endLine; + } + + public File getFile() { + return file; + } + + public int getStartLine() { + return startLine; + } + + public void setStartLine(int startLine) { + this.startLine = startLine; + } + + public int getEndLine() { + return endLine; + } + + public void setEndLine(int endLine) { + this.endLine = endLine; + } + + /** Placeholder for later: return approximate length from stored lines (ignores line breaks). */ + public long lengthApprox() { + if (endLine < startLine) return 0; + return (long) (endLine - startLine + 1); // lines count, not bases (for now) + } } + diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java new file mode 100644 index 00000000..ce71b65e --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java @@ -0,0 +1,52 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.net.URI; +import java.nio.file.Paths; +import java.util.List; +import java.util.Objects; + +import static org.junit.jupiter.api.Assertions.*; + +public class FASTAFileReaderTest { + + @Test + void readsExampleAndParsesIdsAndHeaderJson() throws Exception { + URI uri = Objects.requireNonNull( + getClass().getClassLoader().getResource("fasta/example.txt"), + "Test resource fasta/example.txt is missing" + ).toURI(); + File file = Paths.get(uri).toFile(); + + FASTAFileReader reader = new FASTAFileReader(); + List records = reader.readFile(file); + + // We expect two records (your two headers), not counting the "NONSENSE" lines. + assertEquals(2, records.size(), "Should parse two FASTA records"); + + // ---- Record 1 ---- + FASTAFile r1 = records.get(0); + assertEquals("AF123456.1", r1.getId(), + "Accession should be the first token between '>' and '|' (trimmed)"); + FastaHeader h1 = r1.getHeader(); + assertNotNull(h1, "Header must be present"); + assertEquals("Pinus sativa isolate xyz, complete mitochondrion", h1.getDescription()); + assertEquals("genomic", h1.getMoleculeType()); + assertEquals(Topology.CIRCULAR, h1.getTopology()); + assertTrue(h1.getChromosomeType().isEmpty()); + assertTrue(h1.getChromosomeLocation().isEmpty()); + assertTrue(h1.getChromosomeName().isEmpty()); + + // ---- Record 2 ---- + FASTAFile r2 = records.get(1); + assertEquals("AF123455.2", r2.getId(), + "Second accession should be parsed the same way"); + FastaHeader h2 = r2.getHeader(); + assertNotNull(h2); + assertEquals("Pinus sativa isolate xyz, complete mitochondrion", h2.getDescription()); + assertEquals("genomic", h2.getMoleculeType()); + assertEquals(Topology.CIRCULAR, h2.getTopology()); + } +} diff --git a/src/test/resources/fasta/example.txt b/src/test/resources/fasta/example.txt new file mode 100644 index 00000000..8079c4e9 --- /dev/null +++ b/src/test/resources/fasta/example.txt @@ -0,0 +1,24 @@ + +NONSENSE +NONSENSE + +>AF123456.1 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGCATATTACAGTTGAGTGCCTCGACTTAGATTGCAATATAAGCGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA +>AF123455.2 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA From 595aa82c2329abd999f336f42adc921014544263 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 19 Nov 2025 16:27:46 +0000 Subject: [PATCH 03/31] wip --- .../exception/FastaReadException.java | 60 +++++++++++++++++++ .../fasta/{FASTAFile.java => FastaEntry.java} | 3 +- ...{FASTAFileReader.java => FastaReader.java} | 23 ++++--- .../fasta/SequentialFastaEntryReader.java | 44 ++++++++++++++ .../gff3tools/fasta/FASTAFileReaderTest.java | 8 +-- 5 files changed, 120 insertions(+), 18 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java rename src/main/java/uk/ac/ebi/embl/gff3tools/fasta/{FASTAFile.java => FastaEntry.java} (71%) rename src/main/java/uk/ac/ebi/embl/gff3tools/fasta/{FASTAFileReader.java => FastaReader.java} (92%) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java new file mode 100644 index 00000000..9ff12de6 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java @@ -0,0 +1,60 @@ +package uk.ac.ebi.embl.gff3tools.exception; + +public class FastaReadException extends Exception { + + private final String fastaFilePath; + private final String line; // can be null + + /** + * Creates a new FastaReadingException with a message, file path, line content and cause. + * + * @param message description of the error + * @param fastaFilePath path to the FASTA file being read + * @param line the line that caused the error (can be null) + * @param cause the underlying cause (can be null) + */ + public FastaReadException(String message, + String fastaFilePath, + String line, + Throwable cause) { + super(message, cause); + this.fastaFilePath = fastaFilePath; + this.line = line; + } + + /** + * Creates a new FastaReadingException with a message, file path and line content. + * + * @param message description of the error + * @param fastaFilePath path to the FASTA file being read + * @param line the line that caused the error (can be null) + */ + public FastaReadException(String message, + String fastaFilePath, + String line) { + super(message); + this.fastaFilePath = fastaFilePath; + this.line = line; + } + + public String getFastaFilePath() { + return fastaFilePath; + } + + /** + * @return the problematic line content, or null if not available + */ + public String getLine() { + return line; + } + + @Override + public String toString() { + return "FastaReadingException{" + + "message='" + getMessage() + '\'' + + ", fastaFilePath='" + fastaFilePath + '\'' + + ", line=" + (line == null ? "null" : "'" + line + "'") + + '}'; + } +} + diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java similarity index 71% rename from src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java rename to src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index a69225bb..782d0da9 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFile.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -5,8 +5,7 @@ @Getter @Setter -public class FASTAFile { +public class FastaEntry { String Id; //accessionNumber FastaHeader header; - SequenceAccessor sequenceAccessor; } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java similarity index 92% rename from src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java rename to src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java index 3c77186f..daf37228 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java @@ -5,16 +5,16 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; import java.io.BufferedReader; -import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.*; import java.util.regex.Pattern; -public class FASTAFileReader { +public class FastaReader { private static final ObjectMapper MAPPER = new ObjectMapper(); @@ -23,14 +23,14 @@ public class FASTAFileReader { private static final Pattern CURLY_SINGLE = Pattern.compile("[\u2018\u2019]"); private static final Pattern NBSP = Pattern.compile("\u00A0"); - public List readFile(File file) { - List out = new ArrayList<>(); + public List readFile(File file) { + List out = new ArrayList<>(); try (BufferedReader br = Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8)) { String line; long lineNo = -1; - FASTAFile current = null; + FastaEntry current = null; int currentHeaderLine = -1; // for clarity int currentSeqStartLine = -1; @@ -58,7 +58,7 @@ public List readFile(File file) { header.setChromosomeLocation(Optional.empty()); header.setChromosomeName(Optional.empty()); - current = new FASTAFile(); + current = new FastaEntry(); current.setId(id); current.setHeader(header); currentHeaderLine = (int) lineNo; @@ -74,7 +74,7 @@ public List readFile(File file) { String id = extractAccession(idPart); FastaHeader header = parseHeaderJson(jsonPart); - current = new FASTAFile(); + current = new FastaEntry(); current.setId(id); current.setHeader(header); currentHeaderLine = (int) lineNo; @@ -102,14 +102,13 @@ public List readFile(File file) { return out; } - private static String extractAccession(String betweenGtAndPipe) { - // Trim, then grab the FIRST token — “first accession number” as requested. + private static String extractAccession(String betweenGtAndPipe, int linenumber) { String trimmed = betweenGtAndPipe.trim(); int space = trimmed.indexOf(' '); return (space > 0) ? trimmed.substring(0, space) : trimmed; } - private static FastaHeader parseHeaderJson(String rawJson) { + private static FastaHeader parseHeaderJson(String rawJson, int linenumber) { String normalized = normalizeJson(rawJson); String description = null; @@ -141,7 +140,7 @@ private static FastaHeader parseHeaderJson(String rawJson) { chrName = norm.get("chromosomename"); } catch (Exception ignore) { - // If the JSON is totally mangled, we leave fields null; better than exploding. + throw new FastaReadException("Failed to read FASTA file header") } FastaHeader header = new FastaHeader(); @@ -165,7 +164,7 @@ private static String firstNonNull(String... vals) { return null; } - private static Topology parseTopology(String s) { + private static Topology parseTopology(String s) { //TODO move to Topology if (s == null) return null; String t = s.trim().toUpperCase(Locale.ROOT); switch (t) { diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java new file mode 100644 index 00000000..6d5814ff --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java @@ -0,0 +1,44 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; +import uk.ac.ebi.embl.gff3tools.validation.ValidationRegistry; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.nio.channels.FileChannel; + +/* + IMPORTANT: This file reader works with the assumption that the file is UTF-8. If it isn't, expect garbage. //TODO verify if this edge case ever actually happens, it shouldnt but if I need a special exception for it I should know + */ +public class SequentialFastaEntryReader { //todo: ignore the fact of whether this is a Singleton or needs a Manager to manage channels to different file channels FOR NOW + + File multiFastaFile; + private final FileChannel channel; + + String currentId; //accessionNumber + FastaHeader header; + + + public SequentialFastaEntryReader(File file) { + String reason = null; + if (file == null) { + throw new IllegalStateException("Inputted FASTA file object is null."); + }else if (!file.exists()) { + throw new IllegalArgumentException("File does not exist:" + file.getAbsolutePath()); + } else if (file.isDirectory()) { + throw new IllegalArgumentException("Path is a directory, not a regular file:" + file.getAbsolutePath()); + } else if (!file.canRead()) { + throw new IllegalArgumentException("Read permission denied for file:" + file.getAbsolutePath()); + } + + this.multiFastaFile = file; + this.channel = new FileInputStream(file).getChannel(); + } + + public void goToNextFastaEntry() { + + } + + +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java index ce71b65e..643147bd 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java @@ -20,14 +20,14 @@ void readsExampleAndParsesIdsAndHeaderJson() throws Exception { ).toURI(); File file = Paths.get(uri).toFile(); - FASTAFileReader reader = new FASTAFileReader(); - List records = reader.readFile(file); + FastaReader reader = new FastaReader(); + List records = reader.readFile(file); // We expect two records (your two headers), not counting the "NONSENSE" lines. assertEquals(2, records.size(), "Should parse two FASTA records"); // ---- Record 1 ---- - FASTAFile r1 = records.get(0); + FastaEntry r1 = records.get(0); assertEquals("AF123456.1", r1.getId(), "Accession should be the first token between '>' and '|' (trimmed)"); FastaHeader h1 = r1.getHeader(); @@ -40,7 +40,7 @@ void readsExampleAndParsesIdsAndHeaderJson() throws Exception { assertTrue(h1.getChromosomeName().isEmpty()); // ---- Record 2 ---- - FASTAFile r2 = records.get(1); + FastaEntry r2 = records.get(1); assertEquals("AF123455.2", r2.getId(), "Second accession should be parsed the same way"); FastaHeader h2 = r2.getHeader(); From 7d2d09902cf0d89273727c3d1421b937244fecce Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 26 Nov 2025 16:40:43 +0000 Subject: [PATCH 04/31] ENA-6648-fasta-reader --- .../exception/FastaReadException.java | 58 +---- .../ebi/embl/gff3tools/fasta/FastaEntry.java | 6 +- .../ebi/embl/gff3tools/fasta/FastaReader.java | 195 ----------------- .../gff3tools/fasta/JsonHeaderParser.java | 70 ++++++ .../embl/gff3tools/fasta/ParsedHeader.java | 9 + .../gff3tools/fasta/SequenceAccessor.java | 42 ---- .../fasta/SequentialFastaEntryReader.java | 202 ++++++++++++++++-- .../fasta/sequenceutils/ByteSpan.java | 7 + .../fasta/sequenceutils/LineEntry.java | 16 ++ .../fasta/sequenceutils/SequenceAlphabet.java | 16 ++ .../fasta/sequenceutils/SequenceIndex.java | 115 ++++++++++ .../gff3tools/fasta/FASTAFileReaderTest.java | 4 +- .../gff3tools/fasta/JsonHeaderParserTest.java | 196 +++++++++++++++++ .../sequenceutils/SequenceIndexTest.java | 115 ++++++++++ 14 files changed, 737 insertions(+), 314 deletions(-) delete mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java delete mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java create mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java create mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java index 9ff12de6..09459f79 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java @@ -2,59 +2,9 @@ public class FastaReadException extends Exception { - private final String fastaFilePath; - private final String line; // can be null - - /** - * Creates a new FastaReadingException with a message, file path, line content and cause. - * - * @param message description of the error - * @param fastaFilePath path to the FASTA file being read - * @param line the line that caused the error (can be null) - * @param cause the underlying cause (can be null) - */ - public FastaReadException(String message, - String fastaFilePath, - String line, - Throwable cause) { - super(message, cause); - this.fastaFilePath = fastaFilePath; - this.line = line; - } - - /** - * Creates a new FastaReadingException with a message, file path and line content. - * - * @param message description of the error - * @param fastaFilePath path to the FASTA file being read - * @param line the line that caused the error (can be null) - */ - public FastaReadException(String message, - String fastaFilePath, - String line) { - super(message); - this.fastaFilePath = fastaFilePath; - this.line = line; - } - - public String getFastaFilePath() { - return fastaFilePath; - } - - /** - * @return the problematic line content, or null if not available - */ - public String getLine() { - return line; - } - - @Override - public String toString() { - return "FastaReadingException{" + - "message='" + getMessage() + '\'' + - ", fastaFilePath='" + fastaFilePath + '\'' + - ", line=" + (line == null ? "null" : "'" + line + "'") + - '}'; - } + public FastaReadException() {} + public FastaReadException(String message) { super(message); } + public FastaReadException(Throwable cause) { super(cause); } + public FastaReadException(String message, Throwable cause) { super(message, cause); } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index 782d0da9..2beba413 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -6,6 +6,10 @@ @Getter @Setter public class FastaEntry { - String Id; //accessionNumber + String id; // submissionNumber or accessionNumber FastaHeader header; + + long fastaStart; // position of '>' in the file + long sequenceStart; // first allowed base after header (absolute byte offset) + long sequenceEnd; // last allowed base before next header (absolute byte offset) } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java deleted file mode 100644 index daf37228..00000000 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java +++ /dev/null @@ -1,195 +0,0 @@ -package uk.ac.ebi.embl.gff3tools.fasta; - -import java.io.File; -import java.util.List; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.util.*; -import java.util.regex.Pattern; - -public class FastaReader { - - private static final ObjectMapper MAPPER = new ObjectMapper(); - - // normalize curly quotes / NBSP etc. - private static final Pattern CURLY_DOUBLE = Pattern.compile("[\u201C\u201D]"); - private static final Pattern CURLY_SINGLE = Pattern.compile("[\u2018\u2019]"); - private static final Pattern NBSP = Pattern.compile("\u00A0"); - - public List readFile(File file) { - List out = new ArrayList<>(); - - try (BufferedReader br = Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8)) { - String line; - long lineNo = -1; - - FastaEntry current = null; - int currentHeaderLine = -1; // for clarity - int currentSeqStartLine = -1; - - while ((line = br.readLine()) != null) { - lineNo++; - - if (line.isEmpty()) continue; - - if (line.charAt(0) == '>') { - // finalize previous record (sequence ends on the line before this header) - if (current != null && current.getSequenceAccessor() != null) { - current.getSequenceAccessor().setEndLine((int) (lineNo - 1)); - } - - // parse new header - int pipeIdx = line.indexOf('|'); - if (pipeIdx < 0) { - // No JSON? We'll still capture an ID and create an empty header. - String id = extractAccession(line.substring(1)); - FastaHeader header = new FastaHeader(); - header.setDescription(null); - header.setMoleculeType(null); - header.setTopology(null); - header.setChromosomeType(Optional.empty()); - header.setChromosomeLocation(Optional.empty()); - header.setChromosomeName(Optional.empty()); - - current = new FastaEntry(); - current.setId(id); - current.setHeader(header); - currentHeaderLine = (int) lineNo; - currentSeqStartLine = (int) lineNo + 1; - current.setSequenceAccessor(new SequenceAccessor(file, currentSeqStartLine, -1)); - out.add(current); - continue; - } - - String idPart = line.substring(1, pipeIdx); - String jsonPart = line.substring(pipeIdx + 1); - - String id = extractAccession(idPart); - FastaHeader header = parseHeaderJson(jsonPart); - - current = new FastaEntry(); - current.setId(id); - current.setHeader(header); - currentHeaderLine = (int) lineNo; - currentSeqStartLine = currentHeaderLine + 1; - - SequenceAccessor accessor = new SequenceAccessor(file, currentSeqStartLine, -1); - current.setSequenceAccessor(accessor); - out.add(current); - continue; - } - - // Any non-header line before we've seen a header is “NONSENSE”; ignore. - // Sequence lines are just skipped here; we only track their line numbers. - } - - // finalize last record at EOF - if (current != null && current.getSequenceAccessor() != null) { - current.getSequenceAccessor().setEndLine((int) lineNo); - } - - } catch (IOException e) { - throw new RuntimeException("Failed to read FASTA file: " + file, e); - } - - return out; - } - - private static String extractAccession(String betweenGtAndPipe, int linenumber) { - String trimmed = betweenGtAndPipe.trim(); - int space = trimmed.indexOf(' '); - return (space > 0) ? trimmed.substring(0, space) : trimmed; - } - - private static FastaHeader parseHeaderJson(String rawJson, int linenumber) { - String normalized = normalizeJson(rawJson); - - String description = null; - String moleculeType = null; - String topologyStr = null; - String chrType = null, chrLoc = null, chrName = null; - - try { - JsonNode node = MAPPER.readTree(normalized); - - // make a normalized key->value map (trim keys, lower-case, collapse [_ - space]) - Map norm = new HashMap<>(); - Iterator> it = node.fields(); - while (it.hasNext()) { - Map.Entry e = it.next(); - String key = e.getKey() == null ? "" : e.getKey(); - key = key.trim().toLowerCase(Locale.ROOT).replaceAll("[\\s_-]+", ""); - String val = e.getValue().isNull() ? null : e.getValue().asText(); - norm.put(key, val); - } - - description = norm.get("description"); - moleculeType = firstNonNull(norm.get("moleculetype"), norm.get("moleculetype"), norm.get("moleculetype")); // defensive, yes it's the same key after normalization - if (moleculeType == null) moleculeType = norm.get("moleculetype"); // for safety - - topologyStr = norm.get("topology"); - chrType = firstNonNull(norm.get("chromosometype"), norm.get("chromosometyp")); - chrLoc = norm.get("chromosomelocation"); - chrName = norm.get("chromosomename"); - - } catch (Exception ignore) { - throw new FastaReadException("Failed to read FASTA file header") - } - - FastaHeader header = new FastaHeader(); - header.setDescription(description); - header.setMoleculeType(moleculeType); - header.setTopology(parseTopology(topologyStr)); - - header.setChromosomeType(Optional.ofNullable(emptyToNull(chrType))); - header.setChromosomeLocation(Optional.ofNullable(emptyToNull(chrLoc))); - header.setChromosomeName(Optional.ofNullable(emptyToNull(chrName))); - - return header; - } - - private static String emptyToNull(String s) { - return (s == null || s.isEmpty()) ? null : s; - } - - private static String firstNonNull(String... vals) { - for (String v : vals) if (v != null) return v; - return null; - } - - private static Topology parseTopology(String s) { //TODO move to Topology - if (s == null) return null; - String t = s.trim().toUpperCase(Locale.ROOT); - switch (t) { - case "LINEAR": return Topology.LINEAR; - case "CIRCULAR": return Topology.CIRCULAR; - default: return null; - } - } - - private static String normalizeJson(String s) { - if (s == null) return null; - String out = s.trim(); - - // replace curly quotes with straight quotes - out = CURLY_DOUBLE.matcher(out).replaceAll("\""); - out = CURLY_SINGLE.matcher(out).replaceAll("'"); - - // remove NBSP - out = NBSP.matcher(out).replaceAll(" "); - - // A tiny mercy: if header accidentally missed the opening brace, try to salvage - if (!out.startsWith("{") && out.contains("{")) { - out = out.substring(out.indexOf('{')); - } - - return out; - } -} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java new file mode 100644 index 00000000..487169ff --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java @@ -0,0 +1,70 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.util.*; + +public class JsonHeaderParser { + private static final ObjectMapper MAPPER = new ObjectMapper(); + + public ParsedHeader parse(String headerLine) throws IOException { + String rest = headerLine.substring(1); // headerLine starts with '>' + int pipe = rest.indexOf('|'); + String idPart = (pipe >= 0 ? rest.substring(0, pipe) : rest).trim(); + String id = idPart.isEmpty() ? "" : idPart.split("\\s+")[0]; + + FastaHeader h = new FastaHeader(); + h.setChromosomeType(Optional.empty()); + h.setChromosomeLocation(Optional.empty()); + h.setChromosomeName(Optional.empty()); + + if (pipe >= 0) { + fillFromJson(rest.substring(pipe + 1).trim(), h); // may throw IOException + } + return new ParsedHeader(id, h); + } + + private static void fillFromJson(String raw, FastaHeader h) throws IOException { + if (raw == null || raw.isEmpty()) return; + + // Normalize curly quotes / NBSPs but keep the final JSON we actually tried to parse + String normalized = raw.replace('\u201C','"').replace('\u201D','"') + .replace('\u2018','\'').replace('\u2019','\'') + .replace('\u00A0',' ').trim(); + try { + JsonNode node = MAPPER.readTree(normalized); + Map m = new HashMap<>(); + node.fields().forEachRemaining(e -> { + String k = e.getKey()==null?"":e.getKey(); + k = k.trim().toLowerCase(Locale.ROOT).replaceAll("[\\s_-]+",""); + String v = e.getValue().isNull()?null:e.getValue().asText(); + m.put(k, v); + }); + h.setDescription(m.get("description")); + h.setMoleculeType(m.get("moleculetype")); + h.setTopology(parseTopology(m.get("topology"))); + if (m.containsKey("chromosometype")) + h.setChromosomeType(Optional.ofNullable(emptyToNull(m.get("chromosometype")))); + if (m.containsKey("chromosomelocation")) + h.setChromosomeLocation(Optional.ofNullable(emptyToNull(m.get("chromosomelocation")))); + if (m.containsKey("chromosomename")) + h.setChromosomeName(Optional.ofNullable(emptyToNull(m.get("chromosomename")))); + } catch (IOException e) { + // explode, and include the JSON we tried to parse + throw new IOException("Malformed FASTA header JSON: " + normalized, e); + } + } + + private static String emptyToNull(String s){ return (s==null||s.isEmpty())?null:s; } + + private static Topology parseTopology(String s){ + if (s==null) return null; + switch (s.trim().toUpperCase(Locale.ROOT)){ + case "LINEAR": return Topology.LINEAR; + case "CIRCULAR": return Topology.CIRCULAR; + default: return null; + } + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java new file mode 100644 index 00000000..0a5e5144 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java @@ -0,0 +1,9 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import lombok.Value; + +@Value +public class ParsedHeader { + String id; + FastaHeader header; +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java deleted file mode 100644 index 7372a60e..00000000 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceAccessor.java +++ /dev/null @@ -1,42 +0,0 @@ -package uk.ac.ebi.embl.gff3tools.fasta; - -import java.io.File; - -public class SequenceAccessor { - private final File file; - private int startLine; // inclusive - private int endLine; // inclusive - - public SequenceAccessor(File file, int startLine, int endLine) { - this.file = file; - this.startLine = startLine; - this.endLine = endLine; - } - - public File getFile() { - return file; - } - - public int getStartLine() { - return startLine; - } - - public void setStartLine(int startLine) { - this.startLine = startLine; - } - - public int getEndLine() { - return endLine; - } - - public void setEndLine(int endLine) { - this.endLine = endLine; - } - - /** Placeholder for later: return approximate length from stored lines (ignores line breaks). */ - public long lengthApprox() { - if (endLine < startLine) return 0; - return (long) (endLine - startLine + 1); // lines count, not bases (for now) - } -} - diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java index 6d5814ff..a7cd7ce9 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java @@ -1,44 +1,204 @@ package uk.ac.ebi.embl.gff3tools.fasta; import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; -import uk.ac.ebi.embl.gff3tools.validation.ValidationRegistry; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.charset.StandardCharsets; +import java.util.OptionalLong; -/* - IMPORTANT: This file reader works with the assumption that the file is UTF-8. If it isn't, expect garbage. //TODO verify if this edge case ever actually happens, it shouldnt but if I need a special exception for it I should know - */ -public class SequentialFastaEntryReader { //todo: ignore the fact of whether this is a Singleton or needs a Manager to manage channels to different file channels FOR NOW +public class SequentialFastaEntryReader implements AutoCloseable { + + private static final int BUF_SIZE = 64 * 1024; + private static final byte GT = (byte) '>'; + private static final byte LF = (byte) '\n'; - File multiFastaFile; private final FileChannel channel; + private final long fileSize; //size of file in bytes + + private final JsonHeaderParser headerParser; + private final SequenceAlphabet alphabet; + private FastaEntry current; - String currentId; //accessionNumber - FastaHeader header; + public SequentialFastaEntryReader(File file) throws FileNotFoundException { + this(file, new JsonHeaderParser(), SequenceAlphabet.defaultNucleotideAlphabet()); + } + public SequentialFastaEntryReader(File file, JsonHeaderParser parser, SequenceAlphabet alphabet) throws FileNotFoundException { + if (file == null) throw new IllegalStateException("Input FASTA file is null"); + if (!file.exists()) throw new FileNotFoundException(file.getAbsolutePath()); + if (file.isDirectory()) throw new FileNotFoundException("Directory: " + file.getAbsolutePath()); + if (!file.canRead()) throw new IllegalArgumentException("No read permission: " + file.getAbsolutePath()); - public SequentialFastaEntryReader(File file) { - String reason = null; - if (file == null) { - throw new IllegalStateException("Inputted FASTA file object is null."); - }else if (!file.exists()) { - throw new IllegalArgumentException("File does not exist:" + file.getAbsolutePath()); - } else if (file.isDirectory()) { - throw new IllegalArgumentException("Path is a directory, not a regular file:" + file.getAbsolutePath()); - } else if (!file.canRead()) { - throw new IllegalArgumentException("Read permission denied for file:" + file.getAbsolutePath()); + this.headerParser = parser; + this.alphabet = alphabet; + + try { + this.channel = new FileInputStream(file).getChannel(); + this.fileSize = channel.size(); + } catch (IOException e) { + throw new RuntimeException("Open channel failed", e); } + } + + public void close() throws IOException { channel.close(); } + public boolean readingFile() { return channel.isOpen(); } + public FastaEntry getCurrentEntry() { return current; } + + public boolean readNext() throws FastaReadException { + try { + long startPos = channel.position(); + OptionalLong headerPos = goToNextFastaEntry(); + if (headerPos.isPresent() || !peekIsGT(startPos)) return false; + + //parse id & header json + String headerLine = readAsciiLine(); + if (headerLine == null) return false; + ParsedHeader ph = headerParser.parse(headerLine); + //find the start & end bytes of the sequence of the current fasta entry + ScanResult sr = findSequenceLimits(); + + FastaEntry e = new FastaEntry(); + e.setId(ph.getId()); + e.setHeader(ph.getHeader()); + e.setFastaStart(headerPos.getAsLong()); + e.setSequenceStart(sr.firstBase); + e.setSequenceEnd(sr.lastBase); - this.multiFastaFile = file; - this.channel = new FileInputStream(file).getChannel(); + this.current = e; + return true; + + } catch (IOException io) { + throw new FastaReadException("I/O while reading FASTA", io); + } } - public void goToNextFastaEntry() { + // ---- private helpers (the only code allowed to touch channel position) ---- + + private OptionalLong goToNextFastaEntry() throws IOException { //TODO modify so that the > has to be the first character in line + long currentPosition = channel.position(), originalPosition = currentPosition; + if (currentPosition >= fileSize) return OptionalLong.empty(); + + ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); //read the next chunk of the file + while (currentPosition < fileSize) { + buf.clear(); + int minToRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); + buf.limit(minToRead); + int numberOfBytesRead = channel.read(buf, currentPosition); + if (numberOfBytesRead <= 0) break; //no bytes were read, probably paranoid + + buf.flip(); + while (buf.hasRemaining()) { + if (buf.get() == GT) { + long potentialFastaStart = currentPosition + buf.position() - 1; + if(peekIsEndOfLine(potentialFastaStart - 1)) { + channel.position(potentialFastaStart); //put the channel position on the new greater than + return OptionalLong.of(potentialFastaStart); + } + } + } + currentPosition += numberOfBytesRead; + } + channel.position(originalPosition); //if we didn't find new fasta entry start, go back to original position + return OptionalLong.empty(); } + /* Just checks if the character at the given position equals '>' char, does not move the channel.position() */ + private boolean peekIsEndOfLine(long position) throws IOException { + if (position >= fileSize) return false; + ByteBuffer one = ByteBuffer.allocate(1); + int n = channel.read(one, position); + return n == 1 && one.get(0) == LF; + } + /* Just checks if the character at the given position equals '>' char, does not move the channel.position() */ + private boolean peekIsGT(long position) throws IOException { + if (position >= fileSize) return false; + ByteBuffer one = ByteBuffer.allocate(1); + int n = channel.read(one, position); + return n == 1 && one.get(0) == GT; + } + + private String readAsciiLine() throws IOException { + if (channel.position() >= fileSize) return null; + + StringBuilder sb = new StringBuilder(256); + ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); + long currentPosition = channel.position(); + + while (currentPosition < fileSize) { + buf.clear(); + int toRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); + buf.limit(toRead); + + int numberOfBytesRead = channel.read(buf, currentPosition); + if (numberOfBytesRead <= 0) break; + + buf.flip(); + + //find end of line + int lfIndex = -1; + for (int i = 0; i < buf.remaining(); i++) { + if (buf.get(buf.position() + i) == LF) { lfIndex = i; break; } + } + + if (lfIndex >= 0) { //if there is an "\n", read it and position the channel at the end of index + byte[] chunk = new byte[lfIndex]; + buf.get(chunk); + sb.append(new String(chunk, StandardCharsets.US_ASCII)); + buf.get(); // consume LF + channel.position(currentPosition + lfIndex + 1); //skip separator + int len = sb.length(); + if (len>0 && sb.charAt(len-1)=='\r') sb.setLength(len-1); + return sb.toString(); + } else { //otherwise read entire chunk, but this is unlikely to happen as the buffer should be large enough + byte[] chunk = new byte[buf.remaining()]; + buf.get(chunk); + sb.append(new String(chunk, StandardCharsets.US_ASCII)); + currentPosition += numberOfBytesRead; + } + } + channel.position(fileSize); + return sb.toString(); + } + + private static final class ScanResult { + final long firstBase, lastBase, nextHeader; + ScanResult(long f, long l, long n){ firstBase=f; lastBase=l; nextHeader=n; } + } + + private ScanResult findSequenceLimits() throws IOException { + long currentPosition = channel.position(); + long first = -1, last = -1, nextHdr = fileSize; + + ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); + + outer: + while (currentPosition < fileSize) { + buf.clear(); + int toRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); + buf.limit(toRead); + int n = channel.read(buf, currentPosition); + if (n<=0) break; + buf.flip(); + while (buf.hasRemaining()) { + byte b = buf.get(); + long abs = currentPosition + buf.position() - 1; + if (b == GT) { nextHdr = abs; break outer; } + if (alphabet.isAllowed(b)) { + if (first < 0) first = abs; + last = abs; + } + } + currentPosition += n; + } + channel.position(nextHdr); + return new ScanResult(first, last, nextHdr); + } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java new file mode 100644 index 00000000..afa3e96b --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +final class ByteSpan { + final long start; // inclusive + final long endEx; // exclusive + ByteSpan(long s, long e) { this.start = s; this.endEx = e; } +} \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java new file mode 100644 index 00000000..07460da1 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java @@ -0,0 +1,16 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +public final class LineEntry { + long baseStart; // 1-based base index at line start (inclusive) + long baseEnd; // 1-based base index at line end (inclusive) + long byteStart; // absolute byte offset of first base in the line + long byteEndExclusive;// absolute byte offset just after the last base + + LineEntry(long bStart, long bEnd, long byStart, long byEndEx) { + this.baseStart = bStart; + this.baseEnd = bEnd; + this.byteStart = byStart; + this.byteEndExclusive = byEndEx; + } +} + diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java new file mode 100644 index 00000000..a341f52b --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java @@ -0,0 +1,16 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +public final class SequenceAlphabet { + private final boolean[] allowed = new boolean[128]; + public SequenceAlphabet(String chars) { + for (char c: chars.toCharArray()) if (c<128) allowed[c]=true; + allowed['>']=false; + } + public boolean isAllowed(byte b){ + int i=b&0xFF; + return i<128 && allowed[i]; + } + public static SequenceAlphabet defaultNucleotideAlphabet() { + return new SequenceAlphabet("ACGTURYSWKMBDHVNacgturyswkmbdhvn-.*"); + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java new file mode 100644 index 00000000..343dc2ad --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java @@ -0,0 +1,115 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +public final class SequenceIndex { + final long firstBaseByte; // absolute byte offset of the first base (inclusive), -1 if none + final long lastBaseByte; // absolute byte offset of the last base (inclusive), -1 if none + final java.util.List lines; // sorted by baseStart + + public SequenceIndex(long firstBaseByte, long lastBaseByte, java.util.List lines) { + this.firstBaseByte = firstBaseByte; + this.lastBaseByte = lastBaseByte; + this.lines = java.util.Collections.unmodifiableList(lines); + } + + long totalBases() { + if (lines.isEmpty()) return 0; + return lines.get(lines.size()-1).baseEnd; + } + + /** Return one or more byte spans covering [fromBase..toBase], inclusive. */ + java.util.List byteSpansForBaseRange(long fromBase, long toBase) { + if (fromBase < 1 || toBase < fromBase) throw new IllegalArgumentException("bad base range"); + if (lines.isEmpty()) return java.util.List.of(); + + int i = findLineByBase(fromBase); + int j = findLineByBase(toBase); + + java.util.ArrayList out = new java.util.ArrayList<>(Math.max(1, j - i + 1)); + for (int k = i; k <= j; k++) { + LineEntry L = lines.get(k); + long startBase = Math.max(fromBase, L.baseStart); + long endBase = Math.min(toBase, L.baseEnd); + + long offsetStartInLine = startBase - L.baseStart; // 0-based + long offsetEndInLineEx = (endBase - L.baseStart) + 1; // exclusive + + long byteStart = L.byteStart + offsetStartInLine; // ASCII 1 byte/base + long byteEndEx = L.byteStart + offsetEndInLineEx; // exclusive + out.add(new ByteSpan(byteStart, byteEndEx)); + } + return out; + } + + /** Naive in-place index adjustment after deleting [fromBase..toBase] (inclusive). */ + void applyDeletion(long fromBase, long toBase) { + if (fromBase < 1 || toBase < fromBase) throw new IllegalArgumentException("bad base range"); + if (lines.isEmpty()) return; + + long deltaBases = (toBase - fromBase + 1); + long deltaBytes = deltaBases; + + int first = findLineByBase(fromBase); + int last = findLineByBase(toBase); + + // Adjust partially affected first/last lines, remove fully-eaten ones, and shift the rest. + java.util.List mutable = new java.util.ArrayList<>(lines); + // Trim front + LineEntry Lf = mutable.get(first); + if (fromBase > Lf.baseStart) { + // delete tail portion in first line + long cut = Math.min(deltaBases, Lf.baseEnd - fromBase + 1); + Lf.baseEnd -= cut; + Lf.byteEndExclusive -= cut; + } else { + // delete whole first line (or will be eaten by later logic) + } + // Trim back + LineEntry Ll = mutable.get(last); + if (toBase < Ll.baseEnd) { + long cut = Math.min(deltaBases, Ll.baseEnd - toBase); + // delete head portion in last line + long newBaseStart = toBase + 1; + long newByteStart = Ll.byteStart + (newBaseStart - Ll.baseStart); + Ll.baseStart = newBaseStart - deltaBases; + Ll.byteStart = newByteStart - deltaBytes; + } else { + // will be shifted/removed below + } + + // Remove any lines whose base range collapsed + mutable.removeIf(le -> le.baseEnd < le.baseStart); + + // Shift all lines strictly after the deletion by (-delta) + for (int idx = 0; idx < mutable.size(); idx++) { + LineEntry L = mutable.get(idx); + if (L.baseStart > toBase) { + L.baseStart -= deltaBases; + L.baseEnd -= deltaBases; + L.byteStart -= deltaBytes; + L.byteEndExclusive -= deltaBytes; + } + } + + // Re-freeze as unmodifiable + lines.clear(); // if you stored unmodifiable above, switch to a mutable field or a builder + lines.addAll(java.util.Collections.unmodifiableList(mutable)); + // Note: first/last base bytes would also shift by -deltaBytes if deletion occurs before them. + // You can recompute from lines when needed. + } + + private int findLineByBase(long base) { + int lo = 0, hi = lines.size()-1, ans = hi; + while (lo <= hi) { + int mid = (lo + hi) >>> 1; + LineEntry L = lines.get(mid); + if (base < L.baseStart) { hi = mid - 1; } + else if (base > L.baseEnd) { lo = mid + 1; } + else { return mid; } // inside + ans = lo; // insertion point + } + return Math.max(0, Math.min(ans, lines.size()-1)); + } +} + + + diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java index 643147bd..a556f5f4 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java @@ -11,7 +11,7 @@ import static org.junit.jupiter.api.Assertions.*; public class FASTAFileReaderTest { - +/* @Test void readsExampleAndParsesIdsAndHeaderJson() throws Exception { URI uri = Objects.requireNonNull( @@ -49,4 +49,6 @@ void readsExampleAndParsesIdsAndHeaderJson() throws Exception { assertEquals("genomic", h2.getMoleculeType()); assertEquals(Topology.CIRCULAR, h2.getTopology()); } + *? + */ } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java new file mode 100644 index 00000000..f73b0930 --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java @@ -0,0 +1,196 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Optional; + +import static org.junit.jupiter.api.Assertions.*; + +public class JsonHeaderParserTest { + + private final JsonHeaderParser parser = new JsonHeaderParser(); + + @Test + void parsesStandardHeaderWithJson() { + String line = ">AF123456.1 | { \"description\":\"Pinus sativa\", \"molecule_type\":\"genomic\", \"topology\":\"circular\" }"; + try { + ParsedHeader ph = parser.parse(line); + + assertEquals("AF123456.1", ph.getId()); + FastaHeader h = ph.getHeader(); + assertEquals("Pinus sativa", h.getDescription()); + assertEquals("genomic", h.getMoleculeType()); + assertEquals(Topology.CIRCULAR, h.getTopology()); + assertTrue(h.getChromosomeType().isEmpty()); + assertTrue(h.getChromosomeLocation().isEmpty()); + assertTrue(h.getChromosomeName().isEmpty()); + } catch (IOException e) { + fail("Should not throw for well-formed JSON: " + e.getMessage()); + } + } + + @Test + void picksFirstTokenAsIdEvenWithExtraStuff() { + String line = ">AF123456.1 extra tokens here | {\"description\":\"x\"}"; + try { + ParsedHeader ph = parser.parse(line); + assertEquals("AF123456.1", ph.getId()); + } catch (IOException e) { + fail("Should not throw: " + e.getMessage()); + } + } + + @Test + void parsesCurlyQuotesAndWeirdSpacingInKeys() { + String line = ">ID1 | { \u201Cdescription\u201D: \u201CPinus\u201D, \u201C molecule_type\u201D: \"genomic\" , \u201Ctopology\u201D: \"CIRCULAR\" }"; + try { + ParsedHeader ph = parser.parse(line); + FastaHeader h = ph.getHeader(); + assertEquals("Pinus", h.getDescription()); + assertEquals("genomic", h.getMoleculeType()); + assertEquals(Topology.CIRCULAR, h.getTopology()); + } catch (IOException e) { + fail("Should not throw with normalized curly quotes: " + e.getMessage()); + } + } + + @Test + void normalizesKeyVariantsAndChromosomeOptionals() { + String line = ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; + try { + ParsedHeader ph = parser.parse(line); + FastaHeader h = ph.getHeader(); + + assertEquals("Desc", h.getDescription()); + assertEquals("rna", h.getMoleculeType()); + assertEquals(Optional.of("plasmid"), h.getChromosomeType()); + assertEquals(Optional.of("chr12:100-200"), h.getChromosomeLocation()); + assertEquals(Optional.of("pX"), h.getChromosomeName()); + } catch (IOException e) { + fail("Should not throw: " + e.getMessage()); + } + } + + @Test + void handlesNbspInJson() { + String nbsp = "\u00A0"; + String line = (">ID3 | {"+nbsp+"\"description\""+nbsp+":" + nbsp + "\"Alpha"+nbsp+"Beta\"" + nbsp + ",\"topology\":\"linear\"}"); + try { + ParsedHeader ph = parser.parse(line); + FastaHeader h = ph.getHeader(); + + assertEquals("Alpha Beta", h.getDescription()); // NBSP normalized to space + assertEquals(Topology.LINEAR, h.getTopology()); + } catch (IOException e) { + fail("Should not throw with NBSP: " + e.getMessage()); + } + } + + @Test + void unknownTopologyYieldsNull() { + String line = ">ID4 | {\"topology\":\"weird-shape\"}"; + try { + ParsedHeader ph = parser.parse(line); + assertNull(ph.getHeader().getTopology()); + } catch (IOException e) { + fail("Should not throw when topology is unknown: " + e.getMessage()); + } + } + + @Test + void missingJsonIsFine_NoPipe() { + String line = ">AF999999.5 some label without json"; + try { + ParsedHeader ph = parser.parse(line); + + assertEquals("AF999999.5", ph.getId()); + FastaHeader h = ph.getHeader(); + assertNull(h.getDescription()); + assertNull(h.getMoleculeType()); + assertNull(h.getTopology()); + assertTrue(h.getChromosomeType().isEmpty()); + assertTrue(h.getChromosomeLocation().isEmpty()); + assertTrue(h.getChromosomeName().isEmpty()); + } catch (IOException e) { + fail("Should not throw without pipe/JSON: " + e.getMessage()); + } + } + + @Test + void emptyJsonAfterPipeIsFine() { + String line = ">ID5 | "; + try { + ParsedHeader ph = parser.parse(line); + FastaHeader h = ph.getHeader(); + + assertEquals("ID5", ph.getId()); + assertNull(h.getDescription()); + assertNull(h.getMoleculeType()); + assertNull(h.getTopology()); + } catch (IOException e) { + fail("Should not throw for empty JSON after pipe: " + e.getMessage()); + } + } + + @Test + void malformedJsonThrowsAndIncludesJsonInMessage() { + String badJson = "{\"description\": \"x\", \"molecule_type\": \"genomic\", OOPS }"; + String line = ">ID6 | " + badJson; + try { + parser.parse(line); + fail("Expected IOException for malformed JSON"); + } catch (IOException e) { + // Should include a recognizable chunk of normalized JSON + String msg = e.getMessage(); + assertNotNull(msg); + assertTrue(msg.contains("OOPS"), "Message should include offending JSON token"); + assertTrue(msg.contains("{\"description\": \"x\"") || msg.contains("{\"description\":\"x\""), + "Message should include JSON snippet"); + } + } + + @Test + void malformedJsonWithTrailingCommaThrowsAndMentionsComma() { + String badJson = "{ \"description\":\"y\", \"molecule_type\":\"genomic\", }"; + String line = ">ID7 | " + badJson; + try { + parser.parse(line); + fail("Expected IOException for trailing comma"); + } catch (IOException e) { + String msg = e.getMessage(); + assertNotNull(msg); + // different Jackson versions phrase this differently; just assert we included the JSON + assertTrue(msg.contains("\"description\":\"y\"")); + assertTrue(msg.contains("\"molecule_type\":\"genomic\"")); + } + } + + @Test + void jsonWithNullValuesParsesAndLeavesNulls() { + String line = ">ID8 | {\"description\":null, \"molecule_type\":null, \"topology\":null}"; + try { + ParsedHeader ph = parser.parse(line); + FastaHeader h = ph.getHeader(); + assertNull(h.getDescription()); + assertNull(h.getMoleculeType()); + assertNull(h.getTopology()); + } catch (IOException e) { + fail("Should not throw for explicit nulls: " + e.getMessage()); + } + } + + @Test + void trimsIdAndHandlesJustChevron() { + try { + ParsedHeader ph1 = parser.parse("> AF111 | {\"description\":\"x\"}"); + assertEquals("AF111", ph1.getId()); + + ParsedHeader ph2 = parser.parse(">"); + assertEquals("", ph2.getId()); + assertNull(ph2.getHeader().getDescription()); + } catch (IOException e) { + fail("Should not throw here: " + e.getMessage()); + } + } +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java new file mode 100644 index 00000000..50e96ff6 --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java @@ -0,0 +1,115 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for SequenceIndex byte-range mapping and validation. + */ +public class SequenceIndexTest { + + // Helpers to build a sane fixture ---------------------------------------- + + private LineEntry line(long baseStart, long baseEnd, long byteStart, long byteEndEx) { + return new LineEntry(baseStart, baseEnd, byteStart, byteEndEx); + } + + private SequenceIndex newIndexFixture() { + List lines = new ArrayList<>(); + // L1: 12 bases, bytes 100..111 (exclusive end 112) + lines.add(line(1, 12, 100, 112)); + // L2: 10 bases, bytes 113..122 (exclusive end 123) + lines.add(line(13, 22, 113, 123)); + // L3: 11 bases, bytes 124..134 (exclusive end 135) + lines.add(line(23, 33, 124, 135)); + // first/last base bytes (inclusive): 100 and 134 + return new SequenceIndex(100, 134, lines); + } + + // ------------------------------------------------------------------------ + + @Test + void totalBases_isEndOfLastLine() { + SequenceIndex idx = newIndexFixture(); + assertEquals(33, idx.totalBases()); + assertEquals(100, idx.firstBaseByte); + assertEquals(134, idx.lastBaseByte); + } + + @Test + void byteSpans_singleLine_inside() { + SequenceIndex idx = newIndexFixture(); + + // base 1..1 -> L1 offset 0 -> bytes [100,101) + var spans = idx.byteSpansForBaseRange(1, 1); + assertEquals(1, spans.size()); + assertEquals(100, spans.get(0).start); + assertEquals(101, spans.get(0).endEx); + + // base 12..12 -> last byte of L1 -> [111,112) + spans = idx.byteSpansForBaseRange(12, 12); + assertEquals(1, spans.size()); + assertEquals(111, spans.get(0).start); + assertEquals(112, spans.get(0).endEx); + + // base 15..18 -> within L2 (L2 baseStart=13 => offsets 2..5) -> [115,119) + spans = idx.byteSpansForBaseRange(15, 18); + assertEquals(1, spans.size()); + assertEquals(115, spans.get(0).start); + assertEquals(119, spans.get(0).endEx); + } + + @Test + void byteSpans_acrossLines() { + SequenceIndex idx = newIndexFixture(); + + // base 5..17 crosses L1 (5..12) and L2 (13..17) + var spans = idx.byteSpansForBaseRange(5, 17); + assertEquals(2, spans.size()); + + // L1 slice: offsetStart=4 => [104,112) + assertEquals(104, spans.get(0).start); + assertEquals(112, spans.get(0).endEx); + + // L2 slice: base 13..17 => offsets 0..4 => [113,118) + assertEquals(113, spans.get(1).start); + assertEquals(118, spans.get(1).endEx); + } + + @Test + void byteSpans_fullRange_allLines() { + SequenceIndex idx = newIndexFixture(); + + var spans = idx.byteSpansForBaseRange(1, 33); + assertEquals(3, spans.size()); + + assertEquals(100, spans.get(0).start); // full L1 + assertEquals(112, spans.get(0).endEx); + + assertEquals(113, spans.get(1).start); // full L2 + assertEquals(123, spans.get(1).endEx); + + assertEquals(124, spans.get(2).start); // full L3 + assertEquals(135, spans.get(2).endEx); + } + + @Test + void badRanges_throw() { + SequenceIndex idx = newIndexFixture(); + + assertThrows(IllegalArgumentException.class, () -> idx.byteSpansForBaseRange(0, 1)); + assertThrows(IllegalArgumentException.class, () -> idx.byteSpansForBaseRange(10, 9)); + } + + @Test + void applyDeletion_currentImpl_throwsDueToUnmodifiable() { //TODO + // As written, SequenceIndex stores lines as Collections.unmodifiableList(lines) + // and applyDeletion() tries to mutate it via lines.clear() -> UnsupportedOperationException. + SequenceIndex idx = newIndexFixture(); + assertThrows(UnsupportedOperationException.class, () -> idx.applyDeletion(3, 5)); + } +} From d3a9298cbb0b22c575111aeab87efed00a59ba0a Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 26 Nov 2025 16:43:31 +0000 Subject: [PATCH 05/31] wip --- .../fasta/SequentialFastaEntryReader.java | 95 ++++++++++++++++++- .../fasta/sequenceutils/LineEntry.java | 2 +- 2 files changed, 92 insertions(+), 5 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java index a7cd7ce9..c7d75b7d 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java @@ -1,7 +1,9 @@ package uk.ac.ebi.embl.gff3tools.fasta; import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.LineEntry; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; import java.io.File; import java.io.FileInputStream; @@ -61,14 +63,18 @@ public boolean readNext() throws FastaReadException { if (headerLine == null) return false; ParsedHeader ph = headerParser.parse(headerLine); //find the start & end bytes of the sequence of the current fasta entry - ScanResult sr = findSequenceLimits(); + SequenceIndex idx = buildSequenceIndex(); FastaEntry e = new FastaEntry(); e.setId(ph.getId()); e.setHeader(ph.getHeader()); - e.setFastaStart(headerPos.getAsLong()); - e.setSequenceStart(sr.firstBase); - e.setSequenceEnd(sr.lastBase); + e.setFastaStart(headerPos.); //todo THIS GARBAGE + e.setSequenceStart(idx.firstBaseByte); + e.setSequenceEnd(idx.lastBaseByte); + this.current = e; + + // keep 'idx' around as a field if you want fast range queries for current entry + this.currentIndex = idx; this.current = e; return true; @@ -201,4 +207,85 @@ private ScanResult findSequenceLimits() throws IOException { channel.position(nextHdr); return new ScanResult(first, last, nextHdr); } + + private SequenceIndex buildSequenceIndex() throws IOException { + long pos = channel.position(); + long firstBaseByte = -1, lastBaseByte = -1, nextHdr = fileSize; + + long currentLineFirstByte = -1; // byte of first base in current line + long currentLineLastByte = -1; // byte of last base in current line + long basesSoFar = 0; // total bases committed to 'lines' + long basesInCurrentLine = 0; + + java.util.ArrayList lines = new java.util.ArrayList<>(); + + ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); + + outer: + while (pos < fileSize) { + buf.clear(); + int toRead = (int) Math.min(buf.capacity(), fileSize - pos); + buf.limit(toRead); + int n = channel.read(buf, pos); + if (n <= 0) break; + buf.flip(); + while (buf.hasRemaining()) { + byte b = buf.get(); + long abs = pos + buf.position() - 1; + + if (b == GT) { // next header begins + nextHdr = abs; + // finalize the current line if it has bases + if (basesInCurrentLine > 0) { + long baseStart = basesSoFar + 1; + long baseEnd = basesSoFar + basesInCurrentLine; + lines.add(new LineEntry(baseStart, baseEnd, + currentLineFirstByte, currentLineLastByte + 1)); // end exclusive + basesSoFar += basesInCurrentLine; + } + break outer; + } + + if (b == '\n') { + if (basesInCurrentLine > 0) { + long baseStart = basesSoFar + 1; + long baseEnd = basesSoFar + basesInCurrentLine; + lines.add(new LineEntry(baseStart, baseEnd, + currentLineFirstByte, currentLineLastByte + 1)); + basesSoFar += basesInCurrentLine; + basesInCurrentLine = 0; + currentLineFirstByte = -1; + currentLineLastByte = -1; + } + continue; // newline consumed + } + + if (alphabet.isAllowed(b)) { + if (currentLineFirstByte < 0) currentLineFirstByte = abs; + currentLineLastByte = abs; + basesInCurrentLine++; + + if (firstBaseByte < 0) firstBaseByte = abs; + lastBaseByte = abs; + continue; + } + + // Non-allowed, non-newline byte: ignore (you said the lines only contain bases + '\n') + } + pos += n; + } + + // EOF: finalize any unterminated line with bases + if (basesInCurrentLine > 0) { + long baseStart = basesSoFar + 1; + long baseEnd = basesSoFar + basesInCurrentLine; + lines.add(new LineEntry(baseStart, baseEnd, + currentLineFirstByte, currentLineLastByte + 1)); + basesSoFar += basesInCurrentLine; + } + + channel.position(nextHdr); + return new SequenceIndex(firstBaseByte, lastBaseByte, lines); + } + } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java index 07460da1..29d2dec0 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java @@ -6,7 +6,7 @@ public final class LineEntry { long byteStart; // absolute byte offset of first base in the line long byteEndExclusive;// absolute byte offset just after the last base - LineEntry(long bStart, long bEnd, long byStart, long byEndEx) { + public LineEntry(long bStart, long bEnd, long byStart, long byEndEx) { this.baseStart = bStart; this.baseEnd = bEnd; this.byteStart = byStart; From f08e3e11495fdb6a6e6361edb6e5222dd7767a26 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 26 Nov 2025 17:08:19 +0000 Subject: [PATCH 06/31] very-much-wip-unbuildable --- .../ebi/embl/gff3tools/fasta/FastaEntry.java | 2 ++ .../fasta/SequentialFastaEntryReader.java | 31 ++++++++++++------- .../fasta/sequenceutils/SequenceIndex.java | 4 +-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index 2beba413..49b808b8 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -2,12 +2,14 @@ import lombok.Getter; import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; @Getter @Setter public class FastaEntry { String id; // submissionNumber or accessionNumber FastaHeader header; + SequenceIndex sequenceIndex; long fastaStart; // position of '>' in the file long sequenceStart; // first allowed base after header (absolute byte offset) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java index c7d75b7d..5c504791 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java @@ -62,19 +62,16 @@ public boolean readNext() throws FastaReadException { String headerLine = readAsciiLine(); if (headerLine == null) return false; ParsedHeader ph = headerParser.parse(headerLine); - //find the start & end bytes of the sequence of the current fasta entry + //find information about the current positions and line structure of the sequence SequenceIndex idx = buildSequenceIndex(); FastaEntry e = new FastaEntry(); e.setId(ph.getId()); e.setHeader(ph.getHeader()); - e.setFastaStart(headerPos.); //todo THIS GARBAGE + e.setFastaStart(headerPos.getAsLong()); e.setSequenceStart(idx.firstBaseByte); e.setSequenceEnd(idx.lastBaseByte); - this.current = e; - - // keep 'idx' around as a field if you want fast range queries for current entry - this.currentIndex = idx; + e.setSequenceIndex(idx); this.current = e; return true; @@ -86,11 +83,15 @@ public boolean readNext() throws FastaReadException { // ---- private helpers (the only code allowed to touch channel position) ---- - private OptionalLong goToNextFastaEntry() throws IOException { //TODO modify so that the > has to be the first character in line + /** finds the first next '>' after the current fasta entry and puts the channel reader position there. + * If there is no later fasta entry, returns empty and leaves the channel reader position where it was. + * **/ + private OptionalLong goToNextFastaEntry() throws IOException { long currentPosition = channel.position(), originalPosition = currentPosition; if (currentPosition >= fileSize) return OptionalLong.empty(); - ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); //read the next chunk of the file + //read the file content from current position (which should be somewhere in the current fasta entry or at the beginning of file) chunk by chunk + ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); while (currentPosition < fileSize) { buf.clear(); int minToRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); @@ -104,14 +105,16 @@ private OptionalLong goToNextFastaEntry() throws IOException { //TODO modify so if (buf.get() == GT) { long potentialFastaStart = currentPosition + buf.position() - 1; if(peekIsEndOfLine(potentialFastaStart - 1)) { - channel.position(potentialFastaStart); //put the channel position on the new greater than + // found new fasta entry start + channel.position(potentialFastaStart); return OptionalLong.of(potentialFastaStart); } } } currentPosition += numberOfBytesRead; } - channel.position(originalPosition); //if we didn't find new fasta entry start, go back to original position + //found no fasta starting after the current channel reader position + channel.position(originalPosition); return OptionalLong.empty(); } @@ -131,7 +134,10 @@ private boolean peekIsGT(long position) throws IOException { return n == 1 && one.get(0) == GT; } - private String readAsciiLine() throws IOException { + /* returns entire next line from the current reader position or the maximum buffer size if the line is too large to safely process (unlikely). + * Places current channel reader position at the first next unread character. (end of line or ) + * */ //TODO return boolean to see whether the line was read completely + private String readAsciiLine() throws IOException { //todo unfuck if (channel.position() >= fileSize) return null; StringBuilder sb = new StringBuilder(256); @@ -154,7 +160,7 @@ private String readAsciiLine() throws IOException { if (buf.get(buf.position() + i) == LF) { lfIndex = i; break; } } - if (lfIndex >= 0) { //if there is an "\n", read it and position the channel at the end of index + if (lfIndex >= 0) { //if there is an "\n", read it and position the channel at the end of line byte[] chunk = new byte[lfIndex]; buf.get(chunk); sb.append(new String(chunk, StandardCharsets.US_ASCII)); @@ -170,6 +176,7 @@ private String readAsciiLine() throws IOException { currentPosition += numberOfBytesRead; } } + channel.position(fileSize); return sb.toString(); } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java index 343dc2ad..6fab7659 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java @@ -1,8 +1,8 @@ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; public final class SequenceIndex { - final long firstBaseByte; // absolute byte offset of the first base (inclusive), -1 if none - final long lastBaseByte; // absolute byte offset of the last base (inclusive), -1 if none + public final long firstBaseByte; // absolute byte offset of the first base (inclusive), -1 if none + public final long lastBaseByte; // absolute byte offset of the last base (inclusive), -1 if none final java.util.List lines; // sorted by baseStart public SequenceIndex(long firstBaseByte, long lastBaseByte, java.util.List lines) { From 0574fb67e164ef3c47dc091fa695a15ce90e44e7 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Mon, 1 Dec 2025 16:11:39 +0000 Subject: [PATCH 07/31] index-finished --- .../fasta/SequentialFastaEntryReader.java | 5 +- .../fasta/sequenceutils/ByteSpan.java | 9 +- .../fasta/sequenceutils/LineEntry.java | 22 +-- .../fasta/sequenceutils/SequenceIndex.java | 139 ++++++--------- .../sequenceutils/SequenceIndexTest.java | 161 +++++++++--------- 5 files changed, 157 insertions(+), 179 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java index 5c504791..b346b011 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java @@ -1,6 +1,7 @@ package uk.ac.ebi.embl.gff3tools.fasta; import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.LineEntry; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; @@ -52,6 +53,8 @@ public SequentialFastaEntryReader(File file, JsonHeaderParser parser, SequenceAl public boolean readingFile() { return channel.isOpen(); } public FastaEntry getCurrentEntry() { return current; } + + public boolean readNext() throws FastaReadException { try { long startPos = channel.position(); @@ -292,7 +295,7 @@ private SequenceIndex buildSequenceIndex() throws IOException { } channel.position(nextHdr); - return new SequenceIndex(firstBaseByte, lastBaseByte, lines); + return new SequenceIndex(firstBaseByte, lastBaseByte, 1, 1, lines); //todo fix } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java index afa3e96b..6146620b 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java @@ -1,7 +1,8 @@ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; -final class ByteSpan { - final long start; // inclusive - final long endEx; // exclusive - ByteSpan(long s, long e) { this.start = s; this.endEx = e; } +public final class ByteSpan { + public final long start; // inclusive + public final long endEx; // exclusive + public ByteSpan(long start, long endEx) { this.start = start; this.endEx = endEx; } + public long length() { return endEx - start; } } \ No newline at end of file diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java index 29d2dec0..c1b994b3 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java @@ -1,16 +1,18 @@ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; public final class LineEntry { - long baseStart; // 1-based base index at line start (inclusive) - long baseEnd; // 1-based base index at line end (inclusive) - long byteStart; // absolute byte offset of first base in the line - long byteEndExclusive;// absolute byte offset just after the last base + public long baseStart; // 1-based, inclusive (after edits) + public long baseEnd; // 1-based, inclusive + public long byteStart; // absolute byte offset of first base in this line + public long byteEndExclusive; // absolute byte offset one past last base - public LineEntry(long bStart, long bEnd, long byStart, long byEndEx) { - this.baseStart = bStart; - this.baseEnd = bEnd; - this.byteStart = byStart; - this.byteEndExclusive = byEndEx; + public LineEntry(long baseStart, long baseEnd, long byteStart, long byteEndExclusive) { + this.baseStart = baseStart; + this.baseEnd = baseEnd; + this.byteStart = byteStart; + this.byteEndExclusive = byteEndExclusive; } -} + public long lengthBases() { return baseEnd - baseStart + 1; } + public long lengthBytes() { return byteEndExclusive - byteStart; } // ASCII: same as bases +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java index 6fab7659..ed3acfa7 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java @@ -1,115 +1,82 @@ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.LineEntry; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + public final class SequenceIndex { - public final long firstBaseByte; // absolute byte offset of the first base (inclusive), -1 if none - public final long lastBaseByte; // absolute byte offset of the last base (inclusive), -1 if none - final java.util.List lines; // sorted by baseStart - public SequenceIndex(long firstBaseByte, long lastBaseByte, java.util.List lines) { + public long firstBaseByte; // -1 if empty + public long startNBasesCount; + public long lastBaseByte; // -1 if empty + public long endNBasesCount; + private final List lines; + + public SequenceIndex(long firstBaseByte, long startNBasesCount, + long lastBaseByte, long endNBasesCount, List lines) { this.firstBaseByte = firstBaseByte; - this.lastBaseByte = lastBaseByte; - this.lines = java.util.Collections.unmodifiableList(lines); + this.startNBasesCount = startNBasesCount; + this.lastBaseByte = lastBaseByte; + this.endNBasesCount = endNBasesCount; + this.lines = new ArrayList<>(lines); } - long totalBases() { + public List linesView() { return Collections.unmodifiableList(lines); } + + public long totalBasesIncludingEdgeNBases() { if (lines.isEmpty()) return 0; - return lines.get(lines.size()-1).baseEnd; + return lines.get(lines.size() - 1).baseEnd; } - /** Return one or more byte spans covering [fromBase..toBase], inclusive. */ - java.util.List byteSpansForBaseRange(long fromBase, long toBase) { - if (fromBase < 1 || toBase < fromBase) throw new IllegalArgumentException("bad base range"); - if (lines.isEmpty()) return java.util.List.of(); + public long totalBases() { + long bases = totalBasesIncludingEdgeNBases() - endNBasesCount - startNBasesCount; + return Math.max(0, bases); + } + public ByteSpan byteSpanForBaseRangeIncludingEdgeNBases(long fromBase, long toBase) { + long total = totalBasesIncludingEdgeNBases(); + if (fromBase < 1 || toBase < fromBase || toBase > total) { + throw new IllegalArgumentException("bad base range: " + fromBase + ".." + toBase); + } int i = findLineByBase(fromBase); int j = findLineByBase(toBase); - java.util.ArrayList out = new java.util.ArrayList<>(Math.max(1, j - i + 1)); - for (int k = i; k <= j; k++) { - LineEntry L = lines.get(k); - long startBase = Math.max(fromBase, L.baseStart); - long endBase = Math.min(toBase, L.baseEnd); + LineEntry from = lines.get(i); + long offStart = fromBase - from.baseStart; - long offsetStartInLine = startBase - L.baseStart; // 0-based - long offsetEndInLineEx = (endBase - L.baseStart) + 1; // exclusive + LineEntry to = lines.get(j); + long offEndIncl = toBase - to.baseStart; - long byteStart = L.byteStart + offsetStartInLine; // ASCII 1 byte/base - long byteEndEx = L.byteStart + offsetEndInLineEx; // exclusive - out.add(new ByteSpan(byteStart, byteEndEx)); - } - return out; + long byteStart = from.byteStart + offStart; + long byteEndEx = to.byteStart + offEndIncl + 1; // half-open + + return new ByteSpan(byteStart, byteEndEx); } - /** Naive in-place index adjustment after deleting [fromBase..toBase] (inclusive). */ - void applyDeletion(long fromBase, long toBase) { - if (fromBase < 1 || toBase < fromBase) throw new IllegalArgumentException("bad base range"); - if (lines.isEmpty()) return; - - long deltaBases = (toBase - fromBase + 1); - long deltaBytes = deltaBases; - - int first = findLineByBase(fromBase); - int last = findLineByBase(toBase); - - // Adjust partially affected first/last lines, remove fully-eaten ones, and shift the rest. - java.util.List mutable = new java.util.ArrayList<>(lines); - // Trim front - LineEntry Lf = mutable.get(first); - if (fromBase > Lf.baseStart) { - // delete tail portion in first line - long cut = Math.min(deltaBases, Lf.baseEnd - fromBase + 1); - Lf.baseEnd -= cut; - Lf.byteEndExclusive -= cut; - } else { - // delete whole first line (or will be eaten by later logic) - } - // Trim back - LineEntry Ll = mutable.get(last); - if (toBase < Ll.baseEnd) { - long cut = Math.min(deltaBases, Ll.baseEnd - toBase); - // delete head portion in last line - long newBaseStart = toBase + 1; - long newByteStart = Ll.byteStart + (newBaseStart - Ll.baseStart); - Ll.baseStart = newBaseStart - deltaBases; - Ll.byteStart = newByteStart - deltaBytes; - } else { - // will be shifted/removed below - } - // Remove any lines whose base range collapsed - mutable.removeIf(le -> le.baseEnd < le.baseStart); - - // Shift all lines strictly after the deletion by (-delta) - for (int idx = 0; idx < mutable.size(); idx++) { - LineEntry L = mutable.get(idx); - if (L.baseStart > toBase) { - L.baseStart -= deltaBases; - L.baseEnd -= deltaBases; - L.byteStart -= deltaBytes; - L.byteEndExclusive -= deltaBytes; - } + public ByteSpan byteSpanForBaseRange(long fromBase, long toBase) { + long trimmedTotal = totalBases(); + if (fromBase < 1 || toBase < fromBase || toBase > trimmedTotal) { + throw new IllegalArgumentException("bad base range: " + fromBase + ".." + toBase); } - - // Re-freeze as unmodifiable - lines.clear(); // if you stored unmodifiable above, switch to a mutable field or a builder - lines.addAll(java.util.Collections.unmodifiableList(mutable)); - // Note: first/last base bytes would also shift by -deltaBytes if deletion occurs before them. - // You can recompute from lines when needed. + long actualFromBase = startNBasesCount + fromBase; + long actualToBase = startNBasesCount + toBase; + return byteSpanForBaseRangeIncludingEdgeNBases(actualFromBase, actualToBase); } private int findLineByBase(long base) { - int lo = 0, hi = lines.size()-1, ans = hi; + int lo = 0, hi = lines.size() - 1, ans = hi; while (lo <= hi) { int mid = (lo + hi) >>> 1; LineEntry L = lines.get(mid); - if (base < L.baseStart) { hi = mid - 1; } - else if (base > L.baseEnd) { lo = mid + 1; } - else { return mid; } // inside - ans = lo; // insertion point + if (base < L.baseStart) hi = mid - 1; + else if (base > L.baseEnd) lo = mid + 1; + else return mid; + ans = lo; } - return Math.max(0, Math.min(ans, lines.size()-1)); + return Math.max(0, Math.min(ans, lines.size() - 1)); } } - - - diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java index 50e96ff6..d8b808b3 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java @@ -2,114 +2,119 @@ import org.junit.jupiter.api.Test; -import java.util.ArrayList; import java.util.List; import static org.junit.jupiter.api.Assertions.*; -/** - * Tests for SequenceIndex byte-range mapping and validation. - */ public class SequenceIndexTest { - // Helpers to build a sane fixture ---------------------------------------- - - private LineEntry line(long baseStart, long baseEnd, long byteStart, long byteEndEx) { - return new LineEntry(baseStart, baseEnd, byteStart, byteEndEx); - } - - private SequenceIndex newIndexFixture() { - List lines = new ArrayList<>(); - // L1: 12 bases, bytes 100..111 (exclusive end 112) - lines.add(line(1, 12, 100, 112)); - // L2: 10 bases, bytes 113..122 (exclusive end 123) - lines.add(line(13, 22, 113, 123)); - // L3: 11 bases, bytes 124..134 (exclusive end 135) - lines.add(line(23, 33, 124, 135)); - // first/last base bytes (inclusive): 100 and 134 - return new SequenceIndex(100, 134, lines); + /** + * Synthetic line layout (ASCII, 1 byte/base; '\n' not part of lines): + * + * Line1: bases 1..4 at bytes [100,104) -> base->byte: 1:100, 2:101, 3:102, 4:103, '\n':104 + * Line2: bases 5..8 at bytes [105,109) -> 5:105, 6:106, 7:107, 8:108, '\n':109 + * Line3: bases 9..12 at bytes [110,114) -> 9:110, 10:111, 11:112, 12:113, '\n':114 + * + * So: + * - first base byte = 100 + * - last base byte = 113 + * - total bases including edge Ns = 12 + */ + private SequenceIndex buildIndex(long startN, long endN) { + List lines = List.of( + new LineEntry(1, 4, 100, 104), + new LineEntry(5, 8, 105, 109), + new LineEntry(9, 12, 110, 114) + ); + return new SequenceIndex( + /*firstBaseByte*/100, + /*startNBasesCount*/startN, + /*lastBaseByte*/113, + /*endNBasesCount*/endN, + lines + ); } - // ------------------------------------------------------------------------ - @Test - void totalBases_isEndOfLastLine() { - SequenceIndex idx = newIndexFixture(); - assertEquals(33, idx.totalBases()); - assertEquals(100, idx.firstBaseByte); - assertEquals(134, idx.lastBaseByte); + void totals_including_and_trimmed() { + SequenceIndex idx = buildIndex(/*startN*/2, /*endN*/3); + + assertEquals(12, idx.totalBasesIncludingEdgeNBases(), "totalBasesIncludingEdgeNBases"); + assertEquals(7, idx.totalBases(), "trimmed totalBases"); } @Test - void byteSpans_singleLine_inside() { - SequenceIndex idx = newIndexFixture(); - - // base 1..1 -> L1 offset 0 -> bytes [100,101) - var spans = idx.byteSpansForBaseRange(1, 1); - assertEquals(1, spans.size()); - assertEquals(100, spans.get(0).start); - assertEquals(101, spans.get(0).endEx); - - // base 12..12 -> last byte of L1 -> [111,112) - spans = idx.byteSpansForBaseRange(12, 12); - assertEquals(1, spans.size()); - assertEquals(111, spans.get(0).start); - assertEquals(112, spans.get(0).endEx); - - // base 15..18 -> within L2 (L2 baseStart=13 => offsets 2..5) -> [115,119) - spans = idx.byteSpansForBaseRange(15, 18); - assertEquals(1, spans.size()); - assertEquals(115, spans.get(0).start); - assertEquals(119, spans.get(0).endEx); + void byteSpan_including_edges_same_line() { + SequenceIndex idx = buildIndex(0, 0); + + // [from..to] = [2..4] -> bytes [101..103], endExclusive = 104 + ByteSpan s = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 4); + + assertEquals(101, s.start); + assertEquals(104, s.endEx); + assertEquals(3, s.length()); } @Test - void byteSpans_acrossLines() { - SequenceIndex idx = newIndexFixture(); + void byteSpan_including_edges_crosses_newline() { + SequenceIndex idx = buildIndex(0, 0); - // base 5..17 crosses L1 (5..12) and L2 (13..17) - var spans = idx.byteSpansForBaseRange(5, 17); - assertEquals(2, spans.size()); + // [2..5] crosses the newline between line1 and line2 + // start = base2@101, endEx = base5@105 + 1 = 106, newline at 104 is included + ByteSpan s = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 5); - // L1 slice: offsetStart=4 => [104,112) - assertEquals(104, spans.get(0).start); - assertEquals(112, spans.get(0).endEx); + assertEquals(101, s.start); + assertEquals(106, s.endEx); + assertEquals(5, s.length()); // 2,3,\n,5,exclusive end char + } - // L2 slice: base 13..17 => offsets 0..4 => [113,118) - assertEquals(113, spans.get(1).start); - assertEquals(118, spans.get(1).endEx); + @Test + void including_edges_validates_total() { + SequenceIndex idx = buildIndex(0, 0); + assertThrows(IllegalArgumentException.class, + () -> idx.byteSpanForBaseRangeIncludingEdgeNBases(1, 13), + "toBase beyond total (including Ns) should throw"); } @Test - void byteSpans_fullRange_allLines() { - SequenceIndex idx = newIndexFixture(); + void trimmed_byteSpan_maps_through_startN() { + SequenceIndex idx = buildIndex(2, 3); + assertEquals(7, idx.totalBases()); + + ByteSpan s = idx.byteSpanForBaseRange(1, 3); // Ignore first 2 Ns, ignore last 3 Ns - var spans = idx.byteSpansForBaseRange(1, 33); - assertEquals(3, spans.size()); + assertEquals(102, s.start); + assertEquals(106, s.endEx); + assertEquals(4, s.length()); // 3 bases + exclusive end + } - assertEquals(100, spans.get(0).start); // full L1 - assertEquals(112, spans.get(0).endEx); + @Test + void trimmed_span_crosses_multiple_lines() { + SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7 bases - assertEquals(113, spans.get(1).start); // full L2 - assertEquals(123, spans.get(1).endEx); + ByteSpan s = idx.byteSpanForBaseRange(4, 7); - assertEquals(124, spans.get(2).start); // full L3 - assertEquals(135, spans.get(2).endEx); + assertEquals(106, s.start); + assertEquals(111, s.endEx); + assertEquals(5, s.length()); } @Test - void badRanges_throw() { - SequenceIndex idx = newIndexFixture(); - - assertThrows(IllegalArgumentException.class, () -> idx.byteSpansForBaseRange(0, 1)); - assertThrows(IllegalArgumentException.class, () -> idx.byteSpansForBaseRange(10, 9)); + void trimmed_validates_range_against_trimmed_total() { + SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7 + assertThrows(IllegalArgumentException.class, + () -> idx.byteSpanForBaseRange(1, 8), + "toBase beyond trimmed total should throw"); } @Test - void applyDeletion_currentImpl_throwsDueToUnmodifiable() { //TODO - // As written, SequenceIndex stores lines as Collections.unmodifiableList(lines) - // and applyDeletion() tries to mutate it via lines.clear() -> UnsupportedOperationException. - SequenceIndex idx = newIndexFixture(); - assertThrows(UnsupportedOperationException.class, () -> idx.applyDeletion(3, 5)); + void zero_edgeNs_behavior_matches_including_method() { + SequenceIndex idx = buildIndex(0, 0); //no additional N bases + + ByteSpan a = idx.byteSpanForBaseRange(2, 5); + ByteSpan b = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 5); + + assertEquals(b.start, a.start); + assertEquals(b.endEx, a.endEx); } } From a9f1c48849dcd28b32abec10275f3828a5f2e5d9 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Mon, 1 Dec 2025 18:05:08 +0000 Subject: [PATCH 08/31] sequential-entry-reader-still-a-mess-unbuildable --- .../ebi/embl/gff3tools/fasta/FastaEntry.java | 12 +- .../ebi/embl/gff3tools/fasta/FastaReader.java | 8 + .../fasta/SequentialFastaEntryReader.java | 517 +++++++++++------- .../fasta/sequenceutils/SequenceAlphabet.java | 9 + 4 files changed, 341 insertions(+), 205 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index 49b808b8..d3eff22d 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -7,11 +7,9 @@ @Getter @Setter public class FastaEntry { - String id; // submissionNumber or accessionNumber - FastaHeader header; - SequenceIndex sequenceIndex; - - long fastaStart; // position of '>' in the file - long sequenceStart; // first allowed base after header (absolute byte offset) - long sequenceEnd; // last allowed base before next header (absolute byte offset) + String id; + FastaHeader header; //json info + //information needed for accessing the file + long fastaStartByte; // position of '>' in the file + SequenceIndex sequenceIndex; // a smart index for querying ranges in the file } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java new file mode 100644 index 00000000..8fb38881 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java @@ -0,0 +1,8 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +public interface Fasta { + + FastaEntry getCurrentEntry(); + + +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java index b346b011..5df07519 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java @@ -1,7 +1,6 @@ package uk.ac.ebi.embl.gff3tools.fasta; import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.LineEntry; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; @@ -13,289 +12,411 @@ import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; -import java.util.OptionalLong; +import java.util.*; public class SequentialFastaEntryReader implements AutoCloseable { - private static final int BUF_SIZE = 64 * 1024; + // -------- constants + private static final int SCAN_BUF_SIZE = 64 * 1024; + private static final int COUNT_BUF_SIZE = 8 * 1024; private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; + // -------- file + config private final FileChannel channel; - private final long fileSize; //size of file in bytes - + private final long fileSize; private final JsonHeaderParser headerParser; private final SequenceAlphabet alphabet; - private FastaEntry current; - public SequentialFastaEntryReader(File file) throws FileNotFoundException { + // -------- state/result + private final Map indexById = new LinkedHashMap<>(); + + // -------- ctor + public SequentialFastaEntryReader(File file) throws IOException { this(file, new JsonHeaderParser(), SequenceAlphabet.defaultNucleotideAlphabet()); } - public SequentialFastaEntryReader(File file, JsonHeaderParser parser, SequenceAlphabet alphabet) throws FileNotFoundException { - if (file == null) throw new IllegalStateException("Input FASTA file is null"); + public SequentialFastaEntryReader(File file, JsonHeaderParser parser, SequenceAlphabet alphabet) + throws FileNotFoundException, IOException { + Objects.requireNonNull(file, "Input FASTA file is null"); if (!file.exists()) throw new FileNotFoundException(file.getAbsolutePath()); if (file.isDirectory()) throw new FileNotFoundException("Directory: " + file.getAbsolutePath()); if (!file.canRead()) throw new IllegalArgumentException("No read permission: " + file.getAbsolutePath()); - this.headerParser = parser; - this.alphabet = alphabet; + this.headerParser = Objects.requireNonNull(parser, "parser"); + this.alphabet = Objects.requireNonNull(alphabet, "alphabet"); - try { - this.channel = new FileInputStream(file).getChannel(); - this.fileSize = channel.size(); - } catch (IOException e) { - throw new RuntimeException("Open channel failed", e); - } + this.channel = new FileInputStream(file).getChannel(); //exception will bubble up if fails + this.fileSize = channel.size(); } - public void close() throws IOException { channel.close(); } + // -------- lifecycle of the read + @Override public void close() throws IOException { channel.close(); } public boolean readingFile() { return channel.isOpen(); } - public FastaEntry getCurrentEntry() { return current; } - + // -------- main iteration - public boolean readNext() throws FastaReadException { + /** Reads the next FASTA entry, if there is none it returns an empty object*/ + public Optional readNext(long from) throws FastaReadException { try { - long startPos = channel.position(); - OptionalLong headerPos = goToNextFastaEntry(); - if (headerPos.isPresent() || !peekIsGT(startPos)) return false; + OptionalLong headerPosition = seekToNextHeader(from); + if (headerPosition.isEmpty()) return Optional.empty(); // no next FASTA entry detected + channel.position(headerPosition.getAsLong()); //otherwise, position at start of new fasta - //parse id & header json - String headerLine = readAsciiLine(); - if (headerLine == null) return false; + // parse header & build sequence index + String headerLine = readAsciiLineFromCurrentPosition(); // scan first line, parser + if (headerLine == null) throw new FastaReadException("Header is malformed"); ParsedHeader ph = headerParser.parse(headerLine); - //find information about the current positions and line structure of the sequence SequenceIndex idx = buildSequenceIndex(); - FastaEntry e = new FastaEntry(); - e.setId(ph.getId()); - e.setHeader(ph.getHeader()); - e.setFastaStart(headerPos.getAsLong()); - e.setSequenceStart(idx.firstBaseByte); - e.setSequenceEnd(idx.lastBaseByte); - e.setSequenceIndex(idx); + // produce current entry + FastaEntry newEntry = new FastaEntry(); + newEntry.setId(ph.getId()); + newEntry.setHeader(ph.getHeader()); + newEntry.setFastaStartByte(headerPosition.getAsLong()); + newEntry.setSequenceIndex(idx); - this.current = e; - return true; + return Optional.of(newEntry); } catch (IOException io) { - throw new FastaReadException("I/O while reading FASTA", io); + long position; + try{ + position= channel.position(); + } catch (IOException e) { + position = -1; + } + throw new FastaReadException("I/O while reading FASTA at byte " + position + ": " + io.getMessage(), io); } } - // ---- private helpers (the only code allowed to touch channel position) ---- + // ===================================================================== + // = SCANNING (to next header) = + // ===================================================================== - /** finds the first next '>' after the current fasta entry and puts the channel reader position there. - * If there is no later fasta entry, returns empty and leaves the channel reader position where it was. - * **/ - private OptionalLong goToNextFastaEntry() throws IOException { - long currentPosition = channel.position(), originalPosition = currentPosition; - if (currentPosition >= fileSize) return OptionalLong.empty(); + /** Finds next header ('>') that starts a line (at file start or after LF). */ + private OptionalLong seekToNextHeader(long from) throws IOException { + if (from >= fileSize) return OptionalLong.empty(); - //read the file content from current position (which should be somewhere in the current fasta entry or at the beginning of file) chunk by chunk - ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); - while (currentPosition < fileSize) { - buf.clear(); - int minToRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); - buf.limit(minToRead); - - int numberOfBytesRead = channel.read(buf, currentPosition); - if (numberOfBytesRead <= 0) break; //no bytes were read, probably paranoid + ByteBuffer buf = ByteBuffer.allocateDirect(SCAN_BUF_SIZE); + while (from < fileSize) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - from); + buf.limit(want); + int n = channel.read(buf, from); + if (n <= 0) break; buf.flip(); + while (buf.hasRemaining()) { - if (buf.get() == GT) { - long potentialFastaStart = currentPosition + buf.position() - 1; - if(peekIsEndOfLine(potentialFastaStart - 1)) { - // found new fasta entry start - channel.position(potentialFastaStart); - return OptionalLong.of(potentialFastaStart); - } + long positionToCheck = from + buf.position(); + byte b = buf.get(); + if (peekIfFastaHeaderStart(b, positionToCheck)) { + return OptionalLong.of(positionToCheck); } } - currentPosition += numberOfBytesRead; + from += n; } - //found no fasta starting after the current channel reader position - channel.position(originalPosition); return OptionalLong.empty(); } - /* Just checks if the character at the given position equals '>' char, does not move the channel.position() */ - private boolean peekIsEndOfLine(long position) throws IOException { - if (position >= fileSize) return false; - ByteBuffer one = ByteBuffer.allocate(1); - int n = channel.read(one, position); - return n == 1 && one.get(0) == LF; + /** Reads until end of one ASCII line from current channel.position() ( '\n' terminated or EOF). + * Advances channel past end of line or to EOF */ + private String readAsciiLineFromCurrentPosition() throws IOException { + long scanPos = channel.position(); + if (scanPos >= fileSize) return null; + + StringBuilder sb = new StringBuilder(256); + ByteBuffer buf = ByteBuffer.allocateDirect(SCAN_BUF_SIZE); + + while (scanPos < fileSize) { + + // fill buffer from disk + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - scanPos); + buf.limit(want); + int n = channel.read(buf, scanPos); + if (n <= 0) break; + + buf.flip(); + + // find end of line in the bytes we already have + int lfIndex = indexOf(buf, LF); + if (lfIndex >= 0) + {// if end of line found, append bytes up to (not including) LF + appendAscii(sb, buf, lfIndex); + long nextLineStart = scanPos + lfIndex + 1; // consume LF + channel.position(nextLineStart); + return sb.toString(); + } else { + // no LF in this chunk; append all bytes and continue + appendAscii(sb, buf, buf.remaining()); + scanPos += n; + } + } + + // read up to EOF + channel.position(fileSize); + return sb.toString(); } - /* Just checks if the character at the given position equals '>' char, does not move the channel.position() */ - private boolean peekIsGT(long position) throws IOException { - if (position >= fileSize) return false; - ByteBuffer one = ByteBuffer.allocate(1); - int n = channel.read(one, position); - return n == 1 && one.get(0) == GT; + /** gets index of a target byte character in a byte buffer, returns -1 if not found **/ + private static int indexOf(ByteBuffer buf, byte target) { + for (int i = 0; i < buf.remaining(); i++) { + if (buf.get(buf.position() + i) == target) return i; + } + return -1; } - /* returns entire next line from the current reader position or the maximum buffer size if the line is too large to safely process (unlikely). - * Places current channel reader position at the first next unread character. (end of line or ) - * */ //TODO return boolean to see whether the line was read completely - private String readAsciiLine() throws IOException { //todo unfuck - if (channel.position() >= fileSize) return null; + /** Append exactly len bytes from buf to sb as US-ASCII, advancing buf.position() by len. */ + private static void appendAscii(StringBuilder sb, ByteBuffer buf, int len) { + byte[] chunk = new byte[len]; + buf.get(chunk); + sb.append(new String(chunk, java.nio.charset.StandardCharsets.US_ASCII)); + } - StringBuilder sb = new StringBuilder(256); - ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); - long currentPosition = channel.position(); + // ============================================================================= + // = SEQUENCE INDEX SCAN & BUILD = + // ============================================================================= - while (currentPosition < fileSize) { - buf.clear(); - int toRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); - buf.limit(toRead); + /** Builds the per-line index for the sequence starting at current position (right after header line). */ + private SequenceIndex buildSequenceIndex() throws IOException { + ScanState s = initScanState(channel.position()); + ByteBuffer buf = newScanBuffer(); - int numberOfBytesRead = channel.read(buf, currentPosition); - if (numberOfBytesRead <= 0) break; + while (s.pos we hit next header; stop scanning + s.pos += n; + } - buf.flip(); + finalizeOpenLineIfAny(s); + // leave channel at next header (or EOF) + channel.position(s.nextHdr); + + long startN = 0, endN = 0; + if (!s.lines.isEmpty()) { + LineEntry first = s.lines.get(0); + LineEntry last = s.lines.get(s.lines.size() - 1); + startN = countLeadingNsInLine(first); + endN = countTrailingNsInLine(last); + } + + return new SequenceIndex(s.firstBaseByte, startN, s.lastBaseByte, endN, s.lines); + } + +// --------------------------------------------------------------------- +// scan helpers +// --------------------------------------------------------------------- - //find end of line - int lfIndex = -1; - for (int i = 0; i < buf.remaining(); i++) { - if (buf.get(buf.position() + i) == LF) { lfIndex = i; break; } + /** All mutable scanning state for one sequence. */ + private static final class ScanState { + long pos; // absolute file position we’re scanning from + long firstBaseByte = -1; // byte of first allowed base seen + long lastBaseByte = -1; // byte of last allowed base seen + long nextHdr; // byte of next '>' that starts a line (or fileSize) + + long lineFirstByte = -1; // byte of first base in current line + long lineLastByte = -1; // byte of last base in current line + long basesSoFar = 0; // total bases committed to s.lines + long basesInLine = 0; // bases accumulated in current line (not yet committed) + + final java.util.ArrayList lines = new java.util.ArrayList<>(256); + ScanState(long startPos, long fileSize) { this.pos = startPos; this.nextHdr = fileSize; } + } + + private ScanState initScanState(long startPos) { + return new ScanState(startPos, fileSize); + } + + private ByteBuffer newScanBuffer() { + return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); + } + + private int fillBuffer(ByteBuffer buf, long at) throws IOException { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - at); + buf.limit(want); + return channel.read(buf, at); + } + + /** + * Process a filled buffer. Returns true if scanning should stop (we found the next header), + * false to keep scanning. + */ + private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException { + buf.flip(); + while (buf.hasRemaining()) { + // capture index BEFORE get(), so abs == s.pos + idx + int idx = buf.position(); + byte b = buf.get(); + long abs = s.pos + idx; + + if (peekIfFastaHeaderStart(b, abs)) { + s.nextHdr = abs; + commitOpenLineIfAny(s); + return true; // done scanning current entry } - if (lfIndex >= 0) { //if there is an "\n", read it and position the channel at the end of line - byte[] chunk = new byte[lfIndex]; - buf.get(chunk); - sb.append(new String(chunk, StandardCharsets.US_ASCII)); - buf.get(); // consume LF - channel.position(currentPosition + lfIndex + 1); //skip separator - int len = sb.length(); - if (len>0 && sb.charAt(len-1)=='\r') sb.setLength(len-1); - return sb.toString(); - } else { //otherwise read entire chunk, but this is unlikely to happen as the buffer should be large enough - byte[] chunk = new byte[buf.remaining()]; - buf.get(chunk); - sb.append(new String(chunk, StandardCharsets.US_ASCII)); - currentPosition += numberOfBytesRead; + if (isNewline(b)) { + commitOpenLineIfAny(s); + continue; } + + if (isAllowedBase(b)) { + observeBaseByte(abs, s); + continue; + } + + // else: ignore unexpected bytes (spec: sequence lines contain bases + '\n') } + return false; + } - channel.position(fileSize); - return sb.toString(); + // --------------------------------------------------------------------- + // state mutation helpers (lines) + // --------------------------------------------------------------------- + + /** We saw a base at absolute byte 'abs'. Update current line + first/last markers. */ + private void observeBaseByte(long abs, ScanState s) { + if (s.lineFirstByte < 0) s.lineFirstByte = abs; + s.lineLastByte = abs; + s.basesInLine++; + + if (s.firstBaseByte < 0) s.firstBaseByte = abs; + s.lastBaseByte = abs; } - private static final class ScanResult { - final long firstBase, lastBase, nextHeader; - ScanResult(long f, long l, long n){ firstBase=f; lastBase=l; nextHeader=n; } + /** If current line has bases, convert it into a LineEntry and reset line accumulators. */ + private void commitOpenLineIfAny(ScanState s) { + if (s.basesInLine <= 0) return; + + long baseStart = s.basesSoFar + 1; + long baseEnd = s.basesSoFar + s.basesInLine; + long byteStart = s.lineFirstByte; + long byteEndEx = s.lineLastByte + 1; // half-open + + s.lines.add(new LineEntry(baseStart, baseEnd, byteStart, byteEndEx)); + + s.basesSoFar += s.basesInLine; + s.basesInLine = 0; + s.lineFirstByte = -1; + s.lineLastByte = -1; } - private ScanResult findSequenceLimits() throws IOException { - long currentPosition = channel.position(); - long first = -1, last = -1, nextHdr = fileSize; + /** If EOF hit with an unterminated line, commit it. */ + private void finalizeOpenLineIfAny(ScanState s) { + commitOpenLineIfAny(s); + } + + + private static void addLine(List lines, + long basesSoFar, + long basesInCurrentLine, + long firstByte, long lastByte) { + long baseStart = basesSoFar + 1; + long baseEnd = basesSoFar + basesInCurrentLine; + // byteEndExclusive is lastByte + 1 (ASCII 1 byte/base) + lines.add(new LineEntry(baseStart, baseEnd, firstByte, lastByte + 1)); + } + + // ===================================================================== + // = EDGE 'N' COUNT HELPERS = + // ===================================================================== + + /** Count leading 'N'/'n' in the given line’s byte range. */ + private long countLeadingNsInLine(LineEntry line) throws IOException { + long remaining = line.lengthBytes(); + long offset = line.byteStart; + long count = 0; - ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); + ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); - outer: - while (currentPosition < fileSize) { + while (remaining > 0) { buf.clear(); - int toRead = (int) Math.min(buf.capacity(), fileSize - currentPosition); - buf.limit(toRead); - int n = channel.read(buf, currentPosition); - if (n<=0) break; + int want = (int) Math.min(buf.capacity(), remaining); + buf.limit(want); + int n = channel.read(buf, offset); + if (n <= 0) break; buf.flip(); - while (buf.hasRemaining()) { + + for (int i = 0; i < n; i++) { byte b = buf.get(); - long abs = currentPosition + buf.position() - 1; - if (b == GT) { nextHdr = abs; break outer; } - if (alphabet.isAllowed(b)) { - if (first < 0) first = abs; - last = abs; + if (isNBase(b)) { + count++; + } else { + return count; } } - currentPosition += n; + remaining -= n; + offset += n; } - channel.position(nextHdr); - return new ScanResult(first, last, nextHdr); + return count; } - private SequenceIndex buildSequenceIndex() throws IOException { - long pos = channel.position(); - long firstBaseByte = -1, lastBaseByte = -1, nextHdr = fileSize; - - long currentLineFirstByte = -1; // byte of first base in current line - long currentLineLastByte = -1; // byte of last base in current line - long basesSoFar = 0; // total bases committed to 'lines' - long basesInCurrentLine = 0; - - java.util.ArrayList lines = new java.util.ArrayList<>(); + /** Count trailing 'N'/'n' in the given line’s byte range (scan forward, track tail run). */ + private long countTrailingNsInLine(LineEntry line) throws IOException { + long remaining = line.lengthBytes(); + long offset = line.byteStart; + long trailingRun = 0; - ByteBuffer buf = ByteBuffer.allocateDirect(BUF_SIZE); + ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); - outer: - while (pos < fileSize) { + while (remaining > 0) { buf.clear(); - int toRead = (int) Math.min(buf.capacity(), fileSize - pos); - buf.limit(toRead); - int n = channel.read(buf, pos); + int want = (int) Math.min(buf.capacity(), remaining); + buf.limit(want); + int n = channel.read(buf, offset); if (n <= 0) break; buf.flip(); - while (buf.hasRemaining()) { + + for (int i = 0; i < n; i++) { byte b = buf.get(); - long abs = pos + buf.position() - 1; - - if (b == GT) { // next header begins - nextHdr = abs; - // finalize the current line if it has bases - if (basesInCurrentLine > 0) { - long baseStart = basesSoFar + 1; - long baseEnd = basesSoFar + basesInCurrentLine; - lines.add(new LineEntry(baseStart, baseEnd, - currentLineFirstByte, currentLineLastByte + 1)); // end exclusive - basesSoFar += basesInCurrentLine; - } - break outer; + if (isNBase(b)) { + trailingRun++; // extend current tail run + } else { + trailingRun = 0; // reset tail run on any non-N } + } + remaining -= n; + offset += n; + } + return trailingRun; + } - if (b == '\n') { - if (basesInCurrentLine > 0) { - long baseStart = basesSoFar + 1; - long baseEnd = basesSoFar + basesInCurrentLine; - lines.add(new LineEntry(baseStart, baseEnd, - currentLineFirstByte, currentLineLastByte + 1)); - basesSoFar += basesInCurrentLine; - basesInCurrentLine = 0; - currentLineFirstByte = -1; - currentLineLastByte = -1; - } - continue; // newline consumed - } + // --------------------------------------------------------------------- + // helper peek byte functions + // --------------------------------------------------------------------- - if (alphabet.isAllowed(b)) { - if (currentLineFirstByte < 0) currentLineFirstByte = abs; - currentLineLastByte = abs; - basesInCurrentLine++; - if (firstBaseByte < 0) firstBaseByte = abs; - lastBaseByte = abs; - continue; - } + /** True if absolute position is at file start OR previous byte is '\n'. */ + private boolean peekIfLineStart(long absPos) throws IOException { + if (absPos == 0) return true; + if (absPos > fileSize) return false; + return peekByte(absPos - 1) == LF; + } - // Non-allowed, non-newline byte: ignore (you said the lines only contain bases + '\n') - } - pos += n; - } + /** Absolute peek (does not change channel.position()). Returns 0 if OOB. */ + private byte peekByte(long absPos) throws IOException { + if (absPos < 0 || absPos >= fileSize) return 0; + ByteBuffer one = ByteBuffer.allocate(1); + int n = channel.read(one, absPos); + return (n == 1) ? one.get(0) : 0; + } - // EOF: finalize any unterminated line with bases - if (basesInCurrentLine > 0) { - long baseStart = basesSoFar + 1; - long baseEnd = basesSoFar + basesInCurrentLine; - lines.add(new LineEntry(baseStart, baseEnd, - currentLineFirstByte, currentLineLastByte + 1)); - basesSoFar += basesInCurrentLine; - } + private boolean peekIfFastaHeaderStart(byte b, long abs) throws IOException { + return b == GT && peekIfLineStart(abs); + } - channel.position(nextHdr); - return new SequenceIndex(firstBaseByte, lastBaseByte, 1, 1, lines); //todo fix + private boolean isNewline(byte b) { + return b == LF; // we already normalize CRLF earlier; lines are LF-terminated in scanning } + private boolean isAllowedBase(byte b) { + return alphabet.isAllowed(b); + } + + private boolean isNBase(byte b) { + return alphabet.isNBase(b); + } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java index a341f52b..0ec7667a 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java @@ -6,10 +6,19 @@ public SequenceAlphabet(String chars) { for (char c: chars.toCharArray()) if (c<128) allowed[c]=true; allowed['>']=false; } + + /** Fast ASCII check for is it an allowed char. */ public boolean isAllowed(byte b){ int i=b&0xFF; return i<128 && allowed[i]; } + + /** Fast ASCII check for 'N' or 'n' without decoding. */ + public boolean isNBase(byte b) { + return ((b | 0x20) == 'n'); + } + + public static SequenceAlphabet defaultNucleotideAlphabet() { return new SequenceAlphabet("ACGTURYSWKMBDHVNacgturyswkmbdhvn-.*"); } From f0fbfb0abac9226f4374fab983c6ca4841db60f8 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 3 Dec 2025 15:32:53 +0000 Subject: [PATCH 09/31] wiup --- .../exception/FastaFileException.java | 10 + .../exception/FastaReadException.java | 10 - .../ebi/embl/gff3tools/fasta/FastaEntry.java | 11 +- .../gff3tools/fasta/FastaEntryInternal.java | 17 + .../gff3tools/fasta/FastaFileService.java | 86 ++++ .../ebi/embl/gff3tools/fasta/FastaReader.java | 8 - .../fasta/SequentialFastaEntryReader.java | 422 ------------------ .../fasta/SequentialFastaFileReader.java | 193 ++++++++ .../fasta/{ => headerutils}/FastaHeader.java | 3 +- .../{ => headerutils}/JsonHeaderParser.java | 3 +- .../fasta/{ => headerutils}/ParsedHeader.java | 2 +- .../sequenceutils/SequenceIndexBuilder.java | 230 ++++++++++ .../gff3tools/fasta/FASTAFileReaderTest.java | 54 --- .../JsonHeaderParserTest.java | 6 +- .../SequenceIndexBuilderTest.java | 181 ++++++++ 15 files changed, 732 insertions(+), 504 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java delete mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java delete mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java delete mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java rename src/main/java/uk/ac/ebi/embl/gff3tools/fasta/{ => headerutils}/FastaHeader.java (84%) rename src/main/java/uk/ac/ebi/embl/gff3tools/fasta/{ => headerutils}/JsonHeaderParser.java (96%) rename src/main/java/uk/ac/ebi/embl/gff3tools/fasta/{ => headerutils}/ParsedHeader.java (65%) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java delete mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java rename src/test/java/uk/ac/ebi/embl/gff3tools/fasta/{ => headerutils}/JsonHeaderParserTest.java (97%) create mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java new file mode 100644 index 00000000..9b51cfce --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java @@ -0,0 +1,10 @@ +package uk.ac.ebi.embl.gff3tools.exception; + +public class FastaFileException extends Exception { + + public FastaFileException() {} + public FastaFileException(String message) { super(message); } + public FastaFileException(Throwable cause) { super(cause); } + public FastaFileException(String message, Throwable cause) { super(message, cause); } +} + diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java deleted file mode 100644 index 09459f79..00000000 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaReadException.java +++ /dev/null @@ -1,10 +0,0 @@ -package uk.ac.ebi.embl.gff3tools.exception; - -public class FastaReadException extends Exception { - - public FastaReadException() {} - public FastaReadException(String message) { super(message); } - public FastaReadException(Throwable cause) { super(cause); } - public FastaReadException(String message, Throwable cause) { super(message, cause); } -} - diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index d3eff22d..07644fb7 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -2,14 +2,15 @@ import lombok.Getter; import lombok.Setter; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.FastaHeader; @Getter @Setter public class FastaEntry { - String id; + String submissionId; + String accessionId; FastaHeader header; //json info - //information needed for accessing the file - long fastaStartByte; // position of '>' in the file - SequenceIndex sequenceIndex; // a smart index for querying ranges in the file + long totalBases; + long startCountNs; + long endCountNs; } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java new file mode 100644 index 00000000..a75c94e3 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java @@ -0,0 +1,17 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import lombok.Getter; +import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.FastaHeader; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; + +@Getter +@Setter +class FastaEntryInternal { + String submissionId; + String accessionId; + FastaHeader header; //json info + //information needed for accessing the file + long fastaStartByte; // position of '>' in the file + SequenceIndex sequenceIndex; // a smart index for querying ranges in the file +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java new file mode 100644 index 00000000..0595610c --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -0,0 +1,86 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * Owns a SequentialFastaEntryReader, keeps all entries + indexes in memory, supports ID renames, + * and serves base-range slices by mapping (N..M bases) -> byte span via the cached SequenceIndex, + * then asking the reader to stream bytes while skipping newlines. + */ +public final class FastaFileService{ + + private final File file; + private SequentialFastaFileReader reader; // owned here + + private final List entriesArchive; + + public FastaFileService(File file) throws FastaFileException { + this.file = Objects.requireNonNull(file, "file"); + entriesArchive = new ArrayList<>(); + } + + // ---------------------------- queries ---------------------------- + + public List getAllReadFastaEntries() { + return Collections.unmodifiableList(entriesArchive); + } + + public Optional getPreviouslyReadFasta(String accessionId) throws FastaFileException { + return Optional.empty(); + } + + public Optional getNewEntry(String newAccessionId) throws FastaFileException { //TODO it would be better if instead of getting the accessionId here, we can just call the accessionId generator service after (optionally) managing to read the entry + return Optional.empty(); + } + + /** + * Return a sequence slice for [fromBase..toBase] (1-based, inclusive) for the given ID. + * Uses the cached index to translate bases -> bytes, then asks the reader to stream + * ASCII bytes while skipping '\n' and '\r' on the fly. + */ + public Optional getSequenceRange(String accessionId, long fromBase, long toBase) throws FastaFileException { + ensureFileReaderOpen(); + //TODO + return Optional.empty(); + } + + // ---------------------------- interactions with the reader ---------------------------- + + /** Open the underlying reader and scan all entries and indexes into memory. */ + public boolean readNewEntry (String accessionId) throws FastaFileException { + return false; + } + + private void open() throws FastaFileException { + ensureFileReaderClosed(); // if already open, close first + try { + reader = new SequentialFastaFileReader(file); + } catch (IOException ioe) { + throw new FastaFileException("Failed to open FASTA reader: " + file.getAbsolutePath(), ioe); + } + } + + /** Close the reader. Safe to call multiple times. */ + private void close() throws FastaFileException { + if (reader != null) { + try { reader.close(); } + catch (IOException ioe) { + throw new FastaFileException("Failed to close FASTA reader: " + file.getAbsolutePath(), ioe); + } + reader = null; + } + } + + private void ensureFileReaderClosed() throws FastaFileException { + if (reader != null) close(); + } + + private void ensureFileReaderOpen() { + if (reader == null) throw new IllegalStateException("Service is not open. Call open() first."); + } +} + diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java deleted file mode 100644 index 8fb38881..00000000 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaReader.java +++ /dev/null @@ -1,8 +0,0 @@ -package uk.ac.ebi.embl.gff3tools.fasta; - -public interface Fasta { - - FastaEntry getCurrentEntry(); - - -} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java deleted file mode 100644 index 5df07519..00000000 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaEntryReader.java +++ /dev/null @@ -1,422 +0,0 @@ -package uk.ac.ebi.embl.gff3tools.fasta; - -import uk.ac.ebi.embl.gff3tools.exception.FastaReadException; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.LineEntry; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.charset.StandardCharsets; -import java.util.*; - -public class SequentialFastaEntryReader implements AutoCloseable { - - // -------- constants - private static final int SCAN_BUF_SIZE = 64 * 1024; - private static final int COUNT_BUF_SIZE = 8 * 1024; - private static final byte GT = (byte) '>'; - private static final byte LF = (byte) '\n'; - - // -------- file + config - private final FileChannel channel; - private final long fileSize; - private final JsonHeaderParser headerParser; - private final SequenceAlphabet alphabet; - - // -------- state/result - private final Map indexById = new LinkedHashMap<>(); - - // -------- ctor - public SequentialFastaEntryReader(File file) throws IOException { - this(file, new JsonHeaderParser(), SequenceAlphabet.defaultNucleotideAlphabet()); - } - - public SequentialFastaEntryReader(File file, JsonHeaderParser parser, SequenceAlphabet alphabet) - throws FileNotFoundException, IOException { - Objects.requireNonNull(file, "Input FASTA file is null"); - if (!file.exists()) throw new FileNotFoundException(file.getAbsolutePath()); - if (file.isDirectory()) throw new FileNotFoundException("Directory: " + file.getAbsolutePath()); - if (!file.canRead()) throw new IllegalArgumentException("No read permission: " + file.getAbsolutePath()); - - this.headerParser = Objects.requireNonNull(parser, "parser"); - this.alphabet = Objects.requireNonNull(alphabet, "alphabet"); - - this.channel = new FileInputStream(file).getChannel(); //exception will bubble up if fails - this.fileSize = channel.size(); - } - - // -------- lifecycle of the read - @Override public void close() throws IOException { channel.close(); } - public boolean readingFile() { return channel.isOpen(); } - - // -------- main iteration - - /** Reads the next FASTA entry, if there is none it returns an empty object*/ - public Optional readNext(long from) throws FastaReadException { - try { - OptionalLong headerPosition = seekToNextHeader(from); - if (headerPosition.isEmpty()) return Optional.empty(); // no next FASTA entry detected - channel.position(headerPosition.getAsLong()); //otherwise, position at start of new fasta - - // parse header & build sequence index - String headerLine = readAsciiLineFromCurrentPosition(); // scan first line, parser - if (headerLine == null) throw new FastaReadException("Header is malformed"); - ParsedHeader ph = headerParser.parse(headerLine); - SequenceIndex idx = buildSequenceIndex(); - - // produce current entry - FastaEntry newEntry = new FastaEntry(); - newEntry.setId(ph.getId()); - newEntry.setHeader(ph.getHeader()); - newEntry.setFastaStartByte(headerPosition.getAsLong()); - newEntry.setSequenceIndex(idx); - - return Optional.of(newEntry); - - } catch (IOException io) { - long position; - try{ - position= channel.position(); - } catch (IOException e) { - position = -1; - } - throw new FastaReadException("I/O while reading FASTA at byte " + position + ": " + io.getMessage(), io); - } - } - - // ===================================================================== - // = SCANNING (to next header) = - // ===================================================================== - - /** Finds next header ('>') that starts a line (at file start or after LF). */ - private OptionalLong seekToNextHeader(long from) throws IOException { - if (from >= fileSize) return OptionalLong.empty(); - - ByteBuffer buf = ByteBuffer.allocateDirect(SCAN_BUF_SIZE); - - while (from < fileSize) { - buf.clear(); - int want = (int) Math.min(buf.capacity(), fileSize - from); - buf.limit(want); - int n = channel.read(buf, from); - if (n <= 0) break; - buf.flip(); - - while (buf.hasRemaining()) { - long positionToCheck = from + buf.position(); - byte b = buf.get(); - if (peekIfFastaHeaderStart(b, positionToCheck)) { - return OptionalLong.of(positionToCheck); - } - } - from += n; - } - return OptionalLong.empty(); - } - - /** Reads until end of one ASCII line from current channel.position() ( '\n' terminated or EOF). - * Advances channel past end of line or to EOF */ - private String readAsciiLineFromCurrentPosition() throws IOException { - long scanPos = channel.position(); - if (scanPos >= fileSize) return null; - - StringBuilder sb = new StringBuilder(256); - ByteBuffer buf = ByteBuffer.allocateDirect(SCAN_BUF_SIZE); - - while (scanPos < fileSize) { - - // fill buffer from disk - buf.clear(); - int want = (int) Math.min(buf.capacity(), fileSize - scanPos); - buf.limit(want); - int n = channel.read(buf, scanPos); - if (n <= 0) break; - - buf.flip(); - - // find end of line in the bytes we already have - int lfIndex = indexOf(buf, LF); - if (lfIndex >= 0) - {// if end of line found, append bytes up to (not including) LF - appendAscii(sb, buf, lfIndex); - long nextLineStart = scanPos + lfIndex + 1; // consume LF - channel.position(nextLineStart); - return sb.toString(); - } else { - // no LF in this chunk; append all bytes and continue - appendAscii(sb, buf, buf.remaining()); - scanPos += n; - } - } - - // read up to EOF - channel.position(fileSize); - return sb.toString(); - } - - /** gets index of a target byte character in a byte buffer, returns -1 if not found **/ - private static int indexOf(ByteBuffer buf, byte target) { - for (int i = 0; i < buf.remaining(); i++) { - if (buf.get(buf.position() + i) == target) return i; - } - return -1; - } - - /** Append exactly len bytes from buf to sb as US-ASCII, advancing buf.position() by len. */ - private static void appendAscii(StringBuilder sb, ByteBuffer buf, int len) { - byte[] chunk = new byte[len]; - buf.get(chunk); - sb.append(new String(chunk, java.nio.charset.StandardCharsets.US_ASCII)); - } - - // ============================================================================= - // = SEQUENCE INDEX SCAN & BUILD = - // ============================================================================= - - /** Builds the per-line index for the sequence starting at current position (right after header line). */ - private SequenceIndex buildSequenceIndex() throws IOException { - ScanState s = initScanState(channel.position()); - ByteBuffer buf = newScanBuffer(); - - while (s.pos we hit next header; stop scanning - s.pos += n; - } - - finalizeOpenLineIfAny(s); - // leave channel at next header (or EOF) - channel.position(s.nextHdr); - - long startN = 0, endN = 0; - if (!s.lines.isEmpty()) { - LineEntry first = s.lines.get(0); - LineEntry last = s.lines.get(s.lines.size() - 1); - startN = countLeadingNsInLine(first); - endN = countTrailingNsInLine(last); - } - - return new SequenceIndex(s.firstBaseByte, startN, s.lastBaseByte, endN, s.lines); - } - -// --------------------------------------------------------------------- -// scan helpers -// --------------------------------------------------------------------- - - /** All mutable scanning state for one sequence. */ - private static final class ScanState { - long pos; // absolute file position we’re scanning from - long firstBaseByte = -1; // byte of first allowed base seen - long lastBaseByte = -1; // byte of last allowed base seen - long nextHdr; // byte of next '>' that starts a line (or fileSize) - - long lineFirstByte = -1; // byte of first base in current line - long lineLastByte = -1; // byte of last base in current line - long basesSoFar = 0; // total bases committed to s.lines - long basesInLine = 0; // bases accumulated in current line (not yet committed) - - final java.util.ArrayList lines = new java.util.ArrayList<>(256); - ScanState(long startPos, long fileSize) { this.pos = startPos; this.nextHdr = fileSize; } - } - - private ScanState initScanState(long startPos) { - return new ScanState(startPos, fileSize); - } - - private ByteBuffer newScanBuffer() { - return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); - } - - private int fillBuffer(ByteBuffer buf, long at) throws IOException { - buf.clear(); - int want = (int) Math.min(buf.capacity(), fileSize - at); - buf.limit(want); - return channel.read(buf, at); - } - - /** - * Process a filled buffer. Returns true if scanning should stop (we found the next header), - * false to keep scanning. - */ - private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException { - buf.flip(); - while (buf.hasRemaining()) { - // capture index BEFORE get(), so abs == s.pos + idx - int idx = buf.position(); - byte b = buf.get(); - long abs = s.pos + idx; - - if (peekIfFastaHeaderStart(b, abs)) { - s.nextHdr = abs; - commitOpenLineIfAny(s); - return true; // done scanning current entry - } - - if (isNewline(b)) { - commitOpenLineIfAny(s); - continue; - } - - if (isAllowedBase(b)) { - observeBaseByte(abs, s); - continue; - } - - // else: ignore unexpected bytes (spec: sequence lines contain bases + '\n') - } - return false; - } - - // --------------------------------------------------------------------- - // state mutation helpers (lines) - // --------------------------------------------------------------------- - - /** We saw a base at absolute byte 'abs'. Update current line + first/last markers. */ - private void observeBaseByte(long abs, ScanState s) { - if (s.lineFirstByte < 0) s.lineFirstByte = abs; - s.lineLastByte = abs; - s.basesInLine++; - - if (s.firstBaseByte < 0) s.firstBaseByte = abs; - s.lastBaseByte = abs; - } - - /** If current line has bases, convert it into a LineEntry and reset line accumulators. */ - private void commitOpenLineIfAny(ScanState s) { - if (s.basesInLine <= 0) return; - - long baseStart = s.basesSoFar + 1; - long baseEnd = s.basesSoFar + s.basesInLine; - long byteStart = s.lineFirstByte; - long byteEndEx = s.lineLastByte + 1; // half-open - - s.lines.add(new LineEntry(baseStart, baseEnd, byteStart, byteEndEx)); - - s.basesSoFar += s.basesInLine; - s.basesInLine = 0; - s.lineFirstByte = -1; - s.lineLastByte = -1; - } - - /** If EOF hit with an unterminated line, commit it. */ - private void finalizeOpenLineIfAny(ScanState s) { - commitOpenLineIfAny(s); - } - - - private static void addLine(List lines, - long basesSoFar, - long basesInCurrentLine, - long firstByte, long lastByte) { - long baseStart = basesSoFar + 1; - long baseEnd = basesSoFar + basesInCurrentLine; - // byteEndExclusive is lastByte + 1 (ASCII 1 byte/base) - lines.add(new LineEntry(baseStart, baseEnd, firstByte, lastByte + 1)); - } - - // ===================================================================== - // = EDGE 'N' COUNT HELPERS = - // ===================================================================== - - /** Count leading 'N'/'n' in the given line’s byte range. */ - private long countLeadingNsInLine(LineEntry line) throws IOException { - long remaining = line.lengthBytes(); - long offset = line.byteStart; - long count = 0; - - ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); - - while (remaining > 0) { - buf.clear(); - int want = (int) Math.min(buf.capacity(), remaining); - buf.limit(want); - int n = channel.read(buf, offset); - if (n <= 0) break; - buf.flip(); - - for (int i = 0; i < n; i++) { - byte b = buf.get(); - if (isNBase(b)) { - count++; - } else { - return count; - } - } - remaining -= n; - offset += n; - } - return count; - } - - /** Count trailing 'N'/'n' in the given line’s byte range (scan forward, track tail run). */ - private long countTrailingNsInLine(LineEntry line) throws IOException { - long remaining = line.lengthBytes(); - long offset = line.byteStart; - long trailingRun = 0; - - ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); - - while (remaining > 0) { - buf.clear(); - int want = (int) Math.min(buf.capacity(), remaining); - buf.limit(want); - int n = channel.read(buf, offset); - if (n <= 0) break; - buf.flip(); - - for (int i = 0; i < n; i++) { - byte b = buf.get(); - if (isNBase(b)) { - trailingRun++; // extend current tail run - } else { - trailingRun = 0; // reset tail run on any non-N - } - } - remaining -= n; - offset += n; - } - return trailingRun; - } - - // --------------------------------------------------------------------- - // helper peek byte functions - // --------------------------------------------------------------------- - - - /** True if absolute position is at file start OR previous byte is '\n'. */ - private boolean peekIfLineStart(long absPos) throws IOException { - if (absPos == 0) return true; - if (absPos > fileSize) return false; - return peekByte(absPos - 1) == LF; - } - - /** Absolute peek (does not change channel.position()). Returns 0 if OOB. */ - private byte peekByte(long absPos) throws IOException { - if (absPos < 0 || absPos >= fileSize) return 0; - ByteBuffer one = ByteBuffer.allocate(1); - int n = channel.read(one, absPos); - return (n == 1) ? one.get(0) : 0; - } - - private boolean peekIfFastaHeaderStart(byte b, long abs) throws IOException { - return b == GT && peekIfLineStart(abs); - } - - private boolean isNewline(byte b) { - return b == LF; // we already normalize CRLF earlier; lines are LF-terminated in scanning - } - - private boolean isAllowedBase(byte b) { - return alphabet.isAllowed(b); - } - - private boolean isNBase(byte b) { - return alphabet.isNBase(b); - } -} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java new file mode 100644 index 00000000..a734ea33 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -0,0 +1,193 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.JsonHeaderParser; +import uk.ac.ebi.embl.gff3tools.fasta.headerutils.ParsedHeader; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndexBuilder; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.*; + +public class SequentialFastaFileReader implements AutoCloseable { + + private static final int BUFFER_SIZE = 64 * 1024; + private static final byte GT = (byte) '>'; + private static final byte LF = (byte) '\n'; + private static final byte CR = (byte) '\r'; + + private final FileChannel channel; + private final long fileSize; + private final JsonHeaderParser headerParser; + private final SequenceAlphabet alphabet; + + public SequentialFastaFileReader(File file) throws IOException { + this(file, new JsonHeaderParser(), SequenceAlphabet.defaultNucleotideAlphabet()); + } + + public SequentialFastaFileReader(File file, JsonHeaderParser parser, SequenceAlphabet alphabet) throws IOException { + Objects.requireNonNull(file, "Input FASTA file is null"); + if (!file.exists()) throw new FileNotFoundException(file.getAbsolutePath()); + if (file.isDirectory()) throw new FileNotFoundException("Directory: " + file.getAbsolutePath()); + if (!file.canRead()) throw new IllegalArgumentException("No read permission: " + file.getAbsolutePath()); + this.headerParser = Objects.requireNonNull(parser, "parser"); + this.alphabet = Objects.requireNonNull(alphabet, "alphabet"); + this.channel = new FileInputStream(file).getChannel(); + this.fileSize = channel.size(); + } + + @Override public void close() throws IOException { channel.close(); } + public boolean readingFile() { return channel.isOpen(); } + + + /** Reads the next FASTA entry starting at or after 'from'. */ + public Optional readNext(long from) throws FastaFileException { + try { + OptionalLong headerPosOpt = seekToNextHeader(from); + if (headerPosOpt.isEmpty()) return Optional.empty(); + + long headerPos = headerPosOpt.getAsLong(); + channel.position(headerPos); + + String headerLine = readAsciiLineFromCurrentPosition(); + if (headerLine == null) throw new FastaFileException("Header is malformed at byte " + headerPos); + ParsedHeader ph = headerParser.parse(headerLine); + + long sequenceStartPos = channel.position(); // first byte after header line is the sequence position + SequenceIndexBuilder sib = new SequenceIndexBuilder(channel, fileSize, alphabet); + SequenceIndexBuilder.Result res = sib.buildFrom(sequenceStartPos); + + // Move reader cursor to the sequence start position + channel.position(sequenceStartPos); + + FastaEntryInternal e = new FastaEntryInternal(); + e.setSubmissionId(ph.getId()); + e.setHeader(ph.getHeader()); + e.setFastaStartByte(headerPos); + e.setSequenceIndex(res.index); + + return Optional.of(e); + } catch (IOException io) { + long pos = safePos(); + throw new FastaFileException("I/O while reading FASTA at byte " + pos + ": " + io.getMessage(), io); + } + } + + /** Read ASCII bytes from [byteStart, byteEndExclusive) skipping LF/CR; does not change channel.position(). */ + public String readAsciiWithoutNewlines(long byteStart, long byteEndExclusive) throws IOException { + if (byteStart < 0 || byteEndExclusive < byteStart || byteEndExclusive > fileSize) { + throw new IllegalArgumentException("Bad byte window: " + byteStart + ".." + byteEndExclusive); + } + long remain = byteEndExclusive - byteStart; + long off = byteStart; + + // pre-size builder with a sane cap (skip newlines, so content <= remain) + int expect = (int) Math.min(remain, 1_000_000L); + StringBuilder sb = new StringBuilder(expect); + + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + while (remain > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remain); + buf.limit(want); + int n = channel.read(buf, off); + if (n <= 0) break; + buf.flip(); + while (buf.hasRemaining()) { + byte b = buf.get(); + if (b == LF || b == CR) continue; // omit line breaks on the fly + sb.append((char)(b & 0xFF)); // ASCII + } + remain -= n; + off += n; + } + return sb.toString(); + } + + // ------------------ header seeking & line reading ------------------ + + private OptionalLong seekToNextHeader(long from) throws IOException { + if (from >= fileSize) return OptionalLong.empty(); + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + + while (from < fileSize) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - from); + buf.limit(want); + int n = channel.read(buf, from); + if (n <= 0) break; + buf.flip(); + while (buf.hasRemaining()) { + long abs = from + buf.position(); + if (buf.get() == GT && isLineStart(abs)) { + return OptionalLong.of(abs); + } + } + from += n; + } + return OptionalLong.empty(); + } + + private boolean isLineStart(long abs) throws IOException { + if (abs == 0) return true; + if (abs > fileSize) return false; + return peek(abs - 1) == LF; + } + + private byte peek(long abs) throws IOException { + if (abs < 0 || abs >= fileSize) return 0; + ByteBuffer one = ByteBuffer.allocate(1); + int n = channel.read(one, abs); + return (n == 1) ? one.get(0) : 0; + } + + /** Reads one ASCII line from current position, advances past LF or to EOF. */ + private String readAsciiLineFromCurrentPosition() throws IOException { + long scanPos = channel.position(); + if (scanPos >= fileSize) return null; + + StringBuilder sb = new StringBuilder(256); + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + + while (scanPos < fileSize) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - scanPos); + buf.limit(want); + int n = channel.read(buf, scanPos); + if (n <= 0) break; + buf.flip(); + + int lfIndex = indexOf(buf, LF); + if (lfIndex >= 0) { + appendAscii(sb, buf, lfIndex); + long nextLineStart = scanPos + lfIndex + 1; // consume LF + channel.position(nextLineStart); + return sb.toString(); + } else { + appendAscii(sb, buf, buf.remaining()); + scanPos += n; + } + } + channel.position(fileSize); + return sb.toString(); + } + + private static int indexOf(ByteBuffer buf, byte target) { + for (int i = 0; i < buf.remaining(); i++) { + if (buf.get(buf.position() + i) == target) return i; + } + return -1; + } + + private static void appendAscii(StringBuilder sb, ByteBuffer buf, int len) { + byte[] chunk = new byte[len]; + buf.get(chunk); + sb.append(new String(chunk, java.nio.charset.StandardCharsets.US_ASCII)); + } + + private long safePos() { + try { return channel.position(); } catch (IOException e) { return -1; } + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java similarity index 84% rename from src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java rename to src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java index 08b7115a..d390e3da 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java @@ -1,7 +1,8 @@ -package uk.ac.ebi.embl.gff3tools.fasta; +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import lombok.Getter; import lombok.Setter; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; import java.util.Optional; diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java similarity index 96% rename from src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java rename to src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index 487169ff..b8663d71 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -1,7 +1,8 @@ -package uk.ac.ebi.embl.gff3tools.fasta; +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; import java.io.IOException; import java.util.*; diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java similarity index 65% rename from src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java rename to src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java index 0a5e5144..d45a9462 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/ParsedHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java @@ -1,4 +1,4 @@ -package uk.ac.ebi.embl.gff3tools.fasta; +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import lombok.Value; diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java new file mode 100644 index 00000000..00c80240 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -0,0 +1,230 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.List; + +public final class SequenceIndexBuilder { + + private static final int SCAN_BUF_SIZE = 64 * 1024; + private static final int COUNT_BUF_SIZE = 8 * 1024; + + private static final byte GT = (byte) '>'; + private static final byte LF = (byte) '\n'; + + public static final class Result { + public final SequenceIndex index; + public final long nextHeaderByte; // byte offset of next '>' at line start, or fileSize (EOF) + public Result(SequenceIndex index, long nextHeaderByte) { + this.index = index; this.nextHeaderByte = nextHeaderByte; + } + } + + private final FileChannel ch; + private final long fileSize; + private final SequenceAlphabet alphabet; + + public SequenceIndexBuilder(FileChannel ch, long fileSize, SequenceAlphabet alphabet) { + this.ch = ch; + this.fileSize = fileSize; + this.alphabet = alphabet; + } + + /** Build a SequenceIndex starting at 'startPos' (first byte after header line). */ + public Result buildFrom(long startPos) throws IOException { + ScanState s = new ScanState(startPos, fileSize); + ByteBuffer buf = newScanBuffer(); + + // ------------- scan raw bytes into provisional "sequence lines" ------------- + while (hasMore(s.pos)) { + int n = fillBuffer(buf, s.pos); + if (n <= 0) break; + if (processBuffer(buf, s)) break; // found next header + s.pos += n; + } + commitOpenLineIfAny(s); + + // ------------- filter window & compute metadata (requirements 1–4) ------------- + List filtered = filterLinesWithinWindow(s.lines, s.firstBaseByte, s.nextHdr); + + long firstBaseByte = filtered.isEmpty() ? -1 : filtered.get(0).byteStart; + long lastBaseByte = filtered.isEmpty() ? -1 : (filtered.get(filtered.size()-1).byteEndExclusive - 1); + + long startN = 0, endN = 0; + if (!filtered.isEmpty()) { + startN = countLeadingNs(filtered.get(0)); // (3) only first line + endN = countTrailingNs(filtered.get(filtered.size()-1)); // (4) only last line + } + + SequenceIndex idx = new SequenceIndex(firstBaseByte, startN, lastBaseByte, endN, filtered); + return new Result(idx, s.nextHdr); + } + + // ===================================================================== + // = scanning core = + // ===================================================================== + + private static final class ScanState { + long pos; // absolute scan position + long firstBaseByte = -1; // first allowed base byte seen + long lastBaseByte = -1; // last allowed base byte seen + long nextHdr; // byte of next header (or fileSize) + + long lineFirstByte = -1; // first allowed base byte in current line + long lineLastByte = -1; // last allowed base byte in current line + long basesSoFar = 0; + long basesInLine = 0; + + final ArrayList lines = new ArrayList<>(256); + ScanState(long startPos, long fileSize) { this.pos = startPos; this.nextHdr = fileSize; } + } + + private boolean hasMore(long p){ return p < fileSize; } + + private ByteBuffer newScanBuffer() { return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); } + + private int fillBuffer(ByteBuffer buf, long at) throws IOException { + buf.clear(); + int want = (int) Math.min(buf.capacity(), fileSize - at); + buf.limit(want); + return ch.read(buf, at); // absolute read; does not touch ch.position() + } + + /** Returns true if we hit the next header and should stop scanning this entry. */ + private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException { + buf.flip(); + while (buf.hasRemaining()) { + int idx = buf.position(); + byte b = buf.get(); + long abs = s.pos + idx; + + if (isHeaderStart(b, abs)) { + s.nextHdr = abs; // stop window at header byte + commitOpenLineIfAny(s); // finalize any in-flight line + return true; + } + if (b == LF) { // end of a displayed sequence line + commitOpenLineIfAny(s); // (2) only lines with bases are committed + continue; + } + if (alphabet.isAllowed(b)) { + observeBase(abs, s); + } + // else: ignore non-allowed, non-newline junk on the line + } + return false; + } + + private boolean isHeaderStart(byte b, long abs) throws IOException { + return b == GT && isLineStart(abs); + } + + /** header must be at file start or immediately after LF */ + private boolean isLineStart(long abs) throws IOException { + if (abs == 0) return true; + if (abs > fileSize) return false; + return peek(abs - 1) == LF; + } + + private byte peek(long abs) throws IOException { + if (abs < 0 || abs >= fileSize) return 0; + ByteBuffer one = ByteBuffer.allocate(1); + int n = ch.read(one, abs); + return (n == 1) ? one.get(0) : 0; + } + + private void observeBase(long abs, ScanState s) { + if (s.lineFirstByte < 0) s.lineFirstByte = abs; + s.lineLastByte = abs; + s.basesInLine++; + + if (s.firstBaseByte < 0) s.firstBaseByte = abs; + s.lastBaseByte = abs; + } + + private void commitOpenLineIfAny(ScanState s) { + if (s.basesInLine <= 0) return; // (2) skip empty lines + long baseStart = s.basesSoFar + 1; + long baseEnd = s.basesSoFar + s.basesInLine; + long byteStart = s.lineFirstByte; + long byteEndEx = s.lineLastByte + 1; // half-open + + s.lines.add(new LineEntry(baseStart, baseEnd, byteStart, byteEndEx)); + + s.basesSoFar += s.basesInLine; + s.basesInLine = 0; + s.lineFirstByte = -1; + s.lineLastByte = -1; + } + + // ===================================================================== + // = window filter & edge N counting = + // ===================================================================== + + /** (1)+(2) Keep only lines fully inside [firstBaseByte, nextHdr) and already non-empty. */ + private List filterLinesWithinWindow(List raw, + long firstBaseByte, + long nextHdr) { + if (firstBaseByte < 0 || raw.isEmpty()) return List.of(); + ArrayList out = new ArrayList<>(raw.size()); + for (LineEntry L : raw) { + if (L.byteStart >= firstBaseByte && L.byteEndExclusive <= nextHdr) { + out.add(L); + } + } + // baseStart/baseEnd are already contiguous (1..N) in raw; filtering preserves order & numbering + return out; + } + + /** (3) count 'N'/'n' from the start of the first sequence line only. */ + private long countLeadingNs(LineEntry line) throws IOException { + long remaining = line.lengthBytes(); + long offset = line.byteStart; + long count = 0; + + ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); + while (remaining > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remaining); + buf.limit(want); + int n = ch.read(buf, offset); + if (n <= 0) break; + buf.flip(); + for (int i = 0; i < n; i++) { + byte b = buf.get(); + if (alphabet.isNBase(b)) count++; + else return count; + } + remaining -= n; + offset += n; + } + return count; + } + + /** (4) count 'N'/'n' at the tail of the last sequence line only. */ + private long countTrailingNs(LineEntry line) throws IOException { + long remaining = line.lengthBytes(); + long offset = line.byteStart; + long trailing = 0; + + ByteBuffer buf = ByteBuffer.allocateDirect(COUNT_BUF_SIZE); + while (remaining > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remaining); + buf.limit(want); + int n = ch.read(buf, offset); + if (n <= 0) break; + buf.flip(); + for (int i = 0; i < n; i++) { + byte b = buf.get(); + if (alphabet.isNBase(b)) trailing++; + else trailing = 0; + } + remaining -= n; + offset += n; + } + return trailing; + } +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java deleted file mode 100644 index a556f5f4..00000000 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FASTAFileReaderTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package uk.ac.ebi.embl.gff3tools.fasta; - -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.net.URI; -import java.nio.file.Paths; -import java.util.List; -import java.util.Objects; - -import static org.junit.jupiter.api.Assertions.*; - -public class FASTAFileReaderTest { -/* - @Test - void readsExampleAndParsesIdsAndHeaderJson() throws Exception { - URI uri = Objects.requireNonNull( - getClass().getClassLoader().getResource("fasta/example.txt"), - "Test resource fasta/example.txt is missing" - ).toURI(); - File file = Paths.get(uri).toFile(); - - FastaReader reader = new FastaReader(); - List records = reader.readFile(file); - - // We expect two records (your two headers), not counting the "NONSENSE" lines. - assertEquals(2, records.size(), "Should parse two FASTA records"); - - // ---- Record 1 ---- - FastaEntry r1 = records.get(0); - assertEquals("AF123456.1", r1.getId(), - "Accession should be the first token between '>' and '|' (trimmed)"); - FastaHeader h1 = r1.getHeader(); - assertNotNull(h1, "Header must be present"); - assertEquals("Pinus sativa isolate xyz, complete mitochondrion", h1.getDescription()); - assertEquals("genomic", h1.getMoleculeType()); - assertEquals(Topology.CIRCULAR, h1.getTopology()); - assertTrue(h1.getChromosomeType().isEmpty()); - assertTrue(h1.getChromosomeLocation().isEmpty()); - assertTrue(h1.getChromosomeName().isEmpty()); - - // ---- Record 2 ---- - FastaEntry r2 = records.get(1); - assertEquals("AF123455.2", r2.getId(), - "Second accession should be parsed the same way"); - FastaHeader h2 = r2.getHeader(); - assertNotNull(h2); - assertEquals("Pinus sativa isolate xyz, complete mitochondrion", h2.getDescription()); - assertEquals("genomic", h2.getMoleculeType()); - assertEquals(Topology.CIRCULAR, h2.getTopology()); - } - *? - */ -} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java similarity index 97% rename from src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java rename to src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java index f73b0930..d7743f8f 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/JsonHeaderParserTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java @@ -1,6 +1,8 @@ -package uk.ac.ebi.embl.gff3tools.fasta; +package uk.ac.ebi.embl.gff3tools.fasta.headerutils; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; import java.io.IOException; import java.util.Optional; @@ -21,7 +23,7 @@ void parsesStandardHeaderWithJson() { FastaHeader h = ph.getHeader(); assertEquals("Pinus sativa", h.getDescription()); assertEquals("genomic", h.getMoleculeType()); - assertEquals(Topology.CIRCULAR, h.getTopology()); + Assertions.assertEquals(Topology.CIRCULAR, h.getTopology()); assertTrue(h.getChromosomeType().isEmpty()); assertTrue(h.getChromosomeLocation().isEmpty()); assertTrue(h.getChromosomeName().isEmpty()); diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java new file mode 100644 index 00000000..a5e352d7 --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java @@ -0,0 +1,181 @@ +package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +public class SequenceIndexBuilderTest { + + @TempDir + Path tempDir; + + private static FileChannel openRead(Path p) throws IOException { + return FileChannel.open(p, StandardOpenOption.READ); + } + + private static Path writeAscii(Path dir, String filename, String content) throws IOException { + Path p = dir.resolve(filename); + Files.write(p, content.getBytes(StandardCharsets.US_ASCII)); + return p; + } + + @Test + void buildsIndex_ignoresEmptyLines_countsEdgeNs_only_withinWindow() throws Exception { + // Layout (US-ASCII): + // >ID1 | {"d":"x"}\n + // NNAC\n + // acgt\n + // ttnN\n + // \n + // \t\n + // \n + // >NEXT\n + String header = ">ID1 | {\"d\":\"x\"}\n"; + String l1 = "NNAC\n"; // leading N=2 + String l2 = "acgt\n"; + String l3 = "ttnN\n"; // trailing N=2 + String empties = "\n\t\n\n"; + String nextHead = ">NEXT\n"; + + String fasta = header + l1 + l2 + l3 + empties + nextHead; + Path p = writeAscii(tempDir, "idx1.fa", fasta); + + try (FileChannel ch = openRead(p)) { + long fileSize = ch.size(); + long seqStartPos = header.getBytes(StandardCharsets.US_ASCII).length; // first byte after header line + + SequenceAlphabet alpha = SequenceAlphabet.defaultNucleotideAlphabet(); + SequenceIndexBuilder sib = new SequenceIndexBuilder(ch, fileSize, alpha); + + long beforePos = ch.position(); // should remain unchanged + SequenceIndexBuilder.Result res = sib.buildFrom(seqStartPos); + long afterPos = ch.position(); + + // builder must not touch channel.position() + assertEquals(beforePos, afterPos, "builder must not change channel.position()"); + + SequenceIndex idx = res.index; + + // Lines: only 3 sequence lines; empties ignored + List lines = idx.linesView(); + assertEquals(3, lines.size(), "only non-empty sequence lines must be indexed"); + + // Base numbering should be contiguous across lines (4 bases per line) + assertEquals(1, lines.get(0).baseStart); + assertEquals(4, lines.get(0).baseEnd); + assertEquals(5, lines.get(1).baseStart); + assertEquals(8, lines.get(1).baseEnd); + assertEquals(9, lines.get(2).baseStart); + assertEquals(12, lines.get(2).baseEnd); + + // Byte math: each line has 4 letters; byteEndExclusive = lastBaseByte + 1 + long l1Start = seqStartPos; // begins right after header line + long l1EndEx = l1Start + 4; + long l2Start = l1EndEx + 1; // + LF between lines + long l2EndEx = l2Start + 4; + long l3Start = l2EndEx + 1; + long l3EndEx = l3Start + 4; + + assertEquals(l1Start, lines.get(0).byteStart); + assertEquals(l1EndEx, lines.get(0).byteEndExclusive); + + assertEquals(l2Start, lines.get(1).byteStart); + assertEquals(l2EndEx, lines.get(1).byteEndExclusive); + + assertEquals(l3Start, lines.get(2).byteStart); + assertEquals(l3EndEx, lines.get(2).byteEndExclusive); + + // first/last base bytes + assertEquals(l1Start, idx.firstBaseByte); + assertEquals(l3EndEx - 1, idx.lastBaseByte); + + // Edge N counting: only first and last lines are inspected + assertEquals(2, idx.startNBasesCount, "leading Ns only from first sequence line"); + assertEquals(2, idx.endNBasesCount, "trailing Ns only from last sequence line"); + + // nextHeaderByte should point to '>' of NEXT header + long expectedNextHeader = header.length() + l1.length() + l2.length() + l3.length() + empties.length(); + assertEquals(expectedNextHeader, res.nextHeaderByte); + } + } + + @Test + void supportsCRLF_beforeNextHeader_and_stillWindowsCorrectly() throws Exception { + // Mix CRLF lines in the sequence part; builder uses LF as terminator and ignores CR as non-base. + String header = ">ID2\n"; + // simulate CRLF lines by inserting '\r' before '\n' + String l1 = "NNxx".replace('x','A') + "\r\n"; // "NNAA\r\n" + String l2 = "gggg\r\n"; + String next = ">H2\n"; + + String fasta = header + l1 + l2 + next; + Path p = writeAscii(tempDir, "idx2.fa", fasta); + + try (FileChannel ch = openRead(p)) { + long seqStart = header.getBytes(StandardCharsets.US_ASCII).length; + SequenceIndexBuilder sib = new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet()); + + SequenceIndexBuilder.Result res = sib.buildFrom(seqStart); + SequenceIndex idx = res.index; + + // Two non-empty lines only + assertEquals(2, idx.linesView().size()); + + // leading Ns counted only on first line (here: 2) + assertEquals(2, idx.startNBasesCount); + // no trailing Ns on last line (all 'g') + assertEquals(0, idx.endNBasesCount); + + // nextHeader should be at the '>' byte of H2 + long expectedNext = fasta.lastIndexOf(">H2\n"); // ascii index + assertEquals(expectedNext, res.nextHeaderByte); + } + } + + @Test + void ignoresWhitespaceOnlyLines_and_middleLineNs_doNotAffectEdgeCounts() throws Exception { + String header = ">ID3\n"; + String l1 = "NACG\n"; // leading N = 1 + String l2 = "NNNN\n"; // middle line of Ns — must NOT affect start/end N counts + String blanks = " \n\t\n"; + String l3 = "GGGn\n"; // trailing n = 1 + String next = ">K\n"; + + String fasta = header + l1 + l2 + blanks + l3 + next; + Path p = writeAscii(tempDir, "idx3.fa", fasta); + + try (FileChannel ch = openRead(p)) { + long seqStart = header.getBytes(StandardCharsets.US_ASCII).length; + SequenceIndexBuilder sib = new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet()); + + long before = ch.position(); + SequenceIndexBuilder.Result res = sib.buildFrom(seqStart); + long after = ch.position(); + assertEquals(before, after, "builder must not move channel position"); + + SequenceIndex idx = res.index; + // three non-empty sequence lines: l1, l2, l3 + assertEquals(3, idx.linesView().size()); + + // Edge N counts: only first and last lines considered + assertEquals(1, idx.startNBasesCount, "only first line leading Ns"); + assertEquals(1, idx.endNBasesCount, "only last line trailing Ns"); + + // Middle line of Ns shouldn't change edge counts + assertEquals( idx.linesView().get(1).lengthBases(), 4 ); + + // Total base numbering should be contiguous: 4 + 4 + 4 = 12 + assertEquals(12, idx.totalBasesIncludingEdgeNBases()); + } + } +} + From cac54d7b58d65a1efa5727567a467976136e298c Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 3 Dec 2025 16:12:59 +0000 Subject: [PATCH 10/31] wip --- .../gff3tools/fasta/FastaFileService.java | 41 +++++++++---------- .../gff3tools/fasta/SequenceRangeOption.java | 6 +++ .../fasta/SequentialFastaFileReader.java | 13 +++++- 3 files changed, 38 insertions(+), 22 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index 0595610c..0586ed89 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -1,5 +1,7 @@ package uk.ac.ebi.embl.gff3tools.fasta; +import lombok.Getter; +import lombok.Setter; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import java.io.File; @@ -11,29 +13,27 @@ * and serves base-range slices by mapping (N..M bases) -> byte span via the cached SequenceIndex, * then asking the reader to stream bytes while skipping newlines. */ +@Getter +@Setter public final class FastaFileService{ - private final File file; + private File file; private SequentialFastaFileReader reader; // owned here + public List entriesArchive; + private List entriesInternal; - private final List entriesArchive; - - public FastaFileService(File file) throws FastaFileException { - this.file = Objects.requireNonNull(file, "file"); + public FastaFileService(){ entriesArchive = new ArrayList<>(); + this.file = null; } // ---------------------------- queries ---------------------------- - public List getAllReadFastaEntries() { - return Collections.unmodifiableList(entriesArchive); - } - - public Optional getPreviouslyReadFasta(String accessionId) throws FastaFileException { - return Optional.empty(); + public List getAllReadFastaEntries() { + return new ArrayList<>(); } - public Optional getNewEntry(String newAccessionId) throws FastaFileException { //TODO it would be better if instead of getting the accessionId here, we can just call the accessionId generator service after (optionally) managing to read the entry + public Optional getFasta(String submissionId) throws FastaFileException { return Optional.empty(); } @@ -42,7 +42,7 @@ public Optional getNewEntry(String newAccessionId) throws Fa * Uses the cached index to translate bases -> bytes, then asks the reader to stream * ASCII bytes while skipping '\n' and '\r' on the fly. */ - public Optional getSequenceRange(String accessionId, long fromBase, long toBase) throws FastaFileException { + public Optional getSequenceRange(SequenceRangeOption option, String accessionId, long fromBase, long toBase) throws FastaFileException { ensureFileReaderOpen(); //TODO return Optional.empty(); @@ -50,22 +50,21 @@ public Optional getSequenceRange(String accessionId, long fromBase, long // ---------------------------- interactions with the reader ---------------------------- - /** Open the underlying reader and scan all entries and indexes into memory. */ - public boolean readNewEntry (String accessionId) throws FastaFileException { - return false; - } - - private void open() throws FastaFileException { + public void openNewFile(File fastaFile) throws FastaFileException { ensureFileReaderClosed(); // if already open, close first + this.file = Objects.requireNonNull(file, "file"); + this.entriesArchive.clear(); try { - reader = new SequentialFastaFileReader(file); + reader = new SequentialFastaFileReader(fastaFile); + var readEntries = reader.readAll(); + //TODO assign } catch (IOException ioe) { throw new FastaFileException("Failed to open FASTA reader: " + file.getAbsolutePath(), ioe); } } /** Close the reader. Safe to call multiple times. */ - private void close() throws FastaFileException { + public void close() throws FastaFileException { if (reader != null) { try { reader.close(); } catch (IOException ioe) { diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java new file mode 100644 index 00000000..c5ce5437 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java @@ -0,0 +1,6 @@ +package uk.ac.ebi.embl.gff3tools.fasta; + +public enum SequenceRangeOption { + WHOLE_SEQUENCE, + WITHOUT_N_BASES +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index a734ea33..0cd9f9e0 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -41,9 +41,20 @@ public SequentialFastaFileReader(File file, JsonHeaderParser parser, SequenceAlp @Override public void close() throws IOException { channel.close(); } public boolean readingFile() { return channel.isOpen(); } + public List readAll() throws FastaFileException, IOException { + long position = 0; + List entries = new ArrayList<>(); + while (true){ + var entry = readNext(position); + if (entry.isEmpty()) break; + entries.add(entry.get()); + position = channel.position(); + } + return entries; + } /** Reads the next FASTA entry starting at or after 'from'. */ - public Optional readNext(long from) throws FastaFileException { + private Optional readNext(long from) throws FastaFileException { try { OptionalLong headerPosOpt = seekToNextHeader(from); if (headerPosOpt.isEmpty()) return Optional.empty(); From 090e1beabaccc02de66bc3cdf258b52b96c7393f Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 3 Dec 2025 16:40:29 +0000 Subject: [PATCH 11/31] wip --- .../gff3tools/fasta/FastaFileService.java | 38 +++++++++++++------ .../fasta/SequentialFastaFileReader.java | 7 ++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index 0586ed89..bed5d98d 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -3,6 +3,8 @@ import lombok.Getter; import lombok.Setter; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; import java.io.File; import java.io.IOException; @@ -17,22 +19,18 @@ @Setter public final class FastaFileService{ + public List fastaEntries; + private HashMap sequenceIndexes; private File file; private SequentialFastaFileReader reader; // owned here - public List entriesArchive; - private List entriesInternal; public FastaFileService(){ - entriesArchive = new ArrayList<>(); + fastaEntries = new ArrayList<>(); this.file = null; } // ---------------------------- queries ---------------------------- - public List getAllReadFastaEntries() { - return new ArrayList<>(); - } - public Optional getFasta(String submissionId) throws FastaFileException { return Optional.empty(); } @@ -42,10 +40,25 @@ public Optional getFasta(String submissionId) throws FastaFileExcept * Uses the cached index to translate bases -> bytes, then asks the reader to stream * ASCII bytes while skipping '\n' and '\r' on the fly. */ - public Optional getSequenceRange(SequenceRangeOption option, String accessionId, long fromBase, long toBase) throws FastaFileException { + public String getSequenceRange(SequenceRangeOption option, String submissionId, long fromBase, long toBase) throws FastaFileException { ensureFileReaderOpen(); - //TODO - return Optional.empty(); + var index = sequenceIndexes.get(submissionId); + if (index == null) { throw new FastaFileException("No sequence index found for submissionId " + submissionId); } + + ByteSpan span; + switch (option) { + case WHOLE_SEQUENCE: + span = index.byteSpanForBaseRangeIncludingEdgeNBases(fromBase, toBase); + break; + case WITHOUT_N_BASES: + span = index.byteSpanForBaseRange(fromBase, toBase); + break; + default: + throw new IllegalStateException("Unknown option " + option); + } + + var result = reader.getSequenceSlice(span); + return result; } // ---------------------------- interactions with the reader ---------------------------- @@ -53,7 +66,8 @@ public Optional getSequenceRange(SequenceRangeOption option, String acce public void openNewFile(File fastaFile) throws FastaFileException { ensureFileReaderClosed(); // if already open, close first this.file = Objects.requireNonNull(file, "file"); - this.entriesArchive.clear(); + this.fastaEntries.clear(); + this.sequenceIndexes.clear(); try { reader = new SequentialFastaFileReader(fastaFile); var readEntries = reader.readAll(); @@ -79,7 +93,7 @@ private void ensureFileReaderClosed() throws FastaFileException { } private void ensureFileReaderOpen() { - if (reader == null) throw new IllegalStateException("Service is not open. Call open() first."); + if (reader == null || !reader.readingFile()) throw new IllegalStateException("Service is not open. Call open() first."); } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 0cd9f9e0..7815936f 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -3,6 +3,7 @@ import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.headerutils.JsonHeaderParser; import uk.ac.ebi.embl.gff3tools.fasta.headerutils.ParsedHeader; +import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndexBuilder; @@ -41,6 +42,11 @@ public SequentialFastaFileReader(File file, JsonHeaderParser parser, SequenceAlp @Override public void close() throws IOException { channel.close(); } public boolean readingFile() { return channel.isOpen(); } + public String getSequenceSlice(ByteSpan span) { + //TODO + return ""; + } + public List readAll() throws FastaFileException, IOException { long position = 0; List entries = new ArrayList<>(); @@ -201,4 +207,5 @@ private static void appendAscii(StringBuilder sb, ByteBuffer buf, int len) { private long safePos() { try { return channel.position(); } catch (IOException e) { return -1; } } + } From 7347a87d96b4e9795d28faf8933a62f0bb1c2505 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 4 Dec 2025 14:31:34 +0000 Subject: [PATCH 12/31] wip --- .../gff3tools/fasta/FastaFileService.java | 20 +++++++++++++++++-- .../fasta/SequentialFastaFileReader.java | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index bed5d98d..0216b437 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -31,8 +31,14 @@ public FastaFileService(){ // ---------------------------- queries ---------------------------- + public Optional setAccessionId(String submissionId, String accessionId) throws FastaFileException { + Optional target = fastaEntries.stream().filter(entry -> entry.getSubmissionId().equals(submissionId)).findFirst(); + target.ifPresent(entry -> entry.setAccessionId(accessionId)); + return target; + } + public Optional getFasta(String submissionId) throws FastaFileException { - return Optional.empty(); + return fastaEntries.stream().filter(entry -> entry.getSubmissionId().equals(submissionId)).findFirst(); } /** @@ -71,7 +77,17 @@ public void openNewFile(File fastaFile) throws FastaFileException { try { reader = new SequentialFastaFileReader(fastaFile); var readEntries = reader.readAll(); - //TODO assign + for (var entry : readEntries) { + FastaEntry fastaEntry = new FastaEntry(); + fastaEntry.setSubmissionId(entry.getSubmissionId()); + fastaEntry.setHeader(entry.getHeader()); + fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); + fastaEntry.setStartCountNs(entry.sequenceIndex.startNBasesCount); + fastaEntry.setEndCountNs(entry.sequenceIndex.endNBasesCount); + fastaEntries.add(fastaEntry); + + sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex); + } } catch (IOException ioe) { throw new FastaFileException("Failed to open FASTA reader: " + file.getAbsolutePath(), ioe); } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 7815936f..467ad2d2 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -43,7 +43,7 @@ public SequentialFastaFileReader(File file, JsonHeaderParser parser, SequenceAlp public boolean readingFile() { return channel.isOpen(); } public String getSequenceSlice(ByteSpan span) { - //TODO + return ""; } From e8d71d5d72d6d254bc42847475f1497c78f1fe1e Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Fri, 5 Dec 2025 13:43:03 +0000 Subject: [PATCH 13/31] basic-test-success --- .../exception/FastaFileException.java | 26 ++++- .../ebi/embl/gff3tools/fasta/FastaEntry.java | 22 ++-- .../gff3tools/fasta/FastaEntryInternal.java | 14 ++- .../gff3tools/fasta/FastaFileService.java | 100 +++++++++++++----- .../gff3tools/fasta/SequenceRangeOption.java | 10 ++ .../fasta/SequentialFastaFileReader.java | 94 +++++++++++++--- .../ac/ebi/embl/gff3tools/fasta/Topology.java | 10 ++ .../fasta/headerutils/FastaHeader.java | 25 +++-- .../fasta/headerutils/JsonHeaderParser.java | 49 ++++++--- .../fasta/headerutils/ParsedHeader.java | 12 ++- .../fasta/sequenceutils/ByteSpan.java | 27 ++++- .../fasta/sequenceutils/LineEntry.java | 25 ++++- .../fasta/sequenceutils/SequenceAlphabet.java | 22 ++-- .../fasta/sequenceutils/SequenceIndex.java | 46 ++++---- .../sequenceutils/SequenceIndexBuilder.java | 66 +++++++----- .../FastaFileServiceIntegrationTest.java | 68 ++++++++++++ .../gff3tools/fasta/FastaTestResources.java | 59 +++++++++++ .../headerutils/JsonHeaderParserTest.java | 34 ++++-- .../SequenceIndexBuilderTest.java | 64 ++++++----- .../sequenceutils/SequenceIndexTest.java | 49 +++++---- src/test/resources/fasta/example2.txt | 9 ++ 21 files changed, 637 insertions(+), 194 deletions(-) create mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java create mode 100644 src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java create mode 100644 src/test/resources/fasta/example2.txt diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java index 9b51cfce..08c86cc7 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/exception/FastaFileException.java @@ -1,10 +1,28 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.exception; public class FastaFileException extends Exception { public FastaFileException() {} - public FastaFileException(String message) { super(message); } - public FastaFileException(Throwable cause) { super(cause); } - public FastaFileException(String message, Throwable cause) { super(message, cause); } -} + public FastaFileException(String message) { + super(message); + } + + public FastaFileException(Throwable cause) { + super(cause); + } + + public FastaFileException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index 07644fb7..235ac21a 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -1,3 +1,13 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta; import lombok.Getter; @@ -7,10 +17,10 @@ @Getter @Setter public class FastaEntry { - String submissionId; - String accessionId; - FastaHeader header; //json info - long totalBases; - long startCountNs; - long endCountNs; + public String submissionId; + public String accessionId; + public FastaHeader header; // json info + public long totalBases; + public long leadingNsCount; + public long trailingNsCount; } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java index a75c94e3..1c96ef48 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java @@ -1,3 +1,13 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta; import lombok.Getter; @@ -10,8 +20,8 @@ class FastaEntryInternal { String submissionId; String accessionId; - FastaHeader header; //json info - //information needed for accessing the file + FastaHeader header; // json info + // information needed for accessing the file long fastaStartByte; // position of '>' in the file SequenceIndex sequenceIndex; // a smart index for querying ranges in the file } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index 0216b437..a081d97f 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -1,15 +1,25 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.*; import lombok.Getter; import lombok.Setter; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndex; -import java.io.File; -import java.io.IOException; -import java.util.*; - /** * Owns a SequentialFastaEntryReader, keeps all entries + indexes in memory, supports ID renames, * and serves base-range slices by mapping (N..M bases) -> byte span via the cached SequenceIndex, @@ -17,39 +27,79 @@ */ @Getter @Setter -public final class FastaFileService{ +public final class FastaFileService { - public List fastaEntries; - private HashMap sequenceIndexes; + public List fastaEntries = new ArrayList<>(); + + private HashMap sequenceIndexes = new HashMap<>(); private File file; - private SequentialFastaFileReader reader; // owned here + private SequentialFastaFileReader reader; // owned here - public FastaFileService(){ - fastaEntries = new ArrayList<>(); + public FastaFileService() { this.file = null; } // ---------------------------- queries ---------------------------- public Optional setAccessionId(String submissionId, String accessionId) throws FastaFileException { - Optional target = fastaEntries.stream().filter(entry -> entry.getSubmissionId().equals(submissionId)).findFirst(); + Optional target = fastaEntries.stream() + .filter(entry -> entry.getSubmissionId().equals(submissionId)) + .findFirst(); target.ifPresent(entry -> entry.setAccessionId(accessionId)); return target; } - public Optional getFasta(String submissionId) throws FastaFileException { - return fastaEntries.stream().filter(entry -> entry.getSubmissionId().equals(submissionId)).findFirst(); + public Optional getFastaWithSubmissionId(String submissionId) throws FastaFileException { + return fastaEntries.stream() + .filter(entry -> entry.getSubmissionId().equals(submissionId)) + .findFirst(); } + /** Return a sequence slice as a String (no EOLs) for [fromBase..toBase] inclusive. */ + public String getSequenceRangeAsString(SequenceRangeOption option, + String submissionId, + long fromBase, long toBase) throws FastaFileException { + ensureFileReaderOpen(); + SequenceIndex index = sequenceIndexes.get(submissionId); + if (index == null) { + throw new FastaFileException("No sequence index found for submissionId " + submissionId); + } + + final ByteSpan span; + switch (option) { + case WHOLE_SEQUENCE: + span = index.byteSpanForBaseRangeIncludingEdgeNBases(fromBase, toBase); + break; + case WITHOUT_N_BASES: + span = index.byteSpanForBaseRange(fromBase, toBase); + break; + default: + throw new IllegalStateException("Unknown option " + option); + } + + try { + return reader.getSequenceSliceString(span); + } catch (IOException ioe) { + throw new FastaFileException( + "I/O while reading slice for " + submissionId + " bytes " + span.start + ".." + (span.endEx - 1), + ioe + ); + } + } + + /** - * Return a sequence slice for [fromBase..toBase] (1-based, inclusive) for the given ID. + * Return a sequence slice for reader [fromBase..toBase] (1-based, inclusive) for the given ID. * Uses the cached index to translate bases -> bytes, then asks the reader to stream * ASCII bytes while skipping '\n' and '\r' on the fly. */ - public String getSequenceRange(SequenceRangeOption option, String submissionId, long fromBase, long toBase) throws FastaFileException { + public InputStream streamSequenceRange(SequenceRangeOption option, String submissionId, long fromBase, long toBase) + throws FastaFileException { ensureFileReaderOpen(); var index = sequenceIndexes.get(submissionId); - if (index == null) { throw new FastaFileException("No sequence index found for submissionId " + submissionId); } + if (index == null) { + throw new FastaFileException("No sequence index found for submissionId " + submissionId); + } ByteSpan span; switch (option) { @@ -63,15 +113,14 @@ public String getSequenceRange(SequenceRangeOption option, String submissionId, throw new IllegalStateException("Unknown option " + option); } - var result = reader.getSequenceSlice(span); - return result; + return reader.getSequenceSlice(span); } // ---------------------------- interactions with the reader ---------------------------- public void openNewFile(File fastaFile) throws FastaFileException { ensureFileReaderClosed(); // if already open, close first - this.file = Objects.requireNonNull(file, "file"); + this.file = Objects.requireNonNull(fastaFile, "file"); this.fastaEntries.clear(); this.sequenceIndexes.clear(); try { @@ -82,8 +131,8 @@ public void openNewFile(File fastaFile) throws FastaFileException { fastaEntry.setSubmissionId(entry.getSubmissionId()); fastaEntry.setHeader(entry.getHeader()); fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); - fastaEntry.setStartCountNs(entry.sequenceIndex.startNBasesCount); - fastaEntry.setEndCountNs(entry.sequenceIndex.endNBasesCount); + fastaEntry.setLeadingNsCount(entry.sequenceIndex.startNBasesCount); + fastaEntry.setTrailingNsCount(entry.sequenceIndex.endNBasesCount); fastaEntries.add(fastaEntry); sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex); @@ -96,8 +145,9 @@ public void openNewFile(File fastaFile) throws FastaFileException { /** Close the reader. Safe to call multiple times. */ public void close() throws FastaFileException { if (reader != null) { - try { reader.close(); } - catch (IOException ioe) { + try { + reader.close(); + } catch (IOException ioe) { throw new FastaFileException("Failed to close FASTA reader: " + file.getAbsolutePath(), ioe); } reader = null; @@ -109,7 +159,7 @@ private void ensureFileReaderClosed() throws FastaFileException { } private void ensureFileReaderOpen() { - if (reader == null || !reader.readingFile()) throw new IllegalStateException("Service is not open. Call open() first."); + if (reader == null || !reader.readingFile()) + throw new IllegalStateException("Service is not open. Call open() first."); } } - diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java index c5ce5437..cba4a165 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequenceRangeOption.java @@ -1,3 +1,13 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta; public enum SequenceRangeOption { diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 467ad2d2..54ba4882 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -1,5 +1,19 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta; +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.*; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.headerutils.JsonHeaderParser; import uk.ac.ebi.embl.gff3tools.fasta.headerutils.ParsedHeader; @@ -7,11 +21,6 @@ import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceAlphabet; import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.SequenceIndexBuilder; -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.util.*; - public class SequentialFastaFileReader implements AutoCloseable { private static final int BUFFER_SIZE = 64 * 1024; @@ -39,18 +48,70 @@ public SequentialFastaFileReader(File file, JsonHeaderParser parser, SequenceAlp this.fileSize = channel.size(); } - @Override public void close() throws IOException { channel.close(); } - public boolean readingFile() { return channel.isOpen(); } + @Override + public void close() throws IOException { + channel.close(); + } + + public boolean readingFile() { + return channel.isOpen(); + } + + public String getSequenceSliceString(ByteSpan span) throws IOException { + return readAsciiWithoutNewlines(span.start, span.endEx); + } + + public InputStream getSequenceSlice(ByteSpan span) { + return new InputStream() { + private long position = span.start; + private final long end = span.endEx; + private final ByteBuffer buffer = ByteBuffer.allocate(8192); // Adjust as needed + + @Override + public int read() throws IOException { + while (true) { + if (!buffer.hasRemaining()) { + if (position >= end) return -1; + + buffer.clear(); + int toRead = (int) Math.min(buffer.capacity(), end - position); + int read = channel.read(buffer, position); + if (read == -1) return -1; + + position += read; + buffer.flip(); + } + + // Peek the next byte + if (buffer.hasRemaining()) { + byte b = buffer.get(); + if (b == '\n') continue; // Filter out newline + return b & 0xFF; + } + } + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int totalRead = 0; - public String getSequenceSlice(ByteSpan span) { + while (totalRead < len) { + int next = read(); + if (next == -1) break; - return ""; + b[off + totalRead] = (byte) next; + totalRead++; + } + + return (totalRead == 0) ? -1 : totalRead; + } + }; } public List readAll() throws FastaFileException, IOException { long position = 0; List entries = new ArrayList<>(); - while (true){ + while (true) { var entry = readNext(position); if (entry.isEmpty()) break; entries.add(entry.get()); @@ -114,11 +175,11 @@ public String readAsciiWithoutNewlines(long byteStart, long byteEndExclusive) th buf.flip(); while (buf.hasRemaining()) { byte b = buf.get(); - if (b == LF || b == CR) continue; // omit line breaks on the fly - sb.append((char)(b & 0xFF)); // ASCII + if (b == LF || b == CR) continue; // omit line breaks on the fly + sb.append((char) (b & 0xFF)); // ASCII } remain -= n; - off += n; + off += n; } return sb.toString(); } @@ -205,7 +266,10 @@ private static void appendAscii(StringBuilder sb, ByteBuffer buf, int len) { } private long safePos() { - try { return channel.position(); } catch (IOException e) { return -1; } + try { + return channel.position(); + } catch (IOException e) { + return -1; + } } - } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java index 24901b37..148f4ca0 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/Topology.java @@ -1,3 +1,13 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta; public enum Topology { diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java index d390e3da..e4edc521 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java @@ -1,18 +1,27 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; +import java.util.Optional; import lombok.Getter; import lombok.Setter; import uk.ac.ebi.embl.gff3tools.fasta.Topology; -import java.util.Optional; - @Getter @Setter public class FastaHeader { - String description; // mandatory (can be empty if you insist) - String moleculeType; // mandatory (can be null if empty allowed) - Topology topology; // mandatory (can be null if empty allowed) - Optional chromosomeType; // optional (doesnt have to be a json) - Optional chromosomeLocation; // optional - Optional chromosomeName; // optional + String description; // mandatory (can be empty if you insist) + String moleculeType; // mandatory (can be null if empty allowed) + Topology topology; // mandatory (can be null if empty allowed) + Optional chromosomeType; // optional (doesnt have to be a json) + Optional chromosomeLocation; // optional + Optional chromosomeName; // optional } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index b8663d71..bc917877 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -1,11 +1,20 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import uk.ac.ebi.embl.gff3tools.fasta.Topology; - import java.io.IOException; import java.util.*; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; public class JsonHeaderParser { private static final ObjectMapper MAPPER = new ObjectMapper(); @@ -31,16 +40,19 @@ private static void fillFromJson(String raw, FastaHeader h) throws IOException { if (raw == null || raw.isEmpty()) return; // Normalize curly quotes / NBSPs but keep the final JSON we actually tried to parse - String normalized = raw.replace('\u201C','"').replace('\u201D','"') - .replace('\u2018','\'').replace('\u2019','\'') - .replace('\u00A0',' ').trim(); + String normalized = raw.replace('\u201C', '"') + .replace('\u201D', '"') + .replace('\u2018', '\'') + .replace('\u2019', '\'') + .replace('\u00A0', ' ') + .trim(); try { JsonNode node = MAPPER.readTree(normalized); - Map m = new HashMap<>(); + Map m = new HashMap<>(); node.fields().forEachRemaining(e -> { - String k = e.getKey()==null?"":e.getKey(); - k = k.trim().toLowerCase(Locale.ROOT).replaceAll("[\\s_-]+",""); - String v = e.getValue().isNull()?null:e.getValue().asText(); + String k = e.getKey() == null ? "" : e.getKey(); + k = k.trim().toLowerCase(Locale.ROOT).replaceAll("[\\s_-]+", ""); + String v = e.getValue().isNull() ? null : e.getValue().asText(); m.put(k, v); }); h.setDescription(m.get("description")); @@ -58,14 +70,19 @@ private static void fillFromJson(String raw, FastaHeader h) throws IOException { } } - private static String emptyToNull(String s){ return (s==null||s.isEmpty())?null:s; } + private static String emptyToNull(String s) { + return (s == null || s.isEmpty()) ? null : s; + } - private static Topology parseTopology(String s){ - if (s==null) return null; - switch (s.trim().toUpperCase(Locale.ROOT)){ - case "LINEAR": return Topology.LINEAR; - case "CIRCULAR": return Topology.CIRCULAR; - default: return null; + private static Topology parseTopology(String s) { + if (s == null) return null; + switch (s.trim().toUpperCase(Locale.ROOT)) { + case "LINEAR": + return Topology.LINEAR; + case "CIRCULAR": + return Topology.CIRCULAR; + default: + return null; } } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java index d45a9462..b5b33a8a 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/ParsedHeader.java @@ -1,3 +1,13 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import lombok.Value; @@ -6,4 +16,4 @@ public class ParsedHeader { String id; FastaHeader header; -} \ No newline at end of file +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java index 6146620b..6d159370 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/ByteSpan.java @@ -1,8 +1,25 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; public final class ByteSpan { - public final long start; // inclusive - public final long endEx; // exclusive - public ByteSpan(long start, long endEx) { this.start = start; this.endEx = endEx; } - public long length() { return endEx - start; } -} \ No newline at end of file + public final long start; // inclusive + public final long endEx; // exclusive + + public ByteSpan(long start, long endEx) { + this.start = start; + this.endEx = endEx; + } + + public long length() { + return endEx - start; + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java index c1b994b3..32f1cd02 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java @@ -1,9 +1,19 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; public final class LineEntry { - public long baseStart; // 1-based, inclusive (after edits) - public long baseEnd; // 1-based, inclusive - public long byteStart; // absolute byte offset of first base in this line + public long baseStart; // 1-based, inclusive (after edits) + public long baseEnd; // 1-based, inclusive + public long byteStart; // absolute byte offset of first base in this line public long byteEndExclusive; // absolute byte offset one past last base public LineEntry(long baseStart, long baseEnd, long byteStart, long byteEndExclusive) { @@ -13,6 +23,11 @@ public LineEntry(long baseStart, long baseEnd, long byteStart, long byteEndExclu this.byteEndExclusive = byteEndExclusive; } - public long lengthBases() { return baseEnd - baseStart + 1; } - public long lengthBytes() { return byteEndExclusive - byteStart; } // ASCII: same as bases + public long lengthBases() { + return baseEnd - baseStart + 1; + } + + public long lengthBytes() { + return byteEndExclusive - byteStart; + } // ASCII: same as bases } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java index 0ec7667a..e66ace23 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java @@ -1,16 +1,27 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; public final class SequenceAlphabet { private final boolean[] allowed = new boolean[128]; + public SequenceAlphabet(String chars) { - for (char c: chars.toCharArray()) if (c<128) allowed[c]=true; - allowed['>']=false; + for (char c : chars.toCharArray()) if (c < 128) allowed[c] = true; + allowed['>'] = false; } /** Fast ASCII check for is it an allowed char. */ - public boolean isAllowed(byte b){ - int i=b&0xFF; - return i<128 && allowed[i]; + public boolean isAllowed(byte b) { + int i = b & 0xFF; + return i < 128 && allowed[i]; } /** Fast ASCII check for 'N' or 'n' without decoding. */ @@ -18,7 +29,6 @@ public boolean isNBase(byte b) { return ((b | 0x20) == 'n'); } - public static SequenceAlphabet defaultNucleotideAlphabet() { return new SequenceAlphabet("ACGTURYSWKMBDHVNacgturyswkmbdhvn-.*"); } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java index ed3acfa7..2bdc9577 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndex.java @@ -1,43 +1,52 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.ByteSpan; -import uk.ac.ebi.embl.gff3tools.fasta.sequenceutils.LineEntry; - import java.util.ArrayList; import java.util.Collections; import java.util.List; public final class SequenceIndex { - public long firstBaseByte; // -1 if empty + public long firstBaseByte; // -1 if empty public long startNBasesCount; - public long lastBaseByte; // -1 if empty + public long lastBaseByte; // -1 if empty public long endNBasesCount; private final List lines; - public SequenceIndex(long firstBaseByte, long startNBasesCount, - long lastBaseByte, long endNBasesCount, List lines) { + public SequenceIndex( + long firstBaseByte, long startNBasesCount, long lastBaseByte, long endNBasesCount, List lines) { this.firstBaseByte = firstBaseByte; this.startNBasesCount = startNBasesCount; - this.lastBaseByte = lastBaseByte; + this.lastBaseByte = lastBaseByte; this.endNBasesCount = endNBasesCount; this.lines = new ArrayList<>(lines); } - public List linesView() { return Collections.unmodifiableList(lines); } + public List linesView() { + return Collections.unmodifiableList(lines); + } - public long totalBasesIncludingEdgeNBases() { + public long totalBases() { if (lines.isEmpty()) return 0; return lines.get(lines.size() - 1).baseEnd; } - public long totalBases() { - long bases = totalBasesIncludingEdgeNBases() - endNBasesCount - startNBasesCount; + public long totalBasesExcludingEdgeNBases() { + long bases = totalBases() - endNBasesCount - startNBasesCount; return Math.max(0, bases); } public ByteSpan byteSpanForBaseRangeIncludingEdgeNBases(long fromBase, long toBase) { - long total = totalBasesIncludingEdgeNBases(); + long total = totalBases(); if (fromBase < 1 || toBase < fromBase || toBase > total) { throw new IllegalArgumentException("bad base range: " + fromBase + ".." + toBase); } @@ -45,10 +54,10 @@ public ByteSpan byteSpanForBaseRangeIncludingEdgeNBases(long fromBase, long toBa int j = findLineByBase(toBase); LineEntry from = lines.get(i); - long offStart = fromBase - from.baseStart; + long offStart = fromBase - from.baseStart; - LineEntry to = lines.get(j); - long offEndIncl = toBase - to.baseStart; + LineEntry to = lines.get(j); + long offEndIncl = toBase - to.baseStart; long byteStart = from.byteStart + offStart; long byteEndEx = to.byteStart + offEndIncl + 1; // half-open @@ -56,14 +65,13 @@ public ByteSpan byteSpanForBaseRangeIncludingEdgeNBases(long fromBase, long toBa return new ByteSpan(byteStart, byteEndEx); } - public ByteSpan byteSpanForBaseRange(long fromBase, long toBase) { - long trimmedTotal = totalBases(); + long trimmedTotal = totalBasesExcludingEdgeNBases(); if (fromBase < 1 || toBase < fromBase || toBase > trimmedTotal) { throw new IllegalArgumentException("bad base range: " + fromBase + ".." + toBase); } long actualFromBase = startNBasesCount + fromBase; - long actualToBase = startNBasesCount + toBase; + long actualToBase = startNBasesCount + toBase; return byteSpanForBaseRangeIncludingEdgeNBases(actualFromBase, actualToBase); } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java index 00c80240..60aa5585 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -1,3 +1,13 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; import java.io.IOException; @@ -8,8 +18,8 @@ public final class SequenceIndexBuilder { - private static final int SCAN_BUF_SIZE = 64 * 1024; - private static final int COUNT_BUF_SIZE = 8 * 1024; + private static final int SCAN_BUF_SIZE = 64 * 1024; + private static final int COUNT_BUF_SIZE = 8 * 1024; private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; @@ -17,8 +27,10 @@ public final class SequenceIndexBuilder { public static final class Result { public final SequenceIndex index; public final long nextHeaderByte; // byte offset of next '>' at line start, or fileSize (EOF) + public Result(SequenceIndex index, long nextHeaderByte) { - this.index = index; this.nextHeaderByte = nextHeaderByte; + this.index = index; + this.nextHeaderByte = nextHeaderByte; } } @@ -50,12 +62,12 @@ public Result buildFrom(long startPos) throws IOException { List filtered = filterLinesWithinWindow(s.lines, s.firstBaseByte, s.nextHdr); long firstBaseByte = filtered.isEmpty() ? -1 : filtered.get(0).byteStart; - long lastBaseByte = filtered.isEmpty() ? -1 : (filtered.get(filtered.size()-1).byteEndExclusive - 1); + long lastBaseByte = filtered.isEmpty() ? -1 : (filtered.get(filtered.size() - 1).byteEndExclusive - 1); long startN = 0, endN = 0; if (!filtered.isEmpty()) { - startN = countLeadingNs(filtered.get(0)); // (3) only first line - endN = countTrailingNs(filtered.get(filtered.size()-1)); // (4) only last line + startN = countLeadingNs(filtered.get(0)); // (3) only first line + endN = countTrailingNs(filtered.get(filtered.size() - 1)); // (4) only last line } SequenceIndex idx = new SequenceIndex(firstBaseByte, startN, lastBaseByte, endN, filtered); @@ -67,23 +79,31 @@ public Result buildFrom(long startPos) throws IOException { // ===================================================================== private static final class ScanState { - long pos; // absolute scan position - long firstBaseByte = -1; // first allowed base byte seen - long lastBaseByte = -1; // last allowed base byte seen - long nextHdr; // byte of next header (or fileSize) + long pos; // absolute scan position + long firstBaseByte = -1; // first allowed base byte seen + long lastBaseByte = -1; // last allowed base byte seen + long nextHdr; // byte of next header (or fileSize) - long lineFirstByte = -1; // first allowed base byte in current line - long lineLastByte = -1; // last allowed base byte in current line + long lineFirstByte = -1; // first allowed base byte in current line + long lineLastByte = -1; // last allowed base byte in current line long basesSoFar = 0; long basesInLine = 0; final ArrayList lines = new ArrayList<>(256); - ScanState(long startPos, long fileSize) { this.pos = startPos; this.nextHdr = fileSize; } + + ScanState(long startPos, long fileSize) { + this.pos = startPos; + this.nextHdr = fileSize; + } } - private boolean hasMore(long p){ return p < fileSize; } + private boolean hasMore(long p) { + return p < fileSize; + } - private ByteBuffer newScanBuffer() { return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); } + private ByteBuffer newScanBuffer() { + return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); + } private int fillBuffer(ByteBuffer buf, long at) throws IOException { buf.clear(); @@ -101,12 +121,12 @@ private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException { long abs = s.pos + idx; if (isHeaderStart(b, abs)) { - s.nextHdr = abs; // stop window at header byte - commitOpenLineIfAny(s); // finalize any in-flight line + s.nextHdr = abs; // stop window at header byte + commitOpenLineIfAny(s); // finalize any in-flight line return true; } - if (b == LF) { // end of a displayed sequence line - commitOpenLineIfAny(s); // (2) only lines with bases are committed + if (b == LF) { // end of a displayed sequence line + commitOpenLineIfAny(s); // (2) only lines with bases are committed continue; } if (alphabet.isAllowed(b)) { @@ -147,7 +167,7 @@ private void observeBase(long abs, ScanState s) { private void commitOpenLineIfAny(ScanState s) { if (s.basesInLine <= 0) return; // (2) skip empty lines long baseStart = s.basesSoFar + 1; - long baseEnd = s.basesSoFar + s.basesInLine; + long baseEnd = s.basesSoFar + s.basesInLine; long byteStart = s.lineFirstByte; long byteEndEx = s.lineLastByte + 1; // half-open @@ -156,7 +176,7 @@ private void commitOpenLineIfAny(ScanState s) { s.basesSoFar += s.basesInLine; s.basesInLine = 0; s.lineFirstByte = -1; - s.lineLastByte = -1; + s.lineLastByte = -1; } // ===================================================================== @@ -164,9 +184,7 @@ private void commitOpenLineIfAny(ScanState s) { // ===================================================================== /** (1)+(2) Keep only lines fully inside [firstBaseByte, nextHdr) and already non-empty. */ - private List filterLinesWithinWindow(List raw, - long firstBaseByte, - long nextHdr) { + private List filterLinesWithinWindow(List raw, long firstBaseByte, long nextHdr) { if (firstBaseByte < 0 || raw.isEmpty()) return List.of(); ArrayList out = new ArrayList<>(raw.size()); for (LineEntry L : raw) { diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java new file mode 100644 index 00000000..81fb4b79 --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -0,0 +1,68 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.File; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import org.junit.jupiter.api.Test; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; + +class FastaFileServiceIntegrationTest { + + + @Test + void basicFastaEntryManipulation_test() throws FastaFileException { + File fasta = FastaTestResources.file("fasta", "example2.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = + Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); + assertTrue(ids.contains("ID1")); + assertTrue(ids.contains("ID2")); + + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + Optional entry2 = service.getFastaWithSubmissionId("ID2"); + Optional imaginaryEntry = service.getFastaWithSubmissionId("ID3"); + assertTrue(entry1.isPresent(), "index for ID1 must exist"); + assertTrue(entry2.isPresent(), "index for ID2 must exist"); + assertTrue(imaginaryEntry.isEmpty(), "index for ID3 must not exist"); + + service.setAccessionId("ID1", "asc1"); + service.setAccessionId("ID2", "asc2"); + assertEquals(entry1.get().accessionId, "asc1"); + assertEquals(entry2.get().accessionId, "asc2"); + + // From the sample file above: + assertEquals(2, entry1.get().leadingNsCount, "ID1 leading Ns"); + assertEquals(2, entry1.get().trailingNsCount, "ID1 trailing Ns"); + assertEquals(0, entry2.get().leadingNsCount, "ID2 leading Ns"); + assertEquals(0,entry2.get().trailingNsCount, "ID2 trailing Ns"); + + String sequence1 = service.getSequenceRangeAsString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); + assertEquals("NNACACGTTTNn", sequence1); + String sequence2 = service.getSequenceRangeAsString(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases); + assertEquals("ACGTGGGG", sequence2); + + long adjustedTotalBases = entry1.get().totalBases - entry1.get().leadingNsCount - entry1.get().trailingNsCount; + String sequence1withoutNbases = service.getSequenceRangeAsString(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, adjustedTotalBases); + assertEquals("ACACGTTT", sequence1withoutNbases); + + service.close(); + } +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java new file mode 100644 index 00000000..e3f09cfe --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java @@ -0,0 +1,59 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.fasta; + +import java.io.*; +import java.net.URL; +import java.nio.file.*; +import java.util.Objects; + +public final class FastaTestResources { + private FastaTestResources() {} + + /** Returns a Path to a resource like ("fasta", "example2.txt"). */ + public static Path path(String dir, String fileName) { + Objects.requireNonNull(dir, "dir"); + Objects.requireNonNull(fileName, "fileName"); + String resource = dir.endsWith("/") ? dir + fileName : dir + "/" + fileName; + + ClassLoader cl = Thread.currentThread().getContextClassLoader(); + URL url = Objects.requireNonNull(cl.getResource(resource), "Missing resource on classpath: " + resource); + + try { + if ("file".equals(url.getProtocol())) { + // Gradle tests: build/resources/test/... + return Paths.get(url.toURI()); + } + // Fallback for jar: URLs — copy to temp so callers can have a real Path/File + try (InputStream in = cl.getResourceAsStream(resource)) { + Objects.requireNonNull(in, "Resource stream is null: " + resource); + Path tmp = Files.createTempFile("testres-", "-" + fileName); + tmp.toFile().deleteOnExit(); + Files.copy(in, tmp, StandardCopyOption.REPLACE_EXISTING); + return tmp; + } + } catch (Exception e) { + throw new IllegalStateException("Failed to resolve resource: " + resource, e); + } + } + + /** Convenience if you need a File. */ + public static File file(String dir, String fileName) { + return path(dir, fileName).toFile(); + } + + /** Stream, if you don’t need a File/Path. */ + public static InputStream stream(String dir, String fileName) { + String resource = dir.endsWith("/") ? dir + fileName : dir + "/" + fileName; + InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resource); + return Objects.requireNonNull(in, "Missing resource stream: " + resource); + } +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java index d7743f8f..9eaab11f 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java @@ -1,13 +1,22 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import uk.ac.ebi.embl.gff3tools.fasta.Topology; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.util.Optional; - -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import uk.ac.ebi.embl.gff3tools.fasta.Topology; public class JsonHeaderParserTest { @@ -15,7 +24,8 @@ public class JsonHeaderParserTest { @Test void parsesStandardHeaderWithJson() { - String line = ">AF123456.1 | { \"description\":\"Pinus sativa\", \"molecule_type\":\"genomic\", \"topology\":\"circular\" }"; + String line = + ">AF123456.1 | { \"description\":\"Pinus sativa\", \"molecule_type\":\"genomic\", \"topology\":\"circular\" }"; try { ParsedHeader ph = parser.parse(line); @@ -45,7 +55,8 @@ void picksFirstTokenAsIdEvenWithExtraStuff() { @Test void parsesCurlyQuotesAndWeirdSpacingInKeys() { - String line = ">ID1 | { \u201Cdescription\u201D: \u201CPinus\u201D, \u201C molecule_type\u201D: \"genomic\" , \u201Ctopology\u201D: \"CIRCULAR\" }"; + String line = + ">ID1 | { \u201Cdescription\u201D: \u201CPinus\u201D, \u201C molecule_type\u201D: \"genomic\" , \u201Ctopology\u201D: \"CIRCULAR\" }"; try { ParsedHeader ph = parser.parse(line); FastaHeader h = ph.getHeader(); @@ -59,7 +70,8 @@ void parsesCurlyQuotesAndWeirdSpacingInKeys() { @Test void normalizesKeyVariantsAndChromosomeOptionals() { - String line = ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; + String line = + ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; try { ParsedHeader ph = parser.parse(line); FastaHeader h = ph.getHeader(); @@ -77,7 +89,8 @@ void normalizesKeyVariantsAndChromosomeOptionals() { @Test void handlesNbspInJson() { String nbsp = "\u00A0"; - String line = (">ID3 | {"+nbsp+"\"description\""+nbsp+":" + nbsp + "\"Alpha"+nbsp+"Beta\"" + nbsp + ",\"topology\":\"linear\"}"); + String line = (">ID3 | {" + nbsp + "\"description\"" + nbsp + ":" + nbsp + "\"Alpha" + nbsp + "Beta\"" + nbsp + + ",\"topology\":\"linear\"}"); try { ParsedHeader ph = parser.parse(line); FastaHeader h = ph.getHeader(); @@ -147,7 +160,8 @@ void malformedJsonThrowsAndIncludesJsonInMessage() { String msg = e.getMessage(); assertNotNull(msg); assertTrue(msg.contains("OOPS"), "Message should include offending JSON token"); - assertTrue(msg.contains("{\"description\": \"x\"") || msg.contains("{\"description\":\"x\""), + assertTrue( + msg.contains("{\"description\": \"x\"") || msg.contains("{\"description\":\"x\""), "Message should include JSON snippet"); } } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java index a5e352d7..cb57b200 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java @@ -1,7 +1,16 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.channels.FileChannel; @@ -10,8 +19,8 @@ import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.List; - -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class SequenceIndexBuilderTest { @@ -39,11 +48,11 @@ void buildsIndex_ignoresEmptyLines_countsEdgeNs_only_withinWindow() throws Excep // \t\n // \n // >NEXT\n - String header = ">ID1 | {\"d\":\"x\"}\n"; - String l1 = "NNAC\n"; // leading N=2 - String l2 = "acgt\n"; - String l3 = "ttnN\n"; // trailing N=2 - String empties = "\n\t\n\n"; + String header = ">ID1 | {\"d\":\"x\"}\n"; + String l1 = "NNAC\n"; // leading N=2 + String l2 = "acgt\n"; + String l3 = "ttnN\n"; // trailing N=2 + String empties = "\n\t\n\n"; String nextHead = ">NEXT\n"; String fasta = header + l1 + l2 + l3 + empties + nextHead; @@ -70,17 +79,17 @@ void buildsIndex_ignoresEmptyLines_countsEdgeNs_only_withinWindow() throws Excep assertEquals(3, lines.size(), "only non-empty sequence lines must be indexed"); // Base numbering should be contiguous across lines (4 bases per line) - assertEquals(1, lines.get(0).baseStart); - assertEquals(4, lines.get(0).baseEnd); - assertEquals(5, lines.get(1).baseStart); - assertEquals(8, lines.get(1).baseEnd); - assertEquals(9, lines.get(2).baseStart); + assertEquals(1, lines.get(0).baseStart); + assertEquals(4, lines.get(0).baseEnd); + assertEquals(5, lines.get(1).baseStart); + assertEquals(8, lines.get(1).baseEnd); + assertEquals(9, lines.get(2).baseStart); assertEquals(12, lines.get(2).baseEnd); // Byte math: each line has 4 letters; byteEndExclusive = lastBaseByte + 1 - long l1Start = seqStartPos; // begins right after header line + long l1Start = seqStartPos; // begins right after header line long l1EndEx = l1Start + 4; - long l2Start = l1EndEx + 1; // + LF between lines + long l2Start = l1EndEx + 1; // + LF between lines long l2EndEx = l2Start + 4; long l3Start = l2EndEx + 1; long l3EndEx = l3Start + 4; @@ -100,7 +109,7 @@ void buildsIndex_ignoresEmptyLines_countsEdgeNs_only_withinWindow() throws Excep // Edge N counting: only first and last lines are inspected assertEquals(2, idx.startNBasesCount, "leading Ns only from first sequence line"); - assertEquals(2, idx.endNBasesCount, "trailing Ns only from last sequence line"); + assertEquals(2, idx.endNBasesCount, "trailing Ns only from last sequence line"); // nextHeaderByte should point to '>' of NEXT header long expectedNextHeader = header.length() + l1.length() + l2.length() + l3.length() + empties.length(); @@ -113,7 +122,7 @@ void supportsCRLF_beforeNextHeader_and_stillWindowsCorrectly() throws Exception // Mix CRLF lines in the sequence part; builder uses LF as terminator and ignores CR as non-base. String header = ">ID2\n"; // simulate CRLF lines by inserting '\r' before '\n' - String l1 = "NNxx".replace('x','A') + "\r\n"; // "NNAA\r\n" + String l1 = "NNxx".replace('x', 'A') + "\r\n"; // "NNAA\r\n" String l2 = "gggg\r\n"; String next = ">H2\n"; @@ -122,7 +131,8 @@ void supportsCRLF_beforeNextHeader_and_stillWindowsCorrectly() throws Exception try (FileChannel ch = openRead(p)) { long seqStart = header.getBytes(StandardCharsets.US_ASCII).length; - SequenceIndexBuilder sib = new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet()); + SequenceIndexBuilder sib = + new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet()); SequenceIndexBuilder.Result res = sib.buildFrom(seqStart); SequenceIndex idx = res.index; @@ -144,10 +154,10 @@ void supportsCRLF_beforeNextHeader_and_stillWindowsCorrectly() throws Exception @Test void ignoresWhitespaceOnlyLines_and_middleLineNs_doNotAffectEdgeCounts() throws Exception { String header = ">ID3\n"; - String l1 = "NACG\n"; // leading N = 1 - String l2 = "NNNN\n"; // middle line of Ns — must NOT affect start/end N counts + String l1 = "NACG\n"; // leading N = 1 + String l2 = "NNNN\n"; // middle line of Ns — must NOT affect start/end N counts String blanks = " \n\t\n"; - String l3 = "GGGn\n"; // trailing n = 1 + String l3 = "GGGn\n"; // trailing n = 1 String next = ">K\n"; String fasta = header + l1 + l2 + blanks + l3 + next; @@ -155,7 +165,8 @@ void ignoresWhitespaceOnlyLines_and_middleLineNs_doNotAffectEdgeCounts() throws try (FileChannel ch = openRead(p)) { long seqStart = header.getBytes(StandardCharsets.US_ASCII).length; - SequenceIndexBuilder sib = new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet()); + SequenceIndexBuilder sib = + new SequenceIndexBuilder(ch, ch.size(), SequenceAlphabet.defaultNucleotideAlphabet()); long before = ch.position(); SequenceIndexBuilder.Result res = sib.buildFrom(seqStart); @@ -168,14 +179,13 @@ void ignoresWhitespaceOnlyLines_and_middleLineNs_doNotAffectEdgeCounts() throws // Edge N counts: only first and last lines considered assertEquals(1, idx.startNBasesCount, "only first line leading Ns"); - assertEquals(1, idx.endNBasesCount, "only last line trailing Ns"); + assertEquals(1, idx.endNBasesCount, "only last line trailing Ns"); // Middle line of Ns shouldn't change edge counts - assertEquals( idx.linesView().get(1).lengthBases(), 4 ); + assertEquals(idx.linesView().get(1).lengthBases(), 4); // Total base numbering should be contiguous: 4 + 4 + 4 = 12 - assertEquals(12, idx.totalBasesIncludingEdgeNBases()); + assertEquals(12, idx.totalBases()); } } } - diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java index d8b808b3..acbd846d 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java @@ -1,10 +1,19 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; -import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; import java.util.List; - -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.Test; public class SequenceIndexTest { @@ -21,26 +30,22 @@ public class SequenceIndexTest { * - total bases including edge Ns = 12 */ private SequenceIndex buildIndex(long startN, long endN) { - List lines = List.of( - new LineEntry(1, 4, 100, 104), - new LineEntry(5, 8, 105, 109), - new LineEntry(9, 12, 110, 114) - ); + List lines = + List.of(new LineEntry(1, 4, 100, 104), new LineEntry(5, 8, 105, 109), new LineEntry(9, 12, 110, 114)); return new SequenceIndex( - /*firstBaseByte*/100, - /*startNBasesCount*/startN, - /*lastBaseByte*/113, - /*endNBasesCount*/endN, - lines - ); + /*firstBaseByte*/ 100, + /*startNBasesCount*/ startN, + /*lastBaseByte*/ 113, + /*endNBasesCount*/ endN, + lines); } @Test void totals_including_and_trimmed() { - SequenceIndex idx = buildIndex(/*startN*/2, /*endN*/3); + SequenceIndex idx = buildIndex(/*startN*/ 2, /*endN*/ 3); - assertEquals(12, idx.totalBasesIncludingEdgeNBases(), "totalBasesIncludingEdgeNBases"); - assertEquals(7, idx.totalBases(), "trimmed totalBases"); + assertEquals(12, idx.totalBases(), "totalBasesIncludingEdgeNBases"); + assertEquals(7, idx.totalBasesExcludingEdgeNBases(), "trimmed totalBases"); } @Test @@ -71,7 +76,8 @@ void byteSpan_including_edges_crosses_newline() { @Test void including_edges_validates_total() { SequenceIndex idx = buildIndex(0, 0); - assertThrows(IllegalArgumentException.class, + assertThrows( + IllegalArgumentException.class, () -> idx.byteSpanForBaseRangeIncludingEdgeNBases(1, 13), "toBase beyond total (including Ns) should throw"); } @@ -79,7 +85,7 @@ void including_edges_validates_total() { @Test void trimmed_byteSpan_maps_through_startN() { SequenceIndex idx = buildIndex(2, 3); - assertEquals(7, idx.totalBases()); + assertEquals(7, idx.totalBasesExcludingEdgeNBases()); ByteSpan s = idx.byteSpanForBaseRange(1, 3); // Ignore first 2 Ns, ignore last 3 Ns @@ -102,14 +108,15 @@ void trimmed_span_crosses_multiple_lines() { @Test void trimmed_validates_range_against_trimmed_total() { SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7 - assertThrows(IllegalArgumentException.class, + assertThrows( + IllegalArgumentException.class, () -> idx.byteSpanForBaseRange(1, 8), "toBase beyond trimmed total should throw"); } @Test void zero_edgeNs_behavior_matches_including_method() { - SequenceIndex idx = buildIndex(0, 0); //no additional N bases + SequenceIndex idx = buildIndex(0, 0); // no additional N bases ByteSpan a = idx.byteSpanForBaseRange(2, 5); ByteSpan b = idx.byteSpanForBaseRangeIncludingEdgeNBases(2, 5); diff --git a/src/test/resources/fasta/example2.txt b/src/test/resources/fasta/example2.txt new file mode 100644 index 00000000..ac275453 --- /dev/null +++ b/src/test/resources/fasta/example2.txt @@ -0,0 +1,9 @@ +>ID1 | {"description":"first"} +NNAC +ACGT + +TTNn + +>ID2 | {"description":"second"} +ACGT +GGGG From 831df4fdda30b56608416b6f443b4789fe1308a7 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Fri, 5 Dec 2025 14:25:41 +0000 Subject: [PATCH 14/31] stricter-header-parsing-done --- .../gff3tools/fasta/FastaFileService.java | 40 +-- .../fasta/headerutils/JsonHeaderParser.java | 87 ++++-- .../fasta/sequenceutils/SequenceAlphabet.java | 28 ++ .../sequenceutils/SequenceIndexBuilder.java | 27 +- .../FastaFileServiceIntegrationTest.java | 27 +- .../headerutils/JsonHeaderParserTest.java | 282 ++++++++---------- src/test/resources/fasta/example2.txt | 4 +- src/test/resources/fasta/malformedFasta.txt | 8 + .../resources/fasta/malformedJsonFasta.txt | 8 + 9 files changed, 299 insertions(+), 212 deletions(-) create mode 100644 src/test/resources/fasta/malformedFasta.txt create mode 100644 src/test/resources/fasta/malformedJsonFasta.txt diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index a081d97f..2c86ef51 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -118,43 +118,35 @@ public InputStream streamSequenceRange(SequenceRangeOption option, String submis // ---------------------------- interactions with the reader ---------------------------- - public void openNewFile(File fastaFile) throws FastaFileException { + public void openNewFile(File fastaFile) throws FastaFileException, IOException { ensureFileReaderClosed(); // if already open, close first this.file = Objects.requireNonNull(fastaFile, "file"); this.fastaEntries.clear(); this.sequenceIndexes.clear(); - try { - reader = new SequentialFastaFileReader(fastaFile); - var readEntries = reader.readAll(); - for (var entry : readEntries) { - FastaEntry fastaEntry = new FastaEntry(); - fastaEntry.setSubmissionId(entry.getSubmissionId()); - fastaEntry.setHeader(entry.getHeader()); - fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); - fastaEntry.setLeadingNsCount(entry.sequenceIndex.startNBasesCount); - fastaEntry.setTrailingNsCount(entry.sequenceIndex.endNBasesCount); - fastaEntries.add(fastaEntry); - - sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex); - } - } catch (IOException ioe) { - throw new FastaFileException("Failed to open FASTA reader: " + file.getAbsolutePath(), ioe); + reader = new SequentialFastaFileReader(fastaFile); + var readEntries = reader.readAll(); + for (var entry : readEntries) { + FastaEntry fastaEntry = new FastaEntry(); + fastaEntry.setSubmissionId(entry.getSubmissionId()); + fastaEntry.setHeader(entry.getHeader()); + fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); + fastaEntry.setLeadingNsCount(entry.sequenceIndex.startNBasesCount); + fastaEntry.setTrailingNsCount(entry.sequenceIndex.endNBasesCount); + fastaEntries.add(fastaEntry); + + sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex); } } /** Close the reader. Safe to call multiple times. */ - public void close() throws FastaFileException { + public void close() throws IOException { if (reader != null) { - try { - reader.close(); - } catch (IOException ioe) { - throw new FastaFileException("Failed to close FASTA reader: " + file.getAbsolutePath(), ioe); - } + reader.close(); reader = null; } } - private void ensureFileReaderClosed() throws FastaFileException { + private void ensureFileReaderClosed() throws IOException { if (reader != null) close(); } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index bc917877..b9b14fbb 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -14,12 +14,14 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.*; + +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.Topology; public class JsonHeaderParser { private static final ObjectMapper MAPPER = new ObjectMapper(); - public ParsedHeader parse(String headerLine) throws IOException { + public ParsedHeader parse(String headerLine) throws FastaFileException { String rest = headerLine.substring(1); // headerLine starts with '>' int pipe = rest.indexOf('|'); String idPart = (pipe >= 0 ? rest.substring(0, pipe) : rest).trim(); @@ -36,37 +38,76 @@ public ParsedHeader parse(String headerLine) throws IOException { return new ParsedHeader(id, h); } - private static void fillFromJson(String raw, FastaHeader h) throws IOException { - if (raw == null || raw.isEmpty()) return; + private static void fillFromJson(String raw, FastaHeader h) throws FastaFileException { + if (raw == null || raw.isEmpty()) { + throw new FastaFileException( + "FASTA header contains a '|', but no JSON object was provided. " + + "Expected something like: >id { \"description\": \"...\", \"moleculeType\": \"DNA\", ... }" + ); + } - // Normalize curly quotes / NBSPs but keep the final JSON we actually tried to parse + // Normalize curly quotes / NBSPs String normalized = raw.replace('\u201C', '"') .replace('\u201D', '"') .replace('\u2018', '\'') .replace('\u2019', '\'') .replace('\u00A0', ' ') .trim(); + + JsonNode node; try { - JsonNode node = MAPPER.readTree(normalized); - Map m = new HashMap<>(); - node.fields().forEachRemaining(e -> { - String k = e.getKey() == null ? "" : e.getKey(); - k = k.trim().toLowerCase(Locale.ROOT).replaceAll("[\\s_-]+", ""); - String v = e.getValue().isNull() ? null : e.getValue().asText(); - m.put(k, v); - }); - h.setDescription(m.get("description")); - h.setMoleculeType(m.get("moleculetype")); - h.setTopology(parseTopology(m.get("topology"))); - if (m.containsKey("chromosometype")) - h.setChromosomeType(Optional.ofNullable(emptyToNull(m.get("chromosometype")))); - if (m.containsKey("chromosomelocation")) - h.setChromosomeLocation(Optional.ofNullable(emptyToNull(m.get("chromosomelocation")))); - if (m.containsKey("chromosomename")) - h.setChromosomeName(Optional.ofNullable(emptyToNull(m.get("chromosomename")))); + node = MAPPER.readTree(normalized); + if (node == null || !node.isObject()) { + throw new FastaFileException( + "FASTA header JSON did not parse into an object. " + + "Received: " + normalized + ); + } } catch (IOException e) { - // explode, and include the JSON we tried to parse - throw new IOException("Malformed FASTA header JSON: " + normalized, e); + throw new FastaFileException( + "Malformed FASTA header JSON. Failed to parse: " + normalized, e + ); + } + + // Extract fields + Map m = new HashMap<>(); + node.fields().forEachRemaining(e -> { + String key = (e.getKey() == null ? "" : e.getKey()) + .trim().toLowerCase(Locale.ROOT) + .replaceAll("[\\s_-]+", ""); + String val = e.getValue().isNull() ? null : e.getValue().asText(); + m.put(key, val); + }); + + // Assign values + h.setDescription(m.get("description")); + h.setMoleculeType(m.get("moleculetype")); + h.setTopology(parseTopology(m.get("topology"))); + + if (m.containsKey("chromosometype")) + h.setChromosomeType(Optional.ofNullable(emptyToNull(m.get("chromosometype")))); + if (m.containsKey("chromosomelocation")) + h.setChromosomeLocation(Optional.ofNullable(emptyToNull(m.get("chromosomelocation")))); + if (m.containsKey("chromosomename")) + h.setChromosomeName(Optional.ofNullable(emptyToNull(m.get("chromosomename")))); + + // 🔍 Validate required fields + List missing = new ArrayList<>(); + + if (h.description == null) + missing.add("description"); + + if (h.moleculeType == null) + missing.add("moleculeType"); + + if (h.topology == null) + missing.add("topology (must be 'LINEAR' or 'CIRCULAR')"); + + if (!missing.isEmpty()) { + throw new FastaFileException( + "FASTA header JSON is missing required fields: " + missing + + ". Parsed JSON was: " + normalized + ); } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java index e66ace23..f9f8c203 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java @@ -32,4 +32,32 @@ public boolean isNBase(byte b) { public static SequenceAlphabet defaultNucleotideAlphabet() { return new SequenceAlphabet("ACGTURYSWKMBDHVNacgturyswkmbdhvn-.*"); } + + public String describeAllowed() { + StringBuilder sb = new StringBuilder(); + sb.append("["); + + boolean first = true; + for (int i = 0; i < allowed.length; i++) { + if (allowed[i]) { + char c = (char) i; + + // Render unprintables safely + String display; + if (c >= 32 && c < 127) { + display = Character.toString(c); + } else { + display = String.format("\\x%02X", i); // e.g. non-printable → \x1B + } + + if (!first) sb.append(", "); + sb.append(display); + first = false; + } + } + + sb.append("]"); + return sb.toString(); + } + } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java index 60aa5585..ade0670c 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -10,6 +10,8 @@ */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; + import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; @@ -45,12 +47,12 @@ public SequenceIndexBuilder(FileChannel ch, long fileSize, SequenceAlphabet alph } /** Build a SequenceIndex starting at 'startPos' (first byte after header line). */ - public Result buildFrom(long startPos) throws IOException { + public Result buildFrom(long startPos) throws IOException, FastaFileException { ScanState s = new ScanState(startPos, fileSize); ByteBuffer buf = newScanBuffer(); // ------------- scan raw bytes into provisional "sequence lines" ------------- - while (hasMore(s.pos)) { + while (s.pos filtered = filterLinesWithinWindow(s.lines, s.firstBaseByte, s.nextHdr); long firstBaseByte = filtered.isEmpty() ? -1 : filtered.get(0).byteStart; @@ -97,10 +99,6 @@ private static final class ScanState { } } - private boolean hasMore(long p) { - return p < fileSize; - } - private ByteBuffer newScanBuffer() { return ByteBuffer.allocateDirect(SCAN_BUF_SIZE); } @@ -113,7 +111,7 @@ private int fillBuffer(ByteBuffer buf, long at) throws IOException { } /** Returns true if we hit the next header and should stop scanning this entry. */ - private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException { + private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException, FastaFileException { buf.flip(); while (buf.hasRemaining()) { int idx = buf.position(); @@ -125,14 +123,21 @@ private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException { commitOpenLineIfAny(s); // finalize any in-flight line return true; } - if (b == LF) { // end of a displayed sequence line + else if (b == LF) { // end of a displayed sequence line commitOpenLineIfAny(s); // (2) only lines with bases are committed continue; } - if (alphabet.isAllowed(b)) { + else if (alphabet.isAllowed(b)) { observeBase(abs, s); } - // else: ignore non-allowed, non-newline junk on the line + else{ + throw new FastaFileException(String.format( + "Illegal character '%s' (byte value: %d) at absolute file position %d. " + + "This character is not allowed by the current FASTA alphabet. " + + "Expected only characters: %s", + (char) (b & 0xFF), b & 0xFF, abs, alphabet.describeAllowed() + )); + } } return false; } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index 81fb4b79..ab87a03f 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -13,6 +13,7 @@ import static org.junit.jupiter.api.Assertions.*; import java.io.File; +import java.io.IOException; import java.util.List; import java.util.Optional; import java.util.Set; @@ -21,9 +22,33 @@ class FastaFileServiceIntegrationTest { + @Test + void readsMalformedFastaJson_Failure() throws IOException { + File fasta = FastaTestResources.file("fasta", "malformedJsonFasta.txt"); + FastaFileService service = new FastaFileService(); + + assertThrows(FastaFileException.class, () -> { + service.openNewFile(fasta); + }); + + service.close(); + } + + + @Test + void readsMalformedFastaSequence_Failure() throws IOException { + File fasta = FastaTestResources.file("fasta", "malformedFasta.txt"); + FastaFileService service = new FastaFileService(); + + assertThrows(FastaFileException.class, () -> { + service.openNewFile(fasta); + }); + + service.close(); + } @Test - void basicFastaEntryManipulation_test() throws FastaFileException { + void basicFastaEntryManipulation_succeeds() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "example2.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java index 9eaab11f..f29aa610 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java @@ -1,212 +1,192 @@ /* - * Copyright 2025 EMBL - European Bioinformatics Institute - * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this - * file except in compliance with the License. You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software distributed under the - * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. + * Copyright 2025 EMBL... */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import static org.junit.jupiter.api.Assertions.*; -import java.io.IOException; -import java.util.Optional; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.Topology; +import java.util.Optional; + public class JsonHeaderParserTest { private final JsonHeaderParser parser = new JsonHeaderParser(); + // --------------------------------------------------------- + // VALID CASES + // --------------------------------------------------------- + @Test void parsesStandardHeaderWithJson() { String line = ">AF123456.1 | { \"description\":\"Pinus sativa\", \"molecule_type\":\"genomic\", \"topology\":\"circular\" }"; - try { - ParsedHeader ph = parser.parse(line); - - assertEquals("AF123456.1", ph.getId()); - FastaHeader h = ph.getHeader(); - assertEquals("Pinus sativa", h.getDescription()); - assertEquals("genomic", h.getMoleculeType()); - Assertions.assertEquals(Topology.CIRCULAR, h.getTopology()); - assertTrue(h.getChromosomeType().isEmpty()); - assertTrue(h.getChromosomeLocation().isEmpty()); - assertTrue(h.getChromosomeName().isEmpty()); - } catch (IOException e) { - fail("Should not throw for well-formed JSON: " + e.getMessage()); - } + + ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); + assertEquals("AF123456.1", ph.getId()); + + FastaHeader h = ph.getHeader(); + assertEquals("Pinus sativa", h.getDescription()); + assertEquals("genomic", h.getMoleculeType()); + assertEquals(Topology.CIRCULAR, h.getTopology()); + assertTrue(h.getChromosomeType().isEmpty()); + assertTrue(h.getChromosomeLocation().isEmpty()); + assertTrue(h.getChromosomeName().isEmpty()); } @Test void picksFirstTokenAsIdEvenWithExtraStuff() { - String line = ">AF123456.1 extra tokens here | {\"description\":\"x\"}"; - try { - ParsedHeader ph = parser.parse(line); - assertEquals("AF123456.1", ph.getId()); - } catch (IOException e) { - fail("Should not throw: " + e.getMessage()); - } + String line = + ">AF123456.1 extra tokens here | " + + " {\"description\":\"x\", \"molecule_type\":\"dna\", \"topology\":\"linear\"}"; + + ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); + assertEquals("AF123456.1", ph.getId()); } @Test void parsesCurlyQuotesAndWeirdSpacingInKeys() { String line = - ">ID1 | { \u201Cdescription\u201D: \u201CPinus\u201D, \u201C molecule_type\u201D: \"genomic\" , \u201Ctopology\u201D: \"CIRCULAR\" }"; - try { - ParsedHeader ph = parser.parse(line); - FastaHeader h = ph.getHeader(); - assertEquals("Pinus", h.getDescription()); - assertEquals("genomic", h.getMoleculeType()); - assertEquals(Topology.CIRCULAR, h.getTopology()); - } catch (IOException e) { - fail("Should not throw with normalized curly quotes: " + e.getMessage()); - } + ">ID1 | { \u201Cdescription\u201D: \u201CPinus\u201D, \u201C molecule_type\u201D:\"genomic\", \u201Ctopology\u201D:\"CIRCULAR\" }"; + + ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); + FastaHeader h = ph.getHeader(); + + assertEquals("Pinus", h.getDescription()); + assertEquals("genomic", h.getMoleculeType()); + assertEquals(Topology.CIRCULAR, h.getTopology()); } @Test void normalizesKeyVariantsAndChromosomeOptionals() { String line = - ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; - try { - ParsedHeader ph = parser.parse(line); - FastaHeader h = ph.getHeader(); - - assertEquals("Desc", h.getDescription()); - assertEquals("rna", h.getMoleculeType()); - assertEquals(Optional.of("plasmid"), h.getChromosomeType()); - assertEquals(Optional.of("chr12:100-200"), h.getChromosomeLocation()); - assertEquals(Optional.of("pX"), h.getChromosomeName()); - } catch (IOException e) { - fail("Should not throw: " + e.getMessage()); - } + ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"topology\":\"linear\", " + + "\"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; + + ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); + FastaHeader h = ph.getHeader(); + + assertEquals("Desc", h.getDescription()); + assertEquals("rna", h.getMoleculeType()); + assertEquals(Topology.LINEAR, h.getTopology()); + assertEquals(Optional.of("plasmid"), h.getChromosomeType()); + assertEquals(Optional.of("chr12:100-200"), h.getChromosomeLocation()); + assertEquals(Optional.of("pX"), h.getChromosomeName()); } @Test void handlesNbspInJson() { String nbsp = "\u00A0"; - String line = (">ID3 | {" + nbsp + "\"description\"" + nbsp + ":" + nbsp + "\"Alpha" + nbsp + "Beta\"" + nbsp - + ",\"topology\":\"linear\"}"); - try { - ParsedHeader ph = parser.parse(line); - FastaHeader h = ph.getHeader(); - - assertEquals("Alpha Beta", h.getDescription()); // NBSP normalized to space - assertEquals(Topology.LINEAR, h.getTopology()); - } catch (IOException e) { - fail("Should not throw with NBSP: " + e.getMessage()); - } - } + String line = + ">ID3 | {" + + nbsp + "\"description\"" + nbsp + ":" + nbsp + "\"Alpha" + nbsp + "Beta\"" + "," + + "\"molecule_type\":\"rna\", \"topology\":\"linear\"}"; - @Test - void unknownTopologyYieldsNull() { - String line = ">ID4 | {\"topology\":\"weird-shape\"}"; - try { - ParsedHeader ph = parser.parse(line); - assertNull(ph.getHeader().getTopology()); - } catch (IOException e) { - fail("Should not throw when topology is unknown: " + e.getMessage()); - } + ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); + FastaHeader h = ph.getHeader(); + + assertEquals("Alpha Beta", h.getDescription()); + assertEquals("rna", h.getMoleculeType()); + assertEquals(Topology.LINEAR, h.getTopology()); } @Test void missingJsonIsFine_NoPipe() { String line = ">AF999999.5 some label without json"; - try { - ParsedHeader ph = parser.parse(line); - - assertEquals("AF999999.5", ph.getId()); - FastaHeader h = ph.getHeader(); - assertNull(h.getDescription()); - assertNull(h.getMoleculeType()); - assertNull(h.getTopology()); - assertTrue(h.getChromosomeType().isEmpty()); - assertTrue(h.getChromosomeLocation().isEmpty()); - assertTrue(h.getChromosomeName().isEmpty()); - } catch (IOException e) { - fail("Should not throw without pipe/JSON: " + e.getMessage()); - } + + ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); + assertEquals("AF999999.5", ph.getId()); + + FastaHeader h = ph.getHeader(); + assertNull(h.getDescription()); + assertNull(h.getMoleculeType()); + assertNull(h.getTopology()); } @Test - void emptyJsonAfterPipeIsFine() { + void trimsIdAndHandlesJustChevron() { + ParsedHeader ph1 = assertDoesNotThrow(() -> + parser.parse("> AF111 | {\"description\":\"x\",\"molecule_type\":\"dna\",\"topology\":\"linear\"}")); + assertEquals("AF111", ph1.getId()); + + // No pipe: JSON not required + ParsedHeader ph2 = assertDoesNotThrow(() -> parser.parse(">")); + assertEquals("", ph2.getId()); + assertNull(ph2.getHeader().getDescription()); + } + + + // --------------------------------------------------------- + // INVALID CASES — MUST THROW FASTAFIleException + // --------------------------------------------------------- + + @Test + void noJsonAfterPipeThrows() { String line = ">ID5 | "; - try { - ParsedHeader ph = parser.parse(line); - FastaHeader h = ph.getHeader(); - - assertEquals("ID5", ph.getId()); - assertNull(h.getDescription()); - assertNull(h.getMoleculeType()); - assertNull(h.getTopology()); - } catch (IOException e) { - fail("Should not throw for empty JSON after pipe: " + e.getMessage()); - } + assertThrows(FastaFileException.class, () -> parser.parse(line)); + } + + @Test + void emptyJsonAfterPipeThrows() { + String line = ">ID5 | {} "; + assertThrows(FastaFileException.class, () -> parser.parse(line)); + } + + @Test + void jsonWithNullValuesThrows() { + String line = ">ID8 | {\"description\":null, \"molecule_type\":null, \"topology\":null}"; + assertThrows(FastaFileException.class, () -> parser.parse(line)); } + @Test + void missingRequiredFieldsThrows() { + String line = ">ID9 | {\"description\":\"x\"}"; + FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line)); + assertTrue(e.getMessage().contains("missing required")); + } + + @Test + void unknownTopologyThrows() { + String line = ">ID4 | {\"description\":\"x\", \"molecule_type\":\"dna\", \"topology\":\"banana\"}"; + FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line)); + assertTrue(e.getMessage().contains("topology")); + } + + + // --------------------------------------------------------- + // MALFORMED JSON + // --------------------------------------------------------- + @Test void malformedJsonThrowsAndIncludesJsonInMessage() { String badJson = "{\"description\": \"x\", \"molecule_type\": \"genomic\", OOPS }"; String line = ">ID6 | " + badJson; - try { - parser.parse(line); - fail("Expected IOException for malformed JSON"); - } catch (IOException e) { - // Should include a recognizable chunk of normalized JSON - String msg = e.getMessage(); - assertNotNull(msg); - assertTrue(msg.contains("OOPS"), "Message should include offending JSON token"); - assertTrue( - msg.contains("{\"description\": \"x\"") || msg.contains("{\"description\":\"x\""), - "Message should include JSON snippet"); - } - } - @Test - void malformedJsonWithTrailingCommaThrowsAndMentionsComma() { - String badJson = "{ \"description\":\"y\", \"molecule_type\":\"genomic\", }"; - String line = ">ID7 | " + badJson; - try { - parser.parse(line); - fail("Expected IOException for trailing comma"); - } catch (IOException e) { - String msg = e.getMessage(); - assertNotNull(msg); - // different Jackson versions phrase this differently; just assert we included the JSON - assertTrue(msg.contains("\"description\":\"y\"")); - assertTrue(msg.contains("\"molecule_type\":\"genomic\"")); - } + FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line)); + assertTrue(e.getMessage().contains("OOPS")); + assertTrue(e.getMessage().contains("{\"description")); } @Test - void jsonWithNullValuesParsesAndLeavesNulls() { - String line = ">ID8 | {\"description\":null, \"molecule_type\":null, \"topology\":null}"; - try { - ParsedHeader ph = parser.parse(line); - FastaHeader h = ph.getHeader(); - assertNull(h.getDescription()); - assertNull(h.getMoleculeType()); - assertNull(h.getTopology()); - } catch (IOException e) { - fail("Should not throw for explicit nulls: " + e.getMessage()); - } + void malformedJsonBracesThrowsAndIncludesJsonInMessage() { + String badJson = "{\"description\": \"x\", \"molecule_type\": \"genomic\", OOPS }"; + String line = ">ID6 | " + badJson; + + FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line)); + assertTrue(e.getMessage().contains("OOPS")); + assertTrue(e.getMessage().contains("{\"description")); } + @Test - void trimsIdAndHandlesJustChevron() { - try { - ParsedHeader ph1 = parser.parse("> AF111 | {\"description\":\"x\"}"); - assertEquals("AF111", ph1.getId()); - - ParsedHeader ph2 = parser.parse(">"); - assertEquals("", ph2.getId()); - assertNull(ph2.getHeader().getDescription()); - } catch (IOException e) { - fail("Should not throw here: " + e.getMessage()); - } + void malformedJsonWithTrailingCommaThrowsAndMentionsComma() { + String badJson = "{ \"description\":\"y\", \"molecule_type\":\"genomic\", }"; + String line = ">ID7 | " + badJson; + + FastaFileException e = assertThrows(FastaFileException.class, () -> parser.parse(line)); + assertTrue(e.getMessage().contains("\"description\":\"y\"")); + assertTrue(e.getMessage().contains("\"molecule_type\":\"genomic\"")); } } diff --git a/src/test/resources/fasta/example2.txt b/src/test/resources/fasta/example2.txt index ac275453..4ab3564b 100644 --- a/src/test/resources/fasta/example2.txt +++ b/src/test/resources/fasta/example2.txt @@ -1,9 +1,9 @@ ->ID1 | {"description":"first"} +>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"} NNAC ACGT TTNn ->ID2 | {"description":"second"} +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"circular"}v ACGT GGGG diff --git a/src/test/resources/fasta/malformedFasta.txt b/src/test/resources/fasta/malformedFasta.txt new file mode 100644 index 00000000..21e52862 --- /dev/null +++ b/src/test/resources/fasta/malformedFasta.txt @@ -0,0 +1,8 @@ +>ID1 | {"description":"something", "molecule_type":"dna", "topology":"linear"} +NNAC +ACGT;';'; +TTNn + +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"linear"} +ACGT +GGGG diff --git a/src/test/resources/fasta/malformedJsonFasta.txt b/src/test/resources/fasta/malformedJsonFasta.txt new file mode 100644 index 00000000..97d3fc85 --- /dev/null +++ b/src/test/resources/fasta/malformedJsonFasta.txt @@ -0,0 +1,8 @@ +>ID1 | {"desc;';ription":"first"} +NNAC +ACGT +TTNn + +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"linear"} +ACGT +GGGG From 543fee93d8966fe3060399ee144376c320d336c2 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Fri, 5 Dec 2025 15:27:17 +0000 Subject: [PATCH 15/31] capitalisation-fix --- .../fasta/FastaFileServiceIntegrationTest.java | 6 +++--- .../sequenceutils/SequenceIndexBuilderTest.java | 6 +++--- .../fasta/sequenceutils/SequenceIndexTest.java | 16 ++++++++-------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index ab87a03f..96c24a95 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -23,7 +23,7 @@ class FastaFileServiceIntegrationTest { @Test - void readsMalformedFastaJson_Failure() throws IOException { + void readingMalformedFastaJsonFailure() throws IOException { //more tests like this in the JsonHeaderParserTest File fasta = FastaTestResources.file("fasta", "malformedJsonFasta.txt"); FastaFileService service = new FastaFileService(); @@ -36,7 +36,7 @@ void readsMalformedFastaJson_Failure() throws IOException { @Test - void readsMalformedFastaSequence_Failure() throws IOException { + void readingMalformedFastaSequenceFailure() throws IOException { File fasta = FastaTestResources.file("fasta", "malformedFasta.txt"); FastaFileService service = new FastaFileService(); @@ -48,7 +48,7 @@ void readsMalformedFastaSequence_Failure() throws IOException { } @Test - void basicFastaEntryManipulation_succeeds() throws IOException, FastaFileException { + void basicFastaEntryManipulationSucceeds() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "example2.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java index cb57b200..87b4a8a5 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java @@ -38,7 +38,7 @@ private static Path writeAscii(Path dir, String filename, String content) throws } @Test - void buildsIndex_ignoresEmptyLines_countsEdgeNs_only_withinWindow() throws Exception { + void buildsIndexCorrectly() throws Exception { // Layout (US-ASCII): // >ID1 | {"d":"x"}\n // NNAC\n @@ -118,7 +118,7 @@ void buildsIndex_ignoresEmptyLines_countsEdgeNs_only_withinWindow() throws Excep } @Test - void supportsCRLF_beforeNextHeader_and_stillWindowsCorrectly() throws Exception { + void ignoresCRLFCorrectly() throws Exception { // Mix CRLF lines in the sequence part; builder uses LF as terminator and ignores CR as non-base. String header = ">ID2\n"; // simulate CRLF lines by inserting '\r' before '\n' @@ -152,7 +152,7 @@ void supportsCRLF_beforeNextHeader_and_stillWindowsCorrectly() throws Exception } @Test - void ignoresWhitespaceOnlyLines_and_middleLineNs_doNotAffectEdgeCounts() throws Exception { + void ignoresEmptyLinesCorrectly() throws Exception { String header = ">ID3\n"; String l1 = "NACG\n"; // leading N = 1 String l2 = "NNNN\n"; // middle line of Ns — must NOT affect start/end N counts diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java index acbd846d..1072b496 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexTest.java @@ -41,7 +41,7 @@ private SequenceIndex buildIndex(long startN, long endN) { } @Test - void totals_including_and_trimmed() { + void totalsIncludingAndTrimmed() { SequenceIndex idx = buildIndex(/*startN*/ 2, /*endN*/ 3); assertEquals(12, idx.totalBases(), "totalBasesIncludingEdgeNBases"); @@ -49,7 +49,7 @@ void totals_including_and_trimmed() { } @Test - void byteSpan_including_edges_same_line() { + void byteSpanIncludingEdgesSameLine() { SequenceIndex idx = buildIndex(0, 0); // [from..to] = [2..4] -> bytes [101..103], endExclusive = 104 @@ -61,7 +61,7 @@ void byteSpan_including_edges_same_line() { } @Test - void byteSpan_including_edges_crosses_newline() { + void byteSpanIncludingEdgesCrossesNewline() { SequenceIndex idx = buildIndex(0, 0); // [2..5] crosses the newline between line1 and line2 @@ -74,7 +74,7 @@ void byteSpan_including_edges_crosses_newline() { } @Test - void including_edges_validates_total() { + void includingEdgesValidatesTotal() { SequenceIndex idx = buildIndex(0, 0); assertThrows( IllegalArgumentException.class, @@ -83,7 +83,7 @@ void including_edges_validates_total() { } @Test - void trimmed_byteSpan_maps_through_startN() { + void trimmedByteSpanMapsThroughStartN() { SequenceIndex idx = buildIndex(2, 3); assertEquals(7, idx.totalBasesExcludingEdgeNBases()); @@ -95,7 +95,7 @@ void trimmed_byteSpan_maps_through_startN() { } @Test - void trimmed_span_crosses_multiple_lines() { + void trimmedSpanCrossesMultipleLines() { SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7 bases ByteSpan s = idx.byteSpanForBaseRange(4, 7); @@ -106,7 +106,7 @@ void trimmed_span_crosses_multiple_lines() { } @Test - void trimmed_validates_range_against_trimmed_total() { + void trimmedValidatesRangeAgainstTrimmedTotal() { SequenceIndex idx = buildIndex(2, 3); // trimmed total = 7 assertThrows( IllegalArgumentException.class, @@ -115,7 +115,7 @@ void trimmed_validates_range_against_trimmed_total() { } @Test - void zero_edgeNs_behavior_matches_including_method() { + void zeroEdgeNsBehaviorMatchesIncludingMethod() { SequenceIndex idx = buildIndex(0, 0); // no additional N bases ByteSpan a = idx.byteSpanForBaseRange(2, 5); From cdf967ade6be0f333fb4839bfec713ea01097311 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Tue, 9 Dec 2025 16:40:57 +0000 Subject: [PATCH 16/31] fixed-streaming-chars --- .../gff3tools/fasta/FastaFileService.java | 5 +- .../fasta/SequentialFastaFileReader.java | 106 ++++++++++++------ .../FastaFileServiceIntegrationTest.java | 38 +++++++ 3 files changed, 113 insertions(+), 36 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index 2c86ef51..da32a441 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -13,6 +13,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.Reader; import java.util.*; import lombok.Getter; import lombok.Setter; @@ -93,7 +94,7 @@ public String getSequenceRangeAsString(SequenceRangeOption option, * Uses the cached index to translate bases -> bytes, then asks the reader to stream * ASCII bytes while skipping '\n' and '\r' on the fly. */ - public InputStream streamSequenceRange(SequenceRangeOption option, String submissionId, long fromBase, long toBase) + public Reader streamSequenceRange(SequenceRangeOption option, String submissionId, long fromBase, long toBase) throws FastaFileException { ensureFileReaderOpen(); var index = sequenceIndexes.get(submissionId); @@ -113,7 +114,7 @@ public InputStream streamSequenceRange(SequenceRangeOption option, String submis throw new IllegalStateException("Unknown option " + option); } - return reader.getSequenceSlice(span); + return reader.getSequenceSliceReader(span); } // ---------------------------- interactions with the reader ---------------------------- diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 54ba4882..77910c06 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -24,6 +24,7 @@ public class SequentialFastaFileReader implements AutoCloseable { private static final int BUFFER_SIZE = 64 * 1024; + private static final int CHAR_BUF_SIZE = 8192; private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; private static final byte CR = (byte) '\r'; @@ -61,50 +62,87 @@ public String getSequenceSliceString(ByteSpan span) throws IOException { return readAsciiWithoutNewlines(span.start, span.endEx); } - public InputStream getSequenceSlice(ByteSpan span) { - return new InputStream() { - private long position = span.start; - private final long end = span.endEx; - private final ByteBuffer buffer = ByteBuffer.allocate(8192); // Adjust as needed + /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. + * Uses absolute reads; does NOT change channel.position(). */ + /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. + * Uses absolute reads; does NOT change channel.position(). */ + public java.io.Reader getSequenceSliceReader(ByteSpan span) { + final long start = span.start; + final long endEx = span.endEx; + + return new java.io.Reader() { + private long pos = start; + + // Allocate direct buffer for I/O… + private final java.nio.ByteBuffer buf = java.nio.ByteBuffer.allocateDirect(CHAR_BUF_SIZE); + { + // …but mark it EMPTY so the very first read() refills it from the channel. + // Without this, hasRemaining() is true and you'd read uninitialized bytes (→ '\0'). + buf.limit(0); + } @Override - public int read() throws IOException { - while (true) { - if (!buffer.hasRemaining()) { - if (position >= end) return -1; - - buffer.clear(); - int toRead = (int) Math.min(buffer.capacity(), end - position); - int read = channel.read(buffer, position); - if (read == -1) return -1; - - position += read; - buffer.flip(); - } + public int read(char[] characterBuffer, + int startingWriteIndexInCharacterBuffer, + int maximumNumberOfCharsToRead) throws java.io.IOException { + + // --- Validate caller’s target window [off .. off + len) + if (characterBuffer == null) throw new NullPointerException("characterBuffer"); + if (startingWriteIndexInCharacterBuffer < 0 || + maximumNumberOfCharsToRead < 0 || + startingWriteIndexInCharacterBuffer + maximumNumberOfCharsToRead > characterBuffer.length) { + throw new IndexOutOfBoundsException( + "off=" + startingWriteIndexInCharacterBuffer + + " len=" + maximumNumberOfCharsToRead + + " bufLen=" + characterBuffer.length); + } + if (maximumNumberOfCharsToRead == 0) return 0; - // Peek the next byte - if (buffer.hasRemaining()) { - byte b = buffer.get(); - if (b == '\n') continue; // Filter out newline - return b & 0xFF; - } + // IMPORTANT: if you didn’t already do this in a ctor/initializer block: + // Newly-allocated ByteBuffer has remaining() == true. Mark it empty so we refill first. + if (buf.limit() != 0 && !buf.hasRemaining() && pos == span.start) { + // no-op branch, kept to show intent; prefer the initializer approach below } - } - @Override - public int read(byte[] b, int off, int len) throws IOException { - int totalRead = 0; + int out = 0; - while (totalRead < len) { - int next = read(); - if (next == -1) break; + while (out < maximumNumberOfCharsToRead) { + // Refill byte buffer if empty + if (!buf.hasRemaining()) { + if (pos >= endEx) break; // EOF for this slice - b[off + totalRead] = (byte) next; - totalRead++; + buf.clear(); // position=0, limit=capacity + int toRead = (int) Math.min(buf.capacity(), endEx - pos); + buf.limit(toRead); // HARD cap at span end + int n = channel.read(buf, pos); // ABSOLUTE read; file cursor unchanged + if (n <= 0) break; // EOF / I/O issue + pos += n; + buf.flip(); // prepare for reading the bytes we just filled + } + + // Drain bytes -> chars into caller’s window [off .. off+len) + while (buf.hasRemaining() && out < maximumNumberOfCharsToRead) { + byte b = buf.get(); + if (b == LF || b == CR) continue; // skip EOL bytes + // ASCII decode: write into cbuf starting at 'off', advancing by 'out' + characterBuffer[startingWriteIndexInCharacterBuffer + out] = (char) (b & 0xFF); + out++; + } } - return (totalRead == 0) ? -1 : totalRead; + // If we produced nothing AND we’re at EOF, signal -1 per Reader contract + return (out == 0) ? -1 : out; + } + + @Override + public int read() throws java.io.IOException { + char[] one = new char[1]; + int n = read(one, 0, 1); + return (n == -1) ? -1 : one[0]; } + + @Override public boolean ready() { return buf.hasRemaining() || pos < endEx; } + @Override public void close() { /* no-op: we don’t own the channel */ } }; } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index 96c24a95..a3a7af6c 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -12,8 +12,10 @@ import static org.junit.jupiter.api.Assertions.*; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.util.List; import java.util.Optional; import java.util.Set; @@ -90,4 +92,40 @@ void basicFastaEntryManipulationSucceeds() throws IOException, FastaFileExceptio service.close(); } + + @Test + void gettingStringAsAStringVsStreamProducesSameResult() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example2.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = + Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); + assertTrue(ids.contains("ID1")); + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + + String sequence1 = service.getSequenceRangeAsString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); + assertEquals("NNACACGTTTNn", sequence1); + + String streamedSequence; + try (java.io.Reader r = service.streamSequenceRange(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + // -------------------- STRING BUILD SPOT -------------------- + // At this point cbuf[0..n) already contains *decoded* characters, + // with all EOLs removed by the Reader above. Appending grows the StringBuilder. + sb.append(cbuf, 0, n); // <<< chars -> StringBuilder (later -> String) + // ----------------------------------------------------------- + } + streamedSequence = sb.toString(); + } + assertEquals("NNACACGTTTNn", streamedSequence); + + service.close(); + } } From be678100f7d8a4e9024c431868f0f4a685e17817 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 10:24:21 +0000 Subject: [PATCH 17/31] improved-tests --- .../ebi/embl/gff3tools/fasta/FastaEntry.java | 1 + .../gff3tools/fasta/FastaFileService.java | 11 +- .../fasta/SequentialFastaFileReader.java | 67 +++++----- .../FastaFileServiceIntegrationTest.java | 114 +++++++++++++++--- 4 files changed, 134 insertions(+), 59 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java index 235ac21a..b7771922 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntry.java @@ -21,6 +21,7 @@ public class FastaEntry { public String accessionId; public FastaHeader header; // json info public long totalBases; + public long totalBasesWithoutNBases; public long leadingNsCount; public long trailingNsCount; } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index da32a441..e2267c99 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -12,7 +12,6 @@ import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.io.Reader; import java.util.*; import lombok.Getter; @@ -57,9 +56,9 @@ public Optional getFastaWithSubmissionId(String submissionId) throws } /** Return a sequence slice as a String (no EOLs) for [fromBase..toBase] inclusive. */ - public String getSequenceRangeAsString(SequenceRangeOption option, - String submissionId, - long fromBase, long toBase) throws FastaFileException { + public String getSequenceSliceString(SequenceRangeOption option, + String submissionId, + long fromBase, long toBase) throws FastaFileException { ensureFileReaderOpen(); SequenceIndex index = sequenceIndexes.get(submissionId); if (index == null) { @@ -94,7 +93,7 @@ public String getSequenceRangeAsString(SequenceRangeOption option, * Uses the cached index to translate bases -> bytes, then asks the reader to stream * ASCII bytes while skipping '\n' and '\r' on the fly. */ - public Reader streamSequenceRange(SequenceRangeOption option, String submissionId, long fromBase, long toBase) + public Reader getSequenceSliceReader(SequenceRangeOption option, String submissionId, long fromBase, long toBase) throws FastaFileException { ensureFileReaderOpen(); var index = sequenceIndexes.get(submissionId); @@ -133,6 +132,8 @@ public void openNewFile(File fastaFile) throws FastaFileException, IOException { fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); fastaEntry.setLeadingNsCount(entry.sequenceIndex.startNBasesCount); fastaEntry.setTrailingNsCount(entry.sequenceIndex.endNBasesCount); + long adjustedBases = entry.sequenceIndex.totalBases()- entry.sequenceIndex.startNBasesCount- entry.sequenceIndex.endNBasesCount; + fastaEntry.setTotalBasesWithoutNBases(adjustedBases); fastaEntries.add(fastaEntry); sequenceIndexes.put(entry.getSubmissionId(), entry.sequenceIndex); diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 77910c06..20939282 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -62,8 +62,6 @@ public String getSequenceSliceString(ByteSpan span) throws IOException { return readAsciiWithoutNewlines(span.start, span.endEx); } - /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. - * Uses absolute reads; does NOT change channel.position(). */ /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. * Uses absolute reads; does NOT change channel.position(). */ public java.io.Reader getSequenceSliceReader(ByteSpan span) { @@ -73,11 +71,10 @@ public java.io.Reader getSequenceSliceReader(ByteSpan span) { return new java.io.Reader() { private long pos = start; - // Allocate direct buffer for I/O… private final java.nio.ByteBuffer buf = java.nio.ByteBuffer.allocateDirect(CHAR_BUF_SIZE); { - // …but mark it EMPTY so the very first read() refills it from the channel. - // Without this, hasRemaining() is true and you'd read uninitialized bytes (→ '\0'). + // allocate buffer and mark it EMPTY so the very first read() refills it from the channel. + // Without this, hasRemaining() is true and we'll read uninitialized bytes (→ '\0'). buf.limit(0); } @@ -85,55 +82,51 @@ public java.io.Reader getSequenceSliceReader(ByteSpan span) { public int read(char[] characterBuffer, int startingWriteIndexInCharacterBuffer, int maximumNumberOfCharsToRead) throws java.io.IOException { - - // --- Validate caller’s target window [off .. off + len) - if (characterBuffer == null) throw new NullPointerException("characterBuffer"); - if (startingWriteIndexInCharacterBuffer < 0 || - maximumNumberOfCharsToRead < 0 || - startingWriteIndexInCharacterBuffer + maximumNumberOfCharsToRead > characterBuffer.length) { - throw new IndexOutOfBoundsException( - "off=" + startingWriteIndexInCharacterBuffer + - " len=" + maximumNumberOfCharsToRead + - " bufLen=" + characterBuffer.length); - } + // --- Validate caller’s target window [off .. off + len) --- + ValidateTargetWindow(characterBuffer, startingWriteIndexInCharacterBuffer, maximumNumberOfCharsToRead); if (maximumNumberOfCharsToRead == 0) return 0; - // IMPORTANT: if you didn’t already do this in a ctor/initializer block: - // Newly-allocated ByteBuffer has remaining() == true. Mark it empty so we refill first. - if (buf.limit() != 0 && !buf.hasRemaining() && pos == span.start) { - // no-op branch, kept to show intent; prefer the initializer approach below - } - int out = 0; - while (out < maximumNumberOfCharsToRead) { - // Refill byte buffer if empty + // --- Prep the buffer for next read & fill it out --- if (!buf.hasRemaining()) { - if (pos >= endEx) break; // EOF for this slice + if (pos >= endEx) break; //if end of slice reached, stop reading - buf.clear(); // position=0, limit=capacity + buf.clear(); int toRead = (int) Math.min(buf.capacity(), endEx - pos); - buf.limit(toRead); // HARD cap at span end - int n = channel.read(buf, pos); // ABSOLUTE read; file cursor unchanged - if (n <= 0) break; // EOF / I/O issue + buf.limit(toRead); + + int n = channel.read(buf, pos); + if (n <= 0) break; //if no bytes were read, break pos += n; - buf.flip(); // prepare for reading the bytes we just filled + buf.flip(); } - - // Drain bytes -> chars into caller’s window [off .. off+len) + // Drain bytes + ASCII decode -> writees chars into caller’s window [off .. off+len) while (buf.hasRemaining() && out < maximumNumberOfCharsToRead) { byte b = buf.get(); - if (b == LF || b == CR) continue; // skip EOL bytes - // ASCII decode: write into cbuf starting at 'off', advancing by 'out' + if (b == LF || b == CR) continue; // skip irrelevant bytes characterBuffer[startingWriteIndexInCharacterBuffer + out] = (char) (b & 0xFF); out++; } } - - // If we produced nothing AND we’re at EOF, signal -1 per Reader contract + // If we produced nothing AND we’re at EOF, signal -1 return (out == 0) ? -1 : out; } + private void ValidateTargetWindow(char[] characterBuffer, + int startingWriteIndexInCharacterBuffer, + int maximumNumberOfCharsToRead) throws java.io.IOException { + if (characterBuffer == null) throw new NullPointerException("characterBuffer"); + if (startingWriteIndexInCharacterBuffer < 0 || + maximumNumberOfCharsToRead < 0 || + startingWriteIndexInCharacterBuffer + maximumNumberOfCharsToRead > characterBuffer.length) { + throw new IndexOutOfBoundsException( + "off=" + startingWriteIndexInCharacterBuffer + + " len=" + maximumNumberOfCharsToRead + + " bufLen=" + characterBuffer.length); + } + } + @Override public int read() throws java.io.IOException { char[] one = new char[1]; @@ -142,7 +135,7 @@ public int read() throws java.io.IOException { } @Override public boolean ready() { return buf.hasRemaining() || pos < endEx; } - @Override public void close() { /* no-op: we don’t own the channel */ } + @Override public void close() { /* no-op, channel is kept alive */ } }; } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index a3a7af6c..65efdeeb 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -12,10 +12,8 @@ import static org.junit.jupiter.api.Assertions.*; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.util.List; import java.util.Optional; import java.util.Set; @@ -50,7 +48,7 @@ void readingMalformedFastaSequenceFailure() throws IOException { } @Test - void basicFastaEntryManipulationSucceeds() throws IOException, FastaFileException { + void gettingSequenceSliceAsStringReturnsCorrectly() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "example2.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); @@ -81,20 +79,19 @@ void basicFastaEntryManipulationSucceeds() throws IOException, FastaFileExceptio assertEquals(0, entry2.get().leadingNsCount, "ID2 leading Ns"); assertEquals(0,entry2.get().trailingNsCount, "ID2 trailing Ns"); - String sequence1 = service.getSequenceRangeAsString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); + String sequence1 = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); assertEquals("NNACACGTTTNn", sequence1); - String sequence2 = service.getSequenceRangeAsString(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases); + String sequence2 = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases); assertEquals("ACGTGGGG", sequence2); - long adjustedTotalBases = entry1.get().totalBases - entry1.get().leadingNsCount - entry1.get().trailingNsCount; - String sequence1withoutNbases = service.getSequenceRangeAsString(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, adjustedTotalBases); + String sequence1withoutNbases = service.getSequenceSliceString(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, entry1.get().totalBasesWithoutNBases); assertEquals("ACACGTTT", sequence1withoutNbases); service.close(); } @Test - void gettingStringAsAStringVsStreamProducesSameResult() throws IOException, FastaFileException { + void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "example2.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); @@ -105,27 +102,110 @@ void gettingStringAsAStringVsStreamProducesSameResult() throws IOException, Fast Set ids = Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); assertTrue(ids.contains("ID1")); + assertTrue(ids.contains("ID2")); Optional entry1 = service.getFastaWithSubmissionId("ID1"); + Optional entry2 = service.getFastaWithSubmissionId("ID2"); - String sequence1 = service.getSequenceRangeAsString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); - assertEquals("NNACACGTTTNn", sequence1); - + // stream whole sequence with the reader String streamedSequence; - try (java.io.Reader r = service.streamSequenceRange(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases)) { + try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases)) { StringBuilder sb = new StringBuilder(); char[] cbuf = new char[8192]; int n; while ((n = r.read(cbuf)) != -1) { - // -------------------- STRING BUILD SPOT -------------------- - // At this point cbuf[0..n) already contains *decoded* characters, - // with all EOLs removed by the Reader above. Appending grows the StringBuilder. - sb.append(cbuf, 0, n); // <<< chars -> StringBuilder (later -> String) - // ----------------------------------------------------------- + sb.append(cbuf, 0, n); } streamedSequence = sb.toString(); } + // compare assertEquals("NNACACGTTTNn", streamedSequence); + // stream whole sequence with the reader + String streamedSequenceWithoutNbases; + try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, entry1.get().totalBasesWithoutNBases)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequenceWithoutNbases = sb.toString(); + } + // compare + assertEquals("ACACGTTT", streamedSequenceWithoutNbases); + + + // stream sequence with the reader + String streamedSequence2; + try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence2 = sb.toString(); + } + // compare + assertEquals("ACGTGGGG", streamedSequence2); + + + service.close(); + } + + @Test + void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example2.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = + Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); + assertTrue(ids.contains("ID1")); + assertTrue(ids.contains("ID2")); + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + Optional entry2 = service.getFastaWithSubmissionId("ID2"); + + for(long end=2; end <= entry1.get().totalBases; end++) { + // get slice as string + String sequence = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end); + // stream sequence with the reader + String streamedSequence; + try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end)) + { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence = sb.toString(); + } + // compare + assertEquals(sequence, streamedSequence); + } + + for(long end=2; end <= entry2.get().totalBases; end++) { + // get slice as string + String sequence2 = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end); + // stream sequence with the reader + String streamedSequence2; + try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end)) { + StringBuilder sb = new StringBuilder(); + char[] cbuf = new char[8192]; + int n; + while ((n = r.read(cbuf)) != -1) { + sb.append(cbuf, 0, n); + } + streamedSequence2 = sb.toString(); + } + // compare + assertEquals(sequence2, streamedSequence2); + } + service.close(); } } From 381d975d12f47eb675376ee01c2218267b85e5af Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 14:41:49 +0000 Subject: [PATCH 18/31] draft-finished --- .../fasta/SequentialFastaFileReader.java | 4 +- .../sequenceutils/SequenceIndexBuilder.java | 5 +- .../FastaFileServiceIntegrationTest.java | 57 ++++++++++++++++--- src/test/resources/fasta/example2.txt | 2 +- ...malformedFasta.txt => malformed_fasta.txt} | 0 ...JsonFasta.txt => malformed_json_fasta.txt} | 0 6 files changed, 54 insertions(+), 14 deletions(-) rename src/test/resources/fasta/{malformedFasta.txt => malformed_fasta.txt} (100%) rename src/test/resources/fasta/{malformedJsonFasta.txt => malformed_json_fasta.txt} (100%) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 20939282..a1fe4e58 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -23,8 +23,8 @@ public class SequentialFastaFileReader implements AutoCloseable { - private static final int BUFFER_SIZE = 64 * 1024; - private static final int CHAR_BUF_SIZE = 8192; + private static final int BUFFER_SIZE = 4 * 1024 * 1024; // 4 MB + private static final int CHAR_BUF_SIZE = 512 * 1024; // 512 KB private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; private static final byte CR = (byte) '\r'; diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java index ade0670c..ba920784 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -19,9 +19,8 @@ import java.util.List; public final class SequenceIndexBuilder { - - private static final int SCAN_BUF_SIZE = 64 * 1024; - private static final int COUNT_BUF_SIZE = 8 * 1024; + private static final int SCAN_BUF_SIZE = 4 * 1024 * 1024; // 4 MB + private static final int COUNT_BUF_SIZE = 4 * 1024 * 1024; // 2 MB private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index 65efdeeb..d27ba8a8 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -17,6 +17,8 @@ import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.stream.Collectors; + import org.junit.jupiter.api.Test; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; @@ -24,7 +26,7 @@ class FastaFileServiceIntegrationTest { @Test void readingMalformedFastaJsonFailure() throws IOException { //more tests like this in the JsonHeaderParserTest - File fasta = FastaTestResources.file("fasta", "malformedJsonFasta.txt"); + File fasta = FastaTestResources.file("fasta", "malformed_json_fasta.txt"); FastaFileService service = new FastaFileService(); assertThrows(FastaFileException.class, () -> { @@ -37,7 +39,7 @@ void readingMalformedFastaJsonFailure() throws IOException { //more tests like t @Test void readingMalformedFastaSequenceFailure() throws IOException { - File fasta = FastaTestResources.file("fasta", "malformedFasta.txt"); + File fasta = FastaTestResources.file("fasta", "malformed_fasta.txt"); FastaFileService service = new FastaFileService(); assertThrows(FastaFileException.class, () -> { @@ -56,8 +58,9 @@ void gettingSequenceSliceAsStringReturnsCorrectly() throws IOException, FastaFil List entries = service.getFastaEntries(); assertEquals(2, entries.size(), "should parse 2 FASTA entries"); - Set ids = - Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); + Set ids = entries.stream() + .map(e -> e.getSubmissionId()) + .collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); assertTrue(ids.contains("ID2")); @@ -99,8 +102,9 @@ void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileE List entries = service.getFastaEntries(); assertEquals(2, entries.size(), "should parse 2 FASTA entries"); - Set ids = - Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); + Set ids = entries.stream() + .map(e -> e.getSubmissionId()) + .collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); assertTrue(ids.contains("ID2")); Optional entry1 = service.getFastaWithSubmissionId("ID1"); @@ -162,8 +166,9 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException List entries = service.getFastaEntries(); assertEquals(2, entries.size(), "should parse 2 FASTA entries"); - Set ids = - Set.of(entries.get(0).getSubmissionId(), entries.get(1).getSubmissionId()); + Set ids = entries.stream() + .map(e -> e.getSubmissionId()) + .collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); assertTrue(ids.contains("ID2")); Optional entry1 = service.getFastaWithSubmissionId("ID1"); @@ -208,4 +213,40 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException service.close(); } + + // to run this, curl the sequence with: curl -o single_fasta_large_sequence.txt https://www.ebi.ac.uk/ena/cram/md5/11398cc4b68f2cceb4fd50b742d4b1ec + // then to add the fasta header run something like : + // + // tmp="$(mktemp "${TMPDIR:-/tmp}/prepend.XXXXXX")" && + //{ printf '%s\n' '>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"}'; cat -- single_fasta_large_sequence.txt; } >"$tmp" && + //mv -f -- "$tmp" single_fasta_large_sequence.txt + // + // then just move the fasta into whatever/gff3tools/src/test/resources/fasta/ + // and run the test + //@Test + void readBigSequenceSuccessfully() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "single_fasta_large_sequence.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(1, entries.size(), "should parse 1 FASTA entry"); + + Set ids = entries.stream() + .map(e -> e.getSubmissionId()) + .collect(Collectors.toSet()); + assertTrue(ids.contains("ID1")); + Optional entry1 = service.getFastaWithSubmissionId("ID1"); + + //get first 16 chars + String sequenceStart = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, 16); + assertEquals(sequenceStart, "GGGCTTTAAATGGCTC"); + + //get last 16 chars + String sequenceEnd = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, entry1.get().totalBases-15, entry1.get().totalBases); + assertEquals(sequenceEnd, "GAATTCTGATGGCTGT"); + + service.close(); + } + } diff --git a/src/test/resources/fasta/example2.txt b/src/test/resources/fasta/example2.txt index 4ab3564b..a91ba778 100644 --- a/src/test/resources/fasta/example2.txt +++ b/src/test/resources/fasta/example2.txt @@ -4,6 +4,6 @@ ACGT TTNn ->ID2 | {"description":"x", "molecule_type":"dna", "topology":"circular"}v +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"circular"} ACGT GGGG diff --git a/src/test/resources/fasta/malformedFasta.txt b/src/test/resources/fasta/malformed_fasta.txt similarity index 100% rename from src/test/resources/fasta/malformedFasta.txt rename to src/test/resources/fasta/malformed_fasta.txt diff --git a/src/test/resources/fasta/malformedJsonFasta.txt b/src/test/resources/fasta/malformed_json_fasta.txt similarity index 100% rename from src/test/resources/fasta/malformedJsonFasta.txt rename to src/test/resources/fasta/malformed_json_fasta.txt From 8fb18b0c884176ca45c3e9049126337ff83f3c35 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 14:48:54 +0000 Subject: [PATCH 19/31] tests-corrected --- .../gff3tools/fasta/FastaFileService.java | 13 ++- .../fasta/SequentialFastaFileReader.java | 47 ++++++---- .../fasta/headerutils/JsonHeaderParser.java | 31 ++----- .../fasta/sequenceutils/SequenceAlphabet.java | 1 - .../sequenceutils/SequenceIndexBuilder.java | 27 +++--- .../FastaFileServiceIntegrationTest.java | 93 ++++++++++--------- .../headerutils/JsonHeaderParserTest.java | 36 +++---- .../SequenceIndexBuilderTest.java | 12 +-- 8 files changed, 128 insertions(+), 132 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index e2267c99..2d0fdbf8 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -56,9 +56,8 @@ public Optional getFastaWithSubmissionId(String submissionId) throws } /** Return a sequence slice as a String (no EOLs) for [fromBase..toBase] inclusive. */ - public String getSequenceSliceString(SequenceRangeOption option, - String submissionId, - long fromBase, long toBase) throws FastaFileException { + public String getSequenceSliceString(SequenceRangeOption option, String submissionId, long fromBase, long toBase) + throws FastaFileException { ensureFileReaderOpen(); SequenceIndex index = sequenceIndexes.get(submissionId); if (index == null) { @@ -82,12 +81,10 @@ public String getSequenceSliceString(SequenceRangeOption option, } catch (IOException ioe) { throw new FastaFileException( "I/O while reading slice for " + submissionId + " bytes " + span.start + ".." + (span.endEx - 1), - ioe - ); + ioe); } } - /** * Return a sequence slice for reader [fromBase..toBase] (1-based, inclusive) for the given ID. * Uses the cached index to translate bases -> bytes, then asks the reader to stream @@ -132,7 +129,9 @@ public void openNewFile(File fastaFile) throws FastaFileException, IOException { fastaEntry.setTotalBases(entry.sequenceIndex.totalBases()); fastaEntry.setLeadingNsCount(entry.sequenceIndex.startNBasesCount); fastaEntry.setTrailingNsCount(entry.sequenceIndex.endNBasesCount); - long adjustedBases = entry.sequenceIndex.totalBases()- entry.sequenceIndex.startNBasesCount- entry.sequenceIndex.endNBasesCount; + long adjustedBases = entry.sequenceIndex.totalBases() + - entry.sequenceIndex.startNBasesCount + - entry.sequenceIndex.endNBasesCount; fastaEntry.setTotalBasesWithoutNBases(adjustedBases); fastaEntries.add(fastaEntry); diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index a1fe4e58..c80c3cf1 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -23,8 +23,8 @@ public class SequentialFastaFileReader implements AutoCloseable { - private static final int BUFFER_SIZE = 4 * 1024 * 1024; // 4 MB - private static final int CHAR_BUF_SIZE = 512 * 1024; // 512 KB + private static final int BUFFER_SIZE = 4 * 1024 * 1024; // 4 MB + private static final int CHAR_BUF_SIZE = 512 * 1024; // 512 KB private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; private static final byte CR = (byte) '\r'; @@ -72,6 +72,7 @@ public java.io.Reader getSequenceSliceReader(ByteSpan span) { private long pos = start; private final java.nio.ByteBuffer buf = java.nio.ByteBuffer.allocateDirect(CHAR_BUF_SIZE); + { // allocate buffer and mark it EMPTY so the very first read() refills it from the channel. // Without this, hasRemaining() is true and we'll read uninitialized bytes (→ '\0'). @@ -79,9 +80,9 @@ public java.io.Reader getSequenceSliceReader(ByteSpan span) { } @Override - public int read(char[] characterBuffer, - int startingWriteIndexInCharacterBuffer, - int maximumNumberOfCharsToRead) throws java.io.IOException { + public int read( + char[] characterBuffer, int startingWriteIndexInCharacterBuffer, int maximumNumberOfCharsToRead) + throws java.io.IOException { // --- Validate caller’s target window [off .. off + len) --- ValidateTargetWindow(characterBuffer, startingWriteIndexInCharacterBuffer, maximumNumberOfCharsToRead); if (maximumNumberOfCharsToRead == 0) return 0; @@ -90,21 +91,21 @@ public int read(char[] characterBuffer, while (out < maximumNumberOfCharsToRead) { // --- Prep the buffer for next read & fill it out --- if (!buf.hasRemaining()) { - if (pos >= endEx) break; //if end of slice reached, stop reading + if (pos >= endEx) break; // if end of slice reached, stop reading buf.clear(); int toRead = (int) Math.min(buf.capacity(), endEx - pos); buf.limit(toRead); int n = channel.read(buf, pos); - if (n <= 0) break; //if no bytes were read, break + if (n <= 0) break; // if no bytes were read, break pos += n; buf.flip(); } // Drain bytes + ASCII decode -> writees chars into caller’s window [off .. off+len) while (buf.hasRemaining() && out < maximumNumberOfCharsToRead) { byte b = buf.get(); - if (b == LF || b == CR) continue; // skip irrelevant bytes + if (b == LF || b == CR) continue; // skip irrelevant bytes characterBuffer[startingWriteIndexInCharacterBuffer + out] = (char) (b & 0xFF); out++; } @@ -113,17 +114,16 @@ public int read(char[] characterBuffer, return (out == 0) ? -1 : out; } - private void ValidateTargetWindow(char[] characterBuffer, - int startingWriteIndexInCharacterBuffer, - int maximumNumberOfCharsToRead) throws java.io.IOException { + private void ValidateTargetWindow( + char[] characterBuffer, int startingWriteIndexInCharacterBuffer, int maximumNumberOfCharsToRead) + throws java.io.IOException { if (characterBuffer == null) throw new NullPointerException("characterBuffer"); - if (startingWriteIndexInCharacterBuffer < 0 || - maximumNumberOfCharsToRead < 0 || - startingWriteIndexInCharacterBuffer + maximumNumberOfCharsToRead > characterBuffer.length) { - throw new IndexOutOfBoundsException( - "off=" + startingWriteIndexInCharacterBuffer + - " len=" + maximumNumberOfCharsToRead + - " bufLen=" + characterBuffer.length); + if (startingWriteIndexInCharacterBuffer < 0 + || maximumNumberOfCharsToRead < 0 + || startingWriteIndexInCharacterBuffer + maximumNumberOfCharsToRead > characterBuffer.length) { + throw new IndexOutOfBoundsException("off=" + startingWriteIndexInCharacterBuffer + " len=" + + maximumNumberOfCharsToRead + " bufLen=" + + characterBuffer.length); } } @@ -134,8 +134,15 @@ public int read() throws java.io.IOException { return (n == -1) ? -1 : one[0]; } - @Override public boolean ready() { return buf.hasRemaining() || pos < endEx; } - @Override public void close() { /* no-op, channel is kept alive */ } + @Override + public boolean ready() { + return buf.hasRemaining() || pos < endEx; + } + + @Override + public void close() { + /* no-op, channel is kept alive */ + } }; } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index b9b14fbb..6ca3fe55 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -14,7 +14,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.*; - import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.Topology; @@ -40,10 +39,8 @@ public ParsedHeader parse(String headerLine) throws FastaFileException { private static void fillFromJson(String raw, FastaHeader h) throws FastaFileException { if (raw == null || raw.isEmpty()) { - throw new FastaFileException( - "FASTA header contains a '|', but no JSON object was provided. " + - "Expected something like: >id { \"description\": \"...\", \"moleculeType\": \"DNA\", ... }" - ); + throw new FastaFileException("FASTA header contains a '|', but no JSON object was provided. " + + "Expected something like: >id { \"description\": \"...\", \"moleculeType\": \"DNA\", ... }"); } // Normalize curly quotes / NBSPs @@ -59,21 +56,18 @@ private static void fillFromJson(String raw, FastaHeader h) throws FastaFileExce node = MAPPER.readTree(normalized); if (node == null || !node.isObject()) { throw new FastaFileException( - "FASTA header JSON did not parse into an object. " + - "Received: " + normalized - ); + "FASTA header JSON did not parse into an object. " + "Received: " + normalized); } } catch (IOException e) { - throw new FastaFileException( - "Malformed FASTA header JSON. Failed to parse: " + normalized, e - ); + throw new FastaFileException("Malformed FASTA header JSON. Failed to parse: " + normalized, e); } // Extract fields Map m = new HashMap<>(); node.fields().forEachRemaining(e -> { String key = (e.getKey() == null ? "" : e.getKey()) - .trim().toLowerCase(Locale.ROOT) + .trim() + .toLowerCase(Locale.ROOT) .replaceAll("[\\s_-]+", ""); String val = e.getValue().isNull() ? null : e.getValue().asText(); m.put(key, val); @@ -94,20 +88,15 @@ private static void fillFromJson(String raw, FastaHeader h) throws FastaFileExce // 🔍 Validate required fields List missing = new ArrayList<>(); - if (h.description == null) - missing.add("description"); + if (h.description == null) missing.add("description"); - if (h.moleculeType == null) - missing.add("moleculeType"); + if (h.moleculeType == null) missing.add("moleculeType"); - if (h.topology == null) - missing.add("topology (must be 'LINEAR' or 'CIRCULAR')"); + if (h.topology == null) missing.add("topology (must be 'LINEAR' or 'CIRCULAR')"); if (!missing.isEmpty()) { throw new FastaFileException( - "FASTA header JSON is missing required fields: " + missing + - ". Parsed JSON was: " + normalized - ); + "FASTA header JSON is missing required fields: " + missing + ". Parsed JSON was: " + normalized); } } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java index f9f8c203..6a555767 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceAlphabet.java @@ -59,5 +59,4 @@ public String describeAllowed() { sb.append("]"); return sb.toString(); } - } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java index ba920784..cf1858aa 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -10,17 +10,16 @@ */ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; -import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; - import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.List; +import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; public final class SequenceIndexBuilder { - private static final int SCAN_BUF_SIZE = 4 * 1024 * 1024; // 4 MB - private static final int COUNT_BUF_SIZE = 4 * 1024 * 1024; // 2 MB + private static final int SCAN_BUF_SIZE = 4 * 1024 * 1024; // 4 MB + private static final int COUNT_BUF_SIZE = 4 * 1024 * 1024; // 2 MB private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; @@ -51,7 +50,7 @@ public Result buildFrom(long startPos) throws IOException, FastaFileException { ByteBuffer buf = newScanBuffer(); // ------------- scan raw bytes into provisional "sequence lines" ------------- - while (s.pos entries = service.getFastaEntries(); assertEquals(2, entries.size(), "should parse 2 FASTA entries"); - Set ids = entries.stream() - .map(e -> e.getSubmissionId()) - .collect(Collectors.toSet()); + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); assertTrue(ids.contains("ID2")); @@ -78,16 +74,22 @@ void gettingSequenceSliceAsStringReturnsCorrectly() throws IOException, FastaFil // From the sample file above: assertEquals(2, entry1.get().leadingNsCount, "ID1 leading Ns"); - assertEquals(2, entry1.get().trailingNsCount, "ID1 trailing Ns"); + assertEquals(2, entry1.get().trailingNsCount, "ID1 trailing Ns"); assertEquals(0, entry2.get().leadingNsCount, "ID2 leading Ns"); - assertEquals(0,entry2.get().trailingNsCount, "ID2 trailing Ns"); + assertEquals(0, entry2.get().trailingNsCount, "ID2 trailing Ns"); - String sequence1 = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); + String sequence1 = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases); assertEquals("NNACACGTTTNn", sequence1); - String sequence2 = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases); + String sequence2 = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases); assertEquals("ACGTGGGG", sequence2); - String sequence1withoutNbases = service.getSequenceSliceString(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, entry1.get().totalBasesWithoutNBases); + String sequence1withoutNbases = service.getSequenceSliceString( + SequenceRangeOption.WITHOUT_N_BASES, + entry1.get().submissionId, + 1, + entry1.get().totalBasesWithoutNBases); assertEquals("ACACGTTT", sequence1withoutNbases); service.close(); @@ -102,17 +104,16 @@ void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileE List entries = service.getFastaEntries(); assertEquals(2, entries.size(), "should parse 2 FASTA entries"); - Set ids = entries.stream() - .map(e -> e.getSubmissionId()) - .collect(Collectors.toSet()); + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); assertTrue(ids.contains("ID2")); Optional entry1 = service.getFastaWithSubmissionId("ID1"); Optional entry2 = service.getFastaWithSubmissionId("ID2"); - // stream whole sequence with the reader + // stream whole sequence with the reader String streamedSequence; - try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases)) { + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, entry1.get().totalBases)) { StringBuilder sb = new StringBuilder(); char[] cbuf = new char[8192]; int n; @@ -126,7 +127,11 @@ void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileE // stream whole sequence with the reader String streamedSequenceWithoutNbases; - try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, entry1.get().totalBasesWithoutNBases)) { + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WITHOUT_N_BASES, + entry1.get().submissionId, + 1, + entry1.get().totalBasesWithoutNBases)) { StringBuilder sb = new StringBuilder(); char[] cbuf = new char[8192]; int n; @@ -138,10 +143,10 @@ void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileE // compare assertEquals("ACACGTTT", streamedSequenceWithoutNbases); - // stream sequence with the reader String streamedSequence2; - try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases)) { + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, entry2.get().totalBases)) { StringBuilder sb = new StringBuilder(); char[] cbuf = new char[8192]; int n; @@ -153,7 +158,6 @@ void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileE // compare assertEquals("ACGTGGGG", streamedSequence2); - service.close(); } @@ -166,21 +170,20 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException List entries = service.getFastaEntries(); assertEquals(2, entries.size(), "should parse 2 FASTA entries"); - Set ids = entries.stream() - .map(e -> e.getSubmissionId()) - .collect(Collectors.toSet()); + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); assertTrue(ids.contains("ID2")); Optional entry1 = service.getFastaWithSubmissionId("ID1"); Optional entry2 = service.getFastaWithSubmissionId("ID2"); - for(long end=2; end <= entry1.get().totalBases; end++) { + for (long end = 2; end <= entry1.get().totalBases; end++) { // get slice as string - String sequence = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end); + String sequence = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end); // stream sequence with the reader String streamedSequence; - try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end)) - { + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, end)) { StringBuilder sb = new StringBuilder(); char[] cbuf = new char[8192]; int n; @@ -193,12 +196,14 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException assertEquals(sequence, streamedSequence); } - for(long end=2; end <= entry2.get().totalBases; end++) { + for (long end = 2; end <= entry2.get().totalBases; end++) { // get slice as string - String sequence2 = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end); + String sequence2 = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end); // stream sequence with the reader String streamedSequence2; - try (java.io.Reader r = service.getSequenceSliceReader(SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end)) { + try (java.io.Reader r = service.getSequenceSliceReader( + SequenceRangeOption.WHOLE_SEQUENCE, entry2.get().submissionId, 1, end)) { StringBuilder sb = new StringBuilder(); char[] cbuf = new char[8192]; int n; @@ -214,16 +219,18 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException service.close(); } - // to run this, curl the sequence with: curl -o single_fasta_large_sequence.txt https://www.ebi.ac.uk/ena/cram/md5/11398cc4b68f2cceb4fd50b742d4b1ec + // to run this, curl the sequence with: curl -o single_fasta_large_sequence.txt + // https://www.ebi.ac.uk/ena/cram/md5/11398cc4b68f2cceb4fd50b742d4b1ec // then to add the fasta header run something like : // // tmp="$(mktemp "${TMPDIR:-/tmp}/prepend.XXXXXX")" && - //{ printf '%s\n' '>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"}'; cat -- single_fasta_large_sequence.txt; } >"$tmp" && - //mv -f -- "$tmp" single_fasta_large_sequence.txt + // { printf '%s\n' '>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"}'; cat -- + // single_fasta_large_sequence.txt; } >"$tmp" && + // mv -f -- "$tmp" single_fasta_large_sequence.txt // // then just move the fasta into whatever/gff3tools/src/test/resources/fasta/ // and run the test - //@Test + // @Test void readBigSequenceSuccessfully() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "single_fasta_large_sequence.txt"); FastaFileService service = new FastaFileService(); @@ -232,21 +239,23 @@ void readBigSequenceSuccessfully() throws IOException, FastaFileException { List entries = service.getFastaEntries(); assertEquals(1, entries.size(), "should parse 1 FASTA entry"); - Set ids = entries.stream() - .map(e -> e.getSubmissionId()) - .collect(Collectors.toSet()); + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); assertTrue(ids.contains("ID1")); Optional entry1 = service.getFastaWithSubmissionId("ID1"); - //get first 16 chars - String sequenceStart = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, 16); + // get first 16 chars + String sequenceStart = + service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, 1, 16); assertEquals(sequenceStart, "GGGCTTTAAATGGCTC"); - //get last 16 chars - String sequenceEnd = service.getSequenceSliceString(SequenceRangeOption.WHOLE_SEQUENCE, entry1.get().submissionId, entry1.get().totalBases-15, entry1.get().totalBases); + // get last 16 chars + String sequenceEnd = service.getSequenceSliceString( + SequenceRangeOption.WHOLE_SEQUENCE, + entry1.get().submissionId, + entry1.get().totalBases - 15, + entry1.get().totalBases); assertEquals(sequenceEnd, "GAATTCTGATGGCTGT"); service.close(); } - } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java index f29aa610..0f83646e 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java @@ -1,16 +1,22 @@ /* - * Copyright 2025 EMBL... + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; import static org.junit.jupiter.api.Assertions.*; +import java.util.Optional; import org.junit.jupiter.api.Test; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.Topology; -import java.util.Optional; - public class JsonHeaderParserTest { private final JsonHeaderParser parser = new JsonHeaderParser(); @@ -38,9 +44,8 @@ void parsesStandardHeaderWithJson() { @Test void picksFirstTokenAsIdEvenWithExtraStuff() { - String line = - ">AF123456.1 extra tokens here | " + - " {\"description\":\"x\", \"molecule_type\":\"dna\", \"topology\":\"linear\"}"; + String line = ">AF123456.1 extra tokens here | " + + " {\"description\":\"x\", \"molecule_type\":\"dna\", \"topology\":\"linear\"}"; ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); assertEquals("AF123456.1", ph.getId()); @@ -61,9 +66,8 @@ void parsesCurlyQuotesAndWeirdSpacingInKeys() { @Test void normalizesKeyVariantsAndChromosomeOptionals() { - String line = - ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"topology\":\"linear\", " + - "\"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; + String line = ">ID2 | { \"Description\":\"Desc\", \"molecule-type\":\"rna\", \"topology\":\"linear\", " + + "\"Chromosome Type\":\"plasmid\", \"chromosome_location\":\"chr12:100-200\", \"CHROMOSOME_NAME\":\"pX\" }"; ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); FastaHeader h = ph.getHeader(); @@ -79,10 +83,9 @@ void normalizesKeyVariantsAndChromosomeOptionals() { @Test void handlesNbspInJson() { String nbsp = "\u00A0"; - String line = - ">ID3 | {" + - nbsp + "\"description\"" + nbsp + ":" + nbsp + "\"Alpha" + nbsp + "Beta\"" + "," + - "\"molecule_type\":\"rna\", \"topology\":\"linear\"}"; + String line = ">ID3 | {" + nbsp + + "\"description\"" + nbsp + ":" + nbsp + "\"Alpha" + nbsp + "Beta\"" + "," + + "\"molecule_type\":\"rna\", \"topology\":\"linear\"}"; ParsedHeader ph = assertDoesNotThrow(() -> parser.parse(line)); FastaHeader h = ph.getHeader(); @@ -107,8 +110,8 @@ void missingJsonIsFine_NoPipe() { @Test void trimsIdAndHandlesJustChevron() { - ParsedHeader ph1 = assertDoesNotThrow(() -> - parser.parse("> AF111 | {\"description\":\"x\",\"molecule_type\":\"dna\",\"topology\":\"linear\"}")); + ParsedHeader ph1 = assertDoesNotThrow(() -> parser.parse( + "> AF111 | {\"description\":\"x\",\"molecule_type\":\"dna\",\"topology\":\"linear\"}")); assertEquals("AF111", ph1.getId()); // No pipe: JSON not required @@ -117,7 +120,6 @@ void trimsIdAndHandlesJustChevron() { assertNull(ph2.getHeader().getDescription()); } - // --------------------------------------------------------- // INVALID CASES — MUST THROW FASTAFIleException // --------------------------------------------------------- @@ -154,7 +156,6 @@ void unknownTopologyThrows() { assertTrue(e.getMessage().contains("topology")); } - // --------------------------------------------------------- // MALFORMED JSON // --------------------------------------------------------- @@ -179,7 +180,6 @@ void malformedJsonBracesThrowsAndIncludesJsonInMessage() { assertTrue(e.getMessage().contains("{\"description")); } - @Test void malformedJsonWithTrailingCommaThrowsAndMentionsComma() { String badJson = "{ \"description\":\"y\", \"molecule_type\":\"genomic\", }"; diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java index 87b4a8a5..d754dd49 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilderTest.java @@ -52,7 +52,7 @@ void buildsIndexCorrectly() throws Exception { String l1 = "NNAC\n"; // leading N=2 String l2 = "acgt\n"; String l3 = "ttnN\n"; // trailing N=2 - String empties = "\n\t\n\n"; + String empties = "\n\n\n"; String nextHead = ">NEXT\n"; String fasta = header + l1 + l2 + l3 + empties + nextHead; @@ -118,12 +118,10 @@ void buildsIndexCorrectly() throws Exception { } @Test - void ignoresCRLFCorrectly() throws Exception { - // Mix CRLF lines in the sequence part; builder uses LF as terminator and ignores CR as non-base. + void buildsIndexCorrectlyTest2() throws Exception { String header = ">ID2\n"; - // simulate CRLF lines by inserting '\r' before '\n' - String l1 = "NNxx".replace('x', 'A') + "\r\n"; // "NNAA\r\n" - String l2 = "gggg\r\n"; + String l1 = "NNxx".replace('x', 'A') + "\n"; + String l2 = "gggg\n"; String next = ">H2\n"; String fasta = header + l1 + l2 + next; @@ -156,7 +154,7 @@ void ignoresEmptyLinesCorrectly() throws Exception { String header = ">ID3\n"; String l1 = "NACG\n"; // leading N = 1 String l2 = "NNNN\n"; // middle line of Ns — must NOT affect start/end N counts - String blanks = " \n\t\n"; + String blanks = "\n\n"; String l3 = "GGGn\n"; // trailing n = 1 String next = ">K\n"; From e60efc4e6a8bcdb138d7572e782dbcd74356ee07 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 15:06:55 +0000 Subject: [PATCH 20/31] comments --- .../ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java index e4edc521..a4d27603 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java @@ -18,10 +18,10 @@ @Getter @Setter public class FastaHeader { - String description; // mandatory (can be empty if you insist) - String moleculeType; // mandatory (can be null if empty allowed) - Topology topology; // mandatory (can be null if empty allowed) - Optional chromosomeType; // optional (doesnt have to be a json) + String description; // mandatory + String moleculeType; // mandatory + Topology topology; // mandatory + Optional chromosomeType; // optional (doesnt have to be in the json at all) Optional chromosomeLocation; // optional Optional chromosomeName; // optional } From 1f22ff9ad011b76d5632390e178603169e6d1eb0 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 15:07:33 +0000 Subject: [PATCH 21/31] comments --- .../embl/gff3tools/fasta/headerutils/JsonHeaderParser.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index 6ca3fe55..775cb0d9 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -85,13 +85,10 @@ private static void fillFromJson(String raw, FastaHeader h) throws FastaFileExce if (m.containsKey("chromosomename")) h.setChromosomeName(Optional.ofNullable(emptyToNull(m.get("chromosomename")))); - // 🔍 Validate required fields + // Validate required fields List missing = new ArrayList<>(); - if (h.description == null) missing.add("description"); - if (h.moleculeType == null) missing.add("moleculeType"); - if (h.topology == null) missing.add("topology (must be 'LINEAR' or 'CIRCULAR')"); if (!missing.isEmpty()) { From 84fb3690e9acb57c1a208bea017180248f28faf7 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 15:11:37 +0000 Subject: [PATCH 22/31] comment-and-gap-cleanup --- .../uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java | 2 +- .../ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java | 2 +- .../fasta/sequenceutils/SequenceIndexBuilder.java | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java index 1c96ef48..82e68264 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaEntryInternal.java @@ -20,7 +20,7 @@ class FastaEntryInternal { String submissionId; String accessionId; - FastaHeader header; // json info + FastaHeader header; // information needed for accessing the file long fastaStartByte; // position of '>' in the file SequenceIndex sequenceIndex; // a smart index for querying ranges in the file diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java index 32f1cd02..35aa7cfd 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/LineEntry.java @@ -11,7 +11,7 @@ package uk.ac.ebi.embl.gff3tools.fasta.sequenceutils; public final class LineEntry { - public long baseStart; // 1-based, inclusive (after edits) + public long baseStart; // 1-based, inclusive public long baseEnd; // 1-based, inclusive public long byteStart; // absolute byte offset of first base in this line public long byteEndExclusive; // absolute byte offset one past last base diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java index cf1858aa..41fb174b 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -82,7 +82,7 @@ private static final class ScanState { long pos; // absolute scan position long firstBaseByte = -1; // first allowed base byte seen long lastBaseByte = -1; // last allowed base byte seen - long nextHdr; // byte of next header (or fileSize) + long nextHdr; // byte of next header (or file end) long lineFirstByte = -1; // first allowed base byte in current line long lineLastByte = -1; // last allowed base byte in current line @@ -121,7 +121,7 @@ private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException, F commitOpenLineIfAny(s); // finalize any in-flight line return true; } else if (b == LF) { // end of a displayed sequence line - commitOpenLineIfAny(s); // (2) only lines with bases are committed + commitOpenLineIfAny(s); // only lines with bases are committed continue; } else if (alphabet.isAllowed(b)) { observeBase(abs, s); @@ -164,7 +164,7 @@ private void observeBase(long abs, ScanState s) { } private void commitOpenLineIfAny(ScanState s) { - if (s.basesInLine <= 0) return; // (2) skip empty lines + if (s.basesInLine <= 0) return; // skip empty lines long baseStart = s.basesSoFar + 1; long baseEnd = s.basesSoFar + s.basesInLine; long byteStart = s.lineFirstByte; @@ -191,7 +191,6 @@ private List filterLinesWithinWindow(List raw, long firstB out.add(L); } } - // baseStart/baseEnd are already contiguous (1..N) in raw; filtering preserves order & numbering return out; } From 80afdd62bc62a838ebf4ca8bf875a339637e12ea Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 15:20:38 +0000 Subject: [PATCH 23/31] optimized-reading-a-bit-more --- .../ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java | 2 +- .../embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index c80c3cf1..fbb3d67a 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -153,7 +153,7 @@ public List readAll() throws FastaFileException, IOException var entry = readNext(position); if (entry.isEmpty()) break; entries.add(entry.get()); - position = channel.position(); + position = entry.get().getSequenceIndex().lastBaseByte; // read from the end of last sequence } return entries; } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index 4b85de6d..0ea7d3d1 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -230,7 +230,7 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException // // then just move the fasta into whatever/gff3tools/src/test/resources/fasta/ // and run the test - // @Test + //@Test void readBigSequenceSuccessfully() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "single_fasta_large_sequence.txt"); FastaFileService service = new FastaFileService(); From e13ff99ce20c95a8bff57919de1a0b43d94a5c35 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Wed, 10 Dec 2025 15:27:58 +0000 Subject: [PATCH 24/31] spacing-fixes-for-spotless --- .../embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index 0ea7d3d1..4b85de6d 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -230,7 +230,7 @@ void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException // // then just move the fasta into whatever/gff3tools/src/test/resources/fasta/ // and run the test - //@Test + // @Test void readBigSequenceSuccessfully() throws IOException, FastaFileException { File fasta = FastaTestResources.file("fasta", "single_fasta_large_sequence.txt"); FastaFileService service = new FastaFileService(); From b483272383a2fcc3f26fb9e0be7a5124bc6577e7 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 11 Dec 2025 10:17:31 +0000 Subject: [PATCH 25/31] switched-from-optional-to-string-in-fields --- .../gff3tools/fasta/headerutils/FastaHeader.java | 7 +++---- .../fasta/headerutils/JsonHeaderParser.java | 13 +++---------- .../fasta/headerutils/JsonHeaderParserTest.java | 13 ++++++------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java index a4d27603..59be8951 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/FastaHeader.java @@ -10,7 +10,6 @@ */ package uk.ac.ebi.embl.gff3tools.fasta.headerutils; -import java.util.Optional; import lombok.Getter; import lombok.Setter; import uk.ac.ebi.embl.gff3tools.fasta.Topology; @@ -21,7 +20,7 @@ public class FastaHeader { String description; // mandatory String moleculeType; // mandatory Topology topology; // mandatory - Optional chromosomeType; // optional (doesnt have to be in the json at all) - Optional chromosomeLocation; // optional - Optional chromosomeName; // optional + String chromosomeType; // optional (doesnt have to be in the json at all) + String chromosomeLocation; // optional + String chromosomeName; // optional } diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index 775cb0d9..6e974ecc 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -27,9 +27,6 @@ public ParsedHeader parse(String headerLine) throws FastaFileException { String id = idPart.isEmpty() ? "" : idPart.split("\\s+")[0]; FastaHeader h = new FastaHeader(); - h.setChromosomeType(Optional.empty()); - h.setChromosomeLocation(Optional.empty()); - h.setChromosomeName(Optional.empty()); if (pipe >= 0) { fillFromJson(rest.substring(pipe + 1).trim(), h); // may throw IOException @@ -77,13 +74,9 @@ private static void fillFromJson(String raw, FastaHeader h) throws FastaFileExce h.setDescription(m.get("description")); h.setMoleculeType(m.get("moleculetype")); h.setTopology(parseTopology(m.get("topology"))); - - if (m.containsKey("chromosometype")) - h.setChromosomeType(Optional.ofNullable(emptyToNull(m.get("chromosometype")))); - if (m.containsKey("chromosomelocation")) - h.setChromosomeLocation(Optional.ofNullable(emptyToNull(m.get("chromosomelocation")))); - if (m.containsKey("chromosomename")) - h.setChromosomeName(Optional.ofNullable(emptyToNull(m.get("chromosomename")))); + h.setChromosomeType(m.get("chromosometype")); + h.setChromosomeLocation(m.get("chromosomelocation")); + h.setChromosomeName(emptyToNull(m.get("chromosomename"))); // Validate required fields List missing = new ArrayList<>(); diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java index 0f83646e..396923c9 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParserTest.java @@ -12,7 +12,6 @@ import static org.junit.jupiter.api.Assertions.*; -import java.util.Optional; import org.junit.jupiter.api.Test; import uk.ac.ebi.embl.gff3tools.exception.FastaFileException; import uk.ac.ebi.embl.gff3tools.fasta.Topology; @@ -37,9 +36,9 @@ void parsesStandardHeaderWithJson() { assertEquals("Pinus sativa", h.getDescription()); assertEquals("genomic", h.getMoleculeType()); assertEquals(Topology.CIRCULAR, h.getTopology()); - assertTrue(h.getChromosomeType().isEmpty()); - assertTrue(h.getChromosomeLocation().isEmpty()); - assertTrue(h.getChromosomeName().isEmpty()); + assertEquals(null, h.getChromosomeType()); + assertEquals(null, h.getChromosomeLocation()); + assertEquals(null, h.getChromosomeName()); } @Test @@ -75,9 +74,9 @@ void normalizesKeyVariantsAndChromosomeOptionals() { assertEquals("Desc", h.getDescription()); assertEquals("rna", h.getMoleculeType()); assertEquals(Topology.LINEAR, h.getTopology()); - assertEquals(Optional.of("plasmid"), h.getChromosomeType()); - assertEquals(Optional.of("chr12:100-200"), h.getChromosomeLocation()); - assertEquals(Optional.of("pX"), h.getChromosomeName()); + assertEquals("plasmid", h.getChromosomeType()); + assertEquals("chr12:100-200", h.getChromosomeLocation()); + assertEquals("pX", h.getChromosomeName()); } @Test From 8662531e6dbe6472859e417a56c8f633af0fde77 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 11 Dec 2025 10:20:59 +0000 Subject: [PATCH 26/31] comment --- .../java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index 2d0fdbf8..c3829746 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -33,7 +33,7 @@ public final class FastaFileService { private HashMap sequenceIndexes = new HashMap<>(); private File file; - private SequentialFastaFileReader reader; // owned here + private SequentialFastaFileReader reader; public FastaFileService() { this.file = null; From c206db8792bf9d8d785a5dd99737150f08d54e39 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 11 Dec 2025 14:07:58 +0000 Subject: [PATCH 27/31] renamed-function --- .../ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index fbb3d67a..d070c5e5 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -59,7 +59,7 @@ public boolean readingFile() { } public String getSequenceSliceString(ByteSpan span) throws IOException { - return readAsciiWithoutNewlines(span.start, span.endEx); + return getHeaderASCIILine(span.start, span.endEx); } /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. @@ -192,7 +192,7 @@ private Optional readNext(long from) throws FastaFileExcepti } /** Read ASCII bytes from [byteStart, byteEndExclusive) skipping LF/CR; does not change channel.position(). */ - public String readAsciiWithoutNewlines(long byteStart, long byteEndExclusive) throws IOException { + public String getHeaderASCIILine(long byteStart, long byteEndExclusive) throws IOException { if (byteStart < 0 || byteEndExclusive < byteStart || byteEndExclusive > fileSize) { throw new IllegalArgumentException("Bad byte window: " + byteStart + ".." + byteEndExclusive); } From a71080d4357738611898a8c14ee7038cb2895217 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 11 Dec 2025 14:13:51 +0000 Subject: [PATCH 28/31] renamed-function --- .../fasta/SequentialFastaFileReader.java | 66 +++++++++---------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index d070c5e5..4e8f1659 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -59,7 +59,36 @@ public boolean readingFile() { } public String getSequenceSliceString(ByteSpan span) throws IOException { - return getHeaderASCIILine(span.start, span.endEx); + long byteStart = span.start; + long byteEndExclusive = span.endEx; + + if (byteStart < 0 || byteEndExclusive < byteStart || byteEndExclusive > fileSize) { + throw new IllegalArgumentException("Bad byte window: " + byteStart + ".." + byteEndExclusive); + } + long remain = byteEndExclusive - byteStart; + long off = byteStart; + + // pre-size builder with a sane cap (skip newlines, so content <= remain) + int expect = (int) Math.min(remain, 1_000_000L); + StringBuilder sb = new StringBuilder(expect); + + ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); + while (remain > 0) { + buf.clear(); + int want = (int) Math.min(buf.capacity(), remain); + buf.limit(want); + int n = channel.read(buf, off); + if (n <= 0) break; + buf.flip(); + while (buf.hasRemaining()) { + byte b = buf.get(); + if (b == LF || b == CR) continue; + sb.append((char) (b & 0xFF)); + } + remain -= n; + off += n; + } + return sb.toString(); } /** Char-stream view over [span.start, span.endEx): ASCII decode, skip LF/CR. @@ -167,7 +196,7 @@ private Optional readNext(long from) throws FastaFileExcepti long headerPos = headerPosOpt.getAsLong(); channel.position(headerPos); - String headerLine = readAsciiLineFromCurrentPosition(); + String headerLine = readHeaderLine(); if (headerLine == null) throw new FastaFileException("Header is malformed at byte " + headerPos); ParsedHeader ph = headerParser.parse(headerLine); @@ -191,37 +220,6 @@ private Optional readNext(long from) throws FastaFileExcepti } } - /** Read ASCII bytes from [byteStart, byteEndExclusive) skipping LF/CR; does not change channel.position(). */ - public String getHeaderASCIILine(long byteStart, long byteEndExclusive) throws IOException { - if (byteStart < 0 || byteEndExclusive < byteStart || byteEndExclusive > fileSize) { - throw new IllegalArgumentException("Bad byte window: " + byteStart + ".." + byteEndExclusive); - } - long remain = byteEndExclusive - byteStart; - long off = byteStart; - - // pre-size builder with a sane cap (skip newlines, so content <= remain) - int expect = (int) Math.min(remain, 1_000_000L); - StringBuilder sb = new StringBuilder(expect); - - ByteBuffer buf = ByteBuffer.allocateDirect(BUFFER_SIZE); - while (remain > 0) { - buf.clear(); - int want = (int) Math.min(buf.capacity(), remain); - buf.limit(want); - int n = channel.read(buf, off); - if (n <= 0) break; - buf.flip(); - while (buf.hasRemaining()) { - byte b = buf.get(); - if (b == LF || b == CR) continue; // omit line breaks on the fly - sb.append((char) (b & 0xFF)); // ASCII - } - remain -= n; - off += n; - } - return sb.toString(); - } - // ------------------ header seeking & line reading ------------------ private OptionalLong seekToNextHeader(long from) throws IOException { @@ -260,7 +258,7 @@ private byte peek(long abs) throws IOException { } /** Reads one ASCII line from current position, advances past LF or to EOF. */ - private String readAsciiLineFromCurrentPosition() throws IOException { + private String readHeaderLine() throws IOException { long scanPos = channel.position(); if (scanPos >= fileSize) return null; From 2adc6e31aece33b704d50f6de9612c33dcdf4ad3 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 11 Dec 2025 14:19:07 +0000 Subject: [PATCH 29/31] renamed-function --- .../gff3tools/fasta/SequentialFastaFileReader.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java index 4e8f1659..421a4416 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/SequentialFastaFileReader.java @@ -194,9 +194,7 @@ private Optional readNext(long from) throws FastaFileExcepti if (headerPosOpt.isEmpty()) return Optional.empty(); long headerPos = headerPosOpt.getAsLong(); - channel.position(headerPos); - - String headerLine = readHeaderLine(); + String headerLine = readHeaderLine(headerPos); if (headerLine == null) throw new FastaFileException("Header is malformed at byte " + headerPos); ParsedHeader ph = headerParser.parse(headerLine); @@ -257,8 +255,10 @@ private byte peek(long abs) throws IOException { return (n == 1) ? one.get(0) : 0; } - /** Reads one ASCII line from current position, advances past LF or to EOF. */ - private String readHeaderLine() throws IOException { + /** Reads one ASCII line from input position, assuming the position handed to it contains '>', advances past LF or to EOF. */ + private String readHeaderLine(long from) throws IOException { + channel.position(from); + long scanPos = channel.position(); if (scanPos >= fileSize) return null; From 271681ac97e9418cd1bd44244bdc17ba2208f410 Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 11 Dec 2025 14:21:33 +0000 Subject: [PATCH 30/31] removed-useless-bit --- .../uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java index c3829746..94d83e4f 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileService.java @@ -116,7 +116,7 @@ public Reader getSequenceSliceReader(SequenceRangeOption option, String submissi // ---------------------------- interactions with the reader ---------------------------- public void openNewFile(File fastaFile) throws FastaFileException, IOException { - ensureFileReaderClosed(); // if already open, close first + close(); // if already open, close first this.file = Objects.requireNonNull(fastaFile, "file"); this.fastaEntries.clear(); this.sequenceIndexes.clear(); @@ -147,10 +147,6 @@ public void close() throws IOException { } } - private void ensureFileReaderClosed() throws IOException { - if (reader != null) close(); - } - private void ensureFileReaderOpen() { if (reader == null || !reader.readingFile()) throw new IllegalStateException("Service is not open. Call open() first."); From 9e81919a7e1e91a41afc4708e0bbee48e727601e Mon Sep 17 00:00:00 2001 From: Iva Tutis Date: Thu, 18 Dec 2025 16:11:53 +0000 Subject: [PATCH 31/31] added-carriage-return-ignoring --- .../fasta/headerutils/JsonHeaderParser.java | 1 + .../sequenceutils/SequenceIndexBuilder.java | 3 +- .../FastaFileServiceIntegrationTest.java | 63 ++++++++++++++++++- .../gff3tools/fasta/FastaTestResources.java | 2 +- src/test/resources/fasta/example.txt | 29 +++------ src/test/resources/fasta/example2.txt | 9 --- .../resources/fasta/example_to_delete.txt | 24 +++++++ .../example_with_carriage_return_char.txt | 26 ++++++++ 8 files changed, 121 insertions(+), 36 deletions(-) delete mode 100644 src/test/resources/fasta/example2.txt create mode 100644 src/test/resources/fasta/example_to_delete.txt create mode 100644 src/test/resources/fasta/example_with_carriage_return_char.txt diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java index 6e974ecc..5f61a309 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/headerutils/JsonHeaderParser.java @@ -41,6 +41,7 @@ private static void fillFromJson(String raw, FastaHeader h) throws FastaFileExce } // Normalize curly quotes / NBSPs + String normalized = raw.replace('\u201C', '"') .replace('\u201D', '"') .replace('\u2018', '\'') diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java index 41fb174b..3a075d4a 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/fasta/sequenceutils/SequenceIndexBuilder.java @@ -23,6 +23,7 @@ public final class SequenceIndexBuilder { private static final byte GT = (byte) '>'; private static final byte LF = (byte) '\n'; + private static final byte CR = (byte) '\r'; public static final class Result { public final SequenceIndex index; @@ -120,7 +121,7 @@ private boolean processBuffer(ByteBuffer buf, ScanState s) throws IOException, F s.nextHdr = abs; // stop window at header byte commitOpenLineIfAny(s); // finalize any in-flight line return true; - } else if (b == LF) { // end of a displayed sequence line + } else if (b == LF || b == CR) { // end of a displayed sequence line or CR commitOpenLineIfAny(s); // only lines with bases are committed continue; } else if (alphabet.isAllowed(b)) { diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java index 4b85de6d..82876f7e 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaFileServiceIntegrationTest.java @@ -47,9 +47,66 @@ void readingMalformedFastaSequenceFailure() throws IOException { service.close(); } + @Test + void proccessingEntriesWithCarriageReturnsCorrectly() throws IOException, FastaFileException { + File fasta = FastaTestResources.file("fasta", "example_with_carriage_return_char.txt"); + FastaFileService service = new FastaFileService(); + service.openNewFile(fasta); + + List entries = service.getFastaEntries(); + assertEquals(2, entries.size(), "should parse 2 FASTA entries"); + + Set ids = entries.stream().map(e -> e.getSubmissionId()).collect(Collectors.toSet()); + assertTrue(ids.contains("AF123456.1")); + assertTrue(ids.contains("AF123455.2")); + + Optional entry1 = service.getFastaWithSubmissionId("AF123456.1"); + Optional entry2 = service.getFastaWithSubmissionId("AF123455.2"); + Optional imaginaryEntry = service.getFastaWithSubmissionId("ID3"); + assertTrue(entry1.isPresent(), "index for AF123456.1 must exist"); + assertTrue(entry2.isPresent(), "index for AF123455.2 must exist"); + assertTrue(imaginaryEntry.isEmpty(), "index for ID3 must not exist"); + + // From the sample file above: + assertEquals(9, entry1.get().leadingNsCount, "AF123456.1 leading Ns"); + assertEquals(1, entry1.get().trailingNsCount, "AF123456.1 trailing Ns"); + assertEquals(0, entry2.get().leadingNsCount, "AF123455.2 leading Ns"); + assertEquals(0, entry2.get().trailingNsCount, "AF123455.2 trailing Ns"); + + String sequence1StartSlice = + service.getSequenceSliceString(SequenceRangeOption.WITHOUT_N_BASES, entry1.get().submissionId, 1, 11); + assertEquals("CCCGGCGCGGG", sequence1StartSlice); + + String sequence1EndSlice = service.getSequenceSliceString( + SequenceRangeOption.WITHOUT_N_BASES, + entry1.get().submissionId, + entry1.get().totalBasesWithoutNBases - 9, + entry1.get().totalBasesWithoutNBases); + assertEquals("AAAAAAAAAA", sequence1EndSlice); + + String sequence2withoutNbases = service.getSequenceSliceString( + SequenceRangeOption.WITHOUT_N_BASES, + entry2.get().submissionId, + 1, + entry2.get().totalBasesWithoutNBases); + assertEquals( + "CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC" + + "TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG" + + "AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG" + + "ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA" + + "CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA" + + "ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC" + + "ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT" + + "CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA" + + "AAAAAAAAAAAA", + sequence2withoutNbases); + + service.close(); + } + @Test void gettingSequenceSliceAsStringReturnsCorrectly() throws IOException, FastaFileException { - File fasta = FastaTestResources.file("fasta", "example2.txt"); + File fasta = FastaTestResources.file("fasta", "example.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); @@ -97,7 +154,7 @@ void gettingSequenceSliceAsStringReturnsCorrectly() throws IOException, FastaFil @Test void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileException { - File fasta = FastaTestResources.file("fasta", "example2.txt"); + File fasta = FastaTestResources.file("fasta", "example.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); @@ -163,7 +220,7 @@ void gettingSequenceViaReaderGivesCorrectResult() throws IOException, FastaFileE @Test void gettingStringAsAStringVsStreamProducesSameResultSlices() throws IOException, FastaFileException { - File fasta = FastaTestResources.file("fasta", "example2.txt"); + File fasta = FastaTestResources.file("fasta", "example.txt"); FastaFileService service = new FastaFileService(); service.openNewFile(fasta); diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java index e3f09cfe..87427e65 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/fasta/FastaTestResources.java @@ -18,7 +18,7 @@ public final class FastaTestResources { private FastaTestResources() {} - /** Returns a Path to a resource like ("fasta", "example2.txt"). */ + /** Returns a Path to a resource like ("fasta", "example.txt"). */ public static Path path(String dir, String fileName) { Objects.requireNonNull(dir, "dir"); Objects.requireNonNull(fileName, "fileName"); diff --git a/src/test/resources/fasta/example.txt b/src/test/resources/fasta/example.txt index 8079c4e9..a91ba778 100644 --- a/src/test/resources/fasta/example.txt +++ b/src/test/resources/fasta/example.txt @@ -1,24 +1,9 @@ +>ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"} +NNAC +ACGT -NONSENSE -NONSENSE +TTNn ->AF123456.1 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } -CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC -TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG -AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG -ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA -CGCGTTTTGCATATTACAGTTGAGTGCCTCGACTTAGATTGCAATATAAGCGGCCAGCAA -ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC -ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT -CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA -AAAAAAAAAAAA ->AF123455.2 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } -CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC -TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG -AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG -ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA -CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA -ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC -ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT -CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA -AAAAAAAAAAAA +>ID2 | {"description":"x", "molecule_type":"dna", "topology":"circular"} +ACGT +GGGG diff --git a/src/test/resources/fasta/example2.txt b/src/test/resources/fasta/example2.txt deleted file mode 100644 index a91ba778..00000000 --- a/src/test/resources/fasta/example2.txt +++ /dev/null @@ -1,9 +0,0 @@ ->ID1 | {"description":"x", "molecule_type":"dna", "topology":"linear"} -NNAC -ACGT - -TTNn - ->ID2 | {"description":"x", "molecule_type":"dna", "topology":"circular"} -ACGT -GGGG diff --git a/src/test/resources/fasta/example_to_delete.txt b/src/test/resources/fasta/example_to_delete.txt new file mode 100644 index 00000000..8079c4e9 --- /dev/null +++ b/src/test/resources/fasta/example_to_delete.txt @@ -0,0 +1,24 @@ + +NONSENSE +NONSENSE + +>AF123456.1 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGCATATTACAGTTGAGTGCCTCGACTTAGATTGCAATATAAGCGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA +>AF123455.2 |{ “description”: “Pinus sativa isolate xyz, complete mitochondrion”, “ molecule_type”: “genomic”, “topology”: “circular” } +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA diff --git a/src/test/resources/fasta/example_with_carriage_return_char.txt b/src/test/resources/fasta/example_with_carriage_return_char.txt new file mode 100644 index 00000000..2c2637c9 --- /dev/null +++ b/src/test/resources/fasta/example_with_carriage_return_char.txt @@ -0,0 +1,26 @@ +>AF123456.1 |{"description":"x", "molecule_type":"dna", "topology":"circular"} +nnnNNNNNNCCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG + +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG + +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA + +CGCGTTTTGCATATTACAGTTGAGTGCCTCGACTTAGATTGCAATATAAGCGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT + +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAAN +>AF123455.2 |{"description":"x", "molecule_type":"dna", "topology":"circular"} +CCCGGCGCGGGCAAGAAGCTGCCGCGTCTGCCCAAGTGTGCCCGCTGCCGCAACCACGGC +TACTCCTCGCCGCTGAAGGGGCACAAGCGGTTCTGCATGTGGCGGGACTGCCAGTGCAAG +AAGTGCAGCCTGATCCGCCGAGCGGCAGGGGTGATGGCCGTGCAGGTTGCACTGAGGAGG + +ATGTGTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCGTATCGCCAAATTAA +CGCGTTTTGTAGTGGTTCCTCGTAGGCTCCAGACGTTTTCTCCTCAGACGTGGCCAGCAA +ACAAGTCTCAAAAAAAAGTTACGTGCGTTTCTGCGAGTGTTATTTTGTTAAGAACGGCTC + +ACAGTGTCCTCTTCCTGTGTTACAGAAGCCAACCTGAAATGAAACTAGTCTGGAAAAATT +CATTGTTCTCTGTAGTTGCAGCTGTACCTGAAATAAAAATGTTATTGATGACTGAAAAAA +AAAAAAAAAAAA