diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/CdsRnaLocusFix.java b/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/CdsRnaLocusFix.java new file mode 100644 index 00000000..3404f9e9 --- /dev/null +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/CdsRnaLocusFix.java @@ -0,0 +1,134 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.validation.fix; + +import static uk.ac.ebi.embl.gff3tools.validation.meta.ValidationType.ANNOTATION; + +import java.util.*; +import lombok.extern.slf4j.Slf4j; +import uk.ac.ebi.embl.gff3tools.gff3.GFF3Annotation; +import uk.ac.ebi.embl.gff3tools.gff3.GFF3Attributes; +import uk.ac.ebi.embl.gff3tools.gff3.GFF3Feature; +import uk.ac.ebi.embl.gff3tools.utils.ConversionUtils; +import uk.ac.ebi.embl.gff3tools.utils.OntologyClient; +import uk.ac.ebi.embl.gff3tools.utils.OntologyTerm; +import uk.ac.ebi.embl.gff3tools.validation.meta.FixMethod; +import uk.ac.ebi.embl.gff3tools.validation.meta.Gff3Fix; + +@Slf4j +@Gff3Fix( + name = "CDS_RNA_LOCUS", + description = + "Transfers gene, gene_synonym, and locus_tag attributes from gene features to their corresponding CDS, rRNA, and tRNA child features based on location overlap.") +public class CdsRnaLocusFix { + + private final OntologyClient ontologyClient = ConversionUtils.getOntologyClient(); + + @FixMethod( + rule = "CDS_RNA_LOCUS", + description = + "Transfers gene, 
gene_synonym, and locus_tag attributes from gene features to their corresponding CDS, rRNA, and tRNA child features based on location overlap.",
+            type = ANNOTATION)
+    public void fix(GFF3Annotation annotation, int line) {
+        // Donor genes, and CDS/tRNA/rRNA features that still lack every locus attribute.
+        List<GFF3Feature> geneFeatures = new ArrayList<>();
+        List<GFF3Feature> nonLocusFeatures = new ArrayList<>();
+        for (GFF3Feature feature : annotation.getFeatures()) {
+            Optional<String> soIdOpt = ontologyClient.findTermByNameOrSynonym(feature.getName());
+            // Skip only the unresolvable feature; returning here would cancel the whole fix.
+            if (soIdOpt.isEmpty()) continue;
+
+            String soId = soIdOpt.get();
+            // Donors: the GENE term itself, or PSEUDOGENE / UNITARY_PSEUDOGENE and their descendants.
+            boolean isGene = OntologyTerm.GENE.ID.equals(soId)
+                    || OntologyTerm.PSEUDOGENE.ID.equals(soId)
+                    || OntologyTerm.UNITARY_PSEUDOGENE.ID.equals(soId)
+                    || ontologyClient.isSelfOrDescendantOf(soId, OntologyTerm.PSEUDOGENE.ID)
+                    || ontologyClient.isSelfOrDescendantOf(soId, OntologyTerm.UNITARY_PSEUDOGENE.ID);
+            if (isGene) {
+                geneFeatures.add(feature);
+                continue;
+            }
+
+            // Receivers: CDS, tRNA and rRNA terms or their descendants (looked up for non-genes only).
+            boolean isRelevant = OntologyTerm.CDS.ID.equals(soId)
+                    || OntologyTerm.TRNA.ID.equals(soId)
+                    || OntologyTerm.RRNA.ID.equals(soId)
+                    || ontologyClient.isSelfOrDescendantOf(soId, OntologyTerm.CDS.ID)
+                    || ontologyClient.isSelfOrDescendantOf(soId, OntologyTerm.TRNA.ID)
+                    || ontologyClient.isSelfOrDescendantOf(soId, OntologyTerm.RRNA.ID);
+
+            if (isRelevant) {
+                boolean hasGeneFields = feature.hasAttribute(GFF3Attributes.LOCUS_TAG)
+                        || feature.hasAttribute(GFF3Attributes.GENE)
+                        || feature.hasAttribute(GFF3Attributes.GENE_SYNONYM);
+                // A feature that already carries any of the three attributes is left untouched.
+                if (!hasGeneFields) {
+                    nonLocusFeatures.add(feature);
+                }
+            }
+        }
+
+        if (geneFeatures.isEmpty() || nonLocusFeatures.isEmpty()) {
+            return;
+        }
+
+        // Second pass: each collected feature takes its attributes from the first gene, in
+        // encounter order, whose start..end range fully contains the feature's own range.
+        for (GFF3Feature child : nonLocusFeatures) {
+            for (GFF3Feature gene : geneFeatures) {
+                // NOTE(review): only coordinates are compared, not the sequence id - confirm this is intended.
+                if (isLocationWithin(child.getStart(), child.getEnd(), gene.getStart(), gene.getEnd())) {
+                    propagateGeneAttributes(gene,
child, line); + break; + } + } + } + } + + private boolean isLocationWithin(long start1, long end1, long start2, long end2) { + return start1 >= start2 && end1 <= end2; + } + + private void propagateGeneAttributes(GFF3Feature geneFeature, GFF3Feature childFeature, int line) { + String locusTag = geneFeature.getAttributeByName(GFF3Attributes.LOCUS_TAG); + String gene = geneFeature.getAttributeByName(GFF3Attributes.GENE); + String geneSynonym = geneFeature.getAttributeByName(GFF3Attributes.GENE_SYNONYM); + + if (locusTag != null && !childFeature.hasAttribute(GFF3Attributes.LOCUS_TAG)) { + childFeature.setAttribute(GFF3Attributes.LOCUS_TAG, locusTag); + log.info( + "Adding {} from gene {} to {} at line {}", + GFF3Attributes.LOCUS_TAG, + geneFeature.getName(), + childFeature.getName(), + line); + } + if (geneSynonym != null && !childFeature.hasAttribute(GFF3Attributes.GENE_SYNONYM)) { + childFeature.setAttribute(GFF3Attributes.GENE_SYNONYM, geneSynonym); + log.info( + "Adding {} from gene {} to {} at line {}", + GFF3Attributes.GENE_SYNONYM, + geneFeature.getName(), + childFeature.getName(), + line); + } + if (gene != null && !childFeature.hasAttribute(GFF3Attributes.GENE)) { + childFeature.setAttribute(GFF3Attributes.GENE, gene); + log.info( + "Adding {} from gene {} to {} at line {}", + GFF3Attributes.GENE, + geneFeature.getName(), + childFeature.getName(), + line); + } + } +} diff --git a/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFix.java b/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFix.java index 3a8fa7f9..6acafc87 100644 --- a/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFix.java +++ b/src/main/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFix.java @@ -12,6 +12,8 @@ import static uk.ac.ebi.embl.gff3tools.validation.meta.ValidationType.FEATURE; +import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; import lombok.extern.slf4j.Slf4j; import 
uk.ac.ebi.embl.gff3tools.gff3.GFF3Attributes; @@ -20,26 +22,85 @@ import uk.ac.ebi.embl.gff3tools.validation.meta.Gff3Fix; @Slf4j -@Gff3Fix(name = "EC_NUMBER_ATTRIBUTE", description = "Remove the EC_number attribute if not matches the pattern") +@Gff3Fix(name = "EC_NUMBER", description = "Remove the EC_number attribute if not matches the pattern") public class EcNumberValueFix { private static final Pattern EC_NUMBER_PATTERN = Pattern.compile("^[0-9]+(\\.(?:[0-9]+|-)){0,2}\\.(?:[0-9]+|-|n[0-9]*)$"); + private static final Pattern PRODUCT_EC_NUMBER_PATTERN = Pattern.compile( + "\\(?\\[?(?:EC|ec|Ec)[:=]?\\s*((?:\\d{1,3}|-)\\.(?:\\d{1,3}|-)\\.(?:\\d{1,3}|-)\\.(?:\\d{1,3}|-))]?\\)?"); @FixMethod( - rule = "EC_NUMBER_ATTRIBUTE", + rule = "EC_NUMBER", description = "Remove the EC_number attribute if not matches the pattern", type = FEATURE) public void fixFeature(GFF3Feature feature, int line) { + + if (!feature.hasAttribute(GFF3Attributes.EC_NUMBER) && !feature.hasAttribute(GFF3Attributes.PRODUCT)) { + return; + } + String ecNumber = feature.getAttributeByName(GFF3Attributes.EC_NUMBER); - if (ecNumber == null || ecNumber.isBlank()) return; - if (ecNumber.equalsIgnoreCase("deleted") || !isValidECNumber(ecNumber.trim())) { - log.info("Removing invalid values on {} attribute at line: {}", GFF3Attributes.EC_NUMBER, line); + if (ecNumber != null) { + if ("deleted".equalsIgnoreCase(ecNumber) || !isValidECNumber(ecNumber.trim())) { + log.info( + "Fix to remove {} attribute due to invalid value '{}' at line: {}", + GFF3Attributes.EC_NUMBER, + ecNumber, + line); + feature.removeAttribute(GFF3Attributes.EC_NUMBER); + return; + } + } + + List productValues = feature.getAttributeValueList(GFF3Attributes.PRODUCT); + if (productValues == null || productValues.isEmpty()) { + return; + } + + boolean hasInvalidProduct = productValues.stream() + .anyMatch(p -> p != null + && ("hypothetical protein".equalsIgnoreCase(p.trim()) || "unknown".equalsIgnoreCase(p.trim()))); + + if 
(hasInvalidProduct) { + log.info("Fix: removing EC_NUMBER because product is 'hypothetical protein' or 'unknown' at line {}", line); feature.removeAttribute(GFF3Attributes.EC_NUMBER); + return; + } + + for (String product : productValues) { + if (product == null) continue; + if (hasEcNumber(product)) { + String extracted = getEcNumberFromProduct(product); + + if (!extracted.isEmpty() && isValidECNumber(extracted)) { + + log.info( + "Fix: setting EC_NUMBER='{}' extracted from product '{}' at line {}", + extracted, + product, + line); + + feature.setAttribute(GFF3Attributes.EC_NUMBER, extracted); + return; + } + } } } + private String getEcNumberFromProduct(String product) { + Matcher matcher = PRODUCT_EC_NUMBER_PATTERN.matcher(product); + if (matcher.find()) { + return matcher.group(1).trim(); + } + return ""; + } + + private boolean hasEcNumber(String product) { + return PRODUCT_EC_NUMBER_PATTERN.matcher(product).find(); + } + private boolean isValidECNumber(String ecNumber) { return EC_NUMBER_PATTERN.matcher(ecNumber).matches(); } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/TestUtils.java b/src/test/java/uk/ac/ebi/embl/gff3tools/TestUtils.java index abf4a8b5..1214c645 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/TestUtils.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/TestUtils.java @@ -185,6 +185,24 @@ public static GFF3Feature createGFF3FeatureWithAccession( ); } + public static GFF3Feature createGFF3Feature( + String featureName, String seqId, long start, long end, Map attributes) { + + return new GFF3Feature( + Optional.of(featureName), + Optional.empty(), + seqId, + Optional.empty(), + ".", + featureName, + start, + end, + ".", + "+", + "", + attributes); + } + public static String defaultAccession() { return DEFAULT_ACCESSION; } diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/CdsRnaLocusFixTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/CdsRnaLocusFixTest.java new file mode 100644 index 
00000000..4ddff5b8 --- /dev/null +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/CdsRnaLocusFixTest.java @@ -0,0 +1,344 @@ +/* + * Copyright 2025 EMBL - European Bioinformatics Institute + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ +package uk.ac.ebi.embl.gff3tools.validation.fix; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import uk.ac.ebi.embl.gff3tools.TestUtils; +import uk.ac.ebi.embl.gff3tools.gff3.GFF3Annotation; +import uk.ac.ebi.embl.gff3tools.gff3.GFF3Attributes; +import uk.ac.ebi.embl.gff3tools.gff3.GFF3Feature; +import uk.ac.ebi.embl.gff3tools.utils.OntologyTerm; + +public class CdsRnaLocusFixTest { + + GFF3Annotation geneAnnotation; + + private CdsRnaLocusFix cdsRnsLocusFix; + + @BeforeEach + public void setUp() { + cdsRnsLocusFix = new CdsRnaLocusFix(); + geneAnnotation = new GFF3Annotation(); + } + + @Test + public void testFixFeatureWithNoGenes() { + Map attributes = new HashMap<>(); + attributes.put(GFF3Attributes.PROTEIN_ID, "protein_id"); + GFF3Feature f1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "CD001", 5L, 10L, attributes); + geneAnnotation.addFeature(f1); + cdsRnsLocusFix.fix(geneAnnotation, 1); + Assertions.assertNotNull(f1); + assertEquals(1, f1.getAttributes().size()); + } + + @Test + public void 
testFixFeatureWithGeneAndCDSWithNoOverlapLocation() { + Map a1 = new HashMap<>(); + a1.put(GFF3Attributes.PROTEIN_ID, "protein_id"); + GFF3Feature f1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "CD001", 5L, 10L, a1); + + Map a2 = new HashMap<>(); + a2.put(GFF3Attributes.GENE, "gene"); + GFF3Feature f2 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "CD001", 15L, 20L, a2); + + geneAnnotation.setFeatures(List.of(f1, f2)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertNotNull(f1); + Assertions.assertNotNull(f2); + assertEquals(1, f1.getAttributes().size()); + } + + @Test + public void testFixFeatureWithGeneAndCDSWithNoUpdate() { + Map a1 = new HashMap<>(); + a1.put(GFF3Attributes.PROTEIN_ID, "protein_id"); + GFF3Feature f1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "CD001", 5L, 10L, a1); + + Map a2 = new HashMap<>(); + GFF3Feature f2 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "CD001", 4L, 11L, a2); + + geneAnnotation.setFeatures(List.of(f1, f2)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertNotNull(f1); + Assertions.assertNotNull(f2); + assertEquals(1, f1.getAttributes().size()); + } + + @Test + public void testFixFeatureWithGeneAndCDSWithOverlapLocation() { + Map a1 = new HashMap<>(); + a1.put(GFF3Attributes.PROTEIN_ID, "protein_id"); + GFF3Feature f1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "CD001", 300L, 350L, a1); + + Map a2 = new HashMap<>(); + a2.put(GFF3Attributes.GENE, "gene"); + a2.put(GFF3Attributes.LOCUS_TAG, "locus_tag"); + a2.put(GFF3Attributes.GENE_SYNONYM, "gene_synonym"); + GFF3Feature f2 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "CD001", 200L, 400L, a2); + + geneAnnotation.setFeatures(List.of(f1, f2)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertNotNull(f1); + Assertions.assertNotNull(f2); + assertEquals(4, f1.getAttributes().size()); + assertEquals(f1.getAttributeByName(GFF3Attributes.GENE), 
f2.getAttributeByName(GFF3Attributes.GENE)); + assertEquals( + f1.getAttributeByName(GFF3Attributes.GENE_SYNONYM), f2.getAttributeByName(GFF3Attributes.GENE_SYNONYM)); + assertEquals(f1.getAttributeByName(GFF3Attributes.LOCUS_TAG), f2.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureWithMultipleFeaturesAndGenePropagation() { + Map cdsAttrs = new HashMap<>(); + cdsAttrs.put(GFF3Attributes.PROTEIN_ID, "protein_id"); + GFF3Feature cds = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACCESSION1", 100L, 200L, cdsAttrs); + + Map trnaAttrs = new HashMap<>(); + trnaAttrs.put(GFF3Attributes.PRODUCT, "tRNA-type"); + GFF3Feature trna = TestUtils.createGFF3Feature(OntologyTerm.TRNA.name(), "ACCESSION1", 150L, 180L, trnaAttrs); + + Map rrna1Attrs = new HashMap<>(); + rrna1Attrs.put(GFF3Attributes.PRODUCT, "16S"); + GFF3Feature rrna1 = TestUtils.createGFF3Feature(OntologyTerm.RRNA.name(), "ACCESSION1", 170L, 190L, rrna1Attrs); + + Map rrna2Attrs = new HashMap<>(); + rrna2Attrs.put(GFF3Attributes.PRODUCT, "23S"); + GFF3Feature rrna2 = TestUtils.createGFF3Feature(OntologyTerm.RRNA.name(), "ACCESSION1", 195L, 205L, rrna2Attrs); + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneX"); + geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOCUS123"); + geneAttrs.put(GFF3Attributes.GENE_SYNONYM, "geneSynX"); + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACCESSION1", 100L, 210L, geneAttrs); + + geneAnnotation.setFeatures(List.of(cds, gene, trna, rrna1, rrna2, gene)); + + cdsRnsLocusFix.fix(geneAnnotation, 1); + + for (GFF3Feature f : geneAnnotation.getFeatures()) { + Assertions.assertEquals( + gene.getAttributeByName(GFF3Attributes.GENE), f.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals( + gene.getAttributeByName(GFF3Attributes.GENE_SYNONYM), + f.getAttributeByName(GFF3Attributes.GENE_SYNONYM)); + Assertions.assertEquals( + gene.getAttributeByName(GFF3Attributes.LOCUS_TAG), 
f.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + Assertions.assertEquals(3, gene.getAttributes().size()); + Assertions.assertEquals("geneX", gene.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("geneSynX", gene.getAttributeByName(GFF3Attributes.GENE_SYNONYM)); + Assertions.assertEquals("LOCUS123", gene.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureDifferentAccession() { + Map cdsAttrs = new HashMap<>(); + cdsAttrs.put(GFF3Attributes.PROTEIN_ID, "p1"); + GFF3Feature cds = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 100L, 200L, cdsAttrs); + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneB"); + geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOC2"); + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 90L, 210L, geneAttrs); + + geneAnnotation.setFeatures(List.of(cds)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + // Second Annotation + geneAnnotation.setFeatures(List.of(gene)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertNull(cds.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertNull(cds.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureMultipleAccession() { + Map cds1Attrs = new HashMap<>(); + cds1Attrs.put(GFF3Attributes.PROTEIN_ID, "p1"); + GFF3Feature cds1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 100L, 200L, cds1Attrs); + + Map cds2Attrs = new HashMap<>(); + cds2Attrs.put(GFF3Attributes.PROTEIN_ID, "p2"); + GFF3Feature cds2 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC2", 150L, 250L, cds2Attrs); + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneC"); + geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOC3"); + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 90L, 210L, geneAttrs); + + geneAnnotation.setFeatures(List.of(cds1, cds2, gene)); + + 
cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("geneC", cds1.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("LOC3", cds1.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + + Assertions.assertNull(cds2.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertNull(cds2.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureOverlapDifferentAccession() { + Map cds1Attrs = new HashMap<>(); + cds1Attrs.put(GFF3Attributes.PROTEIN_ID, "p1"); + GFF3Feature cds1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 100L, 200L, cds1Attrs); + + Map cds2Attrs = new HashMap<>(); + cds2Attrs.put(GFF3Attributes.PROTEIN_ID, "p2"); + GFF3Feature cds2 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC2", 150L, 180L, cds2Attrs); + + Map gene1Attrs = new HashMap<>(); + gene1Attrs.put(GFF3Attributes.GENE, "gene1"); + gene1Attrs.put(GFF3Attributes.LOCUS_TAG, "LOC1"); + GFF3Feature gene1 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 90L, 210L, gene1Attrs); + + Map gene2Attrs = new HashMap<>(); + gene2Attrs.put(GFF3Attributes.GENE, "gene2"); + gene2Attrs.put(GFF3Attributes.LOCUS_TAG, "LOC2"); + GFF3Feature gene2 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC2", 140L, 190L, gene2Attrs); + + geneAnnotation.setFeatures(List.of(cds1, gene1)); + + cdsRnsLocusFix.fix(geneAnnotation, 1); + + geneAnnotation.setFeatures(List.of(cds2, gene2)); + + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("gene1", cds1.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("LOC1", cds1.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + + Assertions.assertEquals("gene2", cds2.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("LOC2", cds2.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureChildAfterAllGenes() { + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneX"); + 
geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOCX"); + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 100L, 300L, geneAttrs); + + Map cdsAttrs = new HashMap<>(); + cdsAttrs.put(GFF3Attributes.PROTEIN_ID, "p1"); + GFF3Feature cds = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 150L, 250L, cdsAttrs); + + // Child appears AFTER gene in ordering + geneAnnotation.setFeatures(List.of(gene, cds)); + + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("geneX", cds.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("LOCX", cds.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureChildBeforeGene() { + + Map cdsAttrs = new HashMap<>(); + cdsAttrs.put(GFF3Attributes.PROTEIN_ID, "p1"); + GFF3Feature cds = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 150L, 250L, cdsAttrs); + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneY"); + geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOCY"); + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 100L, 300L, geneAttrs); + + // Child first, then gene + geneAnnotation.setFeatures(List.of(cds, gene)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("geneY", cds.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("LOCY", cds.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureMultipleGeneOverlapFirstMatch() { + + Map cdsAttrs = new HashMap<>(); + cdsAttrs.put(GFF3Attributes.PROTEIN_ID, "p1"); + GFF3Feature cds = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 150L, 200L, cdsAttrs); + + Map gene1Attrs = Map.of( + GFF3Attributes.GENE, "gene1", + GFF3Attributes.LOCUS_TAG, "LOC1"); + GFF3Feature gene1 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 100L, 300L, gene1Attrs); + + Map gene2Attrs = Map.of( + GFF3Attributes.GENE, "gene2", + GFF3Attributes.LOCUS_TAG, 
"LOC2"); + GFF3Feature gene2 = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 120L, 220L, gene2Attrs); + + // Both overlap, but gene1 appears first + geneAnnotation.setFeatures(List.of(gene1, gene2, cds)); + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("gene1", cds.getAttributeByName(GFF3Attributes.GENE)); + Assertions.assertEquals("LOC1", cds.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureChildrenAroundGenes() { + + GFF3Feature cds1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 105L, 110L, new HashMap<>()); + GFF3Feature cds2 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 180L, 190L, new HashMap<>()); + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneZ"); + geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOCZ"); + + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 100L, 200L, geneAttrs); + + GFF3Feature cds3 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 120L, 150L, new HashMap<>()); + + geneAnnotation.setFeatures(List.of(cds1, cds2, gene, cds3)); + + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("LOCZ", cds1.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + Assertions.assertEquals("LOCZ", cds2.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + Assertions.assertEquals("LOCZ", cds3.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } + + @Test + public void testFixFeatureChildAfterLastGeneFailsInOldApproach() { + + Map geneAttrs = new HashMap<>(); + geneAttrs.put(GFF3Attributes.GENE, "geneX"); + geneAttrs.put(GFF3Attributes.LOCUS_TAG, "LOCX"); + + GFF3Feature gene = TestUtils.createGFF3Feature(OntologyTerm.GENE.name(), "ACC1", 100L, 300L, geneAttrs); + + GFF3Feature cds1 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 150L, 200L, new HashMap<>()); + GFF3Feature cds2 = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), "ACC1", 180L, 250L, new 
HashMap<>()); + + // cds2 appears AFTER all genes + geneAnnotation.setFeatures(List.of(gene, cds1, cds2)); + + cdsRnsLocusFix.fix(geneAnnotation, 1); + + Assertions.assertEquals("LOCX", cds2.getAttributeByName(GFF3Attributes.LOCUS_TAG)); + } +} diff --git a/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFixTest.java b/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFixTest.java index db3c7c93..3c5b4223 100644 --- a/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFixTest.java +++ b/src/test/java/uk/ac/ebi/embl/gff3tools/validation/fix/EcNumberValueFixTest.java @@ -10,9 +10,10 @@ */ package uk.ac.ebi.embl.gff3tools.validation.fix; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -36,7 +37,7 @@ public void setUp() { @Test public void testEcNumberValueFixWithoutEcNumber() { Map attributes = new HashMap<>(); - attributes.put(GFF3Attributes.PROTEIN_ID, "protein_id"); + attributes.put(GFF3Attributes.PROTEIN_ID, List.of("transfer RNA-leucine", "tRNA-Thr")); feature = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), OntologyTerm.CDS.name(), attributes); ecNumberValueFix.fixFeature(feature, 1); Assertions.assertNotNull(feature); @@ -130,4 +131,63 @@ public void testEcNumberValueFixWithLeadingDot() { Assertions.assertNotNull(feature); assertEquals(0, feature.getAttributes().size()); } + + @Test + public void testEcNumberValueFixWithUnknownProduct() { + Map attributes = new HashMap<>(); + attributes.put(GFF3Attributes.PRODUCT, List.of("unknown", "transfer RNA-leucine", "tRNA-Thr")); + attributes.put(GFF3Attributes.EC_NUMBER, "..0.2.3.1.."); + + feature = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), OntologyTerm.CDS.name(), attributes); + ecNumberValueFix.fixFeature(feature, 1); + 
Assertions.assertNotNull(feature); + assertEquals(1, feature.getAttributes().size()); + assertFalse(feature.hasAttribute(GFF3Attributes.EC_NUMBER)); + } + + @Test + public void testEcNumberValueFixOnProductWithECNumber() { + Map attributes = new HashMap<>(); + attributes.put(GFF3Attributes.PRODUCT, List.of("product EC:1.1.1.1", "transfer RNA-leucine", "tRNA-Thr")); + feature = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), OntologyTerm.CDS.name(), attributes); + ecNumberValueFix.fixFeature(feature, 1); + Assertions.assertNotNull(feature); + assertEquals(2, feature.getAttributes().size()); + assertTrue(feature.hasAttribute(GFF3Attributes.EC_NUMBER)); + } + + @Test + public void testEcNumberValueFixOnProductWithEcNumber() { + Map attributes = new HashMap<>(); + attributes.put(GFF3Attributes.PRODUCT, List.of("transfer RNA-leucine", "product [Ec:1.1.1.1]", "tRNA-Thr")); + feature = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), OntologyTerm.CDS.name(), attributes); + ecNumberValueFix.fixFeature(feature, 1); + Assertions.assertNotNull(feature); + assertEquals(2, feature.getAttributes().size()); + assertTrue(feature.hasAttribute(GFF3Attributes.EC_NUMBER)); + } + + @Test + public void testEcNumberValueFixOnProductWithecNumber() { + Map attributes = new HashMap<>(); + attributes.put(GFF3Attributes.PRODUCT, List.of("product (ec:1.1.1.1)", "protein")); + feature = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), OntologyTerm.CDS.name(), attributes); + ecNumberValueFix.fixFeature(feature, 1); + Assertions.assertNotNull(feature); + assertEquals(2, feature.getAttributes().size()); + assertTrue(feature.hasAttribute(GFF3Attributes.EC_NUMBER)); + } + + @Test + public void testEcNumberValueFixRemoveEc() { + Map attributes = new HashMap<>(); + attributes.put(GFF3Attributes.PRODUCT, List.of("product", "protein")); + attributes.put(GFF3Attributes.EC_NUMBER, "product"); + + feature = TestUtils.createGFF3Feature(OntologyTerm.CDS.name(), OntologyTerm.CDS.name(), 
attributes); + ecNumberValueFix.fixFeature(feature, 1); + Assertions.assertNotNull(feature); + assertEquals(1, feature.getAttributes().size()); + assertFalse(feature.hasAttribute(GFF3Attributes.EC_NUMBER)); + } }