diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java index ccf2711c1d..e4b2fac667 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -146,7 +146,11 @@ metadata, getEncodingDetector(context))) { parser.setContentHandler(new XHTMLDowngradeHandler( new HtmlHandler(mapper, handler, metadata, context, extractScripts))); - parser.parse(reader.asInputSource()); + try { + parser.parse(reader.asInputSource()); + } catch (StringIndexOutOfBoundsException e) { + throw new TikaException(e.getMessage(), e); + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 408c850921..192a1f9171 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -22,6 +22,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.ByteArrayInputStream; @@ -1269,6 +1270,19 @@ public void testPreferenceForTitleElement() throws Exception { assertEquals("OldMetaTitle", m.get("title")); } + @Test + public void testUnbalancedQuotes() throws Exception { + //this tests handling of unbalanced quotes (see TIKA-2328) + String testData = ""; + assertThrows(TikaException.class, () -> { + new HtmlParser().parse(new ByteArrayInputStream(testData.getBytes()), + new BodyContentHandler(), + new Metadata(), + new ParseContext()); + + }); + } + private class EncodingDetectorRunner implements Callable { final static String DONE = "done";