NVIDIA-NeMo · zeel2104 · Apr 8, 2026 · May 21, 2026
@@ -19,6 +19,7 @@
 from nemo_curator.stages.text.download import DocumentExtractor
 from nemo_curator.stages.text.download.html_extractors import HTMLExtractorAlgorithm
 from nemo_curator.stages.text.download.html_extractors.justext import JusTextExtractor
+from nemo_curator.stages.text.download.html_extractors.model_based import ModelBasedHTMLExtractionStage
 from nemo_curator.stages.text.download.html_extractors.resiliparse import ResiliparseExtractor
 from nemo_curator.stages.text.download.html_extractors.trafilatura import TrafilaturaExtractor
 from nemo_curator.stages.text.download.html_extractors.utils import get_stop_list_dict
@@ -44,6 +45,8 @@ def __init__(
                 algorithm = ResiliparseExtractor(**algorithm_kwargs)
             elif algorithm == "trafilatura":
                 algorithm = TrafilaturaExtractor(**algorithm_kwargs)
+            elif algorithm in {"model", "model_based"}:
+                algorithm = ModelBasedHTMLExtractionStage(**algorithm_kwargs)
             else:
                 msg = f"Invalid algorithm: {algorithm}"
                 raise ValueError(msg)

@@ -14,12 +14,17 @@
 
 from .base import HTMLExtractorAlgorithm
 from .justext import JusTextExtractor
+from .model_based import HTMLElement, HTMLElementClassifier, HTMLElementPrediction, ModelBasedHTMLExtractionStage
 from .resiliparse import ResiliparseExtractor
 from .trafilatura import TrafilaturaExtractor
 
 __all__ = [
+    "HTMLElement",  # re-exported from .model_based
+    "HTMLElementClassifier",  # re-exported from .model_based
+    "HTMLElementPrediction",  # re-exported from .model_based
     "HTMLExtractorAlgorithm",
     "JusTextExtractor",
+    "ModelBasedHTMLExtractionStage",  # re-exported from .model_based
     "ResiliparseExtractor",
     "TrafilaturaExtractor",
 ]