Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions nemo_curator/stages/text/download/common_crawl/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from nemo_curator.stages.text.download import DocumentExtractor
from nemo_curator.stages.text.download.html_extractors import HTMLExtractorAlgorithm
from nemo_curator.stages.text.download.html_extractors.justext import JusTextExtractor
from nemo_curator.stages.text.download.html_extractors.model_based import ModelBasedHTMLExtractionStage
from nemo_curator.stages.text.download.html_extractors.resiliparse import ResiliparseExtractor
from nemo_curator.stages.text.download.html_extractors.trafilatura import TrafilaturaExtractor
from nemo_curator.stages.text.download.html_extractors.utils import get_stop_list_dict
Expand All @@ -44,6 +45,8 @@ def __init__(
algorithm = ResiliparseExtractor(**algorithm_kwargs)
elif algorithm == "trafilatura":
algorithm = TrafilaturaExtractor(**algorithm_kwargs)
elif algorithm in {"model", "model_based"}:
algorithm = ModelBasedHTMLExtractionStage(**algorithm_kwargs)
else:
msg = f"Invalid algorithm: {algorithm}"
raise ValueError(msg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,17 @@

from .base import HTMLExtractorAlgorithm
from .justext import JusTextExtractor
from .model_based import HTMLElement, HTMLElementClassifier, HTMLElementPrediction, ModelBasedHTMLExtractionStage
from .resiliparse import ResiliparseExtractor
from .trafilatura import TrafilaturaExtractor

# Public names exported by this package: this list controls what a
# wildcard import (`from <package> import *`) brings into scope.
# Every entry corresponds to a name imported from a sibling module above;
# entries are kept in alphabetical order.
__all__ = [
"HTMLElement",
"HTMLElementClassifier",
"HTMLElementPrediction",
"HTMLExtractorAlgorithm",
"JusTextExtractor",
"ModelBasedHTMLExtractionStage",
"ResiliparseExtractor",
"TrafilaturaExtractor",
]
Loading