Skip to content

Commit 6b187e7

Browse files
committed
Add model-based HTML extraction stage
Signed-off-by: Zeel <desaizeel2128@gmail.com>
1 parent d4d2fd6 commit 6b187e7

4 files changed

Lines changed: 549 additions & 0 deletions

File tree

nemo_curator/stages/text/download/common_crawl/extract.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from nemo_curator.stages.text.download import DocumentExtractor
2020
from nemo_curator.stages.text.download.html_extractors import HTMLExtractorAlgorithm
2121
from nemo_curator.stages.text.download.html_extractors.justext import JusTextExtractor
22+
from nemo_curator.stages.text.download.html_extractors.model_based import ModelBasedHTMLExtractionStage
2223
from nemo_curator.stages.text.download.html_extractors.resiliparse import ResiliparseExtractor
2324
from nemo_curator.stages.text.download.html_extractors.trafilatura import TrafilaturaExtractor
2425
from nemo_curator.stages.text.download.html_extractors.utils import get_stop_list_dict
@@ -44,6 +45,8 @@ def __init__(
4445
algorithm = ResiliparseExtractor(**algorithm_kwargs)
4546
elif algorithm == "trafilatura":
4647
algorithm = TrafilaturaExtractor(**algorithm_kwargs)
48+
elif algorithm in {"model", "model_based"}:
49+
algorithm = ModelBasedHTMLExtractionStage(**algorithm_kwargs)
4750
else:
4851
msg = f"Invalid algorithm: {algorithm}"
4952
raise ValueError(msg)

nemo_curator/stages/text/download/html_extractors/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,17 @@
1414

1515
from .base import HTMLExtractorAlgorithm
1616
from .justext import JusTextExtractor
17+
from .model_based import HTMLElement, HTMLElementClassifier, HTMLElementPrediction, ModelBasedHTMLExtractionStage
1718
from .resiliparse import ResiliparseExtractor
1819
from .trafilatura import TrafilaturaExtractor
1920

2021
__all__ = [
22+
"HTMLElement",
23+
"HTMLElementClassifier",
24+
"HTMLElementPrediction",
2125
"HTMLExtractorAlgorithm",
2226
"JusTextExtractor",
27+
"ModelBasedHTMLExtractionStage",
2328
"ResiliparseExtractor",
2429
"TrafilaturaExtractor",
2530
]

0 commit comments

Comments
 (0)