2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     - name: Install Dependencies
       run: |
         pip install --upgrade -r requirements.txt -r requirements-test.txt
-        pip install -e .
+        pip install -e '.[all]'
     - name: Unit Test
       if: matrix.os == 'ubuntu-latest' || matrix.os == 'macOs-latest'
46 changes: 35 additions & 11 deletions ir_datasets/lazy_libs.py
@@ -25,14 +25,20 @@ def requests():

 def bs4():
     if 'bs4' not in _cache:
-        import bs4
+        try:
+            import bs4
+        except ImportError as ie:
+            raise ImportError("This dataset requires beautifulsoup4. Run 'pip install ir_datasets[beautifulsoup4]' to install dependencies for this dataset") from ie
         _cache['bs4'] = bs4
     return _cache['bs4']


 def inscriptis():
     if 'inscriptis' not in _cache:
-        import inscriptis
+        try:
+            import inscriptis
+        except ImportError as ie:
+            raise ImportError("This dataset requires inscriptis. Run 'pip install ir_datasets[inscriptis]' to install dependencies for this dataset") from ie
         _cache['inscriptis'] = inscriptis
     return _cache['inscriptis']

@@ -53,19 +59,28 @@ def json():

 def trec_car():
     if 'trec_car' not in _cache:
-        import trec_car.read_data
+        try:
+            import trec_car.read_data
+        except ImportError as ie:
+            raise ImportError("This dataset requires trec-car-tools. Run 'pip install ir_datasets[car]' to install dependencies for this dataset") from ie
         _cache['trec_car'] = trec_car
     return _cache['trec_car']

 def warc():
     if 'warc' not in _cache:
-        import warc
+        try:
+            import warc
+        except ImportError as ie:
+            raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
         _cache['warc'] = warc
     return _cache['warc']

 def warc_clueweb09():
     if 'warc_clueweb09' not in _cache:
-        import warc3_wet_clueweb09
+        try:
+            import warc3_wet_clueweb09
+        except ImportError as ie:
+            raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
         _cache['warc_clueweb09'] = warc3_wet_clueweb09
     return _cache['warc_clueweb09']

@@ -83,7 +98,10 @@ def lz4_frame():

 def zlib_state():
     if 'zlib_state' not in _cache:
-        import zlib_state
+        try:
+            import zlib_state
+        except ImportError as ie:
+            raise ImportError("This dataset requires zlib-state. Run 'pip install ir_datasets[zlib-state]' to install dependencies for this dataset") from ie
         _cache['zlib_state'] = zlib_state
     return _cache['zlib_state']

@@ -101,7 +119,10 @@ def lxml_html():

 def ijson():
     if 'ijson' not in _cache:
-        import ijson
+        try:
+            import ijson
+        except ImportError as ie:
+            raise ImportError("This dataset requires ijson. Run 'pip install ir_datasets[ijson]' to install dependencies for this dataset") from ie
         _cache['ijson'] = ijson
     return _cache['ijson']

@@ -110,21 +131,24 @@ def pyautocorpus():
         try:
             import pyautocorpus
         except ImportError as ie:
-            raise ImportError("This dataset requires pyautocorpus. Run 'pip install pyautocorpus'") from ie
+            raise ImportError("This dataset requires pyautocorpus. Run 'pip install ir_datasets[pyautocorpus]' to install dependencies for this dataset") from ie
         _cache['pyautocorpus'] = pyautocorpus
     return _cache['pyautocorpus']

 def unlzw3():
     if 'unlzw3' not in _cache:
-        import unlzw3
+        try:
+            import unlzw3
+        except ImportError as ex:
+            raise ImportError("This dataset requires unlzw3. Run 'pip install ir_datasets[unlzw3]' to install dependencies for this dataset") from ex
         _cache['unlzw3'] = unlzw3
     return _cache['unlzw3']

 def pyarrow_parquet():
     if 'pyarrow_parquet' not in _cache:
         try:
             import pyarrow.parquet
-        except ImportError as ex:
-            raise ImportError("This dataset requires pyarrow. Run 'pip install pyarrow>=16.1.0'") from ex
+        except ImportError as ie:
+            raise ImportError("This dataset requires pyarrow. Run 'pip install ir_datasets[pyarrow]' to install dependencies for this dataset") from ie
         _cache['pyarrow_parquet'] = pyarrow.parquet
     return _cache['pyarrow_parquet']
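
Reviewer note: a minimal sketch (not part of the diff) of how these lazy loaders behave after this change; the module names and error text come from the hunks above, and the failure path assumes the corresponding extra is not installed:

```python
from ir_datasets import lazy_libs

# First call imports the module and stores it in _cache; later calls reuse it.
bs4 = lazy_libs.bs4()
soup = bs4.BeautifulSoup('<p>hello</p>', 'html.parser')
print(soup.get_text())  # -> hello

# If beautifulsoup4 is absent, lazy_libs.bs4() now raises a targeted error
# instead of a bare ModuleNotFoundError:
#   ImportError: This dataset requires beautifulsoup4. Run 'pip install
#   ir_datasets[beautifulsoup4]' to install dependencies for this dataset
```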
42 changes: 42 additions & 0 deletions pyproject.toml
@@ -32,6 +32,48 @@ exclude = ["test"]
 version = {attr = "ir_datasets.__version__"}
 dependencies = {file = ["requirements.txt"]}

+[project.optional-dependencies]
+car = [
+    "trec-car-tools>=2.5.4",
+]
+warc = [
+    "warc3-wet>=0.2.3",
+    "warc3-wet-clueweb09>=0.2.5"
+]
+pyautocorpus = [
+    "pyautocorpus>=0.1.12"
+]
+pyarrow = [
+    "pyarrow>=16.1.0"
+]
+unlzw3 = [
+    "unlzw3>=0.2.1"
+]
+beautifulsoup4 = [
+    "beautifulsoup4>=4.4.1"
+]
+inscriptis = [
+    "inscriptis>=2.2.0"
+]
+zlib-state = [
+    "zlib-state>=0.1.3"
+]
+ijson = [
+    "ijson>=3.1.3"
+]
+all = [
+    "trec-car-tools>=2.5.4",
+    "warc3-wet>=0.2.3",
+    "warc3-wet-clueweb09>=0.2.5",
+    "pyarrow>=16.1.0",
+    "pyautocorpus>=0.1.12",
+    "unlzw3>=0.2.1",
+    "beautifulsoup4>=4.4.1",
+    "inscriptis>=2.2.0",
+    "zlib-state>=0.1.3",
+    "ijson>=3.1.3"
+]
+
 [project.urls]
 "Homepage" = "https://ir-datasets.com/"
 "Documentation" = "https://project.readthedocs.io/"
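
Reviewer note: not part of the diff, but a quick stdlib check that the extras are registered once this branch is installed (extra names taken from the table above):

```python
from importlib.metadata import metadata

# Extras declared under [project.optional-dependencies] surface as Provides-Extra
# in the installed distribution's metadata:
extras = metadata('ir_datasets').get_all('Provides-Extra')
print(sorted(extras))
# expected to include: 'all', 'beautifulsoup4', 'car', 'ijson', 'inscriptis',
# 'pyarrow', 'pyautocorpus', 'unlzw3', 'warc', 'zlib-state'
```

End users then opt in at install time, e.g. `pip install 'ir_datasets[warc]'` or `pip install 'ir_datasets[all]'` (the quotes keep shells like zsh from treating the brackets as a glob).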
11 changes: 1 addition & 10 deletions requirements.txt
@@ -1,15 +1,6 @@
-beautifulsoup4>=4.4.1
-inscriptis>=2.2.0
-lxml>=4.5.2
+lxml>=4.5.2,<6.0.0
 numpy>=1.18.1
 pyyaml>=5.3.1
 requests>=2.22.0
 tqdm>=4.38.0
-trec-car-tools>=2.5.4
 lz4>=3.1.10
-warc3-wet>=0.2.3
-warc3-wet-clueweb09>=0.2.5
-zlib-state>=0.1.3
-ijson>=3.1.3
-unlzw3>=0.2.1
-pyarrow>=16.1.0
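
Reviewer note: a hedged sanity check (not in the diff) that a base install stays lean after this change; assumes a fresh environment with only `pip install ir_datasets`:

```python
import importlib.util

# These packages moved from requirements.txt into extras, so a base
# install should no longer pull them in:
for mod in ('bs4', 'inscriptis', 'trec_car', 'warc', 'zlib_state',
            'ijson', 'unlzw3', 'pyarrow'):
    present = importlib.util.find_spec(mod) is not None
    print(f'{mod}: {"installed" if present else "absent (expected)"}')
```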