2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -25,7 +25,7 @@ jobs:
     - name: Install Dependencies
       run: |
         pip install --upgrade -r requirements.txt -r requirements-test.txt
-        pip install -e .
+        pip install -e '.[all]'
     - name: Unit Test
       if: matrix.os == 'ubuntu-latest' || matrix.os == 'macOs-latest'
46 changes: 35 additions & 11 deletions ir_datasets/lazy_libs.py
@@ -25,14 +25,20 @@ def requests():

 def bs4():
     if 'bs4' not in _cache:
-        import bs4
+        try:
+            import bs4
+        except ImportError as ie:
+            raise ImportError("This dataset requires beautifulsoup4. Run 'pip install ir_datasets[beautifulsoup4]' to install dependencies for this dataset") from ie
         _cache['bs4'] = bs4
     return _cache['bs4']


 def inscriptis():
     if 'inscriptis' not in _cache:
-        import inscriptis
+        try:
+            import inscriptis
+        except ImportError as ie:
+            raise ImportError("This dataset requires inscriptis. Run 'pip install ir_datasets[inscriptis]' to install dependencies for this dataset") from ie
         _cache['inscriptis'] = inscriptis
     return _cache['inscriptis']

@@ -53,19 +59,28 @@ def json():

 def trec_car():
     if 'trec_car' not in _cache:
-        import trec_car.read_data
+        try:
+            import trec_car.read_data
+        except ImportError as ie:
+            raise ImportError("This dataset requires trec-car-tools. Run 'pip install ir_datasets[car]' to install dependencies for this dataset") from ie
         _cache['trec_car'] = trec_car
     return _cache['trec_car']

 def warc():
     if 'warc' not in _cache:
-        import warc
+        try:
+            import warc
+        except ImportError as ie:
+            raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
         _cache['warc'] = warc
     return _cache['warc']

 def warc_clueweb09():
     if 'warc_clueweb09' not in _cache:
-        import warc3_wet_clueweb09
+        try:
+            import warc3_wet_clueweb09
+        except ImportError as ie:
+            raise ImportError("This dataset requires warc. Run 'pip install ir_datasets[warc]' to install dependencies for this dataset") from ie
         _cache['warc_clueweb09'] = warc3_wet_clueweb09
     return _cache['warc_clueweb09']

@@ -83,7 +98,10 @@ def lz4_frame():

 def zlib_state():
     if 'zlib_state' not in _cache:
-        import zlib_state
+        try:
+            import zlib_state
+        except ImportError as ie:
+            raise ImportError("This dataset requires zlib-state. Run 'pip install ir_datasets[zlib-state]' to install dependencies for this dataset") from ie
         _cache['zlib_state'] = zlib_state
     return _cache['zlib_state']

@@ -101,7 +119,10 @@ def lxml_html():

 def ijson():
     if 'ijson' not in _cache:
-        import ijson
+        try:
+            import ijson
+        except ImportError as ie:
+            raise ImportError("This dataset requires ijson. Run 'pip install ir_datasets[ijson]' to install dependencies for this dataset") from ie
         _cache['ijson'] = ijson
     return _cache['ijson']

@@ -110,21 +131,24 @@ def pyautocorpus():
         try:
             import pyautocorpus
         except ImportError as ie:
-            raise ImportError("This dataset requires pyautocorpus. Run 'pip install pyautocorpus'") from ie
+            raise ImportError("This dataset requires pyautocorpus. Run 'pip install ir_datasets[pyautocorpus]' to install dependencies for this dataset") from ie
         _cache['pyautocorpus'] = pyautocorpus
     return _cache['pyautocorpus']

 def unlzw3():
     if 'unlzw3' not in _cache:
-        import unlzw3
+        try:
+            import unlzw3
+        except ImportError as ex:
+            raise ImportError("This dataset requires unlzw3. Run 'pip install ir_datasets[unlzw3]' to install dependencies for this dataset") from ex
         _cache['unlzw3'] = unlzw3
     return _cache['unlzw3']

 def pyarrow_parquet():
     if 'pyarrow_parquet' not in _cache:
         try:
             import pyarrow.parquet
-        except ImportError as ex:
-            raise ImportError("This dataset requires pyarrow. Run 'pip install pyarrow>=16.1.0'") from ex
+        except ImportError as ie:
+            raise ImportError("This dataset requires pyarrow. Run 'pip install ir_datasets[pyarrow]' to install dependencies for this dataset") from ie
         _cache['pyarrow_parquet'] = pyarrow.parquet
     return _cache['pyarrow_parquet']
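
Reviewer note: a minimal sketch (not part of the diff) of how these lazy loaders behave after this change; the module names and error text come from the hunks above, and the failure path assumes the corresponding extra is not installed:

```python
from ir_datasets import lazy_libs

# First call imports the module and stores it in _cache; later calls reuse it.
bs4 = lazy_libs.bs4()
soup = bs4.BeautifulSoup('<p>hello</p>', 'html.parser')
print(soup.get_text())  # -> hello

# If beautifulsoup4 is absent, lazy_libs.bs4() now raises a targeted error
# instead of a bare ModuleNotFoundError:
#   ImportError: This dataset requires beautifulsoup4. Run 'pip install
#   ir_datasets[beautifulsoup4]' to install dependencies for this dataset
```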
42 changes: 42 additions & 0 deletions pyproject.toml
@@ -32,6 +32,48 @@ exclude = ["test"]
 version = {attr = "ir_datasets.__version__"}
 dependencies = {file = ["requirements.txt"]}

+[project.optional-dependencies]
+car = [
+    "trec-car-tools>=2.5.4",
+]
+warc = [
+    "warc3-wet>=0.2.3",
+    "warc3-wet-clueweb09>=0.2.5"
+]
+pyautocorpus = [
+    "pyautocorpus>=0.1.12"
+]
+pyarrow = [
+    "pyarrow>=16.1.0"
+]
+unlzw3 = [
+    "unlzw3>=0.2.1"
+]
+beautifulsoup4 = [
+    "beautifulsoup4>=4.4.1"
+]
+inscriptis = [
+    "inscriptis>=2.2.0"
+]
+zlib-state = [
+    "zlib-state>=0.1.3"
+]
+ijson = [
+    "ijson>=3.1.3"
+]
+all = [
+    "trec-car-tools>=2.5.4",
+    "warc3-wet>=0.2.3",
+    "warc3-wet-clueweb09>=0.2.5",
+    "pyarrow>=16.1.0",
+    "pyautocorpus>=0.1.12",
+    "unlzw3>=0.2.1",
+    "beautifulsoup4>=4.4.1",
+    "inscriptis>=2.2.0",
+    "zlib-state>=0.1.3",
+    "ijson>=3.1.3"
+]
+
 [project.urls]
 "Homepage" = "https://ir-datasets.com/"
 "Documentation" = "https://project.readthedocs.io/"
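
Reviewer note: not part of the diff, but a quick stdlib check that the extras are registered once this branch is installed (extra names taken from the table above):

```python
from importlib.metadata import metadata

# Extras declared under [project.optional-dependencies] surface as Provides-Extra
# in the installed distribution's metadata:
extras = metadata('ir_datasets').get_all('Provides-Extra')
print(sorted(extras))
# expected to include: 'all', 'beautifulsoup4', 'car', 'ijson', 'inscriptis',
# 'pyarrow', 'pyautocorpus', 'unlzw3', 'warc', 'zlib-state'
```

End users then opt in at install time, e.g. `pip install 'ir_datasets[warc]'` or `pip install 'ir_datasets[all]'` (the quotes keep shells like zsh from treating the brackets as a glob).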
11 changes: 1 addition & 10 deletions requirements.txt
@@ -1,15 +1,6 @@
-beautifulsoup4>=4.4.1
-inscriptis>=2.2.0
-lxml>=4.5.2
+lxml>=4.5.2,<6.0.0
 numpy>=1.18.1
 pyyaml>=5.3.1
 requests>=2.22.0
 tqdm>=4.38.0
-trec-car-tools>=2.5.4
 lz4>=3.1.10
-warc3-wet>=0.2.3
-warc3-wet-clueweb09>=0.2.5
-zlib-state>=0.1.3
-ijson>=3.1.3
-unlzw3>=0.2.1
-pyarrow>=16.1.0
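
Reviewer note: a hedged sanity check (not in the diff) that a base install stays lean after this change; assumes a fresh environment with only `pip install ir_datasets`:

```python
import importlib.util

# These packages moved from requirements.txt into extras, so a base
# install should no longer pull them in:
for mod in ('bs4', 'inscriptis', 'trec_car', 'warc', 'zlib_state',
            'ijson', 'unlzw3', 'pyarrow'):
    present = importlib.util.find_spec(mod) is not None
    print(f'{mod}: {"installed" if present else "absent (expected)"}')
```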