Skip to content

Commit 2f5ce81

Browse files
committed
feat: docstore options with MMAP/MEMORY
1 parent 1b00248 commit 2f5ce81

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+191
-148
lines changed

ir_datasets/datasets/aol_ia.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file
1111
from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, BaseQlogs
1212
from ir_datasets.datasets.base import Dataset, YamlDocumentation
13+
from ir_datasets.indices import DEFAULT_DOCSTORE_OPTIONS, DocstoreOptions, PickleLz4FullStore
1314

1415
_logger = ir_datasets.log.easy()
1516

@@ -98,13 +99,13 @@ def __init__(self, log_dlcs, id2wb_dlc, base_path):
9899
if not self._base_path.exists():
99100
self._base_path.mkdir(exist_ok=True, parents=True)
100101

101-
def docs_store(self):
102+
def docs_store(self, options=DEFAULT_DOCSTORE_OPTIONS):
102103
self._build_docs()
103-
return self._internal_docs_store()
104+
return self._internal_docs_store(options)
104105

105-
def _internal_docs_store(self):
106+
def _internal_docs_store(self, options: DocstoreOptions):
106107
if self._docs_store is None:
107-
self._docs_store = ir_datasets.indices.PickleLz4FullStore(self._base_path/'docs.pklz4', None, AolIaDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME))
108+
self._docs_store = PickleLz4FullStore(self._base_path/'docs.pklz4', None, AolIaDoc, 'doc_id', ['doc_id'], count_hint=ir_datasets.util.count_hint(NAME), options=options)
108109
return self._docs_store
109110

110111
def _build_docs(self):

ir_datasets/datasets/beir.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from ir_datasets.util import ZipExtract, Cache, Lazy, Migrator
66
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
77
from ir_datasets.formats import BaseQueries, BaseDocs, BaseQrels, GenericDoc, GenericQuery, TrecQrel
8-
from ir_datasets.indices import PickleLz4FullStore
8+
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
99

1010
_logger = ir_datasets.log.easy()
1111

@@ -179,14 +179,15 @@ def _docs_iter(self):
179179
def docs_cls(self):
180180
return self._doc_type
181181

182-
def docs_store(self, field='doc_id'):
182+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
183183
return PickleLz4FullStore(
184184
path=f'{ir_datasets.util.home_path()/NAME/self._name}/docs.pklz4',
185185
init_iter_fn=self._docs_iter,
186186
data_cls=self.docs_cls(),
187187
lookup_field=field,
188188
index_fields=['doc_id'],
189189
count_hint=ir_datasets.util.count_hint(f'{NAME}/{self._name}'),
190+
options=options
190191
)
191192

192193
def docs_count(self):

ir_datasets/datasets/c4.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from ir_datasets.util import DownloadConfig, Download, RequestsDownload, TarExtractAll, GzipExtract
99
from ir_datasets.formats import BaseDocs, TrecXmlQueries, DocSourceSeekableIter, DocSource, SourceDocIter
1010
from ir_datasets.datasets.base import Dataset, YamlDocumentation
11-
from ir_datasets.indices import Docstore
11+
from ir_datasets.indices import Docstore, DEFAULT_DOCSTORE_OPTIONS
1212

1313
_logger = ir_datasets.log.easy()
1414

@@ -112,8 +112,8 @@ def seek(self, idx):
112112

113113

114114
class C4Docstore(Docstore):
115-
def __init__(self, docs):
116-
super().__init__(docs.docs_cls(), 'doc_id')
115+
def __init__(self, docs, options=DEFAULT_DOCSTORE_OPTIONS):
116+
super().__init__(docs.docs_cls(), 'doc_id', options=options)
117117
self.docs = docs
118118

119119
def get_many_iter(self, doc_ids):
@@ -157,9 +157,9 @@ def docs_iter(self):
157157
def docs_cls(self):
158158
return C4Doc
159159

160-
def docs_store(self, field='doc_id'):
160+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
161161
assert field == 'doc_id'
162-
return C4Docstore(self)
162+
return C4Docstore(self, options=options)
163163

164164
def docs_count(self, force=False):
165165
if force or self._sources is not None:

ir_datasets/datasets/car.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from ir_datasets.util import DownloadConfig, TarExtract, ReTar
44
from ir_datasets.formats import TrecQrels, BaseDocs, BaseQueries, GenericDoc
55
from ir_datasets.datasets.base import Dataset, YamlDocumentation
6-
from ir_datasets.indices import PickleLz4FullStore
6+
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
77

88

99
NAME = 'car'
@@ -53,14 +53,15 @@ def docs_iter(self):
5353
def docs_cls(self):
5454
return GenericDoc
5555

56-
def docs_store(self, field='doc_id'):
56+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
5757
return PickleLz4FullStore(
5858
path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
5959
init_iter_fn=self.docs_iter,
6060
data_cls=self.docs_cls(),
6161
lookup_field=field,
6262
index_fields=['doc_id'],
6363
count_hint=self._count_hint,
64+
options=options
6465
)
6566

6667
def docs_count(self):

ir_datasets/datasets/clinicaltrials.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from ir_datasets.util import DownloadConfig, GzipExtract, ZipExtract
1313
from ir_datasets.formats import BaseDocs, BaseQueries, GenericQuery, TrecQrels, TrecXmlQueries
1414
from ir_datasets.datasets.base import Dataset, YamlDocumentation
15-
from ir_datasets.indices import PickleLz4FullStore
15+
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
1616
from . import medline
1717

1818
_logger = ir_datasets.log.easy()
@@ -93,14 +93,15 @@ def _parse_doc(self, xml):
9393
def docs_path(self, force=True):
9494
return ir_datasets.util.home_path()/NAME/self._name/'corpus'
9595

96-
def docs_store(self, field='doc_id'):
96+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
9797
return PickleLz4FullStore(
9898
path=f'{self.docs_path(force=False)}.pklz4',
9999
init_iter_fn=self._docs_iter,
100100
data_cls=self.docs_cls(),
101101
lookup_field=field,
102102
index_fields=['doc_id'],
103103
count_hint=self._count_hint,
104+
options=options
104105
)
105106

106107
def docs_cls(self):

ir_datasets/datasets/codesearchnet.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from ir_datasets.util import DownloadConfig, TarExtract, ZipExtractCache
1010
from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels
1111
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
12-
from ir_datasets.indices import PickleLz4FullStore
12+
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
1313
from ir_datasets.datasets.base import Dataset, YamlDocumentation
1414

1515

@@ -70,14 +70,15 @@ def docs_iter(self):
7070
def docs_cls(self):
7171
return CodeSearchNetDoc
7272

73-
def docs_store(self, field='doc_id'):
73+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
7474
return PickleLz4FullStore(
7575
path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
7676
init_iter_fn=self.docs_iter,
7777
data_cls=self.docs_cls(),
7878
lookup_field=field,
7979
index_fields=['doc_id'],
8080
count_hint=ir_datasets.util.count_hint(NAME),
81+
options=options
8182
)
8283

8384
def docs_count(self):

ir_datasets/datasets/cord19.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from ir_datasets.util import Lazy, DownloadConfig
1414
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
1515
from ir_datasets.formats import BaseDocs, TrecXmlQueries, TrecQrels, GenericQuery, GenericQrel
16-
from ir_datasets.indices import PickleLz4FullStore
16+
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
1717

1818

1919
NAME = 'cord19'
@@ -158,14 +158,15 @@ def _docs_iter(self):
158158
else:
159159
yield Cord19Doc(did, title, doi, date, abstract)
160160

161-
def docs_store(self, field='doc_id'):
161+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
162162
return PickleLz4FullStore(
163163
path=f'{self.docs_path(force=False)}.pklz4',
164164
init_iter_fn=self._docs_iter,
165165
data_cls=self.docs_cls(),
166166
lookup_field=field,
167167
index_fields=['doc_id'],
168168
count_hint=self._count_hint,
169+
options=options
169170
)
170171

171172
def docs_count(self):

ir_datasets/datasets/cranfield.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
from ir_datasets.util import DownloadConfig, TarExtract, Cache
77
from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels
88
from ir_datasets.formats import GenericDoc, GenericQuery, TrecQrel
9-
from ir_datasets.indices import PickleLz4FullStore
9+
from ir_datasets.indices import PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
1010
from ir_datasets.datasets.base import Dataset, YamlDocumentation
1111

1212

@@ -77,14 +77,15 @@ def docs_iter(self):
7777
def docs_cls(self):
7878
return CranfieldDoc
7979

80-
def docs_store(self, field='doc_id'):
80+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
8181
return PickleLz4FullStore(
8282
path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
8383
init_iter_fn=self.docs_iter,
8484
data_cls=self.docs_cls(),
8585
lookup_field=field,
8686
index_fields=['doc_id'],
8787
count_hint=ir_datasets.util.count_hint(NAME),
88+
options=options
8889
)
8990

9091
def docs_count(self):

ir_datasets/datasets/gov.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract
1313
from ir_datasets.formats import TrecQrels, TrecQueries, TrecColonQueries, BaseDocs, GenericQuery, BaseQrels
1414
from ir_datasets.datasets.base import Dataset, YamlDocumentation
15-
from ir_datasets.indices import Docstore, PickleLz4FullStore
15+
from ir_datasets.indices import Docstore, PickleLz4FullStore, DEFAULT_DOCSTORE_OPTIONS
1616

1717

1818
_logger = ir_datasets.log.easy()
@@ -135,14 +135,15 @@ def _extract_next_block(self, inp, START, END):
135135
return inp, None
136136
return inp[i_end+len(END):], inp[i_start+len(START):i_end]
137137

138-
def docs_store(self, field='doc_id'):
138+
def docs_store(self, field='doc_id', options=DEFAULT_DOCSTORE_OPTIONS):
139139
return PickleLz4FullStore(
140140
path=f'{self.docs_path(force=False)}.pklz4',
141141
init_iter_fn=self._docs_iter,
142142
data_cls=self.docs_cls(),
143143
lookup_field=field,
144144
index_fields=['doc_id'],
145145
count_hint=ir_datasets.util.count_hint(NAME),
146+
options=options
146147
)
147148

148149
def docs_count(self):

ir_datasets/datasets/gov2.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -205,9 +205,9 @@ def _docs_file_counts(self):
205205
self._docs_file_counts_cache = result
206206
return self._docs_file_counts_cache
207207

208-
def docs_store(self):
208+
def docs_store(self, options=ir_datasets.indices.DEFAULT_DOCSTORE_OPTIONS):
209209
docstore = Gov2Docstore(self)
210-
return ir_datasets.indices.CacheDocstore(docstore, f'{self.docs_path(force=False)}.cache')
210+
return ir_datasets.indices.CacheDocstore(docstore, f'{self.docs_path(force=False)}.cache', options=options)
211211

212212
def docs_count(self):
213213
return sum(self._docs_file_counts().values())

0 commit comments

Comments
 (0)