Skip to content

Commit 1b00248

Browse files
committed
WIP: use mmap or memory in NumpyPosIndex/NumpySortedIndex
1 parent: 9ef99aa; commit: 1b00248

File tree

2 files changed

+20
-9
lines changed

2 files changed

+20
-9
lines changed

ir_datasets/indices/lz4_pickle.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -153,17 +153,17 @@ def bin(self):
153153
finally:
154154
f.close() # mapping stays valid after this
155155
else:
156-
assert False, f"File access {self._file_access} not supported"
156+
assert False, f"File access {self._file_access} not supported / {FileAccess.FILE}"
157157
return self._bin
158158

159159
def pos(self):
160160
if self._pos is None:
161-
self._pos = NumpyPosIndex(self._pos_path)
161+
self._pos = NumpyPosIndex(self._pos_path, file_access=self._file_access)
162162
return self._pos
163163

164164
def idx(self):
165165
if self._idx is None:
166-
self._idx = NumpySortedIndex(self._idx_path)
166+
self._idx = NumpySortedIndex(self._idx_path, file_access=self._file_access)
167167
return self._idx
168168

169169
def close(self):

ir_datasets/indices/numpy_sorted_index.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,17 @@
11
import os
22
import ir_datasets
3-
3+
from ir_datasets.indices import FileAccess
44

55
class NumpySortedIndex:
6-
def __init__(self, path):
6+
def __init__(self, path, file_access=FileAccess.MMAP):
77
self.path = path
88
self.transaction = None
99
self.mmap_keys = None
1010
self.mmap_poss = None
1111
self.doccount = None
1212
self.keylen = None
1313
self.np = None
14+
self.file_access = file_access
1415

1516
def add(self, key, idx):
1617
if self.transaction is None:
@@ -52,8 +53,13 @@ def _lazy_load(self):
5253
with open(f'{self.path}.meta', 'rt') as f:
5354
self.keylen, self.doccount = f.read().split()
5455
self.keylen, self.doccount = int(self.keylen), int(self.doccount)
55-
self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,))
56-
self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,))
56+
57+
if self.file_access == FileAccess.MEMORY:
58+
self.mmap_keys = self.np.fromfile(f'{self.path}.key', dtype=f'S{self.keylen}', count=self.doccount)
59+
self.mmap_poss = self.np.fromfile(f'{self.path}.pos', dtype='int64', count=self.doccount)
60+
else:
61+
self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,))
62+
self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,))
5763

5864
def __getitem__(self, keys):
5965
self._lazy_load()
@@ -102,8 +108,9 @@ def __len__(self):
102108

103109

104110
class NumpyPosIndex:
105-
def __init__(self, path):
111+
def __init__(self, path, file_access=FileAccess.MMAP):
106112
self.path = path
113+
self.file_access = file_access
107114
self.transaction = None
108115
self.mmap = None
109116
self.np = None
@@ -137,7 +144,11 @@ def _lazy_load(self):
137144
self.np = ir_datasets.lazy_libs.numpy()
138145
if self.mmap is None and self._exists():
139146
current_count = os.stat(self.path).st_size // 8
140-
self.mmap = self.np.memmap(self.path, dtype='int64', mode='r', shape=(current_count,))
147+
148+
if self.file_access == FileAccess.MEMORY:
149+
self.mmap = self.np.fromfile(self.path, dtype='int64', count=current_count)
150+
else:
151+
self.mmap = self.np.memmap(self.path, dtype='int64', mode='r', shape=(current_count,))
141152

142153
def __getitem__(self, idxs):
143154
self._lazy_load()

0 commit comments

Comments
 (0)