|
1 | 1 | import os |
2 | 2 | import ir_datasets |
3 | | - |
| 3 | +from ir_datasets.indices import FileAccess |
4 | 4 |
|
5 | 5 | class NumpySortedIndex: |
6 | | - def __init__(self, path): |
| 6 | + def __init__(self, path, file_access=FileAccess.MMAP): |
7 | 7 | self.path = path |
8 | 8 | self.transaction = None |
9 | 9 | self.mmap_keys = None |
10 | 10 | self.mmap_poss = None |
11 | 11 | self.doccount = None |
12 | 12 | self.keylen = None |
13 | 13 | self.np = None |
| 14 | + self.file_access = file_access |
14 | 15 |
|
15 | 16 | def add(self, key, idx): |
16 | 17 | if self.transaction is None: |
@@ -52,8 +53,13 @@ def _lazy_load(self): |
52 | 53 | with open(f'{self.path}.meta', 'rt') as f: |
53 | 54 | self.keylen, self.doccount = f.read().split() |
54 | 55 | self.keylen, self.doccount = int(self.keylen), int(self.doccount) |
55 | | - self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,)) |
56 | | - self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,)) |
| 56 | + |
| 57 | + if self.file_access == FileAccess.MEMORY: |
| 58 | + self.mmap_keys = self.np.fromfile(f'{self.path}.key', dtype=f'S{self.keylen}', count=self.doccount) |
| 59 | + self.mmap_poss = self.np.fromfile(f'{self.path}.pos', dtype='int64', count=self.doccount) |
| 60 | + else: |
| 61 | + self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,)) |
| 62 | + self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,)) |
57 | 63 |
|
58 | 64 | def __getitem__(self, keys): |
59 | 65 | self._lazy_load() |
@@ -102,8 +108,9 @@ def __len__(self): |
102 | 108 |
|
103 | 109 |
|
104 | 110 | class NumpyPosIndex: |
105 | | - def __init__(self, path): |
| 111 | + def __init__(self, path, file_access=FileAccess.MMAP): |
106 | 112 | self.path = path |
| 113 | + self.file_access = file_access |
107 | 114 | self.transaction = None |
108 | 115 | self.mmap = None |
109 | 116 | self.np = None |
@@ -137,7 +144,11 @@ def _lazy_load(self): |
137 | 144 | self.np = ir_datasets.lazy_libs.numpy() |
138 | 145 | if self.mmap is None and self._exists(): |
139 | 146 | current_count = os.stat(self.path).st_size // 8 |
140 | | - self.mmap = self.np.memmap(self.path, dtype='int64', mode='r', shape=(current_count,)) |
| 147 | + |
| 148 | + if self.file_access == FileAccess.MEMORY: |
| 149 | + self.mmap = self.np.fromfile(self.path, dtype='int64', count=current_count) |
| 150 | + else: |
| 151 | + self.mmap = self.np.memmap(self.path, dtype='int64', mode='r', shape=(current_count,)) |
141 | 152 |
|
142 | 153 | def __getitem__(self, idxs): |
143 | 154 | self._lazy_load() |
|
0 commit comments