@@ -887,23 +887,11 @@ def gpt2_unicode_to_bytes():
     return {v: k for k, v in gpt2_bytes_to_unicode().items()}
 
 
-# TODO: Cannot cache typed collections to disk, yet. See
-# https://github.com/numba/numba/issues/4698
-@lru_cache
-def reduced_vocabulary(
-    tokenizer: "Tokenizer",
-) -> Tuple[List[Tuple[str, Sequence[int]]], Set[int]]:
-    """Create a map from decoded vocabulary tokens to lists of equivalent token ids."""
+def get_normalized_vocab(tokenizer: "Tokenizer") -> Tuple[Dict[int, str], Set[int]]:
+    norm_vocab = {}
     empty_token_ids = set()
-    vocabulary: Dict[Union[str, Tuple[str, ...]], List[int]] = {}
     for token, token_idx in tokenizer.vocabulary.items():
-        if token in tokenizer.special_tokens:
-            continue
-
-        token_str: Union[str, Tuple[str, ...]] = tokenizer.convert_token_to_string(
-            token
-        )
-
+        token_str = tokenizer.convert_token_to_string(token)
         if token_str:
             # invalid utf-8 sequences are replaced with � (\ufffd), but there
             # might also be tokens specifically for �, ��, ���, etc.
@@ -927,22 +915,88 @@ def reduced_vocabulary(
                         )
                 token_str = "".join(byte_symbol(b) for b in token_bytes)
 
-            vocabulary.setdefault(token_str, []).append(token_idx)
+            norm_vocab[token_idx] = token_str
         else:
             empty_token_ids.add(numba.int64(token_idx))
 
-    vocabulary_nb = numba.typed.List.empty_list(
-        numba.types.Tuple(
-            (
-                nb_unicode_type,
-                numba.int64[:],
-            )
-        )
+    return norm_vocab, empty_token_ids
+
+
+@numba.njit(cache=True, nogil=True)
+def to_numba_dict(keys: List[int], values: List[str]):
+    """
+    Pure-python numba dict construction is extremely slow.
+    This helper accepts equal length key and value arrays, and constructs a numba dict
+    """
+    # Define the key and value types for the Numba dictionary
+    numba_dict = numba.typed.Dict.empty(
+        key_type=numba.types.int64,
+        value_type=numba.types.unicode_type,
     )
-    for token_str, token_ids in vocabulary.items():
-        token_ids_np = np.fromiter(token_ids, dtype=np.dtype("int64"))
-        vocabulary_nb.append((token_str, token_ids_np))
 
+    # Fill the Numba dictionary with values from the input lists
+    for i in range(len(keys)):
+        numba_dict[keys[i]] = values[i]
+
+    return numba_dict
+
+
+token_id_str_pair = numba.types.Tuple((nb_unicode_type, numba.int64[:]))
+
+
+@numba.njit(
+    numba.types.ListType(token_id_str_pair)(
+        numba.types.DictType(numba.int64, nb_unicode_type)
+    ),
+    cache=True,
+    nogil=True,
+)
+def vocab_dict_to_inverted_vocab_list(
+    vocab_dict_nb: Dict[int, str]
+) -> List[Tuple[str, Sequence[int]]]:
+    """
+    Helper for `reduced_vocabulary`
+
+    Convert
+    - from `vocab_dict_nb`: Dict[token_id, token_str]
+    - to `vocab_nb`: List[token_str, token_id[:]]
+    """
+    inverse_vocab_dict = numba.typed.Dict.empty(
+        key_type=numba.types.unicode_type, value_type=numba.types.int64[:]
+    )
+
+    # Fill the temporary dictionary
+    for key in vocab_dict_nb:
+        value = vocab_dict_nb[key]
+        if value not in inverse_vocab_dict:
+            inverse_vocab_dict[value] = np.zeros(0, dtype=np.int64)
+        inverse_vocab_dict[value] = np.append(inverse_vocab_dict[value], key)
+
+    # Transfer data from the temporary dictionary to the final dictionary
+    vocab_nb = numba.typed.List.empty_list(token_id_str_pair)
+
+    for value in inverse_vocab_dict:
+        vocab_nb.append((value, inverse_vocab_dict[value]))
+
+    return vocab_nb
+
+
+# TODO: Cannot cache typed collections to disk, yet. See
+# https://github.com/numba/numba/issues/4698
+@lru_cache
+def reduced_vocabulary(
+    tokenizer: "Tokenizer",
+) -> Tuple[List[Tuple[str, Sequence[int]]], Set[int]]:
+    """
+    Provided the tokenizer, calculate the
+    - vocabulary_nb: mapping of (normalized token str -> token_ids[:])
+    - empty token ids
+    """
+    norm_vocab, empty_token_ids = get_normalized_vocab(tokenizer)
+    norm_vocab_dict_nb = to_numba_dict(
+        np.fromiter(norm_vocab.keys(), dtype=np.int64), list(norm_vocab.values())
+    )
+    vocabulary_nb = vocab_dict_to_inverted_vocab_list(norm_vocab_dict_nb)
     return vocabulary_nb, empty_token_ids
 
 
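The data flow in the new helpers can be hard to follow through the numba typed-container boilerplate, so here is a minimal plain-Python sketch of the inversion step that `vocab_dict_to_inverted_vocab_list` performs: a token_id -> normalized-string mapping is grouped into (normalized string, token_ids[:]) pairs, which is the `vocabulary_nb` shape returned by `reduced_vocabulary`. The `invert_vocab` name and the toy vocabulary are illustrative only and do not appear in the commit.

from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np


def invert_vocab(norm_vocab: Dict[int, str]) -> List[Tuple[str, np.ndarray]]:
    # Group token ids by their normalized token string, mirroring the
    # inversion that vocab_dict_to_inverted_vocab_list performs with
    # numba typed containers.
    grouped: Dict[str, List[int]] = defaultdict(list)
    for token_id, token_str in norm_vocab.items():
        grouped[token_str].append(token_id)
    # Emit one (token_str, token_ids[:]) pair per distinct normalized string.
    return [(s, np.asarray(ids, dtype=np.int64)) for s, ids in grouped.items()]


# Toy vocabulary: ids 3 and 7 normalize to the same string, so they are grouped.
print(invert_vocab({3: "Ġthe", 7: "Ġthe", 11: "Ġcat"}))
# [('Ġthe', array([3, 7])), ('Ġcat', array([11]))]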