- import string
+ from functools import lru_cache
from heapq import heappush, heappop
- from typing import List, Tuple
+ from typing import List

from labml import lab, monit
+ from labml.utils.cache import cache_set
+ from python_autocomplete.dataset import Tokenizer
+ from python_autocomplete.dataset.break_words import SourceCodeTokenizer

- ID_CHARS = set(string.ascii_letters + string.digits + '_')

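+ # `BPE` adapts a trained byte-pair-encoding vocabulary (`BPEEnDe`) to the
+ # `Tokenizer` interface: text is first split into words by `word_tokenizer`,
+ # then each word is encoded independently into subword token ids.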
+ class BPE(Tokenizer):
+     def __init__(self, bpe_en_de: 'BPEEnDe', word_tokenizer):
+         self.bpe = bpe_en_de
+         self.word_tokenizer = word_tokenizer
+         self.is_trained = True

- class BPE:
-     def __init__(self):
-         self.char_itos = []
-         self.char_stoi = {}
-         self.bpe_itos = []
-         self.bpe = []
-         self.common = {}
+     @property
+     def n_tokens(self):
+         return len(self.bpe.bpe)

-         self.bpe_itos = self.calc_bpe_itos()
+     @property
+     def itos(self):
+         return self.bpe.bpe_itos

-     def to_char_stoi(self, w: str):
-         return [self.char_stoi[c] for c in w]
+     @property
+     def stoi(self):
+         return self.bpe.bpe_stoi

-     def calc_bpe_itos(self):
-         itos = list(self.char_itos)
-         itos += [itos[p1] + itos[p2] for p1, p2 in self.bpe[len(self.char_itos):]]
-         return itos
+     def encode(self, data: str, *, is_silent: bool = True):
+         words = self.word_tokenizer.tokenize(data, is_silent=is_silent)

+         res = []
+         for w in monit.iterate('Encode words', words, is_silent=is_silent):
+             res += self.bpe.encode(w)

- class Tokenizer:
-     def collect_words(self, data: str):
-         raise NotImplementedError
+         return res

-     def get_words(self) -> Tuple[List[str], List[int]]:
-         raise NotImplementedError
+     def __call__(self, data: str):
+         encoded = self.encode(data)
+         return [self.itos[c] for c in encoded]

-     def tokenize(self, data: str) -> List[str]:
-         raise NotImplementedError
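+     # Drop the last word (it may be incomplete at the end of the given text)
+     # and return the retained prefix together with its encoding.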
+     def rstrip(self, data: str):
+         words = self.word_tokenizer.tokenize(data, is_silent=True)
+         words = words[:-1]
+         res = []
+         for w in words:
+             res += self.bpe.encode(w)

+         return ''.join(words), res

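+ # A minimal usage sketch (hypothetical values; assumes a `BPEEnDe` already
+ # loaded, e.g. from the 'bpe' cache written by `main()` below):
+ #
+ #     bpe = BPE(bpe_en_de, SourceCodeTokenizer())
+ #     ids = bpe.encode('def foo():')  # subword token ids
+ #     tokens = bpe('def foo():')      # subword strings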
- class SourceCodeTokenizer(Tokenizer):
-     def __init__(self):
-         self.words = {}

-     def add_word(self, word):
-         if not word:
-             return
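+ # `_BPEEncoder` applies learned merges to one word. The codes form a doubly
+ # linked list via `next_idx`/`prev_idx`, and a min-heap orders candidate
+ # pairs by merge rank, so earlier-learned merges are applied first.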
+ class _BPEEncoder:
+     def __init__(self, pairs):
+         self.pairs = pairs
+         self.codes = []
+         self.next_idx = []
+         self.prev_idx = []
+         self.heap = []

-         if word not in self.words:
-             self.words[word] = 1
-         else:
-             self.words[word] += 1
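+     # Encode one word: seed the heap with all adjacent pairs, then repeatedly
+     # merge the lowest-ranked pair; merged-away positions are marked with -1.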
+     def encode(self, codes: List[int]):
+         self.codes = codes
+         self.next_idx = BPELearner.default_next_pointers(len(codes))
+         self.prev_idx = BPELearner.default_prev_pointers(len(codes))
+         self.heap = []

-     def tokenize(self, data: str) -> List[str]:
-         last_idx = 0
-         is_id = False
-         res = []
+         for i in range(len(self.codes) - 1):
+             self.add_pair((self.codes[i], self.codes[i + 1]), i)

-         for i, c in monit.enum('Collect words', data):
-             if c in ID_CHARS:
-                 if not is_id:
-                     if last_idx < i:
-                         res.append(data[last_idx:i])
-                     last_idx = i
-                     is_id = True
-             else:
-                 if is_id:
-                     if last_idx < i:
-                         res.append(data[last_idx:i])
-                     last_idx = i
-                     is_id = False
-
-         if last_idx < len(data):
-             res.append(data[last_idx:])
+         while self.heap:
+             _, idx, pair = heappop(self.heap)
+             self.merge(idx, pair)

-         return res
+         return [c for c in self.codes if c != -1]

-     def collect_words(self, data: str):
-         last_idx = 0
-         is_id = False
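+     # Merge the pair at positions (p2, p3): p2 takes the merged code, p3 is
+     # tombstoned with -1, and new candidate pairs with the neighbours p1 and
+     # p4 are pushed. Stale heap entries fail the code check and are skipped.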
+     def merge(self, p2, pair):
+         p3 = self.next_idx[p2]
+
+         if p3 == -1 or pair[0] != self.codes[p2] or pair[1] != self.codes[p3]:
+             return

-         for i, c in monit.enum('Collect words', data):
-             if c in ID_CHARS:
-                 if not is_id:
-                     self.add_word(data[last_idx:i])
-                     last_idx = i
-                     is_id = True
-             else:
-                 if is_id:
-                     self.add_word(data[last_idx:i])
-                     last_idx = i
-                     is_id = False
+         self.codes[p2] = self.pairs[pair]
+         self.codes[p3] = -1
+         p1 = self.prev_idx[p2]
+         p4 = self.next_idx[p3]

-         self.add_word(data[last_idx:])
+         if p1 != -1:
+             self.add_pair((self.codes[p1], self.codes[p2]), p1)
+         self.next_idx[p2] = p4
+         if p4 != -1:
+             self.prev_idx[p4] = p2
+             self.add_pair((self.codes[p2], self.codes[p4]), p2)

-     def get_words(self):
-         words_list = [(f, w) for w, f in self.words.items()]
-         words_list.sort(key=lambda x: -x[0])
+     def add_pair(self, pair, idx):
+         if pair not in self.pairs:
+             return

-         return [w for _, w in words_list], [f for f, _ in words_list]
+         heappush(self.heap, (self.pairs[pair], idx, pair))


- class NoTokenizer(Tokenizer):
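+ # `BPEEnDe` holds the trained vocabulary: the character tables, the learned
+ # merge list, and the derived lookup tables used for encoding and decoding.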
+ class BPEEnDe:
    def __init__(self):
-         self.data = ''
+         self.char_itos = []
+         self.char_stoi = {}
+         self.bpe = []
+         self.popular_words = {}
+
+         self.bpe_itos = []
+         self.bpe_stoi = {}
+         self.pairs = {}
+         self.encoder = None
+
+     def load(self, char_itos, char_stoi, bpe):
+         self.char_itos = char_itos
+         self.char_stoi = char_stoi
+         self.bpe = bpe
+
+         self.calc()
+
+     def set_popular_words(self, popular_words):
+         self.popular_words = popular_words
+
+     def calc(self):
+         self.bpe_itos = self.calc_bpe_itos()
+         self.bpe_stoi = {s: i for i, s in enumerate(self.bpe_itos)}
+         self.pairs = {(p[0], p[1]): c for c, p in enumerate(self.bpe) if not isinstance(p, int)}

-     def collect_words(self, data):
-         self.data += data
+         self.encoder = _BPEEncoder(self.pairs)

-     def get_words(self):
-         return [self.data], [1]
+     def to_char_stoi(self, w: str):
+         return [self.char_stoi[c] for c in w]

-     def tokenize(self, data: str) -> List[str]:
-         return [data]
+     def calc_bpe_itos(self):
+         itos = list(self.char_itos)
+         for p1, p2 in self.bpe[len(self.char_itos):]:
+             itos.append(itos[p1] + itos[p2])
+         return itos
+
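+     # Per-word memoisation: identical words share one encoding. Note that
+     # `lru_cache` on a method also keys on `self`, so the cache keeps the
+     # instance alive for its lifetime.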
+     @lru_cache(1024)
+     def encode(self, word: str):
+         if word in self.popular_words:
+             return self.popular_words[word]
+
+         return self.encoder.encode([self.char_stoi[c] for c in word if c in self.char_stoi])


class BPELearner:
@@ -284,7 +314,7 @@ def main():
    path = lab.get_data_path() / 'train.py'

    with open(str(path), 'r') as f:
-         data = f.read()[:100_000]
+         data = f.read()

    tokenizer = SourceCodeTokenizer()
    tokenizer.collect_words(data)
@@ -295,6 +325,15 @@ def main():
    print(bpe.bpe_itos()[len(bpe.char_itos):])
    print(len(data), bpe.get_length())

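+     # Persist the trained vocabulary with labml's cache so a later run can
+     # rebuild a `BPEEnDe` via `load()` without retraining.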
+     cache_set('bpe', {
+         'char_itos': bpe.char_itos,
+         'char_stoi': bpe.char_stoi,
+         'bpe': bpe.bpe
+     })
+
+     bpe_en_de = BPEEnDe()
+     bpe_en_de.load(bpe.char_itos, bpe.char_stoi, bpe.bpe)
+

if __name__ == '__main__':
    main()