
Commit

Add ctransformers support
rodjjo committed Aug 21, 2023
1 parent 6d11902 commit ed2c318
Showing 7 changed files with 180 additions and 115 deletions.
3 changes: 2 additions & 1 deletion python_stuff/dependencies.py
@@ -32,7 +32,8 @@ def have_dependencies():
         'future',
         'einops',
         'torch',
-        'markdown'
+        'markdown',
+        'ctransformers'
     ]
     for l in lib_names:
         if not os.path.exists(os.path.join(lib_dir, l)):
26 changes: 16 additions & 10 deletions python_stuff/models/downloader.py
@@ -9,18 +9,24 @@
 from bottled_ai import progress_title
 
 def download_model(repo_id: str) -> bool:
-    progress_title(f"Downloading the model {repo_id}. Please wait")
-    if have_local_model(repo_id):
+    try:
+        progress_title(f"Downloading the model {repo_id}. Please wait")
+        if have_local_model(repo_id):
+            return True
+        files = get_models_file(repo_id)
+        for f in files:
+            if cached_file(
+                repo_id,
+                os.path.basename(f),
+                cache_dir=CACHE_DIR,
+            ) is None:
+                return False
         return True
-    files = get_models_file(repo_id)
-    for f in files:
-        if cached_file(
-            repo_id,
-            os.path.basename(f),
-            cache_dir=CACHE_DIR,
-        ) is None:
+    except Exception as ex:
+        with open("exception.txt", "w") as fp:
+            fp.write(str(ex))
+            fp.flush()
             return False
-    return True
 
 def remove_model(repo_id: str) -> bool:
     rmtree(MODELS_MAP[repo_id]['dirname'], ignore_errors=True)
186 changes: 97 additions & 89 deletions python_stuff/models/inference.py
@@ -1,3 +1,5 @@
+import traceback
+
 import torch
 import gc
 import random
@@ -44,23 +46,6 @@ def __call__(self, *args, **kwargs) -> bool:
         return progress_canceled()
 
 
-def tokenize_single_input(tokenizer, prompt):
-    # OpenChat V2
-    human_prefix = "User:"
-    prefix = "Assistant GPT4:"
-    eot_token = "<|end_of_turn|>"
-    bos_token = "<s>"
-
-    def _tokenize(text):
-        return tokenizer.convert_tokens_to_ids(tokenizer._tokenize(text))
-
-    def _tokenize_special(special_name):
-        return tokenizer.convert_tokens_to_ids(special_name)
-
-    return [_tokenize_special(bos_token)] + _tokenize(human_prefix) + _tokenize(prompt) + [_tokenize_special(eot_token)] + \
-        _tokenize(prefix)
-
-
 def set_manual_seed(seed):
     seed = int(seed)
     if seed == -1:
@@ -74,77 +59,100 @@ def set_manual_seed(seed):
 
 @torch.no_grad()
 def generate_text(model_id: str, params: dict) -> dict:
-    prompt = params['prompt']
-    context = params.get('context') or 'You are a helpful AI assistant.'
-    max_new_tokens = params.get('max_new_tokens', 512)
-    temperature = params.get('temperature', 1)
-    top_p = params.get('top_p', 1)
-    top_k = params.get('top_k', 0)
-    repetition_penalty = params.get('repetition_penalty', 1)
-    set_max_memory(params.get('mem_gpu', -1), params.get('mem_cpu', -1))
-
-    progress_title("Loading the model")
-    progress(0, 100)
-    model, tokenizer = get_model(model_id)
-    if model is None:
+    try:
+        prompt = params['prompt']
+        context = params.get('context') or 'You are a helpful AI assistant.'
+        max_new_tokens = params.get('max_new_tokens', 512)
+        temperature = params.get('temperature', 1)
+        top_p = params.get('top_p', 1)
+        top_k = params.get('top_k', 0)
+        repetition_penalty = params.get('repetition_penalty', 1)
+        set_max_memory(params.get('mem_gpu', -1), params.get('mem_cpu', -1))
+
+        progress_title("Loading the model")
+        progress(0, 100)
+        model, tokenizer = get_model(model_id)
+        if model is None:
+            return {
+                "html": "no model loaded",
+                "raw": "no model loaded"
+            }
+        progress_title("Generating text")
+
+        set_manual_seed(-1)
+
+        cfg = MODELS_MAP[model_id]
+        fallBackTemplate = '{instruction}\n\nUSER: {input}\nASSISTANT:'
+        for template in cfg['templates']:
+            if '{instruction}' in template and '{input}' in template:
+                hasTemplate = True
+                fallBackTemplate = template
+                break
+            elif '{input}' in template:
+                fallBackTemplate = template
+        if '{instruction}' in fallBackTemplate:
+            prompt = fallBackTemplate.format(instruction=context, input=prompt)
+        else:
+            prompt = fallBackTemplate.format(input=prompt)
+
+        response_prefix = cfg['response_after']
+
+        if cfg['loader'] == 'ctransformers':
+            inputs = str(prompt)
+        else:
+            inputs = tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=True).to('cuda:0')
+        if cfg['loader'] == 'ctransformers':
+            output = ''
+            for txt in model(
+                prompt=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=int(top_k),
+                repetition_penalty=repetition_penalty,
+                stream=True
+            ):
+                progress_text(txt)
+                output = output + txt
+                if progress_canceled():
+                    break
+        else:
+            output = model.generate(
+                inputs=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                streamer=Streamer(tokenizer),
+                stopping_criteria=transformers.StoppingCriteriaList() + [
+                    CanceledChecker()
+                ]
+            )[0]
+
+        new_tokens = len(output) - len(inputs[0])
+        if cfg['loader'] == 'ctransformers':
+            response = output
+        else:
+            response = tokenizer.decode(output[-new_tokens:], True)
+
+        if 'ASSISTANT:' in response_prefix:
+            response = response.split('\n');
+            for i, v in enumerate(response):
+                if not v.startswith(response_prefix):
+                    continue
+                v = v.split(response_prefix, maxsplit=1)
+                response[i] = v[0] if len(v) < 2 else v[1]
+            response = '\n'.join(response)
+
+        del inputs
+        gc.collect()
         return {
-            "html": "no model loaded",
-            "raw": "no model loaded"
+            "html": convert2html(response),
+            "raw": response
         }
-    progress_title("Generating text")
-
-    set_manual_seed(-1)
-
-    cfg = MODELS_MAP[model_id]
-    fallBackTemplate = '{instruction}\n\nUSER: {input}\nASSISTANT:'
-    for template in cfg['templates']:
-        if '{instruction}' in template and '{input}' in template:
-            hasTemplate = True
-            fallBackTemplate = template
-            break
-        elif '{input}' in template:
-            fallBackTemplate = template
-    if '{instruction}' in fallBackTemplate:
-        prompt = fallBackTemplate.format(instruction=context, input=prompt)
-    else:
-        prompt = fallBackTemplate.format(input=prompt)
-
-    if MODELS_MAP[model_id]['loader'] == 'auto_gpt_openchat':
-        prompt = tokenize_single_input(tokenizer, prompt)
-
-    response_prefix = cfg['response_after']
-
-    # inputs = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda:0')
-    inputs = tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=True).to('cuda:0')
-
-    output = model.generate(
-        inputs=inputs,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        streamer=Streamer(tokenizer),
-        stopping_criteria=transformers.StoppingCriteriaList() + [
-            CanceledChecker()
-        ]
-    )[0]
-
-    new_tokens = len(output) - len(inputs[0])
-    response = tokenizer.decode(output[-new_tokens:], True)
-
-    if 'ASSISTANT:' in response_prefix:
-        response = response.split('\n');
-        for i, v in enumerate(response):
-            if not v.startswith(response_prefix):
-                continue
-            v = v.split(response_prefix, maxsplit=1)
-            response[i] = v[0] if len(v) < 2 else v[1]
-        response = '\n'.join(response)
-
-    del inputs
-    gc.collect()
-    return {
-        "html": convert2html(response),
-        "raw": response
-    }
+    except Exception as ex:
+        return {
+            "html": convert2html(str(ex) + str(traceback.format_exc())),
+            "raw": str(ex),
+        }
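
For reference, a minimal sketch of the ctransformers calling convention that the new 'ctransformers' branch of generate_text relies on (this snippet is not part of the commit). The repo id, model_file and model_type mirror the orca_mini_3B-GGML entry added to listing.py below; the prompt and sampling values are illustrative assumptions.

# Hedged sketch, not part of this commit: load a GGML model with ctransformers
# and stream tokens, the same call shape used by the new loader branch above.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/orca_mini_3B-GGML",               # repo id from listing.py below
    model_file="orca-mini-3b.ggmlv3.q8_0.bin",  # model_basename + ".bin"
    model_type="llama",                         # 'model_type' key in listing.py
)

prompt = "### System:\nYou are a helpful AI assistant.\n### User:\nHello!\n### Response:\n"
text = ""
# stream=True yields text fragments as they are produced, which is what lets
# generate_text() forward partial output to the progress callbacks and stop
# early when generation is canceled.
for piece in llm(prompt, max_new_tokens=64, temperature=0.7, stream=True):
    text += piece
print(text)
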
38 changes: 27 additions & 11 deletions python_stuff/models/listing.py
@@ -9,7 +9,7 @@
         'name': 'Nous-Hermes-13b (GPTQ)',
         'model_basename': 'nous-hermes-13b-GPTQ-4bit-128g.no-act.order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Nous-Hermes-13B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Nous-Hermes-13b is a state-of-the-art language model fine-tuned on over 300,000 instructions.'
@@ -22,14 +22,13 @@
         'response_after':'### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:\n',
-            '### Instruction:\n{input}\n### Response:\n',
         ]
     }, {
         'id': 'TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ',
         'name': 'Wizard-Vicuna-13B-Uncensored-GPTQ',
         'model_basename': 'Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Wizard-Vicuna-13B-Uncensored-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'This is wizard-vicuna-13b trained with a subset of the dataset - responses that contained alignment / moralizing were removed.\n'
@@ -41,14 +40,13 @@
         'response_after':'### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}>\n### Input:\n{input}>\n### Response:\n',
-            '### Instruction:\n{input}>\n### Response:\n',
         ]
     }, {
         'id': 'TheBloke/openchat_v2_w-GPTQ',
         'name': 'openchat_v2_w-GPT',
         'model_basename': 'openchat_v2_w-GPTQ-4bit-128g.no-act.order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--openchat_v2_w-GPT'),
-        'loader': 'auto_gpt_openchat',
+        'loader': 'auto_gptq_openchat',
         'locally': False,
         'description': (
             'The OpenChat v2 family is inspired by offline reinforcement learning, '
@@ -62,7 +60,7 @@
         'name': 'WizardLM-7B-uncensored-GPTQ',
         'model_basename': 'WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--WizardLM-7B-uncensored-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'This is WizardLM trained with a subset of the dataset - responses that contained alignment / moralizing were removed.\n'
@@ -79,7 +77,7 @@
         'name': 'Nous-Hermes-Llama-2-7B-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Nous-Hermes-Llama-2-7B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Nous-Hermes-Llama2-7b is a state-of-the-art language model fine-tuned on over 300,000 instructions.\n'
@@ -89,31 +87,29 @@
         'response_after': '### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:\n',
-            '### Instruction:\n{input}\n### Response:\n',
         ]
     },
     {
         'id': 'TheBloke/StableBeluga-7B-GPTQ',
         'name': 'StableBeluga-7B-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--StableBeluga-7B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Stable Beluga 7B is a Llama2 7B model finetuned on an Orca style Dataset'
         ),
         'response_after': '### Response:\n',
         'templates': [
             '### System:\n{instruction}\n### User:\n{input}\n### Response:\n',
-            '### User:\n{input}\n### Response:\n',
         ]
     },
     {
         'id': 'TheBloke/stablecode-instruct-alpha-3b-GPTQ',
         'name': 'stablecode-instruct-alpha-3b-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--stablecode-instruct-alpha-3b-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'have_tokenizer_model': False,
         'description': (
@@ -123,6 +119,22 @@
         'templates': [
             '### Instruction:\n{input}\n### Response:\n'
         ]
+    }, {
+        'id': 'TheBloke/orca_mini_3B-GGML',
+        'name': 'orca_mini_3B-GGML',
+        'model_basename': 'orca-mini-3b.ggmlv3.q8_0',
+        'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--orca_mini_3B-GGML'),
+        'loader': 'ctransformers',
+        'model_type': 'llama',
+        'locally': False,
+        'have_tokenizer_model': False,
+        'description': (
+            'An OpenLLaMa-3B model model trained on explain tuned datasets, created using Instructions and Input from WizardLM, Alpaca & Dolly-V2 datasets and applying Orca Research Paper dataset construction approaches.'
+        ),
+        'response_after': '### Response:\n',
+        'templates': [
+            '### System:\n{instruction}\n### User:\n{input}\n### Response:\n',
+        ]
     }
 ]
 
@@ -152,6 +164,10 @@ def get_models_file(repo_id) -> List[str]:
         if os.path.isdir(p):
             basedir = p
             break
+    if mdl['loader'] == 'ctransformers':
+        return [
+            os.path.join(basedir, f'{mdl["model_basename"]}.bin'),
+        ]
     if mdl.get('have_tokenizer_model', True):
         params = [
             os.path.join(basedir, 'tokenizer.model'),
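
The GGML entry above pairs with the ctransformers branch added to get_models_file: only the single quantized .bin file needs to be present locally. A hedged sketch of how that file resolution could play together with download_model's cached_file call; the cached_file import path and the cache directory shown here are assumptions, not the project's actual constants.

# Hedged sketch, not part of this commit: resolve the single GGML file for a
# 'ctransformers' entry and pull it through the Hugging Face cache, the way
# get_models_file() and download_model() cooperate above.
import os
from transformers.utils import cached_file  # assumed import path

CACHE_DIR = os.path.expanduser("~/.cache/huggingface/hub")  # assumed location

repo_id = "TheBloke/orca_mini_3B-GGML"
model_basename = "orca-mini-3b.ggmlv3.q8_0"

# For the ctransformers loader only one file is listed: the quantized weights.
files = [f"{model_basename}.bin"]

for name in files:
    # cached_file() downloads into cache_dir (or reuses a cached copy) and
    # returns the local path, mirroring the check in download_model().
    path = cached_file(repo_id, name, cache_dir=CACHE_DIR)
    print(name, "->", path)
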