diff --git a/python_stuff/dependencies.py b/python_stuff/dependencies.py
index 09182b2..0fa0605 100644
--- a/python_stuff/dependencies.py
+++ b/python_stuff/dependencies.py
@@ -32,7 +32,8 @@ def have_dependencies():
         'future',
         'einops',
         'torch',
-        'markdown'
+        'markdown',
+        'ctransformers'
     ]
     for l in lib_names:
         if not os.path.exists(os.path.join(lib_dir, l)):
diff --git a/python_stuff/models/downloader.py b/python_stuff/models/downloader.py
index 270c924..9d0574d 100644
--- a/python_stuff/models/downloader.py
+++ b/python_stuff/models/downloader.py
@@ -9,18 +9,24 @@ from bottled_ai import progress_title
 
 
 def download_model(repo_id: str) -> bool:
-    progress_title(f"Downloading the model {repo_id}. Please wait")
-    if have_local_model(repo_id):
+    try:
+        progress_title(f"Downloading the model {repo_id}. Please wait")
+        if have_local_model(repo_id):
+            return True
+        files = get_models_file(repo_id)
+        for f in files:
+            if cached_file(
+                repo_id,
+                os.path.basename(f),
+                cache_dir=CACHE_DIR,
+            ) is None:
+                return False
         return True
-    files = get_models_file(repo_id)
-    for f in files:
-        if cached_file(
-            repo_id,
-            os.path.basename(f),
-            cache_dir=CACHE_DIR,
-        ) is None:
+    except Exception as ex:
+        with open("exception.txt", "w") as fp:
+            fp.write(str(ex))
+            fp.flush()
             return False
-    return True
 
 
 def remove_model(repo_id: str) -> bool:
     rmtree(MODELS_MAP[repo_id]['dirname'], ignore_errors=True)
diff --git a/python_stuff/models/inference.py b/python_stuff/models/inference.py
index d160a34..cd52c13 100644
--- a/python_stuff/models/inference.py
+++ b/python_stuff/models/inference.py
@@ -1,3 +1,5 @@
+import traceback
+
 import torch
 import gc
 import random
@@ -44,23 +46,6 @@ def __call__(self, *args, **kwargs) -> bool:
         return progress_canceled()
 
 
-def tokenize_single_input(tokenizer, prompt):
-    # OpenChat V2
-    human_prefix = "User:"
-    prefix = "Assistant GPT4:"
-    eot_token = "<|end_of_turn|>"
-    bos_token = ""
-
-    def _tokenize(text):
-        return tokenizer.convert_tokens_to_ids(tokenizer._tokenize(text))
-
-    def _tokenize_special(special_name):
-        return tokenizer.convert_tokens_to_ids(special_name)
-
-    return [_tokenize_special(bos_token)] + _tokenize(human_prefix) + _tokenize(prompt) + [_tokenize_special(eot_token)] + \
-        _tokenize(prefix)
-
-
 def set_manual_seed(seed):
     seed = int(seed)
     if seed == -1:
@@ -74,77 +59,100 @@ def set_manual_seed(seed):
 
 @torch.no_grad()
 def generate_text(model_id: str, params: dict) -> dict:
-    prompt = params['prompt']
-    context = params.get('context') or 'You are a helpful AI assistant.'
-    max_new_tokens = params.get('max_new_tokens', 512)
-    temperature = params.get('temperature', 1)
-    top_p = params.get('top_p', 1)
-    top_k = params.get('top_k', 0)
-    repetition_penalty = params.get('repetition_penalty', 1)
-    set_max_memory(params.get('mem_gpu', -1), params.get('mem_cpu', -1))
-
-    progress_title("Loading the model")
-    progress(0, 100)
-    model, tokenizer = get_model(model_id)
-    if model is None:
+    try:
+        prompt = params['prompt']
+        context = params.get('context') or 'You are a helpful AI assistant.'
+        max_new_tokens = params.get('max_new_tokens', 512)
+        temperature = params.get('temperature', 1)
+        top_p = params.get('top_p', 1)
+        top_k = params.get('top_k', 0)
+        repetition_penalty = params.get('repetition_penalty', 1)
+        set_max_memory(params.get('mem_gpu', -1), params.get('mem_cpu', -1))
+
+        progress_title("Loading the model")
+        progress(0, 100)
+        model, tokenizer = get_model(model_id)
+        if model is None:
+            return {
+                "html": "no model loaded",
+                "raw": "no model loaded"
+            }
+        progress_title("Generating text")
+
+        set_manual_seed(-1)
+
+        cfg = MODELS_MAP[model_id]
+        fallBackTemplate = '{instruction}\n\nUSER: {input}\nASSISTANT:'
+        for template in cfg['templates']:
+            if '{instruction}' in template and '{input}' in template:
+                hasTemplate = True
+                fallBackTemplate = template
+                break
+            elif '{input}' in template:
+                fallBackTemplate = template
+        if '{instruction}' in fallBackTemplate:
+            prompt = fallBackTemplate.format(instruction=context, input=prompt)
+        else:
+            prompt = fallBackTemplate.format(input=prompt)
+
+        response_prefix = cfg['response_after']
+
+        if cfg['loader'] == 'ctransformers':
+            # GGML models keep the raw prompt string; there is no HF tokenizer.
+            inputs = str(prompt)
+        else:
+            inputs = tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=True).to('cuda:0')
+        if cfg['loader'] == 'ctransformers':
+            # ctransformers streams plain text chunks instead of token ids.
+            output = ''
+            for txt in model(
+                prompt=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=int(top_k),
+                repetition_penalty=repetition_penalty,
+                stream=True
+            ):
+                progress_text(txt)
+                output = output + txt
+                if progress_canceled():
+                    break
+        else:
+            output = model.generate(
+                inputs=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                streamer=Streamer(tokenizer),
+                stopping_criteria=transformers.StoppingCriteriaList() + [
+                    CanceledChecker()
+                ]
+            )[0]
+
+        if cfg['loader'] == 'ctransformers':
+            response = output
+        else:
+            new_tokens = len(output) - len(inputs[0])
+            response = tokenizer.decode(output[-new_tokens:], True)
+
+        if 'ASSISTANT:' in response_prefix:
+            response = response.split('\n')
+            for i, v in enumerate(response):
+                if not v.startswith(response_prefix):
+                    continue
+                v = v.split(response_prefix, maxsplit=1)
+                response[i] = v[0] if len(v) < 2 else v[1]
+            response = '\n'.join(response)
+
+        del inputs
+        gc.collect()
         return {
-            "html": "no model loaded",
-            "raw": "no model loaded"
+            "html": convert2html(response),
+            "raw": response
         }
-    progress_title("Generating text")
-
-    set_manual_seed(-1)
-
-    cfg = MODELS_MAP[model_id]
-    fallBackTemplate = '{instruction}\n\nUSER: {input}\nASSISTANT:'
-    for template in cfg['templates']:
-        if '{instruction}' in template and '{input}' in template:
-            hasTemplate = True
-            fallBackTemplate = template
-            break
-        elif '{input}' in template:
-            fallBackTemplate = template
-    if '{instruction}' in fallBackTemplate:
-        prompt = fallBackTemplate.format(instruction=context, input=prompt)
-    else:
-        prompt = fallBackTemplate.format(input=prompt)
-
-    if MODELS_MAP[model_id]['loader'] == 'auto_gpt_openchat':
-        prompt = tokenize_single_input(tokenizer, prompt)
-
-    response_prefix = cfg['response_after']
-
-    # inputs = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda:0')
-    inputs = tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=True).to('cuda:0')
-
-    output = model.generate(
-        inputs=inputs,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        streamer=Streamer(tokenizer),
-        stopping_criteria=transformers.StoppingCriteriaList() + [
-            CanceledChecker()
-        ]
-    )[0]
-
-    new_tokens = len(output) - len(inputs[0])
-    response = tokenizer.decode(output[-new_tokens:], True)
-
-    if 'ASSISTANT:' in response_prefix:
-        response = response.split('\n');
-        for i, v in enumerate(response):
-            if not v.startswith(response_prefix):
-                continue
-            v = v.split(response_prefix, maxsplit=1)
-            response[i] = v[0] if len(v) < 2 else v[1]
-        response = '\n'.join(response)
-
-    del inputs
-    gc.collect()
-    return {
-        "html": convert2html(response),
-        "raw": response
-    }
\ No newline at end of file
+    except Exception as ex:
+        return {
+            "html": convert2html(str(ex) + str(traceback.format_exc())),
+            "raw": str(ex),
+        }
\ No newline at end of file
diff --git a/python_stuff/models/listing.py b/python_stuff/models/listing.py
index 7619aa4..36468d2 100644
--- a/python_stuff/models/listing.py
+++ b/python_stuff/models/listing.py
@@ -9,7 +9,7 @@
         'name': 'Nous-Hermes-13b (GPTQ)',
         'model_basename': 'nous-hermes-13b-GPTQ-4bit-128g.no-act.order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Nous-Hermes-13B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Nous-Hermes-13b is a state-of-the-art language model fine-tuned on over 300,000 instructions.'
@@ -22,14 +22,13 @@
         'response_after':'### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:\n',
-            '### Instruction:\n{input}\n### Response:\n',
         ]
     }, {
         'id': 'TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ',
         'name': 'Wizard-Vicuna-13B-Uncensored-GPTQ',
         'model_basename': 'Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Wizard-Vicuna-13B-Uncensored-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'This is wizard-vicuna-13b trained with a subset of the dataset - responses that contained alignment / moralizing were removed.\n'
@@ -41,14 +40,13 @@
         'response_after':'### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}>\n### Input:\n{input}>\n### Response:\n',
-            '### Instruction:\n{input}>\n### Response:\n',
         ]
     }, {
         'id': 'TheBloke/openchat_v2_w-GPTQ',
         'name': 'openchat_v2_w-GPT',
         'model_basename': 'openchat_v2_w-GPTQ-4bit-128g.no-act.order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--openchat_v2_w-GPT'),
-        'loader': 'auto_gpt_openchat',
+        'loader': 'auto_gptq_openchat',
         'locally': False,
         'description': (
             'The OpenChat v2 family is inspired by offline reinforcement learning, '
@@ -62,7 +60,7 @@
         'name': 'WizardLM-7B-uncensored-GPTQ',
         'model_basename': 'WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--WizardLM-7B-uncensored-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'This is WizardLM trained with a subset of the dataset - responses that contained alignment / moralizing were removed.\n'
@@ -79,7 +77,7 @@
         'name': 'Nous-Hermes-Llama-2-7B-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Nous-Hermes-Llama-2-7B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Nous-Hermes-Llama2-7b is a state-of-the-art language model fine-tuned on over 300,000 instructions.\n'
@@ -89,7 +87,6 @@
         'response_after': '### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:\n',
-            '### Instruction:\n{input}\n### Response:\n',
         ]
     }, {
@@ -97,7 +94,7 @@
         'name': 'StableBeluga-7B-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--StableBeluga-7B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Stable Beluga 7B is a Llama2 7B model finetuned on an Orca style Dataset'
@@ -105,7 +102,6 @@
         'response_after': '### Response:\n',
         'templates': [
             '### System:\n{instruction}\n### User:\n{input}\n### Response:\n',
-            '### User:\n{input}\n### Response:\n',
         ]
     }, {
@@ -113,7 +109,7 @@
         'name': 'stablecode-instruct-alpha-3b-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--stablecode-instruct-alpha-3b-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'have_tokenizer_model': False,
         'description': (
@@ -123,6 +119,22 @@
         'templates': [
             '### Instruction:\n{input}\n### Response:\n'
         ]
+    }, {
+        'id': 'TheBloke/orca_mini_3B-GGML',
+        'name': 'orca_mini_3B-GGML',
+        'model_basename': 'orca-mini-3b.ggmlv3.q8_0',
+        'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--orca_mini_3B-GGML'),
+        'loader': 'ctransformers',
+        'model_type': 'llama',
+        'locally': False,
+        'have_tokenizer_model': False,
+        'description': (
+            'An OpenLLaMa-3B model trained on explain tuned datasets, created using Instructions and Input from WizardLM, Alpaca & Dolly-V2 datasets and applying Orca Research Paper dataset construction approaches.'
+        ),
+        'response_after': '### Response:\n',
+        'templates': [
+            '### System:\n{instruction}\n### User:\n{input}\n### Response:\n',
+        ]
     }
 ]
 
@@ -152,6 +164,10 @@ def get_models_file(repo_id) -> List[str]:
         if os.path.isdir(p):
             basedir = p
             break
+    if mdl['loader'] == 'ctransformers':
+        # GGML checkpoints ship as a single .bin file and need no tokenizer files.
+        return [
+            os.path.join(basedir, f'{mdl["model_basename"]}.bin'),
+        ]
     if mdl.get('have_tokenizer_model', True):
         params = [
             os.path.join(basedir, 'tokenizer.model'),
diff --git a/python_stuff/models/loader.py b/python_stuff/models/loader.py
index a993ead..4f2c559 100644
--- a/python_stuff/models/loader.py
+++ b/python_stuff/models/loader.py
@@ -4,6 +4,8 @@
 import torch
 from transformers import AutoTokenizer
 from auto_gptq import AutoGPTQForCausalLM
+from ctransformers import AutoModelForCausalLM
+
 from models.paths import CACHE_DIR
 from models.listing import MODELS_MAP, get_models_file, have_local_model
 
@@ -26,6 +28,7 @@ def get_model(repo_id: str) -> Tuple[AutoGPTQForCausalLM, AutoTokenizer]:
     return SELECTED_MODEL['model'], SELECTED_MODEL['tokenizer']
 
 
+
 def select_model(repo_id: str) -> bool:
@@ -33,14 +36,22 @@ def select_model(repo_id: str) -> bool:
     if SELECTED_MODEL is not None and SELECTED_MODEL['repo_id'] == repo_id:
         return True
     unload_model()
     if not have_local_model(repo_id):
         return False
+    if MODELS_MAP[repo_id]['loader'] == 'auto_gptq':
+        return select_autogptq_model(repo_id)
+    if MODELS_MAP[repo_id]['loader'] == 'ctransformers':
+        return select_ggml_model(repo_id)
+    return False
+
+
+def select_autogptq_model(repo_id: str) -> bool:
+    global SELECTED_MODEL
     params = dict(
         device="cuda:0",
         use_safetensors=True,
         use_triton=False,
         cache_dir=CACHE_DIR,
         model_basename=MODELS_MAP[repo_id]['model_basename'],
-        local_files_only=True,
-        trust_remote_code=True
+        local_files_only=True
     )
     if MAX_MEMORY:
         params['max_memory'] = MAX_MEMORY
@@ -71,6 +82,29 @@
     }
 
 
+
+def select_ggml_model(repo_id):
+    global SELECTED_MODEL
+
+    params = dict(
+        local_files_only=True,
+        model_type=MODELS_MAP[repo_id].get('model_type', 'gpt-2')
+    )
+    if MAX_MEMORY:
+        params['max_memory'] = MAX_MEMORY
+    local_file = get_models_file(repo_id)[0]
+    model = AutoModelForCausalLM.from_pretrained(
+        local_file,
+        **params
+    )
+
+    SELECTED_MODEL = {
+        'model': model,
+        'tokenizer': None,
+        'repo_id': repo_id
+    }
+
+
 def unload_model():
     global SELECTED_MODEL
     SELECTED_MODEL = None
diff --git a/python_stuff/requirements.txt b/python_stuff/requirements.txt
index 2ff5b73..c6da5d6 100644
--- a/python_stuff/requirements.txt
+++ b/python_stuff/requirements.txt
@@ -13,7 +13,7 @@ markdown==3.4.4
 einops==0.6.1
 pynvml==11.5.0
 # ctransformers
-# https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.22+cu117-py3-none-any.whl
+https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.22+cu117-py3-none-any.whl
 # auto-gptq
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.4.1/auto_gptq-0.4.1+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
\ No newline at end of file
diff --git a/src/config/config.h b/src/config/config.h
index e68208e..4260914 100644
--- a/src/config/config.h
+++ b/src/config/config.h
@@ -10,7 +10,7 @@ namespace bottled_ai
         int max_new_tokens = 512;
         float temperature = 1;
         float top_p = 1;
-        float top_k = 1;
+        float top_k = 0;
         float repetition_penalty = 1;
         std::string context = "You are a helpful AI assistant.";
     } model_config_t;
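
Below is a minimal, standalone sketch of the ctransformers path that the new select_ggml_model() and the streaming branch in generate_text() rely on. It is illustrative only: the local file name, the prompt and the sampling values are assumptions rather than part of this patch, and the GGML file is the one that download_model('TheBloke/orca_mini_3B-GGML') stores under CACHE_DIR.

# Hypothetical smoke test for the GGML / ctransformers code path added above.
from ctransformers import AutoModelForCausalLM

# Point this at the downloaded GGML file (example path, adjust to CACHE_DIR).
llm = AutoModelForCausalLM.from_pretrained(
    'orca-mini-3b.ggmlv3.q8_0.bin',
    model_type='llama',  # same value as the new 'model_type' entry in listing.py
)

# Prompt built with the template registered for the orca_mini entry.
prompt = '### System:\nYou are a helpful AI assistant.\n### User:\nHello!\n### Response:\n'

# Mirrors the streaming loop added to generate_text(): text chunks arrive one
# at a time, so a UI can show partial output and cancel generation early.
output = ''
for chunk in llm(prompt, max_new_tokens=128, temperature=1.0, top_p=1.0,
                 top_k=40, repetition_penalty=1.0, stream=True):
    output += chunk
print(output)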