
Commit

Add ctransformers support
rodjjo committed Aug 21, 2023
1 parent 6d11902 commit ed2c318
Showing 7 changed files with 180 additions and 115 deletions.
3 changes: 2 additions & 1 deletion python_stuff/dependencies.py
@@ -32,7 +32,8 @@ def have_dependencies():
         'future',
         'einops',
         'torch',
-        'markdown'
+        'markdown',
+        'ctransformers'
     ]
     for l in lib_names:
         if not os.path.exists(os.path.join(lib_dir, l)):
26 changes: 16 additions & 10 deletions python_stuff/models/downloader.py
@@ -9,18 +9,24 @@
 from bottled_ai import progress_title
 
 def download_model(repo_id: str) -> bool:
-    progress_title(f"Downloading the model {repo_id}. Please wait")
-    if have_local_model(repo_id):
+    try:
+        progress_title(f"Downloading the model {repo_id}. Please wait")
+        if have_local_model(repo_id):
+            return True
+        files = get_models_file(repo_id)
+        for f in files:
+            if cached_file(
+                repo_id,
+                os.path.basename(f),
+                cache_dir=CACHE_DIR,
+            ) is None:
+                return False
         return True
-    files = get_models_file(repo_id)
-    for f in files:
-        if cached_file(
-            repo_id,
-            os.path.basename(f),
-            cache_dir=CACHE_DIR,
-        ) is None:
+    except Exception as ex:
+        with open("exception.txt", "w") as fp:
+            fp.write(str(ex))
+            fp.flush()
             return False
-    return True
 
 def remove_model(repo_id: str) -> bool:
     rmtree(MODELS_MAP[repo_id]['dirname'], ignore_errors=True)
186 changes: 97 additions & 89 deletions python_stuff/models/inference.py
@@ -1,3 +1,5 @@
+import traceback
+
 import torch
 import gc
 import random
@@ -44,23 +46,6 @@ def __call__(self, *args, **kwargs) -> bool:
         return progress_canceled()
 
 
-def tokenize_single_input(tokenizer, prompt):
-    # OpenChat V2
-    human_prefix = "User:"
-    prefix = "Assistant GPT4:"
-    eot_token = "<|end_of_turn|>"
-    bos_token = "<s>"
-
-    def _tokenize(text):
-        return tokenizer.convert_tokens_to_ids(tokenizer._tokenize(text))
-
-    def _tokenize_special(special_name):
-        return tokenizer.convert_tokens_to_ids(special_name)
-
-    return [_tokenize_special(bos_token)] + _tokenize(human_prefix) + _tokenize(prompt) + [_tokenize_special(eot_token)] + \
-        _tokenize(prefix)
-
-
 def set_manual_seed(seed):
     seed = int(seed)
     if seed == -1:
@@ -74,77 +59,100 @@ def set_manual_seed(seed):
 
 @torch.no_grad()
 def generate_text(model_id: str, params: dict) -> dict:
-    prompt = params['prompt']
-    context = params.get('context') or 'You are a helpful AI assistant.'
-    max_new_tokens = params.get('max_new_tokens', 512)
-    temperature = params.get('temperature', 1)
-    top_p = params.get('top_p', 1)
-    top_k = params.get('top_k', 0)
-    repetition_penalty = params.get('repetition_penalty', 1)
-    set_max_memory(params.get('mem_gpu', -1), params.get('mem_cpu', -1))
-
-    progress_title("Loading the model")
-    progress(0, 100)
-    model, tokenizer = get_model(model_id)
-    if model is None:
+    try:
+        prompt = params['prompt']
+        context = params.get('context') or 'You are a helpful AI assistant.'
+        max_new_tokens = params.get('max_new_tokens', 512)
+        temperature = params.get('temperature', 1)
+        top_p = params.get('top_p', 1)
+        top_k = params.get('top_k', 0)
+        repetition_penalty = params.get('repetition_penalty', 1)
+        set_max_memory(params.get('mem_gpu', -1), params.get('mem_cpu', -1))
+
+        progress_title("Loading the model")
+        progress(0, 100)
+        model, tokenizer = get_model(model_id)
+        if model is None:
+            return {
+                "html": "no model loaded",
+                "raw": "no model loaded"
+            }
+        progress_title("Generating text")
+
+        set_manual_seed(-1)
+
+        cfg = MODELS_MAP[model_id]
+        fallBackTemplate = '{instruction}\n\nUSER: {input}\nASSISTANT:'
+        for template in cfg['templates']:
+            if '{instruction}' in template and '{input}' in template:
+                hasTemplate = True
+                fallBackTemplate = template
+                break
+            elif '{input}' in template:
+                fallBackTemplate = template
+        if '{instruction}' in fallBackTemplate:
+            prompt = fallBackTemplate.format(instruction=context, input=prompt)
+        else:
+            prompt = fallBackTemplate.format(input=prompt)
+
+        response_prefix = cfg['response_after']
+
+        if cfg['loader'] == 'ctransformers':
+            inputs = str(prompt)
+        else:
+            inputs = tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=True).to('cuda:0')
+        if cfg['loader'] == 'ctransformers':
+            output = ''
+            for txt in model(
+                prompt=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=int(top_k),
+                repetition_penalty=repetition_penalty,
+                stream=True
+            ):
+                progress_text(txt)
+                output = output + txt
+                if progress_canceled():
+                    break
+        else:
+            output = model.generate(
+                inputs=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                streamer=Streamer(tokenizer),
+                stopping_criteria=transformers.StoppingCriteriaList() + [
+                    CanceledChecker()
+                ]
+            )[0]
+
+        new_tokens = len(output) - len(inputs[0])
+        if cfg['loader'] == 'ctransformers':
+            response = output
+        else:
+            response = tokenizer.decode(output[-new_tokens:], True)
+
+        if 'ASSISTANT:' in response_prefix:
+            response = response.split('\n');
+            for i, v in enumerate(response):
+                if not v.startswith(response_prefix):
+                    continue
+                v = v.split(response_prefix, maxsplit=1)
+                response[i] = v[0] if len(v) < 2 else v[1]
+            response = '\n'.join(response)
+
+        del inputs
+        gc.collect()
         return {
-            "html": "no model loaded",
-            "raw": "no model loaded"
+            "html": convert2html(response),
+            "raw": response
         }
-    progress_title("Generating text")
-
-    set_manual_seed(-1)
-
-    cfg = MODELS_MAP[model_id]
-    fallBackTemplate = '{instruction}\n\nUSER: {input}\nASSISTANT:'
-    for template in cfg['templates']:
-        if '{instruction}' in template and '{input}' in template:
-            hasTemplate = True
-            fallBackTemplate = template
-            break
-        elif '{input}' in template:
-            fallBackTemplate = template
-    if '{instruction}' in fallBackTemplate:
-        prompt = fallBackTemplate.format(instruction=context, input=prompt)
-    else:
-        prompt = fallBackTemplate.format(input=prompt)
-
-    if MODELS_MAP[model_id]['loader'] == 'auto_gpt_openchat':
-        prompt = tokenize_single_input(tokenizer, prompt)
-
-    response_prefix = cfg['response_after']
-
-    # inputs = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda:0')
-    inputs = tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=True).to('cuda:0')
-
-    output = model.generate(
-        inputs=inputs,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        streamer=Streamer(tokenizer),
-        stopping_criteria=transformers.StoppingCriteriaList() + [
-            CanceledChecker()
-        ]
-    )[0]
-
-    new_tokens = len(output) - len(inputs[0])
-    response = tokenizer.decode(output[-new_tokens:], True)
-
-    if 'ASSISTANT:' in response_prefix:
-        response = response.split('\n');
-        for i, v in enumerate(response):
-            if not v.startswith(response_prefix):
-                continue
-            v = v.split(response_prefix, maxsplit=1)
-            response[i] = v[0] if len(v) < 2 else v[1]
-        response = '\n'.join(response)
-
-    del inputs
-    gc.collect()
-    return {
-        "html": convert2html(response),
-        "raw": response
-    }
+    except Exception as ex:
+        return {
+            "html": convert2html(str(ex) + str(traceback.format_exc())),
+            "raw": str(ex),
+        }
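
For reference, a minimal sketch of the ctransformers calling convention that the new 'ctransformers' branch of generate_text relies on (this snippet is not part of the commit). The repo id, model_file and model_type mirror the orca_mini_3B-GGML entry added to listing.py below; the prompt and sampling values are illustrative assumptions.

# Hedged sketch, not part of this commit: load a GGML model with ctransformers
# and stream tokens, the same call shape used by the new loader branch above.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/orca_mini_3B-GGML",               # repo id from listing.py below
    model_file="orca-mini-3b.ggmlv3.q8_0.bin",  # model_basename + ".bin"
    model_type="llama",                         # 'model_type' key in listing.py
)

prompt = "### System:\nYou are a helpful AI assistant.\n### User:\nHello!\n### Response:\n"
text = ""
# stream=True yields text fragments as they are produced, which is what lets
# generate_text() forward partial output to the progress callbacks and stop
# early when generation is canceled.
for piece in llm(prompt, max_new_tokens=64, temperature=0.7, stream=True):
    text += piece
print(text)
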
38 changes: 27 additions & 11 deletions python_stuff/models/listing.py
@@ -9,7 +9,7 @@
         'name': 'Nous-Hermes-13b (GPTQ)',
         'model_basename': 'nous-hermes-13b-GPTQ-4bit-128g.no-act.order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Nous-Hermes-13B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Nous-Hermes-13b is a state-of-the-art language model fine-tuned on over 300,000 instructions.'
@@ -22,14 +22,13 @@
         'response_after':'### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:\n',
-            '### Instruction:\n{input}\n### Response:\n',
         ]
     }, {
         'id': 'TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ',
         'name': 'Wizard-Vicuna-13B-Uncensored-GPTQ',
         'model_basename': 'Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Wizard-Vicuna-13B-Uncensored-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'This is wizard-vicuna-13b trained with a subset of the dataset - responses that contained alignment / moralizing were removed.\n'
@@ -41,14 +40,13 @@
         'response_after':'### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}>\n### Input:\n{input}>\n### Response:\n',
-            '### Instruction:\n{input}>\n### Response:\n',
         ]
     }, {
         'id': 'TheBloke/openchat_v2_w-GPTQ',
         'name': 'openchat_v2_w-GPT',
         'model_basename': 'openchat_v2_w-GPTQ-4bit-128g.no-act.order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--openchat_v2_w-GPT'),
-        'loader': 'auto_gpt_openchat',
+        'loader': 'auto_gptq_openchat',
         'locally': False,
         'description': (
             'The OpenChat v2 family is inspired by offline reinforcement learning, '
@@ -62,7 +60,7 @@
         'name': 'WizardLM-7B-uncensored-GPTQ',
         'model_basename': 'WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--WizardLM-7B-uncensored-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'This is WizardLM trained with a subset of the dataset - responses that contained alignment / moralizing were removed.\n'
@@ -79,7 +77,7 @@
         'name': 'Nous-Hermes-Llama-2-7B-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--Nous-Hermes-Llama-2-7B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Nous-Hermes-Llama2-7b is a state-of-the-art language model fine-tuned on over 300,000 instructions.\n'
@@ -89,31 +87,29 @@
         'response_after': '### Response:\n',
         'templates': [
             '### Instruction:\n{instruction}\n### Input:\n{input}\n### Response:\n',
-            '### Instruction:\n{input}\n### Response:\n',
         ]
     },
     {
         'id': 'TheBloke/StableBeluga-7B-GPTQ',
         'name': 'StableBeluga-7B-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--StableBeluga-7B-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'description': (
             'Stable Beluga 7B is a Llama2 7B model finetuned on an Orca style Dataset'
         ),
         'response_after': '### Response:\n',
         'templates': [
             '### System:\n{instruction}\n### User:\n{input}\n### Response:\n',
-            '### User:\n{input}\n### Response:\n',
         ]
     },
     {
         'id': 'TheBloke/stablecode-instruct-alpha-3b-GPTQ',
         'name': 'stablecode-instruct-alpha-3b-GPTQ',
         'model_basename': 'gptq_model-4bit-128g',
         'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--stablecode-instruct-alpha-3b-GPTQ'),
-        'loader': 'auto_gpt',
+        'loader': 'auto_gptq',
         'locally': False,
         'have_tokenizer_model': False,
         'description': (
@@ -123,6 +119,22 @@
         'templates': [
             '### Instruction:\n{input}\n### Response:\n'
         ]
+    }, {
+        'id': 'TheBloke/orca_mini_3B-GGML',
+        'name': 'orca_mini_3B-GGML',
+        'model_basename': 'orca-mini-3b.ggmlv3.q8_0',
+        'dirname': os.path.join(CACHE_DIR, 'models--TheBloke--orca_mini_3B-GGML'),
+        'loader': 'ctransformers',
+        'model_type': 'llama',
+        'locally': False,
+        'have_tokenizer_model': False,
+        'description': (
+            'An OpenLLaMa-3B model model trained on explain tuned datasets, created using Instructions and Input from WizardLM, Alpaca & Dolly-V2 datasets and applying Orca Research Paper dataset construction approaches.'
+        ),
+        'response_after': '### Response:\n',
+        'templates': [
+            '### System:\n{instruction}\n### User:\n{input}\n### Response:\n',
+        ]
     }
 ]
 
@@ -152,6 +164,10 @@ def get_models_file(repo_id) -> List[str]:
         if os.path.isdir(p):
             basedir = p
             break
+    if mdl['loader'] == 'ctransformers':
+        return [
+            os.path.join(basedir, f'{mdl["model_basename"]}.bin'),
+        ]
     if mdl.get('have_tokenizer_model', True):
         params = [
             os.path.join(basedir, 'tokenizer.model'),
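
The GGML entry above pairs with the ctransformers branch added to get_models_file: only the single quantized .bin file needs to be present locally. A hedged sketch of how that file resolution could play together with download_model's cached_file call; the cached_file import path and the cache directory shown here are assumptions, not the project's actual constants.

# Hedged sketch, not part of this commit: resolve the single GGML file for a
# 'ctransformers' entry and pull it through the Hugging Face cache, the way
# get_models_file() and download_model() cooperate above.
import os
from transformers.utils import cached_file  # assumed import path

CACHE_DIR = os.path.expanduser("~/.cache/huggingface/hub")  # assumed location

repo_id = "TheBloke/orca_mini_3B-GGML"
model_basename = "orca-mini-3b.ggmlv3.q8_0"

# For the ctransformers loader only one file is listed: the quantized weights.
files = [f"{model_basename}.bin"]

for name in files:
    # cached_file() downloads into cache_dir (or reuses a cached copy) and
    # returns the local path, mirroring the check in download_model().
    path = cached_file(repo_id, name, cache_dir=CACHE_DIR)
    print(name, "->", path)
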