diff --git a/generate.py b/generate.py
index 6d359e0..420008c 100755
--- a/generate.py
+++ b/generate.py
@@ -73,9 +73,9 @@ def main(
         tokenizer_path=checkpoint_path,
         max_gpu_num_blocks=max_gpu_num_blocks,
         max_seq_len=max_seq_len,
-        load_model=load_model,
+        # load_model=load_model,
         compiled_model=compiled_model,
-        triton_weight=triton_weight,
+        # triton_weight=triton_weight,
         device=device,
     )
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 55dbbc9..76c77d7 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -67,7 +67,8 @@ def get_gpu_memory(gpu_type="amd", device_id="0"):
         elif gpu_type == "cpu":
             return None
     except Exception as e:
-        from utils.logger import log
+        from .logger import get_logger
+        log = get_logger(__name__)
         log.warning(f"Unable to fetch GPU memory: {e}")
         return None
@@ -82,7 +83,8 @@ def count_tokens(texts: List[str], tokenizer) -> int:
 def get_model_type(checkpoint_path: str) -> str | None:
-    from utils.logger import log
+    from .logger import get_logger
+    log = get_logger(__name__)
     model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]
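
Note: the change above switches both call sites from importing a module-level log object to calling get_logger(__name__) from the package-relative .logger module. The sketch below shows one plausible shape for that helper; it is an assumption for illustration only and may not match the actual contents of lite_llama/utils/logger.py.

# Assumed shape of lite_llama/utils/logger.py -- illustrative sketch, not the real file.
import logging

def get_logger(name: str) -> logging.Logger:
    """Return a named logger, attaching a stream handler only on first use."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

Using a per-module factory like this keeps logger names tied to the importing module (e.g. lite_llama.utils.common), which is why the relative import plus get_logger(__name__) pattern is preferable to sharing a single log instance.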