harleyszhang · yanghailong-git · Jul 30, 2025
diff --git a/generate.py b/generate.py
@@ -73,9 +73,9 @@ def main(
         tokenizer_path=checkpoint_path,
         max_gpu_num_blocks=max_gpu_num_blocks,
         max_seq_len=max_seq_len,
-        load_model=load_model,
+        # load_model=load_model,
         compiled_model=compiled_model,
-        triton_weight=triton_weight,
+        # triton_weight=triton_weight,
         device=device,
     )
 

diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
@@ -67,7 +67,8 @@ def get_gpu_memory(gpu_type="amd", device_id="0"):
         elif gpu_type == "cpu":
             return None
     except Exception as e:
-        from utils.logger import log
+        from .logger import get_logger
+        log = get_logger(__name__)
 
         log.warning(f"Unable to fetch GPU memory: {e}")
         return None
@@ -82,7 +83,8 @@ def count_tokens(texts: List[str], tokenizer) -> int:
 
 
 def get_model_type(checkpoint_path: str) -> str | None:
-    from utils.logger import log
+    from .logger import get_logger
+    log = get_logger(__name__)
 
     model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]