mavaa · mavaa · May 24, 2024 · May 24, 2024 · May 24, 2024 · May 24, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/llm4decompile"]
+	path = submodules/llm4decompile
+	url = https://github.com/albertan017/LLM4Decompile
diff --git a/requirements_llm4decompile.txt b/requirements_llm4decompile.txt
@@ -0,0 +1,6 @@
+tqdm
+transformers
+loguru
+text_generation
+vllm
+flash-attn
diff --git a/run_llm4decompile.py b/run_llm4decompile.py
@@ -0,0 +1,39 @@
+"""Decompile one assembly prompt with LLM4Decompile and print the result.
+
+Usage: python run_llm4decompile.py [PROMPT_FILE [ORIGINAL_C_FILE]]
+"""
+import sys
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+# Prompt used when no PROMPT_FILE argument is given (optimization level O0).
+DEFAULT_PROMPT_PATH = "data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm"
+
+if __name__ == '__main__':
+    # Both arguments are optional, so a bare invocation still uses the default prompt.
+    prompt_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PROMPT_PATH
+    # Original C source to print next to the result; skipped when not supplied
+    # because no default location for it is known.
+    original_path = sys.argv[2] if len(sys.argv) > 2 else None
+
+    model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 Model
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
+
+    # The prompt files are written as UTF-8 by transform_cases_for_llm4decompile.py.
+    with open(prompt_path, 'r', encoding='utf-8') as f:
+        asm_func = f.read()
+    inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        # Max length is 4096; max_new_tokens should stay below that range.
+        outputs = model.generate(**inputs, max_new_tokens=2048)
+    # Slice off the prompt tokens and the last generated token before decoding.
+    c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
+
+    if original_path is not None:
+        with open(original_path, 'r') as f:  # original file
+            func = f.read()
+        # Note: only one function is decompiled, while the original file may contain several.
+        print(f'original function:\n{func}')
+    print(f'decompiled function:\n{c_func_decompile}')
diff --git a/run_llm4decompile.sh b/run_llm4decompile.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Activate the local virtualenv and source the local .env file.
+source .venv/bin/activate
+source .env
+
+# Run the rocm/pytorch image interactively with the current directory mounted
+# at /src. "$PWD" replaces "./" because older Docker CLI releases only accept
+# absolute host paths for -v bind mounts.
+sudo docker run -it \
+    --network=host \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --group-add=video \
+    --ipc=host \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    --shm-size 8G \
+    -v "$PWD":/src \
+    -w /src \
+    rocm/pytorch
diff --git a/submodules/llm4decompile b/submodules/llm4decompile
diff --git a/transform_cases_for_llm4decompile.py b/transform_cases_for_llm4decompile.py
@@ -0,0 +1,50 @@
+"""Turn each disassembly in data_decompile_eval/disassemblies into an LLM4Decompile prompt file."""
+import os
+from src.util import create_folder_if_not_exists
+
+base_dir = "data_decompile_eval/disassemblies"
+prompt_folder = os.path.join('data_decompile_eval', 'llm4decompileprompts')
+create_folder_if_not_exists(prompt_folder)
+
+# Label that introduces the section of the disassembly to keep.
+TEXT_MARKER = '<.text>:'
+# Text wrapped around the cleaned assembly to form the prompt.
+PROMPT_BEFORE = "# This is the assembly code:\n"
+PROMPT_AFTER = "\n# What is the source code?\n"
+
+
+def clean_disassembly(asm):
+    """Return the instruction text of the ``<.text>`` section of *asm*.
+
+    The section runs from the last ``<.text>:`` label to the first blank line
+    after it. Per line, the leading tab-separated columns are dropped (at most
+    two, always keeping the last column) and anything after ``#`` is removed.
+
+    Raises:
+        ValueError: if *asm* contains no ``<.text>:`` label.
+    """
+    if TEXT_MARKER not in asm:
+        raise ValueError("compile fails")
+    section = TEXT_MARKER + asm.split(TEXT_MARKER)[-1].split('\n\n')[0]
+    cleaned = []
+    for line in section.split("\n"):
+        fields = line.split("\t")
+        # Skip short lines containing '00' (fewer than three tab-separated fields).
+        if len(fields) < 3 and '00' in line:
+            continue
+        first_kept = min(len(fields) - 1, 2)
+        instruction = "\t".join(fields[first_kept:])  # remove the binary code
+        cleaned.append(instruction.split("#")[0].strip())  # remove the comments
+    return "\n".join(cleaned).strip()
+
+
+for output_file in sorted(os.listdir(base_dir)):
+    prompt_filename = os.path.join(prompt_folder, f"{output_file}_prompt.txt")
+    file_path = os.path.join(base_dir, output_file)
+    print(f"reading {file_path}")
+    with open(file_path) as f:  # asm file
+        input_asm = clean_disassembly(f.read())
+    input_asm_prompt = PROMPT_BEFORE + input_asm + PROMPT_AFTER
+    # Keep the '_.asm' suffix: run_llm4decompile.py opens a file with this exact naming.
+    with open(prompt_filename + '_' + '.asm', 'w', encoding='utf-8') as f:
+        f.write(input_asm_prompt)