diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..efcecde
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/llm4decompile"]
+	path = submodules/llm4decompile
+	url = https://github.com/albertan017/LLM4Decompile
diff --git a/requirements_llm4decompile.txt b/requirements_llm4decompile.txt
new file mode 100644
index 0000000..ce1dd92
--- /dev/null
+++ b/requirements_llm4decompile.txt
@@ -0,0 +1,6 @@
+tqdm
+transformers
+loguru
+text_generation
+vllm
+flash-attn
diff --git a/run_llm4decompile.py b/run_llm4decompile.py
new file mode 100644
index 0000000..146766f
--- /dev/null
+++ b/run_llm4decompile.py
@@ -0,0 +1,22 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+if __name__ == '__main__':
+    model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
+
+    asm_path = 'data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm'
+    with open(asm_path, 'r') as f:  # assembly prompt, optimization level O0
+        asm_func = f.read()
+    inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=2048)  # max length is 4096; max_new_tokens must stay below that
+    c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])  # strip the prompt tokens and trailing EOS
+
+    source_path = 'task_000.c'  # placeholder: point this at the original C source for comparison
+    with open(source_path, 'r') as f:  # original file
+        func = f.read()
+
+    print(f'original function:\n{func}')  # note: only one function is decompiled; the original file may contain several
+    print(f'decompiled function:\n{c_func_decompile}')
diff --git a/run_llm4decompile.sh b/run_llm4decompile.sh
new file mode 100755
index 0000000..d9d35b9
--- /dev/null
+++ b/run_llm4decompile.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+source .venv/bin/activate
+source .env
+
+sudo docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size 8G -v ./:/src -w /src rocm/pytorch
diff --git a/submodules/llm4decompile b/submodules/llm4decompile
new file mode 160000
index 0000000..aa23b74
--- /dev/null
+++ b/submodules/llm4decompile
@@ -0,0 +1 @@
+Subproject commit aa23b74ed144f50944755f772cb8a186540edfc5
diff --git a/transform_cases_for_llm4decompile.py b/transform_cases_for_llm4decompile.py
new file mode 100644
index 0000000..50690b0
--- /dev/null
+++ b/transform_cases_for_llm4decompile.py
@@ -0,0 +1,31 @@
+import os
+from src.util import create_folder_if_not_exists
+
+base_dir = "data_decompile_eval/disassemblies"
+prompt_folder = os.path.join('data_decompile_eval', 'llm4decompileprompts')
+create_folder_if_not_exists(prompt_folder)
+
+for output_file in sorted(os.listdir(base_dir)):
+    prompt_filename = os.path.join(prompt_folder, f"{output_file}_prompt.txt")
+    file_path = os.path.join(base_dir, output_file)
+    print(f"reading {file_path}")
+    with open(file_path) as f:  # disassembly (.asm) file
+        asm = f.read()
+    if '<.text>:' not in asm:  # the whole .text section is extracted here rather than a single function
+        raise ValueError(f"no .text section found in {file_path}")
+    # keep everything from the .text label up to the first blank line
+    asm = '<.text>:' + asm.split('<.text>:')[-1].split('\n\n')[0]
+    asm_clean = ""
+    for tmp in asm.split("\n"):
+        if len(tmp.split("\t")) < 3 and '00' in tmp:
+            continue
+        idx = min(len(tmp.split("\t")) - 1, 2)
+        tmp_asm = "\t".join(tmp.split("\t")[idx:])  # remove the address and raw byte columns
+        tmp_asm = tmp_asm.split("#")[0].strip()  # remove trailing comments
+        asm_clean += tmp_asm + "\n"
+    input_asm = asm_clean.strip()
+    before = "# This is the assembly code:\n"  # prompt prefix
+    after = "\n# What is the source code?\n"  # prompt suffix
+    input_asm_prompt = before + input_asm + after
+    with open(prompt_filename + '_.asm', 'w', encoding='utf-8') as f:  # e.g. task_000_d.txt_prompt.txt_.asm, the path run_llm4decompile.py reads
+        f.write(input_asm_prompt)
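Usage sketch (assumed workflow; the disassembly step that populates data_decompile_eval/disassemblies is expected to exist elsewhere in the repo): run_llm4decompile.sh drops into a rocm/pytorch container with the repo mounted at /src, and inside it the prompts are built first and then fed to the model.

    # inside the container started by run_llm4decompile.sh
    pip install -r requirements_llm4decompile.txt
    python transform_cases_for_llm4decompile.py   # writes prompts to data_decompile_eval/llm4decompileprompts
    python run_llm4decompile.py                   # decompiles one sample prompt with llm4decompile-6.7b-v1.5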