Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "submodules/llm4decompile"]
path = submodules/llm4decompile
url = https://github.com/albertan017/LLM4Decompile
7 changes: 7 additions & 0 deletions requirements_llm4decompile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
tqdm
transformers
loguru
text_generation
vllm
flash-attn
20 changes: 20 additions & 0 deletions run_llm4decompile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

if __name__ == '__main__':
    # Decompile a single assembly prompt with LLM4Decompile and print the
    # result, next to the original C source when that file is available.
    model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 Model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.bfloat16
    ).cuda()

    # Common stem shared by the prompt file (<stem>_.asm) and the optional
    # original source (<stem>.c). The original script referenced an undefined
    # `fileName` for the latter and crashed with NameError after generation.
    # NOTE(review): confirm where the original .c sources actually live; the
    # stem below simply mirrors the prompt path used by this script.
    file_stem = "data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt"

    # Prompt files are written as UTF-8 by transform_cases_for_llm4decompile.py.
    with open(file_stem + '_.asm', 'r', encoding='utf-8') as f:
        asm_func = f.read()

    inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        # Upstream note: max length is 4096, so max_new_tokens should stay
        # below that range — TODO confirm for this checkpoint.
        outputs = model.generate(**inputs, max_new_tokens=2048)

    # Keep only the newly generated tokens: skip the prompt and drop the final
    # token (presumably the end-of-sequence marker — verify).
    c_func_decompile = tokenizer.decode(outputs[0][prompt_len:-1])

    # The original source is only used for a side-by-side print, so a missing
    # file must not discard the decompilation result.
    try:
        with open(file_stem + '.c', 'r') as f:  # original file
            func = f.read()
    except FileNotFoundError:
        func = None

    if func is not None:
        # Note we only decompile one function, where the original file may
        # contain multiple functions.
        print(f'original function:\n{func}')
    else:
        print(f'original function: <not found: {file_stem}.c>')
    print(f'decompiled function:\n{c_func_decompile}')
6 changes: 6 additions & 0 deletions run_llm4decompile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Start an interactive rocm/pytorch container with the current directory
# mounted at /src and used as the working directory.

# Activate the local virtualenv and load .env into this shell.
# NOTE(review): the docker command below passes no -e/--env-file, so nothing
# sourced here is forwarded into the container — confirm this is intended.
source .venv/bin/activate
source .env

# Use an absolute host path for the bind mount: older Docker CLIs reject a
# relative "./" source or interpret it as a volume name.
sudo docker run -it \
  --network=host \
  --device=/dev/kfd \
  --device=/dev/dri \
  --group-add=video \
  --ipc=host \
  --cap-add=SYS_PTRACE \
  --security-opt seccomp=unconfined \
  --shm-size 8G \
  -v "$(pwd)":/src \
  -w /src \
  rocm/pytorch
1 change: 1 addition & 0 deletions submodules/llm4decompile
Submodule llm4decompile added at aa23b7
34 changes: 34 additions & 0 deletions transform_cases_for_llm4decompile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os
from src.util import create_folder_if_not_exists

# Convert every disassembly in `base_dir` into an LLM4Decompile prompt file
# written to `prompt_folder` as "<input name>_prompt.txt_.asm".
base_dir = "data_decompile_eval/disassemblies"
prompt_folder = os.path.join('data_decompile_eval', 'llm4decompileprompts')
create_folder_if_not_exists(prompt_folder)

# Label that opens the block of the disassembly we keep.
_SECTION_MARKER = '<.text>:'
_PROMPT_BEFORE = "# This is the assembly code:\n"
_PROMPT_AFTER = "\n# What is the source code?\n"


def _clean_asm_line(line):
    """Return the instruction text of one disassembly line, or None to drop it.

    Lines with fewer than three tab-separated fields that contain '00' are
    dropped. For the others, up to two leading tab-separated fields (the
    "binary code" in the original comment) and any trailing '#' comment are
    removed.
    """
    fields = line.split("\t")
    if len(fields) < 3 and '00' in line:
        return None
    first_kept = min(len(fields) - 1, 2)
    return "\t".join(fields[first_kept:]).split("#")[0].strip()


def _build_prompt(asm):
    """Build the prompt text from the raw disassembly text of one file.

    Raises:
        ValueError: if the disassembly contains no `<.text>:` label.
    """
    if _SECTION_MARKER not in asm:
        raise ValueError("compile fails")
    # Keep the text after the last marker, up to the first blank line.
    section = _SECTION_MARKER + asm.split(_SECTION_MARKER)[-1].split('\n\n')[0]
    cleaned = (_clean_asm_line(line) for line in section.split("\n"))
    body = "\n".join(line for line in cleaned if line is not None).strip()
    return _PROMPT_BEFORE + body + _PROMPT_AFTER


for output_file in sorted(os.listdir(base_dir)):
    prompt_filename = os.path.join(prompt_folder, f"{output_file}_prompt.txt")
    file_path = os.path.join(base_dir, output_file)
    print(f"reading {file_path}")
    # Read with the same explicit encoding that is used for writing below.
    with open(file_path, encoding='utf-8') as f:  # asm file
        input_asm_prompt = _build_prompt(f.read())
    with open(prompt_filename + '_' + '.asm', 'w', encoding='utf-8') as f:
        f.write(input_asm_prompt)