"""Gradio app that recompiles decompiled C source and compares it to target bytes.

The decompiled source is compiled to an object file, its ``.text`` bytes are
extracted, and the edit distance to the target function's bytes is reported
alongside hexdumps and disassembly of both.
"""

import os
import shlex
import subprocess
import tempfile
from typing import Optional, Tuple

import editdistance
import gradio as gr
from hexdump2 import hexdump

description = """This is a space testing a method for evaluating the quality of decompilation.

Currently unhandled features:
* PIC stuff
* Global references
* Function calls
* Wildcards in target function?
* How to extract compilable decompilation from decompilers?
"""

# Number of leading lines dropped from objdump's stdout before display
# (presumably its file-format banner and section header -- confirm against
# the objdump version in use).
OBJDUMP_HEADER_LINES = 7


def trim(str: str, n: int) -> str:
    """Return ``str`` without its first ``n`` lines, re-joined with newlines.

    The parameter name shadows the ``str`` builtin; it is kept so existing
    keyword callers keep working.
    """
    return "\n".join(str.splitlines()[n:])


def trim_objdump(str: str) -> str:
    """Strip the leading ``OBJDUMP_HEADER_LINES`` lines from objdump output."""
    return trim(str, OBJDUMP_HEADER_LINES)


def disassemble_bytes(byte_data: bytes, architecture: str) -> str:
    """Disassemble raw machine code with ``objdump -D -b binary``.

    Args:
        byte_data: Raw bytes to disassemble.
        architecture: Value passed to objdump's ``-m`` option.

    Returns:
        objdump's stdout with the header lines removed. objdump's exit status
        is not checked, so the result is empty if it printed nothing.
    """
    # objdump needs a real file; the directory (and the file in it) is removed
    # when the block exits, so nothing is left behind per request.
    with tempfile.TemporaryDirectory() as workdir:
        bin_path = os.path.join(workdir, "target.bin")
        with open(bin_path, "wb") as bin_file:
            bin_file.write(byte_data)

        listing = subprocess.run(
            ["objdump", "-D", "-b", "binary", "-m", architecture, bin_path],
            capture_output=True,
            text=True,
        ).stdout

    return trim_objdump(listing)


def compile(
    compiler: str, flags: str, source: str
) -> Tuple[Optional[bytes], str, str]:
    """Compile C source to an object file and extract its code bytes.

    The name shadows the ``compile`` builtin; it is kept because it is this
    module's public entry point.

    NOTE(review): ``compiler`` and ``flags`` are passed through from the UI.
    They are executed without a shell, but any executable can still be named.

    Args:
        compiler: Compiler executable to invoke.
        flags: Extra command-line flags, split with ``shlex.split``.
        source: Source code to compile.

    Returns:
        ``(compiled_bytes, compile_output, disassembly)``. ``compiled_bytes``
        is ``None`` when the compiler exits non-zero; ``compile_output`` is the
        compiler's stdout followed by its stderr; ``disassembly`` is the
        trimmed output of ``objdump -d`` (empty when compilation failed).
    """
    # All intermediates live in one directory that is deleted on exit.
    with tempfile.TemporaryDirectory() as workdir:
        # The ".c" suffix is kept from the original temp-file naming; the
        # compiler picks the language from it.
        c_path = os.path.join(workdir, "source.c")
        o_path = os.path.join(workdir, "source.o")
        raw_path = os.path.join(workdir, "text.raw")

        with open(c_path, "wb") as c_file:
            c_file.write(source.encode())

        # Compile the C file to an object file.
        result = subprocess.run(
            [compiler, "-c", c_path] + shlex.split(flags) + ["-o", o_path],
            capture_output=True,
            text=True,
        )
        compile_output = result.stdout + result.stderr

        if result.returncode != 0:
            # No usable object file: skip objcopy/objdump, which would only
            # fail and produce empty output anyway.
            return None, compile_output, ""

        # Dump the code sections as raw bytes.
        subprocess.run(
            [
                "objcopy",
                "--only-section",
                ".text",
                # XXX in reality we should probably look at the sections
                "--only-section",
                ".text.*",
                "-O",
                "binary",
                o_path,
                raw_path,
            ]
        )
        # Read by path after objcopy has finished rather than through a handle
        # opened beforehand; a missing file is treated as "no bytes".
        try:
            with open(raw_path, "rb") as raw_file:
                compiled_bytes = raw_file.read()
        except FileNotFoundError:
            compiled_bytes = b""

        # Disassemble the object file.
        listing = subprocess.run(
            ["objdump", "-d", o_path],
            capture_output=True,
            text=True,
        ).stdout

    return compiled_bytes, compile_output, trim_objdump(listing)


def predict(
    target_bytes: str, source: str, compiler: str, flags: str, architecture: str
) -> Tuple[str, str, int, str, str, str]:
    """Gradio callback: compile ``source`` and compare it to the target bytes.

    Args:
        target_bytes: Target function bytes as a hex string.
        source: Decompiled C source code.
        compiler: Compiler executable to invoke.
        flags: Compiler flags as a single string.
        architecture: Architecture name for disassembling the target bytes.

    Returns:
        ``(compiled hexdump, target hexdump, edit distance, compiler output,
        compiled disassembly, target disassembly)``. When compilation fails the
        first element is ``"Compilation failed"`` and the distance is ``-1``.

    Raises:
        ValueError: If ``target_bytes`` is not valid hexadecimal.
    """
    target = bytes.fromhex(target_bytes)
    compiled_bytes, compile_output, compiled_disassembly = compile(
        compiler, flags, source
    )
    target_disassembly = disassemble_bytes(target, architecture)
    target_dump = hexdump(target, result="return")

    if compiled_bytes is None:
        return (
            "Compilation failed",
            target_dump,
            -1,
            compile_output,
            compiled_disassembly,
            target_disassembly,
        )

    return (
        hexdump(compiled_bytes, result="return"),
        target_dump,
        editdistance.eval(compiled_bytes, target),
        compile_output,
        compiled_disassembly,
        target_disassembly,
    )


def run() -> None:
    """Build the Gradio interface and serve it on 0.0.0.0:7860."""
    demo = gr.Interface(
        fn=predict,
        description=description,
        inputs=[
            gr.Textbox(
                lines=10,
                label="Bytes of Target Function (in hex)",
                value="b8 2a 00 00 00 c3",
            ),
            gr.Textbox(
                lines=10,
                label="Decompiled C Source Code",
                value="int foo() { return 0; }",
            ),
            gr.Textbox(label="Compiler", value="g++"),
            gr.Textbox(label="Compiler Flags", value="-O2"),
            gr.Textbox(label="Architecture (for disassembler)", value="i386"),
        ],
        outputs=[
            gr.Textbox(label="Compiled bytes"),
            gr.Textbox(label="Target bytes"),
            gr.Number(label="Edit distance (lower is better)"),
            gr.Textbox(label="Compiler Output"),
            gr.Textbox(label="Compiled Disassembly"),
            gr.Textbox(label="Target Disassembly"),
        ],
    )
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)


if __name__ == "__main__":
    run()