# This file is part of Metasm, the Ruby assembly manipulation suite
# Copyright (C) 2006-2009 Yoann GUILLOT
# Licence is LGPL, see LICENCE in the top-level directory
# To use your own patterns, create a script that defines Deobfuscate::Patterns, then eval() this file.
# Use your script as argument to --plugin
# This script is to be used with the --plugin option of samples/disassemble(-gtk).rb
# It holds methods to ease the definition of instruction patterns that are to be replaced
# by another arbitrary instruction sequence, using mostly a regexp syntax
# The pattern search&replace is done every time the disassembler
# finds a new instruction, through the callback_newinstr callback.
# The patterns can use shortcuts for frequently-used regexps (like 'any machine registers'),
# defined in the PatternMacros hash.
# The patterns are matched first against the sequence of instruction opcode names, then
# each instruction is rendered as text (using Instruction#to_s), and the global regexp
# is checked.
# Backreferences can be used in the substitution instruction sequence, through the %1 ... %9
# special values.
# A pattern consists of a sequence of regexp for instructions, separated by ' ; '
# Each sub-regexp should not match multiple instructions (i.e. a pattern matches a fixed-length
# instruction sequence, whose length equals the number of ' ; '-separated regexps)
# The first word of each regexp should match only the instruction opcode name.
# The substitution may be a Proc, which will receive |dasm object, matched decodedinstr list| as
# arguments, and should return:
# a String, holding a sequence of instructions separated by ' ; ', which will be parsed by the CPU (no labels allowed)
# nil if the pattern did not match, continue searching
# an Array of Instruction/DecodedInstruction. If the array is the original di list, same as returning nil
# If the substitution array is different from the matched sequence, the new instructions are passed
# to dasm.replace_instrs, which will patch the disassembler decoded instruction graph ; and each
# new instruction is passed through the callback once more, allowing for recursive patterns.
module Deobfuscate
  # Shortcuts usable inside pattern strings ; Deobfuscate.init textually replaces
  # each key by its regexp fragment.
  # special constructs : %i => an integer (immediate/standard label)
  #                      %r => standard x86 register (except esp), all sizes
  #                      %m => modr/m 32 (memory indirection or reg)
  # Only defined here if the loading script did not define it beforehand.
  PatternMacros = {
    '%i' => '(?:-|loc_|sub_|xref_)?[0-9][0-9a-fA-F]*h?',
    '%r' => '(?:[re]?[abcd]x|[re]?[sd]i|[re]?bp|[abcd][lh])',
    '%m' => '(?:(?:dword ptr )?\[.*?\]|eax|ebx|ecx|edx|edi|esi|ebp)',
  } if not defined? PatternMacros

  # pattern => substitution table
  # instructions are separated by ' ; '
  # instruction must be '<simple regexp matching opcode> <arbitrary regexp>'
  # in the pattern target, %1-%9 are used for backreferences from the regexp match
  # a Proc target receives |dasm, matched di list| and returns a String, an Array or nil
  # Only defined here if the loading script did not define it beforehand.
  Patterns = {
    'nop ; (.*)' => '%1',	# concat 'nop' into following instruction
    'mov (%r|esp), \1' => 'nop',
    'lea (%r|esp), (?:dword ptr )?\[\1(?:\+0)?\]' => 'nop',
    '(.*)' => lambda { |dasm, list|	# remove 'jmp imm' preceding us without interfering with running dasm
      if pdi = prev_di(dasm, list.last) and pdi.opcode.name == 'jmp' and
          pdi.instruction.args[0].kind_of? Metasm::Expression
        dasm.replace_instrs(pdi.address, pdi.address, [])
      end
      # this entry works by side effect only: always report 'no match' so that
      # the caller keeps searching and never tries to use our value as a substitution
      nil
    },
    #'call %i ; pop (%r)' => lambda { |dasm, list| "mov %1, #{list.first.next_addr}" },
  } if not defined? Patterns

  # returns an array of strings matching the regexp (only |,?,[], non-nested allowed, no special chars)
  # expand_regexp['a[bcd]?(ef|gh)'] => [abef acef adef aef abgh acgh adgh agh]
  # nil and '' expand to [''], '.*' is returned untouched as ['.*']
  # raises a RuntimeError ("bad pattern ...") on any construct it cannot expand
  def self.expand_regexp(str)
    case str
    when nil, '', '.*'; return [str.to_s]
    when /^\\\./
      # escaped dot: a literal '.'
      l1, p2 = ['.'], $'
    when /^(\w+)(\?)?/
      # plain word ; a trailing '?' makes only its last char optional
      s1, q, p2 = $1, $2, $'
      l1 = (q ? [s1, s1.chop] : [s1])
    when /^\[(.*?)\](\?)?/
      # character class: one alternative per char
      p1, q, p2 = $1, $2, $'
      l1 = p1.split(//)
      l1 << '' if q
    when /^\((?:\?:)?(.*?)\)(\?)?/
      # (capturing or not) group: expand each '|'-separated alternative
      p1, q, p2 = $1, $2, $'
      l1 = p1.split('|').map { |p| expand_regexp(p) }.flatten
      l1 << '' if q
    else raise "bad pattern #{str.inspect}"
    end
    # combine every expansion of the head with every expansion of the remainder
    expand_regexp(p2).map { |s2| l1.map { |s1_| s1_ + s2 } }.flatten.uniq
  end

  # find the instr preceding di ; follows from_normal if it is a single element array
  # returns nil when di starts its block and the block does not have exactly one from_normal
  def self.prev_di(dasm, di)
    if di.block.list.first != di
      # same block: the instruction listed just before di
      di.block.list[di.block.list.index(di)-1]
    elsif di.block.from_normal.to_a.length == 1
      # first of its block: the single normal predecessor, looked up through the dasm
      dasm.di_at(di.block.from_normal.to_a.first)
    end
  end

  # preprocess the pattern list to optimize matching on each new instruction
  # last pattern instr opname => prev instr opname => prev instr opname => :pattern => [patterns]
  # next_hash is the tree node to fill, next_ops the opcode regexps still to be stored
  # (consumed from the end), pattern the full pattern string recorded at the leaf
  def self.generate_precalc(next_hash, next_ops, pattern)
    if next_ops.empty?
      # whole opcode sequence consumed: record the pattern and stop recursing
      next_hash[:pattern] ||= []
      next_hash[:pattern] << pattern
    else
      # an opcode regexp that cannot be expanded is stored under the '.*' wildcard
      (expand_regexp(next_ops[-1]) rescue ['.*']).each { |op|
        nh = next_hash[op] ||= {}
        generate_precalc(nh, next_ops[0...-1], pattern)
      }
    end
  end

  PrecalcPatterns = {} if not defined? PrecalcPatterns

  # replace Macros in patterns, do some precalc to speedup pattern matching
  # rewrites the keys of Patterns in place and fills PrecalcPatterns
  def self.init
    Patterns.keys.each { |pat|
      # replace PatternMacros in patterns
      # block form of gsub!: the macro text is inserted verbatim, the string form
      # would interpret backslash sequences (\1, \\) found in the macro value
      newp = pat.dup
      PatternMacros.each { |mk, mv| newp.gsub!(mk) { mv } }
      Patterns[newp] = Patterns.delete(pat) if pat != newp

      # TODO handle instructions with prefix (lock/rep), conditional regexp over multiple instructions..
      ops = newp.split(' ; ').map { |instr| instr[/^\S+/] }
      generate_precalc(PrecalcPatterns, ops, newp)
    }
  end

  # the actual disassembler callback
  # checks the current instruction opname against the end of patterns using precomputed tree, then check previous instr etc
  # once full pattern may match, convert each instr to string, and run the regexp match
  # on match, reuse the captures in the pattern target, parse the target, generate decoded instrs, and replace in the dasm graph.
  # on match, rerun the callback on each replaced instruction (for recursive patterns)
  # returns di when nothing was replaced, the value kept from the recursive calls after
  # a successful replacement, or nil when dasm.replace_instrs reports a failure
  def self.newinstr_callback(dasm, di)
    # compute the merged subtree of t1 and t2
    # merges patterns if found
    mergetree = lambda { |t1, t2|
      if t1 and t2
        case t1
        when Array; t1 + t2
        when Hash; (t1.keys | t2.keys).inject({}) { |t, k| t.update k => mergetree[t1[k], t2[k]] }
        end
      else t1 || t2
      end
    }

    di_seq = [di]
    lastdi = di
    tree = PrecalcPatterns
    tree = mergetree[tree['.*'], tree[lastdi.instruction.opname]]
    newinstrs = match = nil
    # walk the Precalc tree
    while tree
      if tree[:pattern]
        strs = di_seq.map { |pdi| pdi.instruction.to_s }
        break if tree[:pattern].find { |pat|
          if match = /^#{pat}$/.match(strs.join(' ; '))
            newinstrs = Patterns[pat]
            newinstrs = newinstrs[dasm, di_seq] if newinstrs.kind_of? Proc
            newinstrs = nil if newinstrs == di_seq
          else newinstrs = nil
          end
          # the block value decides whether find stops on this pattern
          newinstrs
        } or tree.length == 1	# a node holding only :pattern has no longer pattern to try
      end

      if lastdi = prev_di(dasm, lastdi)
        di_seq.unshift lastdi
        tree = mergetree[tree['.*'], tree[lastdi.instruction.opname]]
      else break
      end
    end

    # match found : create instruction stream, replace in dasm, recurse
    if newinstrs
      # replace %1-%9 by the matched substrings
      newinstrs = newinstrs.gsub(/%(\d)/) { match.captures[$1.to_i-1] }.split(' ; ').map { |str| dasm.cpu.parse_instruction(str) } if newinstrs.kind_of? String
      if newinstrs.last.kind_of? Metasm::Instruction and newinstrs.last.opname != 'jmp' and
          lastdi.address + di_seq.inject(-di.bin_length) { |len, i| len + i.bin_length } != di.address
        # ensure that the last instr ends the same place as the original last instr (to allow disassemble_block to continue)
        newinstrs << dasm.cpu.parse_instruction("jmp #{Metasm::Expression[di.next_addr]}")
        # nop ; jmp => jmp
        newinstrs.shift if newinstrs.length >= 2 and newinstrs.first.kind_of? Metasm::Instruction and newinstrs.first.opname == 'nop'
      end
      # remove instructions from the match to have only 2 linked blocks passed to replace_instrs
      unused = di_seq[1..-2] || []
      unused.delete_if { |udi| udi.block.address == di_seq[0].block.address or udi.block.address == di_seq[-1].block.address }
      dasm.replace_instrs(unused.shift.address, unused.shift.address, []) while unused.length > 1
      dasm.replace_instrs(unused.first.address, unused.first.address, []) if not unused.empty?
      # patch the dasm graph
      if dasm.replace_instrs(lastdi.address, di.address, newinstrs)
        puts ' deobfuscate', di_seq, ' into', newinstrs, ' ---' if $DEBUG
        # recurse, keep the last generated di to return to caller as replacement
        newinstrs.each { |bdi| di = newinstr_callback(dasm, bdi) || di }
      else
        di = nil
      end
    end

    di
  end

  # call newinstr_callback on all existing instructions of dasm
  def self.deobfuscate_existing(dasm)
    dasm.each_instructionblock { |b|
      # iterate over a copy: the callback may replace instructions of the block
      b.list.dup.each { |di| newinstr_callback(dasm, di) }
    }
  end

  # calls dasm.merge_blocks(true) on all instruction blocks to merge sequences of blocks
  # a block is merged into its predecessor only when it has a single from_normal, that
  # predecessor has a single to_normal and does not end on a :setip instruction
  def self.merge_blocks(dasm)
    dasm.each_instructionblock { |b|
      if pv = dasm.di_at(b.from_normal.to_a.first) and not pv.block.list.last.opcode.props[:setip] and
          b.from_normal.length == 1 and pv.block.to_normal.to_a.length == 1
        dasm.merge_blocks(pv.block, b, true)
      end
    }
  end
end
# update DecodedInstr.to_s to include instr length
class Metasm::DecodedInstruction
  # Render as "<address> +<bin_length> <instruction>" ; the address part is left
  # empty when the instruction has no address.
  def to_s
    "#{Metasm::Expression[address] if address} +#{bin_length} #{instruction}"
  end
end
# do the pattern precalc (expands the macros in Patterns and fills PrecalcPatterns,
# without which the callback below has no pattern to match against)
Deobfuscate.init

# only hook into the disassembler when this file is evaluated with one as self
if self.kind_of? Metasm::Disassembler
  dasm = self
  # setup the newinstr callback
  dasm.callback_newinstr = lambda { |di| Deobfuscate.newinstr_callback(dasm, di) }
end