From bce161e6227e6aa6e9e5a850b96a3c36faa2bf47 Mon Sep 17 00:00:00 2001 From: bmc <> Date: Mon, 13 Feb 2006 22:52:01 +0000 Subject: [PATCH] * add utf-8 support, with overlong and invalid overlong encodings * update utf-7 support to a mode to specify "all" git-svn-id: file:///home/svn/incoming/trunk@3526 4d416f70-5f16-0410-b530-b9f4589650da --- lib/rex/text.rb | 130 +++++++++++++++++++++++++++++++++--------- lib/rex/text.rb.ut.rb | 21 ++++++- 2 files changed, 124 insertions(+), 27 deletions(-) diff --git a/lib/rex/text.rb b/lib/rex/text.rb index 24d29e94ef..182bf5cdd2 100644 --- a/lib/rex/text.rb +++ b/lib/rex/text.rb @@ -115,32 +115,37 @@ module Text end # - # Converts standard ASCII text to 16-bit unicode + # Converts standard ASCII text to a unicode string. # - # By default, little-endian unicode. By providing non-nil value for - # endian, convert to 16-bit big-endian unicode. NOTE, most systems require - # a marker to specify that the unicode text being provided is in - # big-endian. Use 0xFEFF, which is not a "legal" unicode code point. + # Supported unicode types include: utf-16le, utf-16be, utf-32le, utf-32be, utf-7, and utf-8 + # + # Providing 'mode' provides hints to the actual encoder as to how it should encode the string. Only UTF-7 and UTF-8 use "mode". + # + # utf-7 by default does not encode alphanumeric and a few other characters. By specifying the mode of "all", all of the characters are encoded, not just the non-alphanumeric set. + # to_unicode(str, 'utf-7', 'all') + # + # utf-8 specifies that alphanumeric characters are used directly, eg "a" is just "a". However, there exist 6 different overlong encodings of "a" that are technically not valid, but parse just fine in most utf-8 parsers. (0xC1A1, 0xE081A1, 0xF08081A1, 0xF8808081A1, 0xFC80808081A1, 0xFE8080808081A1). How many bytes to use for the overlong encoding is specified by providing 'size'. 
+ # to_unicode(str, 'utf-8', 'overlong', 2) # - def self.to_unicode(str='', mode = 'utf-16le') - case mode - when 'utf-16le' - return str.unpack('C*').pack('v*') - when 'utf-16be' - return str.unpack('C*').pack('n*') - when 'utf-32le' - return str.unpack('C*').pack('V*') - when 'utf-32be' - return str.unpack('C*').pack('N*') - when 'utf-7' - return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a| - out = '' - if a != '+' - out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '') - end - '+' + out + '-' - } - when 'utf-7-all' + # Many utf-8 parsers also allow invalid overlong encodings, where bits that are unused when encoding a single byte are modified. Many parsers will ignore these bits, rendering simple string matching ineffective for dealing with UTF-8 strings. There are many more invalid overlong encodings possible for "a". For example, three encodings are available for an invalid 2 byte encoding of "a". (0xC1E1, 0xC161, 0xC121). By specifying "invalid", a random invalid encoding is chosen for the given byte size. + # to_unicode(str, 'utf-8', 'invalid', 2) + # + # utf-7 defaults to 'normal' utf-7 encoding + # utf-8 defaults to 2 byte 'normal' encoding + # + def self.to_unicode(str='', type = 'utf-16le', mode = '', size = 2) + case type + when 'utf-16le' + return str.unpack('C*').pack('v*') + when 'utf-16be' + return str.unpack('C*').pack('n*') + when 'utf-32le' + return str.unpack('C*').pack('V*') + when 'utf-32be' + return str.unpack('C*').pack('N*') + when 'utf-7' + case mode + when 'all' return str.gsub(/./){ |a| out = '' if 'a' != '+' @@ -148,9 +153,82 @@ module Text end '+' + out + '-' } - else - raise TypeError, 'invalid utf type' + else + return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a| + out = '' + if a != '+' + out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '') + end + '+' + out + '-' + } end + when 'utf-8' + if size >= 2 and size <= 7 + string = '' + str.each_byte { |a| + if a > 0x7f || mode != '' + # ugh. 
turn a single byte into the binary representation of it, in array form + bin = [a].pack('C').unpack('B8')[0].split(//) + + # even more ugh. + bin.collect!{|a| a = a.to_i} + + out = Array.new(8 * size, 0) + + 0.upto(size - 1) { |i| + out[i] = 1 + out[i * 8] = 1 + } + + i = 0 + byte = 0 + bin.reverse.each { |bit| + if i < 6 + mod = (((size * 8) - 1) - byte * 8) - i + out[mod] = bit + else + byte = byte + 1 + i = 0 + redo + end + i = i + 1 + } + + if mode != '' + case mode + when 'overlong' + # do nothing, since we already handle this as above... + when 'invalid' + done = 0 + while done == 0 + bits = [7, 8, 15, 16, 23, 24, 31, 32, 41] + bits.each { |bit| + bit = (size * 8) - bit + if bit > 1 + set = rand(2) + if out[bit] != set + out[bit] = set + done = 1 + end + end + } + end + else + raise TypeError, 'Invalid mode. Only "overlong" and "invalid" are acceptable modes for utf-8' + end + end + string += [out.join('')].pack('B*') + else + string += [a].pack('C') + end + } + return string + else + raise TypeError, 'invalid utf-8 size' + end + else + raise TypeError, 'invalid utf type' + end end # diff --git a/lib/rex/text.rb.ut.rb b/lib/rex/text.rb.ut.rb index ee1bee784c..6997047cfe 100644 --- a/lib/rex/text.rb.ut.rb +++ b/lib/rex/text.rb.ut.rb @@ -23,7 +23,26 @@ class Rex::Text::UnitTest < Test::Unit::TestCase assert_equal("a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00", Rex::Text.to_unicode('abc', 'utf-32le'), 'utf-32le') assert_equal("\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c", Rex::Text.to_unicode('abc', 'utf-32be'), 'utf-32be') assert_equal("abc+-abc-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7'), 'utf-7') - assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7-all'), 'utf-7-all') + assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7', 'all'), 'utf-7-all') + + assert_equal("a\303\272", Rex::Text.to_unicode("a\xFA", 'utf-8')) + assert_equal("\xC1\xA1", 
Rex::Text.to_unicode('a', 'utf-8', 'overlong', 2), 'utf-8 overlong') + assert_equal("\xE0\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 3), 'utf-8 overlong') + assert_equal("\xF0\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 4), 'utf-8 overlong') + assert_equal("\xF8\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 5), 'utf-8 overlong') + assert_equal("\xFC\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 6), 'utf-8 overlong') + assert_equal("\xFE\x80\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 7), 'utf-8 overlong') + 100.times { + assert(["\xC1\x21","\xC1\x61","\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid')), 'utf-8 invalid') + assert(["\xE0\x01\x21","\xE0\x01\x61","\xE0\x01\xA1","\xE0\x01\xE1","\xE0\x41\x21","\xE0\x41\x61","\xE0\x41\xA1","\xE0\x41\xE1","\xE0\x81\x21","\xE0\x81\x61","\xE0\x81\xA1","\xE0\x81\xE1","\xE0\xC1\x21","\xE0\xC1\x61","\xE0\xC1\xA1","\xE0\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid', 3)), 'utf-8 invalid 3 byte') + } + + assert_raises(TypeError) { + Rex::Text.to_unicode('a', 'utf-8', '', 8) + } + assert_raises(TypeError) { + Rex::Text.to_unicode('a', 'utf-8', 'foo', 6) + } end def test_zlib