* add utf-8 support, with overlong and invalid overlong encodings
* update utf-7 support to take a mode to specify "all"
git-svn-id: file:///home/svn/incoming/trunk@3526 4d416f70-5f16-0410-b530-b9f4589650da
unstable
parent
9a634f0df9
commit
bce161e622
108
lib/rex/text.rb
108
lib/rex/text.rb
|
@ -115,15 +115,26 @@ module Text
|
|||
end
|
||||
|
||||
#
|
||||
# Converts standard ASCII text to 16-bit unicode
|
||||
# Converts standard ASCII text to a unicode string.
|
||||
#
|
||||
# By default, little-endian unicode. By providing non-nil value for
|
||||
# endian, convert to 16-bit big-endian unicode. NOTE, most systems require
|
||||
# a marker to specify that the unicode text being provided is in
|
||||
# big-endian. Use 0xFEFF, which is not a "legal" unicode code point.
|
||||
# Supported unicode types include: utf-16le, utf-16be, utf-32le, utf-32be, utf-7, and utf-8
|
||||
#
|
||||
def self.to_unicode(str='', mode = 'utf-16le')
|
||||
case mode
|
||||
# Providing 'mode' provides hints to the actual encoder as to how it should encode the string. Only UTF-7 and UTF-8 use "mode".
|
||||
#
|
||||
# By default, utf-7 does not encode alphanumeric and a few other characters. When the mode "all" is specified, all of the characters are encoded, not just the non-alphanumeric set.
|
||||
# to_unicode(str, 'utf-7', 'all')
|
||||
#
|
||||
# utf-8 specifies that alphanumeric characters are used directly, e.g. "a" is just "a". However, there exist 6 different overlong encodings of "a" that are technically not valid, but parse just fine in most utf-8 parsers. (0xC1A1, 0xE081A1, 0xF08081A1, 0xF8808081A1, 0xFC80808081A1, 0xFE8080808081A1). How many bytes to use for the overlong encoding is specified by providing 'size'.
|
||||
# to_unicode(str, 'utf-8', 'overlong', 2)
|
||||
#
|
||||
# Many utf-8 parsers also allow invalid overlong encodings, where bits that are unused when encoding a single byte are modified. Many parsers will ignore these bits, rendering simple string matching ineffective for dealing with UTF-8 strings. There are many more invalid overlong encodings possible for "a". For example, three encodings are available for an invalid 2 byte encoding of "a" (0xC1E1, 0xC161, 0xC121). By specifying "invalid", a random invalid encoding is chosen for the given byte size.
|
||||
# to_unicode(str, 'utf-8', 'invalid', 2)
|
||||
#
|
||||
# utf-7 defaults to 'normal' utf-7 encoding
|
||||
# utf-8 defaults to 2 byte 'normal' encoding
|
||||
#
|
||||
def self.to_unicode(str='', type = 'utf-16le', mode = '', size = 2)
|
||||
case type
|
||||
when 'utf-16le'
|
||||
return str.unpack('C*').pack('v*')
|
||||
when 'utf-16be'
|
||||
|
@ -133,14 +144,8 @@ module Text
|
|||
when 'utf-32be'
|
||||
return str.unpack('C*').pack('N*')
|
||||
when 'utf-7'
|
||||
return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a|
|
||||
out = ''
|
||||
if a != '+'
|
||||
out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '')
|
||||
end
|
||||
'+' + out + '-'
|
||||
}
|
||||
when 'utf-7-all'
|
||||
case mode
|
||||
when 'all'
|
||||
return str.gsub(/./){ |a|
|
||||
out = ''
|
||||
if 'a' != '+'
|
||||
|
@ -148,6 +153,79 @@ module Text
|
|||
end
|
||||
'+' + out + '-'
|
||||
}
|
||||
else
|
||||
return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a|
|
||||
out = ''
|
||||
if a != '+'
|
||||
out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '')
|
||||
end
|
||||
'+' + out + '-'
|
||||
}
|
||||
end
|
||||
when 'utf-8'
|
||||
if size >= 2 and size <= 7
|
||||
string = ''
|
||||
str.each_byte { |a|
|
||||
if a > 0x7f || mode != ''
|
||||
# ugh. turn a single byte into the binary representation of it, in array form
|
||||
bin = [a].pack('C').unpack('B8')[0].split(//)
|
||||
|
||||
# even more ugh.
|
||||
bin.collect!{|a| a = a.to_i}
|
||||
|
||||
out = Array.new(8 * size, 0)
|
||||
|
||||
0.upto(size - 1) { |i|
|
||||
out[i] = 1
|
||||
out[i * 8] = 1
|
||||
}
|
||||
|
||||
i = 0
|
||||
byte = 0
|
||||
bin.reverse.each { |bit|
|
||||
if i < 6
|
||||
mod = (((size * 8) - 1) - byte * 8) - i
|
||||
out[mod] = bit
|
||||
else
|
||||
byte = byte + 1
|
||||
i = 0
|
||||
redo
|
||||
end
|
||||
i = i + 1
|
||||
}
|
||||
|
||||
if mode != ''
|
||||
case mode
|
||||
when 'overlong'
|
||||
# do nothing, since we already handle this as above...
|
||||
when 'invalid'
|
||||
done = 0
|
||||
while done == 0
|
||||
bits = [7, 8, 15, 16, 23, 24, 31, 32, 41]
|
||||
bits.each { |bit|
|
||||
bit = (size * 8) - bit
|
||||
if bit > 1
|
||||
set = rand(2)
|
||||
if out[bit] != set
|
||||
out[bit] = set
|
||||
done = 1
|
||||
end
|
||||
end
|
||||
}
|
||||
end
|
||||
else
|
||||
raise TypeError, 'Invalid mode. Only "overlong" and "invalid" are acceptable modes for utf-8'
|
||||
end
|
||||
end
|
||||
string += [out.join('')].pack('B*')
|
||||
else
|
||||
string += [a].pack('C')
|
||||
end
|
||||
}
|
||||
return string
|
||||
else
|
||||
raise TypeError, 'invalid utf-8 size'
|
||||
end
|
||||
else
|
||||
raise TypeError, 'invalid utf type'
|
||||
end
|
||||
|
|
|
@ -23,7 +23,26 @@ class Rex::Text::UnitTest < Test::Unit::TestCase
|
|||
assert_equal("a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00", Rex::Text.to_unicode('abc', 'utf-32le'), 'utf-32le')
|
||||
assert_equal("\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c", Rex::Text.to_unicode('abc', 'utf-32be'), 'utf-32be')
|
||||
assert_equal("abc+-abc-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7'), 'utf-7')
|
||||
assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7-all'), 'utf-7-all')
|
||||
assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7', 'all'), 'utf-7-all')
|
||||
|
||||
assert_equal("a\303\272", Rex::Text.to_unicode("a\xFA", 'utf-8'))
|
||||
assert_equal("\xC1\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 2), 'utf-8 overlong')
|
||||
assert_equal("\xE0\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 3), 'utf-8 overlong')
|
||||
assert_equal("\xF0\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 4), 'utf-8 overlong')
|
||||
assert_equal("\xF8\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 5), 'utf-8 overlong')
|
||||
assert_equal("\xFC\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 6), 'utf-8 overlong')
|
||||
assert_equal("\xFE\x80\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 7), 'utf-8 overlong')
|
||||
100.times {
|
||||
assert(["\xC1\x21","\xC1\x61","\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid')), 'utf-8 invalid')
|
||||
assert(["\xE0\x01\x21","\xE0\x01\x61","\xE0\x01\xA1","\xE0\x01\xE1","\xE0\x41\x21","\xE0\x41\x61","\xE0\x41\xA1","\xE0\x41\xE1","\xE0\x81\x21","\xE0\x81\x61","\xE0\x81\xA1","\xE0\x81\xE1","\xE0\xC1\x21","\xE0\xC1\x61","\xE0\xC1\xA1","\xE0\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid', 3)), 'utf-8 invalid 3 byte')
|
||||
}
|
||||
|
||||
assert_raises(TypeError) {
|
||||
Rex::Text.to_unicode('a', 'utf-8', '', 8)
|
||||
}
|
||||
assert_raises(TypeError) {
|
||||
Rex::Text.to_unicode('a', 'utf-8', 'foo', 6)
|
||||
}
|
||||
end
|
||||
|
||||
def test_zlib
|
||||
|
|
Loading…
Reference in New Issue