* add utf-8 support, with overlong and invalid overlong encodings

* update utf-7 support to a mode to specify "all"


git-svn-id: file:///home/svn/incoming/trunk@3526 4d416f70-5f16-0410-b530-b9f4589650da
unstable
bmc 2006-02-13 22:52:01 +00:00
parent 9a634f0df9
commit bce161e622
2 changed files with 124 additions and 27 deletions

View File

@ -115,32 +115,37 @@ module Text
end
#
# Converts standard ASCII text to 16-bit unicode
# Converts standard ASCII text to a unicode string.
#
# By default, little-endian unicode. By providing non-nil value for
# endian, convert to 16-bit big-endian unicode. NOTE, most systems require
# a marker to specify that the unicode text being provided is in
# big-endian. Use 0xFEFF, which is not a "legal" unicode code point.
# Supported unicode types include: utf-16le, utf16-be, utf32-le, utf32-be, utf-7, and utf-8
#
# Providing 'mode' provides hints to the actual encoder as to how it should encode the string. Only UTF-7 and UTF-8 use "mode".
#
# utf-7 by default does not encode alphanumeric and a few other characters. By specifying the mode of "all", then all of the characters are encoded, not just the non-alphanumeric set.
# to_unicode(str, 'utf-7', 'all')
#
# utf-8 specifies that alphanumeric characters are used directly, eg "a" is just "a". However, there exist 6 different overlong encodings of "a" that are technically not valid, but parse just fine in most utf-8 parsers. (0xC1A1, 0xE081A1, 0xF08081A1, 0xF8808081A1, 0xFC80808081A1, 0xFE8080808081A1). How many bytes to use for the overlong enocding is specified providing 'size'.
# to_unicode(str, 'utf-8', 'overlong', 2)
#
def self.to_unicode(str='', mode = 'utf-16le')
case mode
when 'utf-16le'
return str.unpack('C*').pack('v*')
when 'utf-16be'
return str.unpack('C*').pack('n*')
when 'utf-32le'
return str.unpack('C*').pack('V*')
when 'utf-32be'
return str.unpack('C*').pack('N*')
when 'utf-7'
return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a|
out = ''
if a != '+'
out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '')
end
'+' + out + '-'
}
when 'utf-7-all'
# Many utf-8 parsers also allow invalid overlong encodings, where bits that are unused when encoding a single byte are modified. Many parsers will ignore these bits, rendering simple string matching to be ineffective for dealing with UTF-8 strings. There are many more invalid overlong encodings possible for "a". For example, three encodings are available for an invalid 2 byte encoding of "a". (0xC1E1 0xC161 0xC121). By specifying "invalid", a random invalid encoding is chosen for the given byte size.
# to_unicode(str, 'utf-8', 'invalid', 2)
#
# utf-7 defaults to 'normal' utf-7 encoding
# utf-8 defaults to 2 byte 'normal' encoding
#
def self.to_unicode(str='', type = 'utf-16le', mode = '', size = 2)
case type
when 'utf-16le'
return str.unpack('C*').pack('v*')
when 'utf-16be'
return str.unpack('C*').pack('n*')
when 'utf-32le'
return str.unpack('C*').pack('V*')
when 'utf-32be'
return str.unpack('C*').pack('N*')
when 'utf-7'
case mode
when 'all'
return str.gsub(/./){ |a|
out = ''
if 'a' != '+'
@ -148,9 +153,82 @@ module Text
end
'+' + out + '-'
}
else
raise TypeError, 'invalid utf type'
else
return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a|
out = ''
if a != '+'
out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '')
end
'+' + out + '-'
}
end
when 'utf-8'
if size >= 2 and size <= 7
string = ''
str.each_byte { |a|
if a > 0x7f || mode != ''
# ugh. turn a single byte into the binary representation of it, in array form
bin = [a].pack('C').unpack('B8')[0].split(//)
# even more ugh.
bin.collect!{|a| a = a.to_i}
out = Array.new(8 * size, 0)
0.upto(size - 1) { |i|
out[i] = 1
out[i * 8] = 1
}
i = 0
byte = 0
bin.reverse.each { |bit|
if i < 6
mod = (((size * 8) - 1) - byte * 8) - i
out[mod] = bit
else
byte = byte + 1
i = 0
redo
end
i = i + 1
}
if mode != ''
case mode
when 'overlong'
# do nothing, since we already handle this as above...
when 'invalid'
done = 0
while done == 0
bits = [7, 8, 15, 16, 23, 24, 31, 32, 41]
bits.each { |bit|
bit = (size * 8) - bit
if bit > 1
set = rand(2)
if out[bit] != set
out[bit] = set
done = 1
end
end
}
end
else
raise TypeError, 'Invalid mode. Only "overlong" and "invalid" are acceptable modes for utf-8'
end
end
string += [out.join('')].pack('B*')
else
string += [a].pack('C')
end
}
return string
else
raise TypeError, 'invalid utf-8 size'
end
else
raise TypeError, 'invalid utf type'
end
end
#

View File

@ -23,7 +23,26 @@ class Rex::Text::UnitTest < Test::Unit::TestCase
assert_equal("a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00", Rex::Text.to_unicode('abc', 'utf-32le'), 'utf-32le')
assert_equal("\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c", Rex::Text.to_unicode('abc', 'utf-32be'), 'utf-32be')
assert_equal("abc+-abc-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7'), 'utf-7')
assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7-all'), 'utf-7-all')
assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7', 'all'), 'utf-7-all')
assert_equal("a\303\272", Rex::Text.to_unicode("a\xFA", 'utf-8'))
assert_equal("\xC1\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 2), 'utf-8 overlong')
assert_equal("\xE0\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 3), 'utf-8 overlong')
assert_equal("\xF0\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 4), 'utf-8 overlong')
assert_equal("\xF8\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 5), 'utf-8 overlong')
assert_equal("\xFC\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 6), 'utf-8 overlong')
assert_equal("\xFE\x80\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 7), 'utf-8 overlong')
100.times {
assert(["\xC1\x21","\xC1\x61","\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid')), 'utf-8 invalid')
assert(["\xE0\x01\x21","\xE0\x01\x61","\xE0\x01\xA1","\xE0\x01\xE1","\xE0\x41\x21","\xE0\x41\x61","\xE0\x41\xA1","\xE0\x41\xE1","\xE0\x81\x21","\xE0\x81\x61","\xE0\x81\xA1","\xE0\x81\xE1","\xE0\xC1\x21","\xE0\xC1\x61","\xE0\xC1\xA1","\xE0\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid', 3)), 'utf-8 invalid 3 byte')
}
assert_raises(TypeError) {
Rex::Text.to_unicode('a', 'utf-8', '', 8)
}
assert_raises(TypeError) {
Rex::Text.to_unicode('a', 'utf-8', 'foo', 6)
}
end
def test_zlib