# metasploit-framework/lib/metasm/misc/pdfparse.rb
#
# 662 lines
# 17 KiB
# Ruby

# This file is part of Metasm, the Ruby assembly manipulation suite
# Copyright (C) 2008 Yoann GUILLOT
#
# Licence is LGPL, see LICENCE in the top-level directory
#
# parses a PDF file
# used by ppc_pdf2oplist
#
require 'zlib'
# a Virtual string backed by a file, which is read on-demand
# only the part of the String API used by the PDF parser is implemented:
# #length, #[], #index and #rindex
# the searches only look at a bounded window (1k, then 1M), not at the whole file
class VString
  # sizes of the windows successively tried by #index and #rindex
  SEARCH_WINDOWS = [1024, 1024*1024]

  # creates a VString from a file
  # the file is opened in binary mode and stays open until #close is called
  # raises a RuntimeError if fname is nil or false
  def self.read(fname)
    raise 'need a PDF filename' if not fname
    new File.open(fname, 'rb'), File.size(fname)
  end

  # fd must respond to #pos= and #read, len is the length reported by #length
  def initialize(fd, len)
    @fd = fd
    @len = len
  end

  # length given to the constructor
  def length; @len end

  # closes the underlying file object
  def close
    @fd.close
  end

  # reads a slice of the file
  # accepts (start, len), (start) for a single byte, or (range)
  # negative positions count from the end, as with String#[]
  # returns nil if start is nil or out of bounds, otherwise whatever fd.read
  # returns (which is nil when a non-empty read starts at end of file)
  def [](start, len=nil)
    return if not start
    if start.kind_of? Range
      rng = start
      start = rng.begin || 0
      start += @len if start < 0
      stop = rng.end
      if stop
        stop += @len if stop < 0
        # turn the inclusive end index into an exclusive one
        stop += 1 if not rng.exclude_end?
      else
        stop = @len
      end
      # a range is converted to a byte count relative to its first index
      len = [stop - start, 0].max
    else
      start += @len if start < 0
      len ||= 1
    end
    return nil if start < 0 or len < 0 or start > @len
    @fd.pos = start
    @fd.read len
  end

  # search on a small region (1k or 1M) starting at off
  # a negative off counts from the end of the file
  # returns the absolute offset of the first match, or nil
  def index(sub, off=0)
    off += @len if off < 0
    return if off < 0
    SEARCH_WINDOWS.each { |win|
      chunk = self[off, win]
      # nothing to read at this offset (end of file or out of bounds)
      return if not chunk
      pos = chunk.index(sub)
      return pos + off if pos
      # a short read means the end of file was reached: a larger window cannot help
      break if chunk.length < win
    }
    nil
  end

  # search on a small region (1k or 1M) ending at off (exclusive)
  # a negative off counts from the end of the file, -1 being the end itself
  # returns the absolute offset of the last match in the region, or nil
  def rindex(sub, off=@len)
    off += 1 + @len if off < 0
    return if off < 0
    off = @len if off > @len
    SEARCH_WINDOWS.map { |win| [off, win].min }.uniq.each { |span|
      chunk = self[off-span, span]
      next if not chunk
      pos = chunk.rindex(sub)
      return pos + off-span if pos
    }
    nil
  end
end
# a PDF parser
# works on any object implementing #[], #index, #rindex and #length the way
# the methods below use them (a String, or a VString for on-demand file access)
# @off is the current parse offset, most read* methods advance it
class PDF
  attr_accessor :str, :off, :trailer, :hdr, :xrefs, :xoff

  # reads a filename as a PDF using VString
  def self.read(filename)
    new(VString.read(filename))
  end

  def initialize(str=nil)
    read str if str
  end

  # reads a string as a PDF, interpret basic informations (header, trailer, xref table)
  # raises a RuntimeError if no header line or no 'trailer' keyword is found
  # returns self
  def read(str)
    @str = str
    @off = 0
    readhdr
    toff = @str.rindex("trailer", @str.length)
    raise 'bad pdf: no trailer' unless toff
    @off = toff
    readtrailer
    self
  end

  # stores the first line of the file in @hdr, leaves @off on its end-of-line
  def readhdr
    eol = @str.index("\n", @off)
    raise 'bad pdf: no header' unless eol
    @hdr = @str[@off, eol - @off]
    @off = eol
    @hdr
  end

  # reads the pdf trailer
  # XXX the xref table referenced here may be the first of the file, so we suppose the last is just before the 'trailer' command..
  def readtrailer
    toff = @off
    readcmd			# 'trailer'
    @trailer = readhash
    readcmd			# 'startxref'
    @xroff = readint
    @xoff = {}	# [gen] => { id => off }
    @xrefs = {}	# [gen] => { id => obj }
    @off = @xroff
    readcmd			# 'xref'
    readxrtable
    off2 = @off
    # the table we just read is followed by another trailer located before the
    # last one: also read the xref table found right before the last trailer
    if @off < toff and readcmd == 'trailer' and off = @str.rindex('xref', toff)
      @off = off
      readcmd
      readxrtable
      @off = off2
      readcmd
      @trailer.update readhash
    end
  end

  # reads xref subsections until the 'trailer' keyword
  # fills @xoff with the offset of every in-use ('n') entry
  # raises a RuntimeError if a subsection header is not made of two integers
  def readxrtable
    while @str[@off, 7] != 'trailer'
      objnr = readint
      objcnt = readint
      raise 'bad pdf: malformed xref table' unless objnr and objcnt
      # every entry is 20 bytes long
      @str[@off, 20*objcnt].scan(/(\d+) (\d+) (.)/) { |o, g, u|
        (@xoff[g.to_i] ||= {})[objnr] = o.to_i if u == 'n'
        objnr += 1
      }
      @off += 20*objcnt
      skipspc
    end
  end

  # reads a number at @off, skips the following whitespace
  # returns an Integer, a Float if the number has a dot, or nil if there is no number
  def readint
    buf = ''
    loop do
      case c = @str[@off, 1]
      when '+', '-'; break if not buf.empty?
      when '.'; break if buf.include? '.'
      when '0'..'9'
      else break
      end
      buf << c
      @off += 1
    end
    return if buf.empty?
    skipspc
    buf.include?('.') ? buf.to_f : buf.to_i
  end

  # reads a literal string '(...)' or an hexadecimal string '<...>'
  # returns the decoded string, or nil if @off is not on a string
  def readstr
    buf = ''
    case @str[@off, 1]
    when '('
      nest = 0
      loop do
        @off += 1
        case c = @str[@off, 1]
        # balanced parenthesis are part of the string
        when '('; nest += 1 ; buf << c
        when ')'; nest -= 1 ; break if nest < 0 ; buf << c
        when '\\'
          @off += 1
          case c = @str[@off, 1]
          when 'n'; buf << "\n"
          when 'r'; buf << "\r"
          when 't'; buf << "\t"
          when 'b'; buf << "\b"
          when 'f'; buf << "\f"
          # backslash + end of line: the string continues on the next line
          when "\n"
          when "\r"; @off += 1 if @str[@off+1, 1] == "\n"
          when '0'..'7'
            # up to 3 octal digits
            if ('0'..'7').include?(cc = @str[@off+1, 1])
              @off += 1 ; c << cc
              if ('0'..'7').include?(cc = @str[@off+1, 1])
                @off += 1 ; c << cc
              end
            end
            buf << c.to_i(8)
          when nil; break
          else buf << c
          end
        when nil; break
        else buf << c
        end
      end
    when '<'
      loop do
        @off += 1
        case c = @str[@off, 1]
        when '0'..'9', 'a'..'f', 'A'..'F'; buf << c
        when ' ', "\n", "\r", "\t"
        else break
        end
      end
      # an odd number of digits is completed with a trailing 0
      buf << '0' if buf.length % 2 == 1
      buf = [buf].pack('H*')
    else return
    end
    @off += 1
    skipspc
    buf
  end

  # reads a name '/Foo', decodes '#xx' sequences
  # returns the name without its leading '/', or nil if @off is not on a name
  def readname
    return if @str[@off, 1] != '/'
    buf = ''
    loop do
      @off += 1
      case c = @str[@off, 1]
      when '#'; buf << @str[@off+1, 2].to_i(16) ; @off += 2
      when nil, /[\s\(\)\{\}<>\[\]\/]/; break
      else buf << c
      end
    end
    skipspc
    buf
  end

  # reads an array '[...]' of any objects
  # returns an Array, or nil if @off is not on an array
  def readarray
    return if @str[@off, 1] != '['
    buf = []
    @off += 1
    skipspc
    buf << readany until @str[@off, 1] == ']' or @off >= @str.length
    @off += 1
    skipspc
    buf
  end

  # reads a dictionary '<<...>>', entries whose value is null are dropped
  # returns a Hash, or nil if @off is not on a dictionary
  def readhash
    return if @str[@off, 2] != '<<'
    buf = {}
    @off += 2
    skipspc
    buf[readname] = readany until @str[@off, 2] == '>>' or @off >= @str.length
    buf.delete_if { |k, v| v == :null }
    @off += 2
    skipspc
    buf
  end

  # reads a bare keyword (up to the next whitespace or delimiter)
  # returns it as a String, which is empty if @off is on a delimiter
  def readcmd
    buf = ''
    loop do
      case c = @str[@off, 1]
      when nil, /[\s\(\)\{\}<>\[\]\/%]/; break
      else buf << c
      end
      @off += 1
    end
    skipspc
    buf
  end

  # stores the stream data in hash[:data], returns hash
  # the data is inflated if the only filter is FlateDecode
  # with any other filter a warning is printed and the data is stored as is
  def newstream(hash, data)
    f = [hash['Filter']].flatten.compact
    if f.length == 1 and f.first == 'FlateDecode'
      data = Zlib::Inflate.inflate(data)
    elsif f.length == 0
    else puts "stream filter #{f.inspect} unsupported"
    end
    hash[:data] = data
    hash
  end

  # an indirect reference ('id gen R')
  # any unknown method call is forwarded to the referenced object
  class Ref
    attr_accessor :gen, :id

    def initialize(pdf, gen, id)
      @pdf, @gen, @id = pdf, gen, id
    end

    def inspect
      "#<Ref @pdf=#{@pdf.object_id.to_s(16)} @gen=#@gen @id=#@id>"
    end

    # returns the referenced object, see PDF#deref
    def deref(depth=1)
      @pdf.deref(self, depth)
    end

    def method_missing(*a, &b)
      deref.send(*a, &b)
    end
  end

  # reads & returns any pdf object according to its 1st char (almost)
  # updates @xrefs if the object is indirect
  # returns nil at end of data, raises a RuntimeError on an unknown keyword
  def readany
    case @str[@off, 1]
    when nil; return
    when '/'; readname
    when '+', '-'; readint
    when '0'..'9'
      i = readint
      # 2 numbers in a row may start 'id gen obj' or 'id gen R'
      if ('0'..'9').include?(@str[@off, 1])
        poff = @off
        g = readint
        case readcmd
        when 'obj'
          @xrefs[g] ||= {}
          i = @xrefs[g][i] ||= readany
          raise 'no endobj' if readcmd != 'endobj'
        when 'R'
          i = Ref.new(self, g, i)
        # plain number followed by another one: go back before the 2nd
        else @off = poff
        end
      end
      i
    when '['; readarray
    when '('; readstr
    when '<'
      if @str[@off+1, 1] == '<'
        h = readhash
        if @str[@off, 6] == 'stream' and i = @str.index("\n", @off)	# readcmd may eat spaces that are part of the stream
          l = h['Length'].to_i
          h = newstream(h, @str[i+1, l])
          @off = i+1+l
          skipspc
          raise 'no endstream' if readcmd != 'endstream'
        end
        h
      else readstr
      end
    else
      case c = readcmd
      when 'true', 'false', 'null'; c.to_sym
      when 'xref'
        readxrtable
        (@trailer ||= {}).update readhash if readcmd == 'trailer'
        readint if readcmd == 'startxref'
        :xref
      else raise "unknown cmd #{c.inspect}"
      end
    end
  end

  # advances @off past whitespace and '%' comments
  def skipspc
    while @off < @str.length
      case @str[@off, 1]
      when '%'; @off += 1 until @str[@off, 1] == "\n" or @off >= @str.length
      when ' ', "\n", "\r", "\t"
      else break
      end
      @off += 1
    end
  end

  # dereference references from the specified root, with the specified depth
  # a Ref is read from the offset recorded in @xoff and cached in @xrefs
  # with depth > 1, the values of the Hash/Array are dereferenced in a copy
  # raises a RuntimeError if the offset of a Ref is unknown, @off is unchanged in this case
  def deref(obj, depth=1)
    if obj.kind_of? Ref
      @xrefs[obj.gen] ||= {}
      if not nobj = @xrefs[obj.gen][obj.id]
        # check the offset before touching @off, to keep the parser state usable
        objoff = (@xoff[obj.gen] || {})[obj.id]
        raise 'unknown ref off' unless objoff
        pvoff = @off
        @off = objoff
        puts "deref #{obj.gen} #{obj.id} => #{@off.to_s(16)}" if $DEBUG
        nobj = @xrefs[obj.gen][obj.id] = readany || :poil
        @off = pvoff
      end
      obj = nobj
    end
    depth -= 1
    case obj
    when Hash; obj = obj.dup ; obj.each { |k, v| obj[k] = deref(v, depth) }
    when Array; obj = obj.dup ; obj.each_with_index { |v, i| obj[i] = deref(v, depth) }
    end if depth > 0
    obj
  end

  # returns the :data field for a Hash or the concatenation of the :data fields of the children for an Array
  # returns an empty string if ct is nil
  def page_data(ct)
    return '' unless ct
    if deref(ct).kind_of? Array
      ct.map { |c| c[:data] }.join
    else
      ct[:data]
    end
  end

  # iterates over the PDF pages, yields each PSPage
  # nodes having a 'Kids' entry are walked recursively
  # returns an enumerator if no block is given
  def each_page(h=@trailer['Root']['Pages'], &block)
    return to_enum(:each_page, h) unless block
    if h['Kids']
      h['Kids'].each { |k| each_page(k, &block) }
    else
      yield PSPage.new(page_data(h['Contents']))
    end
  end

  # returns the nr-th page of the pdf as a PSPage
  # returns nil if there is no such page under ar
  def page(nr, ar=@trailer['Root']['Pages']['Kids'])
    ar.each { |kid|
      if cnt = kid['Count']
        # intermediate node: go down if the page is in this subtree, else skip it
        return page(nr, kid['Kids']) if nr <= cnt
        nr -= cnt
      else
        nr -= 1
        return PSPage.new(page_data(kid['Contents'])) if nr <= 0
      end
    }
    nil
  end
end
# a PostScript page (lines with position information)
# @lines is an array of paragraphs (one per BT operator), each an array of Line
class PSPage
  # the text shown by one Tj/TJ operator, with the cursor position and font scale at that time
  class Line
    # divisor used to convert a spacing value to a number of space characters
    CHARWIDTH=400
    attr_accessor :str, :x, :y, :fontx, :fonty

    # parses a postscript line, returns a line with individual characters at the right place (more or less)
    # str is a '(...)' string or a '[(...)nn(...)]' array, as yielded by PSPage#ps2tok
    # charspc and wordspc are the current Tc and Tw values
    def initialize(str, x, y, fontx, fonty, charspc, wordspc)
      @raw, @charspc, @wordspc = str, charspc, wordspc
      @x, @y, @fontx, @fonty = x, y, fontx, fonty
      str = str[1...-1] if str[0, 1] == '['
      @str = ''
      bs = char = false
      spc = ''
      # iterate on characters: comparing bytes (Integers) to character
      # literals (Strings) never matches
      str.each_char { |c|
        if not bs
          # special chars (unescaped)
          case c
          when '('	# new word: honor word spacing
            spc = (-spc.to_f/CHARWIDTH).round
            if spc > 0 and not @str.empty?
              @str << (' '*spc)
            elsif spc < 0
              # negative spacing: eat the trailing spaces already rendered
              @str.chop! while @str[-1, 1] == ' ' and (spc += 1) <= 0
            end
            char = true
            next
          when '\\'	# bs character
            bs = true
            next
          when ')'	# end of word
            char = false
            spc = ''
            next
          end
        end

        # octal escape sequence: leave as is (actual char depends on font)
        @str << '\\' if bs and c.between?('0', '7')
        bs = false

        if char
          # update current rendered string, honoring charspc
          @str << c
          @str << (' ' * (charspc*1000/CHARWIDTH).round) if charspc > 0.1
          @str << (' ' * (wordspc*1000/CHARWIDTH).round) if c == ' ' and wordspc > 0.1
        else
          # between strings: store word spacing integer
          spc << c
        end
      }
      puts "(#{x}, #{y} #{fontx}, #{fonty}) #@str" if $VERBOSE
    end

    def to_s ; @str end
  end

  attr_accessor :lines

  # parses str if given, otherwise the page has no lines
  def initialize(str=nil)
    @lines = []
    parse(str) if str
  end

  # remove lines not within ymin and ymax
  # paragraphs left empty are removed too, returns self
  def clip_lines(ymin, ymax)
    ymin, ymax = ymax, ymin if ymin > ymax
    @lines.each { |la| la.delete_if { |l| l.y < ymin or l.y > ymax } }
    @lines.delete_if { |la| la.empty? }
    self
  end

  # parse a postscript string to an array of paragraph (itself an array of lines)
  # handles text strings and basic cursor position updates
  def parse(str)
    @lines = []
    curx = cury = 0
    fontx = fonty = 12
    charspc = wordspc = 0
    stack = []
    linelead = -12
    ps2tok(str) { |t|
      case t
      when Float, String; print "#{t} "
      else puts t
      end if $VERBOSE

      case t
      when Float, String; stack << t	# be postfix !
      when :BT; @lines << []	# begin text
      when :ET			# end text
      when :Tj, :TJ		# print line
        # text shown outside of BT/ET: open a paragraph for it
        @lines << [] if @lines.empty?
        @lines.last << Line.new(stack.pop, curx, cury, fontx, fonty, charspc, wordspc)
      when :Td, :TD		# move cursor
        linelead = stack.last*fonty if t == :TD
        cury += stack.pop*fonty
        curx += stack.pop*fontx
      when :'T*'		# new line
        cury += linelead
      when :Tc		# character spacing
        # RHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
        #3.17731 Tc 9 0 0 9 343.41 653.84998 Tm
        #[(3T)3202(O)729(R)3179(A)-3689(S)3178(I)]TJ
        # => 3 TO RA SI
        charspc = stack.pop
      when :Tw
        wordspc = stack.pop
      when :Tm		# set transform matrix (scale, rotate, translate)
        params = Array.new(6) { stack.pop }.reverse
        next if params[0] == 0.0	# rotated text
        fontx, _, _, fonty, curx, cury = params
      end
    }
  end

  # yields PS tokens: floats, commands, and strings
  # numbers are yielded as Float, '(...)' and '[...]' as String, keywords as Symbol
  # anything else (including whitespace) is skipped
  def ps2tok(str)
    loop do
      case str
      when ''; break
      when /\A-?\d+(?:\.\d+)?/; tok = $&.to_f
      when /\A\((?:\\.|[^\\)])*\)/; tok = $&
      when /\A\[(?:[^\](]*\((?:\\.|[^\\)])*\))*[^\]]*\]/; tok = $&
      when /\A[a-zA-Z0-9_*]+/; tok = $&.to_sym rescue nil
      when /\A\S+/, /\A\s+/
      end
      # drop what the last regexp matched
      str = str[$&.length..-1]
      yield tok if tok
    end
  end

  # renders the lines, according to the layout (almost ;) )
  # paragraphs without any line are ignored
  def to_s
    paras = @lines.reject { |la| la.empty? }
    mx = paras.flatten.map { |l| l.x }.min
    py = nil
    strs = ['']
    paras.sort_by { |la| -la.map { |l| l.y }.max.to_i }.each { |la|
      y = la.map { |l| l.y }.max
      # vertical gap with the previous paragraph: one empty line per 12 units
      strs.concat ['']*((py-y)/12) if py and py > y
      la.sort_by { |l| [-l.y, l.x] }.each { |l|
        # 9 == base font size
        strs << '' if y > l.y+l.fonty*0.9 or strs.last.length*1000/Line::CHARWIDTH/9 > l.x-mx
        strs[-1] = strs.last.ljust((l.x-mx)*1000/Line::CHARWIDTH/9-1) << ' ' << l.str
        y = l.y
      }
      py = y if not py or py > y
    }
    strs.join("\n")
  end
end
# command-line entry point, run only when this file is the executed script
# usage: pdfparse.rb <file.pdf> [pagenr ...]
# prints the text of the requested pages, or of every page if none is given
# with $VERBOSE set, also dumps the Info dictionary of the trailer
if __FILE__ == $0
  require 'pp'

  begin
    pdf = PDF.read ARGV.shift
    if $VERBOSE
      puts 'Info: '
      pp pdf.deref(pdf.trailer['Info'])
      puts
    end
    if not ARGV.empty?
      ARGV.each { |pagenr|
        pg = pdf.page(pagenr.to_i)
        # anything that is not a PSPage means the page was not found
        puts(pg.kind_of?(PSPage) ? pg : "no page #{pagenr}")
      }
    else
      puts 'Pages: '
      pagecnt = 0
      pdf.each_page { |page|
        pagecnt += 1
        puts " ------- p.#{pagecnt} ---------", page
      }
    end
  rescue
    # pdf is nil if PDF.read failed, and the offset itself may be nil:
    # do not raise from the handler, that would hide the original error
    where = pdf.off.to_s(16) if pdf and pdf.off
    puts "at #{where}", $!, $!.backtrace[0, 24]
  end
end
__END__
PostScript text formatting, shamelessly ripped from the web (http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/)
Object 3, which contains the contents of page one of our document, is worth commenting on since it shows how text streams are used in PDF. The object looks like:
3 0 obj
<<
/Length 168
>>
stream
BT
/F4 1 Tf
12 0 0 12 50.64 731.52 Tm
0 0 0 rg
BX /GS2 gs EX
0 Tc
0 Tw
@charspc = charspc
[(This is 12-point )10(T)41(imes. )
18(This sentence will appear near
the top of page one.)]TJ
ET
endstream
endobj
The stream object (which is 168 bytes long) is bracketed by BT and ET operators, for Begin Text and End Text. The Tf command selects our font and its size in user-space units, which is given as 1. "But aren't we using 12-point type?" you may be wondering. Yes, we are. That's specified in the next line, ending in Tm (which is the set-text-matrix operator). For space reasons, we won't say much about coordinate system transformations and matrices here, but if you're familiar with the use of matrices in PostScript, the same rules apply in PDF. A transform matrix is given by an array of six numbers, the first and fourth of which determine scaling in x and y, respectively. We see in our text matrix, the scaling factor is 12. That means we will use 12-point type. The last two numbers in the matrix (50.64 and 731.52) specify a translation, in user-space units. The effect of the translation is to put our text approximately 10.1 inches high on the page, with a left margin of 0.7 inch.
The line ending with rg sets our ink color to an RGB value of 0 0 0, or black. The BX operator says that we are beginning a section that allows undefined operators. In this section, we apply the gs operator (which sets parameters in the extended graphics state), using /GS2 as our EGS specifications. The EX operator ends the section allowing undefined operators. In essence, we're saying "Any reading application that understands what's in this special section can execute the instructions contained there, but if you don't understand the instructions, just go on." The reason this section has to be handled this way is that extended graphics state instructions often contain device-dependent instructions. The lack of generality means we should bracket those instructions with BX/EX.
The Tc and Tw operators are for setting character spacing and word spacing, respectively.
Finally, we come to the text that will be displayed on our page. Oddly enough, it's specified in an array of text snippets interspersed with integers, such as:
(This is 12-point )10(T)41(imes. )
The number 10 represents a kerning value, in thousandths of an em. (An em is a typographical unit of measurement equal to the size of the font.) This number is subtracted from the 'x' coordinate of the letter(s) that follow, displacing the text to the left. The capital 'T' is displaced 10 units to the left, while "imes. " is displaced 41 units. The TJ at the end of the array is the operator for "show text, allowing individual character spacing."
Finally, ET closes off the text block, and endstream closes off the stream.
b closepath, fill, and stroke path.
B fill and stroke path.
b* closepath, eofill, and stroke path.
B* eofill and stroke path.
BI begin image.
BMC begin marked content.
BT begin text object.
BX begin section allowing undefined operators.
c curveto.
cm concat. Concatenates the matrix to the current transform.
cs setcolorspace for fill.
CS setcolorspace for stroke.
d setdash.
Do execute the named XObject.
DP mark a place in the content stream, with a dictionary.
EI end image.
EMC end marked content.
ET end text object.
EX end section that allows undefined operators.
f fill path.
f* eofill Even/odd fill path.
g setgray (fill).
G setgray (stroke).
gs set parameters in the extended graphics state.
h closepath.
i setflat.
ID begin image data.
j setlinejoin.
J setlinecap.
k setcmykcolor (fill).
K setcmykcolor (stroke).
l lineto.
m moveto.
M setmiterlimit.
n end path without fill or stroke.
q save graphics state.
Q restore graphics state.
re rectangle.
rg setrgbcolor (fill).
RG setrgbcolor (stroke).
s closepath and stroke path.
S stroke path.
sc setcolor (fill).
SC setcolor (stroke).
sh shfill (shaded fill).
Tc set character spacing.
Td move text current point.
TD move text current point and set leading.
Tf set font name and size.
Tj show text.
TJ show text, allowing individual character positioning.
TL set leading.
Tm set text matrix.
Tr set text rendering mode.
Ts set super/subscripting text rise.
Tw set word spacing.
Tz set horizontal scaling.
T* move to start of next line.
v curveto.
w setlinewidth.
W clip.
y curveto.