#    This file is part of Metasm, the Ruby assembly manipulation suite
#    Copyright (C) 2008 Yoann GUILLOT
#
#    Licence is LGPL, see LICENCE in the top-level directory

#
# parses a PDF file
# used by ppc_pdf2oplist
#

require 'zlib'

# a Virtual string backed by a file, which is read on-demand
# only the subset of the String API needed by the PDF parser is implemented
class VString
  # creates a VString from a file
  # raises a RuntimeError if no filename is given
  # the file stays open for the lifetime of the object (reads are done lazily)
  def self.read(fname)
    raise 'need a PDF filename' if not fname
    new File.open(fname, 'rb'), File.size(fname)
  end

  # fd is a seekable IO, len the number of bytes available through it
  def initialize(fd, len)
    @fd = fd
    @len = len
  end

  def length; @len end

  # reads a part of the file, with the same calling conventions as String#[]:
  #  vstr[start, len], vstr[start] (one byte) or vstr[range]
  # negative indexes are relative to the end of the file
  # returns nil for an out-of-bounds start, and whatever IO#read returns
  # otherwise (ie nil when asking for len > 0 bytes at the end of the file)
  def [](start, len=nil)
    return if not start
    if start.kind_of? Range
      last = start.end
      if last
        # normalize before handling exclude_end, so that 0...0 is empty
        last += @len if last < 0
        last -= 1 if start.exclude_end?
      else
        last = @len - 1
      end
      start = start.begin || 0
      start += @len if start < 0
      # a range is [first index, last index]: convert to a byte count
      len = [last - start + 1, 0].max
    else
      len ||= 1
      start += @len if start < 0
    end
    return nil if start < 0 or len < 0 or start > @len
    @fd.pos = start
    @fd.read len
  end

  # search on a small region (1k or 1M) starting at off
  # returns the absolute offset of the first match, or nil
  def index(sub, off=0)
    off += @len if off < 0
    [1024, 1024*1024].each { |winsz|
      win = self[off, winsz]
      break if not win	# nothing to read (end of file / bad offset)
      pos = win.index(sub)
      return off + pos if pos
      break if off + winsz >= @len	# the window already covered the end of the file
    }
    nil
  end

  # search on a small region (1k or 1M) ending just before off
  # contrary to String#rindex, the whole match must lie before off
  # returns the absolute offset of the last match, or nil
  def rindex(sub, off=@len)
    off += 1 + @len if off < 0
    [1024, 1024*1024].each { |winsz|
      winsz = [off, winsz].min
      win = self[off-winsz, winsz]
      break if not win
      pos = win.rindex(sub)
      return off - winsz + pos if pos
      break if winsz == off	# the window already started at the beginning of the file
    }
    nil
  end
end

# a PDF parser
# works on a String or on any object implementing #[], #index, #rindex and #length (eg VString)
class PDF
  attr_accessor :str, :off, :trailer, :hdr, :xrefs, :xoff

  # reads a filename as a PDF using VString
  def self.read(filename)
    new(VString.read(filename))
  end

  def initialize(str=nil)
    @xoff = {}	# [gen] => { id => off }
    @xrefs = {}	# [gen] => { id => obj }
    read str if str
  end

  # reads a string as a PDF, interpret basic informations (header, trailer, xref table)
  # raises a RuntimeError if the 'trailer' keyword cannot be found
  # returns self
  def read(str)
    @str = str
    @off = 0
    readhdr
    raise 'bad pdf: no trailer' unless @off = @str.rindex("trailer", @str.length)
    readtrailer
    self
  end

  # stores the first line of the file in @hdr, leaves @off on the end of that line
  def readhdr
    # a file without any newline is a single (header) line
    eol = @str.index("\n", @off) || @str.length
    @hdr = @str[@off, eol - @off]
    @off = eol
  end

  # reads the pdf trailer
  # XXX the xref table referenced here may be the first of the file, so we suppose the last is just before the 'trailer' command..
  def readtrailer
    toff = @off
    readcmd
    @trailer = readhash
    readcmd
    @xroff = readint
    raise 'bad pdf: no startxref offset' if not @xroff
    @xoff = {}	# [gen] => { id => off }
    @xrefs = {}	# [gen] => { id => obj }

    @off = @xroff
    readcmd
    readxrtable

    off2 = @off
    if @off < toff and readcmd == 'trailer' and off = @str.rindex('xref', toff)
      # the table pointed by startxref is not the last of the file:
      # read the one just before our trailer too, and merge the trailer dictionaries
      @off = off
      readcmd
      readxrtable
      @off = off2
      readcmd
      @trailer.update readhash
    end
  end

  # reads the xref subsections found at @off up to the 'trailer' keyword
  # fills @xoff with the offset of every in-use ('n') object
  # raises a RuntimeError on a malformed subsection header
  def readxrtable
    while @str[@off, 7] != 'trailer'
      objnr = readint
      objcnt = readint
      raise 'bad pdf: malformed xref table' unless objnr.kind_of? Integer and objcnt.kind_of? Integer
      # every xref entry is exactly 20 bytes long
      @str[@off, 20*objcnt].scan(/(\d+) (\d+) (.)/) { |o, g, u|
        (@xoff[g.to_i] ||= {})[objnr] = o.to_i if u == 'n'
        objnr += 1
      }
      @off += 20*objcnt
      skipspc
    end
  end

  # reads a number at @off, skips the following spaces
  # returns an Integer, a Float if the number holds a '.', or nil if there is no number here
  def readint
    buf = ''
    loop do
      case c = @str[@off, 1]
      when '+', '-'; break if not buf.empty?	# sign allowed only as 1st char
      when '.'; break if buf.include? '.'
      when '0'..'9'
      else break
      end
      buf << c
      @off += 1
    end
    return if buf.empty?
    skipspc
    buf.include?('.') ? buf.to_f : buf.to_i
  end

  # reads a literal string '(foo)' or an hexadecimal string '<4142>' at @off
  # returns the decoded String, or nil if @off does not point to a string
  def readstr
    buf = ''
    case @str[@off, 1]
    when '('
      nest = 0
      loop do
        @off += 1
        case c = @str[@off, 1]
        when '('; nest += 1 ; buf << c	# balanced parenthesis are part of the string
        when ')'; nest -= 1 ; break if nest < 0 ; buf << c
        when '\\'
          @off += 1
          case c = @str[@off, 1]
          when 'n'; buf << ?\n
          when 'r'; buf << ?\r
          when 't'; buf << ?\t
          when 'b'; buf << ?\b
          when 'f'; buf << ?\f
          when "\n"	# backslash + end of line: line continuation, yields nothing
          when "\r"; @off += 1 if @str[@off+1, 1] == "\n"
          when '0'..'7'
            # octal escape, 1 to 3 digits
            if ('0'..'7').include?(cc = @str[@off+1, 1])
              @off += 1 ; c << cc
              if ('0'..'7').include?(cc = @str[@off+1, 1])
                @off += 1 ; c << cc
              end
            end
            buf << c.to_i(8)
          when nil; break
          else buf << c
          end
        when nil; break
        else buf << c
        end
      end
    when '<'
      loop do
        @off += 1
        case c = @str[@off, 1]
        when '0'..'9', 'a'..'f', 'A'..'F'; buf << c
        when ' ', "\n", "\r", "\t"
        else break
        end
      end
      # a missing final digit is assumed to be 0
      buf << '0' if buf.length % 2 == 1
      buf = [buf].pack('H*')
    else
      return
    end
    @off += 1
    skipspc
    buf
  end

  # reads a name '/Foo' at @off, decodes the '#xx' escapes
  # returns the name without the leading '/', or nil if @off does not point to a name
  def readname
    return if @str[@off, 1] != '/'
    buf = ''
    loop do
      @off += 1
      case c = @str[@off, 1]
      when '#'; buf << @str[@off+1, 2].to_i(16) ; @off += 2
      when nil, /[\s\(\)\{\}<>\[\]\/]/; break
      else buf << c
      end
    end
    skipspc
    buf
  end

  # reads an array '[a b c]' at @off
  # returns an Array of objects, or nil if @off does not point to an array
  def readarray
    return if @str[@off, 1] != '['
    buf = []
    @off += 1
    skipspc
    buf << readany until @str[@off, 1] == ']' or @off >= @str.length
    @off += 1
    skipspc
    buf
  end

  # reads a dictionary '<< /key value >>' at @off
  # entries whose value is null are dropped
  # returns a Hash, or nil if @off does not point to a dictionary
  def readhash
    return if @str[@off, 2] != '<<'
    buf = {}
    @off += 2
    skipspc
    buf[readname] = readany until @str[@off, 2] == '>>' or @off >= @str.length
    buf.delete_if { |k, v| v == :null }
    @off += 2
    skipspc
    buf
  end

  # reads a keyword (anything up to the next space or delimiter) at @off
  # returns it as a String (may be empty)
  def readcmd
    buf = ''
    loop do
      case c = @str[@off, 1]
      when nil, /[\s\(\)\{\}<>\[\]\/%]/; break
      else buf << c
      end
      @off += 1
    end
    skipspc
    buf
  end

  # stores the (decoded if possible) stream data in hash[:data]
  # only the FlateDecode filter is supported, data is stored raw for the others
  # returns hash
  def newstream(hash, data)
    f = [hash['Filter']].flatten.compact
    if f.length == 1 and f.first == 'FlateDecode'
      data = Zlib::Inflate.inflate(data)
    elsif f.length == 0
    else
      puts "stream filter #{f.inspect} unsupported"
    end
    hash[:data] = data
    hash
  end

  # an indirect reference to an object of the pdf ('12 0 R')
  # any unknown method call is forwarded to the referenced object
  class Ref
    attr_accessor :gen, :id

    def initialize(pdf, gen, id)
      @pdf, @gen, @id = pdf, gen, id
    end

    def inspect
      "#<PDF::Ref gen=#@gen id=#@id>"
    end

    def deref(depth=1)
      @pdf.deref(self, depth)
    end

    def method_missing(*a, &b)
      deref.send(*a, &b)
    end
  end

  # reads & returns any pdf object according to its 1st char (almost)
  # updates @xrefs if the object is indirect
  # returns nil at the end of the file, raises a RuntimeError on an unknown keyword
  def readany
    case @str[@off, 1]
    when nil; return
    when '/'; readname
    when '+', '-', '.'; readint
    when '0'..'9'
      i = readint
      if ('0'..'9').include?(@str[@off, 1])
        # 2 integers in a row: may be 'id gen obj' or 'id gen R'
        poff = @off
        g = readint
        case readcmd
        when 'obj'
          @xrefs[g] ||= {}
          i = @xrefs[g][i] ||= readany
          raise 'no endobj' if readcmd != 'endobj'
        when 'R'
          i = Ref.new(self, g, i)
        else @off = poff	# just 2 integers, unread the 2nd
        end
      end
      i
    when '['; readarray
    when '('; readstr
    when '<'
      if @str[@off+1, 1] == '<'
        h = readhash
        if @str[@off, 6] == 'stream' and i = @str.index("\n", @off)	# readcmd may eat spaces that are part of the stream
          l = h['Length'].to_i
          h = newstream(h, @str[i+1, l])
          @off = i+1+l
          skipspc
          raise 'no endstream' if readcmd != 'endstream'
        end
        h
      else readstr
      end
    else
      case c = readcmd
      when 'true', 'false', 'null'; c.to_sym
      when 'xref'
        readxrtable
        (@trailer ||= {}).update readhash if readcmd == 'trailer'
        readint if readcmd == 'startxref'
        :xref
      else raise "unknown cmd #{c.inspect}"
      end
    end
  end

  # moves @off past spaces and '%' comments
  def skipspc
    while @off < @str.length
      case @str[@off, 1]
      when '%'; @off += 1 until @str[@off, 1] == "\n" or @off >= @str.length
      when ' ', "\n", "\r", "\t"
      else break
      end
      @off += 1
    end
  end

  # dereference references from the specified root, with the specified depth
  # the resolved objects are cached in @xrefs
  # raises a RuntimeError if the xref table holds no offset for a reference
  # on a parse error, @off is left where the error happened
  def deref(obj, depth=1)
    if obj.kind_of? Ref
      @xrefs[obj.gen] ||= {}
      if not nobj = @xrefs[obj.gen][obj.id]
        # check the offset before touching @off, so that a failure leaves the parser state intact
        noff = (@xoff[obj.gen] || {})[obj.id]
        raise 'unknown ref off' unless noff
        pvoff = @off
        @off = noff
        puts "deref #{obj.gen} #{obj.id} => #{@off.to_s(16)}" if $DEBUG
        nobj = @xrefs[obj.gen][obj.id] = readany || :poil
        @off = pvoff
      end
      obj = nobj
    end
    depth -= 1
    case obj
    when Hash; obj = obj.dup ; obj.each { |k, v| obj[k] = deref(v, depth) }
    when Array; obj = obj.dup ; obj.each_with_index { |v, i| obj[i] = deref(v, depth) }
    end if depth > 0
    obj
  end

  # returns the :data field for a Hash or the concatenation of the :data fields of the children for an Array
  # returns an empty string when there is no content at all
  def page_data(ct)
    return '' if not ct
    if deref(ct).kind_of? Array
      ct.map { |c| c[:data] }.join
    else
      ct[:data]
    end
  end

  # iterates over the PDF pages, yields each PSPage
  # h is the node of the page tree to start from
  def each_page(h=@trailer['Root']['Pages'], &block)
    if h['Kids']
      h['Kids'].each { |k| each_page(k, &block) }
    else
      yield PSPage.new(page_data(h['Contents']))
    end
  end

  # returns the nr-th page of the pdf as a PSPage (1-based), or nil if there is no such page
  # ar is the list of page tree nodes to search
  def page(nr, ar=@trailer['Root']['Pages']['Kids'])
    ar.each { |kid|
      if cnt = kid['Count']
        # intermediate node, holding cnt pages
        return page(nr, kid['Kids']) if nr <= cnt
        nr -= cnt
      else
        nr -= 1
        return PSPage.new(page_data(kid['Contents'])) if nr <= 0
      end
    }
    nil
  end
end

# a PostScript page (lines with position information)
class PSPage
  # one chunk of text shown by a Tj/TJ operator
  class Line
    CHARWIDTH=400
    attr_accessor :str, :x, :y, :fontx, :fonty

    # parses a postscript line, returns a line with individual characters at the right place (more or less)
    # str is the raw operand of Tj ('(text)') or TJ ('[(te)12(xt)]')
    def initialize(str, x, y, fontx, fonty, charspc, wordspc)
      @raw, @charspc, @wordspc = str, charspc, wordspc
      @x, @y, @fontx, @fonty = x, y, fontx, fonty
      str = str[1...-1] if str[0, 1] == '['
      @str = ''
      bs = char = false
      #lastchar = nil
      spc = ''
      # iterate on 1-char strings: integer bytes never compare equal to char literals on ruby >= 1.9
      str.each_char { |b|
        if not bs
          # special chars (unescaped)
          case b
          when '('	# new word: honor word spacing
            spc = (-spc.to_f/CHARWIDTH).round
            if spc > 0 and not @str.empty?
              @str << (' '*spc)
            elsif spc < 0
              @str.chop! while (@str[-1, 1] == ' ' and (spc += 1) <= 0)	# and (lastchar != ' ' or @str[-2] == lastchar)
            end
            char = true
            next
          when '\\'	# bs character
            bs = true
            next
          when ')'	# end of word
            char = false
            spc = ''
            next
          end
        end

        # octal escape sequence: leave as is (actual char depends on font)
        @str << '\\' if bs and ('0'..'7').include?(b)
        bs = false

        if char
          # update current rendered string, honoring charspc
          @str << b
          @str << (' ' * (charspc*1000/CHARWIDTH).round) if charspc > 0.1
          @str << (' ' * (wordspc*1000/CHARWIDTH).round) if b == ' ' and wordspc > 0.1
          #lastchar = b
        else
          # between strings: store word spacing integer
          spc << b
        end
      }
      puts "(#{x}, #{y} #{fontx}, #{fonty}) #@str" if $VERBOSE
    end

    def to_s ; @str end
  end

  # array of text blocks (one per BT operator), each one being an array of Line
  attr_accessor :lines

  def initialize(str=nil)
    @lines = []
    parse(str) if str
  end

  # remove lines not within ymin and ymax
  # returns self
  def clip_lines(ymin, ymax)
    ymin, ymax = ymax, ymin if ymin > ymax
    @lines.each { |la| la.delete_if { |l| l.y < ymin or l.y > ymax } }
    @lines.delete_if { |la| la.empty? }
    self
  end

  # parse a postscript string to an array of paragraph (itself an array of lines)
  # handles text strings and basic cursor position updates
  def parse(str)
    @lines = []
    curx = cury = 0
    fontx = fonty = 12
    charspc = wordspc = 0
    stack = []
    linelead = -12
    ps2tok(str) { |t|
      case t
      when Float, String; print "#{t} "
      else puts t
      end if $VERBOSE

      case t
      when Float, String; stack << t	# be postfix !
      when :BT; @lines << []	# begin text
      when :ET	# end text
      when :Tj, :TJ	# print line
        text = stack.pop
        next if not text.kind_of? String	# operand we could not tokenize (eg hex string)
        @lines << [] if @lines.empty?	# text shown outside of BT/ET
        @lines.last << Line.new(text, curx, cury, fontx, fonty, charspc, wordspc)
      when :Td, :TD	# move cursor
        linelead = stack.last*fonty if t == :TD
        cury += stack.pop*fonty
        curx += stack.pop*fontx
      when :'T*'	# new line
        cury += linelead
      when :Tc	# character spacing
        # RHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
        #3.17731 Tc 9 0 0 9 343.41 653.84998 Tm
        #[(3T)3202(O)729(R)3179(A)-3689(S)3178(I)]TJ
        # => 3 TO RA SI
        charspc = stack.pop
      when :Tw
        wordspc = stack.pop
      when :Tm	# set transform matrix (scale, rotate, translate)
        params = Array.new(6) { stack.pop }.reverse
        next if params[0] == 0.0	# rotated text
        fontx, _, _, fonty, curx, cury = params
      end
    }
  end

  # yields PS tokens: floats, commands, and strings
  # numbers are always yielded as Float, commands as Symbol,
  # strings '(..)' and arrays '[..]' as raw String ; anything else is skipped
  def ps2tok(str)
    loop do
      case str
      when ''; break
      when /\A[+-]?(?:\d+(?:\.\d*)?|\.\d+)/; tok = $&.to_f	# 12, -1.5, but also .5 or 4.
      when /\A\((?:\\.|[^\\)])*\)/; tok = $&
      when /\A\[(?:[^\](]*\((?:\\.|[^\\)])*\))*[^\]]*\]/; tok = $&
      when /\A[a-zA-Z0-9_*]+/; tok = $&.to_sym rescue nil
      when /\A\S+/, /\A\s+/
      end
      str = str[$&.length..-1]
      yield tok if tok
    end
  end

  # renders the lines, according to the layout (almost ;) )
  def to_s
    # text blocks without any line have no position: ignore them
    blocks = @lines.reject { |la| la.empty? }
    mx = blocks.flatten.map { |l| l.x }.min
    py = nil
    strs = ['']
    blocks.sort_by { |la| -la.map { |l| l.y }.max.to_i }.each { |la|
      y = la.map { |l| l.y }.max
      strs.concat ['']*((py-y)/12) if py and py > y
      la.sort_by { |l| [-l.y, l.x] }.each { |l|
        # 9 == base font size
        strs << '' if y > l.y+l.fonty*0.9 or strs.last.length*1000/Line::CHARWIDTH/9 > l.x-mx
        strs[-1] = strs.last.ljust((l.x-mx)*1000/Line::CHARWIDTH/9-1) << ' ' << l.str
        y = l.y
      }
      py = y if not py or py > y
    }
    strs.join("\n")
  end
end

if __FILE__ == $0
  require 'pp'

  begin
    # create the parser before reading, so that the offset of a parse error can be shown
    pdf = PDF.new
    pdf.read VString.read(ARGV.shift)
    if $VERBOSE
      puts 'Info: '
      pp pdf.deref(pdf.trailer['Info'])
      puts
    end
    if not ARGV.empty?
      ARGV.each { |pagenr| puts pdf.page(pagenr.to_i) }
    else
      puts 'Pages: '
      pagecnt = 0
      pdf.each_page { |page|
        pagecnt += 1
        puts " ------- p.#{pagecnt} ---------", page
      }
    end
  rescue
    puts "at #{pdf.off.to_s(16) if pdf and pdf.off}", $!, $!.backtrace[0, 24]
  end
end

__END__
PostScript text formatting, shamelessly ripped from the web (http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/)

Object 3, which contains the contents of page one of our document, is worth commenting on since it shows how text streams are used in PDF. The object looks like:

3 0 obj
<< /Length 168 >>
stream
BT
/F4 1 Tf
12 0 0 12 50.64 731.52 Tm
0 0 0 rg
BX /GS2 gs EX
0 Tc
0 Tw
@charspc = charspc
[(This is 12-point )10(T)41(imes. )
18(This sentence will appear near the top of page one.)]TJ
ET
endstream
endobj

The stream object (which is 168 bytes long) is bracketed by BT and ET operators, for Begin Text and End Text. The Tf command selects our font and its size in user-space units, which is given as 1. "But aren't we using 12-point type?" you may be wondering. Yes, we are. That's specified in the next line, ending in Tm (which is the set-text-matrix operator). For space reasons, we won't say much about coordinate system transformations and matrices here, but if you're familiar with the use of matrices in PostScript, the same rules apply in PDF.

A transform matrix is given by an array of six numbers, the first and fourth of which determine scaling in x and y, respectively. We see in our text matrix, the scaling factor is 12. That means we will use 12-point type. The last two numbers in the matrix (50.64 and 731.52) specify a translation, in user-space units. The effect of the translation is to put our text approximately 10.1 inches high on the page, with a left margin of 0.7 inch.

The line ending with rg sets our ink color to an RGB value of 0 0 0, or black. The BX operator says that we are beginning a section that allows undefined operators.
In this section, we apply the gs operator (which sets parameters in the extended graphics state), using /GS2 as our EGS specifications. The EX operator ends the section allowing undefined operators. In essence, we're saying "Any reading application that understands what's in this special section can execute the instructions contained there, but if you don't understand the instructions, just go on." The reason this section has to be handled this way is that extended graphics state instructions often contain device-dependent instructions. The lack of generality means we should bracket those instructions with BX/EX.

The Tc and Tw operators are for setting character spacing and word spacing, respectively.

Finally, we come to the text that will be displayed on our page. Oddly enough, it's specified in an array of text snippets interspersed with integers, such as:

(This is 12-point )10(T)41(imes. )

The number 10 represents a kerning value, in thousandths of an em. (An em is a typographical unit of measurement equal to the size of the font.) This number is subtracted from the 'x' coordinate of the letter(s) that follow, displacing the text to the left. The capital 'T' is displaced 10 units to the left, while "imes. " is displaced 41 units.

The TJ at the end of the array is the operator for "show text, allowing individual character spacing." Finally, ET closes off the text block, and endstream closes off the stream.

b	closepath, fill, and stroke path.
B	fill and stroke path.
b*	closepath, eofill, and stroke path.
B*	eofill and stroke path.
BI	begin image.
BMC	begin marked content.
BT	begin text object.
BX	begin section allowing undefined operators.
c	curveto.
cm	concat. Concatenates the matrix to the current transform.
cs	setcolorspace for fill.
CS	setcolorspace for stroke.
d	setdash.
Do	execute the named XObject.
DP	mark a place in the content stream, with a dictionary.
EI	end image.
EMC	end marked content.
ET	end text object.
EX	end section that allows undefined operators.
f	fill path.
f*	eofill Even/odd fill path.
g	setgray (fill).
G	setgray (stroke).
gs	set parameters in the extended graphics state.
h	closepath.
i	setflat.
ID	begin image data.
j	setlinejoin.
J	setlinecap.
k	setcmykcolor (fill).
K	setcmykcolor (stroke).
l	lineto.
m	moveto.
M	setmiterlimit.
n	end path without fill or stroke.
q	save graphics state.
Q	restore graphics state.
re	rectangle.
rg	setrgbcolor (fill).
RG	setrgbcolor (stroke).
s	closepath and stroke path.
S	stroke path.
sc	setcolor (fill).
SC	setcolor (stroke).
sh	shfill (shaded fill).
Tc	set character spacing.
Td	move text current point.
TD	move text current point and set leading.
Tf	set font name and size.
Tj	show text.
TJ	show text, allowing individual character positioning.
TL	set leading.
Tm	set text matrix.
Tr	set text rendering mode.
Ts	set super/subscripting text rise.
Tw	set word spacing.
Tz	set horizontal scaling.
T*	move to start of next line.
v	curveto.
w	setlinewidth.
W	clip.
y	curveto.