# metasploit-framework/lib/metasm/misc/ppc_pdf2oplist.rb

#    This file is part of Metasm, the Ruby assembly manipulation suite
#    Copyright (C) 2006-2009 Yoann GUILLOT
#
#    Licence is LGPL, see LICENCE in the top-level directory
#
# parses the PPC specification PDF to generate the opcode list
#

require 'pdfparse'

# field name (Symbol) => bitmask of the field, before shifting (filled by make_instr)
$field_mask = {}
# field name (Symbol) => left shift of the field inside the 32bit opcode (filled by make_instr)
$field_shift = {}
# list of [mnemonic, binary encoding, argument field list] entries (filled by make_instr)
$opcodes = []
# Builds the opcode entries for one instruction encoding and prints them.
#
# bins:: array of strings, one per bitfield of the encoding: a decimal
#        constant ("31"), a field name ("RT") or a run of '/' for a
#        reserved field
# bits:: array of integers, the index of the first bit of each bitfield
#        (bit 0 is the most significant bit of the 32bit word)
# text:: array of strings, the assembly forms sharing this encoding,
#        eg "fnabs FRT,FRB (Rc=0)"; the strings are left untouched
#
# For each form, appends [opname, binary, args] to $opcodes and prints the
# matching "addop" line on stdout. Fields seen for the first time are
# recorded in $field_mask / $field_shift.
# Raises a RuntimeError when a form refers to a bit or an argument that
# cannot be matched to a field of the encoding.
def make_instr(bins, bits, text)
	# calc bitfields length from their offset: a field ends where the next one starts
	last = 32
	bitlen = []
	bits.reverse_each { |bit|
		bitlen.unshift last-bit
		last = bit
	}

	# the opcode binary value (w/o fields)
	bin = 0
	fields = []

	# parse the data
	bins.zip(bits, bitlen).each { |val, off, len|
		# convert the msb-first bit index to a left shift
		off = 32-(off+len)
		msk = (1 << len) - 1
		case val
		when '/', '//', '///'	# reserved field, value unspecified
		when /^\d+$/; bin |= val.to_i << off	# constant field
		when /^[A-Za-z]+$/
			fld = val.downcase.to_sym
			# same name but different geometry: append '_' until the name is free or matches
			fld = "#{fld}_".to_sym while $field_mask[fld] and ($field_mask[fld] != msk or $field_shift[fld] != off)
			fields << fld
			$field_mask[fld] ||= msk
			$field_shift[fld] ||= off
		end
	}

	text.each { |txt|
		# fnabs FRT,FRB (Rc=0)
		curbin = bin
		curfields = fields.dup
		# typo: fdiv. has no '('
		# use the non-destructive sub so the caller's (possibly frozen) string is not modified
		txt = txt.sub('  Rc=1)', '  (Rc=1)') if txt.include? 'fdiv.'
		if txt =~ /(.*\S)\s*\((\w+=.*)\)/
			txt = $1
			# fixed bits listed between parenthesis are folded in the binary value
			$2.split.each { |e|
				raise e if e !~ /(\w+)=(\d+)/
				name, val = $1.downcase, $2.to_i
				raise "bad bit #{name} in #{txt}" if not fld = curfields.find { |fld_| fld_.to_s.delete('_') == name }
				curfields.delete fld
				curbin |= val << $field_shift[fld]
			}
		end
		opname, args = txt.split(/\s+/, 2)
		# map each textual argument to its field (nil if no field has that name)
		args = args.to_s.downcase.split(/\s*,\s*/).map { |arg|
			fld = curfields.find { |fld_| fld_.to_s.delete('_') == arg }
			curfields.delete fld
			fld
		}
		# an unmatched argument with ra + displacement left over is a memory operand (eg "D(RA)")
		if args.include? nil and curfields.length == 2 and (curfields - [:ra, :d]).empty?
			args[args.index(nil)] = :ra_i16
			curfields.clear
		elsif args.include? nil and curfields.length == 2 and (curfields - [:ra, :ds]).empty?
			args[args.index(nil)] = :ra_i16s
			curfields.clear
		elsif args.include? nil and curfields.length == 2 and (curfields - [:ra, :dq]).empty?
			args[args.index(nil)] = :ra_i16q
			curfields.clear
		elsif args.include? nil and curfields.length == 1
			# only one candidate left: use it
			args[args.index(nil)] = curfields.shift
		end
		raise "bad args #{args.inspect} (#{curfields.inspect}) in #{txt}" if args.include? nil
		$opcodes << [opname, curbin, args]

		n = (opname.inspect << ',').ljust(10) + '0x%08X' % curbin
		n << ', ' if not args.empty?
		puts "\taddop " + n + args.map { |e| e.inspect }.join(', ')
	}
end

# handle instruction aliases
# NOT WORKING: the alias encoding is computed but never stored, only a
# commented-out line describing the alias is printed
# should be implemented in the parser/displayer instead of opcode list
# manual work needed for eg conditional jumps
#
# newop::   mnemonic of the alias
# newargs:: array of strings, the arguments of the alias
# oldop::   mnemonic of the aliased instruction, must already be in $opcodes
# oldargs:: array of strings, the arguments of the aliased instruction;
#           decimal or 0x-prefixed hexadecimal entries are constants
#
# Raises a RuntimeError if +oldop+ is not a known opcode.
def make_alias(newop, newargs, oldop, oldargs)
	raise "unknown alias #{newop} => #{oldop}" if not op = $opcodes.reverse.find { |op_| op_[0] == oldop }
	op2 = op.dup
	op2[0] = newop
	oldargs.each_with_index { |oa, i|
		# XXX bcctr 4, 6  ->  bcctr 4, 6, 0 => not the work
		# parse explicitly as base 10 / 16: Integer('08') would be rejected as bad octal
		val = if oa =~ /\A[0-9]+\z/
			oa.to_i
		elsif oa =~ /\A0x[0-9a-f]+\z/i
			oa.hex
		end
		next if not val
		# the constant may have no corresponding field (more args than fields)
		# or target a synthetic operand such as :ra_i16 which has no shift
		shift = $field_shift[op[2][i]]
		next if not shift
		op2[1] |= val << shift
	}
	puts "#\talias #{newop} #{newargs.join(', ')}  ->  #{oldop} #{oldargs.join(', ')}".downcase
end

require 'enumerator'
# Prints the @field_shift and @field_mask hash literals built from
# $field_shift and $field_mask on stdout, sorted by field name, 6 entries
# per line. Mask values above 1000 are shown in hexadecimal.
def epilog
	puts "\n\t@field_shift = {"
	# each_slice replaces enum_slice, which no longer exists since ruby 1.9
	puts $field_shift.sort_by { |k, v| k.to_s }.each_slice(6).map { |slc|
		"\t\t" + slc.map { |k, v| "#{k.inspect} => #{v}" }.join(', ')
	}.join(",\n")
	puts "\t}"
	puts "\n\t@field_mask = {"
	puts $field_mask.sort_by { |k, v| k.to_s }.each_slice(6).map { |slc|
		"\t\t" + slc.map { |k, v| "#{k.inspect} => #{v > 1000 ? '0x%X' % v : v}" }.join(', ')
	}.join(",\n")
	puts "\t}"
end

# true when an instruction encoding has been parsed since the last
# 'Special Registers Altered' marker (set and reset by parse_page)
$foundop = false
# Scans the text lines of one page column for instruction definitions.
#
# lines:: array of objects responding to str, x, y, fontx and fonty
#         (as used below); the str of these objects may be appended to
#
# Each encoding found is handed to make_instr, each row of an
# 'Extended: / Equivalent to:' table naming a known opcode to make_alias.
# Raises 'nofoundop' when a 'Special Registers Altered' marker is reached
# without any encoding parsed since the previous marker.
def parse_page(lines)
	# all instr defining pages include this
	return unless lines.find { |l| l.str =~ /Special Registers Altered|Memory Barrier Instructions|Data Cache Instructions/  }	# sync L/dcbt

	ilist = [] # line buffer
	extended = false	# true while reading the rows of an alias table

	# concat lines with same y
	# order by decreasing y, then increasing x
	lines = lines.sort_by { |l| [-l.y, l.x] }
	lastline = nil
	# a line is merged in the previous kept one (and dropped from the list) when it
	# has the same y and either the same font size or only whitespace
	lines.delete_if { |l|
		if lastline and lastline.y == l.y and ([lastline.fontx, lastline.fonty] == [l.fontx, l.fonty] or l.str =~ /^\s*$/)
			lastline.str << ' ' << l.str
			true
		else
			lastline = l
			false
		end
	}

	lines.each { |l|
		# search for the bit indices list
		# ie a small font line going from 0 to 31, with as many entries as the
		# previously buffered line (or one more)
		if l.fonty < 7 and l.str =~ /^0 [\d ]+ 31\s*$/ and (ilist.last.str.split.length == l.str.split.length or ilist.last.str.split.length == l.str.split.length-1)
			$foundop = true
			bitindices = l.str.split.map { |i| i.to_i }
			# previous line is the binary encoding
			encoding = ilist.pop.str.split
			# drop the final index when the encoding has fewer entries
			bitindices.pop if encoding.length < bitindices.length
			# previous line is the instruction text format
			# (after skipping an optional POWER mnemonic note)
			ilist.pop if ilist.last.str =~ /\[POWER2? mnemonics?: (.*)\]/
			text = []
			# walk the buffer backwards: the first line popped is always kept, the
			# following ones only while they contain a ',' or end with ')'
			# nb: the block parameter l is reused as the loop variable here
			text.unshift l while l = ilist.pop and l = l.str and (l =~ /,|\)$/ or text.empty?)
			ilist = []
			make_instr(encoding, bitindices, text)
		elsif l.str.include? 'Special Registers Altered'
			if not $foundop
				# no encoding found since the previous marker: dump what was buffered and abort
				puts ilist.map { |l_| "(#{l_.y}) #{l_.str}" }
				puts lines.map { |l_| "(#{l_.y}) #{l_.str}" } if ilist.empty?
				raise 'nofoundop'
			else
				$foundop = false
			end
		elsif l.str =~ /Extended:\s+Equivalent to:/
			# header of an alias table
			extended = true
		elsif extended
			# alias row: 4 whitespace separated columns (newop newargs oldop oldargs),
			# containing a ',' and whose 3rd column is an already known opcode
			if l.str.include? ',' and l.str =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(.*)/ and $opcodes.find { |op| op[0] == $3 }
				newop, newargs, exop, exargs = $1, $2, $3, $4
				make_alias(newop, newargs.split(','), exop, exargs.split(','))
			# anything else ends the table (this line is not buffered)
			else extended = false
			end
		else ilist << l
		end
	}
end

# PowerPC Architecture v2.02:
#  1 - User Instruction Set
#  2 - Virtual Environment
#  3 - Operating Environment
# process every book found in the current directory, in name order
Dir['PPC_Vers202_Book?_public.pdf'].sort.each do |book|
	$stderr.puts book if $stderr.tty?
	pdf = PDF.read book
	pagecount = pdf.trailer['Root']['Pages']['Count'] || 0
	curpage = 0
	pdf.each_page do |page|
		# progress indicator, only when stderr is a terminal
		if $stderr.tty?
			curpage += 1
			$stderr.print "#{curpage}/#{pagecount} \r"
		end
		page.clip_lines(50, 740)

		# split columns at x = 288 and parse each one on its own
		left, right = page.lines.flatten.partition { |line| line.x < 288 }
		parse_page(left)
		parse_page(right)
	end
	# wipe the progress indicator
	$stderr.print "           \r" if $stderr.tty?
end

epilog()