Extra parsing modules

git-svn-id: file:///home/svn/framework3/trunk@9030 4d416f70-5f16-0410-b530-b9f4589650da
Branch: unstable
Author: et
Date: 2010-04-07 03:32:50 +00:00
Parent: c8aae09827
Commit: 62ba505f10
3 changed files with 186 additions and 12 deletions

Modified file: CrawlerFlash parser

@@ -3,14 +3,18 @@ require 'pathname'
 require 'uri'
-$flarebinary = "/Users/et/Downloads/flare"
-$flareoutdir = "/Users/et/Downloads/temp/"
+$flarebinary = "/home/et/Downloads/flare"
+$flareoutdir = "/home/et/Downloads/"
 class CrawlerFlash < BaseParser
   def parse(request,result)
+    rexp = ['loadMovieNum\(\'(.*?)\'',
+      'loadMovie\(\'(.*?)\'',
+      'getURL\(\'(.*?)\''
+    ]
     if !result['Content-Type'].include? "application/x-shockwave-flash"
       return
@@ -22,7 +26,8 @@ class CrawlerFlash < BaseParser
     ffile = File.new(outswf, "wb")
     ffile.puts(result.body)
+    ffile.close
     system("#{$flarebinary} #{outswf}")
     outflr = outswf.gsub('.swf','.flr')
@@ -36,14 +41,13 @@ class CrawlerFlash < BaseParser
     File.open(outflr, "r") do |infile|
       while (line = infile.gets)
-        links = line.to_s.scan(/\b(?:(?:https?|ftp):\/\/|www\.)[-a-z0-9+&@#\/%?=~_|!:,.;]*[-a-z0-9+&@#\/%=~_|]/i) #"
+        rexp.each do |r|
+          links = line.to_s.scan(Regexp.new(r,true)) #"
         links.each do |link|
-          puts "SWF: #{link}"
           begin
-            uri = URI.parse(link)
+            uri = URI.parse(link[0])
             tssl = false
             if uri.scheme == "https"
@@ -94,6 +98,7 @@ class CrawlerFlash < BaseParser
               'data' => nil
             }
             insertnewpath(hreq)
           rescue URI::InvalidURIError
@@ -102,9 +107,8 @@ class CrawlerFlash < BaseParser
             end
           end
         end
       end
+      end
-    puts "Done. "
   end
 end
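The move from one catch-all URL regex to the rexp array also changes what scan hands back: each ActionScript pattern (loadMovieNum, loadMovie, getURL) carries a capture group, so every match arrives as a one-element array instead of a string, which is why URI.parse(link) became URI.parse(link[0]). A minimal standalone sketch of that behavior, outside the crawler, with a made-up input line:

line = "getURL('http://example.com/intro.swf'); loadMovie('menu.swf');"
rexp = ['loadMovieNum\(\'(.*?)\'',
  'loadMovie\(\'(.*?)\'',
  'getURL\(\'(.*?)\''
]
rexp.each do |r|
  # Regexp.new(r,true) compiles the pattern case-insensitively,
  # matching the /i flag on the regex it replaces.
  line.scan(Regexp.new(r,true)).each do |link|
    puts link[0]   # prints "menu.swf", then "http://example.com/intro.swf"
  end
end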

New file: CrawlerObjects parser

@@ -0,0 +1,85 @@
require 'rubygems'
require 'pathname'
require 'hpricot'
require 'uri'
class CrawlerObjects < BaseParser
  def parse(request,result)
    if !result['Content-Type'].include? "text/html"
      return
    end
    hr = ''
    m = ''
    doc = Hpricot(result.body.to_s)
    doc.search("//object/embed").each do |obj|
      s = obj['src']
      begin
        uri = URI.parse(s)
        tssl = false
        if uri.scheme == "https"
          tssl = true
        else
          tssl = false
        end
        if !uri.host or uri.host == nil
          thost = request['rhost']
          tssl = self.targetssl
        else
          thost = uri.host
        end
        if !uri.port or uri.port == nil
          tport = request['rport']
        else
          tport = uri.port
        end
        if !uri.path or uri.path == nil
          tpath = "/"
        else
          tpath = uri.path
        end
        newp = Pathname.new(tpath)
        oldp = Pathname.new(request['uri'])
        if !newp.absolute?
          if oldp.to_s[-1,1] == '/'
            newp = oldp+newp
          else
            if !newp.to_s.empty?
              newp = File.join(oldp.dirname,newp)
            end
          end
        end
        hreq = {
          'rhost' => thost,
          'rport' => tport,
          'uri' => newp.to_s,
          'method' => 'GET',
          'ctype' => 'text/html',
          'ssl' => tssl,
          'query' => uri.query,
          'data' => nil
        }
        #puts hreq
        insertnewpath(hreq)
      rescue URI::InvalidURIError
        #puts "Parse error"
        #puts "Error: #{link[0]}"
      end
    end
  end
end
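CrawlerObjects feeds the crawler anything referenced from embed tags nested inside object tags. The only Hpricot calls it depends on are Hpricot(html) to build the document, search to select elements, and element['src'] to read the attribute. A standalone sketch of that lookup, with made-up markup:

require 'rubygems'
require 'hpricot'

html = '<html><body><object><embed src="/media/player.swf"></embed></object></body></html>'
doc = Hpricot(html)
doc.search("//object/embed").each do |obj|
  puts obj['src']   # prints "/media/player.swf"
end

Whatever the search yields then goes through the same host/port/path normalization and insertnewpath queueing as the flash parser.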

New file: CrawlerScripts parser

@@ -0,0 +1,85 @@
require 'rubygems'
require 'pathname'
require 'hpricot'
require 'uri'
class CrawlerScripts < BaseParser
  def parse(request,result)
    if !result['Content-Type'].include? "text/html"
      return
    end
    hr = ''
    m = ''
    doc = Hpricot(result.body.to_s)
    doc.search("//script").each do |obj|
      s = obj['src']
      begin
        uri = URI.parse(s)
        tssl = false
        if uri.scheme == "https"
          tssl = true
        else
          tssl = false
        end
        if !uri.host or uri.host == nil
          thost = request['rhost']
          tssl = self.targetssl
        else
          thost = uri.host
        end
        if !uri.port or uri.port == nil
          tport = request['rport']
        else
          tport = uri.port
        end
        if !uri.path or uri.path == nil
          tpath = "/"
        else
          tpath = uri.path
        end
        newp = Pathname.new(tpath)
        oldp = Pathname.new(request['uri'])
        if !newp.absolute?
          if oldp.to_s[-1,1] == '/'
            newp = oldp+newp
          else
            if !newp.to_s.empty?
              newp = File.join(oldp.dirname,newp)
            end
          end
        end
        hreq = {
          'rhost' => thost,
          'rport' => tport,
          'uri' => newp.to_s,
          'method' => 'GET',
          'ctype' => 'text/html',
          'ssl' => tssl,
          'query' => uri.query,
          'data' => nil
        }
        #puts hreq
        insertnewpath(hreq)
      rescue URI::InvalidURIError
        #puts "Parse error"
        #puts "Error: #{link[0]}"
      end
    end
  end
end
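Both new parsers resolve a relative src the same way: an absolute path is taken as-is, a path found on a directory-style request URI is appended to it, and anything else is joined onto the requesting page's directory. A standalone sketch of that Pathname logic (the resolve helper and the sample paths are made up for illustration, and .to_s is added around the File.join arguments so the sketch also runs on Ruby versions where Pathname is not implicitly convertible to String):

require 'pathname'

# Hypothetical helper mirroring the resolution step in both parsers.
def resolve(request_uri, src)
  newp = Pathname.new(src)
  oldp = Pathname.new(request_uri)
  if !newp.absolute?
    if oldp.to_s[-1,1] == '/'
      # request URI looks like a directory: append the relative path
      newp = oldp + newp
    elsif !newp.to_s.empty?
      # otherwise join onto the directory of the requesting page
      newp = File.join(oldp.dirname.to_s, newp.to_s)
    end
  end
  newp.to_s
end

puts resolve('/app/index.html', 'js/site.js')    # prints "/app/js/site.js"
puts resolve('/app/', 'js/site.js')              # prints "/app/js/site.js"
puts resolve('/app/index.html', '/static/a.js')  # prints "/static/a.js"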