From 62ba505f10f70bb4aa44f67cbf41b6e0dc6c16c6 Mon Sep 17 00:00:00 2001
From: et <>
Date: Wed, 7 Apr 2010 03:32:50 +0000
Subject: [PATCH] Extra parsing modules

git-svn-id: file:///home/svn/framework3/trunk@9030 4d416f70-5f16-0410-b530-b9f4589650da
---
 data/msfcrawler/flash.rb   | 28 +++++++------
 data/msfcrawler/objects.rb | 85 ++++++++++++++++++++++++++++++++++++++
 data/msfcrawler/scripts.rb | 85 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 12 deletions(-)
 create mode 100644 data/msfcrawler/objects.rb
 create mode 100644 data/msfcrawler/scripts.rb

diff --git a/data/msfcrawler/flash.rb b/data/msfcrawler/flash.rb
index 9c6385f89b..2ae549d90f 100644
--- a/data/msfcrawler/flash.rb
+++ b/data/msfcrawler/flash.rb
@@ -3,14 +3,18 @@ require 'pathname'
 require 'uri'
 
 
-$flarebinary = "/Users/et/Downloads/flare"
-$flareoutdir = "/Users/et/Downloads/temp/"
+$flarebinary = "/home/et/Downloads/flare"
+$flareoutdir = "/home/et/Downloads/"
 
 class CrawlerFlash < BaseParser
 
 
 	def parse(request,result)
-		
+		rexp = ['loadMovieNum\(\'(.*?)\'',
+			'loadMovie\(\'(.*?)\'',
+			'getURL\(\'(.*?)\''
+			]		
+
 		
 		if !result['Content-Type'].include? "application/x-shockwave-flash"
 			return
@@ -22,7 +26,8 @@ class CrawlerFlash < BaseParser
 		
 		ffile = File.new(outswf, "wb")    
 		ffile.puts(result.body)
-		
+		ffile.close		
+
 		system("#{$flarebinary} #{outswf}")
 		
 		outflr = outswf.gsub('.swf','.flr')
@@ -36,14 +41,13 @@ class CrawlerFlash < BaseParser
 		
 		File.open(outflr, "r") do |infile|
 			while (line = infile.gets)
-			
-				links = line.to_s.scan(/\b(?:(?:https?|ftp):\/\/|www\.)[-a-z0-9+&@#\/%?=~_|!:,.;]*[-a-z0-9+&@#\/%=~_|]/i) #" 
+
+			rexp.each do |r|						
+				links = line.to_s.scan(Regexp.new(r,true)) #" 
 				links.each do |link| 
 				
-					puts "SWF: #{link}"
-			
 					begin
-						uri = URI.parse(link)
+						uri = URI.parse(link[0])
 			
 						tssl = false
 						if uri.scheme == "https"
@@ -94,6 +98,7 @@ class CrawlerFlash < BaseParser
 							'data'		=> nil
 						}
 				
+					
 						insertnewpath(hreq)
 					
 					rescue URI::InvalidURIError
@@ -102,9 +107,8 @@ class CrawlerFlash < BaseParser
 					end
 				end	
 			end
-		end
-										
-		puts "Done. "
+			end
+		end										
 	end 
 end
 
diff --git a/data/msfcrawler/objects.rb b/data/msfcrawler/objects.rb
new file mode 100644
index 0000000000..ca9b706b9f
--- /dev/null
+++ b/data/msfcrawler/objects.rb
@@ -0,0 +1,85 @@
+require 'rubygems'
+require 'pathname'
+require 'hpricot'
+require 'uri'
+
+# Crawler parser module: extracts the targets of <embed> elements nested in
+# <object> elements of HTML responses. Each usable target is handed to
+# insertnewpath as a new GET request.
+class CrawlerObjects < BaseParser
+
+	# Reads the 'src' attribute of every //object/embed node in result.body and
+	# passes one request hash per usable link to insertnewpath.
+	#
+	# Non-HTML responses and embeds without a usable 'src' are ignored.
+	#
+	# request - hash describing the request that produced result; 'rhost',
+	#           'rport' and 'uri' are the fallbacks for relative links.
+	# result  - response object; read through ['Content-Type'] and #body.
+	def parse(request,result)
+		# A response without a Content-Type header gives nil here; treat it as
+		# "not HTML" instead of failing with NoMethodError.
+		ctype = result['Content-Type']
+		return if ctype.nil? or !ctype.include?("text/html")
+
+		doc = Hpricot(result.body.to_s)
+		doc.search("//object/embed").each do |obj|
+			s = obj['src']
+
+			# Embeds without a 'src' have nothing to request; skipping them
+			# here also avoids feeding nil to URI.parse.
+			next if s.nil? or s.empty?
+
+			begin
+				uri = URI.parse(s)
+
+				# A link without a host is relative to the current target and
+				# inherits its host and SSL setting.
+				if uri.host
+					thost = uri.host
+					tssl = (uri.scheme == "https")
+				else
+					thost = request['rhost']
+					tssl = self.targetssl
+				end
+
+				# Fall back to the current target's port and to the root path.
+				tport = uri.port || request['rport']
+				tpath = uri.path || "/"
+
+				# Resolve a relative path against the requesting page: below it
+				# when that page is a directory ('/'-terminated), otherwise next
+				# to it.
+				newp = Pathname.new(tpath)
+				oldp = Pathname.new(request['uri'])
+
+				if !newp.absolute?
+					if oldp.to_s[-1,1] == '/'
+						newp = oldp+newp
+					elsif !newp.to_s.empty?
+						newp = File.join(oldp.dirname,newp)
+					end
+				end
+
+				# Request hash consumed by insertnewpath.
+				hreq = {
+					'rhost'		=> thost,
+					'rport'		=> tport,
+					'uri'  		=> newp.to_s,
+					'method'   	=> 'GET',
+					'ctype'		=> 'text/html',
+					'ssl'		=> tssl,
+					'query'		=> uri.query,
+					'data'		=> nil
+				}
+
+				insertnewpath(hreq)
+
+			rescue URI::InvalidURIError
+				# Malformed links are skipped on purpose so that one bad
+				# attribute does not stop processing of the remaining elements.
+			end
+		end
+	end
+end
+
diff --git a/data/msfcrawler/scripts.rb b/data/msfcrawler/scripts.rb
new file mode 100644
index 0000000000..bfcbb5fe0b
--- /dev/null
+++ b/data/msfcrawler/scripts.rb
@@ -0,0 +1,85 @@
+require 'rubygems'
+require 'pathname'
+require 'hpricot'
+require 'uri'
+
+# Crawler parser module: extracts the external script files referenced by
+# <script> elements of HTML responses. Each usable target is handed to
+# insertnewpath as a new GET request.
+class CrawlerScripts < BaseParser
+
+	# Reads the 'src' attribute of every //script node in result.body and
+	# passes one request hash per usable link to insertnewpath.
+	#
+	# Non-HTML responses and scripts without a usable 'src' are ignored.
+	#
+	# request - hash describing the request that produced result; 'rhost',
+	#           'rport' and 'uri' are the fallbacks for relative links.
+	# result  - response object; read through ['Content-Type'] and #body.
+	def parse(request,result)
+		# A response without a Content-Type header gives nil here; treat it as
+		# "not HTML" instead of failing with NoMethodError.
+		ctype = result['Content-Type']
+		return if ctype.nil? or !ctype.include?("text/html")
+
+		doc = Hpricot(result.body.to_s)
+		doc.search("//script").each do |obj|
+			s = obj['src']
+
+			# Inline <script> blocks carry no 'src'; skipping them here also
+			# avoids feeding nil to URI.parse.
+			next if s.nil? or s.empty?
+
+			begin
+				uri = URI.parse(s)
+
+				# A link without a host is relative to the current target and
+				# inherits its host and SSL setting.
+				if uri.host
+					thost = uri.host
+					tssl = (uri.scheme == "https")
+				else
+					thost = request['rhost']
+					tssl = self.targetssl
+				end
+
+				# Fall back to the current target's port and to the root path.
+				tport = uri.port || request['rport']
+				tpath = uri.path || "/"
+
+				# Resolve a relative path against the requesting page: below it
+				# when that page is a directory ('/'-terminated), otherwise next
+				# to it.
+				newp = Pathname.new(tpath)
+				oldp = Pathname.new(request['uri'])
+
+				if !newp.absolute?
+					if oldp.to_s[-1,1] == '/'
+						newp = oldp+newp
+					elsif !newp.to_s.empty?
+						newp = File.join(oldp.dirname,newp)
+					end
+				end
+
+				# Request hash consumed by insertnewpath.
+				hreq = {
+					'rhost'		=> thost,
+					'rport'		=> tport,
+					'uri'  		=> newp.to_s,
+					'method'   	=> 'GET',
+					'ctype'		=> 'text/html',
+					'ssl'		=> tssl,
+					'query'		=> uri.query,
+					'data'		=> nil
+				}
+
+				insertnewpath(hreq)
+
+			rescue URI::InvalidURIError
+				# Malformed links are skipped on purpose so that one bad
+				# attribute does not stop processing of the remaining elements.
+			end
+		end
+	end
+end
+