From 5d05ca154abca8cbca754295242f87dd52b1d8a3 Mon Sep 17 00:00:00 2001
From: Brent Cook
Date: Mon, 14 Aug 2017 01:08:53 -0400
Subject: [PATCH] added http client 'download' method and updates to pdf
 author module from @bcoles

---
 lib/msf/core/exploit/http/client.rb          |  41 ++++-
 modules/auxiliary/gather/http_pdf_authors.rb | 179 ++++++++++++-------
 2 files changed, 149 insertions(+), 71 deletions(-)

diff --git a/lib/msf/core/exploit/http/client.rb b/lib/msf/core/exploit/http/client.rb
index 5944a5c822..d7bcc0c46e 100644
--- a/lib/msf/core/exploit/http/client.rb
+++ b/lib/msf/core/exploit/http/client.rb
@@ -511,7 +511,16 @@ module Exploit::Remote::HttpClient
   #
   # Returns a hash of request opts from a URL string
   def request_opts_from_url(url)
-    tgt = URI.parse(url)
+    # verify and extract components from the URL
+    begin
+      tgt = URI.parse(url)
+      raise 'Invalid URL' unless tgt.scheme =~ %r{https?}
+      raise 'Invalid URL' if tgt.host.to_s.eql? ''
+    rescue => e
+      print_error "Could not parse URL: #{e}"
+      return nil
+    end
+
     opts = { 'rhost' => tgt.host, 'rport' => tgt.port, 'uri' => tgt.request_uri }
     opts['SSL'] = true if tgt.scheme == 'https'
     if tgt.query and tgt.query.size > 13
@@ -528,11 +537,39 @@
   #
   # Returns response from a simple URL call
   def request_url(url, keepalive = false)
-    res = send_request_raw(request_opts_from_url(url))
+    opts = request_opts_from_url(url)
+    return nil if opts.nil?
+    res = send_request_raw(opts)
     disconnect unless keepalive
     return res
   end
 
+  #
+  # Downloads a URL
+  def download(url)
+    print_status "Downloading '#{url}'"
+
+    begin
+      target = URI.parse url
+      raise 'Invalid URL' unless target.scheme =~ /https?/
+      raise 'Invalid URL' if target.host.to_s.eql? ''
+    rescue => e
+      print_error "Could not parse URL: #{e}"
+      return nil
+    end
+
+    res = request_url(url)
+
+    unless res
+      print_error 'Connection failed'
+      return nil
+    end
+
+    print_status "- HTTP #{res.code} - #{res.body.length} bytes"
+
+    res.code == 200 ? res.body : nil
+  end
+
   # removes HTML tags from a provided string.
   # The string is html-unescaped before the tags are removed
   # Leading whitespaces and double linebreaks are removed too
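[Not part of the patch: a minimal sketch of how a module that mixes in Msf::Exploit::Remote::HttpClient could call the new helper. The URL and messages are illustrative only; per the hunk above, download returns the response body for an HTTP 200 and nil for an invalid URL, a failed connection, or any other status code.]

# Illustrative sketch only; assumes it runs inside a module that
# includes Msf::Exploit::Remote::HttpClient.
pdf = download('http://example.com/files/report.pdf')
if pdf.nil?
  print_error 'Download failed'   # invalid URL, connection error, or non-200 response
else
  print_good "Retrieved #{pdf.length} bytes"
end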
diff --git a/modules/auxiliary/gather/http_pdf_authors.rb b/modules/auxiliary/gather/http_pdf_authors.rb
index b49281b501..3217609ca2 100644
--- a/modules/auxiliary/gather/http_pdf_authors.rb
+++ b/modules/auxiliary/gather/http_pdf_authors.rb
@@ -1,36 +1,47 @@
 ##
-# This module requires Metasploit: http://metasploit.com/download
+# This module requires Metasploit: https://metasploit.com/download
 # Current source: https://github.com/rapid7/metasploit-framework
 ##
 
 require 'pdf-reader'
 
 class MetasploitModule < Msf::Auxiliary
-
   include Msf::Exploit::Remote::HttpClient
+  include Msf::Auxiliary::Report
 
   def initialize(info = {})
     super(update_info(info,
       'Name'           => 'Gather PDF Authors',
       'Description'    => %q{
-        This module downloads PDF files and extracts the author's
+        This module downloads PDF documents and extracts the author's
         name from the document metadata.
+
+        This module expects a URL to be provided using the URL option.
+        Alternatively, multiple URLs can be provided by supplying the
+        path to a file containing a list of URLs in the URL_LIST option.
+
+        The URL_TYPE option is used to specify the type of URLs supplied.
+
+        By specifying 'pdf' for the URL_TYPE, the module will treat
+        the specified URL(s) as PDF documents. The module will
+        download the documents and extract the authors' names from the
+        document metadata.
+
+        By specifying 'html' for the URL_TYPE, the module will treat
+        the specified URL(s) as HTML pages. The module will scrape the
+        pages for links to PDF documents, download the PDF documents,
+        and extract the author's name from the document metadata.
       },
       'License'        => MSF_LICENSE,
       'Author'         => 'Brendan Coles <bcoles[at]gmail.com>'))
     register_options(
       [
-        OptString.new('URL', [ false, 'The URL of a PDF to analyse', '' ]),
-        OptString.new('URL_LIST', [ false, 'File containing a list of PDF URLs to analyze', '' ]),
-        OptString.new('OUTFILE', [ false, 'File to store output', '' ])
-      ])
-    register_advanced_options(
-      [
-        OptString.new('SSL_VERIFY', [ true, 'Verify SSL certificate', true ]),
-        OptString.new('PROXY', [ false, 'Proxy server to route connection. <host>:<port>', nil ]),
-        OptString.new('PROXY_USER', [ false, 'Proxy Server User', nil ]),
-        OptString.new('PROXY_PASS', [ false, 'Proxy Server Password', nil ])
+        OptString.new('URL', [ false, 'The target URL', '' ]),
+        OptString.new('URL_LIST', [ false, 'File containing a list of target URLs', '' ]),
+        OptEnum.new('URL_TYPE', [ true, 'The type of URL(s) specified', 'html', [ 'pdf', 'html' ] ]),
+        OptBool.new('STORE_LOOT', [ false, 'Store authors in loot', true ])
       ])
+    deregister_options 'RHOST', 'RPORT', 'VHOST'
   end
 
   def progress(current, total)
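[Not part of the patch: a standalone sketch of the pdf-reader call this module relies on for the metadata extraction described above. The file name is illustrative; reader.info returns the document metadata hash and :Author may be absent or not a String.]

# Illustrative sketch only; requires the pdf-reader gem.
require 'pdf-reader'

reader = PDF::Reader.new('example.pdf')   # also accepts an IO object
author = reader.info[:Author]             # document metadata hash
puts author.is_a?(String) ? author.split(/\r?\n/).first : '(no author recorded)'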
#{reader.info[:Producer]}" # Author reader.info[:Author].class == String ? reader.info[:Author].split(/\r?\n/).first : '' end - def download(url) - print_status "Downloading PDF from '#{url}'" - - res = request_url(url) - print_status "HTTP #{res.code} -- Downloaded PDF (#{res.body.length} bytes)" - - return res.code == 200 ? StringIO.new(res.body) : StringIO.new - end - - def write_output(data) - return if datastore['OUTFILE'].to_s.eql? '' - - print_status "Writing data to #{datastore['OUTFILE']}..." - file_name = datastore['OUTFILE'] - - if FileTest::exist?(file_name) - print_status 'OUTFILE already exists, appending..' - end - - File.open(file_name, 'ab') do |fd| - fd.write(data) - end - end - def run - urls = load_urls + + if datastore['URL_TYPE'].eql? 'html' + urls = extract_pdf_links urls + + if urls.empty? + print_error 'Found no links to PDF files' + return + end + + print_line + print_good "Found links to #{urls.size} PDF files:" + print_line urls.join "\n" + print_line + end + + authors = extract_authors urls + + print_line + + if authors.empty? + print_status 'Found no authors' + return + end + + print_good "Found #{authors.size} authors: #{authors.join ', '}" + + return unless datastore['STORE_LOOT'] + + p = store_loot 'pdf.authors', 'text/plain', nil, authors.join("\n"), 'pdf.authors.txt', 'PDF authors' + print_good "File saved in: #{p}" + end + + def extract_pdf_links(urls) print_status "Processing #{urls.size} URLs..." + + pdf_urls = [] + urls.each_with_index do |url, index| + next if url.blank? + html = download url + next if html.blank? + doc = Nokogiri::HTML html + doc.search('a[href]').select { |n| n['href'][/(\.pdf$|\.pdf\?)/] }.map do |n| + pdf_urls << URI.join(url, n['href']).to_s + end + progress(index + 1, urls.size) + end + + pdf_urls.uniq + end + + def extract_authors(urls) + print_status "Processing #{urls.size} URLs..." + authors = [] max_len = 256 urls.each_with_index do |url, index| next if url.blank? - contents = download url - next if contents.blank? - author = read contents + file = download url + next if file.blank? + pdf = StringIO.new + pdf.puts file + author = read pdf unless author.blank? print_good "PDF Author: #{author}" if author.length > max_len @@ -132,14 +181,6 @@ class MetasploitModule < Msf::Auxiliary progress(index + 1, urls.size) end - print_line - - if authors.empty? - print_status 'Found no authors' - return - end - - print_good "Found #{authors.size} authors: #{authors.join ', '}" - write_output authors.join "\n" + authors.uniq end end