Fix parsing for keyword search feature. Update HTML and text table output. Add reputation check filtering based on PR from @christruncer

2018-05-06 20:34:55 +02:00 · 2018-05-06 20:34:55 +02:00 · e3985c2c37
parent 26dd64870d
commit e3985c2c37
2 changed files with 115 additions and 86 deletions
--- a/README.md
+++ b/README.md
@ -8,6 +8,11 @@ This Python based tool was written to quickly query the Expireddomains.net searc

 ## Changes

+- 6 May 2018
+    + Fixed expired domains parsing when performing a keyword search
+    + Minor HTML and text table output updates
+    + Filtered reputation checks to only execute for .COM, .ORG, and .NET domains and removed check for Archive.org records when performing a default or keyword search. Credit to @christruncer for the original PR and idea.
+
 - 11 April 2018
    + Added OCR support for CAPTCHA solving with tesseract. Thanks to t94j0 for the idea in [AIRMASTER](https://github.com/t94j0/AIRMASTER)  
    + Added support for input file list of potential domains (-f/--filename)
@ -85,11 +90,11 @@ List DomainHunter options

 Use defaults to check for most recent 100 domains and check reputation
    
-    python ./domainhunter.py
+    python3 ./domainhunter.py

 Search for 1000 most recently expired/deleted domains, but don't check reputation

-    python ./domainhunter.py -r 1000
+    python3 ./domainhunter.py -r 1000

 Perform all reputation checks for a single domain

--- a/domainhunter.py
+++ b/domainhunter.py
@ -4,17 +4,15 @@
 ## Author:      @joevest and @andrewchiles
 ## Description: Checks expired domains, reputation/categorization, and Archive.org history to determine 
 ##              good candidates for phishing and C2 domain names
-# Add OCR support for BlueCoat/SiteReview CAPTCHA using tesseract
-# Add support for input file list of potential domains
-# Add additional error checking for ExpiredDomains.net parsing
-# Changed -q/--query switch to -k/--keyword to better match its purpose
+
 import time 
 import random
 import argparse
 import json
 import base64
+import os

-__version__ = "20180411"
+__version__ = "20180506"

 ## Functions

@ -34,7 +32,7 @@ def doSleep(timing):
 def checkBluecoat(domain):
    try:
        url = 'https://sitereview.bluecoat.com/resource/lookup'
-        postData = {'url':domain,'captcha':''}   # HTTP POST Parameters
+        postData = {'url':domain,'captcha':''}
        headers = {'User-Agent':useragent,
                    'Content-Type':'application/json; charset=UTF-8',
                    'Referer':'https://sitereview.bluecoat.com/lookup'}
@ -51,16 +49,14 @@ def checkBluecoat(domain):
        # Print notice if CAPTCHAs are blocking accurate results and attempt to solve if --ocr
        if a == 'captcha':
            if ocr:
-                # This request is performed in a browser, but is not needed for our purposes
+                # This request is also performed by a browser, but is not needed for our purposes
                #captcharequestURL = 'https://sitereview.bluecoat.com/resource/captcha-request'
-                #print('[*] Requesting CAPTCHA')
-                #response = s.get(url=captcharequestURL,headers=headers,cookies=cookies,verify=False)

                print('[*] Received CAPTCHA challenge!')
                captcha = solveCaptcha('https://sitereview.bluecoat.com/resource/captcha.jpg',s)
                
                if captcha:
-                    b64captcha = base64.b64encode(captcha.encode('utf-8')).decode('utf-8')
+                    b64captcha = base64.urlsafe_b64encode(captcha.encode('utf-8')).decode('utf-8')
                   
                     # Send CAPTCHA solution via GET since inclusion with the domain categorization request doesn't work anymore
                    captchasolutionURL = 'https://sitereview.bluecoat.com/resource/captcha-request/{0}'.format(b64captcha)
@ -124,7 +120,7 @@ def checkIBMXForce(domain):
        return "-"

 def checkTalos(domain):
-    url = "https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc".format(domain)
+    url = 'https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc'.format(domain)
    headers = {'User-Agent':useragent,
               'Referer':url}

@ -222,10 +218,7 @@ def checkDomain(domain):

    if domain in maldomainsList:
        print("[!] {}: Identified as known malware domain (malwaredomains.com)".format(domain))
-    
-    mxtoolbox = checkMXToolbox(domain)
-    print("[+] {}: {}".format(domain, mxtoolbox))
-    
+      
    bluecoat = checkBluecoat(domain)
    print("[+] {}: {}".format(domain, bluecoat))
    
@ -234,13 +227,21 @@ def checkDomain(domain):

    ciscotalos = checkTalos(domain)
    print("[+] {}: {}".format(domain, ciscotalos))
+
+    mxtoolbox = checkMXToolbox(domain)
+    print("[+] {}: {}".format(domain, mxtoolbox))
+
    print("")
-    return
+    
+    results = [domain,bluecoat,ibmxforce,ciscotalos,mxtoolbox]
+    return results

 def solveCaptcha(url,session):  
    # Downloads CAPTCHA image and saves to current directory for OCR with tesseract
     # Returns CAPTCHA string or False if an error occurred
+    
    jpeg = 'captcha.jpg'
+    
    try:
        response = session.get(url=url,headers=headers,verify=False, stream=True)
        if response.status_code == 200:
@ -251,13 +252,32 @@ def solveCaptcha(url,session):
            print('[-] Error downloading CAPTCHA file!')
            return False

+        # Perform basic OCR without additional image enhancement
        text = pytesseract.image_to_string(Image.open(jpeg))
        text = text.replace(" ", "")
+        
+        # Remove CAPTCHA file
+        try:
+            os.remove(jpeg)
+        except OSError:
+            pass
+
        return text
+
    except Exception as e:
        print("[-] Error solving CAPTCHA - {0}".format(e))
+        
        return False

+def drawTable(header,data):
+    """Render rows as a text table and return the drawn table as a string.
+
+    header -- list of column titles
+    data   -- list of rows, each a list of cell values; it is not modified
+
+    The table width is limited by the global maxwidth value.
+    """
+    t = Texttable(max_width=maxwidth)
+    t.header(header)
+    # header=False keeps Texttable from consuming the first data row as the
+    # header, so the header does not have to be inserted into the caller's list
+    t.add_rows(data, header=False)
+
+    return t.draw()
+
 ## MAIN
 if __name__ == "__main__":

@ -265,7 +285,7 @@ if __name__ == "__main__":
    parser.add_argument('-k','--keyword', help='Keyword used to refine search results', required=False, default=False, type=str, dest='keyword')
    parser.add_argument('-c','--check', help='Perform domain reputation checks', required=False, default=False, action='store_true', dest='check')
    parser.add_argument('-f','--filename', help='Specify input file of line delimited domain names to check', required=False, default=False, type=str, dest='filename')
-    parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when present', required=False, default=False, action='store_true')
+    parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when challenged', required=False, default=False, action='store_true')
    parser.add_argument('-r','--maxresults', help='Number of results to return when querying latest expired/deleted domains', required=False, default=100, type=int, dest='maxresults')
    parser.add_argument('-s','--single', help='Performs detailed reputation checks against a single domain name/IP.', required=False, default=False, dest='single')
    parser.add_argument('-t','--timing', help='Modifies request timing to avoid CAPTCHAs. Slowest(0) = 90-120 seconds, Default(3) = 10-20 seconds, Fastest(5) = no delay', required=False, default=3, type=int, choices=range(0,6), dest='timing')
@ -294,9 +314,9 @@ if __name__ == "__main__":
        except Exception as e:
            print("Expired Domains Reputation Check")
            print("[-] Missing OCR dependencies: {}".format(str(e)))
-            print("[*] Install required Python dependencies by running `pip3 install -r requirements.txt`")
-            print("[*] Ubuntu\Debian - Install tesseract by running `apt-get install tesseract-ocr python3-imaging`")
-            print("[*] MAC OSX - Install tesseract with homebrew by running `brew install tesseract`")
+            print("[*] Install required Python dependencies by running: pip3 install -r requirements.txt")
+            print("[*] Ubuntu\Debian - Install tesseract by running: apt-get install tesseract-ocr python3-imaging")
+            print("[*] macOS - Install tesseract with homebrew by running: brew install tesseract")
            quit(0)

 ## Variables
@ -314,14 +334,16 @@ if __name__ == "__main__":

    maxwidth = args.maxwidth
    
-    malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains'
-    expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search'
-    
    ocr = args.ocr
+    
+    malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains'
+
+    expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search'  

    timestamp = time.strftime("%Y%m%d_%H%M%S")
            
    useragent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+   
    headers = {'User-Agent':useragent}

    requests.packages.urllib3.disable_warnings()
@ -329,8 +351,6 @@ if __name__ == "__main__":
    # HTTP Session container, used to manage cookies, session tokens and other session information
    s = requests.Session()

-    data = []
-
    title = '''
 ____   ___  __  __    _    ___ _   _   _   _ _   _ _   _ _____ _____ ____  
 |  _ \ / _ \|  \/  |  / \  |_ _| \ | | | | | | | | | \ | |_   _| ____|  _ \ 
@ -357,22 +377,29 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
     # Retrieve reputation for a single chosen domain (Quick Mode)
    if single:
        checkDomain(single)
-        quit(0)
+        exit(0)

-    # Perform detailed domain reputation checks against input file
+    # Perform detailed domain reputation checks against input file, print table, and quit
    if filename:
+        # Collect one row of reputation check results per domain for the results table
+        data = []
        try:
            with open(filename, 'r') as domainsList:
                for line in domainsList.read().splitlines():
-                    checkDomain(line)
+                    data.append(checkDomain(line))
                    doSleep(timing)
+
+                # Print results table
+                header = ['Domain', 'BlueCoat', 'IBM X-Force', 'Cisco Talos', 'MXToolbox']
+                print(drawTable(header,data))
+
        except KeyboardInterrupt:
            print('Caught keyboard interrupt. Exiting!')
-            quit(0)
+            exit(0)
        except Exception as e:
-            print('[-] {}'.format(e))
-            quit(1)
-        quit(0)
+            print('[-] Error: {}'.format(e))
+            exit(1)
+        exit(0)
     
    # Generic Proxy support 
    # TODO: add as a parameter 
@ -389,8 +416,9 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'

    # Generate list of URLs to query for expired/deleted domains
    urls = []
+    domain_list = []

-    # Use the keyword string to narrow domain search if provided
+    # Use the keyword string to narrow domain search if provided. This generates a list of URLs to query
    if keyword:
        print('[*] Fetching expired or deleted domains containing "{}"'.format(keyword))
        for i in range (0,maxresults,25):
@ -404,14 +432,12 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
    # If no keyword provided, retrieve list of recently expired domains in batches of 25 results.
    else:
        print('[*] Fetching expired or deleted domains...')
-        # Caculate number of URLs to request since we're performing a request for four different resources instead of one
-        numresults = int(maxresults / 4)
+        # Calculate number of URLs to request since we're performing a request for two different resources instead of one
+        numresults = int(maxresults / 2)
        for i in range (0,(numresults),25):
            urls.append('https://www.expireddomains.net/backorder-expired-domains?start={}&o=changed&r=a'.format(i))
            urls.append('https://www.expireddomains.net/deleted-com-domains/?start={}&o=changed&r=a'.format(i))
-            urls.append('https://www.expireddomains.net/deleted-net-domains/?start={}&o=changed&r=a'.format(i))
-            urls.append('https://www.expireddomains.net/deleted-org-domains/?start={}&o=changed&r=a'.format(i))
-    
+ 
    for url in urls:

        print("[*]  {}".format(url))
@ -423,7 +449,6 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'

        r1 = random.randint(100000,999999)

-
        # Known good example _pk_id.10.dd0a cookie: 5abbbc772cbacfb1.1496760705.2.1496760705.1496760705
        pk_str = '5abbbc772cbacfb1' + '.1496' + str(r1) + '.2.1496' + str(r1) + '.1496' + str(r1)

@ -435,10 +460,10 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
        #domainrequest = s.get(url,headers=headers,verify=False,cookies=jar,proxies=proxies)

        domains = domainrequest.text
-
+   
        # Turn the HTML into a Beautiful Soup object
-        soup = BeautifulSoup(domains, 'lxml')
-        
+        soup = BeautifulSoup(domains, 'lxml')    
+        #print(soup)
        try:
            table = soup.find("table")
            for row in table.findAll('tr')[1:]:
@ -449,8 +474,6 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                cells = row.findAll("td")

                if len(cells) >= 1:
-                    output = ""
-
                    if keyword:

                        c0 = row.find('td').find('a').text   # domain
@ -466,10 +489,9 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                        c10 = cells[10].find(text=True) # status org
                        c11 = cells[11].find(text=True) # status de
                        c12 = cells[12].find(text=True) # tld registered
-                        c13 = cells[13].find(text=True) # Related Domains
-                        c14 = cells[14].find(text=True) # Domain list
-                        c15 = cells[15].find(text=True) # status
-                        c16 = cells[16].find(text=True) # related links
+                        c13 = cells[13].find(text=True) # Source List
+                        c14 = cells[14].find(text=True) # Domain Status
+                        c15 = ""                        # Related Domains

                    else:
                        c0 = cells[0].find(text=True)   # domain
@ -487,12 +509,6 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                        c12 = cells[12].find(text=True) # tld registered
                        c13 = cells[13].find(text=True) # changes
                        c14 = cells[14].find(text=True) # whois
-                        c15 = ""                        # not used
-                        c16 = ""                        # not used
-                        c17 = ""                        # not used
-
-                        # Expired Domains results have an additional 'Availability' column that breaks parsing "deleted" domains
-                        #c15 = cells[15].find(text=True) # related links

                    available = ''
                    if c8 == "available":
@ -507,42 +523,49 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                    if c11 == "available":
                        available += ".de "

+                    # Only grab status for keyword searches since it doesn't exist otherwise
                    status = ""
-                    if c15:
-                        status = c15
+                    if keyword:
+                        status = c14
+                    
+                    bluecoat = ''
+                    ibmxforce = ''
+                    ciscotalos = ''
+
+                    if check == True:
+                        # Only perform reputation checks if the domain is a .com, .net, or .org and is not in the maldomains list
+                        if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):

-                    # Skip additional reputation checks if this domain is already categorized as malicious 
-                    if c0 in maldomainsList:
-                        print("[-] Skipping {} - Identified as known malware domain").format(c0)
-                    else:
-                        bluecoat = ''
-                        ibmxforce = ''
-                        if c3 == '-':
-                            bluecoat = 'ignored'
-                            ibmxforce = 'ignored'
-                        elif check == True:
                            bluecoat = checkBluecoat(c0)
                            print("[+] {}: {}".format(c0, bluecoat))
                            ibmxforce = checkIBMXForce(c0)
                            print("[+] {}: {}".format(c0, ibmxforce))
+                            ciscotalos = checkTalos(c0)
+                            print("[+] {}: {}".format(c0, ciscotalos))
                            # Sleep to avoid captchas
                            doSleep(timing)
                        else:
-                            bluecoat = "skipped"
-                            ibmxforce = "skipped"
-                        # Append parsed domain data to list
-                        data.append([c0,c3,c4,available,status,bluecoat,ibmxforce])
+                            bluecoat = 'skipped'
+                            ibmxforce = 'skipped'
+                            ciscotalos = 'skipped'
+
+                    # Append parsed domain data to list
+                    domain_list.append([c0,c3,c4,available,status,bluecoat,ibmxforce,ciscotalos])               
+
        except Exception as e: 
-            #print(e)
+        #    print(e)
            pass

+        # Add additional sleep on requests to ExpiredDomains.net to avoid errors
+        time.sleep(5)
+
    # Check for valid results before continuing
-    if not(data):
-        print("[-] No results found for keyword: {0}".format(keyword))
-        quit(0)
+    if len(domain_list) == 0:
+        print("[-] No domain results found")
+        exit(0)

    # Sort domain list by column 2 (Birth Year)
-    sortedData = sorted(data, key=lambda x: x[1], reverse=True) 
+    sortedDomains = sorted(domain_list, key=lambda x: x[1], reverse=True) 

    # Build HTML Table
    html = ''
@ -556,9 +579,11 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                    <th>Entries</th>
                    <th>TLDs Available</th>
                    <th>Status</th>
-                    <th>Symantec</th>
+                    <th>BlueCoat</th>
                    <th>Categorization</th>
-                    <th>IBM-xForce</th>
+                    <th>IBM X-Force</th>
+                    <th>Categorization</th>
+                    <th>Cisco Talos</th>
                    <th>Categorization</th>
                    <th>WatchGuard</th>
                    <th>Namecheap</th>
@ -570,7 +595,7 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
    htmlFooter = '</body></html>'

    # Build HTML table contents
-    for i in sortedData:
+    for i in sortedDomains:
        htmlTableBody += '<tr>'
        htmlTableBody += '<td>{}</td>'.format(i[0]) # Domain
        htmlTableBody += '<td>{}</td>'.format(i[1]) # Birth
@ -578,10 +603,12 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
        htmlTableBody += '<td>{}</td>'.format(i[3]) # TLDs
        htmlTableBody += '<td>{}</td>'.format(i[4]) # Status

-        htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/sitereview#/?search={}" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
+        htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
        htmlTableBody += '<td>{}</td>'.format(i[5]) # Bluecoat Categorization
        htmlTableBody += '<td><a href="https://exchange.xforce.ibmcloud.com/url/{}" target="_blank">IBM-xForce</a></td>'.format(i[0]) # IBM xForce
        htmlTableBody += '<td>{}</td>'.format(i[6]) # IBM x-Force Categorization
+        htmlTableBody += '<td><a href="https://www.talosintelligence.com/reputation_center/lookup?search={0}" target="_blank">Cisco Talos</a></td>'.format(i[0]) # Cisco Talos
+        htmlTableBody += '<td>{}</td>'.format(i[7]) # Cisco Talos
        htmlTableBody += '<td><a href="http://www.borderware.com/domain_lookup.php?ip={}" target="_blank">WatchGuard</a></td>'.format(i[0]) # Borderware WatchGuard
        htmlTableBody += '<td><a href="https://www.namecheap.com/domains/registration/results.aspx?domain={}" target="_blank">Namecheap</a></td>'.format(i[0]) # Namecheap
        htmlTableBody += '<td><a href="http://web.archive.org/web/*/{}" target="_blank">Archive.org</a></td>'.format(i[0]) # Archive.org
@ -598,8 +625,5 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
    print("[*] Log written to {}\n".format(logfilename))
    
    # Print Text Table
-    t = Texttable(max_width=maxwidth)
-    t.add_rows(sortedData)
-    header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'Symantec', 'IBM']
-    t.header(header)
-    print(t.draw())
+    header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'BlueCoat', 'IBM', 'Cisco Talos']
+    print(drawTable(header,sortedDomains))