diff --git a/README.md b/README.md index fb3a6b0..95d2901 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ This Python based tool was written to quickly query the Expireddomains.net searc ## Changes +- 6 May 2018 + + Fixed expired domains parsing when performing a keyword search + + Minor HTML and text table output updates + + Filtered reputation checks to only execute for .COM, .ORG, and .NET domains and removed check for Archive.org records when performing a default or keyword search. Credit to @christruncer for the original PR and idea. + - 11 April 2018 + Added OCR support for CAPTCHA solving with tesseract. Thanks to t94j0 for the idea in [AIRMASTER](https://github.com/t94j0/AIRMASTER) + Added support for input file list of potential domains (-f/--filename) @@ -85,11 +90,11 @@ List DomainHunter options Use defaults to check for most recent 100 domains and check reputation - python ./domainhunter.py + python3 ./domainhunter.py Search for 1000 most recently expired/deleted domains, but don't check reputation - python ./domainhunter.py -r 1000 + python3 ./domainhunter.py -r 1000 Perform all reputation checks for a single domain diff --git a/domainhunter.py b/domainhunter.py index 0569a83..34de10e 100644 --- a/domainhunter.py +++ b/domainhunter.py @@ -4,17 +4,15 @@ ## Author: @joevest and @andrewchiles ## Description: Checks expired domains, reputation/categorization, and Archive.org history to determine ## good candidates for phishing and C2 domain names -# Add OCR support for BlueCoat/SiteReview CAPTCHA using tesseract -# Add support for input file list of potential domains -# Add additional error checking for ExpiredDomains.net parsing -# Changed -q/--query switch to -k/--keyword to better match its purpose + import time import random import argparse import json import base64 +import os -__version__ = "20180411" +__version__ = "20180506" ## Functions @@ -34,7 +32,7 @@ def doSleep(timing): def checkBluecoat(domain): try: url = 
'https://sitereview.bluecoat.com/resource/lookup' - postData = {'url':domain,'captcha':''} # HTTP POST Parameters + postData = {'url':domain,'captcha':''} headers = {'User-Agent':useragent, 'Content-Type':'application/json; charset=UTF-8', 'Referer':'https://sitereview.bluecoat.com/lookup'} @@ -51,16 +49,14 @@ def checkBluecoat(domain): # Print notice if CAPTCHAs are blocking accurate results and attempt to solve if --ocr if a == 'captcha': if ocr: - # This request is performed in a browser, but is not needed for our purposes + # This request is also performed by a browser, but is not needed for our purposes #captcharequestURL = 'https://sitereview.bluecoat.com/resource/captcha-request' - #print('[*] Requesting CAPTCHA') - #response = s.get(url=captcharequestURL,headers=headers,cookies=cookies,verify=False) print('[*] Received CAPTCHA challenge!') captcha = solveCaptcha('https://sitereview.bluecoat.com/resource/captcha.jpg',s) if captcha: - b64captcha = base64.b64encode(captcha.encode('utf-8')).decode('utf-8') + b64captcha = base64.urlsafe_b64encode(captcha.encode('utf-8')).decode('utf-8') # Send CAPTCHA solution via GET since inclusion with the domain categorization request doens't work anymore captchasolutionURL = 'https://sitereview.bluecoat.com/resource/captcha-request/{0}'.format(b64captcha) @@ -124,7 +120,7 @@ def checkIBMXForce(domain): return "-" def checkTalos(domain): - url = "https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc".format(domain) + url = 'https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc'.format(domain) headers = {'User-Agent':useragent, 'Referer':url} @@ -222,10 +218,7 @@ def checkDomain(domain): if domain in maldomainsList: print("[!] 
{}: Identified as known malware domain (malwaredomains.com)".format(domain)) - - mxtoolbox = checkMXToolbox(domain) - print("[+] {}: {}".format(domain, mxtoolbox)) - + bluecoat = checkBluecoat(domain) print("[+] {}: {}".format(domain, bluecoat)) @@ -234,13 +227,21 @@ def checkDomain(domain): ciscotalos = checkTalos(domain) print("[+] {}: {}".format(domain, ciscotalos)) + + mxtoolbox = checkMXToolbox(domain) + print("[+] {}: {}".format(domain, mxtoolbox)) + print("") - return + + results = [domain,bluecoat,ibmxforce,ciscotalos,mxtoolbox] + return results def solveCaptcha(url,session): # Downloads CAPTCHA image and saves to current directory for OCR with tesseract # Returns CAPTCHA string or False if error occured + jpeg = 'captcha.jpg' + try: response = session.get(url=url,headers=headers,verify=False, stream=True) if response.status_code == 200: @@ -251,13 +252,32 @@ def solveCaptcha(url,session): print('[-] Error downloading CAPTCHA file!') return False + # Perform basic OCR without additional image enhancement text = pytesseract.image_to_string(Image.open(jpeg)) text = text.replace(" ", "") + + # Remove CAPTCHA file + try: + os.remove(jpeg) + except OSError: + pass + return text + except Exception as e: print("[-] Error solving CAPTCHA - {0}".format(e)) + return False +def drawTable(header,data): + + data.insert(0,header) + t = Texttable(max_width=maxwidth) + t.add_rows(data) + t.header(header) + + return(t.draw()) + ## MAIN if __name__ == "__main__": @@ -265,7 +285,7 @@ if __name__ == "__main__": parser.add_argument('-k','--keyword', help='Keyword used to refine search results', required=False, default=False, type=str, dest='keyword') parser.add_argument('-c','--check', help='Perform domain reputation checks', required=False, default=False, action='store_true', dest='check') parser.add_argument('-f','--filename', help='Specify input file of line delimited domain names to check', required=False, default=False, type=str, dest='filename') - 
parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when present', required=False, default=False, action='store_true') + parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when challenged', required=False, default=False, action='store_true') parser.add_argument('-r','--maxresults', help='Number of results to return when querying latest expired/deleted domains', required=False, default=100, type=int, dest='maxresults') parser.add_argument('-s','--single', help='Performs detailed reputation checks against a single domain name/IP.', required=False, default=False, dest='single') parser.add_argument('-t','--timing', help='Modifies request timing to avoid CAPTCHAs. Slowest(0) = 90-120 seconds, Default(3) = 10-20 seconds, Fastest(5) = no delay', required=False, default=3, type=int, choices=range(0,6), dest='timing') @@ -294,9 +314,9 @@ if __name__ == "__main__": except Exception as e: print("Expired Domains Reputation Check") print("[-] Missing OCR dependencies: {}".format(str(e))) - print("[*] Install required Python dependencies by running `pip3 install -r requirements.txt`") - print("[*] Ubuntu\Debian - Install tesseract by running `apt-get install tesseract-ocr python3-imaging`") - print("[*] MAC OSX - Install tesseract with homebrew by running `brew install tesseract`") + print("[*] Install required Python dependencies by running: pip3 install -r requirements.txt") + print("[*] Ubuntu\Debian - Install tesseract by running: apt-get install tesseract-ocr python3-imaging") + print("[*] macOS - Install tesseract with homebrew by running: brew install tesseract") quit(0) ## Variables @@ -314,14 +334,16 @@ if __name__ == "__main__": maxwidth = args.maxwidth - malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains' - expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search' - ocr = args.ocr + + malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains' + + expireddomainsqueryURL = 
'https://www.expireddomains.net/domain-name-search' timestamp = time.strftime("%Y%m%d_%H%M%S") useragent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)' + headers = {'User-Agent':useragent} requests.packages.urllib3.disable_warnings() @@ -329,8 +351,6 @@ if __name__ == "__main__": # HTTP Session container, used to manage cookies, session tokens and other session information s = requests.Session() - data = [] - title = ''' ____ ___ __ __ _ ___ _ _ _ _ _ _ _ _ _____ _____ ____ | _ \ / _ \| \/ | / \ |_ _| \ | | | | | | | | | \ | |_ _| ____| _ \ @@ -357,22 +377,29 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' # Retrieve reputation for a single choosen domain (Quick Mode) if single: checkDomain(single) - quit(0) + exit(0) - # Perform detailed domain reputation checks against input file + # Perform detailed domain reputation checks against input file, print table, and quit if filename: + # Initialize our list with an empty row for the header + data = [] try: with open(filename, 'r') as domainsList: for line in domainsList.read().splitlines(): - checkDomain(line) + data.append(checkDomain(line)) doSleep(timing) + + # Print results table + header = ['Domain', 'BlueCoat', 'IBM X-Force', 'Cisco Talos', 'MXToolbox'] + print(drawTable(header,data)) + except KeyboardInterrupt: print('Caught keyboard interrupt. Exiting!') - quit(0) + exit(0) except Exception as e: - print('[-] {}'.format(e)) - quit(1) - quit(0) + print('[-] Error: {}'.format(e)) + exit(1) + exit(0) # Generic Proxy support # TODO: add as a parameter @@ -389,8 +416,9 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' # Generate list of URLs to query for expired/deleted domains urls = [] + domain_list = [] - # Use the keyword string to narrow domain search if provided + # Use the keyword string to narrow domain search if provided. 
This generates a list of URLs to query if keyword: print('[*] Fetching expired or deleted domains containing "{}"'.format(keyword)) for i in range (0,maxresults,25): @@ -404,14 +432,12 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' # If no keyword provided, retrieve list of recently expired domains in batches of 25 results. else: print('[*] Fetching expired or deleted domains...') - # Caculate number of URLs to request since we're performing a request for four different resources instead of one - numresults = int(maxresults / 4) + # Calculate number of URLs to request since we're performing a request for two different resources instead of one + numresults = int(maxresults / 2) for i in range (0,(numresults),25): urls.append('https://www.expireddomains.net/backorder-expired-domains?start={}&o=changed&r=a'.format(i)) urls.append('https://www.expireddomains.net/deleted-com-domains/?start={}&o=changed&r=a'.format(i)) - urls.append('https://www.expireddomains.net/deleted-net-domains/?start={}&o=changed&r=a'.format(i)) - urls.append('https://www.expireddomains.net/deleted-org-domains/?start={}&o=changed&r=a'.format(i)) - + for url in urls: print("[*] {}".format(url)) @@ -423,7 +449,6 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' r1 = random.randint(100000,999999) - # Known good example _pk_id.10.dd0a cookie: 5abbbc772cbacfb1.1496760705.2.1496760705.1496760705 pk_str = '5abbbc772cbacfb1' + '.1496' + str(r1) + '.2.1496' + str(r1) + '.1496' + str(r1) @@ -435,10 +460,10 @@ If you plan to use this content for illegal purpose, don't.
Have a nice day :)' #domainrequest = s.get(url,headers=headers,verify=False,cookies=jar,proxies=proxies) domains = domainrequest.text - + # Turn the HTML into a Beautiful Soup object - soup = BeautifulSoup(domains, 'lxml') - + soup = BeautifulSoup(domains, 'lxml') + #print(soup) try: table = soup.find("table") for row in table.findAll('tr')[1:]: @@ -449,8 +474,6 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' cells = row.findAll("td") if len(cells) >= 1: - output = "" - if keyword: c0 = row.find('td').find('a').text # domain @@ -466,10 +489,9 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' c10 = cells[10].find(text=True) # status org c11 = cells[11].find(text=True) # status de c12 = cells[12].find(text=True) # tld registered - c13 = cells[13].find(text=True) # Related Domains - c14 = cells[14].find(text=True) # Domain list - c15 = cells[15].find(text=True) # status - c16 = cells[16].find(text=True) # related links + c13 = cells[13].find(text=True) # Source List + c14 = cells[14].find(text=True) # Domain Status + c15 = "" # Related Domains else: c0 = cells[0].find(text=True) # domain @@ -487,12 +509,6 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' c12 = cells[12].find(text=True) # tld registered c13 = cells[13].find(text=True) # changes c14 = cells[14].find(text=True) # whois - c15 = "" # not used - c16 = "" # not used - c17 = "" # not used - - # Expired Domains results have an additional 'Availability' column that breaks parsing "deleted" domains - #c15 = cells[15].find(text=True) # related links available = '' if c8 == "available": @@ -507,42 +523,49 @@ If you plan to use this content for illegal purpose, don't. 
Have a nice day :)' if c11 == "available": available += ".de " + # Only grab status for keyword searches since it doesn't exist otherwise status = "" - if c15: - status = c15 + if keyword: + status = c14 + + bluecoat = '' + ibmxforce = '' + ciscotalos = '' + + if check == True: + # Only perform reputation checks if domain is a .com .net. .org and not in maldomains list + if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): - # Skip additional reputation checks if this domain is already categorized as malicious - if c0 in maldomainsList: - print("[-] Skipping {} - Identified as known malware domain").format(c0) - else: - bluecoat = '' - ibmxforce = '' - if c3 == '-': - bluecoat = 'ignored' - ibmxforce = 'ignored' - elif check == True: bluecoat = checkBluecoat(c0) print("[+] {}: {}".format(c0, bluecoat)) ibmxforce = checkIBMXForce(c0) print("[+] {}: {}".format(c0, ibmxforce)) + ciscotalos = checkTalos(c0) + print("[+] {}: {}".format(c0, ciscotalos)) # Sleep to avoid captchas doSleep(timing) else: - bluecoat = "skipped" - ibmxforce = "skipped" - # Append parsed domain data to list - data.append([c0,c3,c4,available,status,bluecoat,ibmxforce]) + bluecoat = 'skipped' + ibmxforce = 'skipped' + ciscotalos = 'skipped' + + # Append parsed domain data to list + domain_list.append([c0,c3,c4,available,status,bluecoat,ibmxforce,ciscotalos]) + except Exception as e: - #print(e) + # print(e) pass + # Add additional sleep on requests to ExpiredDomains.net to avoid errors + time.sleep(5) + # Check for valid results before continuing - if not(data): - print("[-] No results found for keyword: {0}".format(keyword)) - quit(0) + if len(domain_list) == 0: + print("[-] No domain results found") + exit(0) # Sort domain list by column 2 (Birth Year) - sortedData = sorted(data, key=lambda x: x[1], reverse=True) + sortedDomains = sorted(domain_list, key=lambda x: x[1], reverse=True) # Build HTML Table html = '' @@ -556,9 
+579,11 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' Entries TLDs Available Status - Symantec + BlueCoat Categorization - IBM-xForce + IBM X-Force + Categorization + Cisco Talos Categorization WatchGuard Namecheap @@ -570,7 +595,7 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' htmlFooter = '' # Build HTML table contents - for i in sortedData: + for i in sortedDomains: htmlTableBody += '' htmlTableBody += '{}'.format(i[0]) # Domain htmlTableBody += '{}'.format(i[1]) # Birth @@ -578,10 +603,12 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' htmlTableBody += '{}'.format(i[3]) # TLDs htmlTableBody += '{}'.format(i[4]) # Status - htmlTableBody += 'Bluecoat'.format(i[0]) # Bluecoat + htmlTableBody += 'Bluecoat'.format(i[0]) # Bluecoat htmlTableBody += '{}'.format(i[5]) # Bluecoat Categorization htmlTableBody += 'IBM-xForce'.format(i[0]) # IBM xForce htmlTableBody += '{}'.format(i[6]) # IBM x-Force Categorization + htmlTableBody += 'Cisco Talos'.format(i[0]) # Cisco Talos + htmlTableBody += '{}'.format(i[7]) # Cisco Talos htmlTableBody += 'WatchGuard'.format(i[0]) # Borderware WatchGuard htmlTableBody += 'Namecheap'.format(i[0]) # Namecheap htmlTableBody += 'Archive.org'.format(i[0]) # Archive.org @@ -598,8 +625,5 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' print("[*] Log written to {}\n".format(logfilename)) # Print Text Table - t = Texttable(max_width=maxwidth) - t.add_rows(sortedData) - header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'Symantec', 'IBM'] - t.header(header) - print(t.draw()) + header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'BlueCoat', 'IBM', 'Cisco Talos'] + print(drawTable(header,sortedDomains))