Fix parsing for keyword search feature. Update HTML and text table output. Add reputation check filtering based on PR from @christruncer

master
Andrew Chiles 2018-05-06 20:34:55 +02:00
parent 26dd64870d
commit e3985c2c37
2 changed files with 115 additions and 86 deletions

View File

@ -8,6 +8,11 @@ This Python based tool was written to quickly query the Expireddomains.net searc
## Changes
- 6 May 2018
+ Fixed expired domains parsing when performing a keyword search
+ Minor HTML and text table output updates
+ Filtered reputation checks to only execute for .COM, .ORG, and .NET domains and removed check for Archive.org records when performing a default or keyword search. Credit to @christruncer for the original PR and idea.
- 11 April 2018
+ Added OCR support for CAPTCHA solving with tesseract. Thanks to t94j0 for the idea in [AIRMASTER](https://github.com/t94j0/AIRMASTER)
+ Added support for input file list of potential domains (-f/--filename)
@ -85,11 +90,11 @@ List DomainHunter options
Use defaults to check for most recent 100 domains and check reputation
python ./domainhunter.py
python3 ./domainhunter.py
Search for 1000 most recently expired/deleted domains, but don't check reputation
python ./domainhunter.py -r 1000
python3 ./domainhunter.py -r 1000
Perform all reputation checks for a single domain

View File

@ -4,17 +4,15 @@
## Author: @joevest and @andrewchiles
## Description: Checks expired domains, reputation/categorization, and Archive.org history to determine
## good candidates for phishing and C2 domain names
# Add OCR support for BlueCoat/SiteReview CAPTCHA using tesseract
# Add support for input file list of potential domains
# Add additional error checking for ExpiredDomains.net parsing
# Changed -q/--query switch to -k/--keyword to better match its purpose
import time
import random
import argparse
import json
import base64
import os
__version__ = "20180411"
__version__ = "20180506"
## Functions
@ -34,7 +32,7 @@ def doSleep(timing):
def checkBluecoat(domain):
try:
url = 'https://sitereview.bluecoat.com/resource/lookup'
postData = {'url':domain,'captcha':''} # HTTP POST Parameters
postData = {'url':domain,'captcha':''}
headers = {'User-Agent':useragent,
'Content-Type':'application/json; charset=UTF-8',
'Referer':'https://sitereview.bluecoat.com/lookup'}
@ -51,16 +49,14 @@ def checkBluecoat(domain):
# Print notice if CAPTCHAs are blocking accurate results and attempt to solve if --ocr
if a == 'captcha':
if ocr:
# This request is performed in a browser, but is not needed for our purposes
# This request is also performed by a browser, but is not needed for our purposes
#captcharequestURL = 'https://sitereview.bluecoat.com/resource/captcha-request'
#print('[*] Requesting CAPTCHA')
#response = s.get(url=captcharequestURL,headers=headers,cookies=cookies,verify=False)
print('[*] Received CAPTCHA challenge!')
captcha = solveCaptcha('https://sitereview.bluecoat.com/resource/captcha.jpg',s)
if captcha:
b64captcha = base64.b64encode(captcha.encode('utf-8')).decode('utf-8')
b64captcha = base64.urlsafe_b64encode(captcha.encode('utf-8')).decode('utf-8')
# Send CAPTCHA solution via GET since inclusion with the domain categorization request doesn't work anymore
captchasolutionURL = 'https://sitereview.bluecoat.com/resource/captcha-request/{0}'.format(b64captcha)
@ -124,7 +120,7 @@ def checkIBMXForce(domain):
return "-"
def checkTalos(domain):
url = "https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc".format(domain)
url = 'https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc'.format(domain)
headers = {'User-Agent':useragent,
'Referer':url}
@ -222,10 +218,7 @@ def checkDomain(domain):
if domain in maldomainsList:
print("[!] {}: Identified as known malware domain (malwaredomains.com)".format(domain))
mxtoolbox = checkMXToolbox(domain)
print("[+] {}: {}".format(domain, mxtoolbox))
bluecoat = checkBluecoat(domain)
print("[+] {}: {}".format(domain, bluecoat))
@ -234,13 +227,21 @@ def checkDomain(domain):
ciscotalos = checkTalos(domain)
print("[+] {}: {}".format(domain, ciscotalos))
mxtoolbox = checkMXToolbox(domain)
print("[+] {}: {}".format(domain, mxtoolbox))
print("")
return
results = [domain,bluecoat,ibmxforce,ciscotalos,mxtoolbox]
return results
def solveCaptcha(url,session):
# Downloads CAPTCHA image and saves to current directory for OCR with tesseract
# Returns CAPTCHA string or False if error occurred
jpeg = 'captcha.jpg'
try:
response = session.get(url=url,headers=headers,verify=False, stream=True)
if response.status_code == 200:
@ -251,13 +252,32 @@ def solveCaptcha(url,session):
print('[-] Error downloading CAPTCHA file!')
return False
# Perform basic OCR without additional image enhancement
text = pytesseract.image_to_string(Image.open(jpeg))
text = text.replace(" ", "")
# Remove CAPTCHA file
try:
os.remove(jpeg)
except OSError:
pass
return text
except Exception as e:
print("[-] Error solving CAPTCHA - {0}".format(e))
return False
def drawTable(header,data):
    """Render rows of results as a text table and return it as a string.

    header -- list of column titles, one per column.
    data   -- list of rows; each row is a list with the same number of
              entries as ``header``. The list is NOT modified.

    The table width is limited by the module-level ``maxwidth`` setting.
    """
    t = Texttable(max_width=maxwidth)
    # Set the header explicitly instead of inserting it into the caller's
    # list; the previous insert(0, header) left a header row behind in the
    # caller's data after the table was drawn.
    t.header(header)
    # header=False: every entry in data is a body row, none is a title row.
    t.add_rows(data, header=False)
    return t.draw()
## MAIN
if __name__ == "__main__":
@ -265,7 +285,7 @@ if __name__ == "__main__":
parser.add_argument('-k','--keyword', help='Keyword used to refine search results', required=False, default=False, type=str, dest='keyword')
parser.add_argument('-c','--check', help='Perform domain reputation checks', required=False, default=False, action='store_true', dest='check')
parser.add_argument('-f','--filename', help='Specify input file of line delimited domain names to check', required=False, default=False, type=str, dest='filename')
parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when present', required=False, default=False, action='store_true')
parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when challenged', required=False, default=False, action='store_true')
parser.add_argument('-r','--maxresults', help='Number of results to return when querying latest expired/deleted domains', required=False, default=100, type=int, dest='maxresults')
parser.add_argument('-s','--single', help='Performs detailed reputation checks against a single domain name/IP.', required=False, default=False, dest='single')
parser.add_argument('-t','--timing', help='Modifies request timing to avoid CAPTCHAs. Slowest(0) = 90-120 seconds, Default(3) = 10-20 seconds, Fastest(5) = no delay', required=False, default=3, type=int, choices=range(0,6), dest='timing')
@ -294,9 +314,9 @@ if __name__ == "__main__":
except Exception as e:
print("Expired Domains Reputation Check")
print("[-] Missing OCR dependencies: {}".format(str(e)))
print("[*] Install required Python dependencies by running `pip3 install -r requirements.txt`")
print("[*] Ubuntu\Debian - Install tesseract by running `apt-get install tesseract-ocr python3-imaging`")
print("[*] MAC OSX - Install tesseract with homebrew by running `brew install tesseract`")
print("[*] Install required Python dependencies by running: pip3 install -r requirements.txt")
print("[*] Ubuntu\Debian - Install tesseract by running: apt-get install tesseract-ocr python3-imaging")
print("[*] macOS - Install tesseract with homebrew by running: brew install tesseract")
quit(0)
## Variables
@ -314,14 +334,16 @@ if __name__ == "__main__":
maxwidth = args.maxwidth
malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains'
expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search'
ocr = args.ocr
malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains'
expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search'
timestamp = time.strftime("%Y%m%d_%H%M%S")
useragent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
headers = {'User-Agent':useragent}
requests.packages.urllib3.disable_warnings()
@ -329,8 +351,6 @@ if __name__ == "__main__":
# HTTP Session container, used to manage cookies, session tokens and other session information
s = requests.Session()
data = []
title = '''
____ ___ __ __ _ ___ _ _ _ _ _ _ _ _ _____ _____ ____
| _ \ / _ \| \/ | / \ |_ _| \ | | | | | | | | | \ | |_ _| ____| _ \
@ -357,22 +377,29 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
# Retrieve reputation for a single chosen domain (Quick Mode)
if single:
checkDomain(single)
quit(0)
exit(0)
# Perform detailed domain reputation checks against input file
# Perform detailed domain reputation checks against input file, print table, and quit
if filename:
# Initialize our list with an empty row for the header
data = []
try:
with open(filename, 'r') as domainsList:
for line in domainsList.read().splitlines():
checkDomain(line)
data.append(checkDomain(line))
doSleep(timing)
# Print results table
header = ['Domain', 'BlueCoat', 'IBM X-Force', 'Cisco Talos', 'MXToolbox']
print(drawTable(header,data))
except KeyboardInterrupt:
print('Caught keyboard interrupt. Exiting!')
quit(0)
exit(0)
except Exception as e:
print('[-] {}'.format(e))
quit(1)
quit(0)
print('[-] Error: {}'.format(e))
exit(1)
exit(0)
# Generic Proxy support
# TODO: add as a parameter
@ -389,8 +416,9 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
# Generate list of URLs to query for expired/deleted domains
urls = []
domain_list = []
# Use the keyword string to narrow domain search if provided
# Use the keyword string to narrow domain search if provided. This generates a list of URLs to query
if keyword:
print('[*] Fetching expired or deleted domains containing "{}"'.format(keyword))
for i in range (0,maxresults,25):
@ -404,14 +432,12 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
# If no keyword provided, retrieve list of recently expired domains in batches of 25 results.
else:
print('[*] Fetching expired or deleted domains...')
# Calculate number of URLs to request since we're performing a request for four different resources instead of one
numresults = int(maxresults / 4)
# Calculate number of URLs to request since we're performing a request for two different resources instead of one
numresults = int(maxresults / 2)
for i in range (0,(numresults),25):
urls.append('https://www.expireddomains.net/backorder-expired-domains?start={}&o=changed&r=a'.format(i))
urls.append('https://www.expireddomains.net/deleted-com-domains/?start={}&o=changed&r=a'.format(i))
urls.append('https://www.expireddomains.net/deleted-net-domains/?start={}&o=changed&r=a'.format(i))
urls.append('https://www.expireddomains.net/deleted-org-domains/?start={}&o=changed&r=a'.format(i))
for url in urls:
print("[*] {}".format(url))
@ -423,7 +449,6 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
r1 = random.randint(100000,999999)
# Known good example _pk_id.10.dd0a cookie: 5abbbc772cbacfb1.1496760705.2.1496760705.1496760705
pk_str = '5abbbc772cbacfb1' + '.1496' + str(r1) + '.2.1496' + str(r1) + '.1496' + str(r1)
@ -435,10 +460,10 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
#domainrequest = s.get(url,headers=headers,verify=False,cookies=jar,proxies=proxies)
domains = domainrequest.text
# Turn the HTML into a Beautiful Soup object
soup = BeautifulSoup(domains, 'lxml')
soup = BeautifulSoup(domains, 'lxml')
#print(soup)
try:
table = soup.find("table")
for row in table.findAll('tr')[1:]:
@ -449,8 +474,6 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
cells = row.findAll("td")
if len(cells) >= 1:
output = ""
if keyword:
c0 = row.find('td').find('a').text # domain
@ -466,10 +489,9 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
c10 = cells[10].find(text=True) # status org
c11 = cells[11].find(text=True) # status de
c12 = cells[12].find(text=True) # tld registered
c13 = cells[13].find(text=True) # Related Domains
c14 = cells[14].find(text=True) # Domain list
c15 = cells[15].find(text=True) # status
c16 = cells[16].find(text=True) # related links
c13 = cells[13].find(text=True) # Source List
c14 = cells[14].find(text=True) # Domain Status
c15 = "" # Related Domains
else:
c0 = cells[0].find(text=True) # domain
@ -487,12 +509,6 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
c12 = cells[12].find(text=True) # tld registered
c13 = cells[13].find(text=True) # changes
c14 = cells[14].find(text=True) # whois
c15 = "" # not used
c16 = "" # not used
c17 = "" # not used
# Expired Domains results have an additional 'Availability' column that breaks parsing "deleted" domains
#c15 = cells[15].find(text=True) # related links
available = ''
if c8 == "available":
@ -507,42 +523,49 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
if c11 == "available":
available += ".de "
# Only grab status for keyword searches since it doesn't exist otherwise
status = ""
if c15:
status = c15
if keyword:
status = c14
bluecoat = ''
ibmxforce = ''
ciscotalos = ''
if check == True:
# Only perform reputation checks if domain is a .com .net. .org and not in maldomains list
if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
# Skip additional reputation checks if this domain is already categorized as malicious
if c0 in maldomainsList:
print("[-] Skipping {} - Identified as known malware domain").format(c0)
else:
bluecoat = ''
ibmxforce = ''
if c3 == '-':
bluecoat = 'ignored'
ibmxforce = 'ignored'
elif check == True:
bluecoat = checkBluecoat(c0)
print("[+] {}: {}".format(c0, bluecoat))
ibmxforce = checkIBMXForce(c0)
print("[+] {}: {}".format(c0, ibmxforce))
ciscotalos = checkTalos(c0)
print("[+] {}: {}".format(c0, ciscotalos))
# Sleep to avoid captchas
doSleep(timing)
else:
bluecoat = "skipped"
ibmxforce = "skipped"
# Append parsed domain data to list
data.append([c0,c3,c4,available,status,bluecoat,ibmxforce])
bluecoat = 'skipped'
ibmxforce = 'skipped'
ciscotalos = 'skipped'
# Append parsed domain data to list
domain_list.append([c0,c3,c4,available,status,bluecoat,ibmxforce,ciscotalos])
except Exception as e:
#print(e)
# print(e)
pass
# Add additional sleep on requests to ExpiredDomains.net to avoid errors
time.sleep(5)
# Check for valid results before continuing
if not(data):
print("[-] No results found for keyword: {0}".format(keyword))
quit(0)
if len(domain_list) == 0:
print("[-] No domain results found")
exit(0)
# Sort domain list by column 2 (Birth Year)
sortedData = sorted(data, key=lambda x: x[1], reverse=True)
sortedDomains = sorted(domain_list, key=lambda x: x[1], reverse=True)
# Build HTML Table
html = ''
@ -556,9 +579,11 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
<th>Entries</th>
<th>TLDs Available</th>
<th>Status</th>
<th>Symantec</th>
<th>BlueCoat</th>
<th>Categorization</th>
<th>IBM-xForce</th>
<th>IBM X-Force</th>
<th>Categorization</th>
<th>Cisco Talos</th>
<th>Categorization</th>
<th>WatchGuard</th>
<th>Namecheap</th>
@ -570,7 +595,7 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
htmlFooter = '</body></html>'
# Build HTML table contents
for i in sortedData:
for i in sortedDomains:
htmlTableBody += '<tr>'
htmlTableBody += '<td>{}</td>'.format(i[0]) # Domain
htmlTableBody += '<td>{}</td>'.format(i[1]) # Birth
@ -578,10 +603,12 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
htmlTableBody += '<td>{}</td>'.format(i[3]) # TLDs
htmlTableBody += '<td>{}</td>'.format(i[4]) # Status
htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/sitereview#/?search={}" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
htmlTableBody += '<td>{}</td>'.format(i[5]) # Bluecoat Categorization
htmlTableBody += '<td><a href="https://exchange.xforce.ibmcloud.com/url/{}" target="_blank">IBM-xForce</a></td>'.format(i[0]) # IBM xForce
htmlTableBody += '<td>{}</td>'.format(i[6]) # IBM x-Force Categorization
htmlTableBody += '<td><a href="https://www.talosintelligence.com/reputation_center/lookup?search={0}" target="_blank">Cisco Talos</a></td>'.format(i[0]) # Cisco Talos
htmlTableBody += '<td>{}</td>'.format(i[7]) # Cisco Talos
htmlTableBody += '<td><a href="http://www.borderware.com/domain_lookup.php?ip={}" target="_blank">WatchGuard</a></td>'.format(i[0]) # Borderware WatchGuard
htmlTableBody += '<td><a href="https://www.namecheap.com/domains/registration/results.aspx?domain={}" target="_blank">Namecheap</a></td>'.format(i[0]) # Namecheap
htmlTableBody += '<td><a href="http://web.archive.org/web/*/{}" target="_blank">Archive.org</a></td>'.format(i[0]) # Archive.org
@ -598,8 +625,5 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
print("[*] Log written to {}\n".format(logfilename))
# Print Text Table
t = Texttable(max_width=maxwidth)
t.add_rows(sortedData)
header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'Symantec', 'IBM']
t.header(header)
print(t.draw())
header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'BlueCoat', 'IBM', 'Cisco Talos']
print(drawTable(header,sortedDomains))