From 4be8c803eb0749768cba9e09aaa2dcd0f69a17ff Mon Sep 17 00:00:00 2001 From: Joe Vest Date: Thu, 4 Oct 2018 10:58:35 -0500 Subject: [PATCH] tweak logic and fix table column indexes --- .gitignore | 6 ++ README.md | 45 +++++++++------ domainhunter.py | 146 ++++++++++++++++++++++++++++++++++-------------- 3 files changed, 138 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index 623c252..d3d9885 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ *.html *.txt *.jpg + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json \ No newline at end of file diff --git a/README.md b/README.md index 5b5d17a..9897628 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,12 @@ This Python based tool was written to quickly query the Expireddomains.net searc ## Changes +- 4 October 2018 + + Tweaked parsing logic + + Fixed parsed column indexes + + Added additional TLDs to a found domain if the TLD is marked available. + + If thisistest.com is found and thisistest.org is marked available, thisistest.org will be added to the search list + - 17 September 2018 + Fixed Symantec WebPulse Site Review parsing errors caused by service updates @@ -67,37 +73,42 @@ Optional - Install additional OCR support dependencies ## Usage -List DomainHunter options - - python3 domainhunter.py -h - usage: domainhunter.py [-h] [-q QUERY] [-c] [-r MAXRESULTS] [-s SINGLE] - [-w MAXWIDTH] [-v] + usage: domainhunter.py [-h] [-a] [-k KEYWORD] [-c] [-f FILENAME] [--ocr] + [-r MAXRESULTS] [-s SINGLE] [-t {0,1,2,3,4,5}] + [-w MAXWIDTH] [-V] - Finds expired domains, domain categorization, and Archive.org history to - determine good candidates for C2 and phishing domains + Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains optional arguments: - -h, --help show this help message and exit - -k KEYWORD, --keyword KEYWORD + -h, --help show this help message and exit + -a, 
--alexa Filter results to Alexa listings + -k KEYWORD, --keyword KEYWORD Keyword used to refine search results - -c, --check Perform domain reputation checks - -f FILENAME, --filename FILENAME + -c, --check Perform domain reputation checks + -f FILENAME, --filename FILENAME Specify input file of line delimited domain names to check - --ocr Perform OCR on CAPTCHAs when present - -r MAXRESULTS, --maxresults MAXRESULTS + --ocr Perform OCR on CAPTCHAs when challenged + -r MAXRESULTS, --maxresults MAXRESULTS Number of results to return when querying latest expired/deleted domains - -s SINGLE, --single SINGLE + -s SINGLE, --single SINGLE Performs detailed reputation checks against a single domain name/IP. - -t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5} + -t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5} Modifies request timing to avoid CAPTCHAs. Slowest(0) = 90-120 seconds, Default(3) = 10-20 seconds, Fastest(5) = no delay - -w MAXWIDTH, --maxwidth MAXWIDTH + -w MAXWIDTH, --maxwidth MAXWIDTH Width of text table - -V, --version show program's version number and exit + -V, --version show program's version number and exit + + Examples: + ./domainhunter.py -k apples -c --ocr -t5 + ./domainhunter.py --check --ocr -t3 + ./domainhunter.py --single mydomain.com + ./domainhunter.py --keyword tech --check --ocr --timing 5 --alexa + ./domainhunter.py --filename inputlist.txt --ocr --timing 5 Use defaults to check for most recent 100 domains and check reputation diff --git a/domainhunter.py b/domainhunter.py index 83c314b..d949c6e 100644 --- a/domainhunter.py +++ b/domainhunter.py @@ -293,7 +293,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( description='Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains', - epilog = '''Examples: + epilog = ''' +Examples: ./domainhunter.py -k apples -c --ocr -t5 ./domainhunter.py --check --ocr -t3 ./domainhunter.py --single mydomain.com @@ -313,8 +314,6 @@ if __name__ == 
"__main__": parser.add_argument('-V','--version', action='version',version='%(prog)s {version}'.format(version=__version__)) args = parser.parse_args() - - # Load dependent modules try: import requests @@ -495,6 +494,8 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' #print(soup) try: table = soup.find("table") + + rows = table.findAll('tr')[1:] for row in table.findAll('tr')[1:]: # Alternative way to extract domain name @@ -510,60 +511,118 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' c2 = cells[2].find(text=True) # domainpop c3 = cells[3].find(text=True) # birth c4 = cells[4].find(text=True) # Archive.org entries - c5 = cells[5].find(text=True) # similarweb - c6 = cells[6].find(text=True) # similarweb country code - c7 = cells[7].find(text=True) # Dmoz.org - c8 = cells[8].find(text=True) # status com - c9 = cells[9].find(text=True) # status net - c10 = cells[10].find(text=True) # status org - c11 = cells[11].find(text=True) # status de - c12 = cells[12].find(text=True) # tld registered - c13 = cells[13].find(text=True) # Source List - c14 = cells[14].find(text=True) # Domain Status - c15 = "" # Related Domains + c5 = cells[5].find(text=True) # Alexa + c6 = cells[6].find(text=True) # Dmoz.org + c7 = cells[7].find(text=True) # status com + c8 = cells[8].find(text=True) # status net + c9 = cells[9].find(text=True) # status org + c10 = cells[10].find(text=True) # status de + c11 = cells[11].find(text=True) # TLDs + c12 = cells[12].find(text=True) # RDT + c13 = cells[13].find(text=True) # List + c14 = cells[14].find(text=True) # Status + c15 = "" # Links + + # create available TLD list + available = '' + if c7 == "available": + available += ".com " + + if c8 == "available": + available += ".net " + + if c9 == "available": + available += ".org " + + if c10 == "available": + available += ".de " + + # Only grab status for keyword searches since it doesn't exist otherwise + status = "" + if keyword: + 
status = c14 + + # Only add Expired, not Pending, Backorder, etc + if c13 == "Expired": + # Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) + #if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): + # domain_list.append([c0,c3,c4,available,status]) + + # Add other TLDs to list if marked available + if (c7 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".com" + domain_list.append([dom,c3,c4,available,status]) + + if (c8 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".net" + domain_list.append([dom,c3,c4,available,status]) + + if (c9 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".org" + domain_list.append([dom,c3,c4,available,status]) + + if (c10 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".de" + domain_list.append([dom,c3,c4,available,status]) # Non-keyword search table format is slightly different else: + c0 = cells[0].find(text=True) # domain c1 = cells[1].find(text=True) # bl c2 = cells[2].find(text=True) # domainpop c3 = cells[3].find(text=True) # birth c4 = cells[4].find(text=True) # Archive.org entries - c5 = cells[5].find(text=True) # similarweb - c6 = cells[6].find(text=True) # similarweb country code - c7 = cells[7].find(text=True) # Dmoz.org - c8 = cells[8].find(text=True) # status com - c9 = cells[9].find(text=True) # status net - c10 = cells[10].find(text=True) # status org - c11 = cells[11].find(text=True) # status de - c12 = cells[12].find(text=True) # tld registered - c13 = cells[13].find(text=True) # changes - c14 = cells[14].find(text=True) # whois + c5 = cells[5].find(text=True) # Alexa + c6 = cells[6].find(text=True) # Dmoz.org + c7 = cells[7].find(text=True) # status com + c8 = cells[8].find(text=True) # status net + c9 = cells[9].find(text=True) # status org + c10 = cells[10].find(text=True) # 
status de + c11 = cells[11].find(text=True) # TLDs + c12 = cells[12].find(text=True) # RDT + c13 = cells[13].find(text=True) # End Date + c14 = cells[14].find(text=True) # Links + + # create available TLD list + available = '' + if c7 == "available": + available += ".com " - available = '' - if c8 == "available": - available += ".com " + if c8 == "available": + available += ".net " - if c9 == "available": - available += ".net " + if c9 == "available": + available += ".org " - if c10 == "available": - available += ".org " + if c10 == "available": + available += ".de " - if c11 == "available": - available += ".de " + status = "" - # Only grab status for keyword searches since it doesn't exist otherwise - status = "" - if keyword: - status = c14 + # Add other TLDs to list if marked available + if (c7 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".com" + domain_list.append([dom,c3,c4,available,status]) - # Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) - if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): - domain_list.append([c0,c3,c4,available,status]) + if (c8 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".net" + domain_list.append([dom,c3,c4,available,status]) + + if (c9 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".org" + domain_list.append([dom,c3,c4,available,status]) + + if (c10 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".de" + domain_list.append([dom,c3,c4,available,status]) + # Append original parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) + #if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): + # domain_list.append([c0,c3,c4,available,status]) + except Exception as e: - 
#print(e) + print("[!] Error: ", e) pass # Add additional sleep on requests to ExpiredDomains.net to avoid errors @@ -577,7 +636,10 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' if check: print("\n[*] Performing reputation checks for {} domains".format(len(domain_list))) - for domain_entry in domain_list: + domain_list_unique = [] + [domain_list_unique.append(item) for item in domain_list if item not in domain_list_unique] + + for domain_entry in domain_list_unique: domain = domain_entry[0] birthdate = domain_entry[1] archiveentries = domain_entry[2]