From 4be8c803eb0749768cba9e09aaa2dcd0f69a17ff Mon Sep 17 00:00:00 2001 From: Joe Vest Date: Thu, 4 Oct 2018 10:58:35 -0500 Subject: [PATCH] tweak logic and fix table column indexes --- .gitignore | 6 ++ README.md | 45 +++++++++------ domainhunter.py | 146 ++++++++++++++++++++++++++++++++++-------------- 3 files changed, 138 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index 623c252..d3d9885 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ *.html *.txt *.jpg + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json \ No newline at end of file diff --git a/README.md b/README.md index 5b5d17a..9897628 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,12 @@ This Python based tool was written to quickly query the Expireddomains.net searc ## Changes +- 4 October 2018 + + Tweaked parsing logic + + Fixed parsed column indexes + + Added additional TLDs to a found domain if the TLD is marked available. + + If thisistest.com is found and thisistest.org is marked available, thisistest.org will be added to the search list + - 17 September 2018 + Fixed Symantec WebPulse Site Review parsing errors caused by service updates @@ -67,37 +73,42 @@ Optional - Install additional OCR support dependencies ## Usage -List DomainHunter options - - python3 domainhunter.py -h - usage: domainhunter.py [-h] [-q QUERY] [-c] [-r MAXRESULTS] [-s SINGLE] - [-w MAXWIDTH] [-v] + usage: domainhunter.py [-h] [-a] [-k KEYWORD] [-c] [-f FILENAME] [--ocr] + [-r MAXRESULTS] [-s SINGLE] [-t {0,1,2,3,4,5}] + [-w MAXWIDTH] [-V] - Finds expired domains, domain categorization, and Archive.org history to - determine good candidates for C2 and phishing domains + Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains optional arguments: - -h, --help show this help message and exit - -k KEYWORD, --keyword KEYWORD + -h, --help show this help message and exit + -a, 
--alexa Filter results to Alexa listings + -k KEYWORD, --keyword KEYWORD Keyword used to refine search results - -c, --check Perform domain reputation checks - -f FILENAME, --filename FILENAME + -c, --check Perform domain reputation checks + -f FILENAME, --filename FILENAME Specify input file of line delimited domain names to check - --ocr Perform OCR on CAPTCHAs when present - -r MAXRESULTS, --maxresults MAXRESULTS + --ocr Perform OCR on CAPTCHAs when challenged + -r MAXRESULTS, --maxresults MAXRESULTS Number of results to return when querying latest expired/deleted domains - -s SINGLE, --single SINGLE + -s SINGLE, --single SINGLE Performs detailed reputation checks against a single domain name/IP. - -t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5} + -t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5} Modifies request timing to avoid CAPTCHAs. Slowest(0) = 90-120 seconds, Default(3) = 10-20 seconds, Fastest(5) = no delay - -w MAXWIDTH, --maxwidth MAXWIDTH + -w MAXWIDTH, --maxwidth MAXWIDTH Width of text table - -V, --version show program's version number and exit + -V, --version show program's version number and exit + + Examples: + ./domainhunter.py -k apples -c --ocr -t5 + ./domainhunter.py --check --ocr -t3 + ./domainhunter.py --single mydomain.com + ./domainhunter.py --keyword tech --check --ocr --timing 5 --alexa + ./domainhunter.py --filename inputlist.txt --ocr --timing 5 Use defaults to check for most recent 100 domains and check reputation diff --git a/domainhunter.py b/domainhunter.py index 83c314b..d949c6e 100644 --- a/domainhunter.py +++ b/domainhunter.py @@ -293,7 +293,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( description='Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains', - epilog = '''Examples: + epilog = ''' +Examples: ./domainhunter.py -k apples -c --ocr -t5 ./domainhunter.py --check --ocr -t3 ./domainhunter.py --single mydomain.com @@ -313,8 +314,6 @@ if __name__ == 
"__main__": parser.add_argument('-V','--version', action='version',version='%(prog)s {version}'.format(version=__version__)) args = parser.parse_args() - - # Load dependent modules try: import requests @@ -495,6 +494,8 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' #print(soup) try: table = soup.find("table") + + rows = table.findAll('tr')[1:] for row in table.findAll('tr')[1:]: # Alternative way to extract domain name @@ -510,60 +511,118 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' c2 = cells[2].find(text=True) # domainpop c3 = cells[3].find(text=True) # birth c4 = cells[4].find(text=True) # Archive.org entries - c5 = cells[5].find(text=True) # similarweb - c6 = cells[6].find(text=True) # similarweb country code - c7 = cells[7].find(text=True) # Dmoz.org - c8 = cells[8].find(text=True) # status com - c9 = cells[9].find(text=True) # status net - c10 = cells[10].find(text=True) # status org - c11 = cells[11].find(text=True) # status de - c12 = cells[12].find(text=True) # tld registered - c13 = cells[13].find(text=True) # Source List - c14 = cells[14].find(text=True) # Domain Status - c15 = "" # Related Domains + c5 = cells[5].find(text=True) # Alexa + c6 = cells[6].find(text=True) # Dmoz.org + c7 = cells[7].find(text=True) # status com + c8 = cells[8].find(text=True) # status net + c9 = cells[9].find(text=True) # status org + c10 = cells[10].find(text=True) # status de + c11 = cells[11].find(text=True) # TLDs + c12 = cells[12].find(text=True) # RDT + c13 = cells[13].find(text=True) # List + c14 = cells[14].find(text=True) # Status + c15 = "" # Links + + # create available TLD list + available = '' + if c7 == "available": + available += ".com " + + if c8 == "available": + available += ".net " + + if c9 == "available": + available += ".org " + + if c10 == "available": + available += ".de " + + # Only grab status for keyword searches since it doesn't exist otherwise + status = "" + if keyword: + 
status = c14 + + # Only add Expired, not Pending, Backorder, etc + if c13 == "Expired": + # Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) + #if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): + # domain_list.append([c0,c3,c4,available,status]) + + # Add other TLDs to list if marked available + if (c7 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".com" + domain_list.append([dom,c3,c4,available,status]) + + if (c8 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".net" + domain_list.append([dom,c3,c4,available,status]) + + if (c9 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".org" + domain_list.append([dom,c3,c4,available,status]) + + if (c10 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".de" + domain_list.append([dom,c3,c4,available,status]) # Non-keyword search table format is slightly different else: + c0 = cells[0].find(text=True) # domain c1 = cells[1].find(text=True) # bl c2 = cells[2].find(text=True) # domainpop c3 = cells[3].find(text=True) # birth c4 = cells[4].find(text=True) # Archive.org entries - c5 = cells[5].find(text=True) # similarweb - c6 = cells[6].find(text=True) # similarweb country code - c7 = cells[7].find(text=True) # Dmoz.org - c8 = cells[8].find(text=True) # status com - c9 = cells[9].find(text=True) # status net - c10 = cells[10].find(text=True) # status org - c11 = cells[11].find(text=True) # status de - c12 = cells[12].find(text=True) # tld registered - c13 = cells[13].find(text=True) # changes - c14 = cells[14].find(text=True) # whois + c5 = cells[5].find(text=True) # Alexa + c6 = cells[6].find(text=True) # Dmoz.org + c7 = cells[7].find(text=True) # status com + c8 = cells[8].find(text=True) # status net + c9 = cells[9].find(text=True) # status org + c10 = cells[10].find(text=True) # 
status de + c11 = cells[11].find(text=True) # TLDs + c12 = cells[12].find(text=True) # RDT + c13 = cells[13].find(text=True) # End Date + c14 = cells[14].find(text=True) # Links + + # create available TLD list + available = '' + if c7 == "available": + available += ".com " - available = '' - if c8 == "available": - available += ".com " + if c8 == "available": + available += ".net " - if c9 == "available": - available += ".net " + if c9 == "available": + available += ".org " - if c10 == "available": - available += ".org " + if c10 == "available": + available += ".de " - if c11 == "available": - available += ".de " + status = "" - # Only grab status for keyword searches since it doesn't exist otherwise - status = "" - if keyword: - status = c14 + # Add other TLDs to list if marked available + if (c7 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".com" + domain_list.append([dom,c3,c4,available,status]) - # Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) - if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): - domain_list.append([c0,c3,c4,available,status]) + if (c8 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".net" + domain_list.append([dom,c3,c4,available,status]) + + if (c9 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".org" + domain_list.append([dom,c3,c4,available,status]) + + if (c10 == "available") and (c0 not in maldomainsList): + dom = c0.split(".")[0] + ".de" + domain_list.append([dom,c3,c4,available,status]) + # Append original parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) + #if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): + # domain_list.append([c0,c3,c4,available,status]) + except Exception as e: - 
#print(e) + print("[!] Error: ", e) pass # Add additional sleep on requests to ExpiredDomains.net to avoid errors @@ -577,7 +636,10 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)' if check: print("\n[*] Performing reputation checks for {} domains".format(len(domain_list))) - for domain_entry in domain_list: + domain_list_unique = [] + [domain_list_unique.append(item) for item in domain_list if item not in domain_list_unique] + + for domain_entry in domain_list_unique: domain = domain_entry[0] birthdate = domain_entry[1] archiveentries = domain_entry[2]