tweak logic and fix table column indexes

master
Joe Vest 2018-10-04 10:58:35 -05:00
parent 8f8abdb6e3
commit 4be8c803eb
3 changed files with 138 additions and 59 deletions

6
.gitignore vendored
View File

@ -1,3 +1,9 @@
*.html
*.txt
*.jpg
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

View File

@ -8,6 +8,12 @@ This Python based tool was written to quickly query the Expireddomains.net searc
## Changes
- 4 October 2018
+ Tweaked parsing logic
+ Fixed the parsed column indexes
+ Added additional TLDs of a found domain to the search list if the TLD is marked available.
+ If thisistest.com is found and thisistest.org is marked available, thisistest.org will be added to the search list
- 17 September 2018
+ Fixed Symantec WebPulse Site Review parsing errors caused by service updates
@ -67,37 +73,42 @@ Optional - Install additional OCR support dependencies
## Usage
List DomainHunter options
python3 domainhunter.py -h
usage: domainhunter.py [-h] [-q QUERY] [-c] [-r MAXRESULTS] [-s SINGLE]
[-w MAXWIDTH] [-v]
usage: domainhunter.py [-h] [-a] [-k KEYWORD] [-c] [-f FILENAME] [--ocr]
[-r MAXRESULTS] [-s SINGLE] [-t {0,1,2,3,4,5}]
[-w MAXWIDTH] [-V]
Finds expired domains, domain categorization, and Archive.org history to
determine good candidates for C2 and phishing domains
Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains
optional arguments:
-h, --help show this help message and exit
-k KEYWORD, --keyword KEYWORD
-h, --help show this help message and exit
-a, --alexa Filter results to Alexa listings
-k KEYWORD, --keyword KEYWORD
Keyword used to refine search results
-c, --check Perform domain reputation checks
-f FILENAME, --filename FILENAME
-c, --check Perform domain reputation checks
-f FILENAME, --filename FILENAME
Specify input file of line delimited domain names to
check
--ocr Perform OCR on CAPTCHAs when present
-r MAXRESULTS, --maxresults MAXRESULTS
--ocr Perform OCR on CAPTCHAs when challenged
-r MAXRESULTS, --maxresults MAXRESULTS
Number of results to return when querying latest
expired/deleted domains
-s SINGLE, --single SINGLE
-s SINGLE, --single SINGLE
Performs detailed reputation checks against a single
domain name/IP.
-t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5}
-t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5}
Modifies request timing to avoid CAPTCHAs. Slowest(0)
= 90-120 seconds, Default(3) = 10-20 seconds,
Fastest(5) = no delay
-w MAXWIDTH, --maxwidth MAXWIDTH
-w MAXWIDTH, --maxwidth MAXWIDTH
Width of text table
-V, --version show program's version number and exit
-V, --version show program's version number and exit
Examples:
./domainhunter.py -k apples -c --ocr -t5
./domainhunter.py --check --ocr -t3
./domainhunter.py --single mydomain.com
./domainhunter.py --keyword tech --check --ocr --timing 5 --alexa
./domainhunter.py --filename inputlist.txt --ocr --timing 5
Use defaults to check for most recent 100 domains and check reputation

View File

@ -293,7 +293,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains',
epilog = '''Examples:
epilog = '''
Examples:
./domainhunter.py -k apples -c --ocr -t5
./domainhunter.py --check --ocr -t3
./domainhunter.py --single mydomain.com
@ -313,8 +314,6 @@ if __name__ == "__main__":
parser.add_argument('-V','--version', action='version',version='%(prog)s {version}'.format(version=__version__))
args = parser.parse_args()
# Load dependent modules
try:
import requests
@ -495,6 +494,8 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
#print(soup)
try:
table = soup.find("table")
rows = table.findAll('tr')[1:]
for row in table.findAll('tr')[1:]:
# Alternative way to extract domain name
@ -510,60 +511,118 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
c2 = cells[2].find(text=True) # domainpop
c3 = cells[3].find(text=True) # birth
c4 = cells[4].find(text=True) # Archive.org entries
c5 = cells[5].find(text=True) # similarweb
c6 = cells[6].find(text=True) # similarweb country code
c7 = cells[7].find(text=True) # Dmoz.org
c8 = cells[8].find(text=True) # status com
c9 = cells[9].find(text=True) # status net
c10 = cells[10].find(text=True) # status org
c11 = cells[11].find(text=True) # status de
c12 = cells[12].find(text=True) # tld registered
c13 = cells[13].find(text=True) # Source List
c14 = cells[14].find(text=True) # Domain Status
c15 = "" # Related Domains
c5 = cells[5].find(text=True) # Alexa
c6 = cells[6].find(text=True) # Dmoz.org
c7 = cells[7].find(text=True) # status com
c8 = cells[8].find(text=True) # status net
c9 = cells[9].find(text=True) # status org
c10 = cells[10].find(text=True) # status de
c11 = cells[11].find(text=True) # TLDs
c12 = cells[12].find(text=True) # RDT
c13 = cells[13].find(text=True) # List
c14 = cells[14].find(text=True) # Status
c15 = "" # Links
# create available TLD list
available = ''
if c7 == "available":
available += ".com "
if c8 == "available":
available += ".net "
if c9 == "available":
available += ".org "
if c10 == "available":
available += ".de "
# Only grab status for keyword searches since it doesn't exist otherwise
status = ""
if keyword:
status = c14
# Only add Expired, not Pending, Backorder, etc
if c13 == "Expired":
# Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain)
#if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
# domain_list.append([c0,c3,c4,available,status])
# Add other TLDs to list if marked available
if (c7 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".com"
domain_list.append([dom,c3,c4,available,status])
if (c8 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".net"
domain_list.append([dom,c3,c4,available,status])
if (c9 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".org"
domain_list.append([dom,c3,c4,available,status])
if (c10 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".de"
domain_list.append([dom,c3,c4,available,status])
# Non-keyword search table format is slightly different
else:
c0 = cells[0].find(text=True) # domain
c1 = cells[1].find(text=True) # bl
c2 = cells[2].find(text=True) # domainpop
c3 = cells[3].find(text=True) # birth
c4 = cells[4].find(text=True) # Archive.org entries
c5 = cells[5].find(text=True) # similarweb
c6 = cells[6].find(text=True) # similarweb country code
c7 = cells[7].find(text=True) # Dmoz.org
c8 = cells[8].find(text=True) # status com
c9 = cells[9].find(text=True) # status net
c10 = cells[10].find(text=True) # status org
c11 = cells[11].find(text=True) # status de
c12 = cells[12].find(text=True) # tld registered
c13 = cells[13].find(text=True) # changes
c14 = cells[14].find(text=True) # whois
c5 = cells[5].find(text=True) # Alexa
c6 = cells[6].find(text=True) # Dmoz.org
c7 = cells[7].find(text=True) # status com
c8 = cells[8].find(text=True) # status net
c9 = cells[9].find(text=True) # status org
c10 = cells[10].find(text=True) # status de
c11 = cells[11].find(text=True) # TLDs
c12 = cells[12].find(text=True) # RDT
c13 = cells[13].find(text=True) # End Date
c14 = cells[14].find(text=True) # Links
# create available TLD list
available = ''
if c7 == "available":
available += ".com "
available = ''
if c8 == "available":
available += ".com "
if c8 == "available":
available += ".net "
if c9 == "available":
available += ".net "
if c9 == "available":
available += ".org "
if c10 == "available":
available += ".org "
if c10 == "available":
available += ".de "
if c11 == "available":
available += ".de "
status = ""
# Only grab status for keyword searches since it doesn't exist otherwise
status = ""
if keyword:
status = c14
# Add other TLDs to list if marked available
if (c7 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".com"
domain_list.append([dom,c3,c4,available,status])
# Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain)
if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
domain_list.append([c0,c3,c4,available,status])
if (c8 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".net"
domain_list.append([dom,c3,c4,available,status])
if (c9 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".org"
domain_list.append([dom,c3,c4,available,status])
if (c10 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".de"
domain_list.append([dom,c3,c4,available,status])
# Append original parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain)
#if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
# domain_list.append([c0,c3,c4,available,status])
except Exception as e:
#print(e)
print("[!] Error: ", e)
pass
# Add additional sleep on requests to ExpiredDomains.net to avoid errors
@ -577,7 +636,10 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
if check:
print("\n[*] Performing reputation checks for {} domains".format(len(domain_list)))
for domain_entry in domain_list:
domain_list_unique = []
[domain_list_unique.append(item) for item in domain_list if item not in domain_list_unique]
for domain_entry in domain_list_unique:
domain = domain_entry[0]
birthdate = domain_entry[1]
archiveentries = domain_entry[2]