tweak logic and fix table column indexes

master
Joe Vest 2018-10-04 10:58:35 -05:00
parent 8f8abdb6e3
commit 4be8c803eb
3 changed files with 138 additions and 59 deletions

6
.gitignore vendored
View File

@ -1,3 +1,9 @@
*.html *.html
*.txt *.txt
*.jpg *.jpg
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

View File

@ -8,6 +8,12 @@ This Python based tool was written to quickly query the Expireddomains.net searc
## Changes ## Changes
- 4 October 2018
+ Tweaked parsing logic
+ Fixed parsed table column indexes
+ Added additional TLDs to the search list if the TLD is marked available.
+ If thisistest.com is found and thisistest.org is marked available, thisistest.org will be added to the search list
- 17 September 2018 - 17 September 2018
+ Fixed Symantec WebPulse Site Review parsing errors caused by service updates + Fixed Symantec WebPulse Site Review parsing errors caused by service updates
@ -67,37 +73,42 @@ Optional - Install additional OCR support dependencies
## Usage ## Usage
List DomainHunter options usage: domainhunter.py [-h] [-a] [-k KEYWORD] [-c] [-f FILENAME] [--ocr]
[-r MAXRESULTS] [-s SINGLE] [-t {0,1,2,3,4,5}]
python3 domainhunter.py -h [-w MAXWIDTH] [-V]
usage: domainhunter.py [-h] [-q QUERY] [-c] [-r MAXRESULTS] [-s SINGLE]
[-w MAXWIDTH] [-v]
Finds expired domains, domain categorization, and Archive.org history to Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains
determine good candidates for C2 and phishing domains
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
-k KEYWORD, --keyword KEYWORD -a, --alexa Filter results to Alexa listings
-k KEYWORD, --keyword KEYWORD
Keyword used to refine search results Keyword used to refine search results
-c, --check Perform domain reputation checks -c, --check Perform domain reputation checks
-f FILENAME, --filename FILENAME -f FILENAME, --filename FILENAME
Specify input file of line delimited domain names to Specify input file of line delimited domain names to
check check
--ocr Perform OCR on CAPTCHAs when present --ocr Perform OCR on CAPTCHAs when challenged
-r MAXRESULTS, --maxresults MAXRESULTS -r MAXRESULTS, --maxresults MAXRESULTS
Number of results to return when querying latest Number of results to return when querying latest
expired/deleted domains expired/deleted domains
-s SINGLE, --single SINGLE -s SINGLE, --single SINGLE
Performs detailed reputation checks against a single Performs detailed reputation checks against a single
domain name/IP. domain name/IP.
-t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5} -t {0,1,2,3,4,5}, --timing {0,1,2,3,4,5}
Modifies request timing to avoid CAPTCHAs. Slowest(0) Modifies request timing to avoid CAPTCHAs. Slowest(0)
= 90-120 seconds, Default(3) = 10-20 seconds, = 90-120 seconds, Default(3) = 10-20 seconds,
Fastest(5) = no delay Fastest(5) = no delay
-w MAXWIDTH, --maxwidth MAXWIDTH -w MAXWIDTH, --maxwidth MAXWIDTH
Width of text table Width of text table
-V, --version show program's version number and exit -V, --version show program's version number and exit
Examples:
./domainhunter.py -k apples -c --ocr -t5
./domainhunter.py --check --ocr -t3
./domainhunter.py --single mydomain.com
./domainhunter.py --keyword tech --check --ocr --timing 5 --alexa
./domainhunter.py --filename inputlist.txt --ocr --timing 5
Use defaults to check for most recent 100 domains and check reputation Use defaults to check for most recent 100 domains and check reputation

View File

@ -293,7 +293,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains', description='Finds expired domains, domain categorization, and Archive.org history to determine good candidates for C2 and phishing domains',
epilog = '''Examples: epilog = '''
Examples:
./domainhunter.py -k apples -c --ocr -t5 ./domainhunter.py -k apples -c --ocr -t5
./domainhunter.py --check --ocr -t3 ./domainhunter.py --check --ocr -t3
./domainhunter.py --single mydomain.com ./domainhunter.py --single mydomain.com
@ -313,8 +314,6 @@ if __name__ == "__main__":
parser.add_argument('-V','--version', action='version',version='%(prog)s {version}'.format(version=__version__)) parser.add_argument('-V','--version', action='version',version='%(prog)s {version}'.format(version=__version__))
args = parser.parse_args() args = parser.parse_args()
# Load dependent modules # Load dependent modules
try: try:
import requests import requests
@ -495,6 +494,8 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
#print(soup) #print(soup)
try: try:
table = soup.find("table") table = soup.find("table")
rows = table.findAll('tr')[1:]
for row in table.findAll('tr')[1:]: for row in table.findAll('tr')[1:]:
# Alternative way to extract domain name # Alternative way to extract domain name
@ -510,60 +511,118 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
c2 = cells[2].find(text=True) # domainpop c2 = cells[2].find(text=True) # domainpop
c3 = cells[3].find(text=True) # birth c3 = cells[3].find(text=True) # birth
c4 = cells[4].find(text=True) # Archive.org entries c4 = cells[4].find(text=True) # Archive.org entries
c5 = cells[5].find(text=True) # similarweb c5 = cells[5].find(text=True) # Alexa
c6 = cells[6].find(text=True) # similarweb country code c6 = cells[6].find(text=True) # Dmoz.org
c7 = cells[7].find(text=True) # Dmoz.org c7 = cells[7].find(text=True) # status com
c8 = cells[8].find(text=True) # status com c8 = cells[8].find(text=True) # status net
c9 = cells[9].find(text=True) # status net c9 = cells[9].find(text=True) # status org
c10 = cells[10].find(text=True) # status org c10 = cells[10].find(text=True) # status de
c11 = cells[11].find(text=True) # status de c11 = cells[11].find(text=True) # TLDs
c12 = cells[12].find(text=True) # tld registered c12 = cells[12].find(text=True) # RDT
c13 = cells[13].find(text=True) # Source List c13 = cells[13].find(text=True) # List
c14 = cells[14].find(text=True) # Domain Status c14 = cells[14].find(text=True) # Status
c15 = "" # Related Domains c15 = "" # Links
# create available TLD list
available = ''
if c7 == "available":
available += ".com "
if c8 == "available":
available += ".net "
if c9 == "available":
available += ".org "
if c10 == "available":
available += ".de "
# Only grab status for keyword searches since it doesn't exist otherwise
status = ""
if keyword:
status = c14
# Only add Expired, not Pending, Backorder, etc
if c13 == "Expired":
# Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain)
#if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
# domain_list.append([c0,c3,c4,available,status])
# Add other TLDs to list if marked available
if (c7 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".com"
domain_list.append([dom,c3,c4,available,status])
if (c8 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".net"
domain_list.append([dom,c3,c4,available,status])
if (c9 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".org"
domain_list.append([dom,c3,c4,available,status])
if (c10 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".de"
domain_list.append([dom,c3,c4,available,status])
# Non-keyword search table format is slightly different # Non-keyword search table format is slightly different
else: else:
c0 = cells[0].find(text=True) # domain c0 = cells[0].find(text=True) # domain
c1 = cells[1].find(text=True) # bl c1 = cells[1].find(text=True) # bl
c2 = cells[2].find(text=True) # domainpop c2 = cells[2].find(text=True) # domainpop
c3 = cells[3].find(text=True) # birth c3 = cells[3].find(text=True) # birth
c4 = cells[4].find(text=True) # Archive.org entries c4 = cells[4].find(text=True) # Archive.org entries
c5 = cells[5].find(text=True) # similarweb c5 = cells[5].find(text=True) # Alexa
c6 = cells[6].find(text=True) # similarweb country code c6 = cells[6].find(text=True) # Dmoz.org
c7 = cells[7].find(text=True) # Dmoz.org c7 = cells[7].find(text=True) # status com
c8 = cells[8].find(text=True) # status com c8 = cells[8].find(text=True) # status net
c9 = cells[9].find(text=True) # status net c9 = cells[9].find(text=True) # status org
c10 = cells[10].find(text=True) # status org c10 = cells[10].find(text=True) # status de
c11 = cells[11].find(text=True) # status de c11 = cells[11].find(text=True) # TLDs
c12 = cells[12].find(text=True) # tld registered c12 = cells[12].find(text=True) # RDT
c13 = cells[13].find(text=True) # changes c13 = cells[13].find(text=True) # End Date
c14 = cells[14].find(text=True) # whois c14 = cells[14].find(text=True) # Links
# create available TLD list
available = ''
if c7 == "available":
available += ".com "
available = '' if c8 == "available":
if c8 == "available": available += ".net "
available += ".com "
if c9 == "available": if c9 == "available":
available += ".net " available += ".org "
if c10 == "available": if c10 == "available":
available += ".org " available += ".de "
if c11 == "available": status = ""
available += ".de "
# Only grab status for keyword searches since it doesn't exist otherwise # Add other TLDs to list if marked available
status = "" if (c7 == "available") and (c0 not in maldomainsList):
if keyword: dom = c0.split(".")[0] + ".com"
status = c14 domain_list.append([dom,c3,c4,available,status])
# Append parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain) if (c8 == "available") and (c0 not in maldomainsList):
if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList): dom = c0.split(".")[0] + ".net"
domain_list.append([c0,c3,c4,available,status]) domain_list.append([dom,c3,c4,available,status])
if (c9 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".org"
domain_list.append([dom,c3,c4,available,status])
if (c10 == "available") and (c0 not in maldomainsList):
dom = c0.split(".")[0] + ".de"
domain_list.append([dom,c3,c4,available,status])
# Append original parsed domain data to list if it matches our criteria (.com|.net|.org and not a known malware domain)
#if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
# domain_list.append([c0,c3,c4,available,status])
except Exception as e: except Exception as e:
#print(e) print("[!] Error: ", e)
pass pass
# Add additional sleep on requests to ExpiredDomains.net to avoid errors # Add additional sleep on requests to ExpiredDomains.net to avoid errors
@ -577,7 +636,10 @@ If you plan to use this content for illegal purpose, don't. Have a nice day :)'
if check: if check:
print("\n[*] Performing reputation checks for {} domains".format(len(domain_list))) print("\n[*] Performing reputation checks for {} domains".format(len(domain_list)))
for domain_entry in domain_list: domain_list_unique = []
[domain_list_unique.append(item) for item in domain_list if item not in domain_list_unique]
for domain_entry in domain_list_unique:
domain = domain_entry[0] domain = domain_entry[0]
birthdate = domain_entry[1] birthdate = domain_entry[1]
archiveentries = domain_entry[2] archiveentries = domain_entry[2]