@@ -5,9 +5,12 @@ import (
 	"context"
 	"fmt"
 	"io/ioutil"
-	"regexp"
-	"strings"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
 	"time"
 
 	jsoniter "github.com/json-iterator/go"
@@ -16,9 +19,14 @@ import (
 	"github.com/tomnomnom/linkheader"
 )
 
+type textMatch struct {
+	Fragment string `json:"fragment"`
+}
+
 type item struct {
 	Name        string      `json:"name"`
 	HtmlUrl     string      `json:"html_url"`
+	TextMatches []textMatch `json:"text_matches"`
 }
 
 type response struct {
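Note (not part of the diff): the new text_matches field is only populated when the request opts into the text-match media type via the Accept header set later in enumerate. A quick sketch of how a trimmed, hypothetical search item would decode into these structs:

package main

import (
	"fmt"

	jsoniter "github.com/json-iterator/go"
)

type textMatch struct {
	Fragment string `json:"fragment"`
}

type item struct {
	Name        string      `json:"name"`
	HtmlUrl     string      `json:"html_url"`
	TextMatches []textMatch `json:"text_matches"`
}

func main() {
	// Trimmed, made-up code-search item; the real response carries many
	// more fields, which these structs simply ignore.
	raw := `{
		"name": "config.yml",
		"html_url": "https://github.com/owner/repo/blob/master/config.yml",
		"text_matches": [{"fragment": "host: api.example.com"}]
	}`

	var i item
	if err := jsoniter.UnmarshalFromString(raw, &i); err != nil {
		panic(err)
	}
	fmt.Println(i.Name, i.TextMatches[0].Fragment)
}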
@@ -33,31 +41,45 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
 	results := make(chan subscraping.Result)
 
 	go func() {
-		if session.Keys.GitHub == "" {
+		if len(session.Keys.GitHub) == 0 {
 			close(results)
 			return
 		}
 
+		tokens := NewTokenManager(session.Keys.GitHub)
+
 		// search on GitHub with exact match
 		searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=\"%s\"", domain)
-		headers := map[string]string{
-			"Accept":        "application/vnd.github.v3+json",
-			"Authorization": "token " + session.Keys.GitHub,
-		}
-		s.enumerate(ctx, searchURL, headers, s.DomainRegexp(domain), session, results)
+		s.enumerate(ctx, searchURL, s.DomainRegexp(domain), tokens, session, results)
 		close(results)
 	}()
 
 	return results
 }
 
-func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[string]string, domainRegexp *regexp.Regexp, session *subscraping.Session, results chan subscraping.Result) {
+func (s *Source) enumerate(ctx context.Context, searchURL string, domainRegexp *regexp.Regexp, tokens *Tokens, session *subscraping.Session, results chan subscraping.Result) {
 	select {
 	case <-ctx.Done():
 		return
 	default:
 	}
 
+	token := tokens.Get()
+
+	if token.RetryAfter > 0 {
+		if len(tokens.pool) == 1 {
+			gologger.Verbosef("GitHub Search request rate limit exceeded, waiting for %d seconds before retry... \n", s.Name(), token.RetryAfter)
+			time.Sleep(time.Duration(token.RetryAfter) * time.Second)
+		} else {
+			token = tokens.Get()
+		}
+	}
+
+	headers := map[string]string{
+		"Accept":        "application/vnd.github.v3.text-match+json",
+		"Authorization": "token " + token.Hash,
+	}
+
 	// Initial request to GitHub search
 	resp, err := session.Get(ctx, searchURL, "", headers)
 	if err != nil {
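For context (not part of this hunk): NewTokenManager, Get, and setCurrentTokenExceeded come from the token manager added alongside this file. A minimal sketch of the round-robin behavior the hunk above relies on, with type and field names inferred from usage here rather than taken from the actual implementation:

package main

import (
	"fmt"
	"time"
)

// Token and Tokens are inferred from how enumerate uses them.
type Token struct {
	Hash         string    // the GitHub API token itself
	RetryAfter   int64     // seconds to wait before this token is usable again
	ExceededTime time.Time // when the rate limit was hit
}

type Tokens struct {
	current int
	pool    []Token
}

func NewTokenManager(keys []string) *Tokens {
	pool := make([]Token, 0, len(keys))
	for _, key := range keys {
		pool = append(pool, Token{Hash: key})
	}
	return &Tokens{pool: pool}
}

// Get hands out tokens round-robin, clearing RetryAfter once the
// wait window of an exhausted token has elapsed.
func (t *Tokens) Get() *Token {
	token := &t.pool[t.current]
	t.current = (t.current + 1) % len(t.pool)
	if token.RetryAfter > 0 && time.Since(token.ExceededTime) > time.Duration(token.RetryAfter)*time.Second {
		token.RetryAfter = 0
	}
	return token
}

// setCurrentTokenExceeded parks the most recently returned token.
func (t *Tokens) setCurrentTokenExceeded(retryAfter int64) {
	last := (t.current + len(t.pool) - 1) % len(t.pool)
	t.pool[last].ExceededTime = time.Now()
	t.pool[last].RetryAfter = retryAfter
}

func main() {
	tokens := NewTokenManager([]string{"ghp_aaa", "ghp_bbb"})
	tokens.Get()                       // hands out ghp_aaa
	tokens.setCurrentTokenExceeded(60) // ghp_aaa is parked for 60s
	fmt.Println(tokens.Get().Hash)     // ghp_bbb
}

With a single token in the pool, enumerate sleeps for the Retry-After window instead; with several, it simply rotates to the next one.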
@@ -65,6 +87,14 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 		return
 	}
 
+	// Retry enumeration after Retry-After seconds on rate limit abuse detected
+	ratelimitRemaining, _ := strconv.ParseInt(resp.Header.Get("X-Ratelimit-Remaining"), 10, 64)
+	if resp.StatusCode == http.StatusForbidden && ratelimitRemaining == 0 {
+		retryAfterSeconds, _ := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
+		tokens.setCurrentTokenExceeded(retryAfterSeconds)
+
+		s.enumerate(ctx, searchURL, domainRegexp, tokens, session, results)
+	} else {
 		// Links header, first, next, last...
 		linksHeader := linkheader.Parse(resp.Header.Get("Link"))
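To make the branch above concrete, a small illustration (header values are hypothetical; GitHub signals search rate limiting with a 403 plus these two headers):

package main

import (
	"fmt"
	"net/http"
	"strconv"
)

func main() {
	// Hypothetical headers from a rate-limited code-search response.
	h := http.Header{}
	h.Set("X-Ratelimit-Remaining", "0")
	h.Set("Retry-After", "60")

	remaining, _ := strconv.ParseInt(h.Get("X-Ratelimit-Remaining"), 10, 64)
	retryAfter, _ := strconv.ParseInt(h.Get("Retry-After"), 10, 64)

	// On a 403 with remaining == 0, the hunk above parks the current
	// token for retryAfter seconds and re-enters enumerate with the
	// same searchURL, so another token can pick up the query.
	fmt.Println(remaining, retryAfter) // 0 60
}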
@@ -80,7 +110,7 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 
 		// Response items iteration
 		for _, item := range data.Items {
-			resp, err := session.NormalGetWithContext(ctx, s.RawUrl(item.HtmlUrl))
+			resp, err := session.NormalGetWithContext(ctx, rawUrl(item.HtmlUrl))
 			if err != nil {
 				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
 				return
@@ -94,10 +124,20 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 				return
 			}
 
+			var subdomains []string
+
 			// Search for domain matches in the code
-			domainMatch := domainRegexp.FindStringSubmatch(string(code))
-			if len(domainMatch) > 0 {
-				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: domainMatch[1]}
+			subdomains = append(subdomains, matches(domainRegexp, normalizeContent(string(code)))...)
+
+			// Text matches iteration per item
+			for _, textMatch := range item.TextMatches {
+				// Search for domain matches in the text fragment
+				subdomains = append(subdomains, matches(domainRegexp, normalizeContent(textMatch.Fragment))...)
+			}
+
+			for _, subdomain := range unique(subdomains) {
+				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
 			}
 		}
@@ -109,22 +149,53 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 					results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
 					return
 				}
 				gologger.Verbosef("Next URL %s\n", s.Name(), nextUrl)
-				s.enumerate(ctx, nextUrl, headers, domainRegexp, session, results)
+				s.enumerate(ctx, nextUrl, domainRegexp, tokens, session, results)
 			}
 		}
 	}
 }
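The nextUrl above is extracted from the Link response header; that extraction sits outside this hunk. A small sketch of what pagination looks like with the linkheader package (the header value is made up):

package main

import (
	"fmt"

	"github.com/tomnomnom/linkheader"
)

func main() {
	// Example Link header in the shape the GitHub search API returns.
	header := `<https://api.github.com/search/code?q=example&page=2>; rel="next",` +
		` <https://api.github.com/search/code?q=example&page=5>; rel="last"`

	// enumerate keeps following rel="next" recursively until the API
	// stops returning one.
	for _, link := range linkheader.Parse(header).FilterByRel("next") {
		fmt.Println(link.URL) // the nextUrl passed to the recursive call
	}
}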
+// Normalize content before matching, query unescape, remove tabs and new line chars
+func normalizeContent(content string) string {
+	normalizedContent, _ := url.QueryUnescape(content)
+	normalizedContent = strings.Replace(normalizedContent, "\\t", "", -1)
+	normalizedContent = strings.Replace(normalizedContent, "\\n", "", -1)
+	return normalizedContent
+}
+
+// Remove duplicates from string array
+func unique(arr []string) []string {
+	occured := map[string]bool{}
+	result := []string{}
+	for e := range arr {
+		if occured[arr[e]] != true {
+			occured[arr[e]] = true
+			result = append(result, arr[e])
+		}
+	}
+	return result
+}
+
+// Find matches by regular expression in any content
+func matches(regexp *regexp.Regexp, content string) []string {
+	var matches []string
+	match := regexp.FindAllString(content, -1)
+	if len(match) > 0 {
+		matches = unique(match)
+	}
+	return matches
+}
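A usage sketch of the three helpers above, chained the same way the item loop chains them (assumes they are in scope, i.e. dropped next to the code above; the fragment value and domain are made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	re := regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\.example\\.com)")
	fragment := "url=https%3A%2F%2Fapi.example.com\\t assets.example.com api.example.com"

	// normalizeContent unescapes the %XX sequences and strips the
	// literal "\t", leaving plain text for the regexp; matches then
	// deduplicates the hits via unique.
	fmt.Println(matches(re, normalizeContent(fragment)))
	// [api.example.com assets.example.com]
}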
+// Raw URL to get the files code and match for subdomains
+func rawUrl(htmlUrl string) string {
+	domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
+	return strings.Replace(domain, "/blob/", "/", -1)
+}
+
 // Domain regular expression to match subdomains in github files code
 func (s *Source) DomainRegexp(domain string) *regexp.Regexp {
 	rdomain := strings.Replace(domain, ".", "\\.", -1)
-	return regexp.MustCompile("(\\w+[.])*" + rdomain)
+	return regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\." + rdomain + ")")
 }
 
-// Raw URL to get the files code and match for subdomains
-func (s *Source) RawUrl(htmlUrl string) string {
-	domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
-	return strings.Replace(domain, "/blob/", "/", -1)
-}
-
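The tightened expression requires at least one label in front of the target domain and accepts hyphens, which the old \w-based pattern did not. A before/after comparison (domain and content made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	oldRe := regexp.MustCompile("(\\w+[.])*example\\.com")
	newRe := regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\.example\\.com)")

	content := "foo-bar.example.com example.com"
	fmt.Println(oldRe.FindAllString(content, -1)) // [bar.example.com example.com] — drops the hyphenated label, keeps the bare apex
	fmt.Println(newRe.FindAllString(content, -1)) // [foo-bar.example.com]
}

rawUrl, for its part, just rewrites html_url values such as https://github.com/owner/repo/blob/master/file.go to https://raw.githubusercontent.com/owner/repo/master/file.go, so the follow-up fetch returns the file body instead of an HTML page.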
 // Name returns the name of the source