Merge pull request #267 from vzamanillo/github-rate-limit

Handle GitHub Search API rate limit exceeded
master
bauthard 2020-07-18 16:45:13 +05:30 committed by GitHub
commit e30c5670c3
4 changed files with 185 additions and 53 deletions

View File

@@ -120,7 +120,7 @@ func (c ConfigFile) GetKeys() subscraping.Keys {
 		keys.DNSDB = c.DNSDB[rand.Intn(len(c.DNSDB))]
 	}
 	if (len(c.GitHub)) > 0 {
-		keys.GitHub = c.GitHub[rand.Intn(len(c.GitHub))]
+		keys.GitHub = c.GitHub
 	}
 	if len(c.IntelX) > 0 {
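
The config change above is what enables rotation: instead of plucking one random GitHub token for the whole run, GetKeys now hands the entire slice to the source. A standalone sketch of the behavioural difference (hypothetical token values, not part of the diff):

	package main

	import (
		"fmt"
		"math/rand"
	)

	func main() {
		// Hypothetical pool, mirroring ConfigFile.GitHub as a []string.
		tokens := []string{"ghp_tokenA", "ghp_tokenB", "ghp_tokenC"}

		// Old behaviour: one token chosen at random, the others never used.
		fmt.Println("old:", tokens[rand.Intn(len(tokens))])

		// New behaviour: the whole pool is passed through Keys.GitHub so the
		// source can rotate to a fresh token when one is rate limited.
		fmt.Println("new:", tokens)
	}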

View File

@@ -5,9 +5,12 @@ import (
 	"context"
 	"fmt"
 	"io/ioutil"
-	"regexp"
-	"strings"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"

 	jsoniter "github.com/json-iterator/go"
@@ -16,9 +19,14 @@ import (
 	"github.com/tomnomnom/linkheader"
 )

+type textMatch struct {
+	Fragment string `json:"fragment"`
+}
+
 type item struct {
-	Name    string `json:"name"`
-	HtmlUrl string `json:"html_url"`
+	Name        string      `json:"name"`
+	HtmlUrl     string      `json:"html_url"`
+	TextMatches []textMatch `json:"text_matches"`
 }

 type response struct {
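
These structs decode GitHub's code-search payload; text_matches is populated because the diff later requests the application/vnd.github.v3.text-match+json media type, so fragments can be scanned without always fetching the raw file. A minimal decoding sketch (the items field on response is an assumption, inferred from the data.Items loop further down); the diff to Run and enumerate continues below:

	package main

	import (
		"fmt"
		"strings"

		jsoniter "github.com/json-iterator/go"
	)

	type textMatch struct {
		Fragment string `json:"fragment"`
	}

	type item struct {
		Name        string      `json:"name"`
		HtmlUrl     string      `json:"html_url"`
		TextMatches []textMatch `json:"text_matches"`
	}

	// Assumed shape: the hunk above only shows "type response struct {".
	type response struct {
		Items []item `json:"items"`
	}

	func main() {
		// Trimmed-down code-search result with hypothetical values.
		body := `{"items": [{"name": "config.yml",
			"html_url": "https://github.com/org/repo/blob/master/config.yml",
			"text_matches": [{"fragment": "api.example.com"}]}]}`

		var data response
		if err := jsoniter.NewDecoder(strings.NewReader(body)).Decode(&data); err != nil {
			panic(err)
		}
		fmt.Println(data.Items[0].TextMatches[0].Fragment) // api.example.com
	}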
@@ -33,31 +41,45 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
 	results := make(chan subscraping.Result)

 	go func() {
-		if session.Keys.GitHub == "" {
+		if len(session.Keys.GitHub) == 0 {
 			close(results)
 			return
 		}
+
+		tokens := NewTokenManager(session.Keys.GitHub)

 		// search on GitHub with exact match
 		searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=\"%s\"", domain)
-		headers := map[string]string{
-			"Accept":        "application/vnd.github.v3+json",
-			"Authorization": "token " + session.Keys.GitHub,
-		}
-		s.enumerate(ctx, searchURL, headers, s.DomainRegexp(domain), session, results)
+		s.enumerate(ctx, searchURL, s.DomainRegexp(domain), tokens, session, results)
 		close(results)
 	}()

 	return results
 }

-func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[string]string, domainRegexp *regexp.Regexp, session *subscraping.Session, results chan subscraping.Result) {
+func (s *Source) enumerate(ctx context.Context, searchURL string, domainRegexp *regexp.Regexp, tokens *Tokens, session *subscraping.Session, results chan subscraping.Result) {
 	select {
 	case <-ctx.Done():
 		return
 	default:
 	}

+	token := tokens.Get()
+
+	if token.RetryAfter > 0 {
+		if len(tokens.pool) == 1 {
+			gologger.Verbosef("GitHub Search request rate limit exceeded, waiting for %d seconds before retry...\n", s.Name(), token.RetryAfter)
+			time.Sleep(time.Duration(token.RetryAfter) * time.Second)
+		} else {
+			token = tokens.Get()
+		}
+	}
+
+	headers := map[string]string{
+		"Accept":        "application/vnd.github.v3.text-match+json",
+		"Authorization": "token " + token.Hash,
+	}
+
 	// Initial request to GitHub search
 	resp, err := session.Get(ctx, searchURL, "", headers)
 	if err != nil {
@@ -65,66 +87,115 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 		return
 	}

-	// Links header, first, next, last...
-	linksHeader := linkheader.Parse(resp.Header.Get("Link"))
-
-	data := response{}
-
-	// Marshall json reponse
-	err = jsoniter.NewDecoder(resp.Body).Decode(&data)
-	resp.Body.Close()
-	if err != nil {
-		results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
-		return
-	}
-
-	// Response items iteration
-	for _, item := range data.Items {
-		resp, err := session.NormalGetWithContext(ctx, s.RawUrl(item.HtmlUrl))
-		if err != nil {
-			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
-			return
-		}
-
-		// Get the item code from the raw file url
-		code, err := ioutil.ReadAll(resp.Body)
-		resp.Body.Close()
-		if err != nil {
-			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
-			return
-		}
-
-		// Search for domain matches in the code
-		domainMatch := domainRegexp.FindStringSubmatch(string(code))
-		if len(domainMatch) > 0 {
-			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: domainMatch[1]}
-		}
-	}
-
-	// Proccess the next link recursively
-	for _, link := range linksHeader {
-		if link.Rel == "next" {
-			nextUrl, err := url.QueryUnescape(link.URL)
-			if err != nil {
-				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
-				return
-			}
-			gologger.Verbosef("Next URL %s\n", s.Name(), nextUrl)
-			s.enumerate(ctx, nextUrl, headers, domainRegexp, session, results)
-		}
-	}
+	// Retry enumeration after Retry-After seconds on rate limit abuse detection
+	ratelimitRemaining, _ := strconv.ParseInt(resp.Header.Get("X-Ratelimit-Remaining"), 10, 64)
+	if resp.StatusCode == http.StatusForbidden && ratelimitRemaining == 0 {
+		retryAfterSeconds, _ := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
+		tokens.setCurrentTokenExceeded(retryAfterSeconds)
+
+		s.enumerate(ctx, searchURL, domainRegexp, tokens, session, results)
+	} else {
+		// Links header, first, next, last...
+		linksHeader := linkheader.Parse(resp.Header.Get("Link"))
+
+		data := response{}
+
+		// Unmarshal JSON response
+		err = jsoniter.NewDecoder(resp.Body).Decode(&data)
+		resp.Body.Close()
+		if err != nil {
+			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
+			return
+		}
+
+		// Response items iteration
+		for _, item := range data.Items {
+			resp, err := session.NormalGetWithContext(ctx, rawUrl(item.HtmlUrl))
+			if err != nil {
+				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
+				return
+			}
+
+			// Get the item code from the raw file url
+			code, err := ioutil.ReadAll(resp.Body)
+			resp.Body.Close()
+			if err != nil {
+				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
+				return
+			}
+
+			var subdomains []string

+			// Search for domain matches in the code
+			subdomains = append(subdomains, matches(domainRegexp, normalizeContent(string(code)))...)
+
+			// Text matches iteration per item
+			for _, textMatch := range item.TextMatches {
+				// Search for domain matches in the text fragment
+				subdomains = append(subdomains, matches(domainRegexp, normalizeContent(textMatch.Fragment))...)
+			}
+
+			for _, subdomain := range unique(subdomains) {
+				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
+			}
+		}
+
+		// Process the next link recursively
+		for _, link := range linksHeader {
+			if link.Rel == "next" {
+				nextUrl, err := url.QueryUnescape(link.URL)
+				if err != nil {
+					results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
+					return
+				}
+				s.enumerate(ctx, nextUrl, domainRegexp, tokens, session, results)
+			}
+		}
+	}
 }
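
The rate-limit branch above keys off exactly two signals from the GitHub API: a 403 status together with an exhausted X-Ratelimit-Remaining quota, with Retry-After giving the cool-down in seconds. A self-contained sketch of that detection against a stub server (hypothetical handler, stdlib only); the diff's new helper functions continue below:

	package main

	import (
		"fmt"
		"net/http"
		"net/http/httptest"
		"strconv"
	)

	func main() {
		// Hypothetical stand-in for api.github.com that always reports an
		// exhausted rate limit.
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("X-Ratelimit-Remaining", "0")
			w.Header().Set("Retry-After", "30")
			w.WriteHeader(http.StatusForbidden)
		}))
		defer srv.Close()

		resp, err := http.Get(srv.URL)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		// Same check as the PR: 403 plus an exhausted quota triggers a retry
		// against the next token after Retry-After seconds.
		remaining, _ := strconv.ParseInt(resp.Header.Get("X-Ratelimit-Remaining"), 10, 64)
		if resp.StatusCode == http.StatusForbidden && remaining == 0 {
			retryAfter, _ := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
			fmt.Printf("rate limited, retry after %d seconds\n", retryAfter)
		}
	}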
+// Normalize content before matching: query-unescape and strip escaped tab and newline sequences
+func normalizeContent(content string) string {
+	normalizedContent, _ := url.QueryUnescape(content)
+	normalizedContent = strings.Replace(normalizedContent, "\\t", "", -1)
+	normalizedContent = strings.Replace(normalizedContent, "\\n", "", -1)
+	return normalizedContent
+}
+
+// Remove duplicates from a string slice
+func unique(arr []string) []string {
+	occured := map[string]bool{}
+	result := []string{}
+	for e := range arr {
+		if occured[arr[e]] != true {
+			occured[arr[e]] = true
+			result = append(result, arr[e])
+		}
+	}
+	return result
+}
+
+// Find matches by regular expression in any content
+func matches(regexp *regexp.Regexp, content string) []string {
+	var matches []string
+	match := regexp.FindAllString(content, -1)
+	if len(match) > 0 {
+		matches = unique(match)
+	}
+	return matches
+}
+
+// Raw URL to get the file's code and match for subdomains
+func rawUrl(htmlUrl string) string {
+	domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
+	return strings.Replace(domain, "/blob/", "/", -1)
+}
+
 // Domain regular expression to match subdomains in GitHub file contents
 func (s *Source) DomainRegexp(domain string) *regexp.Regexp {
 	rdomain := strings.Replace(domain, ".", "\\.", -1)
-	return regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\." + rdomain + ")")
-}
-
-// Raw URL to get the files code and match for subdomains
-func (s *Source) RawUrl(htmlUrl string) string {
-	domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
-	return strings.Replace(domain, "/blob/", "/", -1)
+	return regexp.MustCompile("(\\w+[.])*" + rdomain)
 }

 // Name returns the name of the source
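
Taken together, the new helpers turn a blob URL or an escaped JSON fragment into deduplicated subdomains. A standalone re-creation of the pipeline (hypothetical inputs; domainRegexp stands in for the DomainRegexp method above):

	package main

	import (
		"fmt"
		"net/url"
		"regexp"
		"strings"
	)

	func normalizeContent(content string) string {
		normalized, _ := url.QueryUnescape(content)
		normalized = strings.Replace(normalized, "\\t", "", -1)
		normalized = strings.Replace(normalized, "\\n", "", -1)
		return normalized
	}

	func domainRegexp(domain string) *regexp.Regexp {
		rdomain := strings.Replace(domain, ".", "\\.", -1)
		return regexp.MustCompile("(\\w+[.])*" + rdomain)
	}

	func rawUrl(htmlUrl string) string {
		domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
		return strings.Replace(domain, "/blob/", "/", -1)
	}

	func main() {
		// A text_matches fragment with escaped newlines, as it arrives inside JSON.
		fragment := `host: api.example.com\nbackup: dev.api.example.com`
		re := domainRegexp("example.com")
		fmt.Println(re.FindAllString(normalizeContent(fragment), -1))
		// [api.example.com dev.api.example.com]

		// Blob URL rewritten to its raw counterpart before fetching file contents.
		fmt.Println(rawUrl("https://github.com/org/repo/blob/master/config.yml"))
		// https://raw.githubusercontent.com/org/repo/master/config.yml
	}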

View File

@@ -0,0 +1,61 @@
+package github
+
+import "time"
+
+type token struct {
+	Hash         string
+	RetryAfter   int64
+	ExceededTime time.Time
+}
+
+type Tokens struct {
+	current int
+	pool    []token
+}
+
+func NewTokenManager(keys []string) *Tokens {
+	pool := []token{}
+	for _, key := range keys {
+		t := token{Hash: key, ExceededTime: time.Time{}, RetryAfter: 0}
+		pool = append(pool, t)
+	}
+
+	return &Tokens{
+		current: 0,
+		pool:    pool,
+	}
+}
+
+func (r *Tokens) setCurrentTokenExceeded(retryAfter int64) {
+	if r.current >= len(r.pool) {
+		r.current = r.current % len(r.pool)
+	}
+	if r.pool[r.current].RetryAfter == 0 {
+		r.pool[r.current].ExceededTime = time.Now()
+		r.pool[r.current].RetryAfter = retryAfter
+	}
+}
+
+func (r *Tokens) Get() token {
+	resetExceededTokens(r)
+
+	if r.current >= len(r.pool) {
+		r.current = r.current % len(r.pool)
+	}
+
+	result := r.pool[r.current]
+	r.current++
+
+	return result
+}
+
+func resetExceededTokens(r *Tokens) {
+	for i, token := range r.pool {
+		if token.RetryAfter > 0 {
+			if int64(time.Since(token.ExceededTime)/time.Second) > token.RetryAfter {
+				r.pool[i].ExceededTime = time.Time{}
+				r.pool[i].RetryAfter = 0
+			}
+		}
+	}
+}
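
Get walks the pool round-robin (the modulo guard wraps the cursor), setCurrentTokenExceeded parks a token for its Retry-After window, and resetExceededTokens returns it to service once the window has elapsed. A minimal test-style sketch of the rotation, assuming it sits in the same github package (not part of the PR):

	package github

	import "testing"

	// Illustrative sketch: verifies round-robin rotation over the key pool.
	func TestTokenRotation(t *testing.T) {
		tokens := NewTokenManager([]string{"tokenA", "tokenB"}) // hypothetical keys

		got := []string{tokens.Get().Hash, tokens.Get().Hash, tokens.Get().Hash}
		want := []string{"tokenA", "tokenB", "tokenA"} // wraps around once the pool is exhausted

		for i := range want {
			if got[i] != want[i] {
				t.Fatalf("rotation %d: want %s, got %s", i, want[i], got[i])
			}
		}
	}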

View File

@@ -35,7 +35,7 @@ type Keys struct {
 	Certspotter          string   `json:"certspotter"`
 	Chaos                string   `json:"chaos"`
 	DNSDB                string   `json:"dnsdb"`
-	GitHub               string   `json:"github"`
+	GitHub               []string `json:"github"`
 	IntelXHost           string   `json:"intelXHost"`
 	IntelXKey            string   `json:"intelXKey"`
 	PassiveTotalUsername string   `json:"passivetotal_username"`
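
Since the json tag is unchanged, upgrading a provider config is just a matter of turning the single github token into an array. A trimmed-down decoding sketch (hypothetical key values; the real Keys struct carries many more sources):

	package main

	import (
		"encoding/json"
		"fmt"
	)

	// Trimmed-down Keys for illustration only.
	type Keys struct {
		DNSDB  string   `json:"dnsdb"`
		GitHub []string `json:"github"`
	}

	func main() {
		raw := []byte(`{"dnsdb": "key1", "github": ["ghp_tokenA", "ghp_tokenB"]}`)

		var keys Keys
		if err := json.Unmarshal(raw, &keys); err != nil {
			panic(err)
		}
		fmt.Println(keys.GitHub) // [ghp_tokenA ghp_tokenB]
	}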