Merge pull request #267 from vzamanillo/github-rate-limit
Handle GitHub Search API rate limit exceeded
master
commit
e30c5670c3
|
@ -120,7 +120,7 @@ func (c ConfigFile) GetKeys() subscraping.Keys {
|
||||||
keys.DNSDB = c.DNSDB[rand.Intn(len(c.DNSDB))]
|
keys.DNSDB = c.DNSDB[rand.Intn(len(c.DNSDB))]
|
||||||
}
|
}
|
||||||
if (len(c.GitHub)) > 0 {
|
if (len(c.GitHub)) > 0 {
|
||||||
keys.GitHub = c.GitHub[rand.Intn(len(c.GitHub))]
|
keys.GitHub = c.GitHub
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(c.IntelX) > 0 {
|
if len(c.IntelX) > 0 {
|
||||||
|
|
|
@ -5,9 +5,12 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"regexp"
|
"net/http"
|
||||||
"strings"
|
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
jsoniter "github.com/json-iterator/go"
|
jsoniter "github.com/json-iterator/go"
|
||||||
|
|
||||||
|
@ -16,9 +19,14 @@ import (
|
||||||
"github.com/tomnomnom/linkheader"
|
"github.com/tomnomnom/linkheader"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// textMatch is one element of the "text_matches" array attached to a search
// result item.
type textMatch struct {
	// Fragment is decoded from the "fragment" JSON key.
	Fragment string `json:"fragment"`
}
|
||||||
|
|
||||||
type item struct {
|
type item struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
HtmlUrl string `json:"html_url"`
|
HtmlUrl string `json:"html_url"`
|
||||||
|
TextMatches []textMatch `json:"text_matches"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type response struct {
|
type response struct {
|
||||||
|
@ -33,31 +41,45 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
|
||||||
results := make(chan subscraping.Result)
|
results := make(chan subscraping.Result)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
if session.Keys.GitHub == "" {
|
if len(session.Keys.GitHub) == 0 {
|
||||||
close(results)
|
close(results)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tokens := NewTokenManager(session.Keys.GitHub)
|
||||||
|
|
||||||
// search on GitHub with exact match
|
// search on GitHub with exact match
|
||||||
searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=\"%s\"", domain)
|
searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=\"%s\"", domain)
|
||||||
headers := map[string]string{
|
s.enumerate(ctx, searchURL, s.DomainRegexp(domain), tokens, session, results)
|
||||||
"Accept": "application/vnd.github.v3+json",
|
|
||||||
"Authorization": "token " + session.Keys.GitHub,
|
|
||||||
}
|
|
||||||
s.enumerate(ctx, searchURL, headers, s.DomainRegexp(domain), session, results)
|
|
||||||
close(results)
|
close(results)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
return results
|
return results
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[string]string, domainRegexp *regexp.Regexp, session *subscraping.Session, results chan subscraping.Result) {
|
func (s *Source) enumerate(ctx context.Context, searchURL string, domainRegexp *regexp.Regexp, tokens *Tokens, session *subscraping.Session, results chan subscraping.Result) {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
default:
|
default:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
token := tokens.Get()
|
||||||
|
|
||||||
|
if token.RetryAfter > 0 {
|
||||||
|
if len(tokens.pool) == 1 {
|
||||||
|
gologger.Verbosef("GitHub Search request rate limit exceeded, waiting for %d seconds before retry... \n", s.Name(), token.RetryAfter)
|
||||||
|
time.Sleep(time.Duration(token.RetryAfter) * time.Second)
|
||||||
|
} else {
|
||||||
|
token = tokens.Get()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
headers := map[string]string{
|
||||||
|
"Accept": "application/vnd.github.v3.text-match+json",
|
||||||
|
"Authorization": "token " + token.Hash,
|
||||||
|
}
|
||||||
|
|
||||||
// Initial request to GitHub search
|
// Initial request to GitHub search
|
||||||
resp, err := session.Get(ctx, searchURL, "", headers)
|
resp, err := session.Get(ctx, searchURL, "", headers)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -65,6 +87,14 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Retry enumeration after Retry-After seconds when rate limit abuse is detected
|
||||||
|
ratelimitRemaining, _ := strconv.ParseInt(resp.Header.Get("X-Ratelimit-Remaining"), 10, 64)
|
||||||
|
if resp.StatusCode == http.StatusForbidden && ratelimitRemaining == 0 {
|
||||||
|
retryAfterSeconds, _ := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
|
||||||
|
tokens.setCurrentTokenExceeded(retryAfterSeconds)
|
||||||
|
|
||||||
|
s.enumerate(ctx, searchURL, domainRegexp, tokens, session, results)
|
||||||
|
} else {
|
||||||
// Links header, first, next, last...
|
// Links header, first, next, last...
|
||||||
linksHeader := linkheader.Parse(resp.Header.Get("Link"))
|
linksHeader := linkheader.Parse(resp.Header.Get("Link"))
|
||||||
|
|
||||||
|
@ -80,7 +110,7 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
|
||||||
|
|
||||||
// Response items iteration
|
// Response items iteration
|
||||||
for _, item := range data.Items {
|
for _, item := range data.Items {
|
||||||
resp, err := session.NormalGetWithContext(ctx, s.RawUrl(item.HtmlUrl))
|
resp, err := session.NormalGetWithContext(ctx, rawUrl(item.HtmlUrl))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
|
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
|
||||||
return
|
return
|
||||||
|
@ -94,10 +124,20 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var subdomains []string
|
||||||
|
|
||||||
// Search for domain matches in the code
|
// Search for domain matches in the code
|
||||||
domainMatch := domainRegexp.FindStringSubmatch(string(code))
|
|
||||||
if len(domainMatch) > 0 {
|
subdomains = append(subdomains, matches(domainRegexp, normalizeContent(string(code)))...)
|
||||||
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: domainMatch[1]}
|
|
||||||
|
// Text matches iteration per item
|
||||||
|
for _, textMatch := range item.TextMatches {
|
||||||
|
// Search for domain matches in the text fragment
|
||||||
|
subdomains = append(subdomains, matches(domainRegexp, normalizeContent(textMatch.Fragment))...)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, subdomain := range unique(subdomains) {
|
||||||
|
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -109,22 +149,53 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
|
||||||
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
|
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
gologger.Verbosef("Next URL %s\n", s.Name(), nextUrl)
|
s.enumerate(ctx, nextUrl, domainRegexp, tokens, session, results)
|
||||||
s.enumerate(ctx, nextUrl, headers, domainRegexp, session, results)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeContent prepares text for subdomain matching: it query-unescapes
// content and then removes the literal two-character sequences `\t` and `\n`
// (a backslash followed by a letter, not the control characters).
//
// url.QueryUnescape fails, and returns an empty string, on any malformed
// percent escape. A bare '%' is common in source code, so on failure the
// original content is kept instead of throwing the whole document away.
func normalizeContent(content string) string {
	normalized, err := url.QueryUnescape(content)
	if err != nil {
		// Not valid query encoding; match against the text as it is.
		normalized = content
	}
	// Two sequential passes on purpose: removing `\t` first can expose a new
	// `\n`, which the second pass then removes as well.
	normalized = strings.Replace(normalized, "\\t", "", -1)
	normalized = strings.Replace(normalized, "\\n", "", -1)
	return normalized
}
|
||||||
|
|
||||||
|
// unique returns the distinct strings of arr in order of first appearance.
// The result is always a non-nil slice, even when arr is nil or empty.
func unique(arr []string) []string {
	seen := make(map[string]struct{}, len(arr))
	result := make([]string, 0, len(arr))
	for _, s := range arr {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		result = append(result, s)
	}
	return result
}
|
||||||
|
|
||||||
|
// Find matches by regular expression in any content
|
||||||
|
func matches(regexp *regexp.Regexp, content string) []string {
|
||||||
|
var matches []string
|
||||||
|
match := regexp.FindAllString(content, -1)
|
||||||
|
if len(match) > 0 {
|
||||||
|
matches = unique(match)
|
||||||
|
}
|
||||||
|
return matches
|
||||||
|
}
|
||||||
|
|
||||||
|
// rawUrl converts the html_url of a search result into the URL that serves
// the file's raw content.
//
// The expected input shape is
//
//	https://github.com/{owner}/{repo}/blob/{ref}/{path}
//
// which becomes
//
//	https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{path}
//
// Only the structural "blob" segment (the third one after the host) is
// dropped. Removing every "/blob/" would corrupt paths that contain a
// directory called "blob", or repositories whose owner or name is "blob".
// Input that does not have the expected shape goes through the previous
// replace-everywhere rewrite unchanged.
func rawUrl(htmlUrl string) string {
	const (
		htmlHost = "https://github.com/"
		rawHost  = "https://raw.githubusercontent.com/"
	)

	if strings.HasPrefix(htmlUrl, htmlHost) {
		// owner, repo, "blob", and everything after it (ref + path).
		parts := strings.SplitN(htmlUrl[len(htmlHost):], "/", 4)
		if len(parts) == 4 && parts[2] == "blob" {
			return rawHost + parts[0] + "/" + parts[1] + "/" + parts[3]
		}
	}

	// Unexpected shape: keep the legacy textual rewrite.
	rewritten := strings.Replace(htmlUrl, htmlHost, rawHost, -1)
	return strings.Replace(rewritten, "/blob/", "/", -1)
}
|
||||||
|
|
||||||
// Domain regular expression to match subdomains in github files code
|
// Domain regular expression to match subdomains in github files code
|
||||||
func (s *Source) DomainRegexp(domain string) *regexp.Regexp {
|
func (s *Source) DomainRegexp(domain string) *regexp.Regexp {
|
||||||
rdomain := strings.Replace(domain, ".", "\\.", -1)
|
rdomain := strings.Replace(domain, ".", "\\.", -1)
|
||||||
return regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\." + rdomain + ")")
|
return regexp.MustCompile("(\\w+[.])*" + rdomain)
|
||||||
}
|
|
||||||
|
|
||||||
// Raw URL to get the files code and match for subdomains
|
|
||||||
func (s *Source) RawUrl(htmlUrl string) string {
|
|
||||||
domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
|
|
||||||
return strings.Replace(domain, "/blob/", "/", -1)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Name returns the name of the source
|
// Name returns the name of the source
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
package github
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// token is one API key together with its rate-limit bookkeeping.
type token struct {
	// Hash is the key value itself.
	Hash string
	// RetryAfter is how many seconds, counted from ExceededTime, the key must
	// rest before it is usable again. Zero means the key is not limited.
	RetryAfter int64
	// ExceededTime is when the key was marked as rate limited. It is the zero
	// time while the key is available.
	ExceededTime time.Time
}

// Tokens hands out a pool of API keys in round-robin order and remembers
// which of them are currently rate limited. It does no locking of its own.
type Tokens struct {
	// current is the index of the key the next Get returns. It may equal
	// len(pool) between calls; Get wraps it around before using it.
	current int
	// pool holds the keys in the order they were given to NewTokenManager.
	pool []token
}

// NewTokenManager builds a Tokens pool holding one available (not rate
// limited) entry per key, in the order given.
func NewTokenManager(keys []string) *Tokens {
	pool := make([]token, 0, len(keys))
	for _, key := range keys {
		pool = append(pool, token{Hash: key})
	}
	return &Tokens{pool: pool}
}

// setCurrentTokenExceeded marks the key most recently returned by Get as rate
// limited for retryAfter seconds, starting now. A key that is already marked
// keeps its original deadline. It is a no-op on an empty pool.
//
// Note that a retryAfter of 0 leaves the key looking available, because
// RetryAfter == 0 is what "not limited" means.
func (r *Tokens) setCurrentTokenExceeded(retryAfter int64) {
	if len(r.pool) == 0 {
		return
	}

	// Get advances current past the key it hands out, so the key that was
	// just used sits one slot behind current. Before the first Get there is
	// no such key and slot 0 is used.
	idx := r.current - 1
	if idx < 0 {
		idx = 0
	}
	idx %= len(r.pool)

	if t := &r.pool[idx]; t.RetryAfter == 0 {
		t.ExceededTime = time.Now()
		t.RetryAfter = retryAfter
	}
}

// Get returns a copy of the next key in round-robin order, after clearing the
// rate-limit mark of every key whose waiting period has elapsed. The returned
// key may still be rate limited (RetryAfter > 0); callers decide whether to
// wait or to ask for another one. An empty pool yields the zero token.
func (r *Tokens) Get() token {
	if len(r.pool) == 0 {
		return token{}
	}

	resetExceededTokens(r)

	if r.current >= len(r.pool) {
		r.current %= len(r.pool)
	}

	next := r.pool[r.current]
	r.current++

	return next
}

// resetExceededTokens clears the rate-limit mark of every key in r whose
// RetryAfter period, measured from its ExceededTime, has fully elapsed.
func resetExceededTokens(r *Tokens) {
	for i := range r.pool {
		t := &r.pool[i]
		if t.RetryAfter <= 0 {
			continue
		}
		// Compare with sub-second precision so the key is released as soon as
		// the period is over, not up to a second later.
		if time.Since(t.ExceededTime).Seconds() >= float64(t.RetryAfter) {
			t.ExceededTime = time.Time{}
			t.RetryAfter = 0
		}
	}
}
|
|
@ -35,7 +35,7 @@ type Keys struct {
|
||||||
Certspotter string `json:"certspotter"`
|
Certspotter string `json:"certspotter"`
|
||||||
Chaos string `json:"chaos"`
|
Chaos string `json:"chaos"`
|
||||||
DNSDB string `json:"dnsdb"`
|
DNSDB string `json:"dnsdb"`
|
||||||
GitHub string `json:"github"`
|
GitHub []string `json:"github"`
|
||||||
IntelXHost string `json:"intelXHost"`
|
IntelXHost string `json:"intelXHost"`
|
||||||
IntelXKey string `json:"intelXKey"`
|
IntelXKey string `json:"intelXKey"`
|
||||||
PassiveTotalUsername string `json:"passivetotal_username"`
|
PassiveTotalUsername string `json:"passivetotal_username"`
|
||||||
|
|
Loading…
Reference in New Issue