@@ -5,9 +5,12 @@ import (
 	"context"
 	"fmt"
 	"io/ioutil"
-	"regexp"
-	"strings"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
 	"time"
 
 	jsoniter "github.com/json-iterator/go"
@@ -16,9 +19,14 @@ import (
 	"github.com/tomnomnom/linkheader"
 )
 
+type textMatch struct {
+	Fragment string `json:"fragment"`
+}
+
 type item struct {
 	Name        string      `json:"name"`
 	HtmlUrl     string      `json:"html_url"`
+	TextMatches []textMatch `json:"text_matches"`
 }
 
 type response struct {
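Note (not part of the diff): the new text_matches field is only populated when the request opts into the text-match media type via the Accept header set later in enumerate. A quick sketch of how a trimmed, hypothetical search item would decode into these structs:

package main

import (
	"fmt"

	jsoniter "github.com/json-iterator/go"
)

type textMatch struct {
	Fragment string `json:"fragment"`
}

type item struct {
	Name        string      `json:"name"`
	HtmlUrl     string      `json:"html_url"`
	TextMatches []textMatch `json:"text_matches"`
}

func main() {
	// Trimmed, made-up code-search item; the real response carries many
	// more fields, which these structs simply ignore.
	raw := `{
		"name": "config.yml",
		"html_url": "https://github.com/owner/repo/blob/master/config.yml",
		"text_matches": [{"fragment": "host: api.example.com"}]
	}`

	var i item
	if err := jsoniter.UnmarshalFromString(raw, &i); err != nil {
		panic(err)
	}
	fmt.Println(i.Name, i.TextMatches[0].Fragment)
}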
@@ -33,31 +41,45 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
 	results := make(chan subscraping.Result)
 
 	go func() {
-		if session.Keys.GitHub == "" {
+		if len(session.Keys.GitHub) == 0 {
 			close(results)
 			return
 		}
 
+		tokens := NewTokenManager(session.Keys.GitHub)
+
 		// search on GitHub with exact match
 		searchURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=\"%s\"", domain)
-		headers := map[string]string{
-			"Accept":        "application/vnd.github.v3+json",
-			"Authorization": "token " + session.Keys.GitHub,
-		}
-		s.enumerate(ctx, searchURL, headers, s.DomainRegexp(domain), session, results)
+		s.enumerate(ctx, searchURL, s.DomainRegexp(domain), tokens, session, results)
 		close(results)
 	}()
 
 	return results
 }
 
-func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[string]string, domainRegexp *regexp.Regexp, session *subscraping.Session, results chan subscraping.Result) {
+func (s *Source) enumerate(ctx context.Context, searchURL string, domainRegexp *regexp.Regexp, tokens *Tokens, session *subscraping.Session, results chan subscraping.Result) {
 	select {
 	case <-ctx.Done():
 		return
 	default:
 	}
 
+	token := tokens.Get()
+
+	if token.RetryAfter > 0 {
+		if len(tokens.pool) == 1 {
+			gologger.Verbosef("GitHub Search request rate limit exceeded, waiting for %d seconds before retry... \n", s.Name(), token.RetryAfter)
+			time.Sleep(time.Duration(token.RetryAfter) * time.Second)
+		} else {
+			token = tokens.Get()
+		}
+	}
+
+	headers := map[string]string{
+		"Accept":        "application/vnd.github.v3.text-match+json",
+		"Authorization": "token " + token.Hash,
+	}
+
 	// Initial request to GitHub search
 	resp, err := session.Get(ctx, searchURL, "", headers)
 	if err != nil {
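For context (not part of this hunk): NewTokenManager, Get, and setCurrentTokenExceeded come from the token manager added alongside this file. A minimal sketch of the round-robin behavior the hunk above relies on, with type and field names inferred from usage here rather than taken from the actual implementation:

package main

import (
	"fmt"
	"time"
)

// Token and Tokens are inferred from how enumerate uses them.
type Token struct {
	Hash         string    // the GitHub API token itself
	RetryAfter   int64     // seconds to wait before this token is usable again
	ExceededTime time.Time // when the rate limit was hit
}

type Tokens struct {
	current int
	pool    []Token
}

func NewTokenManager(keys []string) *Tokens {
	pool := make([]Token, 0, len(keys))
	for _, key := range keys {
		pool = append(pool, Token{Hash: key})
	}
	return &Tokens{pool: pool}
}

// Get hands out tokens round-robin, clearing RetryAfter once the
// wait window of an exhausted token has elapsed.
func (t *Tokens) Get() *Token {
	token := &t.pool[t.current]
	t.current = (t.current + 1) % len(t.pool)
	if token.RetryAfter > 0 && time.Since(token.ExceededTime) > time.Duration(token.RetryAfter)*time.Second {
		token.RetryAfter = 0
	}
	return token
}

// setCurrentTokenExceeded parks the most recently returned token.
func (t *Tokens) setCurrentTokenExceeded(retryAfter int64) {
	last := (t.current + len(t.pool) - 1) % len(t.pool)
	t.pool[last].ExceededTime = time.Now()
	t.pool[last].RetryAfter = retryAfter
}

func main() {
	tokens := NewTokenManager([]string{"ghp_aaa", "ghp_bbb"})
	tokens.Get()                       // hands out ghp_aaa
	tokens.setCurrentTokenExceeded(60) // ghp_aaa is parked for 60s
	fmt.Println(tokens.Get().Hash)     // ghp_bbb
}

With a single token in the pool, enumerate sleeps for the Retry-After window instead; with several, it simply rotates to the next one.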
@@ -65,6 +87,14 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 		return
 	}
 
+	// Retry enumeration after Retry-After seconds on rate limit abuse detected
+	ratelimitRemaining, _ := strconv.ParseInt(resp.Header.Get("X-Ratelimit-Remaining"), 10, 64)
+	if resp.StatusCode == http.StatusForbidden && ratelimitRemaining == 0 {
+		retryAfterSeconds, _ := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64)
+		tokens.setCurrentTokenExceeded(retryAfterSeconds)
+
+		s.enumerate(ctx, searchURL, domainRegexp, tokens, session, results)
+	} else {
 		// Links header, first, next, last...
 		linksHeader := linkheader.Parse(resp.Header.Get("Link"))
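To make the branch above concrete, a small illustration (header values are hypothetical; GitHub signals search rate limiting with a 403 plus these two headers):

package main

import (
	"fmt"
	"net/http"
	"strconv"
)

func main() {
	// Hypothetical headers from a rate-limited code-search response.
	h := http.Header{}
	h.Set("X-Ratelimit-Remaining", "0")
	h.Set("Retry-After", "60")

	remaining, _ := strconv.ParseInt(h.Get("X-Ratelimit-Remaining"), 10, 64)
	retryAfter, _ := strconv.ParseInt(h.Get("Retry-After"), 10, 64)

	// On a 403 with remaining == 0, the hunk above parks the current
	// token for retryAfter seconds and re-enters enumerate with the
	// same searchURL, so another token can pick up the query.
	fmt.Println(remaining, retryAfter) // 0 60
}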
@@ -80,7 +110,7 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 
 		// Response items iteration
 		for _, item := range data.Items {
-			resp, err := session.NormalGetWithContext(ctx, s.RawUrl(item.HtmlUrl))
+			resp, err := session.NormalGetWithContext(ctx, rawUrl(item.HtmlUrl))
 			if err != nil {
 				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
 				return
@@ -94,10 +124,20 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 				return
 			}
 
+			var subdomains []string
+
 			// Search for domain matches in the code
-			domainMatch := domainRegexp.FindStringSubmatch(string(code))
-			if len(domainMatch) > 0 {
-				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: domainMatch[1]}
+			subdomains = append(subdomains, matches(domainRegexp, normalizeContent(string(code)))...)
+
+			// Text matches iteration per item
+			for _, textMatch := range item.TextMatches {
+				// Search for domain matches in the text fragment
+				subdomains = append(subdomains, matches(domainRegexp, normalizeContent(textMatch.Fragment))...)
+			}
+
+			for _, subdomain := range unique(subdomains) {
+				results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}
 			}
 		}
@@ -109,22 +149,53 @@ func (s *Source) enumerate(ctx context.Context, searchURL string, headers map[st
 					results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
 					return
 				}
 				gologger.Verbosef("Next URL %s\n", s.Name(), nextUrl)
-				s.enumerate(ctx, nextUrl, headers, domainRegexp, session, results)
+				s.enumerate(ctx, nextUrl, domainRegexp, tokens, session, results)
 			}
 		}
 	}
 }
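The nextUrl above is extracted from the Link response header; that extraction sits outside this hunk. A small sketch of what pagination looks like with the linkheader package (the header value is made up):

package main

import (
	"fmt"

	"github.com/tomnomnom/linkheader"
)

func main() {
	// Example Link header in the shape the GitHub search API returns.
	header := `<https://api.github.com/search/code?q=example&page=2>; rel="next",` +
		` <https://api.github.com/search/code?q=example&page=5>; rel="last"`

	// enumerate keeps following rel="next" recursively until the API
	// stops returning one.
	for _, link := range linkheader.Parse(header).FilterByRel("next") {
		fmt.Println(link.URL) // the nextUrl passed to the recursive call
	}
}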
+// Normalize content before matching, query unescape, remove tabs and new line chars
+func normalizeContent(content string) string {
+	normalizedContent, _ := url.QueryUnescape(content)
+	normalizedContent = strings.Replace(normalizedContent, "\\t", "", -1)
+	normalizedContent = strings.Replace(normalizedContent, "\\n", "", -1)
+	return normalizedContent
+}
+
+// Remove duplicates from string array
+func unique(arr []string) []string {
+	occured := map[string]bool{}
+	result := []string{}
+	for e := range arr {
+		if occured[arr[e]] != true {
+			occured[arr[e]] = true
+			result = append(result, arr[e])
+		}
+	}
+	return result
+}
+
+// Find matches by regular expression in any content
+func matches(regexp *regexp.Regexp, content string) []string {
+	var matches []string
+	match := regexp.FindAllString(content, -1)
+	if len(match) > 0 {
+		matches = unique(match)
+	}
+	return matches
+}
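A usage sketch of the three helpers above, chained the same way the item loop chains them (assumes they are in scope, i.e. dropped next to the code above; the fragment value and domain are made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	re := regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\.example\\.com)")
	fragment := "url=https%3A%2F%2Fapi.example.com\\t assets.example.com api.example.com"

	// normalizeContent unescapes the %XX sequences and strips the
	// literal "\t", leaving plain text for the regexp; matches then
	// deduplicates the hits via unique.
	fmt.Println(matches(re, normalizeContent(fragment)))
	// [api.example.com assets.example.com]
}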
+// Raw URL to get the files code and match for subdomains
+func rawUrl(htmlUrl string) string {
+	domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
+	return strings.Replace(domain, "/blob/", "/", -1)
+}
+
 // Domain regular expression to match subdomains in github files code
 func (s *Source) DomainRegexp(domain string) *regexp.Regexp {
 	rdomain := strings.Replace(domain, ".", "\\.", -1)
-	return regexp.MustCompile("(\\w+[.])*" + rdomain)
+	return regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\." + rdomain + ")")
 }
 
-// Raw URL to get the files code and match for subdomains
-func (s *Source) RawUrl(htmlUrl string) string {
-	domain := strings.Replace(htmlUrl, "https://github.com/", "https://raw.githubusercontent.com/", -1)
-	return strings.Replace(domain, "/blob/", "/", -1)
-}
-
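The tightened expression requires at least one label in front of the target domain and accepts hyphens, which the old \w-based pattern did not. A before/after comparison (domain and content made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	oldRe := regexp.MustCompile("(\\w+[.])*example\\.com")
	newRe := regexp.MustCompile("(([0-9a-z_\\-\\.]+)\\.example\\.com)")

	content := "foo-bar.example.com example.com"
	fmt.Println(oldRe.FindAllString(content, -1)) // [bar.example.com example.com] — drops the hyphenated label, keeps the bare apex
	fmt.Println(newRe.FindAllString(content, -1)) // [foo-bar.example.com]
}

rawUrl, for its part, just rewrites html_url values such as https://github.com/owner/repo/blob/master/file.go to https://raw.githubusercontent.com/owner/repo/master/file.go, so the follow-up fetch returns the file body instead of an HTML page.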
 // Name returns the name of the source