452918ac82
Some anti-bot challenge pages return HTTP 200 instead of 403/503, bypassing the FS fallback. Read response body on 200, check for challenge indicators, and fall through to FlareSolverr if detected.
389 lines
8.9 KiB
Go
Executable File
389 lines
8.9 KiB
Go
Executable File
package httpclient
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"net/url"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/sardanioss/httpcloak"
|
|
"golang.org/x/time/rate"
|
|
)
|
|
|
|
// defaultUserAgent is the User-Agent value newClient installs on every Client
// (a Chrome 124 on Android 10 mobile string). doDirect applies it to requests
// that do not already carry a User-Agent header.
const defaultUserAgent = "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36"
|
|
|
|
var (
|
|
verboseLog bool
|
|
defaultOnce sync.Once
|
|
defaultClient *Client
|
|
)
|
|
|
|
// SetVerboseLog sets the package-level verbose-logging default. The value is
// copied into a Client when newClient builds it, so it affects clients created
// after this call; existing clients keep their own flag.
func SetVerboseLog(enabled bool) {
	verboseLog = enabled
}
|
|
|
|
func DefaultClient() *Client {
|
|
defaultOnce.Do(func() {
|
|
defaultClient = newClient()
|
|
})
|
|
return defaultClient
|
|
}
|
|
|
|
func NewClient(opts ...Option) *Client {
|
|
c := newClient()
|
|
for _, o := range opts {
|
|
o(c)
|
|
}
|
|
return c
|
|
}
|
|
|
|
func newClient() *Client {
|
|
hc := httpcloak.NewSession("chrome-latest",
|
|
httpcloak.WithSessionTimeout(30*time.Second),
|
|
)
|
|
c := &Client{
|
|
hc: hc,
|
|
fsClient: nil,
|
|
rateLimit: 1,
|
|
burst: 1,
|
|
userAgent: defaultUserAgent,
|
|
limiters: map[string]*rate.Limiter{},
|
|
verboseLog: verboseLog,
|
|
}
|
|
fsClient, err := NewFlareSolverrClient()
|
|
if err == nil {
|
|
c.fsClient = fsClient
|
|
}
|
|
return c
|
|
}
|
|
|
|
type Client struct {
|
|
hc *httpcloak.Session
|
|
fsClient *FlareSolverrClient
|
|
rateLimit float64
|
|
burst int
|
|
userAgent string
|
|
verboseLog bool
|
|
|
|
mu sync.Mutex
|
|
limiters map[string]*rate.Limiter
|
|
}
|
|
|
|
// Option mutates a Client during construction; NewClient applies options in
// the order they are passed.
type Option func(*Client)
|
|
|
|
func WithRateLimit(rps float64, burst int) Option {
|
|
return func(c *Client) {
|
|
c.rateLimit = rps
|
|
c.burst = burst
|
|
}
|
|
}
|
|
|
|
func WithTimeout(d time.Duration) Option {
|
|
return func(c *Client) { c.hc = httpcloak.NewSession("chrome-latest",
|
|
httpcloak.WithSessionTimeout(d),
|
|
) }
|
|
}
|
|
|
|
func WithUserAgent(ua string) Option {
|
|
return func(c *Client) { c.userAgent = ua }
|
|
}
|
|
|
|
func WithVerboseLog(enabled bool) Option {
|
|
return func(c *Client) { c.verboseLog = enabled }
|
|
}
|
|
|
|
func (c *Client) limiter(host string) *rate.Limiter {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
l, ok := c.limiters[host]
|
|
if !ok {
|
|
l = rate.NewLimiter(rate.Limit(c.rateLimit), c.burst)
|
|
c.limiters[host] = l
|
|
}
|
|
return l
|
|
}
|
|
|
|
// Do tries a direct request via httpcloak (Chrome TLS fingerprint) first.
|
|
// httpcloak's TLS fingerprint matches Chrome, so if we already have a
|
|
// cf_clearance cookie from a previous FlareSolverr solve, Cloudflare won't
|
|
// challenge us. If we do get challenged (403/503), falls back to FlareSolverr.
|
|
func (c *Client) Do(req *http.Request) (*http.Response, error) {
|
|
if err := c.limiter(req.URL.Host).Wait(req.Context()); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
resp, err := c.doDirect(req)
|
|
var directStatus int
|
|
if err == nil {
|
|
directStatus = resp.StatusCode
|
|
// Some Cloudflare challenge pages return HTTP 200 but contain challenge
|
|
// JavaScript. Check for challenge content on any response.
|
|
if resp.StatusCode == http.StatusOK {
|
|
body, readErr := io.ReadAll(io.LimitReader(resp.Body, 32*1024))
|
|
resp.Body.Close()
|
|
if readErr == nil && len(body) > 0 {
|
|
if isCloudflareChallenge(body) {
|
|
directStatus = http.StatusForbidden
|
|
} else {
|
|
// Not a challenge — wrap body and return
|
|
return &http.Response{
|
|
StatusCode: resp.StatusCode,
|
|
Header: resp.Header,
|
|
Body: io.NopCloser(bytes.NewReader(body)),
|
|
ContentLength: int64(len(body)),
|
|
Request: req,
|
|
}, nil
|
|
}
|
|
} else {
|
|
return resp, nil
|
|
}
|
|
} else if resp.StatusCode != http.StatusForbidden && resp.StatusCode != http.StatusServiceUnavailable {
|
|
return resp, nil
|
|
} else {
|
|
resp.Body.Close()
|
|
}
|
|
}
|
|
|
|
if c.fsClient == nil {
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return nil, fmt.Errorf("HTTP %d (challenge detected but FlareSolverr not configured)", resp.StatusCode)
|
|
}
|
|
|
|
return c.doFS(req, directStatus)
|
|
}
|
|
|
|
func (c *Client) doDirect(req *http.Request) (*http.Response, error) {
|
|
if c.verboseLog {
|
|
log.Printf("[httpclient] DIRECT %s %s", req.Method, req.URL.String())
|
|
}
|
|
if req.Header.Get("User-Agent") == "" {
|
|
req.Header.Set("User-Agent", c.userAgent)
|
|
}
|
|
|
|
hreq := &httpcloak.Request{
|
|
Method: req.Method,
|
|
URL: req.URL.String(),
|
|
Headers: req.Header,
|
|
}
|
|
if req.Body != nil {
|
|
hreq.Body = req.Body
|
|
}
|
|
|
|
hresp, err := c.hc.Do(req.Context(), hreq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
body, err := hresp.Bytes()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if c.verboseLog {
|
|
log.Printf("[httpclient] DIRECT RESPONSE %s status=%d", req.URL.String(), hresp.StatusCode)
|
|
}
|
|
|
|
return &http.Response{
|
|
StatusCode: hresp.StatusCode,
|
|
Header: hresp.Headers,
|
|
Body: io.NopCloser(bytes.NewReader(body)),
|
|
ContentLength: int64(len(body)),
|
|
Request: req,
|
|
}, nil
|
|
}
|
|
|
|
func (c *Client) doFS(req *http.Request, directStatus int) (*http.Response, error) {
|
|
if c.verboseLog {
|
|
log.Printf("[httpclient] FS FALLBACK %s %s", req.Method, req.URL.String())
|
|
}
|
|
|
|
rawURL := req.URL.String()
|
|
rawBody, statusCode, fsHeaders, cookies, fsRespURL, err := c.fsClient.GetRaw(req.Context(), rawURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
respURL := rawURL
|
|
if fsRespURL != "" {
|
|
respURL = fsRespURL
|
|
}
|
|
|
|
// Feed FS cookies into the httpcloak session for subsequent direct requests
|
|
if len(cookies) > 0 {
|
|
if parsedRespURL, uErr := url.Parse(respURL); uErr == nil {
|
|
for _, ck := range cookies {
|
|
if ck.Domain == "" {
|
|
ck.Domain = parsedRespURL.Host
|
|
}
|
|
c.hc.SetCookie(ck.Name, ck.Value)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if FS returned challenge page instead of real content
|
|
if statusCode == 200 && isCloudflareChallenge([]byte(rawBody)) {
|
|
if directStatus >= 400 {
|
|
statusCode = directStatus
|
|
} else {
|
|
return nil, fmt.Errorf("FlareSolverr returned challenge page for %s", rawURL)
|
|
}
|
|
}
|
|
|
|
hdr := make(http.Header)
|
|
if len(fsHeaders) > 0 {
|
|
for k, v := range fsHeaders {
|
|
switch val := v.(type) {
|
|
case string:
|
|
hdr.Set(k, val)
|
|
case []any:
|
|
for _, sv := range val {
|
|
hdr.Add(k, fmt.Sprint(sv))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if len(cookies) > 0 {
|
|
for _, ck := range cookies {
|
|
hdr.Add("Set-Cookie", ck.String())
|
|
}
|
|
}
|
|
for k, v := range req.Header {
|
|
if hdr.Get(k) == "" {
|
|
hdr[k] = v
|
|
}
|
|
}
|
|
|
|
body := stripFSWrapper([]byte(rawBody))
|
|
|
|
return &http.Response{
|
|
StatusCode: statusCode,
|
|
Header: hdr,
|
|
Body: io.NopCloser(bytes.NewReader(body)),
|
|
Request: req,
|
|
}, nil
|
|
}
|
|
|
|
func (c *Client) HTTPClient() *http.Client {
|
|
return &http.Client{
|
|
Transport: hcTransport{c.hc},
|
|
Timeout: 30 * time.Second,
|
|
}
|
|
}
|
|
|
|
// hcTransport wraps httpcloak.Session as an http.RoundTripper
|
|
type hcTransport struct {
|
|
hc *httpcloak.Session
|
|
}
|
|
|
|
func (t hcTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
|
var body io.Reader
|
|
if req.Body != nil {
|
|
body = req.Body
|
|
}
|
|
hreq := &httpcloak.Request{
|
|
Method: req.Method,
|
|
URL: req.URL.String(),
|
|
Headers: req.Header,
|
|
Body: body,
|
|
}
|
|
hresp, err := t.hc.Do(req.Context(), hreq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
bodyBytes, err := hresp.Bytes()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &http.Response{
|
|
StatusCode: hresp.StatusCode,
|
|
Header: hresp.Headers,
|
|
Body: io.NopCloser(bytes.NewReader(bodyBytes)),
|
|
ContentLength: int64(len(bodyBytes)),
|
|
Request: req,
|
|
}, nil
|
|
}
|
|
|
|
func (c *Client) Cookie(name, host string) string {
|
|
cks := c.hc.GetCookies()
|
|
if v, ok := cks[name]; ok {
|
|
return v
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// Get is a convenience wrapper around Do.
|
|
func (c *Client) Get(ctx context.Context, urlStr string) (*http.Response, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return c.Do(req)
|
|
}
|
|
|
|
// Post is a convenience wrapper around Do.
|
|
func (c *Client) Post(ctx context.Context, urlStr string, bodyType string, body io.Reader) (*http.Response, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, urlStr, body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("Content-Type", bodyType)
|
|
return c.Do(req)
|
|
}
|
|
|
|
// cloudflareChallengeMarkers lists byte sequences whose presence in a response
// body marks it as a Cloudflare challenge page.
var cloudflareChallengeMarkers = [][]byte{
	[]byte("Just a moment..."),
	[]byte("cf_chl_opt"),
	[]byte("challenges.cloudflare.com"),
	[]byte("/cdn-cgi/challenge-platform"),
	[]byte("Enable JavaScript and cookies"),
}

// isCloudflareChallenge reports whether body contains any of the known
// Cloudflare challenge markers. The match is a case-sensitive substring search;
// an empty or nil body is never a challenge.
func isCloudflareChallenge(body []byte) bool {
	for _, marker := range cloudflareChallengeMarkers {
		if bytes.Contains(body, marker) {
			return true
		}
	}
	return false
}
|
|
|
|
// stripFSWrapper removes the HTML wrapper that FlareSolverr's Chrome places
// around non-HTML payloads, returning the content between the opening <pre>
// tag and the last </pre>.
//
// The body is returned unchanged when, after trimming surrounding whitespace,
// it does not start with "<html", when no opening <pre> tag is found, or when
// no </pre> follows that tag. The opening tag may carry attributes (for
// example <pre style="...">). The result is a sub-slice of body, not a copy.
//
// NOTE(review): the extracted content is returned verbatim; if the wrapper
// HTML-escapes the payload (e.g. "&amp;"), callers still see the escaped
// form — confirm whether unescaping is needed.
func stripFSWrapper(body []byte) []byte {
	if !bytes.HasPrefix(bytes.TrimSpace(body), []byte("<html")) {
		return body
	}
	start := preContentStart(body)
	if start < 0 {
		return body
	}
	end := bytes.LastIndex(body, []byte("</pre>"))
	if end < start {
		return body
	}
	return body[start:end]
}

// preContentStart returns the index of the first byte after the opening <pre>
// tag in body, or -1 if there is none.
func preContentStart(body []byte) int {
	// Try the bare tag first so bodies containing a literal "<pre>" resolve
	// exactly as they always have.
	if i := bytes.Index(body, []byte("<pre>")); i >= 0 {
		return i + len("<pre>")
	}

	// Otherwise accept "<pre" followed by whitespace and attributes. Requiring
	// whitespace keeps tags such as <prefix> from matching.
	tag := []byte("<pre")
	for from := 0; from < len(body); {
		i := bytes.Index(body[from:], tag)
		if i < 0 {
			return -1
		}
		after := from + i + len(tag)
		if after < len(body) {
			switch body[after] {
			case ' ', '\t', '\n', '\r', '\f':
				gt := bytes.IndexByte(body[after:], '>')
				if gt < 0 {
					return -1
				}
				return after + gt + 1
			}
		}
		from = after
	}
	return -1
}
|
|
|
|
// retryAfter returns how long to wait before retrying, based on the response's
// Retry-After header.
//
// The header may be a number of seconds (fractions are accepted) or an HTTP
// date. A 5 second default is returned when resp is nil, the header is absent
// or unparsable, the number is negative, NaN, infinite or too large to
// represent as a time.Duration, or the date is not in the future.
func retryAfter(resp *http.Response) time.Duration {
	const fallback = 5 * time.Second
	// maxSecs is the largest whole number of seconds a time.Duration can hold.
	const maxSecs = float64((1<<63 - 1) / int64(time.Second))

	if resp == nil {
		return fallback
	}
	ra := resp.Header.Get("Retry-After")
	if ra == "" {
		return fallback
	}

	if secs, err := strconv.ParseFloat(ra, 64); err == nil {
		// NaN fails both comparisons, so it falls through to the default too.
		if secs >= 0 && secs <= maxSecs {
			return time.Duration(secs * float64(time.Second))
		}
		return fallback
	}

	if t, err := http.ParseTime(ra); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
	}
	return fallback
}
|