Files
achmad bc639f0b2d fix: read full response body instead of truncating at 32KB
The challenge detection only needs a sample (32KB), but when the body
is not a challenge, the full body must be returned. The previous code
discarded everything after 32KB, causing Madara parsers to find 0
manga items in truncated HTML.
2026-05-14 23:30:36 +07:00

397 lines
9.2 KiB
Go
Executable File

package httpclient
import (
"bytes"
"context"
"fmt"
"io"
"log"
"net/http"
"net/url"
"strconv"
"sync"
"time"
"github.com/sardanioss/httpcloak"
"golang.org/x/time/rate"
)
// defaultUserAgent is the User-Agent string assigned to new clients. doDirect
// sends it on requests that do not already carry a User-Agent header.
const defaultUserAgent = "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36"
// Package-level state backing SetVerboseLog and DefaultClient.
var (
	// verboseLog is the package-wide verbosity default; newClient copies it
	// into each Client at construction time.
	verboseLog bool

	// defaultOnce ensures defaultClient is built at most once.
	defaultOnce sync.Once

	// defaultClient is the shared instance handed out by DefaultClient.
	defaultClient *Client
)
// SetVerboseLog sets the package-wide verbose-logging default. The value is
// read when a Client is constructed, so it does not affect existing clients.
func SetVerboseLog(enabled bool) {
	verboseLog = enabled
}
// DefaultClient returns the shared package-level Client, building it with
// newClient on the first call. Every call returns the same instance.
func DefaultClient() *Client {
	build := func() {
		defaultClient = newClient()
	}
	defaultOnce.Do(build)
	return defaultClient
}
// NewClient returns a new Client with the package defaults, then applies opts
// to it in the order given.
func NewClient(opts ...Option) *Client {
	client := newClient()
	for i := range opts {
		opts[i](client)
	}
	return client
}
// newClient builds a Client with the package defaults: a "chrome-latest"
// httpcloak session with a 30 second timeout, a rate limit of 1 request per
// second with burst 1, the default User-Agent, and the current package-wide
// verbose flag.
//
// A FlareSolverr client is attached only when NewFlareSolverrClient succeeds;
// its error is deliberately dropped, leaving fsClient nil (Do checks for that).
func newClient() *Client {
	session := httpcloak.NewSession("chrome-latest",
		httpcloak.WithSessionTimeout(30*time.Second),
	)
	client := &Client{
		hc:         session,
		rateLimit:  1,
		burst:      1,
		userAgent:  defaultUserAgent,
		limiters:   map[string]*rate.Limiter{},
		verboseLog: verboseLog,
	}
	if fs, err := NewFlareSolverrClient(); err == nil {
		client.fsClient = fs
	}
	return client
}
// Client sends HTTP requests through an httpcloak session, with per-host rate
// limiting and an optional FlareSolverr fallback.
type Client struct {
	// hc is the httpcloak session used for direct requests.
	hc *httpcloak.Session
	// fsClient is the FlareSolverr fallback; nil when none could be created.
	fsClient *FlareSolverrClient
	// rateLimit and burst configure each per-host limiter created by limiter.
	rateLimit float64
	burst     int
	// userAgent is sent when a request has no User-Agent header of its own.
	userAgent string
	// verboseLog enables request/response logging via the log package.
	verboseLog bool

	// mu guards limiters.
	mu sync.Mutex
	// limiters holds one rate limiter per host, created on first use.
	limiters map[string]*rate.Limiter
}
// Option configures a Client; NewClient applies options in the order given.
type Option func(*Client)
// WithRateLimit sets the requests-per-second rate and burst size used for
// per-host limiters created after the option is applied.
func WithRateLimit(rps float64, burst int) Option {
	return func(c *Client) {
		c.rateLimit, c.burst = rps, burst
	}
}
// WithTimeout replaces the client's httpcloak session with a new
// "chrome-latest" session whose timeout is d.
//
// NOTE(review): the previous session is dropped without any cleanup call;
// confirm whether httpcloak sessions need to be closed explicitly.
func WithTimeout(d time.Duration) Option {
	return func(c *Client) {
		session := httpcloak.NewSession("chrome-latest",
			httpcloak.WithSessionTimeout(d),
		)
		c.hc = session
	}
}
// WithUserAgent sets the User-Agent sent on requests that do not specify one.
func WithUserAgent(ua string) Option {
	return func(c *Client) {
		c.userAgent = ua
	}
}
// WithVerboseLog overrides the package-wide verbose flag for this client.
func WithVerboseLog(enabled bool) Option {
	return func(c *Client) {
		c.verboseLog = enabled
	}
}
// limiter returns the rate limiter for host, creating it from the client's
// current rateLimit and burst on first use. The limiter map is accessed under
// c.mu.
func (c *Client) limiter(host string) *rate.Limiter {
	c.mu.Lock()
	defer c.mu.Unlock()

	if existing, found := c.limiters[host]; found {
		return existing
	}
	created := rate.NewLimiter(rate.Limit(c.rateLimit), c.burst)
	c.limiters[host] = created
	return created
}
// Do sends req, first directly through the httpcloak session (Chrome TLS
// fingerprint) and, if that attempt fails or looks like a Cloudflare
// challenge, through FlareSolverr.
//
// The call first waits on the rate limiter for req.URL.Host. A direct response
// is returned unchanged unless its status is 403 or 503, or it is a 200 whose
// first 32 KiB contain a Cloudflare challenge marker (some challenge pages are
// served with HTTP 200). A non-challenge 200 is returned with its complete
// body; a failure while reading that body is returned as an error rather than
// handing back truncated content.
//
// When the direct attempt errored or was challenged, the request is retried
// via doFS. If no FlareSolverr client is configured, the direct error (or an
// error naming the direct status code) is returned instead.
func (c *Client) Do(req *http.Request) (*http.Response, error) {
	// Only the first challengeSampleSize bytes are scanned for challenge markers.
	const challengeSampleSize = 32 * 1024

	if err := c.limiter(req.URL.Host).Wait(req.Context()); err != nil {
		return nil, err
	}

	resp, err := c.doDirect(req)
	// directStatus stays 0 when the direct attempt produced no response.
	var directStatus int
	if err == nil {
		directStatus = resp.StatusCode
		switch resp.StatusCode {
		case http.StatusOK:
			body, readErr := io.ReadAll(resp.Body)
			resp.Body.Close()
			if readErr != nil {
				return nil, fmt.Errorf("reading response body from %s: %w", req.URL.String(), readErr)
			}
			sample := body
			if len(sample) > challengeSampleSize {
				sample = sample[:challengeSampleSize]
			}
			if !isCloudflareChallenge(sample) {
				return &http.Response{
					StatusCode:    resp.StatusCode,
					Header:        resp.Header,
					Body:          io.NopCloser(bytes.NewReader(body)),
					ContentLength: int64(len(body)),
					Request:       req,
				}, nil
			}
			// A 200 challenge page is treated like a 403 from here on.
			directStatus = http.StatusForbidden
		case http.StatusForbidden, http.StatusServiceUnavailable:
			// Challenged: discard the body and fall through to FlareSolverr.
			resp.Body.Close()
		default:
			return resp, nil
		}
	}

	if c.fsClient == nil {
		if err != nil {
			return nil, err
		}
		return nil, fmt.Errorf("HTTP %d (challenge detected but FlareSolverr not configured)", resp.StatusCode)
	}
	return c.doFS(req, directStatus)
}
// doDirect performs req through the httpcloak session and returns the result
// as an *http.Response whose body is fully buffered in memory.
//
// If req has no User-Agent header, the client's userAgent is set on req.Header
// (the caller's request is modified). Only the status code, headers, body,
// content length and originating request are populated on the response.
func (c *Client) doDirect(req *http.Request) (*http.Response, error) {
	target := req.URL.String()
	if c.verboseLog {
		log.Printf("[httpclient] DIRECT %s %s", req.Method, target)
	}

	if ua := req.Header.Get("User-Agent"); ua == "" {
		req.Header.Set("User-Agent", c.userAgent)
	}

	out := &httpcloak.Request{
		Method:  req.Method,
		URL:     target,
		Headers: req.Header,
	}
	if req.Body != nil {
		out.Body = req.Body
	}

	in, err := c.hc.Do(req.Context(), out)
	if err != nil {
		return nil, err
	}
	payload, err := in.Bytes()
	if err != nil {
		return nil, err
	}

	if c.verboseLog {
		log.Printf("[httpclient] DIRECT RESPONSE %s status=%d", target, in.StatusCode)
	}

	result := &http.Response{
		StatusCode:    in.StatusCode,
		Header:        in.Headers,
		Body:          io.NopCloser(bytes.NewReader(payload)),
		ContentLength: int64(len(payload)),
		Request:       req,
	}
	return result, nil
}
// doFS fetches req's URL through FlareSolverr and converts the result into an
// *http.Response with a fully buffered body.
//
// Only req's context and URL are passed to GetRaw; the request's method,
// headers and body are not forwarded. directStatus is the status observed by
// the direct attempt (0 when that attempt produced no response).
//
// Cookies returned by FlareSolverr are stored in the httpcloak session so that
// later direct requests can reuse them, and are also exposed as Set-Cookie
// response headers. If FlareSolverr itself returns a 200 challenge page, the
// response carries directStatus when that is >= 400; otherwise an error is
// returned.
func (c *Client) doFS(req *http.Request, directStatus int) (*http.Response, error) {
	if c.verboseLog {
		log.Printf("[httpclient] FS FALLBACK %s %s", req.Method, req.URL.String())
	}

	rawURL := req.URL.String()
	rawBody, statusCode, fsHeaders, cookies, fsRespURL, err := c.fsClient.GetRaw(req.Context(), rawURL)
	if err != nil {
		return nil, err
	}

	// Prefer the final URL reported by FlareSolverr when it provides one.
	respURL := rawURL
	if fsRespURL != "" {
		respURL = fsRespURL
	}

	// Feed FS cookies into the httpcloak session for subsequent direct requests.
	if len(cookies) > 0 {
		if parsedRespURL, uErr := url.Parse(respURL); uErr == nil {
			for _, ck := range cookies {
				if ck.Domain == "" {
					// Hostname, not Host: a cookie domain must not carry a port.
					ck.Domain = parsedRespURL.Hostname()
				}
				c.hc.SetCookie(ck.Name, ck.Value)
			}
		}
	}

	// Check if FS returned a challenge page instead of real content.
	if statusCode == 200 && isCloudflareChallenge([]byte(rawBody)) {
		if directStatus < 400 {
			return nil, fmt.Errorf("FlareSolverr returned challenge page for %s", rawURL)
		}
		statusCode = directStatus
	}

	// Header values arrive as either a single string or a list; values of any
	// other type are skipped.
	hdr := make(http.Header)
	for k, v := range fsHeaders {
		switch val := v.(type) {
		case string:
			hdr.Set(k, val)
		case []any:
			for _, sv := range val {
				hdr.Add(k, fmt.Sprint(sv))
			}
		}
	}
	for _, ck := range cookies {
		hdr.Add("Set-Cookie", ck.String())
	}
	// Request headers fill in any key the response headers do not define.
	for k, v := range req.Header {
		if hdr.Get(k) == "" {
			hdr[k] = v
		}
	}

	body := stripFSWrapper([]byte(rawBody))
	return &http.Response{
		StatusCode:    statusCode,
		Header:        hdr,
		Body:          io.NopCloser(bytes.NewReader(body)),
		ContentLength: int64(len(body)),
		Request:       req,
	}, nil
}
// HTTPClient returns a new *http.Client whose transport sends requests through
// the client's current httpcloak session, with a fixed 30 second timeout.
// Requests made through it go straight to the session: they do not pass
// through Do, so the per-host rate limiter and FlareSolverr fallback are not
// applied.
func (c *Client) HTTPClient() *http.Client {
	transport := hcTransport{hc: c.hc}
	return &http.Client{
		Transport: transport,
		Timeout:   30 * time.Second,
	}
}
// hcTransport adapts an httpcloak.Session to the http.RoundTripper interface
// so the session can serve as the Transport of a standard *http.Client.
type hcTransport struct {
	// hc is the session every round trip is sent through.
	hc *httpcloak.Session
}
// RoundTrip implements http.RoundTripper by sending req through the httpcloak
// session and returning the result as an *http.Response with a fully buffered
// body.
//
// The request body, when present, is closed before RoundTrip returns on both
// success and error paths, as the http.RoundTripper contract requires.
func (t hcTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	hreq := &httpcloak.Request{
		Method:  req.Method,
		URL:     req.URL.String(),
		Headers: req.Header,
	}
	if req.Body != nil {
		// The transport, not the caller, is responsible for closing the body.
		defer req.Body.Close()
		hreq.Body = req.Body
	}

	hresp, err := t.hc.Do(req.Context(), hreq)
	if err != nil {
		return nil, err
	}
	bodyBytes, err := hresp.Bytes()
	if err != nil {
		return nil, err
	}
	return &http.Response{
		StatusCode:    hresp.StatusCode,
		Header:        hresp.Headers,
		Body:          io.NopCloser(bytes.NewReader(bodyBytes)),
		ContentLength: int64(len(bodyBytes)),
		Request:       req,
	}, nil
}
// Cookie returns the value of the session cookie called name, or "" when the
// session has no such cookie. The host argument is currently not consulted.
func (c *Client) Cookie(name, host string) string {
	// Indexing a map with a missing key yields the zero value, "".
	return c.hc.GetCookies()[name]
}
// Get builds a GET request for urlStr bound to ctx and sends it with Do.
// An error from building the request is returned without sending anything.
func (c *Client) Get(ctx context.Context, urlStr string) (*http.Response, error) {
	req, buildErr := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if buildErr == nil {
		return c.Do(req)
	}
	return nil, buildErr
}
// Post builds a POST request for urlStr bound to ctx, with body as the payload
// and bodyType as its Content-Type header, and sends it with Do. An error from
// building the request is returned without sending anything.
func (c *Client) Post(ctx context.Context, urlStr string, bodyType string, body io.Reader) (*http.Response, error) {
	req, buildErr := http.NewRequestWithContext(ctx, http.MethodPost, urlStr, body)
	if buildErr == nil {
		req.Header.Set("Content-Type", bodyType)
		return c.Do(req)
	}
	return nil, buildErr
}
// isCloudflareChallenge reports whether body contains any of the known
// Cloudflare challenge-page markers. Matching is a case-sensitive substring
// search; an empty or nil body never matches.
func isCloudflareChallenge(body []byte) bool {
	markers := [...][]byte{
		[]byte("Just a moment..."),
		[]byte("cf_chl_opt"),
		[]byte("challenges.cloudflare.com"),
		[]byte("/cdn-cgi/challenge-platform"),
		[]byte("Enable JavaScript and cookies"),
	}
	for i := range markers {
		if bytes.Contains(body, markers[i]) {
			return true
		}
	}
	return false
}
// stripFSWrapper unwraps content that was wrapped in an HTML document with a
// <pre> element. When body (ignoring surrounding whitespace) starts with
// "<html" and contains a literal "<pre>" followed later by "</pre>", the bytes
// between the first "<pre>" and the last "</pre>" are returned. In every other
// case body is returned unchanged. Only the exact tag "<pre>" (no attributes)
// is recognized.
func stripFSWrapper(body []byte) []byte {
	var (
		htmlPrefix = []byte("<html")
		openTag    = []byte("<pre>")
		closeTag   = []byte("</pre>")
	)

	if !bytes.HasPrefix(bytes.TrimSpace(body), htmlPrefix) {
		return body
	}
	start := bytes.Index(body, openTag)
	end := bytes.LastIndex(body, closeTag)
	if start < 0 || end <= start {
		return body
	}
	return body[start+len(openTag) : end]
}
// retryAfter returns how long to wait before retrying, derived from the
// Retry-After header of resp.
//
// The header may be a number of seconds (fractions are accepted) or an HTTP
// date. A 5 second default is returned when the header is absent, cannot be
// parsed, names a date that is not in the future, or holds a number that
// cannot be represented as a non-negative time.Duration (negative, NaN,
// infinite, or too large).
func retryAfter(resp *http.Response) time.Duration {
	const fallback = 5 * time.Second

	ra := resp.Header.Get("Retry-After")
	if ra == "" {
		return fallback
	}
	if secs, err := strconv.ParseFloat(ra, 64); err == nil {
		// ParseFloat also accepts "-3", "NaN", "Inf" and huge magnitudes.
		// Converting those to a Duration would give a negative or
		// implementation-defined value, so only in-range results are used.
		// NaN fails both comparisons.
		if ns := secs * float64(time.Second); ns >= 0 && ns < float64(1<<63) {
			return time.Duration(ns)
		}
		return fallback
	}
	if t, err := http.ParseTime(ra); err == nil {
		if d := time.Until(t); d > 0 {
			return d
		}
	}
	return fallback
}