// goyomi/sources/base/util/util.go

package util

import (
	"encoding/json"
	"regexp"
	"strconv"
	"strings"
	"time"
	"unicode"
)

// ParseRelativeDate turns a relative timestamp such as "2 days ago" or
// "3 hours ago" into Unix milliseconds, counted back from the current time.
//
// Matching is case-insensitive and substring based. "just now" / "sekarang"
// yield the current instant; "today" and "yesterday" yield midnight of that
// day in the local time zone. Otherwise a number extracted from s is combined
// with the first unit group whose keyword occurs in s (keywords from several
// languages are recognised). The result is 0 when the extracted number is 0
// or when no unit keyword matches.
func ParseRelativeDate(s string) int64 {
	text := strings.TrimSpace(strings.ToLower(s))
	current := time.Now()

	// startOfDay gives midnight of the day containing t, in current's zone.
	startOfDay := func(t time.Time) int64 {
		year, month, day := t.Date()
		return time.Date(year, month, day, 0, 0, 0, 0, current.Location()).UnixMilli()
	}

	if strings.Contains(text, "just now") || strings.Contains(text, "sekarang") {
		return current.UnixMilli()
	}
	if strings.Contains(text, "today") {
		return startOfDay(current)
	}
	if strings.Contains(text, "yesterday") {
		return startOfDay(current.AddDate(0, 0, -1))
	}

	count := extractLeadingNumber(text)
	if count == 0 {
		return 0
	}

	// Order matters: the first group with a matching keyword wins.
	units := []struct {
		keywords []string
		rewind   func() time.Time
	}{
		{
			keywords: []string{"second", "segundo", "giây", "detik"},
			rewind:   func() time.Time { return current.Add(-time.Duration(count) * time.Second) },
		},
		{
			keywords: []string{"minute", "minuto", "min", "dakika", "phút", "menit"},
			rewind:   func() time.Time { return current.Add(-time.Duration(count) * time.Minute) },
		},
		{
			keywords: []string{"hour", "hora", "heure", "saat", "jam", "giờ", "ore"},
			rewind:   func() time.Time { return current.Add(-time.Duration(count) * time.Hour) },
		},
		{
			keywords: []string{"day", "día", "dia", "jour", "gün", "hari", "ngày", "วัน", "giorni"},
			rewind:   func() time.Time { return current.AddDate(0, 0, -count) },
		},
		{
			keywords: []string{"week", "semana", "tuần"},
			rewind:   func() time.Time { return current.AddDate(0, 0, -count*7) },
		},
		{
			keywords: []string{"month", "mes", "tháng"},
			rewind:   func() time.Time { return current.AddDate(0, -count, 0) },
		},
		{
			keywords: []string{"year", "año", "năm"},
			rewind:   func() time.Time { return current.AddDate(-count, 0, 0) },
		},
	}
	for _, unit := range units {
		if anyWord(text, unit.keywords...) {
			return unit.rewind().UnixMilli()
		}
	}
	return 0
}

// extractLeadingNumber returns the first run of ASCII decimal digits found in
// s, converted to an int. It returns 0 when s holds no ASCII digits or when
// the run does not fit in an int.
func extractLeadingNumber(s string) int {
	for i, r := range s {
		// unicode.IsDigit also reports non-ASCII decimal digits. Those are
		// multi-byte and strconv cannot parse them, so skip them and keep
		// scanning instead of giving up on the whole string.
		if !unicode.IsDigit(r) || r > unicode.MaxASCII {
			continue
		}
		end := i + 1
		for end < len(s) && s[end] >= '0' && s[end] <= '9' {
			end++
		}
		n, err := strconv.Atoi(s[i:end])
		if err != nil {
			// An out-of-range run would otherwise come back clamped to the
			// largest int and overflow any arithmetic done by callers.
			return 0
		}
		return n
	}
	return 0
}

// anyWord reports whether s contains at least one of words as a substring.
// Despite the name, no word-boundary check is performed.
func anyWord(s string, words ...string) bool {
	found := false
	for idx := 0; idx < len(words) && !found; idx++ {
		found = strings.Contains(s, words[idx])
	}
	return found
}

// ParseAbsoluteDate parses s according to the caller-supplied Go reference
// time layout (e.g. "January 02, 2006", "2006-01-02") and returns the result
// as Unix milliseconds. Surrounding whitespace in s is ignored and values
// without zone information are interpreted as UTC. It returns 0 when s is
// blank or does not match layout.
func ParseAbsoluteDate(s, layout string) int64 {
	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 {
		return 0
	}
	if parsed, err := time.ParseInLocation(layout, trimmed, time.UTC); err == nil {
		return parsed.UnixMilli()
	}
	return 0
}

// SlugFromURL returns the last non-empty path segment of a URL string, with
// any query string and fragment removed. A string without any "/" is returned
// as-is apart from that query/fragment removal.
func SlugFromURL(rawURL string) string {
	// Drop the query and fragment before looking at slashes. Otherwise a
	// trailing slash in front of "?" (".../foo/?page=2") produces an empty
	// slug, and a "/" inside the query is mistaken for a path separator.
	if cut := strings.IndexAny(rawURL, "?#"); cut >= 0 {
		rawURL = rawURL[:cut]
	}
	rawURL = strings.TrimRight(rawURL, "/")
	if idx := strings.LastIndex(rawURL, "/"); idx >= 0 {
		return rawURL[idx+1:]
	}
	return rawURL
}

// htmlEntityRe matches named entities (&name;) and decimal numeric entities
// (&#123;). Hexadecimal entities (&#x1F;) are not matched.
var htmlEntityRe = regexp.MustCompile(`&[a-zA-Z]+;|&#\d+;`)

// multiSpaceRe matches one or more consecutive whitespace characters.
var multiSpaceRe = regexp.MustCompile(`\s+`)

// cleanTextEntities lists the entities CleanText decodes. Every key is also
// matched by htmlEntityRe; entities matched by the pattern but absent from
// this table are removed.
var cleanTextEntities = map[string]string{
	"&amp;":  "&",
	"&lt;":   "<",
	"&gt;":   ">",
	"&quot;": `"`,
	"&#39;":  "'",
	"&apos;": "'",
	"&nbsp;": " ",
	"&#160;": " ",
}

// CleanText decodes common HTML entities, removes any other named or decimal
// entity, collapses whitespace runs to a single space and trims the result.
//
// Decoding and removal happen in one pass over the input, so text produced by
// decoding is never re-interpreted as an entity: "&amp;lt;" becomes "&lt;"
// and "AT&amp;T;" becomes "AT&T;" rather than losing characters.
func CleanText(s string) string {
	s = htmlEntityRe.ReplaceAllStringFunc(s, func(entity string) string {
		// A missing key yields "", which drops the unknown entity.
		return cleanTextEntities[entity]
	})
	return strings.TrimSpace(multiSpaceRe.ReplaceAllString(s, " "))
}

// StatusFromString maps a free-form publication status label to a numeric
// status code. Matching is case-insensitive and substring based, and groups
// are tried in the order listed, so the first matching group wins. Unknown
// labels yield 0.
//
// NOTE(review): the codes are presumed to mirror the source package's
// Status* constants, which are not visible from this file — confirm.
func StatusFromString(s string) int {
	label := strings.ToLower(strings.TrimSpace(s))

	groups := []struct {
		code     int
		keywords []string
	}{
		// StatusOngoing
		{1, []string{"ongoing", "en cours", "releasing", "publishing", "airing", "devam", "laufend", "em lançamento", "актуален"}},
		// StatusCompleted
		{2, []string{"completed", "complete", "terminé", "finalizado", "abgeschlossen", "завершён", "tamamlandı"}},
		// StatusLicensed
		{3, []string{"licensed"}},
		// StatusHiatus
		{5, []string{"hiatus", "on hiatus", "en pause"}},
		// StatusCancelled
		{6, []string{"cancelled", "canceled", "dropped", "abandonné", "заброшено"}},
	}
	for _, group := range groups {
		if anyWord(label, group.keywords...) {
			return group.code
		}
	}
	return 0 // StatusUnknown
}

// nextDataRe captures the body of a <script> tag whose id attribute is
// "__NEXT_DATA__" (double-quoted, preceded by at least one character after
// "<script").
var nextDataRe = regexp.MustCompile(`<script[^>]+id="__NEXT_DATA__"[^>]*>([\s\S]*?)</script>`)

// ExtractNextDataJSON returns the whitespace-trimmed contents of the first
// Next.js __NEXT_DATA__ script tag found in html. The payload is returned
// verbatim and is not validated as JSON. When no such tag exists both return
// values are nil; the error result is currently always nil.
func ExtractNextDataJSON(html string) (json.RawMessage, error) {
	groups := nextDataRe.FindStringSubmatch(html)
	if len(groups) >= 2 {
		payload := strings.TrimSpace(groups[1])
		return json.RawMessage(payload), nil
	}
	return nil, nil
}

// AbsURL resolves a potentially relative URL against a base URL string.
//
//   - An empty ref yields "".
//   - A ref starting with "http://" or "https://" is returned unchanged.
//   - A protocol-relative ref ("//host/path") takes the scheme of base,
//     falling back to "https:" when base has none.
//   - A ref starting with "/" is appended to the origin (scheme and host)
//     of base.
//   - Any other ref is appended to base after a "/". This is plain
//     concatenation, not RFC 3986 reference resolution.
//
// Trailing slashes on base are ignored.
func AbsURL(base, ref string) string {
	if ref == "" {
		return ""
	}
	if strings.HasPrefix(ref, "http://") || strings.HasPrefix(ref, "https://") {
		return ref
	}
	base = strings.TrimRight(base, "/")

	// Find where the host begins instead of assuming a fixed-width scheme
	// prefix, so short, empty or scheme-less bases cannot cause an
	// out-of-range slice.
	scheme, hostStart := "", 0
	if i := strings.Index(base, "://"); i >= 0 {
		scheme = base[:i+1] // e.g. "https:"
		hostStart = i + len("://")
	}

	if strings.HasPrefix(ref, "//") {
		// Protocol-relative reference: it already names its own host.
		if scheme == "" {
			scheme = "https:"
		}
		return scheme + ref
	}
	if strings.HasPrefix(ref, "/") {
		// Absolute path: strip base down to its origin.
		if i := strings.Index(base[hostStart:], "/"); i >= 0 {
			base = base[:hostStart+i]
		}
		return base + ref
	}
	return base + "/" + ref
}

// ImgAttr picks an image URL out of an element's attribute map, preferring
// lazy-loading attributes over the plain one. Attributes are consulted in
// this order: data-lazy-src, data-src, data-cfsrc, data-setbg,
// data-manga-src, src. The first non-empty value is resolved against baseURL
// with AbsURL; "" is returned when none of them is set.
func ImgAttr(attrs map[string]string, baseURL string) string {
	candidates := [...]string{
		"data-lazy-src",
		"data-src",
		"data-cfsrc",
		"data-setbg",
		"data-manga-src",
		"src",
	}
	for i := range candidates {
		value := attrs[candidates[i]]
		if value == "" {
			continue
		}
		return AbsURL(baseURL, value)
	}
	return ""
}