// Package mangawork implements the MangaWork manga base.
//
// Browse and details pages are scraped from HTML; the chapter list is loaded
// with a multipart POST to wp-admin/admin-ajax.php. The target sites are
// CF-protected (presumably Cloudflare).
package mangawork
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"goyomi/internal/httpclient"
|
|
"goyomi/internal/source"
|
|
"goyomi/sources/base/util"
|
|
)
|
|
|
|
// Config describes a single MangaWork-based site. Fields documented with a
// default may be left empty; New fills them in.
type Config struct {
	// Name is the source name; together with Lang it seeds the source ID.
	Name string
	// BaseURL is the site root used to build request URLs and to resolve
	// relative links.
	BaseURL string
	// Lang is the source language; together with Name it seeds the source ID.
	Lang string

	// SeriesPath is the path segment of the browse/search listing.
	// Default: "series".
	SeriesPath string
	// MangaPath is the path segment used to recognise links to a manga page.
	// Default: "manga".
	MangaPath string
	// ChapterDateFmt is the time.Parse layout of chapter upload dates.
	// Default: "02/01/2006".
	ChapterDateFmt string
	// AuthorLabel is the label text of the author entry on the details page.
	// Default: "Autor(es)".
	AuthorLabel string
	// PopularOrder is the "order" query value for the popular listing.
	// Default: "popular".
	PopularOrder string
	// LatestOrder is the "order" query value for the latest-updates listing.
	// Default: "update".
	LatestOrder string
}
|
|
|
|
type Source struct {
|
|
cfg Config
|
|
client *httpclient.Client
|
|
id int64
|
|
}
|
|
|
|
func New(cfg Config) *Source {
|
|
if cfg.SeriesPath == "" {
|
|
cfg.SeriesPath = "series"
|
|
}
|
|
if cfg.MangaPath == "" {
|
|
cfg.MangaPath = "manga"
|
|
}
|
|
if cfg.ChapterDateFmt == "" {
|
|
cfg.ChapterDateFmt = "02/01/2006"
|
|
}
|
|
if cfg.AuthorLabel == "" {
|
|
cfg.AuthorLabel = "Autor(es)"
|
|
}
|
|
if cfg.PopularOrder == "" {
|
|
cfg.PopularOrder = "popular"
|
|
}
|
|
if cfg.LatestOrder == "" {
|
|
cfg.LatestOrder = "update"
|
|
}
|
|
c := httpclient.NewClient(httpclient.WithRateLimit(2, 3))
|
|
return &Source{cfg: cfg, client: c, id: source.GenerateSourceID(cfg.Name, cfg.Lang)}
|
|
}
|
|
|
|
// ID returns the source identifier stored on the Source.
func (s *Source) ID() int64 {
	return s.id
}
|
|
// Name returns the configured source name.
func (s *Source) Name() string {
	return s.cfg.Name
}
|
|
// Lang returns the configured source language.
func (s *Source) Lang() string {
	return s.cfg.Lang
}
|
|
// SupportsLatest always reports true.
func (s *Source) SupportsLatest() bool {
	return true
}
|
|
|
|
// base returns the configured BaseURL with all trailing slashes removed, so
// callers can append "/path" without producing a double slash.
func (s *Source) base() string {
	root := s.cfg.BaseURL
	return strings.TrimRight(root, "/")
}
|
|
|
|
func (s *Source) get(ctx context.Context, rawURL string) (*goquery.Document, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("Referer", s.cfg.BaseURL+"/")
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("mangawork: HTTP %d", resp.StatusCode)
|
|
}
|
|
return goquery.NewDocumentFromReader(resp.Body)
|
|
}
|
|
|
|
func (s *Source) buildSeriesURL(page int, query, order, status, typ string) string {
|
|
u, _ := url.Parse(s.base() + "/" + s.cfg.SeriesPath + "/")
|
|
q := u.Query()
|
|
q.Set("title", query)
|
|
q.Set("order", order)
|
|
if status != "" {
|
|
q.Set("status", status)
|
|
}
|
|
if typ != "" {
|
|
q.Set("type", typ)
|
|
}
|
|
q.Set("page", fmt.Sprint(page))
|
|
u.RawQuery = q.Encode()
|
|
return u.String()
|
|
}
|
|
|
|
func (s *Source) parseMangaList(doc *goquery.Document) source.MangasPage {
|
|
mangaSelector := fmt.Sprintf("div.w-full.h-full:has(a[href*='/%s/'])", s.cfg.MangaPath)
|
|
anchorSelector := fmt.Sprintf("a[href*='/%s/']", s.cfg.MangaPath)
|
|
var mangas []source.SManga
|
|
doc.Find(mangaSelector).Each(func(_ int, el *goquery.Selection) {
|
|
anchor := el.Find(anchorSelector).First()
|
|
u := anchor.AttrOr("href", "")
|
|
if u == "" {
|
|
return
|
|
}
|
|
m := source.SManga{URL: u}
|
|
m.Title = strings.TrimSpace(anchor.Find("h1").Text())
|
|
if m.Title == "" {
|
|
m.Title = strings.TrimSpace(anchor.AttrOr("title", ""))
|
|
}
|
|
if thumb := anchor.Find("img").First().AttrOr("src", ""); thumb != "" {
|
|
m.ThumbnailURL = util.AbsURL(s.cfg.BaseURL, thumb)
|
|
}
|
|
if m.URL != "" && m.Title != "" {
|
|
mangas = append(mangas, m)
|
|
}
|
|
})
|
|
hasNext := doc.Find(".pagination .page-numbers.current + a[href]").Length() > 0
|
|
return source.MangasPage{Mangas: mangas, HasNextPage: hasNext}
|
|
}
|
|
|
|
func (s *Source) GetPopularManga(page int) (source.MangasPage, error) {
|
|
doc, err := s.get(context.Background(), s.buildSeriesURL(page, "", s.cfg.PopularOrder, "", ""))
|
|
if err != nil {
|
|
return source.MangasPage{}, err
|
|
}
|
|
return s.parseMangaList(doc), nil
|
|
}
|
|
|
|
func (s *Source) GetLatestUpdates(page int) (source.MangasPage, error) {
|
|
doc, err := s.get(context.Background(), s.buildSeriesURL(page, "", s.cfg.LatestOrder, "", ""))
|
|
if err != nil {
|
|
return source.MangasPage{}, err
|
|
}
|
|
return s.parseMangaList(doc), nil
|
|
}
|
|
|
|
func (s *Source) GetSearchManga(page int, query string, filters []source.Filter) (source.MangasPage, error) {
|
|
doc, err := s.get(context.Background(), s.buildSeriesURL(page, query, "title", "", ""))
|
|
if err != nil {
|
|
return source.MangasPage{}, err
|
|
}
|
|
return s.parseMangaList(doc), nil
|
|
}
|
|
|
|
func (s *Source) GetMangaDetails(manga source.SManga) (source.SManga, error) {
|
|
doc, err := s.get(context.Background(), util.AbsURL(s.cfg.BaseURL, manga.URL))
|
|
if err != nil {
|
|
return manga, err
|
|
}
|
|
result := source.SManga{URL: manga.URL}
|
|
result.Title = strings.TrimSpace(doc.Find("h1.text-4xl.font-bold.mb-2").Text())
|
|
if result.Title == "" {
|
|
result.Title = manga.Title
|
|
}
|
|
if thumb := doc.Find("img[itemprop=image], [itemprop=image] img").First().AttrOr("src", ""); thumb != "" {
|
|
result.ThumbnailURL = util.AbsURL(s.cfg.BaseURL, thumb)
|
|
}
|
|
result.Description = strings.TrimSpace(doc.Find("div.text-base.leading-relaxed.mb-6.text-muted-foreground").Text())
|
|
|
|
genres := doc.Find("[itemprop=genre]")
|
|
var genreNames []string
|
|
genres.Each(func(_ int, el *goquery.Selection) {
|
|
if t := strings.TrimSpace(el.Text()); t != "" {
|
|
genreNames = append(genreNames, t)
|
|
}
|
|
})
|
|
result.Genre = strings.Join(genreNames, ", ")
|
|
|
|
// Status from the element before the first genre
|
|
if first := genres.First(); first.Length() > 0 {
|
|
statusLabel := strings.TrimSpace(first.Prev().Text())
|
|
result.Status = parseStatus(statusLabel)
|
|
}
|
|
|
|
// Author via info item search
|
|
result.Author = findInfoValue(doc, s.cfg.AuthorLabel)
|
|
return result, nil
|
|
}
|
|
|
|
// findInfoValue finds the value of an info item with the given label.
|
|
// Info items: "div.grid.grid-cols-2.gap-4.text-sm.text-gray-600.mb-6 > div"
|
|
// Each item has a <strong> label and <p> value.
|
|
func findInfoValue(doc *goquery.Document, label string) string {
|
|
const infoItemSelector = "div.grid.grid-cols-2.gap-4.text-sm.text-gray-600.mb-6 > div"
|
|
var result string
|
|
doc.Find(infoItemSelector).Each(func(_ int, el *goquery.Selection) {
|
|
if result != "" {
|
|
return
|
|
}
|
|
if strings.TrimSpace(el.Find("strong").Text()) == label {
|
|
result = strings.TrimSpace(el.Find("p").Text())
|
|
}
|
|
})
|
|
return result
|
|
}
|
|
|
|
func (s *Source) GetChapterList(manga source.SManga) ([]source.SChapter, error) {
|
|
doc, err := s.get(context.Background(), util.AbsURL(s.cfg.BaseURL, manga.URL))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
container := doc.Find("#chapter_list.chapter_list_container").First()
|
|
var chapters []source.SChapter
|
|
|
|
// Collect chapters already present on the page
|
|
container.Find("li").Each(func(_ int, el *goquery.Selection) {
|
|
if ch, ok := s.chapterFromElement(el); ok {
|
|
chapters = append(chapters, ch)
|
|
}
|
|
})
|
|
|
|
postID := strings.TrimSpace(container.AttrOr("data-post-id", ""))
|
|
if postID == "" {
|
|
return chapters, nil
|
|
}
|
|
count := strings.TrimSpace(container.AttrOr("data-count", "1000"))
|
|
|
|
// Paginate via admin-ajax if there are more chapters
|
|
currentPage := 1
|
|
for {
|
|
nextBtn := doc.Find("button.load-chapters[data-paged]").First()
|
|
if nextBtn.Length() == 0 {
|
|
break
|
|
}
|
|
nextPage := strings.TrimSpace(nextBtn.AttrOr("data-paged", ""))
|
|
if nextPage == "" || nextPage == fmt.Sprint(currentPage) {
|
|
break
|
|
}
|
|
order := strings.TrimSpace(nextBtn.AttrOr("data-order", "DESC"))
|
|
|
|
ajaxDoc, err := s.postChapterListPage(manga.URL, postID, count, nextPage, order)
|
|
if err != nil {
|
|
break
|
|
}
|
|
doc = ajaxDoc
|
|
ajaxDoc.Find("li").Each(func(_ int, el *goquery.Selection) {
|
|
if ch, ok := s.chapterFromElement(el); ok {
|
|
chapters = append(chapters, ch)
|
|
}
|
|
})
|
|
if p, _ := fmt.Sscan(nextPage, ¤tPage); p == 0 {
|
|
break
|
|
}
|
|
}
|
|
|
|
// Deduplicate
|
|
seen := map[string]bool{}
|
|
unique := chapters[:0]
|
|
for _, ch := range chapters {
|
|
if !seen[ch.URL] {
|
|
seen[ch.URL] = true
|
|
unique = append(unique, ch)
|
|
}
|
|
}
|
|
return unique, nil
|
|
}
|
|
|
|
// chapterNumRe captures a run of digits optionally followed by "." or "," and
// more digits, e.g. "12" or "12.5".
// NOTE(review): not referenced by any code visible in this file — confirm it
// is still needed.
var chapterNumRe = regexp.MustCompile(`(\d+(?:[.,]\d+)?)`)
|
|
|
|
func (s *Source) chapterFromElement(el *goquery.Selection) (source.SChapter, bool) {
|
|
anchor := el.Find("a[href]").First()
|
|
if anchor.Length() == 0 {
|
|
return source.SChapter{}, false
|
|
}
|
|
u := anchor.AttrOr("href", "")
|
|
if u == "" {
|
|
return source.SChapter{}, false
|
|
}
|
|
|
|
nameEl := el.Find("span.m-0, span.line-clamp-1").First()
|
|
name := strings.TrimSpace(nameEl.Text())
|
|
if name == "" {
|
|
name = strings.TrimSpace(anchor.Text())
|
|
}
|
|
|
|
// Date from last span
|
|
var dateStr string
|
|
el.Find("span").Each(func(_ int, sp *goquery.Selection) {
|
|
t := strings.TrimSpace(sp.Text())
|
|
if t != "" && t != name {
|
|
dateStr = t
|
|
}
|
|
})
|
|
|
|
return source.SChapter{
|
|
URL: u,
|
|
Name: name,
|
|
DateUpload: parseChapterDate(dateStr, s.cfg.ChapterDateFmt),
|
|
}, true
|
|
}
|
|
|
|
func (s *Source) postChapterListPage(referer, postID, count, page, order string) (*goquery.Document, error) {
|
|
var buf bytes.Buffer
|
|
w := multipart.NewWriter(&buf)
|
|
_ = w.WriteField("action", "load_chapters")
|
|
_ = w.WriteField("post_id", postID)
|
|
_ = w.WriteField("count", count)
|
|
_ = w.WriteField("paged", page)
|
|
_ = w.WriteField("order", order)
|
|
w.Close()
|
|
|
|
ajaxURL := s.base() + "/wp-admin/admin-ajax.php"
|
|
req, err := http.NewRequestWithContext(context.Background(), http.MethodPost, ajaxURL, &buf)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("Content-Type", w.FormDataContentType())
|
|
req.Header.Set("Referer", util.AbsURL(s.cfg.BaseURL, referer))
|
|
req.Header.Set("Origin", s.cfg.BaseURL)
|
|
req.Header.Set("Accept", "*/*")
|
|
|
|
resp, err := s.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return goquery.NewDocumentFromReader(bytes.NewReader(body))
|
|
}
|
|
|
|
// pageImageRe captures the string value of an `"image": "..."` pair. It is
// used by GetPageList to pull image URLs out of inline script content.
var pageImageRe = regexp.MustCompile(`"image"\s*:\s*"([^"]+)"`)
|
|
|
|
func (s *Source) GetPageList(chapter source.SChapter) ([]source.Page, error) {
|
|
doc, err := s.get(context.Background(), util.AbsURL(s.cfg.BaseURL, chapter.URL))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Primary: div.reader-area img
|
|
var pages []source.Page
|
|
doc.Find("div.reader-area img#imagech, div.reader-area img[src*='/manga_auto_capitulos/']").Each(func(i int, img *goquery.Selection) {
|
|
u := imgAttr(img)
|
|
if u != "" {
|
|
pages = append(pages, source.Page{Index: i, ImageURL: util.AbsURL(s.cfg.BaseURL, u)})
|
|
}
|
|
})
|
|
if len(pages) > 0 {
|
|
return pages, nil
|
|
}
|
|
|
|
// Fallback: extract "image": "url" from inline script
|
|
var scriptData string
|
|
doc.Find("script").Each(func(_ int, el *goquery.Selection) {
|
|
html, _ := el.Html()
|
|
if strings.Contains(html, `"image"`) {
|
|
scriptData += html
|
|
}
|
|
})
|
|
for i, m := range pageImageRe.FindAllStringSubmatch(scriptData, -1) {
|
|
imgURL := strings.ReplaceAll(m[1], `\/`, "/")
|
|
pages = append(pages, source.Page{Index: i, ImageURL: imgURL})
|
|
}
|
|
return pages, nil
|
|
}
|
|
|
|
func imgAttr(img *goquery.Selection) string {
|
|
for _, attr := range []string{"data-lazy-src", "data-src", "src"} {
|
|
if v, ok := img.Attr(attr); ok && v != "" {
|
|
return v
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// parseChapterDate parses s with the time.Parse layout format and returns the
// instant as Unix milliseconds. It returns 0 when s is empty or does not match
// the layout.
func parseChapterDate(s, format string) int64 {
	if s != "" {
		if parsed, err := time.Parse(format, s); err == nil {
			return parsed.UnixMilli()
		}
	}
	return 0
}
|
|
|
|
func parseStatus(s string) int {
|
|
switch strings.ToLower(s) {
|
|
case "publishing", "ongoing", "em andamento":
|
|
return source.StatusOngoing
|
|
case "finished", "completed", "concluído", "concluido", "finalizado":
|
|
return source.StatusCompleted
|
|
case "on hold", "on-hold", "hiatus", "em hiato":
|
|
return source.StatusHiatus
|
|
case "cancelled", "canceled", "cancelado":
|
|
return source.StatusCancelled
|
|
}
|
|
return source.StatusUnknown
|
|
}
|
|
|
|
// GetImageURL returns the ImageURL already stored on page; no request is made.
func (s *Source) GetImageURL(page source.Page) (string, error) {
	imageURL := page.ImageURL
	return imageURL, nil
}
|
|
// GetFilterList returns nil: this source defines no filters.
func (s *Source) GetFilterList() []source.Filter {
	return nil
}
|