package internal.hw.crawler.strategies.crawl;

import internal.hw.crawler.models.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URL;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PeopleCnCrawlStrategy implements CrawlStrategy {
    private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class);
    private final List<String> supportedDomains = List.of("people.cn", "people.com.cn");
    /* 示例 URL：http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */
    private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html");

    @Override
    public boolean supports(URL url) {
        String host = url.getHost();
        for (String domain : supportedDomains) {
            if (host.equals(domain) || host.endsWith("." + domain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public List<Article> parse(URL url, Document doc) throws CrawlException {
        if (isHomepage(url)) {
            return CrawlUtils.parseHomepage(doc, idRegex, (articleUrl, articleDoc) -> {
                try {
                    return parseSingle(articleUrl, articleDoc);
                } catch (CrawlException e) {
                    log.warn("Failed to parse article: {}", articleUrl, e);
                    return null;
                }
            });
        } else {
            return List.of(parseSingle(url, doc));
        }
    }

    private boolean isHomepage(URL url) {
        String host = url.getHost();
        boolean matched = supportedDomains.stream()
            .anyMatch(d -> host.equals(d) || host.endsWith("." + d));
        if (!matched) return false;
        String path = url.getPath();
        return path == null || path.isEmpty() || path.equals("/");
    }

    private Article parseSingle(URL url, Document doc) throws CrawlException {
        Matcher matcher = idRegex.matcher(url.getPath());
        if (!matcher.find()) {
            throw new CrawlException(String.format("Cannot determine id for %s", url));
        }

        String id = String.format("%s%s-c%s-%s", matcher.group(1), matcher.group(2), matcher.group(3), matcher.group(4));
        Element titleEl = doc.selectFirst(".layout.rm_txt h1");
        if (titleEl == null) throw new CrawlException(String.format("Cannot find title for %s", url));
        String title = titleEl.text();

        Set<String> authors = new HashSet<>();
        Element authorEl = doc.selectFirst("div.author.cf");
        if (authorEl != null) {
            authors.add(authorEl.text());
        }

        Element contentEl = doc.selectFirst("div#rm_txt_zw");
        if (contentEl == null) throw new CrawlException(String.format("Cannot find content for %s", url));
        String content = contentEl.text();

        Article article = new Article();
        article.setId(id);
        article.setSource("people-cn");
        article.setUrl(url);
        article.setTitle(title);
        article.setAuthors(authors);
        article.setContent(content);

        return article;
    }
}