package com.example.datacollect.strategy;

import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Crawl strategy that extracts article headlines from pages whose URL
 * contains {@code news.example.com}.
 *
 * <p>Each non-empty element matching the CSS selector
 * {@code .article-headline} yields one {@link Article}.
 */
public class NewsStrategy implements CrawlStrategy {

    /** Class-level logger used to trace the parsing process. */
    private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class);

    /**
     * URL matcher, compiled once. Used with {@code matches()}, so the whole
     * URL must fit the expression; the surrounding {@code .*} make that
     * equivalent to "contains news.example.com" for single-line input.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(".*news\\.example\\.com.*");

    /** CSS selector identifying headline elements in the document. */
    private static final String HEADLINE_SELECTOR = ".article-headline";

    /**
     * Tells whether this strategy handles the given URL.
     *
     * @param url the URL to test; may be {@code null}
     * @return {@code true} if the URL matches {@link #URL_PATTERN};
     *         {@code false} otherwise, including for a {@code null} URL
     */
    @Override
    public boolean supports(String url) {
        // Pattern.matcher(null) throws NullPointerException; a missing URL is simply unsupported.
        if (url == null) {
            return false;
        }
        return URL_PATTERN.matcher(url).matches();
    }

    /**
     * Extracts one article per non-empty headline element in the document.
     *
     * @param url the URL the document was fetched from; stored on every
     *            resulting article and used in log messages
     * @param doc the parsed document to search
     * @return the extracted articles, in document order; empty (never
     *         {@code null}) when no headline element is found
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();

        logger.debug("开始解析 URL: [{}]", url);

        Elements items = doc.select(HEADLINE_SELECTOR);
        if (items.isEmpty()) {
            logger.warn("在 URL [{}] 中未找到符合选择器 '.article-headline' 的文章标题元素。", url);
            return articles;
        }

        for (Element e : items) {
            String title = e.text().trim();
            // Headline elements without visible text are skipped.
            if (!title.isEmpty()) {
                // NOTE(review): the third Article argument is passed as an empty string;
                // presumably it is the body/content field — confirm against Article.
                articles.add(new Article(title, url, ""));
                logger.trace("提取到文章标题: {}", title);
            }
        }

        logger.info("成功解析 URL [{}],共提取 {} 篇文章。", url, articles.size());
        return articles;
    }

    /**
     * Returns this strategy's priority.
     *
     * @return the constant {@code 10}; how priorities are ordered is decided
     *         by the caller of {@code CrawlStrategy} and is not visible here
     */
    @Override
    public int getPriority() {
        return 10;
    }
}