diff --git a/w11/strategy/BlogStrategy.java.java b/w11/strategy/BlogStrategy.java.java
new file mode 100644
index 0000000..5edac8d
--- /dev/null
+++ b/w11/strategy/BlogStrategy.java.java
@@ -0,0 +1,73 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for pages whose URL contains {@code blog.example.com}.
+ *
+ * <p>Each element matching {@code .post-title} yields one {@link Article} built from the
+ * element text and the page URL.
+ */
+public class BlogStrategy implements CrawlStrategy {
+    private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class);
+
+    /**
+     * Reports whether the URL contains {@code blog.example.com}.
+     *
+     * @param url target URL; {@code null} is treated as unsupported
+     * @return {@code true} if this strategy handles the URL
+     */
+    @Override
+    public boolean supports(String url) {
+        // Guard against null so a missing URL is "unsupported" rather than an NPE.
+        boolean supported = url != null && url.contains("blog.example.com");
+        logger.debug("BlogStrategy supports {}: {}", url, supported);
+        return supported;
+    }
+
+    /**
+     * Extracts one article per non-blank {@code .post-title} element.
+     *
+     * @param url URL the document was fetched from; passed to every created article
+     * @param doc parsed page
+     * @return articles in the order their elements were selected
+     * @throws ParseException if the page has no {@code .post-title} elements or extraction fails
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        try {
+            Elements titles = doc.select(".post-title");
+            if (titles.isEmpty()) {
+                logger.warn("No .post-title elements found for URL: {}", url);
+                throw new ParseException("No .post-title elements found on page: " + url);
+            }
+
+            for (Element e : titles) {
+                String title = e.text();
+                if (title == null || title.isBlank()) {
+                    logger.warn("Found empty title at URL: {}", url);
+                    continue;
+                }
+                articles.add(new Article(title, url, ""));
+                logger.debug("Parsed article: {}", title);
+            }
+        } catch (ParseException e) {
+            // Already specific and already logged; do not re-wrap it in the generic handler below.
+            throw e;
+        } catch (Exception e) {
+            logger.error("Parse error for URL {}: {}", url, e.getMessage(), e);
+            throw new ParseException("Failed to parse blog page: " + url, e);
+        }
+        return articles;
+    }
+}
\ No newline at end of file
diff --git a/w11/strategy/CrawlStrategy.java.java b/w11/strategy/CrawlStrategy.java.java
new file mode 100644
index 0000000..aa4edb7
--- /dev/null
+++ b/w11/strategy/CrawlStrategy.java.java
@@ -0,0 +1,28 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+
+import java.util.List;
+
+/** Strategy for extracting articles from a fetched page, selected per URL. */
+public interface CrawlStrategy {
+    /**
+     * Parses the document and extracts the list of articles.
+     *
+     * @param url original URL
+     * @param doc Jsoup document object
+     * @return list of articles
+     * @throws ParseException if parsing fails
+     */
+    List<Article> parse(String url, Document doc) throws ParseException;
+
+    /**
+     * Determines whether this strategy supports the given URL.
+     *
+     * @param url target URL
+     * @return whether the URL is supported
+     */
+    boolean supports(String url);
+}
\ No newline at end of file
diff --git a/w11/strategy/NewsStrategy.java.java b/w11/strategy/NewsStrategy.java.java
new file mode 100644
index 0000000..7f72f70
--- /dev/null
+++ b/w11/strategy/NewsStrategy.java.java
@@ -0,0 +1,79 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for pages whose URL contains {@code news.example.com}.
+ *
+ * <p>Each element matching {@code .headline} yields one {@link Article} built from the element
+ * text and the element's absolute {@code href}, or the page URL when no usable link exists.
+ */
+public class NewsStrategy implements CrawlStrategy {
+    private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class);
+
+    /**
+     * Reports whether the URL contains {@code news.example.com}.
+     *
+     * @param url target URL; {@code null} is treated as unsupported
+     * @return {@code true} if this strategy handles the URL
+     */
+    @Override
+    public boolean supports(String url) {
+        // Guard against null so a missing URL is "unsupported" rather than an NPE.
+        boolean supported = url != null && url.contains("news.example.com");
+        logger.debug("NewsStrategy supports {}: {}", url, supported);
+        return supported;
+    }
+
+    /**
+     * Extracts one article per non-blank {@code .headline} element.
+     *
+     * @param url URL the document was fetched from; used when a headline has no usable link
+     * @param doc parsed page
+     * @return articles in the order their elements were selected
+     * @throws ParseException if the page has no {@code .headline} elements or extraction fails
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        try {
+            Elements headlines = doc.select(".headline");
+            if (headlines.isEmpty()) {
+                logger.warn("No .headline elements found for URL: {}", url);
+                throw new ParseException("No .headline elements found on page: " + url);
+            }
+
+            for (Element e : headlines) {
+                String title = e.text();
+                if (title == null || title.isBlank()) {
+                    logger.warn("Found empty headline at URL: {}", url);
+                    continue;
+                }
+                // "abs:href" yields an empty string when the link cannot be made absolute;
+                // fall back to the page URL in that case.
+                String link = e.hasAttr("href") ? e.attr("abs:href") : url;
+                if (link == null || link.isBlank()) {
+                    link = url;
+                }
+                articles.add(new Article(title, link, ""));
+                logger.debug("Parsed news article: {}", title);
+            }
+        } catch (ParseException e) {
+            // Already specific and already logged; do not re-wrap it in the generic handler below.
+            throw e;
+        } catch (Exception e) {
+            logger.error("Parse error for URL {}: {}", url, e.getMessage(), e);
+            throw new ParseException("Failed to parse news page: " + url, e);
+        }
+        return articles;
+    }
+}
\ No newline at end of file
diff --git a/w11/strategy/StrategyFactory.java.java b/w11/strategy/StrategyFactory.java.java
new file mode 100644
index 0000000..78e19e8
--- /dev/null
+++ b/w11/strategy/StrategyFactory.java.java
@@ -0,0 +1,60 @@
+package com.example.datacollect.strategy;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Registry that picks the first registered {@link CrawlStrategy} supporting a URL.
+ *
+ * <p>Strategies are consulted in registration order.
+ */
+public class StrategyFactory {
+    private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class);
+    private final List<CrawlStrategy> strategies = new ArrayList<>();
+
+    /** Creates a factory pre-loaded with {@link BlogStrategy} and {@link NewsStrategy}. */
+    public StrategyFactory() {
+        // Register the built-in strategies.
+        strategies.add(new BlogStrategy());
+        strategies.add(new NewsStrategy());
+        logger.info("StrategyFactory initialized with {} strategies", strategies.size());
+    }
+
+    /**
+     * Finds the first registered strategy whose {@code supports} accepts the URL.
+     *
+     * @param url target URL
+     * @return the matching strategy, or {@code null} if the URL is null/blank or unsupported
+     */
+    public CrawlStrategy getStrategy(String url) {
+        if (url == null || url.isBlank()) {
+            logger.warn("Null or blank URL provided to getStrategy");
+            return null;
+        }
+
+        for (CrawlStrategy strategy : strategies) {
+            if (strategy.supports(url)) {
+                logger.debug("Found strategy {} for URL: {}", strategy.getClass().getSimpleName(), url);
+                return strategy;
+            }
+        }
+
+        logger.warn("No strategy found for URL: {}", url);
+        return null;
+    }
+
+    /**
+     * Appends a strategy to the end of the lookup order.
+     *
+     * @param strategy strategy to add; {@code null} is ignored
+     */
+    public void registerStrategy(CrawlStrategy strategy) {
+        if (strategy != null) {
+            strategies.add(strategy);
+            logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName());
+        }
+    }
+}
\ No newline at end of file