diff --git a/W11/strategy/BlogStrategy.java b/W11/strategy/BlogStrategy.java
new file mode 100644
index 0000000..9033aac
--- /dev/null
+++ b/W11/strategy/BlogStrategy.java
@@ -0,0 +1,53 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for pages whose URL contains {@code blog.example.com}.
+ *
+ * <p>Each element matching {@code .post-title} yields one {@link Article} built from the
+ * element text, the page URL and an empty string as the third constructor argument.
+ */
+public class BlogStrategy implements CrawlStrategy {
+    private static final Logger logger = LoggerFactory.getLogger(BlogStrategy.class);
+
+    /**
+     * Reports whether this strategy handles the given URL.
+     *
+     * @param url page URL; {@code null} is reported as unsupported
+     * @return {@code true} when {@code url} contains {@code blog.example.com}
+     */
+    @Override
+    public boolean supports(String url) {
+        // A null URL is simply unsupported; without the guard contains() would throw.
+        return url != null && url.contains("blog.example.com");
+    }
+
+    /**
+     * Builds one article per {@code .post-title} element of the document.
+     *
+     * @param url URL passed to every created article
+     * @param doc parsed page to select from
+     * @return the created articles; empty when no element matches
+     * @throws ParseException as declared by {@link CrawlStrategy#parse}
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        logger.debug("Parsing blog content from: {}", url);
+        List<Article> articles = new ArrayList<>();
+        Elements titles = doc.select(".post-title");
+        for (Element e : titles) {
+            articles.add(new Article(e.text(), url, ""));
+        }
+        logger.debug("Parsed {} articles from blog", articles.size());
+        return articles;
+    }
+}
diff --git a/W11/strategy/CrawlStrategy.java b/W11/strategy/CrawlStrategy.java
new file mode 100644
index 0000000..7f6248b
--- /dev/null
+++ b/W11/strategy/CrawlStrategy.java
@@ -0,0 +1,27 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import java.util.List;
+
+/** Contract for site-specific extraction of {@link Article}s from a parsed page. */
+public interface CrawlStrategy {
+    /**
+     * Extracts articles from a parsed page.
+     *
+     * @param url URL associated with the document
+     * @param doc parsed page
+     * @return the extracted articles
+     * @throws ParseException if extraction fails
+     */
+    List<Article> parse(String url, Document doc) throws ParseException;
+
+    /**
+     * Reports whether this strategy applies to the given URL.
+     *
+     * @param url page URL
+     * @return {@code true} if this strategy should be used for {@code url}
+     */
+    boolean supports(String url);
+}
diff --git a/W11/strategy/HnuNewsStrategy.java b/W11/strategy/HnuNewsStrategy.java
new file mode 100644
index 0000000..0264eb2
--- /dev/null
+++ b/W11/strategy/HnuNewsStrategy.java
@@ -0,0 +1,93 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for pages whose URL contains {@code news.hnu.edu.cn}.
+ *
+ * <p>Reads the {@code ul.list11 li} entries of a list page; entries without a link or without
+ * a title are skipped.
+ */
+public class HnuNewsStrategy implements CrawlStrategy {
+    private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class);
+
+    /** Prefix put in front of hrefs that do not already start with {@code http}. */
+    private static final String SITE_ROOT = "https://news.hnu.edu.cn";
+
+    /**
+     * Reports whether this strategy handles the given URL.
+     *
+     * @param url page URL; {@code null} is reported as unsupported
+     * @return {@code true} when {@code url} contains {@code news.hnu.edu.cn}
+     */
+    @Override
+    public boolean supports(String url) {
+        // A null URL is simply unsupported; without the guard contains() would throw.
+        return url != null && url.contains("news.hnu.edu.cn");
+    }
+
+    /**
+     * Builds one article per list entry that has a link and a non-empty title.
+     *
+     * @param url URL of the list page; only used for logging
+     * @param doc parsed list page
+     * @return the created articles; empty when no entry qualifies
+     * @throws ParseException as declared by {@link CrawlStrategy#parse}
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        logger.debug("Parsing HNU news content from: {}", url);
+        List<Article> articles = new ArrayList<>();
+        Elements listItems = doc.select("ul.list11 li");
+
+        for (Element li : listItems) {
+            Element link = li.selectFirst("a");
+            if (link == null) {
+                continue;
+            }
+
+            String title = textOf(link, "h4.l2.h4s2");
+            if (title.isEmpty()) {
+                continue;
+            }
+
+            String content = textOf(link, "p.l3.ps3");
+            articles.add(new Article(title, toAbsoluteUrl(link.attr("href")), content));
+        }
+
+        logger.debug("Parsed {} articles from HNU news", articles.size());
+        return articles;
+    }
+
+    /** Returns the trimmed text of the first match of {@code cssQuery} under {@code parent}, or "". */
+    private static String textOf(Element parent, String cssQuery) {
+        Element match = parent.selectFirst(cssQuery);
+        return match == null ? "" : match.text().trim();
+    }
+
+    /**
+     * Turns a link target into an absolute URL on the news site.
+     *
+     * <p>Targets starting with {@code http} are returned unchanged. Otherwise every {@code ..}
+     * is removed and the rest is appended to the site root, inserting the {@code /} separator
+     * when the target lacks one so the host name is never fused with the path.
+     */
+    private static String toAbsoluteUrl(String href) {
+        if (href.startsWith("http")) {
+            return href;
+        }
+        String path = href.replace("..", "");
+        if (!path.startsWith("/")) {
+            path = "/" + path;
+        }
+        return SITE_ROOT + path;
+    }
+}
diff --git a/W11/strategy/NewsStrategy.java b/W11/strategy/NewsStrategy.java
new file mode 100644
index 0000000..20410ff
--- /dev/null
+++ b/W11/strategy/NewsStrategy.java
@@ -0,0 +1,53 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for pages whose URL contains {@code news.example.com}.
+ *
+ * <p>Each element matching {@code .article-headline} yields one {@link Article} built from
+ * the element text, the page URL and an empty string as the third constructor argument.
+ */
+public class NewsStrategy implements CrawlStrategy {
+    private static final Logger logger = LoggerFactory.getLogger(NewsStrategy.class);
+
+    /**
+     * Reports whether this strategy handles the given URL.
+     *
+     * @param url page URL; {@code null} is reported as unsupported
+     * @return {@code true} when {@code url} contains {@code news.example.com}
+     */
+    @Override
+    public boolean supports(String url) {
+        // A null URL is simply unsupported; without the guard contains() would throw.
+        return url != null && url.contains("news.example.com");
+    }
+
+    /**
+     * Builds one article per {@code .article-headline} element of the document.
+     *
+     * @param url URL passed to every created article
+     * @param doc parsed page to select from
+     * @return the created articles; empty when no element matches
+     * @throws ParseException as declared by {@link CrawlStrategy#parse}
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        logger.debug("Parsing news content from: {}", url);
+        List<Article> articles = new ArrayList<>();
+        Elements items = doc.select(".article-headline");
+        for (Element e : items) {
+            articles.add(new Article(e.text(), url, ""));
+        }
+        logger.debug("Parsed {} articles from news", articles.size());
+        return articles;
+    }
+}
diff --git a/W11/strategy/StrategyFactory.java b/W11/strategy/StrategyFactory.java
new file mode 100644
index 0000000..e804c30
--- /dev/null
+++ b/W11/strategy/StrategyFactory.java
@@ -0,0 +1,60 @@
+package com.example.datacollect.strategy;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Holds the known {@link CrawlStrategy} instances and picks one for a URL.
+ *
+ * <p>Strategies are consulted in registration order; the first one whose
+ * {@link CrawlStrategy#supports(String)} returns {@code true} wins.
+ */
+public class StrategyFactory {
+    private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class);
+    private final List<CrawlStrategy> strategies = new ArrayList<>();
+
+    /** Creates a factory pre-loaded with the HNU news, blog and news strategies, in that order. */
+    public StrategyFactory() {
+        strategies.add(new HnuNewsStrategy());
+        strategies.add(new BlogStrategy());
+        strategies.add(new NewsStrategy());
+        logger.info("StrategyFactory initialized with {} strategies", strategies.size());
+    }
+
+    /**
+     * Finds the first registered strategy that supports the URL.
+     *
+     * @param url page URL; {@code null} never matches
+     * @return the matching strategy, or {@code null} when none supports {@code url}
+     */
+    public CrawlStrategy getStrategy(String url) {
+        // Skip the scan for null so a registered strategy without its own guard cannot throw.
+        if (url != null) {
+            for (CrawlStrategy s : strategies) {
+                if (s.supports(url)) {
+                    logger.debug("Found strategy for URL: {} -> {}", url, s.getClass().getSimpleName());
+                    return s;
+                }
+            }
+        }
+        logger.warn("No strategy found for URL: {}", url);
+        return null;
+    }
+
+    /**
+     * Appends a strategy; it is consulted after all previously registered ones.
+     *
+     * @param strategy strategy to add, must not be {@code null}
+     * @throws NullPointerException if {@code strategy} is {@code null}
+     */
+    public void register(CrawlStrategy strategy) {
+        // Reject null before touching the list; adding it would break every later lookup.
+        if (strategy == null) {
+            throw new NullPointerException("strategy must not be null");
+        }
+        strategies.add(strategy);
+        logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName());
+    }
+}