diff --git a/w10/HnuNoticeStrategy.java b/w10/HnuNoticeStrategy.java
new file mode 100644
index 0000000..783dc94
--- /dev/null
+++ b/w10/HnuNoticeStrategy.java
@@ -0,0 +1,145 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for notice list pages on hnu.edu.cn.
+ *
+ * <p>Produces one {@link Article} per list entry (title, absolute URL and a short
+ * "Date/Summary" text). If none of the known list layouts match, it falls back to
+ * collecting every anchor that points at an {@code /info/} page.
+ */
+public class HnuNoticeStrategy implements CrawlStrategy {
+
+    /** Site root used to absolutize links when the document has no usable base URI. */
+    private static final String SITE_ROOT = "https://www.hnu.edu.cn";
+
+    /** Elements that hold one list entry each in the layouts handled here. */
+    private static final String LIST_ITEM_SELECTOR =
+            "div.list-content li, ul.news_list li, div.news-list li, table.list tr";
+    private static final String DATE_SELECTOR = "span.date, span.time, font.time, .news-date";
+    private static final String SUMMARY_SELECTOR = "p.summary, span.summary, .news-summary";
+    private static final String FALLBACK_LINK_SELECTOR = "a[href*='info']";
+
+    /**
+     * Reports whether this strategy handles the given page URL.
+     *
+     * @param url page URL; may be {@code null}
+     * @return {@code true} if the URL contains {@code hnu.edu.cn/tzgg} or {@code www.hnu.edu.cn}
+     */
+    @Override
+    public boolean supports(String url) {
+        // NOTE(review): the second test accepts every www.hnu.edu.cn page, not only notice
+        // lists, so registration order in StrategyFactory decides which pages end up here.
+        return url != null
+                && (url.contains("hnu.edu.cn/tzgg") || url.contains("www.hnu.edu.cn"));
+    }
+
+    /**
+     * Extracts the entries of a list page.
+     *
+     * @param url page URL; unused, links are resolved against the document's base URI
+     * @param doc parsed page; {@code null} yields an empty list
+     * @return articles in document order; empty when nothing matched, never {@code null}
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) {
+        List<Article> articles = new ArrayList<>();
+        if (doc == null) {
+            return articles;
+        }
+
+        Elements listItems = doc.select(LIST_ITEM_SELECTOR);
+        for (Element item : listItems) {
+            Element link = item.selectFirst("a");
+            if (link == null) {
+                continue;
+            }
+            String articleUrl = resolveUrl(link);
+            String title = link.text().trim();
+            if (articleUrl.isEmpty() || title.isEmpty()) {
+                continue;
+            }
+            String content = "Date: " + textOf(item, DATE_SELECTOR)
+                    + "\nSummary: " + textOf(item, SUMMARY_SELECTOR);
+            articles.add(new Article(title, articleUrl, content));
+        }
+
+        if (articles.isEmpty()) {
+            // No known layout matched: fall back to plain links to /info/ pages.
+            for (Element link : doc.select(FALLBACK_LINK_SELECTOR)) {
+                String articleUrl = resolveUrl(link);
+                String title = link.text().trim();
+                if (!title.isEmpty() && articleUrl.contains("/info/")) {
+                    articles.add(new Article(title, articleUrl, ""));
+                }
+            }
+        }
+
+        return articles;
+    }
+
+    /** Returns the trimmed text of the first {@code selector} match in {@code scope}, or "". */
+    private static String textOf(Element scope, String selector) {
+        Element match = scope.selectFirst(selector);
+        return match == null ? "" : match.text().trim();
+    }
+
+    /**
+     * Turns the link's {@code href} into an absolute http(s) URL.
+     *
+     * <p>Resolution against the document's base URI is preferred so that {@code ../} and
+     * directory-relative links land on the right path; the fixed site root is only a fallback.
+     *
+     * @return the absolute URL, or "" for empty, fragment-only and non-http links
+     */
+    private static String resolveUrl(Element link) {
+        String href = link.attr("href").trim();
+        if (href.isEmpty() || href.startsWith("#")) {
+            return "";
+        }
+        if (isHttpUrl(href)) {
+            return href;
+        }
+        if (hasScheme(href)) {
+            return ""; // javascript:, mailto:, tel: ... are not crawlable pages
+        }
+        String absolute = link.absUrl("href");
+        if (isHttpUrl(absolute)) {
+            return absolute;
+        }
+        if (href.startsWith("//")) {
+            return "https:" + href; // protocol-relative link
+        }
+        // No base URI available: strip leading dot segments and hang the path off the site root.
+        String path = href;
+        while (path.startsWith("../") || path.startsWith("./")) {
+            path = path.substring(path.indexOf('/') + 1);
+        }
+        return SITE_ROOT + (path.startsWith("/") ? "" : "/") + path;
+    }
+
+    /** True if {@code value} starts with {@code http://} or {@code https://} (any case). */
+    private static boolean isHttpUrl(String value) {
+        return value.regionMatches(true, 0, "http://", 0, 7)
+                || value.regionMatches(true, 0, "https://", 0, 8);
+    }
+
+    /** True if {@code href} has a URI scheme, i.e. a ':' before any '/', '?' or '#'. */
+    private static boolean hasScheme(String href) {
+        int colon = href.indexOf(':');
+        for (int i = 0; i < colon; i++) {
+            char c = href.charAt(i);
+            if (c == '/' || c == '?' || c == '#') {
+                return false;
+            }
+        }
+        return colon > 0;
+    }
+}
\ No newline at end of file
diff --git a/w10/StrategyFactory.java b/w10/StrategyFactory.java
new file mode 100644
index 0000000..7368831
--- /dev/null
+++ b/w10/StrategyFactory.java
@@ -0,0 +1,55 @@
+package com.example.datacollect.strategy;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Registry that selects the {@link CrawlStrategy} for a URL.
+ *
+ * <p>Strategies are consulted in registration order; the first one whose
+ * {@link CrawlStrategy#supports(String)} returns {@code true} is used.
+ */
+public class StrategyFactory {
+    private final List<CrawlStrategy> strategies = new ArrayList<>();
+
+    /** Creates a factory with the built-in strategies registered. */
+    public StrategyFactory() {
+        // Order matters: earlier entries win in getStrategy.
+        // NOTE(review): HnuNoticeStrategy accepts any www.hnu.edu.cn URL; confirm the
+        // strategies registered before it do not claim its notice pages first.
+        strategies.add(new HnuNewsStrategy());
+        strategies.add(new BlogStrategy());
+        strategies.add(new NewsStrategy());
+        strategies.add(new HnuNoticeStrategy());
+    }
+
+    /**
+     * Finds the strategy responsible for a URL.
+     *
+     * @param url page URL; may be {@code null}
+     * @return the first registered strategy that supports {@code url}, or {@code null}
+     *     if {@code url} is {@code null} or no strategy supports it
+     */
+    public CrawlStrategy getStrategy(String url) {
+        if (url == null) {
+            return null;
+        }
+        for (CrawlStrategy candidate : strategies) {
+            if (candidate.supports(url)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Appends a strategy; it is consulted after all previously registered ones.
+     *
+     * @param strategy strategy to add; must not be {@code null}
+     * @throws NullPointerException if {@code strategy} is {@code null}
+     */
+    public void register(CrawlStrategy strategy) {
+        strategies.add(Objects.requireNonNull(strategy, "strategy"));
+    }
+}