package com.example.datacollect.strategy;

import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawl strategy that extracts notice/news entries from list pages whose URL
 * contains {@code hnu.edu.cn/tzgg} or {@code www.hnu.edu.cn}.
 *
 * <p>Parsing is two-staged: first the known list-item containers are scanned;
 * only if that yields nothing does the parser fall back to collecting every
 * anchor whose resolved URL contains {@code /info/}.
 */
public class HnuNoticeStrategy implements CrawlStrategy {

    /** Origin that relative links are resolved against. */
    private static final String SITE_ROOT = "https://www.hnu.edu.cn";

    /** Containers that hold one list entry each. */
    private static final String LIST_ITEM_SELECTOR =
            "div.list-content li, ul.news_list li, div.news-list li, table.list tr";

    /** Elements inside a list entry that carry the publication date text. */
    private static final String DATE_SELECTOR = "span.date, span.time, font.time, .news-date";

    /** Elements inside a list entry that carry a short summary. */
    private static final String SUMMARY_SELECTOR = "p.summary, span.summary, .news-summary";

    /** Anchors considered by the fallback pass. */
    private static final String FALLBACK_LINK_SELECTOR = "a[href*='info']";

    /**
     * Reports whether this strategy handles the given page URL.
     *
     * @param url page URL; may be {@code null}
     * @return {@code true} if the URL contains {@code hnu.edu.cn/tzgg} or
     *         {@code www.hnu.edu.cn}; {@code false} otherwise, including for {@code null}
     */
    @Override
    public boolean supports(String url) {
        return url != null
                && (url.contains("hnu.edu.cn/tzgg") || url.contains("www.hnu.edu.cn"));
    }

    /**
     * Extracts articles from a parsed list page.
     *
     * @param url page URL; not used for link resolution, relative links are
     *            resolved against the fixed site root
     * @param doc parsed page; {@code null} yields an empty list
     * @return the extracted articles in document order, never {@code null}
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();
        if (doc == null) {
            return articles;
        }

        collectFromListItems(doc, articles);
        if (articles.isEmpty()) {
            // None of the known list layouts matched; fall back to bare /info/ links.
            collectFromInfoLinks(doc, articles);
        }
        return articles;
    }

    /** Adds one article per list entry that has a navigable link and a non-empty title. */
    private static void collectFromListItems(Document doc, List<Article> articles) {
        Elements listItems = doc.select(LIST_ITEM_SELECTOR);
        for (Element item : listItems) {
            Element link = item.selectFirst("a");
            if (link == null) {
                continue;
            }
            String articleUrl = toAbsoluteUrl(link.attr("href"));
            String title = link.text().trim();
            if (articleUrl == null || title.isEmpty()) {
                continue;
            }
            String date = firstText(item, DATE_SELECTOR);
            String summary = firstText(item, SUMMARY_SELECTOR);
            String content = "Date: " + date + "\nSummary: " + summary;
            articles.add(new Article(title, articleUrl, content));
        }
    }

    /** Adds one article (with empty content) per titled anchor whose resolved URL contains /info/. */
    private static void collectFromInfoLinks(Document doc, List<Article> articles) {
        Elements links = doc.select(FALLBACK_LINK_SELECTOR);
        for (Element link : links) {
            String articleUrl = toAbsoluteUrl(link.attr("href"));
            String title = link.text().trim();
            if (articleUrl != null && !title.isEmpty() && articleUrl.contains("/info/")) {
                articles.add(new Article(title, articleUrl, ""));
            }
        }
    }

    /** Returns the trimmed text of the first element matching the selector, or "" if none matches. */
    private static String firstText(Element scope, String selector) {
        Element match = scope.selectFirst(selector);
        return match == null ? "" : match.text().trim();
    }

    /**
     * Turns an {@code href} value into an absolute URL.
     *
     * <p>Relative paths are treated as relative to the site root: leading
     * {@code ./} and {@code ../} segments are dropped rather than resolved
     * against the page's own directory.
     * NOTE(review): this assumes article pages live under the site root;
     * confirm against the real list pages.
     *
     * @param href raw attribute value; may be {@code null}
     * @return the absolute URL, or {@code null} when the href does not point at
     *         a page (empty, fragment-only, {@code javascript:}, {@code mailto:}, {@code tel:})
     */
    private static String toAbsoluteUrl(String href) {
        String target = href == null ? "" : href.trim();
        if (target.isEmpty()
                || target.startsWith("#")
                || startsWithIgnoreCase(target, "javascript:")
                || startsWithIgnoreCase(target, "mailto:")
                || startsWithIgnoreCase(target, "tel:")) {
            return null;
        }
        if (startsWithIgnoreCase(target, "http://") || startsWithIgnoreCase(target, "https://")) {
            return target;
        }
        if (target.startsWith("//")) {
            // Protocol-relative link: it names its own host, so only the scheme is missing.
            return "https:" + target;
        }
        if (target.startsWith("/")) {
            return SITE_ROOT + target;
        }

        String path = target;
        while (true) {
            if (path.startsWith("../")) {
                path = path.substring(3);
            } else if (path.startsWith("./")) {
                path = path.substring(2);
            } else {
                break;
            }
        }
        return SITE_ROOT + "/" + path;
    }

    /** Case-insensitive {@link String#startsWith(String)}. */
    private static boolean startsWithIgnoreCase(String value, String prefix) {
        return value.regionMatches(true, 0, prefix, 0, prefix.length());
    }
}