package com.example.datacollect.strategy;

import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Crawl strategy that extracts article entries from list pages whose URL
 * matches {@code news.hnu.edu.cn}.
 *
 * <p>Each {@code <li>} under {@code ul.list11} is expected to wrap a single
 * {@code <a>} holding the title ({@code h4.l2.h4s2}) and a text snippet
 * ({@code p.l3.ps3}).
 */
public class HnuNewsStrategy implements CrawlStrategy {
    private static final Pattern URL_PATTERN = Pattern.compile("https?://[^/]*news\\.hnu\\.edu\\.cn.*");

    /** Prefix applied to every non-absolute href found on a list page. */
    private static final String SITE_ROOT = "https://news.hnu.edu.cn";

    /** Matches hrefs that already carry an explicit http(s) scheme. */
    private static final Pattern ABSOLUTE_URL = Pattern.compile("^https?://", Pattern.CASE_INSENSITIVE);

    /** Matches any run of leading {@code /}, {@code ./} and {@code ../} segments. */
    private static final Pattern LEADING_PATH_NOISE = Pattern.compile("^(?:\\.{1,2}(?:/|$)|/)+");

    /**
     * Tells whether this strategy handles the given page URL.
     *
     * @param url page URL, may be {@code null}
     * @return {@code true} when {@code url} is non-null and matches the site pattern
     */
    @Override
    public boolean supports(String url) {
        return url != null && URL_PATTERN.matcher(url).matches();
    }

    /**
     * Returns the fixed priority of this strategy.
     * NOTE(review): how the value is ranked against other strategies is decided
     * by the caller and is not visible here.
     *
     * @return always {@code 100}
     */
    @Override
    public int getPriority() {
        return 100;
    }

    /**
     * Collects one {@link Article} per list item that has a link and a non-empty title.
     *
     * @param url address of the page being parsed; currently not consulted
     * @param doc parsed page; {@code null} yields an empty list
     * @return articles in document order, never {@code null}
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();
        if (doc == null) {
            return articles;
        }

        Elements listItems = doc.select("ul.list11 li");
        for (Element li : listItems) {
            Element link = li.selectFirst("a");
            if (link == null) {
                continue;
            }

            String title = childText(link, "h4.l2.h4s2");
            if (title.isEmpty()) {
                // Entries without a title are not reported.
                continue;
            }

            String articleUrl = toAbsoluteUrl(link.attr("href"));
            String content = childText(link, "p.l3.ps3");
            articles.add(new Article(title, articleUrl, content));
        }

        return articles;
    }

    /** Returns the trimmed text of the first descendant matching {@code css}, or "" if none. */
    private static String childText(Element parent, String css) {
        Element match = parent.selectFirst(css);
        return match == null ? "" : match.text().trim();
    }

    /**
     * Turns an href into an absolute URL rooted at {@link #SITE_ROOT}.
     *
     * <p>Absolute http(s) URLs are returned as-is (trimmed) and protocol-relative
     * URLs get an {@code https:} scheme. Anything else has its leading
     * {@code /}, {@code ./} and {@code ../} segments removed and is joined to the
     * site root with exactly one slash, so {@code ../../info/1.htm},
     * {@code /info/1.htm} and {@code info/1.htm} all map to the same address.
     * Only leading segments are stripped; dots elsewhere in the path are kept.
     */
    private static String toAbsoluteUrl(String href) {
        String candidate = href.trim();
        if (ABSOLUTE_URL.matcher(candidate).find()) {
            return candidate;
        }
        if (candidate.startsWith("//")) {
            return "https:" + candidate;
        }

        String path = LEADING_PATH_NOISE.matcher(candidate).replaceFirst("");
        return path.isEmpty() ? SITE_ROOT : SITE_ROOT + "/" + path;
    }
}