package com.example.datacollect.strategy;

import com.example.datacollect.model.Article;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * {@link CrawlStrategy} for pages whose URL contains {@code baike.baidu.com}.
 *
 * <p>From a parsed page it produces one {@link Article} for the page itself (title plus
 * summary text) followed by one content-less {@link Article} per distinct {@code /item/}
 * link found anywhere in the document.
 */
public class BaiduBaikeStrategy implements CrawlStrategy {

    /** Substring a URL must contain for this strategy to claim it. */
    private static final String HOST_MARKER = "baike.baidu.com";

    /** Candidate selectors for the page heading; the first match in the document is used. */
    private static final String TITLE_SELECTOR = "h1.lemma-title, h1.title-text";

    /** Candidate selectors for the summary container; the first match in the document is used. */
    private static final String SUMMARY_SELECTOR =
            "div.lemma-summary, div.summary-content, div.j-summary";

    /** Anchors whose {@code href} contains {@code /item/}. */
    private static final String RELATED_LINK_SELECTOR = "a[href*='/item/']";

    /**
     * Reports whether this strategy handles the given URL.
     *
     * @param url the URL to test; may be {@code null}
     * @return {@code true} if {@code url} is non-null and contains {@code baike.baidu.com}
     */
    @Override
    public boolean supports(String url) {
        return url != null && url.contains(HOST_MARKER);
    }

    /**
     * Extracts the page's own article and its linked entries.
     *
     * <p>The page article is added only when a non-empty title is found; its content is the
     * trimmed text of the summary element, or an empty string when no summary element
     * matches. Linked entries are emitted in document order with an empty content string.
     * A link is skipped when its text is empty, when its text contains {@code 编辑} (edit)
     * or {@code 分享} (share), when its {@code href} cannot be resolved to an absolute URL,
     * or when an earlier link already produced an entry for the same resolved URL.
     *
     * @param url the URL the document was loaded from; stored as the page article's URL
     * @param doc the parsed page
     * @return a mutable list of extracted articles, possibly empty, never {@code null}
     */
    @Override
    public List<Article> parse(String url, Document doc) {
        List<Article> articles = new ArrayList<>();

        // Look the heading up once and reuse the element for both the null check and the text.
        Element titleEl = doc.selectFirst(TITLE_SELECTOR);
        String title = titleEl != null ? titleEl.text().trim() : "";

        Element contentEl = doc.selectFirst(SUMMARY_SELECTOR);
        String content = contentEl != null ? contentEl.text().trim() : "";

        if (!title.isEmpty()) {
            articles.add(new Article(title, url, content));
        }

        // Tracks resolved link targets so a page that links the same entry repeatedly
        // yields a single Article for it.
        Set<String> seenUrls = new HashSet<>();
        Elements relatedLinks = doc.select(RELATED_LINK_SELECTOR);
        for (Element link : relatedLinks) {
            // "abs:href" resolves against the document base URI and yields "" when the
            // href cannot be made absolute; such links have no usable target.
            String linkUrl = link.attr("abs:href");
            if (linkUrl.isEmpty()) {
                continue;
            }

            String linkTitle = link.text().trim();
            if (linkTitle.isEmpty() || isExcludedLabel(linkTitle)) {
                continue;
            }

            // Mark the URL as seen only once an entry is actually emitted, so a filtered
            // anchor does not suppress a later, usable anchor to the same target.
            if (seenUrls.add(linkUrl)) {
                articles.add(new Article(linkTitle, linkUrl, ""));
            }
        }

        return articles;
    }

    /**
     * Tells whether a link label contains 编辑 (edit) or 分享 (share).
     * NOTE(review): presumably these mark page controls rather than entries; confirm.
     */
    private static boolean isExcludedLabel(String label) {
        return label.contains("编辑") || label.contains("分享");
    }
}