import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

/**
 * A single crawled entry: title, body text, link, source label and date.
 *
 * <p>Title, content and URL are fixed at construction time; author and
 * publish date can be changed afterwards through their setters.
 */
public class Article {
    private final String title;
    private final String content;
    private final String url;
    private String author;
    private LocalDate publishDate;

    /**
     * Creates an article.
     *
     * @param title       headline text
     * @param content     body text (the crawlers in this file pass an empty string)
     * @param url         link to the article
     * @param author      author or source label
     * @param publishDate date associated with the article
     */
    public Article(String title, String content, String url, String author, LocalDate publishDate) {
        this.title = title;
        this.content = content;
        this.url = url;
        this.author = author;
        this.publishDate = publishDate;
    }

    /** @return the headline text */
    public String getTitle() {
        return title;
    }

    /** @return the body text */
    public String getContent() {
        return content;
    }

    /** @return the article link */
    public String getUrl() {
        return url;
    }

    /** @return the author or source label */
    public String getAuthor() {
        return author;
    }

    /** @param author the new author or source label */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the date associated with the article */
    public LocalDate getPublishDate() {
        return publishDate;
    }

    /** @param publishDate the new date */
    public void setPublishDate(LocalDate publishDate) {
        this.publishDate = publishDate;
    }

    /** Renders title, author, date and link on one line (content is omitted). */
    @Override
    public String toString() {
        return "标题:" + title + " | 作者:" + author + " | 发布日期:" + publishDate + " | 链接:" + url;
    }
}

/** Strategy for fetching a list of articles from one source. */
interface CrawlStrategy {
    /**
     * Fetches articles from the source.
     *
     * @return the crawled articles; the implementations in this file return an
     *         empty (or partially filled) list on failure rather than throwing
     */
    List<Article> crawl();
}

/** Crawls the Baidu realtime hot-search board. */
class BaiduCrawl implements CrawlStrategy {
    private static final String SITE_ROOT = "https://top.baidu.com";
    private static final String BOARD_URL = SITE_ROOT + "/board?tab=realtime";
    // Compound selector: the element must carry BOTH classes. Passing the two
    // names as one space-separated string to getElementsByClass only matches
    // when the class attribute is exactly that string.
    private static final String ITEM_SELECTOR = ".category-wrap_iQLoo.horizontal_1eKyQ";
    private static final int MAX_ITEMS = 8;

    /**
     * Fetches up to {@value #MAX_ITEMS} entries from the board.
     *
     * @return the entries collected before any failure; never {@code null}
     */
    @Override
    public List<Article> crawl() {
        List<Article> articleList = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(BOARD_URL)
                    .userAgent("Mozilla/5.0")
                    .timeout(5000)
                    .get();
            Elements items = doc.select(ITEM_SELECTOR);
            for (Element item : items) {
                if (articleList.size() >= MAX_ITEMS) {
                    break;
                }
                String title = item.getElementsByClass("c-single-text-ellipsis").text();
                // "abs:href" resolves the link against the page's base URI, so both
                // relative and already-absolute hrefs yield a valid URL. Blindly
                // prefixing the site root corrupts absolute links.
                String fullUrl = item.select("a").attr("abs:href");
                if (fullUrl.isEmpty()) {
                    // No usable link on this item: fall back to the site root.
                    fullUrl = SITE_ROOT;
                }
                articleList.add(new Article(title, "", fullUrl, "百度热搜", LocalDate.now()));
            }
        } catch (IOException | RuntimeException e) {
            // Best effort: report and return whatever was collected so far.
            System.out.println("百度热搜爬取失败");
            System.err.println("BaiduCrawl failed: " + e);
        }
        return articleList;
    }
}

/** Crawls politics headlines from a People's Daily review page. */
class PeopleCrawl implements CrawlStrategy {
    // NOTE(review): the page is pinned to one specific date (20260514) while the
    // resulting articles are stamped with LocalDate.now() - confirm this is intended.
    private static final String PAGE_URL = "http://www.people.com.cn/GB/59476/review/20260514.html";
    private static final String LINK_SELECTOR = "a[href^=http://politics.people.com.cn]";
    private static final int MAX_ITEMS = 5;

    /**
     * Fetches up to {@value #MAX_ITEMS} links with non-empty text.
     *
     * @return the entries collected before any failure; never {@code null}
     */
    @Override
    public List<Article> crawl() {
        List<Article> articleList = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(PAGE_URL)
                    .userAgent("Mozilla/5.0")
                    .timeout(5000)
                    .get();
            Elements links = doc.select(LINK_SELECTOR);
            for (Element link : links) {
                if (articleList.size() >= MAX_ITEMS) {
                    break;
                }
                String title = link.text();
                if (title.isEmpty()) {
                    // Links without text do not count toward the limit.
                    continue;
                }
                String url = link.attr("href");
                articleList.add(new Article(title, "", url, "人民网", LocalDate.now()));
            }
        } catch (IOException | RuntimeException e) {
            // Best effort: report and return whatever was collected so far.
            System.out.println("人民网新闻爬取失败");
            System.err.println("PeopleCrawl failed: " + e);
        }
        return articleList;
    }
}

/** Maps a source name to its {@link CrawlStrategy}. */
class StrategyFactory {
    private StrategyFactory() {
        // Static factory only; not instantiable.
    }

    /**
     * Looks up a strategy by name, ignoring case.
     *
     * @param type {@code "baidu"} or {@code "people"}
     * @return the matching strategy, or {@code null} when {@code type} is
     *         {@code null} or not recognised
     */
    public static CrawlStrategy getCrawlStrategy(String type) {
        if ("baidu".equalsIgnoreCase(type)) {
            return new BaiduCrawl();
        } else if ("people".equalsIgnoreCase(type)) {
            return new PeopleCrawl();
        }
        return null;
    }
}

/** In-memory store that accumulates crawled articles in insertion order. */
class ArticleRepository {
    private final List<Article> allArticles = new ArrayList<>();

    /**
     * Appends the given articles to the store.
     *
     * @param list articles to add; {@code null} is treated as empty
     */
    public void saveArticles(List<Article> list) {
        if (list != null) {
            allArticles.addAll(list);
        }
    }

    /**
     * @return a read-only view of all stored articles; it reflects later
     *         saves, but cannot be used to modify the store
     */
    public List<Article> getAllArticles() {
        return Collections.unmodifiableList(allArticles);
    }
}

/** A unit of work that can be executed. */
interface Command {
    void execute();
}

/** Runs one crawl strategy, stores the results and prints them. */
class CrawlCommand implements Command {
    private final CrawlStrategy strategy;
    private final ArticleRepository repository;

    /**
     * @param strategy   the crawler to run
     * @param repository where results are stored
     * @throws NullPointerException if either argument is {@code null}; this
     *         surfaces an unknown strategy name at construction time instead
     *         of inside {@link #execute()}
     */
    public CrawlCommand(CrawlStrategy strategy, ArticleRepository repository) {
        this.strategy = Objects.requireNonNull(strategy, "strategy must not be null");
        this.repository = Objects.requireNonNull(repository, "repository must not be null");
    }

    /** Crawls, saves every result to the repository, then prints each one to stdout. */
    @Override
    public void execute() {
        List<Article> articles = strategy.crawl();
        repository.saveArticles(articles);
        articles.forEach(System.out::println);
    }
}

/** Entry point: crawls both sources, then prints the combined list. */
class CrawlMain {
    public static void main(String[] args) {
        ArticleRepository repository = new ArticleRepository();

        System.out.println("===== 百度实时热搜 =====");
        Command baiduCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("baidu"), repository);
        baiduCmd.execute();

        System.out.println("\n===== 人民网头条 =====");
        Command peopleCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("people"), repository);
        peopleCmd.execute();

        System.out.println("\n===== 全部新闻汇总 =====");
        repository.getAllArticles().forEach(System.out::println);
    }
}