diff --git a/w10/Article.java b/w10/Article.java new file mode 100644 index 0000000..9c00a5e --- /dev/null +++ b/w10/Article.java @@ -0,0 +1,153 @@ +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; + +public class Article { + private String title; + private String content; + private String url; + private String author; + private LocalDate publishDate; + + public Article(String title, String content, String url, String author, LocalDate publishDate) { + this.title = title; + this.content = content; + this.url = url; + this.author = author; + this.publishDate = publishDate; + } + + public String getAuthor() { return author; } + public void setAuthor(String author) { this.author = author; } + + public LocalDate getPublishDate() { return publishDate; } + public void setPublishDate(LocalDate publishDate) { this.publishDate = publishDate; } + + @Override + public String toString() { + return "标题:" + title + + " | 作者:" + author + + " | 发布日期:" + publishDate + + " | 链接:" + url; + } +} + +interface CrawlStrategy { + List
crawl(); +} + +class BaiduCrawl implements CrawlStrategy { + @Override + public List
crawl() { + List
articleList = new ArrayList<>(); + try { + Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime") + .userAgent("Mozilla/5.0") + .timeout(5000) + .get(); + Elements items = doc.getElementsByClass("category-wrap_iQLoo horizontal_1eKyQ"); + int count = 0; + for (Element item : items) { + if (count >= 8) break; + String title = item.getElementsByClass("c-single-text-ellipsis").text(); + String link = item.select("a").attr("href"); + String fullUrl = "https://top.baidu.com" + link; + Article article = new Article(title, "", fullUrl, "百度热搜", LocalDate.now()); + articleList.add(article); + count++; + } + } catch (Exception e) { + System.out.println("百度热搜爬取失败"); + } + return articleList; + } +} + +class PeopleCrawl implements CrawlStrategy { + @Override + public List
crawl() { + List
articleList = new ArrayList<>(); + try { + Document doc = Jsoup.connect("http://www.people.com.cn/GB/59476/review/20260514.html") + .userAgent("Mozilla/5.0") + .timeout(5000) + .get(); + Elements links = doc.select("a[href^=http://politics.people.com.cn]"); + int count = 0; + for (Element link : links) { + if (count >= 5) break; + String title = link.text(); + String url = link.attr("href"); + if (!title.isEmpty()) { + Article article = new Article(title, "", url, "人民网", LocalDate.now()); + articleList.add(article); + count++; + } + } + } catch (Exception e) { + System.out.println("人民网新闻爬取失败"); + } + return articleList; + } +} + +class StrategyFactory { + public static CrawlStrategy getCrawlStrategy(String type) { + if ("baidu".equalsIgnoreCase(type)) { + return new BaiduCrawl(); + } else if ("people".equalsIgnoreCase(type)) { + return new PeopleCrawl(); + } + return null; + } +} + +class ArticleRepository { + private final List
allArticles = new ArrayList<>(); + + public void saveArticles(List
list) { + allArticles.addAll(list); + } + + public List
getAllArticles() { + return allArticles; + } +} + +interface Command { + void execute(); +} + +class CrawlCommand implements Command { + private final CrawlStrategy strategy; + private final ArticleRepository repository; + + public CrawlCommand(CrawlStrategy strategy, ArticleRepository repository) { + this.strategy = strategy; + this.repository = repository; + } + + @Override + public void execute() { + List
articles = strategy.crawl(); + repository.saveArticles(articles); + articles.forEach(System.out::println); + } + } + class CrawlMain { + public static void main(String[] args) { + ArticleRepository repository = new ArticleRepository(); + System.out.println("===== 百度实时热搜 ====="); + Command baiduCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("baidu"), repository); + baiduCmd.execute(); + System.out.println("\n===== 人民网头条 ====="); + Command peopleCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("people"), repository); + peopleCmd.execute(); + System.out.println("\n===== 全部新闻汇总 ====="); + repository.getAllArticles().forEach(System.out::println); + } + } \ No newline at end of file