Browse Source

添加 'w10/Article.java'

main
dengxitong 3 weeks ago
parent
commit
b1ecb74277
  1. 153
      w10/Article.java

153
w10/Article.java

@ -0,0 +1,153 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
public class Article {
private String title;
private String content;
private String url;
private String author;
private LocalDate publishDate;
public Article(String title, String content, String url, String author, LocalDate publishDate) {
this.title = title;
this.content = content;
this.url = url;
this.author = author;
this.publishDate = publishDate;
}
public String getAuthor() { return author; }
public void setAuthor(String author) { this.author = author; }
public LocalDate getPublishDate() { return publishDate; }
public void setPublishDate(LocalDate publishDate) { this.publishDate = publishDate; }
@Override
public String toString() {
return "标题:" + title +
" | 作者:" + author +
" | 发布日期:" + publishDate +
" | 链接:" + url;
}
}
/**
 * Strategy abstraction for one crawl source.
 * Each implementation decides which site to fetch and how to turn it into articles.
 */
interface CrawlStrategy {

    /**
     * Runs the crawl.
     *
     * @return the articles that were collected
     */
    List<Article> crawl();
}
/**
 * Crawl strategy for the Baidu real-time hot-search board.
 * Collects up to {@link #MAX_ITEMS} entries; on any failure it reports the error
 * and returns whatever was collected so far (possibly an empty list).
 */
class BaiduCrawl implements CrawlStrategy {
    private static final String BOARD_URL = "https://top.baidu.com/board?tab=realtime";
    private static final String SOURCE_NAME = "百度热搜";
    private static final int MAX_ITEMS = 8;

    /**
     * Fetches the board page and converts its entries into articles.
     *
     * @return the collected articles, never {@code null}
     */
    @Override
    public List<Article> crawl() {
        List<Article> articleList = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(BOARD_URL)
                    .userAgent("Mozilla/5.0")
                    .timeout(5000)
                    .get();
            // getElementsByClass takes a single class name; a CSS selector is needed
            // to require both classes regardless of their order or extra classes.
            Elements items = doc.select(".category-wrap_iQLoo.horizontal_1eKyQ");
            for (Element item : items) {
                if (articleList.size() >= MAX_ITEMS) {
                    break;
                }
                String title = item.getElementsByClass("c-single-text-ellipsis").text();
                if (title.isEmpty()) {
                    // Same rule as PeopleCrawl: entries without a headline are not articles.
                    continue;
                }
                // "abs:href" resolves relative links against the page URL and leaves
                // already-absolute links intact, instead of blindly prefixing the host.
                String fullUrl = item.select("a[href]").attr("abs:href");
                articleList.add(new Article(title, "", fullUrl, SOURCE_NAME, LocalDate.now()));
            }
        } catch (Exception e) {
            // Best-effort crawl: keep going, but do not hide why it failed.
            System.out.println("百度热搜爬取失败: " + e);
        }
        return articleList;
    }
}
/**
 * Crawl strategy for a dated review page on people.com.cn.
 * Collects up to {@link #MAX_ITEMS} links that point at politics.people.com.cn;
 * on any failure it reports the error and returns whatever was collected so far.
 */
class PeopleCrawl implements CrawlStrategy {
    /**
     * Day of the review page being crawled. The page URL and the publish date of the
     * resulting articles are both derived from this single value so they cannot drift apart.
     * NOTE(review): assumes the articles listed on a dated review page belong to that day - confirm.
     */
    private static final LocalDate REVIEW_DATE = LocalDate.of(2026, 5, 14);
    private static final String REVIEW_URL = "http://www.people.com.cn/GB/59476/review/"
            + REVIEW_DATE.format(java.time.format.DateTimeFormatter.BASIC_ISO_DATE) + ".html";
    private static final String SOURCE_NAME = "人民网";
    private static final int MAX_ITEMS = 5;

    /**
     * Fetches the review page and converts its politics links into articles.
     *
     * @return the collected articles, never {@code null}
     */
    @Override
    public List<Article> crawl() {
        List<Article> articleList = new ArrayList<>();
        try {
            Document doc = Jsoup.connect(REVIEW_URL)
                    .userAgent("Mozilla/5.0")
                    .timeout(5000)
                    .get();
            Elements links = doc.select("a[href^=http://politics.people.com.cn]");
            for (Element link : links) {
                if (articleList.size() >= MAX_ITEMS) {
                    break;
                }
                String title = link.text();
                if (title.isEmpty()) {
                    // Anchors without visible text carry no headline.
                    continue;
                }
                String url = link.attr("href");
                articleList.add(new Article(title, "", url, SOURCE_NAME, REVIEW_DATE));
            }
        } catch (Exception e) {
            // Best-effort crawl: keep going, but do not hide why it failed.
            System.out.println("人民网新闻爬取失败: " + e);
        }
        return articleList;
    }
}
/**
 * Maps a source name to the crawl strategy that handles it.
 */
class StrategyFactory {

    /**
     * Looks up a strategy by name, ignoring case.
     *
     * @param type source name; {@code "baidu"} and {@code "people"} are recognised
     * @return a new strategy instance, or {@code null} when {@code type} is
     *         {@code null} or not recognised
     */
    public static CrawlStrategy getCrawlStrategy(String type) {
        if ("baidu".equalsIgnoreCase(type)) {
            return new BaiduCrawl();
        }
        return "people".equalsIgnoreCase(type) ? new PeopleCrawl() : null;
    }
}
/**
 * In-memory store that accumulates every article handed to it.
 */
class ArticleRepository {
    private final List<Article> allArticles = new ArrayList<>();

    /**
     * Appends the given articles to the store.
     *
     * @param list articles to add; must not be {@code null}
     */
    public void saveArticles(List<Article> list) {
        allArticles.addAll(list);
    }

    /**
     * Returns the stored articles in insertion order.
     *
     * @return a snapshot copy, so callers cannot modify the repository's
     *         internal list through the returned reference
     */
    public List<Article> getAllArticles() {
        return new ArrayList<>(allArticles);
    }
}
/**
 * A unit of work that can be triggered without the caller knowing what it does.
 */
interface Command {

    /** Performs the work represented by this command. */
    void execute();
}
/**
 * Command that runs one crawl strategy, stores the result in a repository
 * and prints each crawled article.
 */
class CrawlCommand implements Command {
    private final CrawlStrategy strategy;
    private final ArticleRepository repository;

    /**
     * Creates the command.
     *
     * @param strategy   the crawl to run; must not be {@code null}
     * @param repository the store that receives the crawled articles; must not be {@code null}
     * @throws NullPointerException if either argument is {@code null}
     */
    public CrawlCommand(CrawlStrategy strategy, ArticleRepository repository) {
        // StrategyFactory.getCrawlStrategy returns null for unrecognised types; reject that
        // here with a clear message instead of failing later inside execute().
        this.strategy = java.util.Objects.requireNonNull(strategy, "strategy must not be null");
        this.repository = java.util.Objects.requireNonNull(repository, "repository must not be null");
    }

    /** Runs the crawl, saves the articles, then prints them one per line. */
    @Override
    public void execute() {
        List<Article> articles = strategy.crawl();
        repository.saveArticles(articles);
        articles.forEach(System.out::println);
    }
}
/**
 * Entry point: crawls each source in turn, then prints everything that was stored.
 */
class CrawlMain {

    /**
     * Runs the Baidu crawl, the People crawl, and finally a combined listing.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ArticleRepository store = new ArticleRepository();

        runSection("===== 百度实时热搜 =====", "baidu", store);
        runSection("\n===== 人民网头条 =====", "people", store);

        System.out.println("\n===== 全部新闻汇总 =====");
        for (Article article : store.getAllArticles()) {
            System.out.println(article);
        }
    }

    /** Prints a section header, then builds and executes the crawl command for one source. */
    private static void runSection(String header, String sourceType, ArticleRepository store) {
        System.out.println(header);
        CrawlStrategy strategy = StrategyFactory.getCrawlStrategy(sourceType);
        Command command = new CrawlCommand(strategy, store);
        command.execute();
    }
}
Loading…
Cancel
Save