You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
153 lines
4.9 KiB
153 lines
4.9 KiB
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
|
|
|
|
public class Article {
|
|
private String title;
|
|
private String content;
|
|
private String url;
|
|
private String author;
|
|
private LocalDate publishDate;
|
|
|
|
public Article(String title, String content, String url, String author, LocalDate publishDate) {
|
|
this.title = title;
|
|
this.content = content;
|
|
this.url = url;
|
|
this.author = author;
|
|
this.publishDate = publishDate;
|
|
}
|
|
|
|
public String getAuthor() { return author; }
|
|
public void setAuthor(String author) { this.author = author; }
|
|
|
|
public LocalDate getPublishDate() { return publishDate; }
|
|
public void setPublishDate(LocalDate publishDate) { this.publishDate = publishDate; }
|
|
|
|
@Override
|
|
public String toString() {
|
|
return "标题:" + title +
|
|
" | 作者:" + author +
|
|
" | 发布日期:" + publishDate +
|
|
" | 链接:" + url;
|
|
}
|
|
}
|
|
|
|
interface CrawlStrategy {
|
|
List<Article> crawl();
|
|
}
|
|
|
|
class BaiduCrawl implements CrawlStrategy {
|
|
@Override
|
|
public List<Article> crawl() {
|
|
List<Article> articleList = new ArrayList<>();
|
|
try {
|
|
Document doc = Jsoup.connect("https://top.baidu.com/board?tab=realtime")
|
|
.userAgent("Mozilla/5.0")
|
|
.timeout(5000)
|
|
.get();
|
|
Elements items = doc.getElementsByClass("category-wrap_iQLoo horizontal_1eKyQ");
|
|
int count = 0;
|
|
for (Element item : items) {
|
|
if (count >= 8) break;
|
|
String title = item.getElementsByClass("c-single-text-ellipsis").text();
|
|
String link = item.select("a").attr("href");
|
|
String fullUrl = "https://top.baidu.com" + link;
|
|
Article article = new Article(title, "", fullUrl, "百度热搜", LocalDate.now());
|
|
articleList.add(article);
|
|
count++;
|
|
}
|
|
} catch (Exception e) {
|
|
System.out.println("百度热搜爬取失败");
|
|
}
|
|
return articleList;
|
|
}
|
|
}
|
|
|
|
class PeopleCrawl implements CrawlStrategy {
|
|
@Override
|
|
public List<Article> crawl() {
|
|
List<Article> articleList = new ArrayList<>();
|
|
try {
|
|
Document doc = Jsoup.connect("http://www.people.com.cn/GB/59476/review/20260514.html")
|
|
.userAgent("Mozilla/5.0")
|
|
.timeout(5000)
|
|
.get();
|
|
Elements links = doc.select("a[href^=http://politics.people.com.cn]");
|
|
int count = 0;
|
|
for (Element link : links) {
|
|
if (count >= 5) break;
|
|
String title = link.text();
|
|
String url = link.attr("href");
|
|
if (!title.isEmpty()) {
|
|
Article article = new Article(title, "", url, "人民网", LocalDate.now());
|
|
articleList.add(article);
|
|
count++;
|
|
}
|
|
}
|
|
} catch (Exception e) {
|
|
System.out.println("人民网新闻爬取失败");
|
|
}
|
|
return articleList;
|
|
}
|
|
}
|
|
|
|
class StrategyFactory {
|
|
public static CrawlStrategy getCrawlStrategy(String type) {
|
|
if ("baidu".equalsIgnoreCase(type)) {
|
|
return new BaiduCrawl();
|
|
} else if ("people".equalsIgnoreCase(type)) {
|
|
return new PeopleCrawl();
|
|
}
|
|
return null;
|
|
}
|
|
}
|
|
|
|
class ArticleRepository {
|
|
private final List<Article> allArticles = new ArrayList<>();
|
|
|
|
public void saveArticles(List<Article> list) {
|
|
allArticles.addAll(list);
|
|
}
|
|
|
|
public List<Article> getAllArticles() {
|
|
return allArticles;
|
|
}
|
|
}
|
|
|
|
/**
 * A unit of work that can be run on demand.
 */
interface Command {

    /** Runs this command. */
    void execute();
}
|
|
|
|
class CrawlCommand implements Command {
|
|
private final CrawlStrategy strategy;
|
|
private final ArticleRepository repository;
|
|
|
|
public CrawlCommand(CrawlStrategy strategy, ArticleRepository repository) {
|
|
this.strategy = strategy;
|
|
this.repository = repository;
|
|
}
|
|
|
|
@Override
|
|
public void execute() {
|
|
List<Article> articles = strategy.crawl();
|
|
repository.saveArticles(articles);
|
|
articles.forEach(System.out::println);
|
|
}
|
|
}
|
|
class CrawlMain {
|
|
public static void main(String[] args) {
|
|
ArticleRepository repository = new ArticleRepository();
|
|
System.out.println("===== 百度实时热搜 =====");
|
|
Command baiduCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("baidu"), repository);
|
|
baiduCmd.execute();
|
|
System.out.println("\n===== 人民网头条 =====");
|
|
Command peopleCmd = new CrawlCommand(StrategyFactory.getCrawlStrategy("people"), repository);
|
|
peopleCmd.execute();
|
|
System.out.println("\n===== 全部新闻汇总 =====");
|
|
repository.getAllArticles().forEach(System.out::println);
|
|
}
|
|
}
|