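// NOTE(review): the lines below appear to be repository web-page residue that was
// captured together with this file; they are kept as comments so they cannot break compilation.
//   You cannot select more than 25 topics.
//   Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//   121 lines, 4.7 KiB (this pair of lines appeared twice)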
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
|
|
|
|
public class SimpleCrawler {
|
|
private static final String BASE_URL = "https://www.people.com.cn";
|
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
List<NewsItem> allNews = new ArrayList<>();
|
|
|
|
System.out.println("正在爬取人民网新闻...");
|
|
try {
|
|
allNews.addAll(crawlNews(BASE_URL));
|
|
} catch (Exception e) {
|
|
System.out.println("爬取失败: " + e.getMessage());
|
|
}
|
|
|
|
Set<String> seen = new HashSet<>();
|
|
List<NewsItem> uniqueNews = new ArrayList<>();
|
|
for (NewsItem item : allNews) {
|
|
String key = item.title + "|" + item.url;
|
|
if (!seen.contains(key)) {
|
|
seen.add(key);
|
|
uniqueNews.add(item);
|
|
}
|
|
}
|
|
|
|
Collections.sort(uniqueNews, (a, b) -> Integer.compare(b.hotRank, a.hotRank));
|
|
|
|
int limit = Math.min(500, uniqueNews.size());
|
|
List<NewsItem> topNews = uniqueNews.subList(0, limit);
|
|
|
|
System.out.println("正在导出 " + topNews.size() + " 条新闻到CSV...");
|
|
exportToCSV(topNews, "people_news_500.csv");
|
|
|
|
System.out.println("完成!CSV文件已生成: people_news_500.csv");
|
|
}
|
|
|
|
private static List<NewsItem> crawlNews(String url) throws Exception {
|
|
List<NewsItem> news = new ArrayList<>();
|
|
int rank = 1;
|
|
|
|
Document doc = Jsoup.connect(url)
|
|
.userAgent(USER_AGENT)
|
|
.timeout(30000)
|
|
.get();
|
|
|
|
Elements items = doc.select("a");
|
|
for (Element item : items) {
|
|
String title = item.text().trim();
|
|
String itemUrl = item.attr("abs:href");
|
|
|
|
if (isValidNews(title, itemUrl)) {
|
|
String category = classifyNews(itemUrl);
|
|
news.add(new NewsItem(title, itemUrl, category, rank++));
|
|
}
|
|
}
|
|
return news;
|
|
}
|
|
|
|
private static boolean isValidNews(String title, String url) {
|
|
if (title == null || title.isEmpty() || title.length() < 8) return false;
|
|
if (url == null || url.isEmpty() || !url.startsWith("http")) return false;
|
|
if (!url.contains("people.com.cn")) return false;
|
|
|
|
String[] invalidKeywords = {"图片", "视频", "广告", "关于我们", "联系我们", "隐私政策",
|
|
"免责声明", "网站地图", "京ICP证", "许可证", "下载客户端", "人民日报社概况",
|
|
"地方频道", "信息网络传播", "广播电视节目", "增值电信业务", "互联网新闻信息",
|
|
"网络文化经营", "服务条款", "意见反馈", "设为首页", "加入收藏", "站内搜索"};
|
|
|
|
for (String keyword : invalidKeywords) {
|
|
if (title.contains(keyword)) return false;
|
|
}
|
|
|
|
String[] invalidPaths = {"/img/", "/GB/50142/", "/GB/1018/", "/GB/422044/", "/GB/408835/"};
|
|
for (String path : invalidPaths) {
|
|
if (url.contains(path)) return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private static String classifyNews(String url) {
|
|
if (url.contains("/politics.") || url.contains("/cpc.")) {
|
|
return "时政新闻";
|
|
} else if (url.contains("/finance.") || url.contains("/economy.")) {
|
|
return "财经新闻";
|
|
} else if (url.contains("/health.")) {
|
|
return "健康资讯";
|
|
} else if (url.contains("/ent.") || url.contains("/sports.")) {
|
|
return "文体娱乐";
|
|
} else {
|
|
return "热点资讯";
|
|
}
|
|
}
|
|
|
|
private static void exportToCSV(List<NewsItem> news, String filename) throws Exception {
|
|
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"))) {
|
|
writer.write("标题,链接,分类,热度排名");
|
|
writer.newLine();
|
|
for (NewsItem item : news) {
|
|
writer.write(String.format("\"%s\",\"%s\",\"%s\",%d",
|
|
item.title.replace("\"", "\"\""),
|
|
item.url.replace("\"", "\"\""),
|
|
item.category,
|
|
item.hotRank));
|
|
writer.newLine();
|
|
}
|
|
}
|
|
}
|
|
|
|
static class NewsItem {
|
|
String title, url, category;
|
|
int hotRank;
|
|
NewsItem(String t, String u, String c, int r) { title = t; url = u; category = c; hotRank = r; }
|
|
}
|
|
}
|