You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

121 lines
4.7 KiB

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.*;
/**
 * Minimal crawler for the People's Daily Online front page.
 *
 * <p>The program fetches {@link #BASE_URL}, treats every anchor that passes
 * {@link #isValidNews(String, String)} as a news item, removes duplicates, keeps at most
 * {@link #MAX_EXPORTED} items ordered by rank, and writes them to {@link #OUTPUT_FILE} as CSV.
 */
public class SimpleCrawler {
    private static final String BASE_URL = "https://www.people.com.cn";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

    /** Name of the CSV file written by {@link #main(String[])}. */
    private static final String OUTPUT_FILE = "people_news_500.csv";
    /** Upper bound on the number of rows exported. */
    private static final int MAX_EXPORTED = 500;
    /** Anchors whose text is shorter than this are not considered headlines. */
    private static final int MIN_TITLE_LENGTH = 8;
    /** Connection/read timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /** Link texts containing any of these fragments are rejected (navigation, legal, footer links). */
    private static final String[] INVALID_TITLE_KEYWORDS = {
        "图片", "视频", "广告", "关于我们", "联系我们", "隐私政策",
        "免责声明", "网站地图", "京ICP证", "许可证", "下载客户端", "人民日报社概况",
        "地方频道", "信息网络传播", "广播电视节目", "增值电信业务", "互联网新闻信息",
        "网络文化经营", "服务条款", "意见反馈", "设为首页", "加入收藏", "站内搜索"
    };

    /** URLs containing any of these path fragments are rejected. */
    private static final String[] INVALID_URL_PATHS = {
        "/img/", "/GB/50142/", "/GB/1018/", "/GB/422044/", "/GB/408835/"
    };

    /**
     * Crawls the front page, de-duplicates the results and exports the best-ranked items to CSV.
     *
     * <p>A crawl failure is reported on standard output and does not abort the run; in that
     * case an (almost) empty CSV containing only the header row is still written.
     *
     * @param args ignored
     * @throws Exception if the CSV file cannot be written
     */
    public static void main(String[] args) throws Exception {
        List<NewsItem> allNews = new ArrayList<>();
        System.out.println("正在爬取人民网新闻...");
        try {
            allNews.addAll(crawlNews(BASE_URL));
        } catch (Exception e) {
            // Top-level boundary: crawling is best-effort, so report and continue.
            System.out.println("爬取失败: " + e.getMessage());
        }

        List<NewsItem> uniqueNews = deduplicate(allNews);

        // Rank 1 is the first link found on the page, so ascending order puts the
        // best-ranked items first and the cut-off below keeps the top of the ranking.
        uniqueNews.sort(Comparator.comparingInt((NewsItem item) -> item.hotRank));

        int limit = Math.min(MAX_EXPORTED, uniqueNews.size());
        List<NewsItem> topNews = uniqueNews.subList(0, limit);

        System.out.println("正在导出 " + topNews.size() + " 条新闻到CSV...");
        exportToCSV(topNews, OUTPUT_FILE);
        System.out.println("完成!CSV文件已生成: " + OUTPUT_FILE);
    }

    /**
     * Returns the items of {@code items} with duplicates removed, preserving encounter order.
     * Two items are duplicates when both their title and their URL are equal.
     */
    private static List<NewsItem> deduplicate(List<NewsItem> items) {
        Set<String> seen = new HashSet<>();
        List<NewsItem> unique = new ArrayList<>();
        for (NewsItem item : items) {
            // Set.add returns false when the key was already present.
            if (seen.add(item.title + "|" + item.url)) {
                unique.add(item);
            }
        }
        return unique;
    }

    /**
     * Downloads {@code url} and turns every acceptable anchor into a {@link NewsItem}.
     *
     * <p>Ranks are assigned in document order starting at 1; rejected anchors do not
     * consume a rank.
     *
     * @param url page to fetch
     * @return the accepted news items in document order
     * @throws IOException if the page cannot be fetched or parsed
     */
    private static List<NewsItem> crawlNews(String url) throws IOException {
        Document doc = Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MILLIS)
                .get();

        List<NewsItem> news = new ArrayList<>();
        int rank = 1;
        for (Element link : doc.select("a")) {
            String title = link.text().trim();
            // "abs:" asks jsoup for the href resolved against the document's base URI.
            String itemUrl = link.attr("abs:href");
            if (isValidNews(title, itemUrl)) {
                news.add(new NewsItem(title, itemUrl, classifyNews(itemUrl), rank++));
            }
        }
        return news;
    }

    /**
     * Decides whether an anchor looks like a news headline on the target site.
     *
     * @param title anchor text
     * @param url   absolute link target
     * @return {@code true} if the title is long enough, the URL is an http(s) link mentioning
     *         {@code people.com.cn}, and neither matches the exclusion lists
     */
    private static boolean isValidNews(String title, String url) {
        if (title == null || title.length() < MIN_TITLE_LENGTH) {
            return false;
        }
        if (url == null || !url.startsWith("http")) {
            return false;
        }
        // NOTE(review): this is a substring match on the whole URL, so a foreign host whose
        // path or query merely mentions the domain is accepted as well.
        if (!url.contains("people.com.cn")) {
            return false;
        }
        for (String keyword : INVALID_TITLE_KEYWORDS) {
            if (title.contains(keyword)) {
                return false;
            }
        }
        for (String path : INVALID_URL_PATHS) {
            if (url.contains(path)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Maps a URL to a category label based on the channel fragment it contains.
     *
     * @param url absolute link target
     * @return the category label; {@code "热点资讯"} when no known channel fragment matches
     */
    private static String classifyNews(String url) {
        if (url.contains("/politics.") || url.contains("/cpc.")) {
            return "时政新闻";
        }
        if (url.contains("/finance.") || url.contains("/economy.")) {
            return "财经新闻";
        }
        if (url.contains("/health.")) {
            return "健康资讯";
        }
        if (url.contains("/ent.") || url.contains("/sports.")) {
            return "文体娱乐";
        }
        return "热点资讯";
    }

    /**
     * Writes {@code news} to {@code filename} as UTF-8 CSV with a header row.
     * Text columns are always quoted; the rank column is written as a bare integer.
     *
     * @param news     rows to write, in output order
     * @param filename path of the file to create or overwrite
     * @throws IOException if the file cannot be opened or written
     */
    private static void exportToCSV(List<NewsItem> news, String filename) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(filename), java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write("标题,链接,分类,热度排名");
            writer.newLine();
            for (NewsItem item : news) {
                writer.write(String.join(",",
                        csvQuote(item.title),
                        csvQuote(item.url),
                        csvQuote(item.category),
                        Integer.toString(item.hotRank)));
                writer.newLine();
            }
        }
    }

    /** Wraps {@code value} in double quotes, doubling any embedded double quote. */
    private static String csvQuote(String value) {
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /** One crawled headline: its text, link, category label and position-based rank. */
    static class NewsItem {
        final String title;
        final String url;
        final String category;
        /** 1-based position among the accepted links of the crawled page. */
        final int hotRank;

        NewsItem(String t, String u, String c, int r) {
            title = t;
            url = u;
            category = c;
            hotRank = r;
        }
    }
}