java/project/爬虫3/SimpleCrawler.java


								import org.jsoup.Jsoup;

								import org.jsoup.nodes.Document;

								import org.jsoup.nodes.Element;

								import org.jsoup.select.Elements;

								import java.io.*;

								import java.util.*;


								/**
								 * Minimal single-page crawler: fetches {@link #BASE_URL}, collects every anchor
								 * that passes {@link #isValidNews(String, String)}, tags it with a category
								 * derived from its URL, and writes the best-ranked entries to a CSV file.
								 *
								 * <p>Only the one page is fetched; links are not followed.
								 */
								public class SimpleCrawler {

								    private static final String BASE_URL = "https://www.people.com.cn";

								    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

								    /** Timeout handed to jsoup's {@code Connection.timeout}, in milliseconds. */
								    private static final int TIMEOUT_MILLIS = 30000;

								    /** Upper bound on the number of rows written to the CSV file. */
								    private static final int MAX_EXPORTED_ITEMS = 500;

								    /** Anchor texts shorter than this are not treated as headlines. */
								    private static final int MIN_TITLE_LENGTH = 8;

								    private static final String OUTPUT_FILE = "people_news_500.csv";

								    /**
								     * A title containing any of these substrings is rejected
								     * (presumably navigation, footer and licence links - confirm against the live page).
								     */
								    private static final String[] INVALID_TITLE_KEYWORDS = {
								        "图片", "视频", "广告", "关于我们", "联系我们", "隐私政策",
								        "免责声明", "网站地图", "京ICP证", "许可证", "下载客户端", "人民日报社概况",
								        "地方频道", "信息网络传播", "广播电视节目", "增值电信业务", "互联网新闻信息",
								        "网络文化经营", "服务条款", "意见反馈", "设为首页", "加入收藏", "站内搜索"
								    };

								    /** A URL containing any of these path fragments is rejected. */
								    private static final String[] INVALID_URL_PATHS = {
								        "/img/", "/GB/50142/", "/GB/1018/", "/GB/422044/", "/GB/408835/"
								    };

								    /**
								     * Crawls the base page, de-duplicates and ranks the results, and exports them.
								     *
								     * <p>A crawl failure is reported on standard output and does not abort the run;
								     * the export then proceeds with whatever was collected (possibly nothing).
								     *
								     * @param args ignored
								     * @throws Exception if the CSV file cannot be written
								     */
								    public static void main(String[] args) throws Exception {
								        List<NewsItem> allNews = new ArrayList<>();

								        System.out.println("正在爬取人民网新闻...");
								        try {
								            allNews.addAll(crawlNews(BASE_URL));
								        } catch (Exception e) {
								            // Best-effort by design: report the failure and still produce a CSV.
								            System.out.println("爬取失败: " + e.getMessage());
								        }

								        List<NewsItem> topNews = selectTopNews(allNews, MAX_EXPORTED_ITEMS);

								        System.out.println("正在导出 " + topNews.size() + " 条新闻到CSV...");
								        exportToCSV(topNews, OUTPUT_FILE);

								        System.out.println("完成！CSV文件已生成: " + OUTPUT_FILE);
								    }

								    /**
								     * Removes duplicates and returns at most {@code limit} items, best rank first.
								     *
								     * <p>Two items are duplicates when both title and URL match; the first occurrence
								     * is kept. Items are ordered by ascending {@code hotRank} because
								     * {@link #crawlNews(String)} gives rank 1 to the first qualifying link, so the
								     * smallest rank numbers are the ones that must survive the cut-off.
								     *
								     * @param allNews crawled items, possibly containing duplicates
								     * @param limit   maximum number of items to return
								     * @return a new list holding the selected items
								     */
								    static List<NewsItem> selectTopNews(List<NewsItem> allNews, int limit) {
								        Set<String> seen = new HashSet<>();
								        List<NewsItem> uniqueNews = new ArrayList<>();
								        for (NewsItem item : allNews) {
								            // Set.add returns false when the key was already present.
								            if (seen.add(item.title + "|" + item.url)) {
								                uniqueNews.add(item);
								            }
								        }

								        uniqueNews.sort(Comparator.comparingInt(item -> item.hotRank));

								        int count = Math.min(Math.max(limit, 0), uniqueNews.size());
								        return new ArrayList<>(uniqueNews.subList(0, count));
								    }

								    /**
								     * Fetches {@code url} and turns every acceptable anchor into a {@link NewsItem}.
								     *
								     * <p>Ranks start at 1 and increase in document order, counting only anchors
								     * that pass {@link #isValidNews(String, String)}.
								     *
								     * @param url page to fetch
								     * @return accepted items in document order
								     * @throws IOException if the page cannot be fetched
								     */
								    private static List<NewsItem> crawlNews(String url) throws IOException {
								        List<NewsItem> news = new ArrayList<>();
								        int rank = 1;

								        Document doc = Jsoup.connect(url)
								                .userAgent(USER_AGENT)
								                .timeout(TIMEOUT_MILLIS)
								                .get();

								        Elements anchors = doc.select("a");
								        for (Element anchor : anchors) {
								            String title = anchor.text().trim();
								            // "abs:" asks jsoup for the href resolved against the document's base URI.
								            String itemUrl = anchor.attr("abs:href");

								            if (isValidNews(title, itemUrl)) {
								                news.add(new NewsItem(title, itemUrl, classifyNews(itemUrl), rank++));
								            }
								        }
								        return news;
								    }

								    /**
								     * Decides whether an anchor looks like a news headline worth keeping.
								     *
								     * @param title anchor text
								     * @param url   absolute link target
								     * @return {@code true} when the title has at least {@link #MIN_TITLE_LENGTH}
								     *         characters, the URL starts with {@code http} and contains
								     *         {@code people.com.cn}, and neither matches an exclusion list
								     */
								    private static boolean isValidNews(String title, String url) {
								        if (title == null || title.length() < MIN_TITLE_LENGTH) {
								            return false;
								        }
								        if (url == null || !url.startsWith("http") || !url.contains("people.com.cn")) {
								            return false;
								        }
								        return !containsAny(title, INVALID_TITLE_KEYWORDS)
								                && !containsAny(url, INVALID_URL_PATHS);
								    }

								    /** Returns {@code true} if {@code text} contains at least one of {@code fragments}. */
								    private static boolean containsAny(String text, String[] fragments) {
								        for (String fragment : fragments) {
								            if (text.contains(fragment)) {
								                return true;
								            }
								        }
								        return false;
								    }

								    /**
								     * Maps a URL to a category label by looking for fixed substrings such as
								     * {@code "/politics."}; the first matching rule wins.
								     *
								     * @param url absolute link target
								     * @return the category label, or the general label when no rule matches
								     */
								    private static String classifyNews(String url) {
								        if (url.contains("/politics.") || url.contains("/cpc.")) {
								            return "时政新闻";
								        }
								        if (url.contains("/finance.") || url.contains("/economy.")) {
								            return "财经新闻";
								        }
								        if (url.contains("/health.")) {
								            return "健康资讯";
								        }
								        if (url.contains("/ent.") || url.contains("/sports.")) {
								            return "文体娱乐";
								        }
								        return "热点资讯";
								    }

								    /**
								     * Writes a header line followed by one row per item, encoded as UTF-8.
								     *
								     * <p>The three text columns are always quoted, with embedded quotes doubled;
								     * the rank is written as a bare integer. An existing file is overwritten.
								     *
								     * @param news     rows to write, in output order
								     * @param filename path of the file to create
								     * @throws IOException if the file cannot be opened or written
								     */
								    private static void exportToCSV(List<NewsItem> news, String filename) throws IOException {
								        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
								                new FileOutputStream(filename), java.nio.charset.StandardCharsets.UTF_8))) {
								            writer.write("标题,链接,分类,热度排名");
								            writer.newLine();
								            for (NewsItem item : news) {
								                writer.write(csvQuote(item.title) + ","
								                        + csvQuote(item.url) + ","
								                        + csvQuote(item.category) + ","
								                        + item.hotRank);
								                writer.newLine();
								            }
								        }
								    }

								    /** Wraps {@code value} in double quotes, doubling any quote it contains. */
								    private static String csvQuote(String value) {
								        return "\"" + value.replace("\"", "\"\"") + "\"";
								    }

								    /** One crawled headline: its text, link, category label and rank on the page. */
								    static class NewsItem {
								        final String title;
								        final String url;
								        final String category;
								        /** 1-based position among accepted links; a smaller number means earlier on the page. */
								        final int hotRank;

								        NewsItem(String t, String u, String c, int r) {
								            title = t;
								            url = u;
								            category = c;
								            hotRank = r;
								        }
								    }
								}