import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * 中国非物质文化遗产爬虫 * 爬取 Wikipedia 上的国家级非遗项目列表(1557项) */ public class IntangibleHeritageCrawler { // Wikipedia 非遗列表页面 private static final String WIKIPEDIA_URL = "https://zh.wikipedia.org/wiki/国家级非物质文化遗产代表性项目名录"; // 输出文件路径 private static final String OUTPUT_FILE = "intangible_heritage.csv"; public static void main(String[] args) { System.out.println("开始爬取国家级非物质文化遗产项目数据..."); System.out.println("目标:1557个非遗项目\n"); int totalCount = 0; try (BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE))) { // 写入CSV表头 writer.write("项目名称,类别,地区,简介"); writer.newLine(); System.out.println("正在访问 Wikipedia 页面..."); Document doc = Jsoup.connect(WIKIPEDIA_URL) .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") .timeout(30000) .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("Accept-Language", "zh-CN,zh;q=0.9") .followRedirects(true) .get(); System.out.println("页面标题:" + doc.title()); // Wikipedia 的表格通常有特定的class Elements tables = doc.select("table.wikitable, table.sortable"); System.out.println("找到 " + tables.size() + " 个表格\n"); // 遍历所有表格 for (Element table : tables) { // 提取表格标题(类别) String category = ""; Element caption = table.selectFirst("caption"); if (caption != null) { category = caption.text().trim(); } // 提取表格行 Elements rows = table.select("tr"); System.out.println("表格:" + category + " - 共 " + rows.size() + " 行"); int tableCount = 0; for (Element row : rows) { try { // 提取单元格 Elements cells = row.select("td"); if (cells.size() >= 2) { // 第一列通常是项目名称 String name = cells.get(0).text().trim(); // 第二列通常是地区 String region = cells.get(1).text().trim(); // 如果有第三列,可能是简介或批次 String description = ""; if (cells.size() >= 3) { description = cells.get(2).text().trim(); } // 清理数据 name = cleanCsvField(name); category = cleanCsvField(category); region = cleanCsvField(region); description = cleanCsvField(description); // 如果项目名称不为空且不是表头,则写入CSV if (!name.isEmpty() && !name.equals("项目名称") && !name.equals("名称") && !name.equals("序号") && name.length() > 1) { writer.write(String.format("%s,%s,%s,%s", name, category, region, description)); writer.newLine(); tableCount++; totalCount++; if (totalCount % 100 == 0) { System.out.println(" 已爬取 " + totalCount + " 个项目..."); } } } } catch (Exception e) { System.err.println(" 解析行时出错:" + e.getMessage()); } } System.out.println(" 该表格爬取完成!共 " + tableCount + " 个项目\n"); // 延时,避免请求过快 Thread.sleep(1000); } System.out.println("\n========================================"); System.out.println("全部爬取完成!"); System.out.println("共爬取 " + totalCount + " 个非遗项目"); System.out.println("目标:1557个项目"); System.out.println("完成率:" + String.format("%.2f", (totalCount / 1557.0) * 100) + "%"); System.out.println("========================================"); System.out.println("数据已保存到:" + OUTPUT_FILE); } catch (IOException e) { System.err.println("爬取失败:" + e.getMessage()); e.printStackTrace(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } /** * 清理CSV字段中的特殊字符 */ private static String cleanCsvField(String field) { if (field == null) { return ""; } // 移除换行符和制表符 field = field.replace("\n", " ").replace("\r", " ").replace("\t", " "); // 移除引用标记 field = field.replace("[", "").replace("]", ""); // 如果包含逗号,用双引号包裹 if (field.contains(",")) { field = "\"" + field.replace("\"", "\"\"") + "\""; } return field; } }