Compare commits
1 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
74a2cd6319 | 3 weeks ago |
3 changed files with 397 additions and 0 deletions
Binary file not shown.
@ -0,0 +1,149 @@ |
|||||
|
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
||||
|
|
||||
|
/** |
||||
|
* 中国非物质文化遗产爬虫 |
||||
|
* 爬取 Wikipedia 上的国家级非遗项目列表(1557项) |
||||
|
*/ |
||||
|
public class IntangibleHeritageCrawler { |
||||
|
|
||||
|
// Wikipedia 非遗列表页面
|
||||
|
private static final String WIKIPEDIA_URL = "https://zh.wikipedia.org/wiki/国家级非物质文化遗产代表性项目名录"; |
||||
|
// 输出文件路径
|
||||
|
private static final String OUTPUT_FILE = "intangible_heritage.csv"; |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
System.out.println("开始爬取国家级非物质文化遗产项目数据..."); |
||||
|
System.out.println("目标:1557个非遗项目\n"); |
||||
|
|
||||
|
int totalCount = 0; |
||||
|
|
||||
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE))) { |
||||
|
// 写入CSV表头
|
||||
|
writer.write("项目名称,类别,地区,简介"); |
||||
|
writer.newLine(); |
||||
|
|
||||
|
System.out.println("正在访问 Wikipedia 页面..."); |
||||
|
|
||||
|
Document doc = Jsoup.connect(WIKIPEDIA_URL) |
||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") |
||||
|
.timeout(30000) |
||||
|
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") |
||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9") |
||||
|
.followRedirects(true) |
||||
|
.get(); |
||||
|
|
||||
|
System.out.println("页面标题:" + doc.title()); |
||||
|
|
||||
|
// Wikipedia 的表格通常有特定的class
|
||||
|
Elements tables = doc.select("table.wikitable, table.sortable"); |
||||
|
System.out.println("找到 " + tables.size() + " 个表格\n"); |
||||
|
|
||||
|
// 遍历所有表格
|
||||
|
for (Element table : tables) { |
||||
|
// 提取表格标题(类别)
|
||||
|
String category = ""; |
||||
|
Element caption = table.selectFirst("caption"); |
||||
|
if (caption != null) { |
||||
|
category = caption.text().trim(); |
||||
|
} |
||||
|
|
||||
|
// 提取表格行
|
||||
|
Elements rows = table.select("tr"); |
||||
|
System.out.println("表格:" + category + " - 共 " + rows.size() + " 行"); |
||||
|
|
||||
|
int tableCount = 0; |
||||
|
for (Element row : rows) { |
||||
|
try { |
||||
|
// 提取单元格
|
||||
|
Elements cells = row.select("td"); |
||||
|
|
||||
|
if (cells.size() >= 2) { |
||||
|
// 第一列通常是项目名称
|
||||
|
String name = cells.get(0).text().trim(); |
||||
|
|
||||
|
// 第二列通常是地区
|
||||
|
String region = cells.get(1).text().trim(); |
||||
|
|
||||
|
// 如果有第三列,可能是简介或批次
|
||||
|
String description = ""; |
||||
|
if (cells.size() >= 3) { |
||||
|
description = cells.get(2).text().trim(); |
||||
|
} |
||||
|
|
||||
|
// 清理数据
|
||||
|
name = cleanCsvField(name); |
||||
|
category = cleanCsvField(category); |
||||
|
region = cleanCsvField(region); |
||||
|
description = cleanCsvField(description); |
||||
|
|
||||
|
// 如果项目名称不为空且不是表头,则写入CSV
|
||||
|
if (!name.isEmpty() && |
||||
|
!name.equals("项目名称") && |
||||
|
!name.equals("名称") && |
||||
|
!name.equals("序号") && |
||||
|
name.length() > 1) { |
||||
|
|
||||
|
writer.write(String.format("%s,%s,%s,%s", name, category, region, description)); |
||||
|
writer.newLine(); |
||||
|
tableCount++; |
||||
|
totalCount++; |
||||
|
|
||||
|
if (totalCount % 100 == 0) { |
||||
|
System.out.println(" 已爬取 " + totalCount + " 个项目..."); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
System.err.println(" 解析行时出错:" + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
System.out.println(" 该表格爬取完成!共 " + tableCount + " 个项目\n"); |
||||
|
|
||||
|
// 延时,避免请求过快
|
||||
|
Thread.sleep(1000); |
||||
|
} |
||||
|
|
||||
|
System.out.println("\n========================================"); |
||||
|
System.out.println("全部爬取完成!"); |
||||
|
System.out.println("共爬取 " + totalCount + " 个非遗项目"); |
||||
|
System.out.println("目标:1557个项目"); |
||||
|
System.out.println("完成率:" + String.format("%.2f", (totalCount / 1557.0) * 100) + "%"); |
||||
|
System.out.println("========================================"); |
||||
|
System.out.println("数据已保存到:" + OUTPUT_FILE); |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
System.err.println("爬取失败:" + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 清理CSV字段中的特殊字符 |
||||
|
*/ |
||||
|
private static String cleanCsvField(String field) { |
||||
|
if (field == null) { |
||||
|
return ""; |
||||
|
} |
||||
|
// 移除换行符和制表符
|
||||
|
field = field.replace("\n", " ").replace("\r", " ").replace("\t", " "); |
||||
|
// 移除引用标记
|
||||
|
field = field.replace("[", "").replace("]", ""); |
||||
|
// 如果包含逗号,用双引号包裹
|
||||
|
if (field.contains(",")) { |
||||
|
field = "\"" + field.replace("\"", "\"\"") + "\""; |
||||
|
} |
||||
|
return field; |
||||
|
} |
||||
|
} |
||||
|
Loading…
Reference in new issue