You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

145 lines
4.7 KiB

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Generic base class for single-page crawlers.
 *
 * <p>Bundles the shared plumbing (HTTP fetch, CSV header, CSV field cleaning, throttling,
 * progress and result reporting) and declares {@link #parseData} as the abstract hook in which
 * subclasses implement the page-specific extraction.
 */
public abstract class BaseCrawler {

    /** User-Agent header sent with every request. */
    protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

    /** Accept header sent with every request. */
    protected static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";

    /** Accept-Language header sent with every request. */
    protected static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9";

    /** Request timeout passed to jsoup, in milliseconds. */
    protected static final int TIMEOUT = 30000;

    /** Pause used by {@link #delay()}, in milliseconds. */
    protected static final int DELAY_MS = 1000;

    /**
     * Number of items crawled so far. This class only resets it in {@link #run} and reports it in
     * {@link #printResult}; subclasses are expected to update it while parsing.
     */
    protected int totalCount = 0;

    /** Number of items the current run aims for; used only for the completion-rate report. */
    protected int targetCount = 0;

    /**
     * Extracts the data from the fetched page and writes it through {@code writer}.
     * Subclasses must provide the concrete parsing logic.
     *
     * @param doc    the parsed page returned by {@link #fetchDocument}
     * @param writer writer on the output file, positioned after the header row
     * @throws IOException          if writing or a follow-up request fails
     * @throws InterruptedException if the thread is interrupted, e.g. during {@link #delay()}
     */
    protected abstract void parseData(Document doc, BufferedWriter writer) throws IOException, InterruptedException;

    /**
     * Runs the crawler: writes the CSV header, fetches {@code url}, delegates to
     * {@link #parseData} and prints a summary.
     *
     * <p>I/O failures are reported on stderr and not rethrown. On interruption the thread's
     * interrupt flag is restored and the method returns without printing the summary.
     *
     * @param url         target URL
     * @param outputFile  path of the CSV file to (over)write
     * @param targetCount expected number of items (used for the completion rate)
     */
    public void run(String url, String outputFile, int targetCount) {
        this.targetCount = targetCount;
        totalCount = 0;
        System.out.println("开始爬取数据...");
        System.out.println("目标:" + targetCount + "个项目\n");
        // Encode explicitly as UTF-8: FileWriter falls back to the platform default charset
        // (pre-JDK 18), which can silently replace the Chinese text with '?'.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            // CSV header row first.
            writeHeader(writer);
            System.out.println("正在访问页面...");
            // Fetch the page over HTTP.
            Document doc = fetchDocument(url);
            System.out.println("页面标题:" + doc.title());
            // Hand over to the subclass-specific parser.
            parseData(doc, writer);
            // Summary.
            printResult(outputFile);
        } catch (IOException e) {
            System.err.println("爬取失败:" + e.getMessage());
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Fetches and parses the page at {@code url} with the shared request headers and timeout,
     * following redirects.
     *
     * @param url target URL
     * @return the parsed page document
     * @throws IOException on network or HTTP failure
     */
    protected Document fetchDocument(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT)
                .header("Accept", ACCEPT)
                .header("Accept-Language", ACCEPT_LANGUAGE)
                .followRedirects(true)
                .get();
    }

    /**
     * Writes the CSV header row. Subclasses may override this to emit a different header.
     *
     * @param writer writer on the output file
     * @throws IOException if writing fails
     */
    protected void writeHeader(BufferedWriter writer) throws IOException {
        writer.write("项目名称,类别,地区,简介");
        writer.newLine();
    }

    /**
     * Sleeps for {@link #DELAY_MS} milliseconds to throttle requests.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    protected void delay() throws InterruptedException {
        Thread.sleep(DELAY_MS);
    }

    /**
     * Normalizes a value so it can be written as a single CSV field.
     *
     * <p>Line breaks and tabs become spaces, square brackets are removed, and the value is
     * wrapped in double quotes (with embedded quotes doubled) when it contains a comma or a
     * double quote.
     *
     * @param field raw field value, may be {@code null}
     * @return the cleaned field, or an empty string for {@code null}
     */
    protected String cleanCsvField(String field) {
        if (field == null) {
            return "";
        }
        // Keep every record on one physical line.
        field = field.replace("\n", " ").replace("\r", " ").replace("\t", " ");
        // Strip square brackets (citation markers such as "[1]" lose their brackets).
        field = field.replace("[", "").replace("]", "");
        // A bare double quote in an unquoted field is malformed CSV, so quote on either
        // a comma or a double quote, doubling any embedded quotes.
        if (field.contains(",") || field.contains("\"")) {
            field = "\"" + field.replace("\"", "\"\"") + "\"";
        }
        return field;
    }

    /**
     * Prints a progress line whenever {@code count} is a multiple of 100.
     *
     * @param count number of items processed so far
     */
    protected void printProgress(int count) {
        if (count % 100 == 0) {
            System.out.println(" 已爬取 " + count + " 个项目...");
        }
    }

    /**
     * Prints the final summary: crawled count, target, completion rate and output path.
     *
     * @param outputFile path of the written CSV file
     */
    protected void printResult(String outputFile) {
        System.out.println("\n========================================");
        System.out.println("全部爬取完成!");
        System.out.println("共爬取 " + totalCount + " 个项目");
        System.out.println("目标:" + targetCount + "个项目");
        if (targetCount > 0) {
            System.out.println("完成率:" + String.format("%.2f", (totalCount / (double) targetCount) * 100) + "%");
        } else {
            // Without a positive target the ratio is undefined (it used to print NaN/Infinity).
            System.out.println("完成率:N/A");
        }
        System.out.println("========================================");
        System.out.println("数据已保存到:" + outputFile);
    }
}