You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
145 lines
4.7 KiB
145 lines
4.7 KiB
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
|
|
import java.io.BufferedWriter;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
|
|
/**
|
|
* 通用爬虫父类
|
|
* 封装通用功能,定义抽象方法让子类实现具体解析逻辑
|
|
*/
|
|
public abstract class BaseCrawler {
|
|
// 通用请求头设置
|
|
protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
|
|
protected static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
|
|
protected static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9";
|
|
protected static final int TIMEOUT = 30000;
|
|
|
|
// 延时时间(毫秒)
|
|
protected static final int DELAY_MS = 1000;
|
|
|
|
// 进度统计
|
|
protected int totalCount = 0;
|
|
protected int targetCount = 0;
|
|
|
|
/**
|
|
* 抽象方法:解析数据
|
|
* 子类必须实现具体的解析逻辑
|
|
*/
|
|
protected abstract void parseData(Document doc, BufferedWriter writer) throws IOException, InterruptedException;
|
|
|
|
/**
|
|
* 运行爬虫
|
|
* @param url 目标URL
|
|
* @param outputFile 输出文件路径
|
|
* @param targetCount 目标数量(用于进度计算)
|
|
*/
|
|
public void run(String url, String outputFile, int targetCount) {
|
|
this.targetCount = targetCount;
|
|
totalCount = 0;
|
|
|
|
System.out.println("开始爬取数据...");
|
|
System.out.println("目标:" + targetCount + "个项目\n");
|
|
|
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
|
|
// 写入CSV表头
|
|
writeHeader(writer);
|
|
|
|
System.out.println("正在访问页面...");
|
|
|
|
// 发送HTTP请求获取页面
|
|
Document doc = fetchDocument(url);
|
|
|
|
System.out.println("页面标题:" + doc.title());
|
|
|
|
// 调用子类实现的解析方法
|
|
parseData(doc, writer);
|
|
|
|
// 输出结果
|
|
printResult(outputFile);
|
|
|
|
} catch (IOException e) {
|
|
System.err.println("爬取失败:" + e.getMessage());
|
|
e.printStackTrace();
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* 获取页面文档
|
|
* @param url 目标URL
|
|
* @return 页面文档
|
|
* @throws IOException 网络异常
|
|
*/
|
|
protected Document fetchDocument(String url) throws IOException {
|
|
return Jsoup.connect(url)
|
|
.userAgent(USER_AGENT)
|
|
.timeout(TIMEOUT)
|
|
.header("Accept", ACCEPT)
|
|
.header("Accept-Language", ACCEPT_LANGUAGE)
|
|
.followRedirects(true)
|
|
.get();
|
|
}
|
|
|
|
/**
|
|
* 写入CSV表头
|
|
* 子类可以重写此方法以自定义表头
|
|
*/
|
|
protected void writeHeader(BufferedWriter writer) throws IOException {
|
|
writer.write("项目名称,类别,地区,简介");
|
|
writer.newLine();
|
|
}
|
|
|
|
/**
|
|
* 延时防反爬
|
|
*/
|
|
protected void delay() throws InterruptedException {
|
|
Thread.sleep(DELAY_MS);
|
|
}
|
|
|
|
/**
|
|
* 清理CSV字段中的特殊字符
|
|
* @param field 字段值
|
|
* @return 清理后的字段值
|
|
*/
|
|
protected String cleanCsvField(String field) {
|
|
if (field == null) {
|
|
return "";
|
|
}
|
|
// 移除换行符和制表符
|
|
field = field.replace("\n", " ").replace("\r", " ").replace("\t", " ");
|
|
// 移除引用标记
|
|
field = field.replace("[", "").replace("]", "");
|
|
// 如果包含逗号,用双引号包裹
|
|
if (field.contains(",")) {
|
|
field = "\"" + field.replace("\"", "\"\"") + "\"";
|
|
}
|
|
return field;
|
|
}
|
|
|
|
/**
|
|
* 打印进度
|
|
* @param count 当前处理数量
|
|
*/
|
|
protected void printProgress(int count) {
|
|
if (count % 100 == 0) {
|
|
System.out.println(" 已爬取 " + count + " 个项目...");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* 打印结果
|
|
* @param outputFile 输出文件路径
|
|
*/
|
|
protected void printResult(String outputFile) {
|
|
System.out.println("\n========================================");
|
|
System.out.println("全部爬取完成!");
|
|
System.out.println("共爬取 " + totalCount + " 个项目");
|
|
System.out.println("目标:" + targetCount + "个项目");
|
|
System.out.println("完成率:" + String.format("%.2f", (totalCount / (double) targetCount) * 100) + "%");
|
|
System.out.println("========================================");
|
|
System.out.println("数据已保存到:" + outputFile);
|
|
}
|
|
}
|
|
|