import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; /** * 通用爬虫父类 * 封装通用功能,定义抽象方法让子类实现具体解析逻辑 */ public abstract class BaseCrawler { // 通用请求头设置 protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; protected static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; protected static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9"; protected static final int TIMEOUT = 30000; // 延时时间(毫秒) protected static final int DELAY_MS = 1000; // 进度统计 protected int totalCount = 0; protected int targetCount = 0; /** * 抽象方法:解析数据 * 子类必须实现具体的解析逻辑 */ protected abstract void parseData(Document doc, BufferedWriter writer) throws IOException, InterruptedException; /** * 运行爬虫 * @param url 目标URL * @param outputFile 输出文件路径 * @param targetCount 目标数量(用于进度计算) */ public void run(String url, String outputFile, int targetCount) { this.targetCount = targetCount; totalCount = 0; System.out.println("开始爬取数据..."); System.out.println("目标:" + targetCount + "个项目\n"); try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) { // 写入CSV表头 writeHeader(writer); System.out.println("正在访问页面..."); // 发送HTTP请求获取页面 Document doc = fetchDocument(url); System.out.println("页面标题:" + doc.title()); // 调用子类实现的解析方法 parseData(doc, writer); // 输出结果 printResult(outputFile); } catch (IOException e) { System.err.println("爬取失败:" + e.getMessage()); e.printStackTrace(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } /** * 获取页面文档 * @param url 目标URL * @return 页面文档 * @throws IOException 网络异常 */ protected Document fetchDocument(String url) throws IOException { return Jsoup.connect(url) .userAgent(USER_AGENT) .timeout(TIMEOUT) .header("Accept", ACCEPT) .header("Accept-Language", ACCEPT_LANGUAGE) .followRedirects(true) .get(); } /** * 写入CSV表头 * 子类可以重写此方法以自定义表头 */ protected void writeHeader(BufferedWriter writer) throws IOException { writer.write("项目名称,类别,地区,简介"); writer.newLine(); } /** * 延时防反爬 */ protected void delay() throws InterruptedException { Thread.sleep(DELAY_MS); } /** * 清理CSV字段中的特殊字符 * @param field 字段值 * @return 清理后的字段值 */ protected String cleanCsvField(String field) { if (field == null) { return ""; } // 移除换行符和制表符 field = field.replace("\n", " ").replace("\r", " ").replace("\t", " "); // 移除引用标记 field = field.replace("[", "").replace("]", ""); // 如果包含逗号,用双引号包裹 if (field.contains(",")) { field = "\"" + field.replace("\"", "\"\"") + "\""; } return field; } /** * 打印进度 * @param count 当前处理数量 */ protected void printProgress(int count) { if (count % 100 == 0) { System.out.println(" 已爬取 " + count + " 个项目..."); } } /** * 打印结果 * @param outputFile 输出文件路径 */ protected void printResult(String outputFile) { System.out.println("\n========================================"); System.out.println("全部爬取完成!"); System.out.println("共爬取 " + totalCount + " 个项目"); System.out.println("目标:" + targetCount + "个项目"); System.out.println("完成率:" + String.format("%.2f", (totalCount / (double) targetCount) * 100) + "%"); System.out.println("========================================"); System.out.println("数据已保存到:" + outputFile); } }