java/w3/BaseCrawler.java


								import org.jsoup.Jsoup;
								import org.jsoup.nodes.Document;

								import java.io.BufferedWriter;
								import java.io.FileOutputStream;
								import java.io.FileWriter;
								import java.io.IOException;
								import java.io.OutputStreamWriter;
								import java.nio.charset.StandardCharsets;


								/**
								 * Generic base class for crawlers that fetch one page and export results as CSV.
								 *
								 * <p>Encapsulates the shared workflow (open output file, write header, fetch page,
								 * print summary) and declares an abstract parsing hook that subclasses implement
								 * with their page-specific logic.
								 */
								public abstract class BaseCrawler {

								    // Common request header settings.
								    protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
								    protected static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
								    protected static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9";
								    // Request timeout handed to jsoup's Connection.timeout (milliseconds).
								    protected static final int TIMEOUT = 30000;

								    // Delay duration in milliseconds, used by delay().
								    protected static final int DELAY_MS = 1000;

								    // Progress statistics. This class only resets totalCount (in run) and reads it
								    // (in printResult); subclasses are presumably expected to increment it while parsing.
								    protected int totalCount = 0;
								    protected int targetCount = 0;

								    /**
								     * Parses the fetched page and writes the extracted rows.
								     * Subclasses must implement the concrete parsing logic.
								     *
								     * @param doc    the document returned by {@link #fetchDocument(String)}
								     * @param writer the open CSV writer; the header has already been written
								     * @throws IOException          if writing (or any further fetching) fails
								     * @throws InterruptedException if the thread is interrupted, e.g. during {@link #delay()}
								     */
								    protected abstract void parseData(Document doc, BufferedWriter writer) throws IOException, InterruptedException;

								    /**
								     * Runs the crawler: writes the CSV header, fetches the page, delegates to
								     * {@link #parseData(Document, BufferedWriter)}, and prints a summary.
								     *
								     * <p>I/O failures are reported on stderr rather than propagated. On interruption the
								     * thread's interrupt flag is restored and a message is printed on stderr.
								     *
								     * @param url         target URL
								     * @param outputFile  output file path (an existing file is overwritten)
								     * @param targetCount target number of items (used for the completion rate)
								     */
								    public void run(String url, String outputFile, int targetCount) {
								        this.targetCount = targetCount;
								        totalCount = 0;

								        System.out.println("开始爬取数据...");
								        System.out.println("目标：" + targetCount + "个项目\n");

								        // Encode explicitly as UTF-8: FileWriter(String) relies on the platform default
								        // charset (before Java 18), which can corrupt non-ASCII text on some systems.
								        // FileOutputStream keeps the original failure mode (an IOException subtype)
								        // when the path cannot be opened.
								        try (BufferedWriter writer = new BufferedWriter(
								                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
								            // Write the CSV header.
								            writeHeader(writer);

								            System.out.println("正在访问页面...");

								            // Send the HTTP request and load the page.
								            Document doc = fetchDocument(url);

								            System.out.println("页面标题：" + doc.title());

								            // Delegate to the subclass-provided parsing logic.
								            parseData(doc, writer);

								            // Print the summary.
								            printResult(outputFile);

								        } catch (IOException e) {
								            System.err.println("爬取失败：" + e.getMessage());
								            e.printStackTrace();
								        } catch (InterruptedException e) {
								            // Restore the interrupt flag so callers can still observe the interruption,
								            // and tell the user why the crawl stopped instead of ending silently.
								            Thread.currentThread().interrupt();
								            System.err.println("爬取被中断");
								        }
								    }

								    /**
								     * Fetches and parses the page at the given URL using the shared request settings.
								     *
								     * @param url target URL
								     * @return the parsed page document
								     * @throws IOException on network failure
								     */
								    protected Document fetchDocument(String url) throws IOException {
								        return Jsoup.connect(url)
								                .userAgent(USER_AGENT)
								                .timeout(TIMEOUT)
								                .header("Accept", ACCEPT)
								                .header("Accept-Language", ACCEPT_LANGUAGE)
								                .followRedirects(true)
								                .get();
								    }

								    /**
								     * Writes the CSV header row.
								     * Subclasses may override this method to customize the header.
								     *
								     * @param writer the open CSV writer
								     * @throws IOException if writing fails
								     */
								    protected void writeHeader(BufferedWriter writer) throws IOException {
								        writer.write("项目名称,类别,地区,简介");
								        writer.newLine();
								    }

								    /**
								     * Sleeps for {@link #DELAY_MS} milliseconds to throttle requests (anti-bot courtesy delay).
								     *
								     * @throws InterruptedException if the thread is interrupted while sleeping
								     */
								    protected void delay() throws InterruptedException {
								        Thread.sleep(DELAY_MS);
								    }

								    /**
								     * Sanitizes a value so it can be written as a single CSV field.
								     *
								     * <p>Line breaks and tabs become spaces, square brackets are removed, and the value is
								     * wrapped in double quotes (with embedded quotes doubled) whenever it contains a comma
								     * or a double quote.
								     *
								     * @param field the raw field value; may be {@code null}
								     * @return the sanitized value, or an empty string for {@code null}
								     */
								    protected String cleanCsvField(String field) {
								        if (field == null) {
								            return "";
								        }
								        // Replace line breaks and tabs so every record stays on one line.
								        String cleaned = field.replace("\n", " ").replace("\r", " ").replace("\t", " ");
								        // Strip square brackets (citation markers like [1] keep only their inner text).
								        cleaned = cleaned.replace("[", "").replace("]", "");
								        // Quote when the field contains a comma OR a double quote: an unquoted field
								        // holding a bare quote is malformed CSV and is misread by parsers.
								        if (cleaned.contains(",") || cleaned.contains("\"")) {
								            cleaned = "\"" + cleaned.replace("\"", "\"\"") + "\"";
								        }
								        return cleaned;
								    }

								    /**
								     * Prints a progress line whenever {@code count} is a multiple of 100.
								     *
								     * @param count number of items processed so far
								     */
								    protected void printProgress(int count) {
								        if (count % 100 == 0) {
								            System.out.println("  已爬取 " + count + " 个项目...");
								        }
								    }

								    /**
								     * Prints the final summary: items crawled, target, completion rate and output path.
								     *
								     * @param outputFile output file path
								     */
								    protected void printResult(String outputFile) {
								        // A non-positive target makes the ratio undefined (it would print NaN/Infinity),
								        // so report it as not applicable instead.
								        String completionRate = targetCount > 0
								                ? String.format("%.2f", (totalCount / (double) targetCount) * 100) + "%"
								                : "N/A";

								        System.out.println("\n========================================");
								        System.out.println("全部爬取完成！");
								        System.out.println("共爬取 " + totalCount + " 个项目");
								        System.out.println("目标：" + targetCount + "个项目");
								        System.out.println("完成率：" + completionRate);
								        System.out.println("========================================");
								        System.out.println("数据已保存到：" + outputFile);
								    }
								}