import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

/**
 * Generic crawler base class.
 *
 * <p>Bundles the shared plumbing (HTTP fetch, CSV output, progress and result
 * reporting) and leaves the page-specific parsing to subclasses through
 * {@link #parseData(Document, BufferedWriter)}.
 */
public abstract class BaseCrawler {
    // Common request settings applied to every fetch.
    protected static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
    protected static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    protected static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9";
    // Passed to Jsoup's Connection.timeout(int), which takes milliseconds.
    protected static final int TIMEOUT = 30000;

    // Pause between requests, in milliseconds (see delay()).
    protected static final int DELAY_MS = 1000;

    // A progress line is printed every PROGRESS_INTERVAL items (see printProgress()).
    protected static final int PROGRESS_INTERVAL = 100;

    // Progress statistics. run() resets totalCount to 0 and stores targetCount;
    // this class never increments totalCount itself.
    // NOTE(review): presumably subclasses update totalCount inside parseData — confirm.
    protected int totalCount = 0;
    protected int targetCount = 0;

    /**
     * Parses the fetched page and writes the extracted rows.
     * Subclasses must supply the concrete parsing logic.
     *
     * @param doc    the fetched page
     * @param writer writer for the output file; the header has already been written
     * @throws IOException          if writing (or any further fetching) fails
     * @throws InterruptedException if the thread is interrupted, e.g. during {@link #delay()}
     */
    protected abstract void parseData(Document doc, BufferedWriter writer) throws IOException, InterruptedException;

    /**
     * Runs the crawler: writes the CSV header, fetches {@code url}, delegates to
     * {@link #parseData(Document, BufferedWriter)} and prints a summary.
     *
     * <p>I/O failures are reported on stderr rather than propagated. On
     * interruption the thread's interrupt flag is restored and the run stops
     * without printing the summary.
     *
     * @param url         target URL
     * @param outputFile  output file path; the file is created or overwritten
     * @param targetCount expected number of items (used for the completion rate)
     */
    public void run(String url, String outputFile, int targetCount) {
        this.targetCount = targetCount;
        totalCount = 0;

        System.out.println("开始爬取数据...");
        System.out.println("目标：" + targetCount + "个项目\n");

        // Write with an explicit UTF-8 charset: the output contains Chinese text,
        // and the platform default charset would garble it on non-UTF-8 systems.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            // Write the CSV header row.
            writeHeader(writer);

            System.out.println("正在访问页面...");

            // Send the HTTP request and load the page.
            Document doc = fetchDocument(url);

            System.out.println("页面标题：" + doc.title());

            // Hand over to the subclass-specific parsing logic.
            parseData(doc, writer);

            // Print the summary.
            printResult(outputFile);

        } catch (IOException e) {
            System.err.println("爬取失败：" + e.getMessage());
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag for callers and make the abort visible
            // instead of ending silently.
            Thread.currentThread().interrupt();
            System.err.println("爬取被中断");
        }
    }

    /**
     * Fetches and parses the page at {@code url} using the shared request settings.
     *
     * @param url target URL
     * @return the parsed page document
     * @throws IOException on network or HTTP errors
     */
    protected Document fetchDocument(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT)
                .header("Accept", ACCEPT)
                .header("Accept-Language", ACCEPT_LANGUAGE)
                .followRedirects(true)
                .get();
    }

    /**
     * Writes the CSV header row.
     * Subclasses may override this method to emit a custom header.
     *
     * @param writer writer for the output file
     * @throws IOException if writing fails
     */
    protected void writeHeader(BufferedWriter writer) throws IOException {
        writer.write("项目名称,类别,地区,简介");
        writer.newLine();
    }

    /**
     * Sleeps for {@link #DELAY_MS} milliseconds to throttle requests.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    protected void delay() throws InterruptedException {
        Thread.sleep(DELAY_MS);
    }

    /**
     * Sanitizes a value so it can be written as a single CSV field.
     *
     * <p>Line breaks and tabs become spaces, square brackets are removed, and
     * the value is wrapped in double quotes (with embedded quotes doubled) when
     * it contains a comma or a double quote.
     *
     * @param field raw field value, may be {@code null}
     * @return the sanitized field; an empty string for {@code null}
     */
    protected String cleanCsvField(String field) {
        if (field == null) {
            return "";
        }
        // Flatten line breaks and tabs so each record stays on one line.
        String cleaned = field.replace("\n", " ").replace("\r", " ").replace("\t", " ");
        // Strip the square brackets of citation markers.
        cleaned = cleaned.replace("[", "").replace("]", "");
        // A field containing a comma OR a double quote must be quoted, with
        // embedded quotes doubled; otherwise a lone quote corrupts the row.
        if (cleaned.contains(",") || cleaned.contains("\"")) {
            cleaned = "\"" + cleaned.replace("\"", "\"\"") + "\"";
        }
        return cleaned;
    }

    /**
     * Prints a progress line every {@link #PROGRESS_INTERVAL} items.
     *
     * @param count number of items processed so far
     */
    protected void printProgress(int count) {
        // Skip count == 0: zero is a multiple of the interval but is not progress.
        if (count > 0 && count % PROGRESS_INTERVAL == 0) {
            System.out.println("  已爬取 " + count + " 个项目...");
        }
    }

    /**
     * Prints the final summary: item count, target, completion rate and output path.
     *
     * @param outputFile output file path
     */
    protected void printResult(String outputFile) {
        System.out.println("\n========================================");
        System.out.println("全部爬取完成！");
        System.out.println("共爬取 " + totalCount + " 个项目");
        System.out.println("目标：" + targetCount + "个项目");
        if (targetCount > 0) {
            System.out.println("完成率：" + String.format("%.2f", (totalCount / (double) targetCount) * 100) + "%");
        } else {
            // Without a positive target the ratio is NaN/Infinity; report it as not applicable.
            System.out.println("完成率：N/A");
        }
        System.out.println("========================================");
        System.out.println("数据已保存到：" + outputFile);
    }
}