feat:新增抽象类Animal,定义makeSound()抽象方法

2 months ago · 24ce491f5e
77 changed files with 5652 additions and 14 deletions
--- a/Animal.java
+++ b/Animal.java
@ -0,0 +1,66 @@
 /**
  * Capability interface for anything that can swim.
  *
  * <p>Declared package-private on purpose: this source file is {@code Animal.java}, and a Java
  * source file may contain only one public top-level type, which must be named after the file.
  */
 interface Swimmable {
    /** Performs the swimming action. Interface methods are implicitly {@code public abstract}. */
    void swim();
 }
 /**
  * Abstract base class for animals; declares the abstract method {@link #makeSound()}.
  */
 public abstract class Animal {
    /** Abstract method with no body; every concrete subclass must implement it. */
    public abstract void makeSound();
 }
 /**
  * A dog: extends {@link Animal} and additionally implements {@link Swimmable}.
  *
  * <p>Declared package-private on purpose: this source file is {@code Animal.java}, and a Java
  * source file may contain only one public top-level type, which must be named after the file.
  */
 class Dog extends Animal implements Swimmable {
    /** Implements the abstract method of {@link Animal} by printing the dog's bark. */
    @Override
    public void makeSound() {
        System.out.println("狗叫：汪汪汪！");
    }

    /** Implements {@link Swimmable#swim()} by printing how the dog swims. */
    @Override
    public void swim() {
        System.out.println("狗在游泳：狗刨式！");
    }
 }
 /**
  * A cat: extends {@link Animal} only and does not implement {@code Swimmable}.
  *
  * <p>Declared package-private on purpose: this source file is {@code Animal.java}, and a Java
  * source file may contain only one public top-level type, which must be named after the file.
  */
 class Cat extends Animal {
    /** Implements the abstract method of {@link Animal} by printing the cat's meow. */
    @Override
    public void makeSound() {
        System.out.println("猫叫：喵喵喵！");
    }
 }
 /**
  * Demo entry point exercising polymorphic calls through the {@code Animal} base class and the
  * {@code Swimmable} interface.
  *
  * <p>Declared package-private on purpose: this source file is {@code Animal.java}, and a Java
  * source file may contain only one public top-level type, which must be named after the file.
  * The {@code main} method itself stays {@code public static}.
  */
 class AnimalTest {
    /**
     * Runs the demo and prints the results to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Polymorphism 1: base-class references pointing at subclass instances.
        Animal dog1 = new Dog();
        Animal cat1 = new Cat();
        System.out.println("=== Animal多态调用makeSound() ===");
        dog1.makeSound(); // dispatches to Dog.makeSound()
        cat1.makeSound(); // dispatches to Cat.makeSound()

        // Polymorphism 2: interface reference pointing at an implementing instance.
        Swimmable dog2 = new Dog();
        System.out.println("\n=== Swimmable多态调用swim() ===");
        dog2.swim(); // dispatches to Dog.swim()

        // Downcast: view the Animal-typed dog1 as Swimmable so swim() can be called.
        System.out.println("\n=== 类型转换后调用swim() ===");
        if (dog1 instanceof Swimmable) { // guard so the cast below cannot fail
            Swimmable swimmableDog = (Swimmable) dog1;
            swimmableDog.swim();
        }

        // cat1 refers to a Cat, which does not implement Swimmable: the same instanceof guard
        // would evaluate to false for it, and an unguarded cast would throw ClassCastException.
    }
 }
--- a/Java-1test/BankAccount.class
+++ b/Java-1test/BankAccount.class
--- a/Java-1test/BankAccount.java
+++ b/Java-1test/BankAccount.java
@ -0,0 +1,63 @@
 /**
  * A minimal bank account with a fixed account number, a mutable owner name and a balance.
  *
  * <p>Invalid operations are reported on standard output rather than by throwing; in every
  * rejected case the balance is left unchanged.
  */
 public class BankAccount {
    // Private state
    private final String accountNumber;
    private String ownerName;
    private double balance;

    /**
     * Creates an account with a zero balance.
     *
     * @param accountNumber the account number; cannot be changed afterwards
     * @param ownerName the initial owner name
     */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    /** @return the account number */
    public String getAccountNumber() {
        return accountNumber;
    }

    /** @return the current owner name */
    public String getOwnerName() {
        return ownerName;
    }

    /** @return the current balance */
    public double getBalance() {
        return balance;
    }

    /**
     * Replaces the owner name.
     *
     * @param ownerName the new owner name
     */
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /**
     * Adds {@code amount} to the balance and prints the new balance.
     *
     * <p>The amount must be a finite number greater than zero. NaN and infinite values are
     * rejected because adding them would turn the balance into NaN/Infinity for good.
     *
     * @param amount the amount to deposit
     */
    public void deposit(double amount) {
        if (!Double.isFinite(amount)) {
            System.out.println("存款金额必须是有限数值");
            return;
        }
        if (amount <= 0) {
            System.out.println("存款金额必须大于 0");
            return;
        }
        balance += amount;
        System.out.println("存款成功！当前余额：" + balance);
    }

    /**
     * Subtracts {@code amount} from the balance and prints the new balance, provided the amount
     * is greater than zero and does not exceed the balance.
     *
     * @param amount the amount to withdraw
     */
    public void withdraw(double amount) {
        // Written as !(amount > 0) so that NaN is rejected here as well.
        if (!(amount > 0)) {
            System.out.println("取款金额必须大于 0");
            return;
        }
        if (amount > balance) {
            System.out.println("余额不足，无法取款");
            return;
        }
        balance -= amount;
        System.out.println("取款成功！当前余额：" + balance);
    }

    /** Prints account number, owner and balance, followed by an empty line. */
    public void displayInfo() {
        System.out.println("账号：" + accountNumber);
        System.out.println("户主：" + ownerName);
        System.out.println("余额：" + balance);
        System.out.println();
    }
 }
--- a/Java-1test/TestBankAccount.class
+++ b/Java-1test/TestBankAccount.class
--- a/Java-1test/TestBankAccount.java
+++ b/Java-1test/TestBankAccount.java
@ -0,0 +1,29 @@
 /**
  * Console walkthrough of {@code BankAccount}: shows the initial state, then exercises deposits,
  * withdrawals (including rejected ones), an owner rename and a balance query.
  */
 public class TestBankAccount {
    /**
     * Runs the walkthrough.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final BankAccount acct = new BankAccount("123456789", "张三");

        // Initial state
        System.out.println("初始账户信息：");
        acct.displayInfo();

        // Deposits: one valid, one with an illegal (negative) amount
        System.out.println("测试存款：");
        acct.deposit(1000);
        acct.deposit(-500);

        // Withdrawals: valid, more than the balance, illegal (negative) amount
        System.out.println("测试取款：");
        acct.withdraw(500);
        acct.withdraw(1000);
        acct.withdraw(-200);

        // Owner rename
        System.out.println("测试修改户主姓名：");
        acct.setOwnerName("李四");
        acct.displayInfo();

        // Balance query
        final double currentBalance = acct.getBalance();
        System.out.println("当前余额：" + currentBalance);
    }
 }
--- a/Java-1test/bin/com/rental/Car.class
+++ b/Java-1test/bin/com/rental/Car.class
--- a/Java-1test/bin/com/rental/TestCar.class
+++ b/Java-1test/bin/com/rental/TestCar.class
--- a/Java-1test/project/SimpleMovieCrawler$Movie.class
+++ b/Java-1test/project/SimpleMovieCrawler$Movie.class
--- a/Java-1test/project/SimpleMovieCrawler.class
+++ b/Java-1test/project/SimpleMovieCrawler.class
--- a/Java-1test/project/SimpleMovieCrawler.java
+++ b/Java-1test/project/SimpleMovieCrawler.java
@ -0,0 +1,155 @@
 import java.io.BufferedReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.HttpURLConnection;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 /**
  * Dependency-free crawler: downloads one chart page, extracts title/year/rating per table row
  * with plain string searches, writes the result to a text file and prints simple statistics.
  *
  * <p>The string-based extraction is deliberately simple; a real HTML parser (e.g. Jsoup) is
  * the better tool for anything beyond a demo.
  */
 public class SimpleMovieCrawler {
    /** Page that is downloaded and scanned for the movie table. */
    private static final String CHART_URL = "https://www.imdb.com/chart/top/";
    /** Maximum number of table rows that are parsed. */
    private static final int MAX_MOVIES = 20;
    private static final String LIST_OPEN_TAG = "<tbody class=\"lister-list\">";
    private static final String YEAR_OPEN_TAG = "<span class=\"secondaryInfo\">";
    private static final String RATING_OPEN_TAG = "<strong>";

    /**
     * Crawls the chart, saves it to {@code movies.txt} and prints the analysis.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            // 1. Fetch the movie data
            List<Movie> movies = crawlMovies();
            System.out.println("爬取完成，共获取 " + movies.size() + " 部电影数据");
            // 2. Save it to a file
            saveToFile(movies, "movies.txt");
            // 3. Analyze it
            analyzeData(movies);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the chart page and parses up to {@value #MAX_MOVIES} movies from it.
     *
     * @return the parsed movies; empty when the expected table is not present in the page
     * @throws IOException if the HTTP request or reading the response fails
     */
    public static List<Movie> crawlMovies() throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(CHART_URL).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");

            StringBuilder content = new StringBuilder();
            // Explicit charset: without it the platform default decides how the bytes are decoded.
            // try-with-resources closes the reader even when reading fails half-way.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line);
                }
            }
            return parseMovies(content.toString());
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Extracts movies from the HTML of the chart page. Rows without a title are skipped.
     * Package-private so the parsing can be exercised without network access.
     *
     * @param html the complete page source
     * @return the parsed movies, at most {@value #MAX_MOVIES}; empty if the table is missing
     */
    static List<Movie> parseMovies(String html) {
        List<Movie> movies = new ArrayList<>();
        int start = html.indexOf(LIST_OPEN_TAG);
        if (start == -1) {
            return movies;
        }
        int end = html.indexOf("</tbody>", start);
        if (end == -1) {
            return movies;
        }

        // rows[0] is the text before the first <tr>, hence the loop starts at 1.
        String[] rows = html.substring(start, end).split("<tr>");
        for (int i = 1; i < Math.min(rows.length, MAX_MOVIES + 1); i++) {
            String row = rows[i];
            Movie movie = new Movie();

            // Title: the text after the '>' that closes the first anchor's opening tag.
            String anchor = textBetween(row, "<a href=", "</a>");
            if (anchor != null) {
                int tagEnd = anchor.indexOf('>');
                if (tagEnd != -1) {
                    movie.setTitle(anchor.substring(tagEnd + 1).trim());
                }
            }

            // Year: content of the secondaryInfo span with the parentheses removed.
            String year = textBetween(row, YEAR_OPEN_TAG, "</span>");
            if (year != null) {
                movie.setYear(year.replaceAll("[()]", "").trim());
            }

            // Rating: content of the first <strong> element.
            String rating = textBetween(row, RATING_OPEN_TAG, "</strong>");
            if (rating != null) {
                movie.setRating(rating.trim());
            }

            if (movie.getTitle() != null) {
                movies.add(movie);
            }
        }
        return movies;
    }

    /**
     * Returns the text between the first {@code openTag} and the next {@code closeTag} after it.
     * The offset is derived from {@code openTag.length()} rather than a hand-counted constant.
     *
     * @return the enclosed text, or {@code null} if either marker is missing
     */
    private static String textBetween(String text, String openTag, String closeTag) {
        int open = text.indexOf(openTag);
        if (open == -1) {
            return null;
        }
        int close = text.indexOf(closeTag, open);
        if (close == -1) {
            return null;
        }
        return text.substring(open + openTag.length(), close);
    }

    /**
     * Writes the movies as {@code Title,Rating,Year} lines (with a header line) to a file,
     * encoded as UTF-8, and prints a confirmation.
     *
     * @param movies the movies to write
     * @param fileName path of the file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    public static void saveToFile(List<Movie> movies, String fileName) throws IOException {
        // The (String, Charset) constructor needs Java 11+, the level configured in the project POM.
        try (FileWriter writer = new FileWriter(fileName, java.nio.charset.StandardCharsets.UTF_8)) {
            writer.write("Title,Rating,Year\n");
            for (Movie movie : movies) {
                writer.write(movie.getTitle() + "," + movie.getRating() + "," + movie.getYear() + "\n");
            }
        }
        System.out.println("数据已保存到: " + fileName);
    }

    /**
     * Prints how many movies share each rating, and the ten most frequent years.
     *
     * @param movies the movies to analyze
     */
    public static void analyzeData(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析 ===");

        // Rating distribution
        Map<String, Integer> ratingDist = new HashMap<>();
        for (Movie movie : movies) {
            String rating = movie.getRating();
            ratingDist.put(rating, ratingDist.getOrDefault(rating, 0) + 1);
        }
        System.out.println("\n1. 评分分布：");
        for (Map.Entry<String, Integer> entry : ratingDist.entrySet()) {
            System.out.println("评分 " + entry.getKey() + ": " + entry.getValue() + " 部");
        }

        // Year distribution (movies without a year are skipped)
        Map<String, Integer> yearDist = new HashMap<>();
        for (Movie movie : movies) {
            String year = movie.getYear();
            if (year != null) {
                yearDist.put(year, yearDist.getOrDefault(year, 0) + 1);
            }
        }
        System.out.println("\n2. 年份分布：");
        yearDist.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(10)
                .forEach(entry -> System.out.println(entry.getKey() + "年: " + entry.getValue() + " 部"));
    }

    /** Plain data holder for one movie; every field stays {@code null} until set. */
    static class Movie {
        private String title;
        private String rating;
        private String year;

        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
        public String getRating() { return rating; }
        public void setRating(String rating) { this.rating = rating; }
        public String getYear() { return year; }
        public void setYear(String year) { this.year = year; }
    }
 }
--- a/Java-1test/project/movies.txt
+++ b/Java-1test/project/movies.txt
@ -0,0 +1 @@
 Title,Rating,Year
--- a/Java-1test/project/pom.xml
+++ b/Java-1test/project/pom.xml
@ -0,0 +1,51 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the movie-crawler project. -->
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>
    <artifactId>movie-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <!-- Java language level (11) and UTF-8 source encoding used by the build. -->
    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Jsoup for HTML parsing -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.17.2</version>
        </dependency>
        <!-- JFreeChart for chart generation -->
        <dependency>
            <groupId>org.jfree</groupId>
            <artifactId>jfreechart</artifactId>
            <version>1.5.4</version>
        </dependency>
        <!-- Commons CSV for CSV handling -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-csv</artifactId>
            <version>1.10.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Compiler plugin pinned to the same source/target level (11) as the properties above. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
                <configuration>
                    <source>11</source>
                    <target>11</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
 </project>
--- a/Java-1test/project/run.bat
+++ b/Java-1test/project/run.bat
@ -0,0 +1,38 @@
@echo off
 rem Build-and-run helper: downloads the dependency jars into lib\ on first use,
 rem compiles the sources into bin\ and starts com.example.Main.

 rem Create the lib directory and download dependencies
 if not exist lib mkdir lib
 rem Download Jsoup
 if not exist lib\jsoup-1.17.2.jar (
    echo 下载 Jsoup...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jsoup/jsoup/1.17.2/jsoup-1.17.2.jar' -OutFile 'lib\jsoup-1.17.2.jar'"
 )
 rem Download JFreeChart
 if not exist lib\jfreechart-1.5.4.jar (
    echo 下载 JFreeChart...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jfree/jfreechart/1.5.4/jfreechart-1.5.4.jar' -OutFile 'lib\jfreechart-1.5.4.jar'"
 )
 rem Download JCommon (kept from the original script as a JFreeChart companion jar)
 if not exist lib\jcommon-1.0.24.jar (
    echo 下载 JCommon...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/jfree/jcommon/1.0.24/jcommon-1.0.24.jar' -OutFile 'lib\jcommon-1.0.24.jar'"
 )
 rem Download Commons CSV
 if not exist lib\commons-csv-1.10.0.jar (
    echo 下载 Commons CSV...
    powershell -Command "Invoke-WebRequest -Uri 'https://repo1.maven.org/maven2/org/apache/commons/commons-csv/1.10.0/commons-csv-1.10.0.jar' -OutFile 'lib\commons-csv-1.10.0.jar'"
 )
 rem Compile the project.
 rem  - make sure the output directory exists before javac writes into it
 rem  - -encoding UTF-8 matches the source encoding declared in pom.xml; without it javac
 rem    may read the sources (which contain Chinese string literals) in the platform code page
 if not exist bin mkdir bin
 echo 编译项目...
 javac -encoding UTF-8 -cp "lib/*" -d bin src\main\java\com\example\*.java src\main\java\com\example\model\*.java src\main\java\com\example\crawler\*.java src\main\java\com\example\processor\*.java src\main\java\com\example\analyzer\*.java src\main\java\com\example\chart\*.java
 rem Stop here when compilation failed instead of running stale or missing classes
 if errorlevel 1 (
    echo 编译失败，已终止运行
    pause
    exit /b 1
 )
 rem Run the project
 echo 运行项目...
 java -cp "bin;lib/*" com.example.Main
 pause
--- a/Java-1test/project/src/main/java/com/example/Main.java
+++ b/Java-1test/project/src/main/java/com/example/Main.java
@ -0,0 +1,62 @@
 package com.example;
 import com.example.analyzer.MovieAnalyzer;
 import com.example.chart.ChartGenerator;
 import com.example.crawler.MovieCrawler;
 import com.example.model.Movie;
 import com.example.processor.DataProcessor;
 import java.io.IOException;
 import java.util.List;
 /**
  * Application entry point: crawls the movie chart, stores the result as CSV, prints statistics
  * and renders three charts into the working directory.
  */
 public class Main {
    /**
     * Runs the whole pipeline. An {@link IOException} from any step is reported on standard
     * output together with its stack trace.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            // Step 1: set up the crawler
            final MovieCrawler movieCrawler = new MovieCrawler();
            System.out.println("开始爬取 IMDb Top 250 电影数据...");

            // Step 2: fetch the movies (capped at 50)
            final List<Movie> movies = movieCrawler.crawlTopMovies(50);
            System.out.println("爬取完成，共获取 " + movies.size() + " 部电影数据");

            // Step 3: persist them as CSV
            final String csvPath = "movies.csv";
            final DataProcessor dataProcessor = new DataProcessor();
            dataProcessor.saveMoviesToCsv(movies, csvPath);

            // Step 4: print the statistics
            final MovieAnalyzer analyzer = new MovieAnalyzer();
            analyzer.printStatistics(movies);

            // Step 5: render the charts, each fed by its own analysis
            final ChartGenerator charts = new ChartGenerator();
            charts.generateRatingDistributionChart(
                    analyzer.analyzeRatingDistribution(movies), "rating_distribution.png");
            charts.generateGenreDistributionChart(
                    analyzer.analyzeGenreDistribution(movies), "genre_distribution.png");
            charts.generateDirectorWorksChart(
                    analyzer.analyzeDirectorWorks(movies), "director_works.png");

            System.out.println("\n项目执行完成！");
            System.out.println("数据已保存到: " + csvPath);
            System.out.println("图表已生成到当前目录");
        } catch (IOException e) {
            System.out.println("执行过程中出现错误: " + e.getMessage());
            e.printStackTrace();
        }
    }
 }
--- a/Java-1test/project/src/main/java/com/example/analyzer/MovieAnalyzer.java
+++ b/Java-1test/project/src/main/java/com/example/analyzer/MovieAnalyzer.java
@ -0,0 +1,94 @@
 package com.example.analyzer;
 import com.example.model.Movie;
 import java.util.*;
 import java.util.stream.Collectors;
 /**
  * Computes and prints simple statistics over a list of movies.
  *
  * <p>Movie fields may be {@code null} (the crawler only sets what it finds), so every grouping
  * skips movies whose grouping key is missing; {@code Collectors.groupingBy} would otherwise
  * throw a {@link NullPointerException} on the first such movie.
  */
 public class MovieAnalyzer {
    /** Number of entries kept by the "top" rankings. */
    private static final int TOP_N = 10;

    /**
     * Counts movies per rating string.
     *
     * @param movies the movies to analyze
     * @return rating to number of movies; movies without a rating are ignored
     */
    public Map<String, Integer> analyzeRatingDistribution(List<Movie> movies) {
        return movies.stream()
                .filter(movie -> movie.getRating() != null)
                .collect(Collectors.groupingBy(Movie::getRating, Collectors.summingInt(movie -> 1)));
    }

    /**
     * Computes the average rating per year.
     *
     * @param movies the movies to analyze
     * @return year to average rating; movies without a year or without a numeric rating are ignored
     */
    public Map<String, Double> analyzeYearRatingRelation(List<Movie> movies) {
        return movies.stream()
                .filter(movie -> movie.getYear() != null && parseRating(movie.getRating()).isPresent())
                .collect(Collectors.groupingBy(Movie::getYear,
                        Collectors.averagingDouble(movie -> parseRating(movie.getRating()).getAsDouble())));
    }

    /**
     * Ranks directors by number of movies.
     *
     * @param movies the movies to analyze
     * @return the ten directors with the most movies, in descending order of count; movies
     *         without a director are ignored
     */
    public Map<String, Integer> analyzeDirectorWorks(List<Movie> movies) {
        Map<String, Integer> worksPerDirector = movies.stream()
                .filter(movie -> movie.getDirector() != null)
                .collect(Collectors.groupingBy(Movie::getDirector, Collectors.summingInt(movie -> 1)));
        return topEntries(worksPerDirector);
    }

    /**
     * Ranks genres by number of movies. A movie's genre string is split on {@code ", "} and
     * counts once for each part.
     *
     * @param movies the movies to analyze
     * @return the ten most frequent genres, in descending order of count
     */
    public Map<String, Integer> analyzeGenreDistribution(List<Movie> movies) {
        Map<String, Integer> genreCount = new HashMap<>();
        for (Movie movie : movies) {
            String genre = movie.getGenre();
            if (genre == null || genre.isEmpty()) {
                continue;
            }
            for (String g : genre.split(", ")) {
                genreCount.merge(g, 1, Integer::sum);
            }
        }
        return topEntries(genreCount);
    }

    /**
     * Prints all statistics to standard output.
     *
     * @param movies the movies to analyze
     */
    public void printStatistics(List<Movie> movies) {
        System.out.println("\n=== 电影数据分析结果 ===");

        // Rating distribution
        System.out.println("\n1. 评分分布：");
        analyzeRatingDistribution(movies).forEach((rating, count) -> {
            OptionalDouble numeric = parseRating(rating);
            if (numeric.isPresent()) {
                System.out.printf("评分 %.1f: %d 部\n", numeric.getAsDouble(), count);
            } else {
                // Non-numeric rating text: print it verbatim instead of aborting the report.
                System.out.printf("评分 %s: %d 部\n", rating, count);
            }
        });

        // Average rating per year, best ten years
        System.out.println("\n2. 年份与平均评分（前10年）：");
        analyzeYearRatingRelation(movies).entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(TOP_N)
                .forEach(entry ->
                        System.out.printf("%s年: %.2f\n", entry.getKey(), entry.getValue()));

        // Directors ranked by number of movies
        System.out.println("\n3. 导演作品数排行（前10）：");
        analyzeDirectorWorks(movies).forEach((director, count) ->
                System.out.printf("%s: %d 部\n", director, count));

        // Genre distribution
        System.out.println("\n4. 类型分布（前10）：");
        analyzeGenreDistribution(movies).forEach((genre, count) ->
                System.out.printf("%s: %d 部\n", genre, count));
    }

    /** Keeps the {@value #TOP_N} entries with the highest counts, ordered from highest to lowest. */
    private static Map<String, Integer> topEntries(Map<String, Integer> counts) {
        return counts.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(TOP_N)
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> first,
                        LinkedHashMap::new
                ));
    }

    /** Parses a rating string; empty when the text is {@code null} or not a number. */
    private static OptionalDouble parseRating(String rating) {
        if (rating == null) {
            return OptionalDouble.empty();
        }
        try {
            return OptionalDouble.of(Double.parseDouble(rating));
        } catch (NumberFormatException ignored) {
            // Not a number: the caller treats the rating as absent.
            return OptionalDouble.empty();
        }
    }
 }
--- a/Java-1test/project/src/main/java/com/example/chart/ChartGenerator.java
+++ b/Java-1test/project/src/main/java/com/example/chart/ChartGenerator.java
@ -0,0 +1,81 @@
 package com.example.chart;
 import org.jfree.chart.ChartFactory;
 import org.jfree.chart.ChartUtils;
 import org.jfree.chart.JFreeChart;
 import org.jfree.chart.plot.PlotOrientation;
 import org.jfree.data.category.DefaultCategoryDataset;
 import org.jfree.data.general.DefaultPieDataset;
 import java.io.File;
 import java.io.IOException;
 import java.util.Map;
 /**
  * Renders the analysis results as 800x600 PNG charts using JFreeChart.
  */
 public class ChartGenerator {
    private static final int WIDTH = 800;
    private static final int HEIGHT = 600;

    /**
     * Writes the rating distribution as a vertical bar chart.
     *
     * @param ratingDist rating to number of movies
     * @param outputPath path of the PNG file to write
     * @throws IOException if the image cannot be written
     */
    public void generateRatingDistributionChart(Map<String, Integer> ratingDist, String outputPath) throws IOException {
        writeBarChart(ratingDist, "IMDb Top 250 电影评分分布", "评分", "电影数量", outputPath);
        System.out.println("评分分布图表已保存到：" + outputPath);
    }

    /**
     * Writes the genre distribution as a pie chart.
     *
     * @param genreDist genre to number of movies
     * @param outputPath path of the PNG file to write
     * @throws IOException if the image cannot be written
     */
    public void generateGenreDistributionChart(Map<String, Integer> genreDist, String outputPath) throws IOException {
        DefaultPieDataset slices = new DefaultPieDataset();
        for (Map.Entry<String, Integer> entry : genreDist.entrySet()) {
            slices.setValue(entry.getKey(), entry.getValue());
        }
        JFreeChart pie = ChartFactory.createPieChart(
                "IMDb Top 250 电影类型分布", slices, true, true, false);
        ChartUtils.saveChartAsPNG(new File(outputPath), pie, WIDTH, HEIGHT);
        System.out.println("类型分布图表已保存到：" + outputPath);
    }

    /**
     * Writes the number of movies per director as a vertical bar chart.
     *
     * @param directorWorks director to number of movies
     * @param outputPath path of the PNG file to write
     * @throws IOException if the image cannot be written
     */
    public void generateDirectorWorksChart(Map<String, Integer> directorWorks, String outputPath) throws IOException {
        writeBarChart(directorWorks, "IMDb Top 250 导演作品数排行", "导演", "作品数量", outputPath);
        System.out.println("导演作品数图表已保存到：" + outputPath);
    }

    /**
     * Builds a single-series vertical bar chart from {@code counts} and saves it as PNG.
     * The value-axis label doubles as the series name.
     */
    private void writeBarChart(Map<String, Integer> counts, String title, String categoryLabel,
                               String valueLabel, String outputPath) throws IOException {
        DefaultCategoryDataset bars = new DefaultCategoryDataset();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            bars.addValue(entry.getValue(), valueLabel, entry.getKey());
        }
        JFreeChart chart = ChartFactory.createBarChart(
                title, categoryLabel, valueLabel, bars,
                PlotOrientation.VERTICAL, true, true, false);
        ChartUtils.saveChartAsPNG(new File(outputPath), chart, WIDTH, HEIGHT);
    }
 }
--- a/Java-1test/project/src/main/java/com/example/crawler/MovieCrawler.java
+++ b/Java-1test/project/src/main/java/com/example/crawler/MovieCrawler.java
@ -0,0 +1,119 @@
 package com.example.crawler;
 import com.example.model.Movie;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
 /**
  * Crawls the chart page and, for each listed movie, its detail page.
  *
  * <p>The CSS selectors reflect the page layout this code was written against; when a selector
  * matches nothing, the corresponding movie field simply stays {@code null}.
  */
 public class MovieCrawler {
    private static final String BASE_URL = "https://www.imdb.com/chart/top/";
    private static final String SITE_ROOT = "https://www.imdb.com";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
    private static final int TIMEOUT_MS = 10000;
    /** Pause between movies so requests are not fired back-to-back. */
    private static final long REQUEST_DELAY_MS = 1000;

    /**
     * Fetches the chart and returns at most {@code limit} movies, each enriched with data from
     * its detail page where available. A failing detail request is logged and does not abort
     * the crawl. If the thread is interrupted while pausing, the interrupt flag is restored
     * and the movies collected so far are returned.
     *
     * @param limit maximum number of movies to return
     * @return the crawled movies, in chart order
     * @throws IOException if the chart page itself cannot be fetched
     */
    public List<Movie> crawlTopMovies(int limit) throws IOException {
        List<Movie> movies = new ArrayList<>();
        Document doc = fetch(BASE_URL);
        Elements movieElements = doc.select("tbody.lister-list tr");

        for (Element element : movieElements) {
            if (movies.size() >= limit) {
                break;
            }
            Movie movie = new Movie();

            // Title
            Element titleElement = element.selectFirst(".titleColumn a");
            if (titleElement != null) {
                movie.setTitle(titleElement.text());
            }
            // Year, with the surrounding parentheses removed
            Element yearElement = element.selectFirst(".titleColumn .secondaryInfo");
            if (yearElement != null) {
                movie.setYear(yearElement.text().replaceAll("[()]", ""));
            }
            // Rating
            Element ratingElement = element.selectFirst(".ratingColumn.imdbRating strong");
            if (ratingElement != null) {
                movie.setRating(ratingElement.text());
            }

            // Director, stars, genre and runtime come from the detail page. The link is taken
            // from the title anchor, so there is nothing to follow when that anchor is missing.
            if (titleElement != null) {
                try {
                    populateDetails(movie, SITE_ROOT + titleElement.attr("href"));
                } catch (IOException e) {
                    System.out.println("Error crawling movie details: " + e.getMessage());
                }
            }
            movies.add(movie);

            // Throttle requests to avoid being blocked.
            try {
                Thread.sleep(REQUEST_DELAY_MS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
        return movies;
    }

    /** Downloads and parses one page with the crawler's user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MS)
                .get();
    }

    /** Fills director, stars, genre and runtime of {@code movie} from its detail page. */
    private static void populateDetails(Movie movie, String movieUrl) throws IOException {
        Document movieDoc = fetch(movieUrl);

        List<Element> directors = nameLinksUnder(movieDoc, "Director", 1);
        if (!directors.isEmpty()) {
            movie.setDirector(directors.get(0).text());
        }

        List<Element> stars = nameLinksUnder(movieDoc, "Stars", 3);
        if (!stars.isEmpty()) {
            movie.setStars(joinTexts(stars));
        }

        List<Element> genres = movieDoc.select("a[href*=genres]").stream()
                .limit(3)
                .collect(Collectors.toList());
        if (!genres.isEmpty()) {
            movie.setGenre(joinTexts(genres));
        }

        Element runtimeElement = movieDoc.selectFirst("time");
        if (runtimeElement != null) {
            movie.setRuntime(runtimeElement.text());
        }
    }

    /** Returns up to {@code max} person links whose parent element's text contains {@code label}. */
    private static List<Element> nameLinksUnder(Document movieDoc, String label, int max) {
        return movieDoc.select("a[href*=name]").stream()
                .filter(link -> link.parent() != null && link.parent().text().contains(label))
                .limit(max)
                .collect(Collectors.toList());
    }

    /** Joins the text of the given elements with {@code ", "}. */
    private static String joinTexts(List<Element> elements) {
        return elements.stream()
                .map(Element::text)
                .collect(Collectors.joining(", "));
    }
 }
--- a/Java-1test/project/src/main/java/com/example/model/Movie.java
+++ b/Java-1test/project/src/main/java/com/example/model/Movie.java
@ -0,0 +1,81 @@
 package com.example.model;
 /**
  * Mutable data holder for one movie. All fields are strings and remain {@code null} until the
  * corresponding setter is called.
  */
 public class Movie {
    private String title;
    private String rating;
    private String year;
    private String director;
    private String stars;
    private String runtime;
    private String genre;

    /** @return the title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the rating text, or {@code null} if not set */
    public String getRating() {
        return this.rating;
    }

    /** @param rating the rating text */
    public void setRating(String rating) {
        this.rating = rating;
    }

    /** @return the year text, or {@code null} if not set */
    public String getYear() {
        return this.year;
    }

    /** @param year the year text */
    public void setYear(String year) {
        this.year = year;
    }

    /** @return the director, or {@code null} if not set */
    public String getDirector() {
        return this.director;
    }

    /** @param director the director */
    public void setDirector(String director) {
        this.director = director;
    }

    /** @return the stars text, or {@code null} if not set */
    public String getStars() {
        return this.stars;
    }

    /** @param stars the stars text */
    public void setStars(String stars) {
        this.stars = stars;
    }

    /** @return the runtime text, or {@code null} if not set */
    public String getRuntime() {
        return this.runtime;
    }

    /** @param runtime the runtime text */
    public void setRuntime(String runtime) {
        this.runtime = runtime;
    }

    /** @return the genre text, or {@code null} if not set */
    public String getGenre() {
        return this.genre;
    }

    /** @param genre the genre text */
    public void setGenre(String genre) {
        this.genre = genre;
    }

    /** Lists every field as {@code name='value'} inside {@code Movie{...}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("title='").append(title).append('\'');
        text.append(", rating='").append(rating).append('\'');
        text.append(", year='").append(year).append('\'');
        text.append(", director='").append(director).append('\'');
        text.append(", stars='").append(stars).append('\'');
        text.append(", runtime='").append(runtime).append('\'');
        text.append(", genre='").append(genre).append('\'');
        return text.append('}').toString();
    }
 }
--- a/Java-1test/project/src/main/java/com/example/processor/DataProcessor.java
+++ b/Java-1test/project/src/main/java/com/example/processor/DataProcessor.java
@ -0,0 +1,40 @@
 package com.example.processor;
 import com.example.model.Movie;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.util.List;
 /**
  * Persists crawled movies as a CSV file.
  */
 public class DataProcessor {
    /**
     * Writes one CSV record per movie, preceded by a header row, and prints a confirmation.
     * The file is encoded as UTF-8 regardless of the platform default, so titles and names with
     * non-ASCII characters are written the same way on every machine.
     *
     * @param movies the movies to write
     * @param filePath path of the CSV file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    public void saveMoviesToCsv(List<Movie> movies, String filePath) throws IOException {
        // FileWriter(String, Charset) needs Java 11+, the level configured in the project POM.
        try (FileWriter writer = new FileWriter(filePath, java.nio.charset.StandardCharsets.UTF_8);
             CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT
                     .withHeader("Title", "Rating", "Year", "Director", "Stars", "Runtime", "Genre"))) {
            for (Movie movie : movies) {
                csvPrinter.printRecord(
                        cleanText(movie.getTitle()),
                        movie.getRating(),
                        movie.getYear(),
                        cleanText(movie.getDirector()),
                        cleanText(movie.getStars()),
                        movie.getRuntime(),
                        cleanText(movie.getGenre())
                );
            }
            csvPrinter.flush();
            System.out.println("Movies saved to CSV file: " + filePath);
        }
    }

    /**
     * Removes HTML tags and then surrounding whitespace; {@code null} becomes the empty string.
     * Tags are stripped first so that whitespace which only becomes leading or trailing after a
     * tag is removed (as in {@code "<b> x </b>"}) is trimmed too.
     */
    private String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return text.replaceAll("<[^>]*>", "").trim();
    }
 }
--- a/Java-1test/project/target/classes/com/example/Main.class
+++ b/Java-1test/project/target/classes/com/example/Main.class
--- a/Java-1test/project/target/classes/com/example/analyzer/MovieAnalyzer.class
+++ b/Java-1test/project/target/classes/com/example/analyzer/MovieAnalyzer.class
--- a/Java-1test/project/target/classes/com/example/chart/ChartGenerator.class
+++ b/Java-1test/project/target/classes/com/example/chart/ChartGenerator.class
--- a/Java-1test/project/target/classes/com/example/crawler/MovieCrawler.class
+++ b/Java-1test/project/target/classes/com/example/crawler/MovieCrawler.class
--- a/Java-1test/project/target/classes/com/example/model/Movie.class
+++ b/Java-1test/project/target/classes/com/example/model/Movie.class
--- a/Java-1test/project/target/classes/com/example/processor/DataProcessor.class
+++ b/Java-1test/project/target/classes/com/example/processor/DataProcessor.class
--- a/Java-1test/src/main/java/com/rental/Car.java
+++ b/Java-1test/src/main/java/com/rental/Car.java
@ -0,0 +1,104 @@
 package com.rental;
 public class Car {
    // 私有属性
    private final String licensePlate;
    private String brand;
    private String model;
    private double dailyRent;
    private boolean isRented;
    // 静态变量，统计车辆总数
    private static int totalCars = 0;
    // 全参构造方法
    public Car(String licensePlate, String brand, String model, double dailyRent) {
        this.licensePlate = licensePlate;
        this.brand = brand;
        this.model = model;
        this.dailyRent = dailyRent;
        this.isRented = false;
        totalCars++;
    }
    // 三参构造方法，使用默认日租金 300 元/天
    public Car(String licensePlate, String brand, String model) {
        this(licensePlate, brand, model, 300.0);
    }
    // Getter 方法
    public String getLicensePlate() {
        return licensePlate;
    }
    public String getBrand() {
        return brand;
    }
    public String getModel() {
        return model;
    }
    public double getDailyRent() {
        return dailyRent;
    }
    public boolean isRented() {
        return isRented;
    }
    // Setter 方法
    public void setBrand(String brand) {
        this.brand = brand;
    }
    public void setModel(String model) {
        this.model = model;
    }
    public void setDailyRent(double dailyRent) {
        if (dailyRent > 0) {
            this.dailyRent = dailyRent;
        } else {
            System.out.println("日租金必须大于 0，保持原值");
        }
    }
    // 业务方法
    public void rentCar() {
        if (isRented) {
            System.out.println("车辆已租出，无法再次租用");
        } else {
            isRented = true;
            System.out.println("车辆租用成功");
        }
    }
    public void returnCar() {
        if (!isRented) {
            System.out.println("车辆未被租用，无需归还");
        } else {
            isRented = false;
            System.out.println("车辆归还成功");
        }
    }
    public double calculateRent(int days) {
        return dailyRent * days;
    }
    // 显示车辆信息
    public void displayInfo() {
        System.out.println("车牌号: " + licensePlate);
        System.out.println("品牌: " + brand);
        System.out.println("型号: " + model);
        System.out.println("日租金: " + dailyRent + " 元/天");
        System.out.println("状态: " + (isRented ? "已租出" : "可租"));
        System.out.println();
    }
    // 静态方法，返回总车辆数
    public static int getTotalCars() {
        return totalCars;
    }
 }
--- a/Java-1test/src/main/java/com/rental/TestCar.java
+++ b/Java-1test/src/main/java/com/rental/TestCar.java
@ -0,0 +1,48 @@
 package com.rental;
 /**
  * Console walkthrough of {@code Car}: lists three cars, rents and returns one of them,
  * computes a five-day rent, tries an invalid and a valid daily rent, and prints the car count.
  */
 public class TestCar {
    /** Rule printed under every section title. */
    private static final String RULE = "------------------------";

    public static void main(String[] args) {
        // Three cars; the second one relies on the default daily rent.
        Car first = new Car("京A12345", "宝马", "5系", 500.0);
        Car second = new Car("京B67890", "奔驰", "C级");
        Car third = new Car("京C54321", "奥迪", "A4L", 450.0);

        // Show every car.
        section("所有车辆信息：");
        for (Car each : new Car[] {first, second, third}) {
            each.displayInfo();
        }

        // Rent twice, then return twice: the second call of each pair should only print a notice.
        section("测试车辆租用和归还：");
        System.out.println("测试 car1：");
        first.rentCar();
        first.rentCar();
        first.returnCar();
        first.returnCar();
        System.out.println();

        // Rent for five days.
        section("计算租金：");
        double fiveDayCharge = first.calculateRent(5);
        System.out.println("car1 租用 5 天的费用：" + fiveDayCharge + " 元");
        System.out.println();

        // An invalid rent must be ignored, a valid one applied.
        section("测试修改日租金：");
        System.out.println("尝试将 car2 的日租金修改为 -100：");
        second.setDailyRent(-100);
        reportDailyRent(second);
        System.out.println("尝试将 car2 的日租金修改为 400：");
        second.setDailyRent(400);
        reportDailyRent(second);
        System.out.println();

        // Class-wide counter.
        System.out.println("总车辆数：" + Car.getTotalCars());
    }

    /** Prints a section title followed by the rule line. */
    private static void section(String title) {
        System.out.println(title);
        System.out.println(RULE);
    }

    /** Prints the current daily rent of the second demo car. */
    private static void reportDailyRent(Car car) {
        System.out.println("car2 当前日租金：" + car.getDailyRent() + " 元/天");
    }
 }
--- a/Java.实验
+++ b/Java.实验
--- a/project/AddRegressionColumns.java
+++ b/project/AddRegressionColumns.java
@ -0,0 +1,224 @@
 import org.apache.poi.ss.usermodel.*;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import java.io.*;
 import java.util.*;
 import java.util.regex.*;
 /**
  * Appends the regression columns Y and X1..X6 to the first sheet of an Excel workbook and
  * writes the result to a new file.
  *
  * <ul>
  *   <li>Y  - value of the column whose header contains "helpfull"/"helpful"</li>
  *   <li>X1 - value of the column whose header contains "评论总数"</li>
  *   <li>X2 - mean comment length (characters, ASCII and ideographic spaces removed)</li>
  *   <li>X3 - mean number of whitespace-separated tokens per comment</li>
  *   <li>X4 - X2 / X3, or 0 when X3 is 0</li>
  *   <li>X5 - mean sentiment (+1 positive keyword, -1 negative keyword, 0 otherwise)</li>
  *   <li>X6 - mean richness (one point each for a digit, a link, an emoji/emoticon)</li>
  * </ul>
  *
  * <p>Usage: {@code AddRegressionColumns [inputFile [outputFile]]}; the built-in paths are used
  * for arguments that are not supplied.
  */
 public class AddRegressionColumns {
    private static final String DEFAULT_INPUT_FILE = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.xlsx";
    private static final String DEFAULT_OUTPUT_FILE = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）_回归.xlsx";

    private static final Pattern DIGIT_PATTERN = Pattern.compile("\\d");
    private static final Pattern URL_PATTERN = Pattern.compile("http[s]?://|www\\.");
    // Code-point ranges instead of surrogate escapes: inside a character class Java fuses
    // "\\uDBFF\\uDC00" into one supplementary code point, which turned the old range into
    // U+D83C..U+10FC00 and made full-width punctuation count as an emoji.
    private static final Pattern EMOJI_PATTERN =
            Pattern.compile("[\\u2600-\\u27BF\\x{1F300}-\\x{1F9FF}]|[:;][-]?[)D]");

    private static final String[] POSITIVE_WORDS = {"好", "棒", "优秀", "喜欢", "满意", "赞", "positive", "good", "great", "excellent", "love", "like"};
    private static final String[] NEGATIVE_WORDS = {"差", "糟糕", "不好", "失望", "不满", "negative", "bad", "terrible", "poor", "hate", "dislike"};

    /**
     * Reads the workbook, appends the seven regression columns and saves the copy.
     *
     * @param args optional: {@code args[0]} input workbook, {@code args[1]} output workbook
     */
    public static void main(String[] args) {
        String inputFile = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;
        String outputFile = args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;
        System.out.println("========================================");
        System.out.println("  在原表中添加回归数据列");
        System.out.println("========================================");
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();
        System.out.println("读取输入文件...");
        // try-with-resources: stream and workbook are released even when processing fails.
        try (FileInputStream fis = new FileInputStream(inputFile);
             Workbook wb = new XSSFWorkbook(fis)) {
            Sheet sheet = wb.getSheetAt(0);
            int totalRows = sheet.getLastRowNum();
            System.out.println("总行数: " + totalRows);
            Row headerRow = sheet.getRow(0);
            if (headerRow == null) {
                System.out.println("错误: 第一行没有表头，无法识别列");
                return;
            }
            // getLastCellNum() is -1 for a row without cells; never use a negative column index.
            int totalCols = Math.max(headerRow.getLastCellNum(), 0);
            System.out.println("总列数: " + totalCols);

            // Locate the source columns by header text.
            int helpfullCol = -1;
            int commentCountCol = -1;
            List<Integer> commentCols = new ArrayList<>();
            for (int i = 0; i < totalCols; i++) {
                String header = headerText(headerRow.getCell(i)).toLowerCase();
                if (header.contains("helpfull") || header.contains("helpful")) {
                    helpfullCol = i;
                    System.out.println("找到 Y 列 (helpfull): 列 " + i);
                } else if (header.contains("评论总数") || header.contains("帖子评论总数")) {
                    commentCountCol = i;
                    System.out.println("找到 X1 列 (评论总数): 列 " + i);
                } else if (header.contains("评论") && header.contains("内容")) {
                    for (int j = 1; j <= 5; j++) {
                        if (header.contains(String.valueOf(j))) {
                            commentCols.add(i);
                            System.out.println("找到评论列 " + commentCols.size() + ": 列 " + i + " - " + header);
                            break;
                        }
                    }
                }
            }
            System.out.println("\n共找到 " + commentCols.size() + " 个评论列");

            // New columns go directly after the existing ones.
            int yCol = totalCols;
            int x1Col = totalCols + 1;
            int x2Col = totalCols + 2;
            int x3Col = totalCols + 3;
            int x4Col = totalCols + 4;
            int x5Col = totalCols + 5;
            int x6Col = totalCols + 6;
            headerRow.createCell(yCol).setCellValue("Y");
            headerRow.createCell(x1Col).setCellValue("X1");
            headerRow.createCell(x2Col).setCellValue("X2");
            headerRow.createCell(x3Col).setCellValue("X3");
            headerRow.createCell(x4Col).setCellValue("X4");
            headerRow.createCell(x5Col).setCellValue("X5");
            headerRow.createCell(x6Col).setCellValue("X6");

            System.out.println("\n处理数据...");
            for (int i = 1; i <= totalRows; i++) {
                if (i % 1000 == 0) {
                    System.out.println("处理第 " + i + "/" + totalRows + " 行...");
                }
                Row row = sheet.getRow(i);
                if (row == null) {
                    continue;
                }
                row.createCell(yCol).setCellValue(numericOrZero(row, helpfullCol));
                row.createCell(x1Col).setCellValue(numericOrZero(row, commentCountCol));

                List<Double> lengths = new ArrayList<>();
                List<Double> complexities = new ArrayList<>();
                List<Double> sentiments = new ArrayList<>();
                List<Double> richnessList = new ArrayList<>();
                for (int colIdx : commentCols) {
                    Cell cell = row.getCell(colIdx);
                    if (cell == null) {
                        continue;
                    }
                    String content = textOf(cell);
                    if (content == null || content.isEmpty() || content.equals("nan") || content.equals("null")) {
                        continue;
                    }
                    double length = content.replace(" ", "").replace("\u3000", "").length();
                    if (length == 0) {
                        // A cell holding only spaces is not a comment; counting it would pull
                        // every average towards 0.
                        continue;
                    }
                    lengths.add(length);
                    complexities.add((double) tokenCount(content));
                    sentiments.add(sentimentOf(content));
                    richnessList.add(richnessOf(content));
                }

                // Averages over the comments of this row (0 when the row has none).
                double x2 = mean(lengths);
                double x3 = mean(complexities);
                double x5 = mean(sentiments);
                double x6 = mean(richnessList);
                double x4 = (x3 > 0) ? x2 / x3 : 0;
                row.createCell(x2Col).setCellValue(x2);
                row.createCell(x3Col).setCellValue(x3);
                row.createCell(x4Col).setCellValue(x4);
                row.createCell(x5Col).setCellValue(x5);
                row.createCell(x6Col).setCellValue(x6);
            }

            System.out.println("\n保存文件...");
            try (FileOutputStream fos = new FileOutputStream(outputFile)) {
                wb.write(fos);
            }
            File output = new File(outputFile);
            if (output.exists()) {
                System.out.println("文件保存成功！");
                System.out.println("文件大小: " + (output.length() / 1024) + " KB");
            }
            System.out.println("\n========================================");
            System.out.println("  任务完成");
            System.out.println("========================================");
        } catch (Exception e) {
            System.out.println("错误: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Returns the text of a header cell, or an empty string for a missing or non-text cell. */
    private static String headerText(Cell cell) {
        if (cell == null) {
            return "";
        }
        try {
            String text = cell.getStringCellValue();
            return text == null ? "" : text;
        } catch (RuntimeException ignored) {
            // Numeric/boolean header: it cannot match any of the searched names.
            return "";
        }
    }

    /** Returns the numeric value at {@code col}, or 0 when the column is unknown, empty or not numeric. */
    private static double numericOrZero(Row row, int col) {
        if (col < 0) {
            return 0;
        }
        Cell cell = row.getCell(col);
        if (cell == null) {
            return 0;
        }
        try {
            return cell.getNumericCellValue();
        } catch (RuntimeException ignored) {
            // Text cell where a number was expected: treated as 0.
            return 0;
        }
    }

    /** Returns the cell content as text, falling back to its numeric value, then to an empty string. */
    private static String textOf(Cell cell) {
        try {
            return cell.getStringCellValue();
        } catch (RuntimeException notText) {
            try {
                return String.valueOf(cell.getNumericCellValue());
            } catch (RuntimeException ignored) {
                return "";
            }
        }
    }

    /** Counts whitespace-separated tokens; surrounding whitespace does not create empty tokens. */
    private static int tokenCount(String content) {
        String trimmed = content.trim();
        return trimmed.isEmpty() ? 0 : trimmed.split("\\s+").length;
    }

    /** Returns 1 if a positive keyword occurs, otherwise -1 if a negative keyword occurs, otherwise 0. */
    private static double sentimentOf(String content) {
        String lowerContent = content.toLowerCase();
        for (String word : POSITIVE_WORDS) {
            if (lowerContent.contains(word)) {
                return 1;
            }
        }
        for (String word : NEGATIVE_WORDS) {
            if (lowerContent.contains(word)) {
                return -1;
            }
        }
        return 0;
    }

    /** Returns the number of feature kinds present in the text: digit, link, emoji/emoticon (0..3). */
    private static double richnessOf(String content) {
        double richness = 0;
        if (DIGIT_PATTERN.matcher(content).find()) {
            richness += 1;
        }
        if (URL_PATTERN.matcher(content).find()) {
            richness += 1;
        }
        if (EMOJI_PATTERN.matcher(content).find()) {
            richness += 1;
        }
        return richness;
    }

    /** Returns the arithmetic mean of the values, or 0 for an empty list. */
    private static double mean(List<Double> values) {
        return values.stream().mapToDouble(Double::doubleValue).average().orElse(0);
    }
 }
--- a/project/DataCleaner.java
+++ b/project/DataCleaner.java
@ -1,7 +1,3 @@
 package com.project.util;
 import com.project.model.PostInfo;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
--- a/project/DataCleaningScript.java
+++ b/project/DataCleaningScript.java
@ -0,0 +1,226 @@
 import java.io.*;
 import java.time.LocalDate;
 import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 /**
  * Reads post records from a comma-separated text file, cleans the text fields, drops records
  * without title or content, and writes the result as a UTF-8 CSV file with a BOM.
  *
  * <p>Usage: {@code DataCleaningScript [inputFile [outputFile]]}; the built-in paths are used
  * for arguments that are not supplied.
  */
 public class DataCleaningScript {
    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.CHINA);
    private static final String DEFAULT_INPUT_FILE = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
    private static final String DEFAULT_OUTPUT_FILE = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.csv";

    /**
     * Runs the read / clean / save pipeline.
     *
     * @param args optional: {@code args[0]} input file, {@code args[1]} output file
     */
    public static void main(String[] args) {
        String inputFile = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;
        String outputFile = args.length > 1 ? args[1] : DEFAULT_OUTPUT_FILE;
        System.out.println("========================================");
        System.out.println("  数据清洗脚本");
        System.out.println("========================================");
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();
        List<PostInfo> rawPosts = readExcelData(inputFile);
        System.out.println("读取数据完成，共 " + rawPosts.size() + " 条记录");
        List<PostInfo> cleanedPosts = cleanPosts(rawPosts);
        System.out.println("数据清洗完成，有效记录: " + cleanedPosts.size() + " 条");
        saveToCSV(cleanedPosts, outputFile);
        System.out.println("数据保存完成！");
        System.out.println();
        System.out.println("========================================");
        System.out.println("  数据清洗任务完成");
        System.out.println("========================================");
    }

    /**
     * Reads the input line by line as UTF-8 text, skips the header line and parses each
     * remaining line as one comma-separated record.
     *
     * <p>NOTE(review): despite the method name and the default ".xlsx" path, the file is read
     * as plain text. A real .xlsx workbook is a binary archive and will not yield usable
     * records here; confirm that the input really is CSV text.
     *
     * @param filePath file to read
     * @return the parsed records; empty when the file cannot be read
     */
    private static List<PostInfo> readExcelData(String filePath) {
        List<PostInfo> posts = new ArrayList<>();
        int skipped = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            boolean isFirstLine = true;
            while ((line = reader.readLine()) != null) {
                if (isFirstLine) {
                    isFirstLine = false;
                    continue;
                }
                String[] parts = parseCSVLine(line);
                if (parts.length < 9) {
                    skipped++;
                    continue;
                }
                PostInfo post = parsePostInfo(parts);
                if (post == null) {
                    skipped++;
                } else {
                    posts.add(post);
                }
            }
        } catch (IOException e) {
            System.err.println("读取文件时出错: " + e.getMessage());
        }
        if (skipped > 0) {
            // Report dropped rows once instead of discarding them silently.
            System.err.println("跳过无法解析的记录: " + skipped + " 条");
        }
        return posts;
    }

    /**
     * Splits one CSV line into trimmed fields. Commas inside double quotes do not split, and a
     * doubled quote inside a quoted field yields one literal quote character.
     *
     * @param line one line of CSV text
     * @return the fields of the line, at least one element
     */
    private static String[] parseCSVLine(String line) {
        List<String> fields = new ArrayList<>();
        StringBuilder currentField = new StringBuilder();
        boolean inQuotes = false;
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == '"') {
                if (inQuotes && i + 1 < line.length() && line.charAt(i + 1) == '"') {
                    // Escaped quote ("") inside a quoted field.
                    currentField.append('"');
                    i++;
                } else {
                    inQuotes = !inQuotes;
                }
            } else if (c == ',' && !inQuotes) {
                fields.add(currentField.toString().trim());
                currentField.setLength(0);
            } else {
                currentField.append(c);
            }
        }
        fields.add(currentField.toString().trim());
        return fields.toArray(new String[0]);
    }

    /**
     * Builds a record from the first nine fields: title, content, author, date (yyyy-MM-dd),
     * like count, comment count, view count, tags, sentiment.
     *
     * @param parts fields of one record, at least nine
     * @return the record, or {@code null} when a field cannot be converted (e.g. a malformed date)
     */
    private static PostInfo parsePostInfo(String[] parts) {
        try {
            PostInfo post = new PostInfo();
            post.setTitle(parts[0]);
            post.setContent(parts[1]);
            post.setAuthor(parts[2]);
            if (!parts[3].isEmpty()) {
                post.setPostDate(LocalDate.parse(parts[3], DATE_FORMATTER));
            }
            post.setLikeCount(parseInt(parts[4]));
            post.setCommentCount(parseInt(parts[5]));
            post.setViewCount(parseInt(parts[6]));
            post.setTags(parts[7]);
            post.setSentiment(parts[8]);
            return post;
        } catch (RuntimeException e) {
            // The caller counts and reports rejected rows.
            return null;
        }
    }

    /** Parses a decimal integer; {@code null}, empty or malformed input yields 0. */
    private static int parseInt(String value) {
        try {
            if (value == null || value.isEmpty()) {
                return 0;
            }
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /** Cleans every record and keeps those that still have a title and content. */
    private static List<PostInfo> cleanPosts(List<PostInfo> rawPosts) {
        List<PostInfo> cleanedPosts = new ArrayList<>();
        for (PostInfo post : rawPosts) {
            PostInfo cleaned = cleanPost(post);
            if (isValidPost(cleaned)) {
                cleanedPosts.add(cleaned);
            }
        }
        return cleanedPosts;
    }

    /** Returns a cleaned copy of the record; date and counters are copied unchanged. */
    private static PostInfo cleanPost(PostInfo post) {
        PostInfo cleaned = new PostInfo();
        cleaned.setTitle(cleanText(post.getTitle()));
        cleaned.setContent(cleanContent(post.getContent()));
        cleaned.setAuthor(cleanText(post.getAuthor()));
        cleaned.setPostDate(post.getPostDate());
        cleaned.setLikeCount(post.getLikeCount());
        cleaned.setCommentCount(post.getCommentCount());
        cleaned.setViewCount(post.getViewCount());
        cleaned.setTags(cleanText(post.getTags()));
        cleaned.setSentiment(normalizeSentiment(post.getSentiment()));
        return cleaned;
    }

    /** Trims the text and collapses every whitespace run to one space; {@code null} yields "". */
    private static String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return text.trim().replaceAll("\\s+", " ");
    }

    /**
     * Removes HTML tags, {@code [...]} and {@code (...)} groups from the content and normalizes
     * whitespace; {@code null} yields "".
     */
    private static String cleanContent(String content) {
        if (content == null) {
            return "";
        }
        return content
                // Collapse first so line breaks inside [...] or (...) become spaces: the
                // removal patterns below do not match across line breaks.
                .replaceAll("\\s+", " ")
                .replaceAll("<[^>]+>", "")
                .replaceAll("\\[.*?\\]", "")
                .replaceAll("\\(.*?\\)", "")
                // Removing markup can leave double or surrounding blanks; tidy up last.
                .replaceAll("\\s+", " ")
                .trim();
    }

    /** Maps a free-form sentiment label to "积极", "消极" or "中性" (the default). */
    private static String normalizeSentiment(String sentiment) {
        if (sentiment == null || sentiment.isEmpty()) {
            return "中性";
        }
        String lower = sentiment.toLowerCase();
        if (lower.contains("积极") || lower.contains("正面") || lower.contains("positive")) {
            return "积极";
        } else if (lower.contains("消极") || lower.contains("负面") || lower.contains("negative")) {
            return "消极";
        } else {
            return "中性";
        }
    }

    /** A record is valid when both its title and its content are non-empty. */
    private static boolean isValidPost(PostInfo post) {
        return post.getTitle() != null && !post.getTitle().isEmpty() &&
               post.getContent() != null && !post.getContent().isEmpty();
    }

    /**
     * Writes the records as UTF-8 CSV with a byte-order mark and a header line, creating the
     * parent directory when necessary. Nothing is written for an empty list.
     *
     * @param posts    records to save
     * @param filePath destination file
     */
    private static void saveToCSV(List<PostInfo> posts, String filePath) {
        if (posts == null || posts.isEmpty()) {
            System.out.println("没有数据需要保存");
            return;
        }
        try {
            File file = new File(filePath);
            File parentDir = file.getParentFile();
            if (parentDir != null && !parentDir.exists()) {
                parentDir.mkdirs();
            }
            try (BufferedWriter writer = new BufferedWriter(
                    new FileWriter(file, java.nio.charset.StandardCharsets.UTF_8))) {
                writer.write("\uFEFF"); // BOM for UTF-8
                writer.write("标题,内容,作者,发布日期,点赞数,评论数,浏览量,标签,情感倾向\n");
                for (PostInfo post : posts) {
                    writer.write(post.toCSV());
                    writer.write("\n");
                }
            }
            System.out.println("数据已保存到: " + filePath);
        } catch (IOException e) {
            System.err.println("保存CSV文件时出错: " + e.getMessage());
        }
    }
 }
--- a/project/DataStorage.java
+++ b/project/DataStorage.java
@ -1,7 +1,3 @@
 package com.project.storage;
 import com.project.model.PostInfo;
 import java.io.BufferedWriter;
 import java.io.FileWriter;
 import java.io.IOException;
--- a/project/DuoTai.java
+++ b/project/DuoTai.java
@ -0,0 +1,3 @@
 /** Empty placeholder class: it declares no fields, constructors or methods. */
 public class DuoTai {
 }
--- a/project/ExcelReader.java
+++ b/project/ExcelReader.java
@ -1,7 +1,3 @@
 package com.project.reader;
 import com.project.model.PostInfo;
 import java.io.*;
 import java.time.LocalDate;
 import java.time.format.DateTimeFormatter;
--- a/project/PostInfo.java
+++ b/project/PostInfo.java
@ -1,5 +1,3 @@
 package com.project.model;
 import java.time.LocalDate;
 public class PostInfo {
--- a/project/ProcessRegressionData.java
+++ b/project/ProcessRegressionData.java
@ -0,0 +1,50 @@
 import java.io.*;
 import java.util.*;
 import java.util.regex.*;
 /**
  * Informational stub: checks that the input workbook exists, prints its size and a
  * description of the regression-data processing steps, and points to the Python scripts
  * that perform the processing. It does not read or write the workbook itself.
  */
 public class ProcessRegressionData {
    /** Rule framing the section titles. */
    private static final String BANNER = "========================================";

    public static void main(String[] args) {
        final String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.xlsx";
        final String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）_回归.xlsx";

        printLines(
                BANNER,
                "  处理回归数据",
                BANNER,
                "输入文件: " + inputFile,
                "输出文件: " + outputFile,
                "");

        // Nothing else is printed when the input workbook is missing.
        File source = new File(inputFile);
        if (!source.exists()) {
            System.out.println("错误: 输入文件不存在！");
            return;
        }

        printLines(
                "输入文件大小: " + (source.length() / 1024) + " KB",
                "\n注意: 这是一个简化版本，用于演示处理逻辑。",
                "实际处理需要使用Apache POI库来读取和写入Excel文件。",
                "",
                "处理逻辑:",
                "1. 读取原始数据",
                "2. 识别列: helpfull( Y ), 帖子评论总数( X1 ), 评论1-5内容列",
                "3. 计算 X2-X6:",
                "   - X2: 评论长度平均值（剔空格后的字符数）",
                "   - X3: 评论复杂度平均值（按空格拆分的分词数）",
                "   - X4: X2/X3（X3为0时记0）",
                "   - X5: 情感性平均值（正面=1、中性=0、负面=-1）",
                "   - X6: 信息丰富度平均值（含数字/链接/表情各1分）",
                "4. 数据清洗: 确保所有值为纯数字，无空值/错误值",
                "5. 保存到新文件",
                "",
                "由于数据量较大(3万+行)，建议使用Python的pandas库处理。",
                "请确保Python脚本能够完整执行，可能需要增加内存或分批处理。",
                "",
                BANNER,
                "  建议使用以下Python命令运行",
                BANNER,
                "cd d:\\java\\project",
                "python process_300_rows.py  (测试前300行)",
                "python process_all_rows.py   (处理全部数据)");
    }

    /** Prints each argument on its own line; an empty string produces a blank line. */
    private static void printLines(String... lines) {
        for (String each : lines) {
            System.out.println(each);
        }
    }
 }
--- a/project/SimpleDataCleaner.java
+++ b/project/SimpleDataCleaner.java
@ -0,0 +1,59 @@
 import java.io.*;
 import java.util.ArrayList;
 import java.util.List;
 /**
  * Copies the input workbook byte for byte to the output path. No parsing or cleaning of the
  * content takes place: the bytes are duplicated under a new file name.
  */
 public class SimpleDataCleaner {
    /** Rule framing the section titles. */
    private static final String BANNER = "========================================";

    public static void main(String[] args) {
        final String inputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子原始信息计量实验使用.xlsx";
        final String outputFile = "D:\\计量经济学\\计量实验资料及作业要求\\计量实验资料及作业要求\\图文帖子实验数据（新）.csv";

        System.out.println(BANNER);
        System.out.println("  简单数据清洗脚本");
        System.out.println(BANNER);
        System.out.println("输入文件: " + inputFile);
        System.out.println("输出文件: " + outputFile);
        System.out.println();

        // Stop early when there is nothing to copy.
        File source = new File(inputFile);
        if (!source.exists()) {
            System.out.println("错误: 输入文件不存在！");
            return;
        }
        System.out.println("文件大小: " + (source.length() / 1024) + " KB");

        // .xlsx is a binary format, so the file is duplicated as raw bytes;
        // a real conversion would need a library such as Apache POI.
        try {
            File target = new File(outputFile);
            File targetDir = target.getParentFile();
            boolean dirMissing = targetDir != null && !targetDir.exists();
            if (dirMissing) {
                targetDir.mkdirs();
            }
            copyBytes(source, target);
            System.out.println("文件已成功复制并重命名为: " + outputFile);
            System.out.println();
            System.out.println(BANNER);
            System.out.println("  任务完成");
            System.out.println(BANNER);
        } catch (IOException e) {
            System.err.println("处理文件时出错: " + e.getMessage());
        }
    }

    /** Streams the content of {@code source} into {@code target} in 1 KiB chunks. */
    private static void copyBytes(File source, File target) throws IOException {
        try (FileInputStream in = new FileInputStream(source);
             FileOutputStream out = new FileOutputStream(target)) {
            byte[] chunk = new byte[1024];
            for (int count = in.read(chunk); count > 0; count = in.read(chunk)) {
                out.write(chunk, 0, count);
            }
        }
    }
 }
--- a/project/add_regression_columns.py
+++ b/project/add_regression_columns.py
@ -0,0 +1,189 @@
 import os
 import pandas as pd
 import re
 # Script: read the workbook, append the regression columns Y and X1..X6, save a copy.
 # Workbook to read and workbook to write
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 # Keyword lists and patterns for the comment metrics; built once instead of once per comment
 POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
 NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
 DIGIT_RE = re.compile(r'\d')
 URL_RE = re.compile(r'http[s]?://|www\.')
 # The former surrogate-pair alternative was dropped: Python 3 strings hold emoji as single
 # code points, so a high/low surrogate sequence never occurs in well-formed text.
 EMOJI_RE = re.compile(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]')
 print("========================================")
 print("  在原表中添加回归数据列")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Stop when the input workbook is missing
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    # SystemExit instead of exit(): exit() is injected by the site module and may be absent.
    raise SystemExit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列名: {list(df.columns)}")
    # Locate the source columns by header text
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        # NOTE(review): any header with '评论' and a digit 1-5 is taken as a comment text
        # column; the Java variant additionally requires '内容' - confirm against the sheet.
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    print("\n添加回归数据列...")
    # Y: usefulness value. "is not None" so that a falsy column label (e.g. 0) still counts.
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col is not None:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0
    # X1: number of comments
    print("2. 添加 X1 (评论数量)")
    if comment_count_col is not None:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0

    def calculate_comment_metrics(content):
        """Return (length, complexity, sentiment, richness) for one comment cell.

        length: characters after removing ASCII and ideographic spaces.
        complexity: number of whitespace-separated tokens.
        sentiment: 1 if a positive keyword occurs, else -1 if a negative one occurs, else 0.
        richness: one point each for a digit, a link and an emoji/emoticon.
        Missing or placeholder cells ('None', 'nan', '') yield four zeros.
        """
        if pd.isna(content) or str(content) in ['None', 'nan', '']:
            return 0, 0, 0, 0
        content = str(content)
        length = len(content.replace(' ', '').replace('\u3000', ''))
        complexity = len(content.split())
        sentiment = 0
        lower_content = content.lower()
        if any(word in lower_content for word in POSITIVE_WORDS):
            sentiment = 1
        elif any(word in lower_content for word in NEGATIVE_WORDS):
            sentiment = -1
        richness = 0
        if DIGIT_RE.search(content):
            richness += 1
        if URL_RE.search(content):
            richness += 1
        if EMOJI_RE.search(content):
            richness += 1
        return length, complexity, sentiment, richness

    print("3. 计算评论相关指标...")
    # One object array for all comment cells: avoids building a row Series per cell and a
    # scalar df.loc write per value, which dominated the run time on large sheets.
    comment_values = df[comment_cols].to_numpy(dtype=object)
    total_rows = len(df)
    x2_values = []  # mean comment length
    x3_values = []  # mean comment complexity
    x5_values = []  # mean sentiment
    x6_values = []  # mean richness
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f"  处理到第 {i}/{total_rows} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for content in comment_values[i]:
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            # Cells without visible characters are not counted as comments
            if length > 0:
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        if lengths:
            x2_values.append(sum(lengths) / len(lengths))
            x3_values.append(sum(complexities) / len(complexities))
            x5_values.append(sum(sentiments) / len(sentiments))
            x6_values.append(sum(richness) / len(richness))
        else:
            x2_values.append(0.0)
            x3_values.append(0.0)
            x5_values.append(0.0)
            x6_values.append(0.0)
    df['X2'] = x2_values
    df['X3'] = x3_values
    df['X5'] = x5_values
    df['X6'] = x6_values
    # X4: readability = X2 / X3, 0 when X3 is 0
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = [x2 / x3 if x3 > 0 else 0 for x2, x3 in zip(x2_values, x3_values)]
    # Final pass: force every regression column to plain finite numbers
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f"  {col}: {null_count} 个空值")
    print("\n7. 保存文件...")
    df.to_excel(output_file, index=False)
    # Read the saved workbook back as a sanity check
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/basic_test.py
+++ b/project/basic_test.py
@ -0,0 +1,32 @@
 # Basic environment check: print interpreter details and list the data directory.
 import os
 import sys
 # Separator line reused by the header and footer banners
 rule = "========================================"
 print(rule)
 print("  基本测试")
 print(rule)
 print(f"当前目录: {os.getcwd()}")
 print("Python版本:")
 print(sys.version)
 # Report whether the data directory is reachable
 print("\n检查目录:")
 dir_path = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求'
 print(f"目录: {dir_path}")
 print(f"存在: {os.path.exists(dir_path)}")
 # Print the size in KB of each regular file among the first 15 directory entries
 if os.path.exists(dir_path):
    print("\n目录文件:")
    for entry in os.listdir(dir_path)[:15]:
        entry_path = os.path.join(dir_path, entry)
        if not os.path.isfile(entry_path):
            continue
        size_kb = os.path.getsize(entry_path) / 1024
        print(f"  {entry}: {size_kb:.2f} KB")
 print()
 print(rule)
 print("  测试完成")
 print(rule)
--- a/project/batch_process.py
+++ b/project/batch_process.py
@ -0,0 +1,219 @@
 import os
 import pandas as pd
 import re
 import gc
 # sys is imported here because this script's import header does not include it;
 # sys.exit is used instead of the interactive-only exit() helper injected by `site`.
 import sys
 print("=" * 60)
 print("  分批处理回归数据")
 print("=" * 60)
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print(f"输入文件: {input_file}")
 print()
 # Abort early when the input workbook is missing
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    sys.exit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Load the raw data; any read failure is reported with a traceback and ends the run
 print("\n正在读取原始数据...")
 try:
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
 except Exception as e:
    print(f"读取失败: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
 # Locate the source columns by name
 print("\n识别列...")
 helpfull_col = None
 comment_count_col = None
 comment_cols = []
 for column in df.columns:
    label = str(column)
    lowered = label.lower()
    is_helpful = any(key in lowered for key in ('helpfull', 'helpful'))
    is_total = any(key in label for key in ('评论总数', '帖子评论总数'))
    if is_helpful:
        helpfull_col = column
        print(f"找到 Y 列 (helpfull): {column}")
    elif is_total:
        comment_count_col = column
        print(f"找到 X1 列 (评论总数): {column}")
    elif '评论' in label and '内容' in label and any(str(n) in label for n in range(1, 6)):
        # Comment-content column: name mentions 评论, 内容 and one of the digits 1-5
        comment_cols.append(column)
        print(f"找到评论列 {len(comment_cols)}: {column}")
 print(f"\n共找到 {len(comment_cols)} 个评论内容列")
 # Add the regression columns
 print("\n添加回归数据列...")
 # Y (UGC usefulness): numeric copy of the helpfull column, 0 when the column is absent
 print("1. 添加 Y (UGC有用性)")
 df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
 # X1 (comment count): numeric copy of the comment-total column, 0 when the column is absent
 print("2. 添加 X1 (评论数量)")
 df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
 # 定义函数计算评论指标
 def calculate_comment_metrics(content):
    """Compute the per-comment metrics used to build X2, X3, X5 and X6.

    Args:
        content: one comment cell; may be missing (None/NaN) or any value
            convertible with str().

    Returns:
        A tuple (length, complexity, sentiment, richness):
        length      - character count after removing ASCII and full-width spaces
        complexity  - number of whitespace-separated tokens
        sentiment   - 1 positive, 0 neutral, -1 negative (keyword lookup)
        richness    - 0..3, one point each for a digit, a link, an emoji/emoticon
        Missing or empty content yields (0, 0, 0, 0).
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: length without spaces
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: token count when splitting on whitespace
    complexity = len(content.split())
    # X5: keyword sentiment (positive=1, neutral=0, negative=-1)
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)
    # Several negative keywords contain a positive one ('不好' has '好', '不满意' has
    # '满意', 'dislike' has 'like'). Blank the negative phrases out before looking
    # for positive keywords so those comments are no longer scored as positive.
    remainder = lower_content
    for word in negative_words:
        remainder = remainder.replace(word, ' ')
    if any(word in remainder for word in positive_words):
        # A genuinely positive keyword still wins over a negative one, as before
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0
    # X6: information richness, one point per feature present (max 3)
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
        richness += 1
    return length, complexity, sentiment, richness
 # Compute the comment-derived metrics
 print("3. 计算评论相关指标...")
 # Initialise the target columns as floats so the per-row averages fit without a dtype change
 df['X2'] = 0.0  # comment length
 df['X3'] = 0.0  # comment complexity
 df['X5'] = 0.0  # sentiment
 df['X6'] = 0.0  # information richness
 # Row-by-row computation, reported in batches
 total_rows = len(df)
 print(f"总数据行数: {total_rows}")
 batch_size = 5000
 num_batches = (total_rows + batch_size - 1) // batch_size  # ceiling division
 for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = min((batch + 1) * batch_size, total_rows)
    print(f"处理批次 {batch + 1}/{num_batches} (行 {start_idx} 到 {end_idx})...")
    for i in range(start_idx, end_idx):
        # Fetch the row once instead of rebuilding it for every comment column,
        # and translate the position into its index label so the .loc writes
        # below hit the same row even when the index is not 0..n-1.
        row = df.iloc[i]
        row_label = df.index[i]
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            length, complexity, sentiment, r = calculate_comment_metrics(row.get(col, ''))
            if length > 0:  # only comments with content take part in the averages
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # Average over the non-empty comments; rows without comments keep 0
        if lengths:
            df.loc[row_label, 'X2'] = sum(lengths) / len(lengths)
            df.loc[row_label, 'X3'] = sum(complexities) / len(complexities)
            df.loc[row_label, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[row_label, 'X6'] = sum(richness) / len(richness)
    # Free memory between batches
    gc.collect()
 # X4: readability = X2 / X3, recorded as 0 when X3 is not positive
 print("4. 计算 X4 (评论可读性)")
 def _readability(record):
    if record['X3'] > 0:
        return record['X2'] / record['X3']
    return 0
 df['X4'] = df.apply(_readability, axis=1)
 # Cleaning: force every regression column to plain finite numbers
 print("\n5. 数据清洗...")
 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
 for name in regression_cols:
    # Unparseable values become 0, then +/- infinity is replaced by 0
    numeric = pd.to_numeric(df[name], errors='coerce').fillna(0)
    df[name] = numeric
    df[name] = df[name].replace([float('inf'), float('-inf')], 0)
 # Validation summary
 print("\n6. 验证数据...")
 print(f"总行数: {len(df)}")
 print(f"总列数: {len(df.columns)}")
 print("\n回归数据列统计:")
 print(df[regression_cols].describe())
 print("\n前5行回归数据:")
 print(df[regression_cols].head())
 # Report remaining nulls per regression column
 print("\n空值检查:")
 for name in regression_cols:
    print(f"  {name}: {df[name].isnull().sum()} 个空值")
 # Save the workbook
 print("\n7. 保存文件...")
 print(f"正在保存到: {output_file}")
 # Track whether a write actually succeeded: a file left over from an earlier run
 # must not be reported as a successful save when both engines fail.
 saved = False
 try:
    # First choice: xlsxwriter engine
    df.to_excel(output_file, index=False, engine='xlsxwriter')
    saved = True
    print("文件保存成功！")
 except Exception as e:
    print(f"xlsxwriter保存失败: {e}")
    try:
        # Fallback: openpyxl engine
        print("尝试使用openpyxl引擎...")
        df.to_excel(output_file, index=False, engine='openpyxl')
        saved = True
        print("文件保存成功！")
    except Exception as e2:
        print(f"openpyxl保存也失败: {e2}")
        import traceback
        traceback.print_exc()
 # Verify the written file
 print("\n8. 验证文件...")
 if saved and os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    try:
        # Read the file back and report its shape
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    except Exception as e:
        print(f"验证文件时出错: {e}")
 else:
    print("文件保存失败！")
 print()
 print("=" * 60)
 print("  任务完成")
 print("=" * 60)
 if saved and os.path.exists(output_file):
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
--- a/project/calculate_regression_data.py
+++ b/project/calculate_regression_data.py
@ -0,0 +1,169 @@
 import os
 import pandas as pd
 import re
 # sys is imported here because this script's import header does not include it;
 # sys.exit replaces the interactive-only exit() helper.
 import sys
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  计算UGC回归数据")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Abort early when the input workbook is missing
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    sys.exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取原始数据
 try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Pick the per-comment columns: the name mentions 评论 plus one of the digits 1-5.
    # str() guards against non-string column labels, which would make `in` raise TypeError.
    # NOTE(review): unlike batch_process.py this does not require 内容 in the name, so any
    # other per-comment column would be treated as comment text - confirm against the sheet.
    comment_columns = [
        col for col in df.columns
        if '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6))
    ]
    print(f"\n找到评论列: {comment_columns}")
    # Build the regression table column by column
    regression_data = pd.DataFrame()
    # 1. Y (UGC usefulness), taken from the helpfull column
    print("\n1. 计算 Y (UGC有用性)")
    if 'helpfull' in df.columns:
        # Coerce text/blank cells to 0 instead of letting astype(float) abort the whole run
        regression_data['Y'] = pd.to_numeric(df['helpfull'], errors='coerce').fillna(0).astype(float)
        print(f"成功提取 Y 列，共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列，使用默认值 0")
        regression_data['Y'] = 0
    # 2. X1 (comment count), taken from the first column whose name mentions the comment total
    print("\n2. 计算 X1 (评论数量)")
    comment_count_columns = [
        col for col in df.columns
        if '评论总数' in str(col) or '帖子评论总数' in str(col)
    ]
    if comment_count_columns:
        regression_data['X1'] = pd.to_numeric(df[comment_count_columns[0]], errors='coerce').fillna(0).astype(float)
        print(f"成功提取 X1 列，使用列: {comment_count_columns[0]}")
    else:
        print("警告: 未找到评论总数列，使用默认值 0")
        regression_data['X1'] = 0
    # 3. X2 (comment length)
    print("\n3. 计算 X2 (评论长度)")
    def calculate_comment_length(row):
        """Average character count (ASCII spaces removed) over the row's non-empty comments; 0 if none."""
        texts = (str(row.get(col, '')) for col in comment_columns)
        lengths = [len(text.replace(' ', '')) for text in texts if text and text != 'nan']
        if not lengths:
            return 0
        return sum(lengths) / len(lengths)
    regression_data['X2'] = df.apply(calculate_comment_length, axis=1)
    # 4. X3 (comment complexity)
    print("\n4. 计算 X3 (评论复杂度)")
    def calculate_comment_complexity(row):
        """Average whitespace-separated token count over the row's non-empty comments; 0 if none."""
        texts = (str(row.get(col, '')) for col in comment_columns)
        counts = [len(text.split()) for text in texts if text and text != 'nan']
        if not counts:
            return 0
        return sum(counts) / len(counts)
    regression_data['X3'] = df.apply(calculate_comment_complexity, axis=1)
    # 5. X4 (readability) = X2 / X3, recorded as 0 when X3 is not positive
    print("\n5. 计算 X4 (评论可读性)")
    def _readability(record):
        if record['X3'] > 0:
            return record['X2'] / record['X3']
        return 0
    regression_data['X4'] = regression_data.apply(_readability, axis=1)
    # 6. X5 (content sentiment)
    print("\n6. 计算 X5 (内容情感性)")
    def calculate_sentiment(row):
        """Average keyword sentiment over the row's non-empty comments.

        Each comment scores 1 (positive keyword), -1 (negative keyword only) or 0;
        the row value is the mean of those scores, or 0 when there are no comments.
        """
        # Keyword lists are built once per row rather than once per comment
        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative']
        sentiments = []
        for col in comment_columns:
            content = str(row.get(col, ''))
            if content and content != 'nan':
                lower_content = content.lower()
                has_negative = any(word in lower_content for word in negative_words)
                # '不好' contains '好' and '不满意' contains '满意': blank the negative
                # phrases out first so they are not scored as positive.
                remainder = lower_content
                for word in negative_words:
                    remainder = remainder.replace(word, ' ')
                if any(word in remainder for word in positive_words):
                    sentiment = 1
                elif has_negative:
                    sentiment = -1
                else:
                    sentiment = 0
                sentiments.append(sentiment)
        return sum(sentiments) / len(sentiments) if sentiments else 0
    regression_data['X5'] = df.apply(calculate_sentiment, axis=1)
    # 7. X6 (information richness)
    print("\n7. 计算 X6 (信息丰富度)")
    def calculate_information_richness(row):
        """Average richness score (0-3) over the row's non-empty comments.

        A comment earns one point each for containing a digit, a link and an
        emoji/emoticon; the row value is the mean score, or 0 without comments.
        """
        richness_scores = []
        for col in comment_columns:
            content = str(row.get(col, ''))
            if content and content != 'nan':
                score = 0
                # Contains a digit
                if re.search(r'\d', content):
                    score += 1
                # Contains a link; also accepts scheme-less www. links, matching
                # calculate_comment_metrics in batch_process.py
                if re.search(r'http[s]?://|www\.', content):
                    score += 1
                # Contains an emoji or emoticon; the range now also covers U+1F300-U+1F9FF,
                # which the previous BMP-only range missed
                if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[:;][-]?[)D]', content):
                    score += 1
                richness_scores.append(score)
        return sum(richness_scores) / len(richness_scores) if richness_scores else 0
    regression_data['X6'] = df.apply(calculate_information_richness, axis=1)
    # Cleaning
    print("\n8. 数据清洗")
    # Coerce every regression column to numbers; unparseable cells become 0
    for name in regression_data.columns:
        numeric = pd.to_numeric(regression_data[name], errors='coerce')
        regression_data[name] = numeric.fillna(0)
    # Validation summary
    print("\n9. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print("数据类型:")
    print(regression_data.dtypes)
    print("\n前5行数据:")
    print(regression_data.head())
    # Save the regression table
    print("\n10. 保存文件")
    regression_data.to_excel(output_file, index=False)
    # Confirm the file is on disk and report its size
    if not os.path.exists(output_file):
        print("错误: 文件保存失败")
    else:
        print(f"文件已成功保存到: {output_file}")
        size_kb = os.path.getsize(output_file) / 1024
        print(f"文件大小: {size_kb:.2f} KB")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/check_data_structure.py
+++ b/project/check_data_structure.py
@ -0,0 +1,43 @@
 # Inspect the raw workbook: list its columns, first rows and dtypes.
 import os
 import sys
 import pandas as pd
 # File path
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 print("========================================")
 print("  检查数据结构")
 print("========================================")
 print(f"输入文件: {input_file}")
 print()
 # Abort early when the input workbook is missing; sys.exit replaces the
 # interactive-only exit() helper.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    sys.exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Load the raw data and describe its structure
 try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列数: {len(df.columns)}")
    # Numbered list of all column names
    print(f"\n所有列名:")
    for i, col in enumerate(df.columns, 1):
        print(f"{i}. {col}")
    print("\n前3行数据:")
    print(df.head(3))
    print("\n数据类型:")
    print(df.dtypes)
    print("\n========================================")
    print("  数据结构检查完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/check_excel_size.py
+++ b/project/check_excel_size.py
@ -0,0 +1,53 @@
 # Report file size and active-sheet dimensions of the input and output workbooks.
 import os
 import openpyxl
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 divider = "========================================"
 print(divider)
 print("  检查Excel文件大小")
 print(divider)
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Input workbook: size plus row/column counts of the active sheet
 if not os.path.exists(input_file):
    print("输入文件不存在！")
 else:
    input_kb = os.path.getsize(input_file) / 1024
    print(f"输入文件大小: {input_kb:.2f} KB")
    try:
        in_sheet = openpyxl.load_workbook(input_file).active
        print(f"输入文件行数: {in_sheet.max_row}")
        print(f"输入文件列数: {in_sheet.max_column}")
    except Exception as err:
        print(f"读取输入文件出错: {err}")
 # Output workbook: same figures plus a dump of the first 10 rows
 if not os.path.exists(output_file):
    print("输出文件不存在！")
 else:
    output_kb = os.path.getsize(output_file) / 1024
    print(f"\n输出文件大小: {output_kb:.2f} KB")
    try:
        out_sheet = openpyxl.load_workbook(output_file).active
        print(f"输出文件行数: {out_sheet.max_row}")
        print(f"输出文件列数: {out_sheet.max_column}")
        print("\n前10行数据:")
        for row_no in range(1, min(11, out_sheet.max_row + 1)):
            values = [
                out_sheet.cell(row=row_no, column=col_no).value
                for col_no in range(1, out_sheet.max_column + 1)
            ]
            print(f"行 {row_no}: {values}")
    except Exception as err:
        print(f"读取输出文件出错: {err}")
 print()
 print(divider)
 print("  检查完成")
 print(divider)
--- a/project/create_and_fill_data.py
+++ b/project/create_and_fill_data.py
@ -0,0 +1,69 @@
 # Create a CSV with the regression header (Y, X1-X6) and ten rows of sample data.
 import os
 import sys
 import csv
 # File path
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.csv'
 print("========================================")
 print("  创建并填充UGC回归数据")
 print("========================================")
 print(f"输出文件: {output_file}")
 print()
 # Make sure the output directory exists
 output_dir = os.path.dirname(output_file)
 print(f"输出目录: {output_dir}")
 print(f"目录存在: {os.path.exists(output_dir)}")
 if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        # exist_ok avoids a spurious failure if the directory appears between check and creation
        os.makedirs(output_dir, exist_ok=True)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        # sys.exit replaces the interactive-only exit() helper
        sys.exit(1)
 # Create and fill the CSV file
 try:
    print("\n创建并填充CSV文件...")
    with open(output_file, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        # Header row
        headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
        writer.writerow(headers)
        # Ten rows of sample data
        for i in range(1, 11):
            row = [
                i * 0.5,              # Y: UGC usefulness
                i * 2,                # X1: comment count
                i * 10,               # X2: comment length
                i * 2,                # X3: comment complexity
                5.0,                  # X4: readability
                (i % 3) - 1,          # X5: sentiment, cycles through 0, 1, -1
                round(i * 0.3, 1)     # X6: richness; rounded so 3 * 0.3 is written as 0.9, not 0.8999999999999999
            ]
            writer.writerow(row)
    print(f"文件已成功创建: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    # Read the file back and show its first five lines
    print("\n文件内容:")
    with open(output_file, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            if i >= 5:
                # Nothing further is printed, so stop reading
                break
            print(f"行 {i+1}: {row}")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/create_excel_with_data.py
+++ b/project/create_excel_with_data.py
@ -0,0 +1,86 @@
 # Create an Excel workbook with the regression header (Y, X1-X6) and ten rows of sample data.
 import os
 import sys
 import openpyxl
 # File path
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  创建Excel文件并填充数据")
 print("========================================")
 print(f"输出文件: {output_file}")
 print()
 # Make sure the output directory exists
 output_dir = os.path.dirname(output_file)
 print(f"输出目录: {output_dir}")
 print(f"目录存在: {os.path.exists(output_dir)}")
 if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        # exist_ok avoids a spurious failure if the directory appears between check and creation
        os.makedirs(output_dir, exist_ok=True)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        # sys.exit replaces the interactive-only exit() helper
        sys.exit(1)
 # Create the workbook
 try:
    print("\n创建Excel文件...")
    wb = openpyxl.Workbook()
    ws = wb.active
    # Header row
    headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers, 1):
        ws.cell(row=1, column=i, value=header)
    # Ten rows of sample data, written below the header
    print("填充示例数据...")
    for i in range(1, 11):
        ws.cell(row=i+1, column=1, value=i * 0.5)            # Y: UGC usefulness
        ws.cell(row=i+1, column=2, value=i * 2)              # X1: comment count
        ws.cell(row=i+1, column=3, value=i * 10)             # X2: comment length
        ws.cell(row=i+1, column=4, value=i * 2)              # X3: comment complexity
        ws.cell(row=i+1, column=5, value=5.0)                # X4: readability
        ws.cell(row=i+1, column=6, value=(i % 3) - 1)        # X5: sentiment, cycles through 0, 1, -1
        # X6: richness; rounded so 3 * 0.3 is stored as 0.9, not 0.8999999999999999
        ws.cell(row=i+1, column=7, value=round(i * 0.3, 1))
    # Save the workbook
    print("保存文件...")
    wb.save(output_file)
    print(f"文件已成功创建: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    # Verify by reopening the file
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件创建成功！")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"工作表名称: {ws_check.title}")
        print(f"行数: {ws_check.max_row}")
        print(f"列数: {ws_check.max_column}")
        # Show the first five rows (header included)
        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = [
                ws_check.cell(row=row, column=col).value
                for col in range(1, ws_check.max_column + 1)
            ]
            print(f"行 {row}: {row_data}")
    else:
        print("文件创建失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/create_regression_data.py
+++ b/project/create_regression_data.py
@ -0,0 +1,112 @@
 # Build the UGC regression workbook: Y and X1 come from the raw data, X2-X6 are zero placeholders.
 import os
 import sys
 import pandas as pd
 import numpy as np
 import re
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  创建UGC回归数据文件")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Abort early when the input workbook is missing; sys.exit replaces the
 # interactive-only exit() helper.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    sys.exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Load the raw data
 try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    print()
    # New DataFrame holding only the regression columns
    regression_data = pd.DataFrame()
    # 1. Dependent variable Y from the helpfull column
    print("1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列，共 {len(regression_data['Y'])} 个值")
    else:
        print("警告: 未找到 helpfull 列，使用默认值 0")
        regression_data['Y'] = 0
    # 2. X1 from the comment-total column; str() guards against non-string
    # column labels, which would make `in` raise TypeError
    print("\n2. 提取X1 (评论总数列)")
    comment_columns = [col for col in df.columns if '评论' in str(col) and '总数' in str(col)]
    if comment_columns:
        regression_data['X1'] = df[comment_columns[0]].fillna(0)
        print(f"成功提取 X1 列，使用列: {comment_columns[0]}")
    else:
        print("警告: 未找到评论总数列，使用默认值 0")
        regression_data['X1'] = 0
    # 3. X2-X6: this script only creates the columns, filled with 0
    print("\n3. 计算X2-X6")
    placeholder_columns = [
        ('X2', '评论长度'),
        ('X3', '评论复杂度'),
        ('X4', '评论可读性'),
        ('X5', '内容情感性'),
        ('X6', '信息丰富度'),
    ]
    for name, label in placeholder_columns:
        print(f"   - 计算{name} ({label})")
        regression_data[name] = 0
    # 4. Cleaning: coerce every column to numbers, unparseable cells become 0
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    # 5. Validation summary
    print("\n5. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())
    # 6. Save the workbook
    print("\n6. 保存文件")
    regression_data.to_excel(output_file, index=False)
    # Confirm the file is on disk
    if os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/create_regression_data_v2.py
+++ b/project/create_regression_data_v2.py
@ -0,0 +1,142 @@
 # Build the UGC regression workbook (v2): Y and X1 come from the raw data, X2-X6 are zero placeholders.
 import os
 import sys
 import pandas as pd
 import numpy as np
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  创建UGC回归数据文件 v2")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Abort early when the input workbook is missing; sys.exit replaces the
 # interactive-only exit() helper.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    print(f"检查路径: {input_file}")
    sys.exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 print(f"文件存在: {os.path.exists(input_file)}")
 # Make sure the output directory exists
 output_dir = os.path.dirname(output_file)
 print(f"输出目录: {output_dir}")
 print(f"目录存在: {os.path.exists(output_dir)}")
 if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir, exist_ok=True)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        sys.exit(1)
 # Load the raw data
 try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Show the first rows to reveal the structure
    print("\n前3行数据:")
    print(df.head(3))
    # New DataFrame holding only the regression columns
    regression_data = pd.DataFrame()
    # 1. Dependent variable Y from the helpfull column
    print("\n1. 提取因变量Y (helpfull列)")
    if 'helpfull' in df.columns:
        regression_data['Y'] = df['helpfull'].fillna(0)
        print(f"成功提取 Y 列，共 {len(regression_data['Y'])} 个值")
        print(f"Y列前5个值: {list(regression_data['Y'].head())}")
    else:
        print("警告: 未找到 helpfull 列，使用默认值 0")
        regression_data['Y'] = 0
    # 2. X1 from the comment-total column
    print("\n2. 提取X1 (评论总数列)")
    comment_columns = [col for col in df.columns if '评论' in str(col)]
    print(f"找到评论相关列: {comment_columns}")
    # Prefer a column that names the total (总数). Taking the first column that merely
    # mentions 评论 can select a comment-text column instead of the count. The first
    # 评论 column is kept as a fallback so the previous behaviour still applies when
    # no total column exists.
    total_columns = [col for col in comment_columns if '总数' in str(col)]
    x1_candidates = total_columns or comment_columns
    if x1_candidates:
        x1_source = x1_candidates[0]
        regression_data['X1'] = df[x1_source].fillna(0)
        print(f"成功提取 X1 列，使用列: {x1_source}")
        print(f"X1列前5个值: {list(regression_data['X1'].head())}")
    else:
        print("警告: 未找到评论列，使用默认值 0")
        regression_data['X1'] = 0
    # 3. X2-X6: this script only creates the columns, filled with 0
    print("\n3. 计算X2-X6")
    placeholder_columns = [
        ('X2', '评论长度'),
        ('X3', '评论复杂度'),
        ('X4', '评论可读性'),
        ('X5', '内容情感性'),
        ('X6', '信息丰富度'),
    ]
    for name, label in placeholder_columns:
        print(f"   - 计算{name} ({label})")
        regression_data[name] = 0
    # 4. Cleaning: coerce every column to numbers, unparseable cells become 0
    print("\n4. 数据清洗")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    # 5. Validation summary
    print("\n5. 数据验证")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())
    # 6. Save the workbook
    print("\n6. 保存文件")
    print(f"保存路径: {output_file}")
    # Remember whether the write succeeded so that a file left over from an
    # earlier run is not reported as a fresh, successful save.
    saved = False
    try:
        regression_data.to_excel(output_file, index=False)
        saved = True
        print("文件保存成功")
    except Exception as e:
        print(f"保存文件失败: {e}")
    # Confirm the file is on disk
    if saved and os.path.exists(output_file):
        print(f"文件已成功保存到: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件保存失败，未找到输出文件")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/d
+++ b/project/d
--- a/project/data_cleaner.py
+++ b/project/data_cleaner.py
@ -0,0 +1,73 @@
 # Clean the raw workbook (blank fill, whitespace normalisation, sentiment labels) and export it as CSV.
 import os
 import sys
 import traceback
 import pandas as pd
 # Input and output file paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.csv'
 print("========================================")
 print("  Python 数据清洗脚本")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Abort early when the input workbook is missing; sys.exit replaces the
 # interactive-only exit() helper.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    sys.exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Read the Excel file
 try:
    print("正在读取Excel文件...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    # Cleaning
    print("正在清洗数据...")
    # 1. Replace missing values with empty strings
    df = df.fillna('')
    # 2. Trim text columns and collapse runs of whitespace into a single space
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str).str.strip()
            df[col] = df[col].str.replace('\\s+', ' ', regex=True)
    # 3. Normalise the sentiment label column, when present
    if '情感倾向' in df.columns:
        def normalize_sentiment(sentiment):
            """Map a free-text sentiment label to 积极 / 消极 / 中性 by keyword."""
            if pd.isna(sentiment) or sentiment == '':
                return '中性'
            sentiment = str(sentiment).lower()
            if any(keyword in sentiment for keyword in ['积极', '正面', 'positive']):
                return '积极'
            elif any(keyword in sentiment for keyword in ['消极', '负面', 'negative']):
                return '消极'
            else:
                return '中性'
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
    # 4. Make sure the output directory exists; exist_ok avoids a check-then-create
    # race, and the guard skips makedirs('') for a bare file name
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Save as CSV (utf-8-sig encoding)
    print("正在保存清洗后的数据...")
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"数据已成功保存到: {output_file}")
    print(f"保存了 {len(df)} 行清洗后的数据")
    print()
    print("========================================")
    print("  数据清洗任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    # Print the traceback as the sibling scripts do, so failures can be located
    traceback.print_exc()
--- a/project/data_cleaner_v2.py
+++ b/project/data_cleaner_v2.py
@ -0,0 +1,98 @@
 import os
 import pandas as pd
 # sys is imported here because this script's import header does not include it;
 # sys.exit replaces the interactive-only exit() helper.
 import sys
 # Input and output file paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.csv'
 print("========================================")
 print("  Python 数据清洗脚本 v2")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Abort early when the input workbook is missing
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    print(f"检查路径: {input_file}")
    sys.exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 print(f"文件存在: {os.path.exists(input_file)}")
 # 读取Excel文件
 try:
    print("正在读取Excel文件...")
    # 尝试读取前10行数据
    df = pd.read_excel(input_file, nrows=10)
    print(f"成功读取 {len(df)} 行示例数据")
    print(f"列名: {list(df.columns)}")
    # 读取全部数据
    print("正在读取全部数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行完整数据")
    # Cleaning
    print("正在清洗数据...")
    # 1. Report missing values per column, then replace them with empty strings
    print("清洗前 - 缺失值统计:")
    print(df.isnull().sum())
    df = df.fillna('')
    # 2. Trim text columns and collapse runs of whitespace into a single space
    for name in df.columns:
        if df[name].dtype != 'object':
            continue
        df[name] = df[name].astype(str).str.strip()
        df[name] = df[name].str.replace('\\s+', ' ', regex=True)
    # 3. 规范化情感倾向
    if '情感倾向' in df.columns:
        def normalize_sentiment(sentiment):
            """Map a free-text sentiment label to 积极, 消极 or 中性 by keyword lookup.

            Missing or empty values, and text without a known keyword, give 中性.
            Positive keywords are checked before negative ones.
            """
            keyword_table = (
                ('积极', ['积极', '正面', 'positive']),
                ('消极', ['消极', '负面', 'negative']),
            )
            if not (pd.isna(sentiment) or sentiment == ''):
                text = str(sentiment).lower()
                for label, keywords in keyword_table:
                    if any(keyword in text for keyword in keywords):
                        return label
            return '中性'
        df['情感倾向'] = df['情感倾向'].apply(normalize_sentiment)
        print("情感倾向规范化完成")
    # 4. Make sure the output directory exists
    output_dir = os.path.dirname(output_file)
    print(f"输出目录: {output_dir}")
    print(f"目录存在: {os.path.exists(output_dir)}")
    if not os.path.exists(output_dir):
        print("正在创建输出目录...")
        os.makedirs(output_dir)
    # Save as CSV (utf-8-sig encoding)
    print("正在保存清洗后的数据...")
    print(f"保存路径: {output_file}")
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    # Confirm the file is on disk and report its size and row count
    if not os.path.exists(output_file):
        print("错误: 文件保存失败，未找到输出文件")
    else:
        saved_kb = os.path.getsize(output_file) / 1024
        print(f"数据已成功保存到: {output_file}")
        print(f"保存文件大小: {saved_kb:.2f} KB")
        print(f"保存了 {len(df)} 行清洗后的数据")
    print()
    footer_rule = "========================================"
    print(footer_rule)
    print("  数据清洗任务完成")
    print(footer_rule)
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/debug_log.txt
+++ b/project/debug_log.txt
@ -0,0 +1,11 @@
 开始调试...
 当前目录: D:\java\project
 pandas导入成功
 输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx
 文件存在: True
 文件大小: 21607.43 KB
 开始读取...
 读取成功: 30308 行
 列数: 68
 前5列: ['作者', '作者链接', '标题', '内容', 'tag']
 调试结束
--- a/project/debug_process.py
+++ b/project/debug_process.py
@ -0,0 +1,36 @@
 import os
 import sys
 # Redirect stdout into a log file while the probe runs.
 # The log file is owned by a context manager and stdout is restored in a
 # finally block, so the console works again and the log is closed even when
 # something other than Exception (e.g. KeyboardInterrupt) escapes the probe.
 original_stdout = sys.stdout
 with open(r'D:\java\project\debug_log.txt', 'w', encoding='utf-8') as log_file:
    sys.stdout = log_file
    try:
        print("开始调试...")
        print(f"当前目录: {os.getcwd()}")
        try:
            import pandas as pd
            print("pandas导入成功")
            input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
            print(f"输入文件: {input_file}")
            print(f"文件存在: {os.path.exists(input_file)}")
            if os.path.exists(input_file):
                print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
                print("开始读取...")
                df = pd.read_excel(input_file, engine='openpyxl')
                print(f"读取成功: {len(df)} 行")
                print(f"列数: {len(df.columns)}")
                print(f"前5列: {list(df.columns)[:5]}")
        except Exception as e:
            print(f"错误: {e}")
            import traceback
            # print_exc() writes to stderr by default, which is not redirected;
            # send the stack trace to the log so the failure is recorded there.
            traceback.print_exc(file=log_file)
        print("调试结束")
    finally:
        sys.stdout = original_stdout
 # Printed after stdout is restored, so this line goes to the console.
 print("日志已保存")
--- a/project/debug_script.py
+++ b/project/debug_script.py
@ -0,0 +1,51 @@
 import os
 import sys
 # Environment probe: prints interpreter info, checks that pandas and openpyxl
 # import, then tries to read the first rows of the input workbook.
 print("========================================")
 print("  调试脚本")
 print("========================================")
 print(f"Python版本: {sys.version}")
 print(f"当前目录: {os.getcwd()}")
 print()
 # Check pandas
 print("检查pandas...")
 try:
    import pandas as pd
    print(f"pandas版本: {pd.__version__}")
 except ImportError as e:
    print(f"pandas未安装: {e}")
    # sys.exit rather than the builtin exit(): exit() is injected by the site
    # module for interactive use and is absent when Python runs with -S.
    sys.exit(1)
 # Check openpyxl
 print("\n检查openpyxl...")
 try:
    import openpyxl
    print(f"openpyxl版本: {openpyxl.__version__}")
 except ImportError as e:
    print(f"openpyxl未安装: {e}")
    sys.exit(1)
 # Check the input file
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 print(f"\n检查输入文件:")
 print(f"路径: {input_file}")
 print(f"存在: {os.path.exists(input_file)}")
 if os.path.exists(input_file):
    print(f"大小: {os.path.getsize(input_file) / 1024:.2f} KB")
    # Try a small read
    print("\n尝试读取文件...")
    try:
        df = pd.read_excel(input_file, nrows=5)  # first 5 rows only
        print(f"成功读取 {len(df)} 行")
        print(f"列名: {list(df.columns)}")
    except Exception as e:
        print(f"读取失败: {e}")
        import traceback
        traceback.print_exc()
 print()
 print("========================================")
 print("  调试完成")
 print("========================================")
--- a/project/import_data.py
+++ b/project/import_data.py
@ -0,0 +1,50 @@
 import os
 import pandas as pd
 # File paths. Input and output point at the same workbook: the script reads it
 # with pandas and writes the resulting frame back over it.
 # NOTE(review): read_excel loads only the first sheet by default, so any other
 # sheets or cell formatting in the workbook are not carried over - confirm this
 # is intended.
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  数据导入操作")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Check that the input file exists
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    # SystemExit is raised directly; the builtin exit() comes from the site
    # module and is not guaranteed to exist in every interpreter start-up mode.
    raise SystemExit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Read the data
 try:
    print("正在读取数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    print(f"数据类型:")
    print(df.dtypes)
    print("\n前5行数据:")
    print(df.head())
    # Write back to the same file.
    # The frame is first written to a sibling temp workbook and only then moved
    # over the target, so a failed or interrupted write cannot leave the only
    # copy of the data truncated. The temp name keeps the .xlsx suffix because
    # to_excel picks its writer from the file extension.
    print("\n写入数据到目标文件...")
    tmp_file = os.path.splitext(output_file)[0] + '.tmp.xlsx'
    try:
        df.to_excel(tmp_file, index=False)
        os.replace(tmp_file, output_file)
    finally:
        # Only still present when the write or the replace failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    print(f"数据已成功导入到: {output_file}")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print()
    print("========================================")
    print("  数据导入完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/minimal_test.py
+++ b/project/minimal_test.py
@ -0,0 +1,17 @@
 import os
 # Smoke test: report whether the input workbook exists and, if it does, try
 # to read its first ten rows with pandas.
 print("测试开始")
 print(f"当前目录: {os.getcwd()}")
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 file_found = os.path.exists(input_file)
 print(f"文件存在: {file_found}")
 if file_found:
    size_kb = os.path.getsize(input_file) / 1024
    print(f"文件大小: {size_kb:.2f} KB")
    print("尝试读取...")
    try:
        # pandas is imported here so that a missing install is reported by
        # the except branch below instead of aborting the script.
        import pandas as pd
        df = pd.read_excel(input_file, nrows=10)
        row_count = len(df)
        print(f"成功读取 {row_count} 行")
        print("测试完成")
    except Exception as e:
        print(f"错误: {e}")
--- a/project/populate_regression_data.py
+++ b/project/populate_regression_data.py
@ -0,0 +1,113 @@
 import os
 import pandas as pd
 import openpyxl
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  填充UGC回归数据")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Both workbooks must already exist: the output file is opened and filled in
 # place, it is not created here.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    raise SystemExit(1)
 if not os.path.exists(output_file):
    print("错误: 输出文件不存在！")
    raise SystemExit(1)


 def _fill_column(ws, column, values):
    """Write ``values`` down worksheet column ``column``, starting at row 2 (row 1 is the header)."""
    for row, value in enumerate(values, 2):
        ws.cell(row=row, column=column, value=value)


 def _numeric_values(series):
    """Return ``series`` as a list of floats; missing or non-numeric entries become 0."""
    # errors='coerce' keeps one stray text cell from aborting the whole run,
    # which a bare float(value) would do.
    return pd.to_numeric(series, errors='coerce').fillna(0).astype(float).tolist()


 # Read the raw data and fill the regression sheet
 try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Open the output workbook
    print("\n打开输出文件...")
    wb = openpyxl.load_workbook(output_file)
    ws = wb.active
    # One zero per data row, used wherever a column has no source data.
    zeros = [0] * len(df)
    print("\n填充数据...")
    # Y column (helpfull) -> worksheet column 1
    print("1. 填充Y列 (helpfull)")
    if 'helpfull' in df.columns:
        _fill_column(ws, 1, _numeric_values(df['helpfull']))
        print(f"成功填充 Y 列，共 {len(df)} 行")
    else:
        print("警告: 未找到 helpfull 列，使用默认值 0")
        _fill_column(ws, 1, zeros)
    # X1 column (comment count) -> worksheet column 2.
    # The first column whose label contains '评论' is used; labels are passed
    # through str() because Excel headers may be read back as non-strings.
    print("\n2. 填充X1列 (评论总数)")
    comment_columns = [col for col in df.columns if '评论' in str(col)]
    if comment_columns:
        _fill_column(ws, 2, _numeric_values(df[comment_columns[0]]))
        print(f"成功填充 X1 列，使用列: {comment_columns[0]}")
    else:
        print("警告: 未找到评论列，使用默认值 0")
        _fill_column(ws, 2, zeros)
    # X2-X6 are written as zero placeholders into worksheet columns 3-7.
    print("\n3. 计算X2-X6")
    placeholder_columns = [
        (3, 'X2', '评论长度'),
        (4, 'X3', '评论复杂度'),
        (5, 'X4', '评论可读性'),
        (6, 'X5', '内容情感性'),
        (7, 'X6', '信息丰富度'),
    ]
    for column, name, label in placeholder_columns:
        print(f"   - 填充{name} ({label})")
        _fill_column(ws, column, zeros)
    # Save the workbook
    print("\n4. 保存文件")
    wb.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"总行数: {len(df) + 1} (包括表头)")
    print(f"总列数: 7")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/process_300_rows.py
+++ b/project/process_300_rows.py
@ -0,0 +1,156 @@
 import os
 import pandas as pd
 import re
 # Banner
 banner = "=" * 60
 print(banner)
 print("  处理前300行数据作为测试")
 print(banner)
 # Workbook locations
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归_300.xlsx'
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Load only the first 300 data rows
 print("读取前300行数据...")
 df = pd.read_excel(input_file, nrows=300, engine='openpyxl')
 print(f"成功读取 {len(df)} 行数据")
 print(f"原始列数: {len(df.columns)}")
 # Classify the columns by their header text. When several headers match the
 # Y or X1 rule, the last one seen wins.
 print("\n识别列...")
 helpfull_col = None
 comment_count_col = None
 comment_cols = []
 for col in df.columns:
    label = str(col)
    # 'helpful' is a substring of 'helpfull', so one test covers both spellings.
    if 'helpful' in label.lower():
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
        continue
    # Likewise '评论总数' is contained in '帖子评论总数'.
    if '评论总数' in label:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
        continue
    # Comment text columns: header mentions 评论 and 内容 plus a digit 1-5.
    if '评论' in label and '内容' in label and any(digit in label for digit in '12345'):
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
 print(f"\n共找到 {len(comment_cols)} 个评论内容列")
 # Add the regression columns
 print("\n添加回归数据列...")
 # Y (UGC helpfulness): numeric copy of the helpful column, 0 where it is
 # missing, non-numeric, or the column was not found.
 print("1. 添加 Y (UGC有用性)")
 df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
 # X1 (comment count): same treatment for the comment-total column.
 print("2. 添加 X1 (评论数量)")
 df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
 # Per-comment metric function
 def calculate_comment_metrics(content):
    """Score a single comment.

    Args:
        content: The comment cell value. Missing values (anything ``pd.isna``
            accepts) and the strings ``'None'``, ``'nan'`` and ``''`` are
            treated as "no comment".

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count after removing ASCII and full-width
        spaces; complexity is the number of whitespace-separated tokens;
        sentiment is 1, -1 or 0 from keyword lookup; richness counts how many
        of digit / link / emoji-or-emoticon appear (0-3). "No comment" yields
        ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)
    # Several negative keywords contain a positive one ('不好' has '好',
    # '不满意' has '满意', 'dislike' has 'like'). Blank the negative phrases out
    # before the positive lookup so they are not scored as positive. A space is
    # substituted so the neighbouring text cannot join into a new keyword.
    residual = lower_content
    for word in negative_words:
        residual = residual.replace(word, ' ')
    # A genuinely positive keyword still takes precedence over a negative one.
    if any(word in residual for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
 # Compute the per-row comment metrics (X2, X3, X5, X6 and the derived X4)
 print("3. 计算评论相关指标...")
 # Each comment column is pulled out once as a plain list and the results are
 # collected in lists, instead of materialising df.iloc[i] for every cell and
 # writing through df.loc[i] (which also assumed a 0..n-1 index).
 comment_values = [df[col].tolist() for col in comment_cols]
 x2_values, x3_values, x4_values, x5_values, x6_values = [], [], [], [], []
 for i in range(len(df)):
    scored = [calculate_comment_metrics(values[i]) for values in comment_values]
    # Only comments with a non-zero length take part in the averages.
    scored = [m for m in scored if m[0] > 0]
    count = len(scored)
    if count:
        x2 = sum(m[0] for m in scored) / count
        x3 = sum(m[1] for m in scored) / count
        x5 = sum(m[2] for m in scored) / count
        x6 = sum(m[3] for m in scored) / count
    else:
        x2 = x3 = x5 = x6 = 0.0
    x2_values.append(x2)
    x3_values.append(x3)
    x5_values.append(x5)
    x6_values.append(x6)
    # X4 (readability) = X2 / X3, recorded as 0 when X3 is 0.
    x4_values.append(x2 / x3 if x3 > 0 else 0.0)
 # Columns are added in the order X2, X3, X5, X6, X4 so the saved workbook
 # keeps the same column layout as before.
 df['X2'] = pd.Series(x2_values, index=df.index, dtype=float)
 df['X3'] = pd.Series(x3_values, index=df.index, dtype=float)
 df['X5'] = pd.Series(x5_values, index=df.index, dtype=float)
 df['X6'] = pd.Series(x6_values, index=df.index, dtype=float)
 # X4: comment readability
 print("4. 计算 X4 (评论可读性)")
 df['X4'] = pd.Series(x4_values, index=df.index, dtype=float)
 # Data cleaning: force the regression columns numeric, NaN and +/-inf become 0
 print("\n5. 数据清洗...")
 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
 for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
 # Summarise the result
 print("\n6. 验证数据...")
 print(f"总行数: {len(df)}")
 print(f"总列数: {len(df.columns)}")
 print(f"\n回归数据列统计:")
 print(df[regression_cols].describe())
 print(f"\n前5行回归数据:")
 print(df[regression_cols].head())
 # Save the workbook
 print("\n7. 保存文件...")
 df.to_excel(output_file, index=False, engine='openpyxl')
 # Read the saved workbook back and report its shape
 print("\n8. 验证文件...")
 if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
 else:
    print("文件保存失败！")
 print()
 print("=" * 60)
 print("  任务完成")
 print("=" * 60)
--- a/project/process_actual_data.py
+++ b/project/process_actual_data.py
@ -0,0 +1,200 @@
 import os
 import openpyxl
 import re
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  根据实际原始数据计算回归数据")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Check that the input file exists
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    raise SystemExit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

 # Sentiment keyword lists, built once instead of once per comment cell.
 POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
 NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']


 def _to_float(value):
    """Convert a cell value to float; empty or non-numeric cells count as 0."""
    try:
        return float(value) if value else 0
    except (TypeError, ValueError):
        # One stray text cell should not abort the whole run.
        return 0


 def _score_comment(content):
    """Return ``(length, complexity, sentiment, richness)`` for one comment string."""
    # X2: length = characters left after removing spaces
    length = len(content.replace(' ', ''))
    # X3: complexity = number of whitespace-separated tokens
    complexity = len(content.split())
    # X5: sentiment (positive=1, neutral=0, negative=-1).
    # Negative phrases are blanked out before the positive lookup because some
    # contain a positive keyword ('不好' has '好', '不满意' has '满意').
    lower_content = content.lower()
    residual = lower_content
    for word in NEGATIVE_WORDS:
        residual = residual.replace(word, ' ')
    if any(word in residual for word in POSITIVE_WORDS):
        sentiment = 1
    elif any(word in lower_content for word in NEGATIVE_WORDS):
        sentiment = -1
    else:
        sentiment = 0
    # X6: richness, one point each for a digit, a link, an emoji/emoticon
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness


 # Read the raw data
 try:
    print("正在读取原始数据...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active
    print(f"工作表名称: {ws_input.title}")
    print(f"最大行数: {ws_input.max_row}")
    print(f"最大列数: {ws_input.max_column}")
    # Classify the columns from the header row (1-based column numbers)
    print("\n识别列...")
    headers = []
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header:
            header_str = str(header).lower()
            if 'helpfull' in header_str or 'helpful' in header_str:
                helpfull_col = col
                print(f"找到 Y 列 (helpfull): 列 {col}")
            elif '评论总数' in str(header) or '帖子评论总数' in str(header):
                comment_count_col = col
                print(f"找到 X1 列 (评论总数): 列 {col}")
            # NOTE(review): any header containing '评论' and a digit 1-5 is
            # treated as comment text; confirm this does not also pick up
            # non-text columns.
            elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)):
                comment_cols.append(col)
                print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # Open the existing output workbook, or create one with a header row
    if os.path.exists(output_file):
        print("\n打开现有输出文件...")
        wb_output = openpyxl.load_workbook(output_file)
        ws_output = wb_output.active
    else:
        print("\n创建新的输出文件...")
        wb_output = openpyxl.Workbook()
        ws_output = wb_output.active
        headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
        for i, header in enumerate(headers_output, 1):
            ws_output.cell(row=1, column=i, value=header)
    # Compute and fill the data
    print("\n计算并填充数据...")
    total_rows = ws_input.max_row - 1
    print(f"总数据行数: {total_rows}")
    if ws_output.max_row < ws_input.max_row:
        print(f"扩展输出文件行数到 {ws_input.max_row}...")
    for row in range(2, ws_input.max_row + 1):
        if row % 100 == 0:
            print(f"处理到第 {row-1} 行...")
        if row % 1000 == 0:
            print(f"已处理 {row-1} 行，共 {total_rows} 行")
        # Y (UGC helpfulness) and X1 (comment count)
        y_value = _to_float(ws_input.cell(row=row, column=helpfull_col).value) if helpfull_col else 0
        x1_value = _to_float(ws_input.cell(row=row, column=comment_count_col).value) if comment_count_col else 0
        # Score every non-empty comment cell of this row
        scores = []
        for col in comment_cols:
            content = str(ws_input.cell(row=row, column=col).value)
            if content and content != 'None' and content != 'nan':
                scores.append(_score_comment(content))
        count = len(scores)
        # X2, X3, X5, X6: averages over the row's comments, 0 when there are none
        x2_value = sum(s[0] for s in scores) / count if count else 0
        x3_value = sum(s[1] for s in scores) / count if count else 0
        x5_value = sum(s[2] for s in scores) / count if count else 0
        x6_value = sum(s[3] for s in scores) / count if count else 0
        # X4: readability = X2 / X3, recorded as 0 when X3 is 0
        x4_value = x2_value / x3_value if x3_value > 0 else 0
        row_values = (y_value, x1_value, x2_value, x3_value, x4_value, x5_value, x6_value)
        for column, value in enumerate(row_values, 1):
            ws_output.cell(row=row, column=column, value=value)
    # Save the workbook
    print("\n保存文件...")
    wb_output.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print(f"处理完成，共 {total_rows} 行数据")
    # Re-open the saved workbook and show its shape and first rows
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件保存成功！")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"输出文件行数: {ws_check.max_row - 1}")
        print(f"输出文件列数: {ws_check.max_column}")
        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = []
            for col in range(1, ws_check.max_column + 1):
                value = ws_check.cell(row=row, column=col).value
                row_data.append(value)
            print(f"行 {row}: {row_data}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/process_all_data.py
+++ b/project/process_all_data.py
@ -0,0 +1,190 @@
 import os
 import openpyxl
 import re
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  处理所有数据")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Check that the input file exists
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    raise SystemExit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")

 # Sentiment keyword lists, built once instead of once per comment cell.
 POSITIVE_WORDS = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
 NEGATIVE_WORDS = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']


 def _to_float(value):
    """Convert a cell value to float; empty or non-numeric cells count as 0."""
    try:
        return float(value) if value else 0
    except (TypeError, ValueError):
        # One stray text cell should not abort the whole run.
        return 0


 def _score_comment(content):
    """Return ``(length, complexity, sentiment, richness)`` for one comment string."""
    # X2: length = characters left after removing spaces
    length = len(content.replace(' ', ''))
    # X3: complexity = number of whitespace-separated tokens
    complexity = len(content.split())
    # X5: sentiment (positive=1, neutral=0, negative=-1).
    # Negative phrases are blanked out before the positive lookup because some
    # contain a positive keyword ('不好' has '好', '不满意' has '满意').
    lower_content = content.lower()
    residual = lower_content
    for word in NEGATIVE_WORDS:
        residual = residual.replace(word, ' ')
    if any(word in residual for word in POSITIVE_WORDS):
        sentiment = 1
    elif any(word in lower_content for word in NEGATIVE_WORDS):
        sentiment = -1
    else:
        sentiment = 0
    # X6: richness, one point each for a digit, a link, an emoji/emoticon
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness


 # Read the raw data
 try:
    print("正在读取原始数据...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active
    print(f"工作表名称: {ws_input.title}")
    print(f"最大行数: {ws_input.max_row}")
    print(f"最大列数: {ws_input.max_column}")
    # Classify the columns from the header row (1-based column numbers)
    print("\n识别列...")
    headers = []
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header:
            header_str = str(header).lower()
            if 'helpfull' in header_str or 'helpful' in header_str:
                helpfull_col = col
                print(f"找到 Y 列 (helpfull): 列 {col}")
            elif '评论总数' in str(header) or '帖子评论总数' in str(header):
                comment_count_col = col
                print(f"找到 X1 列 (评论总数): 列 {col}")
            # NOTE(review): any header containing '评论' and a digit 1-5 is
            # treated as comment text; confirm this does not also pick up
            # non-text columns.
            elif '评论' in str(header) and any(str(i) in str(header) for i in range(1, 6)):
                comment_cols.append(col)
                print(f"找到评论列 {len(comment_cols)}: 列 {col} - {header}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # Always start from a fresh output workbook with a header row
    print("\n创建新的输出文件...")
    wb_output = openpyxl.Workbook()
    ws_output = wb_output.active
    headers_output = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers_output, 1):
        ws_output.cell(row=1, column=i, value=header)
    # Compute and fill the data
    print("\n计算并填充数据...")
    total_rows = ws_input.max_row - 1
    print(f"总数据行数: {total_rows}")
    for row in range(2, ws_input.max_row + 1):
        if row % 1000 == 0:
            print(f"处理到第 {row-1} 行...")
        # Y (UGC helpfulness) and X1 (comment count)
        y_value = _to_float(ws_input.cell(row=row, column=helpfull_col).value) if helpfull_col else 0
        x1_value = _to_float(ws_input.cell(row=row, column=comment_count_col).value) if comment_count_col else 0
        # Score every non-empty comment cell of this row
        scores = []
        for col in comment_cols:
            content = str(ws_input.cell(row=row, column=col).value)
            if content and content != 'None' and content != 'nan':
                scores.append(_score_comment(content))
        count = len(scores)
        # X2, X3, X5, X6: averages over the row's comments, 0 when there are none
        x2_value = sum(s[0] for s in scores) / count if count else 0
        x3_value = sum(s[1] for s in scores) / count if count else 0
        x5_value = sum(s[2] for s in scores) / count if count else 0
        x6_value = sum(s[3] for s in scores) / count if count else 0
        # X4: readability = X2 / X3, recorded as 0 when X3 is 0
        x4_value = x2_value / x3_value if x3_value > 0 else 0
        row_values = (y_value, x1_value, x2_value, x3_value, x4_value, x5_value, x6_value)
        for column, value in enumerate(row_values, 1):
            ws_output.cell(row=row, column=column, value=value)
    # Save the workbook
    print("\n保存文件...")
    wb_output.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print(f"处理完成，共 {total_rows} 行数据")
    # Re-open the saved workbook and show its shape and first rows
    print("\n验证文件...")
    if os.path.exists(output_file):
        print("文件保存成功！")
        wb_check = openpyxl.load_workbook(output_file)
        ws_check = wb_check.active
        print(f"输出文件行数: {ws_check.max_row - 1}")
        print(f"输出文件列数: {ws_check.max_column}")
        print("\n前5行数据:")
        for row in range(1, min(6, ws_check.max_row + 1)):
            row_data = []
            for col in range(1, ws_check.max_column + 1):
                value = ws_check.cell(row=row, column=col).value
                row_data.append(value)
            print(f"行 {row}: {row_data}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/process_all_rows.py
+++ b/project/process_all_rows.py
@ -0,0 +1,157 @@
 import os
 import pandas as pd
 import re
 # Banner
 banner = "=" * 60
 print(banner)
 print("  处理全部数据")
 print(banner)
 # Workbook locations
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Load the whole sheet
 print("读取全部数据...")
 df = pd.read_excel(input_file, engine='openpyxl')
 print(f"成功读取 {len(df)} 行数据")
 print(f"原始列数: {len(df.columns)}")
 # Classify the columns by their header text. When several headers match the
 # Y or X1 rule, the last one seen wins.
 print("\n识别列...")
 helpfull_col = None
 comment_count_col = None
 comment_cols = []
 for col in df.columns:
    label = str(col)
    # 'helpful' is a substring of 'helpfull', so one test covers both spellings.
    if 'helpful' in label.lower():
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
        continue
    # Likewise '评论总数' is contained in '帖子评论总数'.
    if '评论总数' in label:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
        continue
    # Comment text columns: header mentions 评论 and 内容 plus a digit 1-5.
    if '评论' in label and '内容' in label and any(digit in label for digit in '12345'):
        comment_cols.append(col)
 print(f"\n共找到 {len(comment_cols)} 个评论内容列")
 # Add the regression columns
 print("\n添加回归数据列...")
 # Y (UGC helpfulness): numeric copy of the helpful column, 0 where it is
 # missing, non-numeric, or the column was not found.
 print("1. 添加 Y (UGC有用性)")
 df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col else 0
 # X1 (comment count): same treatment for the comment-total column.
 print("2. 添加 X1 (评论数量)")
 df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col else 0
 # Per-comment metric function
 def calculate_comment_metrics(content):
    """Score a single comment.

    Args:
        content: The comment cell value. Missing values (anything ``pd.isna``
            accepts) and the strings ``'None'``, ``'nan'`` and ``''`` are
            treated as "no comment".

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count after removing ASCII and full-width
        spaces; complexity is the number of whitespace-separated tokens;
        sentiment is 1, -1 or 0 from keyword lookup; richness counts how many
        of digit / link / emoji-or-emoticon appear (0-3). "No comment" yields
        ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)
    # Several negative keywords contain a positive one ('不好' has '好',
    # '不满意' has '满意', 'dislike' has 'like'). Blank the negative phrases out
    # before the positive lookup so they are not scored as positive. A space is
    # substituted so the neighbouring text cannot join into a new keyword.
    residual = lower_content
    for word in negative_words:
        residual = residual.replace(word, ' ')
    # A genuinely positive keyword still takes precedence over a negative one.
    if any(word in residual for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
 # Compute the per-row comment metrics (X2, X3, X5, X6 and the derived X4)
 print("3. 计算评论相关指标...")
 print(f"总数据行数: {len(df)}")
 # Each comment column is pulled out once as a plain list and the results are
 # collected in lists, instead of materialising df.iloc[i] for every cell and
 # writing through df.loc[i] (which also assumed a 0..n-1 index).
 comment_values = [df[col].tolist() for col in comment_cols]
 x2_values, x3_values, x4_values, x5_values, x6_values = [], [], [], [], []
 for i in range(len(df)):
    if i % 1000 == 0:
        print(f"  处理第 {i}/{len(df)} 行...")
    scored = [calculate_comment_metrics(values[i]) for values in comment_values]
    # Only comments with a non-zero length take part in the averages.
    scored = [m for m in scored if m[0] > 0]
    count = len(scored)
    if count:
        x2 = sum(m[0] for m in scored) / count
        x3 = sum(m[1] for m in scored) / count
        x5 = sum(m[2] for m in scored) / count
        x6 = sum(m[3] for m in scored) / count
    else:
        x2 = x3 = x5 = x6 = 0.0
    x2_values.append(x2)
    x3_values.append(x3)
    x5_values.append(x5)
    x6_values.append(x6)
    # X4 (readability) = X2 / X3, recorded as 0 when X3 is 0.
    x4_values.append(x2 / x3 if x3 > 0 else 0.0)
 # Columns are added in the order X2, X3, X5, X6, X4 so the saved workbook
 # keeps the same column layout as before.
 df['X2'] = pd.Series(x2_values, index=df.index, dtype=float)
 df['X3'] = pd.Series(x3_values, index=df.index, dtype=float)
 df['X5'] = pd.Series(x5_values, index=df.index, dtype=float)
 df['X6'] = pd.Series(x6_values, index=df.index, dtype=float)
 # X4: comment readability
 print("4. 计算 X4 (评论可读性)")
 df['X4'] = pd.Series(x4_values, index=df.index, dtype=float)
 # Data cleaning: force the regression columns numeric, NaN and +/-inf become 0
 print("\n5. 数据清洗...")
 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
 for col in regression_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
 # Summarise the result
 print("\n6. 验证数据...")
 print(f"总行数: {len(df)}")
 print(f"总列数: {len(df.columns)}")
 print(f"\n回归数据列统计:")
 print(df[regression_cols].describe())
 # Save the workbook
 print("\n7. 保存文件...")
 df.to_excel(output_file, index=False, engine='openpyxl')
 # Read the saved workbook back and report its shape
 print("\n8. 验证文件...")
 if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
 else:
    print("文件保存失败！")
 print()
 print("=" * 60)
 print("  任务完成")
 print("=" * 60)
--- a/project/process_efficient.py
+++ b/project/process_efficient.py
@ -0,0 +1,180 @@
 import os
 import pandas as pd
 import re
 # Banner
 banner = "=" * 60
 print(banner)
 print("  高效处理全部数据")
 print(banner)
 # Workbook locations
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Read just the header row (nrows=0) so the columns can be classified first
 print("1. 读取表头...")
 df_header = pd.read_excel(input_file, nrows=0, engine='openpyxl')
 print(f"总列数: {len(df_header.columns)}")
 # Classify the columns by their header text. When several headers match the
 # Y or X1 rule, the last one seen wins.
 helpfull_col = None
 comment_count_col = None
 comment_cols = []
 for col in df_header.columns:
    label = str(col)
    # 'helpful' is a substring of 'helpfull', so one test covers both spellings.
    if 'helpful' in label.lower():
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
        continue
    # Likewise '评论总数' is contained in '帖子评论总数'.
    if '评论总数' in label:
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
        continue
    # Comment text columns: header mentions 评论 and 内容 plus a digit 1-5.
    if '评论' in label and '内容' in label and any(digit in label for digit in '12345'):
        comment_cols.append(col)
 print(f"共找到 {len(comment_cols)} 个评论内容列")
 # Per-comment metric function
 def calculate_comment_metrics(content):
    """Score a single comment.

    Args:
        content: The comment cell value. Missing values (anything ``pd.isna``
            accepts) and the strings ``'None'``, ``'nan'`` and ``''`` are
            treated as "no comment".

    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count after removing ASCII and full-width
        spaces; complexity is the number of whitespace-separated tokens;
        sentiment is 1, -1 or 0 from keyword lookup; richness counts how many
        of digit / link / emoji-or-emoticon appear (0-3). "No comment" yields
        ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    length = len(content.replace(' ', '').replace('\u3000', ''))
    complexity = len(content.split())
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)
    # Several negative keywords contain a positive one ('不好' has '好',
    # '不满意' has '满意', 'dislike' has 'like'). Blank the negative phrases out
    # before the positive lookup so they are not scored as positive. A space is
    # substituted so the neighbouring text cannot join into a new keyword.
    residual = lower_content
    for word in negative_words:
        residual = residual.replace(word, ' ')
    # A genuinely positive keyword still takes precedence over a negative one.
    if any(word in residual for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0
    richness = 0
    if re.search(r'\d', content):
        richness += 1
    if re.search(r'http[s]?://|www\.', content):
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):
        richness += 1
    return length, complexity, sentiment, richness
 # Process the workbook in batches of batch_size data rows
 print("\n2. 分批处理数据...")
 batch_size = 5000
 batch_num = 0
 all_data = []
 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
 while True:
    # Batch 0 reads the header plus the first batch_size rows. Later batches
    # skip the header line and every row already consumed, and are read with
    # header=None so their first row is treated as data.
    skip_rows = batch_num * batch_size + 1 if batch_num > 0 else 0
    nrows = batch_size
    print(f"  处理批次 {batch_num + 1} (跳过 {skip_rows} 行，读取 {nrows} 行)...")
    try:
        if batch_num == 0:
            df_batch = pd.read_excel(input_file, engine='openpyxl', nrows=nrows)
        else:
            df_batch = pd.read_excel(input_file, engine='openpyxl', skiprows=skip_rows, nrows=nrows, header=None)
            # Relabel only when rows came back: an empty frame has no columns,
            # and assigning the header to it would raise a length-mismatch
            # error that used to double as the end-of-data signal.
            if len(df_batch) > 0:
                df_batch.columns = df_header.columns
    except Exception as e:
        print(f"  读取完成或出错: {e}")
        break
    if len(df_batch) == 0:
        print("  没有更多数据")
        break
    print(f"  读取了 {len(df_batch)} 行")
    # Y and X1: numeric copies of the source columns, 0 where missing
    if helpfull_col:
        df_batch['Y'] = pd.to_numeric(df_batch[helpfull_col], errors='coerce').fillna(0)
    else:
        df_batch['Y'] = 0
    if comment_count_col:
        df_batch['X1'] = pd.to_numeric(df_batch[comment_count_col], errors='coerce').fillna(0)
    else:
        df_batch['X1'] = 0
    # Comment metrics: each comment column is pulled out once as a list and
    # the results are collected in lists, instead of materialising
    # df_batch.iloc[i] for every cell and writing through df_batch.loc[i].
    comment_values = [df_batch[col].tolist() for col in comment_cols]
    x2_values, x3_values, x4_values, x5_values, x6_values = [], [], [], [], []
    for i in range(len(df_batch)):
        scored = [calculate_comment_metrics(values[i]) for values in comment_values]
        # Only comments with a non-zero length take part in the averages.
        scored = [m for m in scored if m[0] > 0]
        count = len(scored)
        if count:
            x2 = sum(m[0] for m in scored) / count
            x3 = sum(m[1] for m in scored) / count
            x5 = sum(m[2] for m in scored) / count
            x6 = sum(m[3] for m in scored) / count
        else:
            x2 = x3 = x5 = x6 = 0.0
        x2_values.append(x2)
        x3_values.append(x3)
        x5_values.append(x5)
        x6_values.append(x6)
        # X4 (readability) = X2 / X3, recorded as 0 when X3 is 0.
        x4_values.append(x2 / x3 if x3 > 0 else 0.0)
    # Columns are added in the order X2, X3, X5, X6, X4 so the saved workbook
    # keeps the same column layout as before.
    df_batch['X2'] = pd.Series(x2_values, index=df_batch.index, dtype=float)
    df_batch['X3'] = pd.Series(x3_values, index=df_batch.index, dtype=float)
    df_batch['X5'] = pd.Series(x5_values, index=df_batch.index, dtype=float)
    df_batch['X6'] = pd.Series(x6_values, index=df_batch.index, dtype=float)
    df_batch['X4'] = pd.Series(x4_values, index=df_batch.index, dtype=float)
    # Data cleaning: force the regression columns numeric, NaN and +/-inf become 0
    for col in regression_cols:
        df_batch[col] = pd.to_numeric(df_batch[col], errors='coerce').fillna(0)
        df_batch[col] = df_batch[col].replace([float('inf'), float('-inf')], 0)
    all_data.append(df_batch)
    batch_num += 1
    print(f"  批次 {batch_num} 完成，当前总行数: {sum(len(d) for d in all_data)}")
    # A short batch means the end of the sheet was reached; stopping here
    # avoids opening the workbook once more just to find nothing left.
    if len(df_batch) < batch_size:
        print("  没有更多数据")
        break
 # Merge all batches
 print("\n3. 合并数据...")
 if not all_data:
    # pd.concat raises on an empty list; report the situation instead.
    print("错误: 未读取到任何数据，无法生成输出文件")
    raise SystemExit(1)
 df_final = pd.concat(all_data, ignore_index=True)
 print(f"合并后总行数: {len(df_final)}")
 # Summarise the result
 print("\n4. 验证数据...")
 print(f"总列数: {len(df_final.columns)}")
 print(f"\n回归数据列统计:")
 print(df_final[regression_cols].describe())
 # Save the workbook
 print("\n5. 保存文件...")
 df_final.to_excel(output_file, index=False, engine='openpyxl')
 # Read the saved workbook back and report its shape
 print("\n6. 验证文件...")
 if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
 else:
    print("文件保存失败！")
 print()
 print("=" * 60)
 print("  任务完成")
 print("=" * 60)
--- a/project/process_large_file.py
+++ b/project/process_large_file.py
@ -0,0 +1,177 @@
 import os
 import pandas as pd
 import re
 # 文件路径
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  处理大型Excel文件")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # 检查文件是否存在
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    exit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取原始数据
 try:
    print("正在读取原始数据...")
    # 使用pandas读取Excel文件，设置引擎为openpyxl
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # 识别列
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # 创建回归数据
    print("\n创建回归数据...")
    regression_data = pd.DataFrame()
    # Y (UGC有用性)
    print("1. 计算 Y (UGC有用性)")
    if helpfull_col:
        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        regression_data['Y'] = 0
    # X1 (评论数量)
    print("2. 计算 X1 (评论数量)")
    if comment_count_col:
        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        regression_data['X1'] = 0
    # 定义函数计算评论指标
    def calculate_comment_metrics(content):
        """Compute length, complexity, sentiment and richness for one comment cell.

        Args:
            content: Raw cell value; NaN/None and the strings 'None' and 'nan'
                are treated as "no comment".

        Returns:
            Tuple (length, complexity, sentiment, richness).  A missing
            comment yields (0, 0, 0, 0).
        """
        if pd.isna(content) or str(content) in ['None', 'nan']:
            return 0, 0, 0, 0
        content = str(content)
        # Comment length: character count with ASCII spaces removed
        length = len(content.replace(' ', ''))
        # Comment complexity: number of whitespace-separated tokens
        complexity = len(content.split())
        # Sentiment: keyword matching (positive=1, neutral=0, negative=-1)
        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
        lower_content = content.lower()
        has_negative = any(word in lower_content for word in negative_words)
        # Some negative phrases embed a positive keyword ('不好' contains '好',
        # '不满意' contains '满意').  Blank the negative phrases out before the
        # positive scan so they are not misread as praise; a comment that still
        # holds a genuine positive keyword keeps scoring 1, as before.
        remainder = lower_content
        for word in negative_words:
            remainder = remainder.replace(word, ' ')
        if any(word in remainder for word in positive_words):
            sentiment = 1
        elif has_negative:
            sentiment = -1
        else:
            sentiment = 0
        # Information richness: one point each for a digit, a link, an emoji/emoticon
        richness = 0
        if re.search(r'\d', content):
            richness += 1
        if re.search(r'http[s]?://', content):
            richness += 1
        if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
            richness += 1
        return length, complexity, sentiment, richness
    # 计算评论相关指标
    print("3. 计算评论相关指标...")
    # 初始化列
    regression_data['X2'] = 0  # 评论长度
    regression_data['X3'] = 0  # 评论复杂度
    regression_data['X5'] = 0  # 情感性
    regression_data['X6'] = 0  # 信息丰富度
    # 逐行计算
    total_rows = len(df)
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f"处理到第 {i} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # 计算平均值
        if lengths:
            regression_data.loc[i, 'X2'] = sum(lengths) / len(lengths)
            regression_data.loc[i, 'X3'] = sum(complexities) / len(complexities)
            regression_data.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            regression_data.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: 评论可读性
    print("4. 计算 X4 (评论可读性)")
    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # 数据清洗
    print("\n5. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    # 验证数据
    print("\n6. 验证数据...")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"\n前5行数据:")
    print(regression_data.head())
    # 保存文件
    print("\n7. 保存文件...")
    regression_data.to_excel(output_file, index=False)
    # 验证文件
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        # 重新读取检查
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/process_log.txt
+++ b/project/process_log.txt
@ -0,0 +1,9 @@
 ========================================
  在原表中添加回归数据列
 ========================================
 输入文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx
 输出文件: D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx
 输入文件大小: 21607.43 KB
 正在读取原始数据...
--- a/project/process_regression_final.py
+++ b/project/process_regression_final.py
@ -0,0 +1,192 @@
 import os
 import pandas as pd
 import re
 # 文件路径
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print("========================================")
 print("  在原表中添加回归数据列")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # 检查文件是否存在
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    exit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取原始数据
 try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
    # 识别列
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论内容列")
    # 添加回归数据列
    print("\n添加回归数据列...")
    # Y (UGC有用性) - 直接复制helpfull列
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0
    # X1 (评论数量) - 直接复制帖子评论总数列
    print("2. 添加 X1 (评论数量)")
    if comment_count_col:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0
    # 定义函数计算评论指标
    def calculate_comment_metrics(content):
        """Compute length, complexity, sentiment and richness for one comment cell.

        Args:
            content: Raw cell value; NaN/None and the strings 'None', 'nan'
                and '' are treated as "no comment".

        Returns:
            Tuple (length, complexity, sentiment, richness).  A missing
            comment yields (0, 0, 0, 0).
        """
        if pd.isna(content) or str(content) in ['None', 'nan', '']:
            return 0, 0, 0, 0
        content = str(content)
        # X2: character count after removing ASCII and full-width (U+3000) spaces
        length = len(content.replace(' ', '').replace('\u3000', ''))
        # X3: number of whitespace-separated tokens
        complexity = len(content.split())
        # X5: keyword sentiment (positive=1, neutral=0, negative=-1)
        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
        lower_content = content.lower()
        has_negative = any(word in lower_content for word in negative_words)
        # Some negative phrases embed a positive keyword ('不好' contains '好',
        # '不满意' contains '满意', 'dislike' contains 'like').  Blank the negative
        # phrases out before the positive scan so they are not misread as praise;
        # a comment that still holds a genuine positive keyword keeps scoring 1.
        remainder = lower_content
        for word in negative_words:
            remainder = remainder.replace(word, ' ')
        if any(word in remainder for word in positive_words):
            sentiment = 1
        elif has_negative:
            sentiment = -1
        else:
            sentiment = 0
        # X6: information richness (digit / link / emoji, one point each, max 3)
        richness = 0
        if re.search(r'\d', content):  # contains a digit
            richness += 1
        if re.search(r'http[s]?://|www\.', content):  # contains a link
            richness += 1
        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
            richness += 1
        return length, complexity, sentiment, richness
    # 计算评论相关指标
    print("3. 计算评论相关指标...")
    # 初始化列
    df['X2'] = 0.0  # 评论长度
    df['X3'] = 0.0  # 评论复杂度
    df['X5'] = 0.0  # 情感性
    df['X6'] = 0.0  # 信息丰富度
    # 逐行计算
    total_rows = len(df)
    print(f"总数据行数: {total_rows}")
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f"  处理第 {i}/{total_rows} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:  # 只统计有内容的评论
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # 计算平均值（无评论记0）
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: 评论可读性 = X2/X3（X3为0时记0，避免报错）
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # 数据清洗 - 确保所有值都是纯数字，无文本、无空值、无错误
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        # 转换为数字，错误值转为0
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # 替换无穷大
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
    # 验证数据
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())
    # 检查是否有空值或错误值
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f"  {col}: {null_count} 个空值")
    # 保存文件
    print("\n7. 保存文件...")
    print(f"正在保存到: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')
    # 验证文件
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        # 重新读取检查
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/process_with_csv.py
+++ b/project/process_with_csv.py
@ -0,0 +1,202 @@
 import os
 import pandas as pd
 import re
 print("=" * 60)
 print("  使用CSV处理回归数据")
 print("=" * 60)
 # 文件路径
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # 检查文件是否存在
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    exit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取原始数据
 print("\n正在读取原始数据...")
 try:
    df = pd.read_excel(input_file, engine='openpyxl')
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
 except Exception as e:
    print(f"读取失败: {e}")
    exit(1)
 # 识别列
 print("\n识别列...")
 helpfull_col = None
 comment_count_col = None
 comment_cols = []
 for col in df.columns:
    col_str = str(col).lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in str(col) or '帖子评论总数' in str(col):
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
 print(f"\n共找到 {len(comment_cols)} 个评论内容列")
 # 添加回归数据列
 print("\n添加回归数据列...")
 # Y (UGC有用性) - 直接复制helpfull列
 print("1. 添加 Y (UGC有用性)")
 if helpfull_col:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
 else:
    df['Y'] = 0
 # X1 (评论数量) - 直接复制帖子评论总数列
 print("2. 添加 X1 (评论数量)")
 if comment_count_col:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
 else:
    df['X1'] = 0
 # 定义函数计算评论指标
 def calculate_comment_metrics(content):
    """Compute length, complexity, sentiment and richness for one comment cell.

    Args:
        content: Raw cell value; NaN/None and the strings 'None', 'nan'
            and '' are treated as "no comment".

    Returns:
        Tuple (length, complexity, sentiment, richness).  A missing
        comment yields (0, 0, 0, 0).
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: character count after removing ASCII and full-width (U+3000) spaces
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: number of whitespace-separated tokens
    complexity = len(content.split())
    # X5: keyword sentiment (positive=1, neutral=0, negative=-1)
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    lower_content = content.lower()
    has_negative = any(word in lower_content for word in negative_words)
    # Some negative phrases embed a positive keyword ('不好' contains '好',
    # '不满意' contains '满意', 'dislike' contains 'like').  Blank the negative
    # phrases out before the positive scan so they are not misread as praise;
    # a comment that still holds a genuine positive keyword keeps scoring 1.
    remainder = lower_content
    for word in negative_words:
        remainder = remainder.replace(word, ' ')
    if any(word in remainder for word in positive_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0
    # X6: information richness (digit / link / emoji, one point each, max 3)
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
        richness += 1
    return length, complexity, sentiment, richness
 # 计算评论相关指标
 print("3. 计算评论相关指标...")
 # 初始化列
 df['X2'] = 0.0  # 评论长度
 df['X3'] = 0.0  # 评论复杂度
 df['X5'] = 0.0  # 情感性
 df['X6'] = 0.0  # 信息丰富度
 # 逐行计算
 total_rows = len(df)
 print(f"总数据行数: {total_rows}")
 for i in range(total_rows):
    if i % 1000 == 0:
        print(f"  处理第 {i}/{total_rows} 行...")
    lengths = []
    complexities = []
    sentiments = []
    richness = []
    for col in comment_cols:
        content = df.iloc[i].get(col, '')
        length, complexity, sentiment, r = calculate_comment_metrics(content)
        if length > 0:  # 只统计有内容的评论
            lengths.append(length)
            complexities.append(complexity)
            sentiments.append(sentiment)
            richness.append(r)
    # 计算平均值（无评论记0）
    if lengths:
        df.loc[i, 'X2'] = sum(lengths) / len(lengths)
        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
        df.loc[i, 'X6'] = sum(richness) / len(richness)
 # X4: 评论可读性 = X2/X3（X3为0时记0，避免报错）
 print("4. 计算 X4 (评论可读性)")
 df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
 # 数据清洗 - 确保所有值都是纯数字，无文本、无空值、无错误
 print("\n5. 数据清洗...")
 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
 for col in regression_cols:
    # 转换为数字，错误值转为0
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    # 替换无穷大
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
 # 验证数据
 print("\n6. 验证数据...")
 print(f"总行数: {len(df)}")
 print(f"总列数: {len(df.columns)}")
 print(f"\n回归数据列统计:")
 print(df[regression_cols].describe())
 print(f"\n前5行回归数据:")
 print(df[regression_cols].head())
 # 检查是否有空值或错误值
 print(f"\n空值检查:")
 for col in regression_cols:
    null_count = df[col].isnull().sum()
    print(f"  {col}: {null_count} 个空值")
 # 保存为CSV中间文件
 print("\n7. 保存为CSV中间文件...")
 csv_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\temp_regression.csv'
 df.to_csv(csv_file, index=False, encoding='utf-8-sig')
 print(f"CSV文件已保存: {csv_file}")
 print(f"CSV文件大小: {os.path.getsize(csv_file) / 1024:.2f} KB")
 # 从CSV读取并保存为Excel
 print("\n8. 转换为Excel文件...")
 df_csv = pd.read_csv(csv_file, encoding='utf-8-sig')
 df_csv.to_excel(output_file, index=False, engine='openpyxl')
 # 验证文件
 print("\n9. 验证文件...")
 if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    # 重新读取检查
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
    print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    # 删除临时CSV文件
    os.remove(csv_file)
    print(f"\n临时CSV文件已删除")
 else:
    print("文件保存失败！")
 print()
 print("=" * 60)
 print("  任务完成")
 print("=" * 60)
 print(f"新文件已保存: {output_file}")
 print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
--- a/project/process_with_pandas.py
+++ b/project/process_with_pandas.py
@ -0,0 +1,168 @@
 import os
 import pandas as pd
 import re
 # 文件路径
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  使用pandas处理所有数据")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # 检查文件是否存在
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    exit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取原始数据
 try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # 识别列
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论列")
    # 创建回归数据
    print("\n创建回归数据...")
    regression_data = pd.DataFrame()
    # Y (UGC有用性)
    print("1. 计算 Y (UGC有用性)")
    if helpfull_col:
        regression_data['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        regression_data['Y'] = 0
    # X1 (评论数量)
    print("2. 计算 X1 (评论数量)")
    if comment_count_col:
        regression_data['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        regression_data['X1'] = 0
    # 定义函数计算评论指标
    def calculate_comment_metrics(row):
        """Collect per-comment metrics for every comment column of one row.

        Args:
            row: Mapping-like row (anything with ``.get``) holding the comment
                cells; the columns to read come from the enclosing
                ``comment_cols`` list.

        Returns:
            Tuple of four parallel lists (lengths, complexities, sentiments,
            richness) with one entry per non-empty comment cell.
        """
        # Keyword lists do not depend on the cell, so build them once per row.
        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = str(row.get(col, ''))
            # str() turns NaN/None into 'nan'/'None'; those mean "no comment".
            if content and content != 'None' and content != 'nan':
                # Comment length: character count with ASCII spaces removed
                lengths.append(len(content.replace(' ', '')))
                # Comment complexity: number of whitespace-separated tokens
                complexities.append(len(content.split()))
                # Sentiment: keyword matching (positive=1, neutral=0, negative=-1)
                lower_content = content.lower()
                has_negative = any(word in lower_content for word in negative_words)
                # Some negative phrases embed a positive keyword ('不好' contains
                # '好', '不满意' contains '满意').  Blank them out before the
                # positive scan so they are not misread as praise.
                remainder = lower_content
                for word in negative_words:
                    remainder = remainder.replace(word, ' ')
                if any(word in remainder for word in positive_words):
                    sentiment = 1
                elif has_negative:
                    sentiment = -1
                else:
                    sentiment = 0
                sentiments.append(sentiment)
                # Information richness: one point each for digit, link, emoji/emoticon
                r = 0
                if re.search(r'\d', content):
                    r += 1
                if re.search(r'http[s]?://', content):
                    r += 1
                if re.search(r'[\u2600-\u27BF]|[:;][-]?[)D]', content):
                    r += 1
                richness.append(r)
        return lengths, complexities, sentiments, richness
    # 计算评论相关指标
    print("3. 计算评论相关指标...")
    comment_metrics = df.apply(calculate_comment_metrics, axis=1)
    # X2: 评论长度平均值
    print("4. 计算 X2 (评论长度)")
    regression_data['X2'] = comment_metrics.apply(lambda x: sum(x[0]) / len(x[0]) if x[0] else 0)
    # X3: 评论复杂度平均值
    print("5. 计算 X3 (评论复杂度)")
    regression_data['X3'] = comment_metrics.apply(lambda x: sum(x[1]) / len(x[1]) if x[1] else 0)
    # X4: 评论可读性
    print("6. 计算 X4 (评论可读性)")
    regression_data['X4'] = regression_data.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # X5: 内容情感性平均值
    print("7. 计算 X5 (内容情感性)")
    regression_data['X5'] = comment_metrics.apply(lambda x: sum(x[2]) / len(x[2]) if x[2] else 0)
    # X6: 信息丰富度平均值
    print("8. 计算 X6 (信息丰富度)")
    regression_data['X6'] = comment_metrics.apply(lambda x: sum(x[3]) / len(x[3]) if x[3] else 0)
    # 数据清洗
    print("\n9. 数据清洗...")
    for col in regression_data.columns:
        regression_data[col] = pd.to_numeric(regression_data[col], errors='coerce').fillna(0)
    # 验证数据
    print("\n10. 验证数据...")
    print(f"行数: {len(regression_data)}")
    print(f"列数: {len(regression_data.columns)}")
    print(f"列名: {list(regression_data.columns)}")
    print(f"数据类型:")
    print(regression_data.dtypes)
    print(f"\n前5行数据:")
    print(regression_data.head())
    # 保存文件
    print("\n11. 保存文件...")
    regression_data.to_excel(output_file, index=False)
    # 验证文件
    print("\n12. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        # 重新读取检查
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/quick_process.py
+++ b/project/quick_process.py
@ -0,0 +1,83 @@
 import os
 import pandas as pd
 import re
 print("开始处理...")
 # File paths
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 # Load the workbook into a DataFrame
 print("读取数据...")
 df = pd.read_excel(input_file)
 print(f"读取完成: {len(df)} 行")
 # Locate the source columns (first match wins, None when absent).
 # The 'helpfull' spelling keeps priority; the correctly spelled 'helpful'
 # is accepted as a fallback, matching the other processing scripts.
 helpfull_col = next((c for c in df.columns if 'helpfull' in str(c).lower()), None)
 if helpfull_col is None:
    helpfull_col = next((c for c in df.columns if 'helpful' in str(c).lower()), None)
 comment_count_col = next((c for c in df.columns if '评论总数' in str(c)), None)
 # Comment text columns: name mentions '评论', a digit 1-5 and '内容'
 comment_cols = [c for c in df.columns if '评论' in str(c) and any(str(i) in str(c) for i in range(1, 6)) and '内容' in str(c)]
 print(f"找到列: Y={helpfull_col}, X1={comment_count_col}, 评论列={len(comment_cols)}")
 # Add Y and X1 (non-numeric cells become 0; a missing source column gives all zeros)
 df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0) if helpfull_col is not None else 0
 df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0) if comment_count_col is not None else 0
 # Compute the comment metrics
 print("计算评论指标...")
 def calc_metrics(content):
    """Compute length, complexity, sentiment and richness for one comment cell.

    Args:
        content: Raw cell value; NaN/None and the strings 'None', 'nan'
            and '' are treated as "no comment".

    Returns:
        Tuple (length, complexity, sentiment, richness).  A missing
        comment yields (0, 0, 0, 0).
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # Character count after removing ASCII and full-width (U+3000) spaces
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # Number of whitespace-separated tokens
    complexity = len(content.split())
    pos_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent']
    neg_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor']
    lowered = content.lower()
    has_negative = any(w in lowered for w in neg_words)
    # Some negative phrases embed a positive keyword ('不好' contains '好',
    # '不满意' contains '满意').  Blank them out before the positive scan so
    # they are not misread as praise; genuine positive keywords still win.
    remainder = lowered
    for w in neg_words:
        remainder = remainder.replace(w, ' ')
    if any(w in remainder for w in pos_words):
        sentiment = 1
    elif has_negative:
        sentiment = -1
    else:
        sentiment = 0
    # One point each for a digit, a link and an emoji
    richness = (1 if re.search(r'\d', content) else 0) + (1 if re.search(r'http[s]?://|www\.', content) else 0) + (1 if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]', content) else 0)
    return length, complexity, sentiment, richness
 # Per-row aggregation: average each metric over the row's non-empty comments
 x2_list, x3_list, x5_list, x6_list = [], [], [], []
 for i in range(len(df)):
    if i % 5000 == 0:
        print(f"处理 {i}/{len(df)}")
    # Build the row Series once per row instead of once per comment column.
    row = df.iloc[i]
    lengths, complexities, sentiments, richness = [], [], [], []
    for col in comment_cols:
        l, c, s, r = calc_metrics(row.get(col, ''))
        if l > 0:  # only comments with actual content are counted
            lengths.append(l)
            complexities.append(c)
            sentiments.append(s)
            richness.append(r)
    # Rows without any usable comment get 0 for every metric
    x2_list.append(sum(lengths)/len(lengths) if lengths else 0)
    x3_list.append(sum(complexities)/len(complexities) if complexities else 0)
    x5_list.append(sum(sentiments)/len(sentiments) if sentiments else 0)
    x6_list.append(sum(richness)/len(richness) if richness else 0)
 df['X2'] = x2_list
 df['X3'] = x3_list
 df['X5'] = x5_list
 df['X6'] = x6_list
 # X4 = X2 / X3, defined as 0 when X3 is 0 to avoid dividing by zero
 df['X4'] = df.apply(lambda r: r['X2']/r['X3'] if r['X3']>0 else 0, axis=1)
 # Clean-up: force numeric values, replacing NaN and +/-inf with 0
 for col in ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).replace([float('inf'), float('-inf')], 0)
 print("保存文件...")
 df.to_excel(output_file, index=False, engine='openpyxl')
 print(f"完成！文件大小: {os.path.getsize(output_file)/1024:.2f} KB")
 print(f"行数: {len(df)}, 列数: {len(df.columns)}")
--- a/project/read_excel_test.py
+++ b/project/read_excel_test.py
@ -0,0 +1,54 @@
 import os
 import openpyxl
 # 文件路径
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 print("========================================")
 print("  读取Excel测试")
 print("========================================")
 print(f"输入文件: {input_file}")
 print()
 # 检查文件是否存在
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    exit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取Excel文件
 try:
    print("正在读取Excel文件...")
    workbook = openpyxl.load_workbook(input_file)
    sheet = workbook.active
    print(f"工作表名称: {sheet.title}")
    print(f"最大行数: {sheet.max_row}")
    print(f"最大列数: {sheet.max_column}")
    # Header row: list every column title with its 1-based position
    print("\n表头:")
    headers = []
    for col_idx in range(1, sheet.max_column + 1):
        title = sheet.cell(row=1, column=col_idx).value
        headers.append(title)
        print(f"{col_idx}. {title}")
    # Preview: up to three data rows (sheet rows 2-4), first nine columns at most
    print("\n前3行数据:")
    for row_idx in range(2, min(5, sheet.max_row + 1)):
        preview = [
            sheet.cell(row=row_idx, column=col_idx).value
            for col_idx in range(1, min(10, sheet.max_column + 1))
        ]
        print(f"行 {row_idx}: {preview}")
    print("\n========================================")
    print("  读取完成")
    print("========================================")
 except Exception as e:
    # Report the failure with a full traceback instead of letting the script die
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/run_with_output.py
+++ b/project/run_with_output.py
@ -0,0 +1,216 @@
 import os
 import pandas as pd
 import re
 import sys
 # 重定向输出到文件和屏幕
 class Tee:
    """File-like object that mirrors every write to several underlying streams."""
    def __init__(self, *files):
        # The target streams are kept as the tuple of positional arguments.
        self.files = files
    def write(self, obj):
        """Write obj to each target in turn, flushing each one right after its write."""
        for target in self.files:
            target.write(obj)
            target.flush()
    def flush(self):
        """Flush every target stream."""
        for target in self.files:
            target.flush()
 log_file = open(r'D:\java\project\process_log.txt', 'w', encoding='utf-8')
 original_stdout = sys.stdout
 sys.stdout = Tee(original_stdout, log_file)
 print("========================================")
 print("  在原表中添加回归数据列")
 print("========================================")
 # 文件路径
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # 检查文件是否存在
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    sys.stdout = original_stdout
    log_file.close()
    exit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # 读取原始数据
 try:
    print("\n正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"原始列数: {len(df.columns)}")
    # 识别列
    print("\n识别列...")
    helpfull_col = None
    comment_count_col = None
    comment_cols = []
    for col in df.columns:
        col_str = str(col).lower()
        if 'helpfull' in col_str or 'helpful' in col_str:
            helpfull_col = col
            print(f"找到 Y 列 (helpfull): {col}")
        elif '评论总数' in str(col) or '帖子评论总数' in str(col):
            comment_count_col = col
            print(f"找到 X1 列 (评论总数): {col}")
        elif '评论' in str(col) and any(str(i) in str(col) for i in range(1, 6)) and '内容' in str(col):
            comment_cols.append(col)
            print(f"找到评论列 {len(comment_cols)}: {col}")
    print(f"\n共找到 {len(comment_cols)} 个评论内容列")
    # 添加回归数据列
    print("\n添加回归数据列...")
    # Y (UGC有用性) - 直接复制helpfull列
    print("1. 添加 Y (UGC有用性)")
    if helpfull_col:
        df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
    else:
        df['Y'] = 0
    # X1 (评论数量) - 直接复制帖子评论总数列
    print("2. 添加 X1 (评论数量)")
    if comment_count_col:
        df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
    else:
        df['X1'] = 0
    # 定义函数计算评论指标
    def calculate_comment_metrics(content):
        """Compute length, complexity, sentiment and richness for one comment cell.

        Args:
            content: Raw cell value; NaN/None and the strings 'None', 'nan'
                and '' are treated as "no comment".

        Returns:
            Tuple (length, complexity, sentiment, richness).  A missing
            comment yields (0, 0, 0, 0).
        """
        if pd.isna(content) or str(content) in ['None', 'nan', '']:
            return 0, 0, 0, 0
        content = str(content)
        # X2: character count after removing ASCII and full-width (U+3000) spaces
        length = len(content.replace(' ', '').replace('\u3000', ''))
        # X3: number of whitespace-separated tokens
        complexity = len(content.split())
        # X5: keyword sentiment (positive=1, neutral=0, negative=-1)
        positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
        negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
        lower_content = content.lower()
        has_negative = any(word in lower_content for word in negative_words)
        # Some negative phrases embed a positive keyword ('不好' contains '好',
        # '不满意' contains '满意', 'dislike' contains 'like').  Blank the negative
        # phrases out before the positive scan so they are not misread as praise;
        # a comment that still holds a genuine positive keyword keeps scoring 1.
        remainder = lower_content
        for word in negative_words:
            remainder = remainder.replace(word, ' ')
        if any(word in remainder for word in positive_words):
            sentiment = 1
        elif has_negative:
            sentiment = -1
        else:
            sentiment = 0
        # X6: information richness (digit / link / emoji, one point each, max 3)
        richness = 0
        if re.search(r'\d', content):  # contains a digit
            richness += 1
        if re.search(r'http[s]?://|www\.', content):  # contains a link
            richness += 1
        if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
            richness += 1
        return length, complexity, sentiment, richness
    # 计算评论相关指标
    print("3. 计算评论相关指标...")
    # 初始化列
    df['X2'] = 0.0  # 评论长度
    df['X3'] = 0.0  # 评论复杂度
    df['X5'] = 0.0  # 情感性
    df['X6'] = 0.0  # 信息丰富度
    # 逐行计算
    total_rows = len(df)
    print(f"总数据行数: {total_rows}")
    for i in range(total_rows):
        if i % 1000 == 0:
            print(f"  处理第 {i}/{total_rows} 行...")
        lengths = []
        complexities = []
        sentiments = []
        richness = []
        for col in comment_cols:
            content = df.iloc[i].get(col, '')
            length, complexity, sentiment, r = calculate_comment_metrics(content)
            if length > 0:  # 只统计有内容的评论
                lengths.append(length)
                complexities.append(complexity)
                sentiments.append(sentiment)
                richness.append(r)
        # 计算平均值（无评论记0）
        if lengths:
            df.loc[i, 'X2'] = sum(lengths) / len(lengths)
            df.loc[i, 'X3'] = sum(complexities) / len(complexities)
            df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
            df.loc[i, 'X6'] = sum(richness) / len(richness)
    # X4: 评论可读性 = X2/X3（X3为0时记0，避免报错）
    print("4. 计算 X4 (评论可读性)")
    df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
    # 数据清洗 - 确保所有值都是纯数字，无文本、无空值、无错误
    print("\n5. 数据清洗...")
    regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for col in regression_cols:
        # 转换为数字，错误值转为0
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # 替换无穷大
        df[col] = df[col].replace([float('inf'), float('-inf')], 0)
    # 验证数据
    print("\n6. 验证数据...")
    print(f"总行数: {len(df)}")
    print(f"总列数: {len(df.columns)}")
    print(f"\n回归数据列统计:")
    print(df[regression_cols].describe())
    print(f"\n前5行回归数据:")
    print(df[regression_cols].head())
    # 检查是否有空值或错误值
    print(f"\n空值检查:")
    for col in regression_cols:
        null_count = df[col].isnull().sum()
        print(f"  {col}: {null_count} 个空值")
    # 保存文件
    print("\n7. 保存文件...")
    print(f"正在保存到: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')
    # 验证文件
    print("\n8. 验证文件...")
    if os.path.exists(output_file):
        print(f"文件已成功保存: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
        # 重新读取检查
        df_check = pd.read_excel(output_file)
        print(f"输出文件行数: {len(df_check)}")
        print(f"输出文件列数: {len(df_check.columns)}")
        print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
    else:
        print("文件保存失败！")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
    print(f"新文件已保存: {output_file}")
    print(f"包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
 finally:
    sys.stdout = original_stdout
    log_file.close()
    print("日志已保存到: D:\\java\\project\\process_log.txt")
--- a/project/simple_add_columns.py
+++ b/project/simple_add_columns.py
@ -0,0 +1,187 @@
 import os
 import pandas as pd
 import re
 # Banner for the script that appends the regression columns to the original sheet.
 print("=" * 60)
 print("  在原表中添加回归数据列")
 print("=" * 60)
 # Source workbook and the workbook that receives the augmented table.
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）_回归.xlsx'
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Stop early when the source workbook is missing.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    # exit() is an interactive helper injected by the site module; raising
    # SystemExit yields the same exit status without depending on it.
    raise SystemExit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Load the original data.
 print("\n正在读取原始数据...")
 df = pd.read_excel(input_file)
 print(f"成功读取 {len(df)} 行数据")
 print(f"原始列数: {len(df.columns)}")
 # Locate the source columns by matching substrings of their header text.
 print("\n识别列...")
 helpfull_col = None
 comment_count_col = None
 comment_cols = []
 for col in df.columns:
    col_text = str(col)
    col_str = col_text.lower()
    if 'helpfull' in col_str or 'helpful' in col_str:
        # Header mentions "helpful"/"helpfull": source of Y.
        helpfull_col = col
        print(f"找到 Y 列 (helpfull): {col}")
    elif '评论总数' in col_text or '帖子评论总数' in col_text:
        # Header mentions the total comment count: source of X1.
        comment_count_col = col
        print(f"找到 X1 列 (评论总数): {col}")
    elif '评论' in col_text and any(str(i) in col_text for i in range(1, 6)) and '内容' in col_text:
        # Header names a comment, contains a digit 1-5 and the word for "content".
        comment_cols.append(col)
        print(f"找到评论列 {len(comment_cols)}: {col}")
 print(f"\n共找到 {len(comment_cols)} 个评论内容列")
 # Add the regression columns.
 print("\n添加回归数据列...")
 # Y (UGC helpfulness): numeric copy of the helpful column, 0 where missing or non-numeric.
 print("1. 添加 Y (UGC有用性)")
 if helpfull_col is not None:
    df['Y'] = pd.to_numeric(df[helpfull_col], errors='coerce').fillna(0)
 else:
    df['Y'] = 0
 # X1 (comment count): numeric copy of the total-comments column, 0 where missing or non-numeric.
 print("2. 添加 X1 (评论数量)")
 if comment_count_col is not None:
    df['X1'] = pd.to_numeric(df[comment_count_col], errors='coerce').fillna(0)
 else:
    df['X1'] = 0
 # 定义函数计算评论指标
 def calculate_comment_metrics(content):
    """Compute the per-comment metrics used to build X2, X3, X5 and X6.
    Args:
        content: Raw cell value of one comment; may be NaN/None or any object
            convertible with ``str``.
    Returns:
        A tuple ``(length, complexity, sentiment, richness)``:
        length is the character count with ASCII and ideographic spaces removed,
        complexity is the number of whitespace-separated tokens,
        sentiment is 1 (positive keyword found), -1 (only negative keywords
        found) or 0, and richness is 0-3 (one point each for a digit, a link
        and an emoji/emoticon). Missing or empty content yields ``(0, 0, 0, 0)``.
    """
    if pd.isna(content) or str(content) in ['None', 'nan', '']:
        return 0, 0, 0, 0
    content = str(content)
    # X2: comment length (characters left after removing spaces).
    length = len(content.replace(' ', '').replace('\u3000', ''))
    # X3: comment complexity (number of whitespace-separated tokens).
    complexity = len(content.split())
    # X5: sentiment (positive=1, neutral=0, negative=-1) via keyword lookup.
    positive_words = ['好', '棒', '优秀', '喜欢', '满意', '赞', 'positive', 'good', 'great', 'excellent', 'love', 'like']
    negative_words = ['差', '糟糕', '不好', '失望', '不满', 'negative', 'bad', 'terrible', 'poor', 'hate', 'dislike']
    lower_content = content.lower()
    # Several negative terms embed a positive one ('不好' contains '好',
    # '不满意' contains '满意', 'dislike' contains 'like'). Blank the negative
    # terms out before looking for positive ones so they are not miscounted
    # as praise; a space is substituted so no new keyword can form by joining
    # the neighbouring text.
    without_negatives = lower_content
    for word in negative_words:
        without_negatives = without_negatives.replace(word, ' ')
    sentiment = 0
    if any(word in without_negatives for word in positive_words):
        # A genuine positive keyword still takes precedence over negatives.
        sentiment = 1
    elif any(word in lower_content for word in negative_words):
        sentiment = -1
    # X6: information richness (digit / link / emoji, one point each, max 3).
    richness = 0
    if re.search(r'\d', content):  # contains a digit
        richness += 1
    if re.search(r'http[s]?://|www\.', content):  # contains a link
        richness += 1
    if re.search(r'[\u2600-\u27BF\U0001F300-\U0001F9FF]|[\uD83C-\uDBFF][\uDC00-\uDFFF]|[:;][-]?[)D]', content):  # contains an emoji or emoticon
        richness += 1
    return length, complexity, sentiment, richness
 # Compute the comment-derived metrics (X2, X3, X5, X6) for every row.
 print("3. 计算评论相关指标...")
 # Initialise the target columns; rows without any usable comment keep 0.
 df['X2'] = 0.0  # comment length
 df['X3'] = 0.0  # comment complexity
 df['X5'] = 0.0  # sentiment
 df['X6'] = 0.0  # information richness
 # Row-by-row computation.
 total_rows = len(df)
 print(f"总数据行数: {total_rows}")
 for i in range(total_rows):
    if i % 1000 == 0:
        print(f"  处理第 {i}/{total_rows} 行...")
    # Fetch the row once instead of rebuilding it for every comment column.
    row = df.iloc[i]
    lengths = []
    complexities = []
    sentiments = []
    richness = []
    for col in comment_cols:
        length, complexity, sentiment, r = calculate_comment_metrics(row.get(col, ''))
        if length > 0:  # only comments that actually have content are counted
            lengths.append(length)
            complexities.append(complexity)
            sentiments.append(sentiment)
            richness.append(r)
    # Average over the non-empty comments (rows with none keep the initial 0).
    if lengths:
        df.loc[i, 'X2'] = sum(lengths) / len(lengths)
        df.loc[i, 'X3'] = sum(complexities) / len(complexities)
        df.loc[i, 'X5'] = sum(sentiments) / len(sentiments)
        df.loc[i, 'X6'] = sum(richness) / len(richness)
 # X4: readability = X2 / X3 (0 when X3 is 0, to avoid dividing by zero).
 print("4. 计算 X4 (评论可读性)")
 df['X4'] = df.apply(lambda row: row['X2'] / row['X3'] if row['X3'] > 0 else 0, axis=1)
 # Cleaning: force every regression column to plain finite numbers.
 print("\n5. 数据清洗...")
 regression_cols = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
 for col in regression_cols:
    # Coerce to numeric; values that cannot be parsed become 0.
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    # Replace positive/negative infinity with 0.
    df[col] = df[col].replace([float('inf'), float('-inf')], 0)
 # Print a summary of the result for manual inspection.
 print("\n6. 验证数据...")
 print(f"总行数: {len(df)}")
 print(f"总列数: {len(df.columns)}")
 print("\n回归数据列统计:")
 print(df[regression_cols].describe())
 print("\n前5行回归数据:")
 print(df[regression_cols].head())
 # Report the number of nulls remaining in each regression column.
 print("\n空值检查:")
 for col in regression_cols:
    null_count = df[col].isnull().sum()
    print(f"  {col}: {null_count} 个空值")
 # Write the augmented table.
 print("\n7. 保存文件...")
 print(f"正在保存到: {output_file}")
 df.to_excel(output_file, index=False, engine='openpyxl')
 # Confirm the file exists and read it back to report its dimensions.
 print("\n8. 验证文件...")
 if os.path.exists(output_file):
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    # Re-read the written workbook as a sanity check.
    df_check = pd.read_excel(output_file)
    print(f"输出文件行数: {len(df_check)}")
    print(f"输出文件列数: {len(df_check.columns)}")
    print(f"\n回归数据列: {[col for col in df_check.columns if col in regression_cols]}")
 else:
    print("文件保存失败！")
 print()
 print("=" * 60)
 print("  任务完成")
 print("=" * 60)
 print(f"新文件已保存: {output_file}")
 print("包含原始数据的所有列以及新增的Y, X1-X6回归数据列")
--- a/project/simple_calculate.py
+++ b/project/simple_calculate.py
@ -0,0 +1,100 @@
 import os
 import openpyxl
 import re
 # Raw-data workbook (read) and regression workbook (updated in place).
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  简单计算UGC回归数据")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Both workbooks must already exist. SystemExit is raised directly because
 # exit() is an interactive helper injected by the site module.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    raise SystemExit(1)
 if not os.path.exists(output_file):
    print("错误: 输出文件不存在！")
    raise SystemExit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Column indices start out unknown; None replaces probing locals() for the names.
 helpfull_col = None
 comment_count_col = None
 try:
    # Open the input workbook and use its active sheet.
    print("正在读取输入文件...")
    wb_input = openpyxl.load_workbook(input_file)
    ws_input = wb_input.active
    print(f"输入工作表名称: {ws_input.title}")
    print(f"输入文件最大行数: {ws_input.max_row}")
    print(f"输入文件最大列数: {ws_input.max_column}")
    # Open the output workbook and use its active sheet.
    print("\n正在读取输出文件...")
    wb_output = openpyxl.load_workbook(output_file)
    ws_output = wb_output.active
    print(f"输出工作表名称: {ws_output.title}")
    # Scan the header row (row 1) for the columns of interest.
    print("\n识别列...")
    headers = []
    for col in range(1, ws_input.max_column + 1):
        header = ws_input.cell(row=1, column=col).value
        headers.append(header)
        if header and 'helpfull' in str(header):
            helpfull_col = col
            print(f"找到 helpfull 列: {col}")
        elif header and ('评论总数' in str(header) or '帖子评论总数' in str(header)):
            comment_count_col = col
            print(f"找到评论总数列: {col}")
        elif header and '评论' in str(header):
            # Other comment columns are only reported, not used below.
            print(f"找到评论列: {col} - {header}")
    # Fill the output sheet.
    print("\n计算并填充数据...")
    max_rows = min(ws_input.max_row, 10)  # test run: at most the first 10 sheet rows
    print(f"处理前 {max_rows - 1} 行数据")
    for row in range(2, max_rows + 1):
        print(f"处理行 {row}")
        # Y (UGC helpfulness) goes to column 1; empty/falsy cells become 0.
        if helpfull_col is not None:
            y_value = ws_input.cell(row=row, column=helpfull_col).value
            ws_output.cell(row=row, column=1, value=y_value if y_value else 0)
        else:
            ws_output.cell(row=row, column=1, value=0)
        # X1 (comment count) goes to column 2; empty/falsy cells become 0.
        if comment_count_col is not None:
            x1_value = ws_input.cell(row=row, column=comment_count_col).value
            ws_output.cell(row=row, column=2, value=x1_value if x1_value else 0)
        else:
            ws_output.cell(row=row, column=2, value=0)
        # X2-X6 (columns 3-7) are placeholders set to 0 for now.
        for col in range(3, 8):
            ws_output.cell(row=row, column=col, value=0)
    # Persist the changes to the output workbook.
    print("\n保存文件...")
    wb_output.save(output_file)
    print(f"文件已成功保存: {output_file}")
    print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    # Report the failure with a full traceback instead of aborting silently.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/simple_copy.py
+++ b/project/simple_copy.py
@ -0,0 +1,41 @@
 import os
 import shutil
 # Source workbook and the destination path of the copy.
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 print("========================================")
 print("  简单文件复制脚本")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Stop early when the source workbook is missing.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    # exit() is an interactive helper injected by the site module; raising
    # SystemExit yields the same exit status without depending on it.
    raise SystemExit(1)
 print(f"文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 print(f"文件存在: {os.path.exists(input_file)}")
 # Copy the file with shutil.copy2.
 try:
    print("正在复制文件...")
    shutil.copy2(input_file, output_file)
    # Confirm the destination now exists and report its size.
    if os.path.exists(output_file):
        print(f"文件已成功复制到: {output_file}")
        print(f"复制文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件复制失败，未找到输出文件")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    print(f"处理文件时出错: {str(e)}")
--- a/project/simple_data_test.py
+++ b/project/simple_data_test.py
@ -0,0 +1,54 @@
 import os
 import pandas as pd
 # Workbook to read; output_file is only echoed below, the test writes to test_output.
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  简单数据测试")
 print("========================================")
 print(f"输入文件: {input_file}")
 print(f"输出文件: {output_file}")
 print()
 # Stop early when the source workbook is missing.
 if not os.path.exists(input_file):
    print("错误: 输入文件不存在！")
    # exit() is an interactive helper injected by the site module; raising
    # SystemExit yields the same exit status without depending on it.
    raise SystemExit(1)
 print(f"输入文件大小: {os.path.getsize(input_file) / 1024:.2f} KB")
 # Read the data and round-trip a small sample through Excel.
 try:
    print("正在读取原始数据...")
    df = pd.read_excel(input_file)
    print(f"成功读取 {len(df)} 行数据")
    print(f"列名: {list(df.columns)}")
    # Write a test workbook holding the first 100 rows (all columns).
    print("\n创建测试文件...")
    test_data = df.head(100)  # first 100 rows only
    test_output = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\test_output.xlsx'
    test_data.to_excel(test_output, index=False)
    print(f"测试文件已创建: {test_output}")
    print(f"测试文件大小: {os.path.getsize(test_output) / 1024:.2f} KB")
    # Read the test workbook back and report its dimensions.
    if os.path.exists(test_output):
        df_test = pd.read_excel(test_output)
        print(f"测试文件行数: {len(df_test)}")
        print(f"测试文件列数: {len(df_test.columns)}")
    else:
        print("测试文件创建失败！")
    print()
    print("========================================")
    print("  测试完成")
    print("========================================")
 except Exception as e:
    # Report the failure with a full traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/simple_excel_create.py
+++ b/project/simple_excel_create.py
@ -0,0 +1,57 @@
 import os
 import openpyxl
 # Workbook to create.
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 print("========================================")
 print("  创建UGC回归数据文件")
 print("========================================")
 print(f"输出文件: {output_file}")
 print()
 # Make sure the destination directory exists, creating it when necessary.
 output_dir = os.path.dirname(output_file)
 print(f"输出目录: {output_dir}")
 print(f"目录存在: {os.path.exists(output_dir)}")
 if not os.path.exists(output_dir):
    print("正在创建输出目录...")
    try:
        os.makedirs(output_dir)
        print("目录创建成功")
    except Exception as e:
        print(f"创建目录失败: {e}")
        # exit() is an interactive helper injected by the site module; raising
        # SystemExit yields the same exit status without depending on it.
        raise SystemExit(1)
 # Create a fresh workbook that contains only the header row.
 try:
    print("\n创建新的Excel文件...")
    wb = openpyxl.Workbook()
    ws = wb.active
    # Row 1 holds the regression column names, starting at column 1.
    headers = ['Y', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
    for i, header in enumerate(headers, 1):
        ws.cell(row=1, column=i, value=header)
    # Write the workbook to disk.
    print(f"保存文件到: {output_file}")
    wb.save(output_file)
    # Confirm the file now exists and report its size.
    if os.path.exists(output_file):
        print(f"文件已成功创建: {output_file}")
        print(f"文件大小: {os.path.getsize(output_file) / 1024:.2f} KB")
    else:
        print("错误: 文件创建失败")
    print()
    print("========================================")
    print("  任务完成")
    print("========================================")
 except Exception as e:
    # Report the failure with a full traceback.
    print(f"处理文件时出错: {str(e)}")
    import traceback
    traceback.print_exc()
--- a/project/simple_test.py
+++ b/project/simple_test.py
@ -0,0 +1,22 @@
 import os
 # Smoke test: report whether the raw-data workbook exists and how large it is.
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子原始信息计量实验使用.xlsx'
 banner = "========================================"
 print(banner)
 print("  简单测试")
 print(banner)
 print(f"输入文件: {input_file}")
 print()
 # Missing file is reported first; otherwise print the size in KB.
 if not os.path.exists(input_file):
    print("文件不存在！")
 else:
    print("文件存在！")
    size_kb = os.path.getsize(input_file) / 1024
    print(f"文件大小: {size_kb:.2f} KB")
 print()
 print(banner)
 print("  测试完成")
 print(banner)
--- a/project/test_file_access.py
+++ b/project/test_file_access.py
@ -0,0 +1,49 @@
 import os
 # Paths whose accessibility is being checked.
 input_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\图文帖子实验数据（新）.xlsx'
 output_file = r'D:\计量经济学\计量实验资料及作业要求\计量实验资料及作业要求\UGC回归数据.xlsx'
 banner = "========================================"
 print(banner)
 print("  测试文件访问")
 print(banner)
 print(f"当前目录: {os.getcwd()}")
 print()
 # Run the same report for the input file, then for the output file.
 for heading, path in (("检查输入文件:", input_file), ("\n检查输出文件:", output_file)):
    print(heading)
    print(f"路径: {path}")
    print(f"存在: {os.path.exists(path)}")
    if not os.path.exists(path):
        print("文件不存在！")
    else:
        print(f"大小: {os.path.getsize(path) / 1024:.2f} KB")
 # Inspect the directory that holds the input file.
 print("\n检查目录:")
 dir_path = os.path.dirname(input_file)
 print(f"目录: {dir_path}")
 print(f"存在: {os.path.exists(dir_path)}")
 if os.path.exists(dir_path):
    print("目录内容:")
    files = os.listdir(dir_path)
    shown = files[:10]  # list at most the first 10 entries
    for entry in shown:
        entry_kb = os.path.getsize(os.path.join(dir_path, entry)) / 1024
        print(f"  {entry}: {entry_kb:.2f} KB")
    remaining = len(files) - 10
    if remaining > 0:
        print(f"  ... 还有 {remaining} 个文件")
 print()
 print(banner)
 print("  测试完成")
 print(banner)
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit 15d177f1a2a8093521047d866fd50d9b09eb273d
	`@ -0,0 +1 @@`
					`Subproject commit 15d177f1a2a8093521047d866fd50d9b09eb273d`